def augment_data(target_df, sub_entity_list, obj_entity_list, target_class, times):
    """Build augmented rows by replacing both entities with randomly drawn names.

    Every row of ``target_df`` is visited ``times`` times. On each visit one
    name is drawn uniformly at random from ``sub_entity_list['name']`` and one
    from ``obj_entity_list['name']``; ``change_sentence`` then produces the
    rewritten sentence together with the new entity offsets.

    Args:
        target_df: Frame whose rows unpack to exactly six values, in the order
            id, sentence, subject_entity, object_entity, label, source. The two
            entity fields are strings that ``literal_eval`` turns into mappings
            holding at least ``'start_idx'`` and ``'end_idx'``.
        sub_entity_list: Frame with a ``'name'`` column of subject candidates.
        obj_entity_list: Frame with a ``'name'`` column of object candidates.
        target_class: String embedded in the output CSV file name.
        times: Number of passes made over ``target_df``.

    Returns:
        A DataFrame with columns id, sentence, subject_entity, object_entity,
        label and source, de-duplicated on ``sentence``. The ``id`` column
        holds the row's position within ``target_df`` (not the original id),
        so the same value can occur once per pass.

    Side effects:
        Prints the shape of the result and writes it to
        ``new_<target_class>_members.csv`` without the index.
    """
    ids, sentences, subjects, objects, labels, sources = [], [], [], [], [], []

    # One candidate per row per pass: len(target_df) * times rows are produced
    # before de-duplication.
    for _ in range(times):
        for row_pos, row in enumerate(target_df.values):
            _row_id, sentence, subject_entity, object_entity, label, source = row

            # The entity columns hold dict literals stored as text.
            subject_dict = literal_eval(subject_entity)
            object_dict = literal_eval(object_entity)

            # Draw the replacement names (subject first, then object).
            subj_pick = np.random.randint(0, sub_entity_list.shape[0])
            obj_pick = np.random.randint(0, obj_entity_list.shape[0])
            sub_word = sub_entity_list['name'].iloc[subj_pick]
            obj_word = obj_entity_list['name'].iloc[obj_pick]

            # change_sentence is handed the entity that occurs first in the
            # sentence as its first span, so both the argument order and the
            # order of the returned offsets flip when the object comes first.
            # NOTE(review): assumes the returned offsets follow the same
            # first/second order as the inputs - confirm in change_sentence.
            if subject_dict['start_idx'] > object_dict['start_idx']:
                new_sentence, obj_start, obj_end, subj_start, subj_end = change_sentence(
                    sentence,
                    object_dict['start_idx'], object_dict['end_idx'],
                    subject_dict['start_idx'], subject_dict['end_idx'],
                    obj_word, sub_word,
                )
            else:
                new_sentence, subj_start, subj_end, obj_start, obj_end = change_sentence(
                    sentence,
                    subject_dict['start_idx'], subject_dict['end_idx'],
                    object_dict['start_idx'], object_dict['end_idx'],
                    sub_word, obj_word,
                )

            # Overwrite the entity metadata with the new word and offsets.
            subject_dict.update(word=sub_word, start_idx=subj_start, end_idx=subj_end)
            object_dict.update(word=obj_word, start_idx=obj_start, end_idx=obj_end)

            ids.append(row_pos)
            sentences.append(new_sentence)
            # Entities are stored back as strings, mirroring the input format.
            subjects.append(str(subject_dict))
            objects.append(str(object_dict))
            labels.append(label)
            sources.append(source)

    # Assemble the frame and keep one row per distinct generated sentence.
    new = pd.DataFrame({
        'id': ids,
        'sentence': sentences,
        'subject_entity': subjects,
        'object_entity': objects,
        'label': labels,
        'source': sources,
    }).drop_duplicates('sentence')

    print("생성된 데이터 수:", new.shape)

    out_path = 'new_' + target_class + '_members.csv'
    new.to_csv(out_path, index=False)
    return new