
Commit

[exp]: back translation experiment
BJH9 committed Jan 10, 2024
1 parent 2624393 commit faf15a7
Showing 8 changed files with 114,360 additions and 19 deletions.
57 changes: 45 additions & 12 deletions code/augmentation.py
@@ -54,32 +54,65 @@ def google_ko2en2ko(ko_text, translator):
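# Illustration (assumption, not shown in this hunk): the body of google_ko2en2ko is defined
# above this point in the file. Assuming the googletrans Translator interface, a minimal
# ko -> en -> ko round trip could look like:
#   en_text = translator.translate(ko_text, src='ko', dest='en').text
#   ko_back = translator.translate(en_text, src='en', dest='ko').text
# The actual helper in this repository may add retries or rate-limit handling.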

#%%

# From the back-translated sentences, drop rows whose subject/object entity word no longer appears in the sentence, and adjust the subject_entity and object_entity indices to match the new sentence

def filter_row(row):
return row['subject_entity']['word'] in row['sentence'] and row['object_entity']['word'] in row['sentence']

print('original length', len(trans_train))

# check that the sentence contains both the subject entity and the object entity
# drop rows whose sentence does not contain them
trans_train['subject_entity'] = trans_train['subject_entity'].apply(ast.literal_eval)
trans_train['object_entity'] = trans_train['object_entity'].apply(ast.literal_eval)
trans_train = trans_train[trans_train.apply(filter_row, axis=1)]
print('after keeping only rows whose sentence contains both entities', len(trans_train))
filtered_trans1 = trans_train[trans_train.apply(filter_row, axis=1)]
print('after keeping only rows whose sentence contains both entities', len(filtered_trans1))


# to balance the label distribution, drop no_relation, org:top_members/employees, and per:employee_of
#filtered_trans2 = filtered_trans1[(filtered_trans1['label'] != "no_relation") & (trans_train['label'] != "org:top_members/employees") & (trans_train['label'] != "per:employee_of")]
#print('filter 후 길이', len(filtered_trans2))

# recompute start_idx and end_idx of subject_entity and object_entity for the back-translated sentence
filtered_trans1['subject_entity'] = filtered_trans1.apply(lambda row: {
'word': row['subject_entity']['word'],
'start_idx': row['sentence'].find(row['subject_entity']['word']),
'end_idx': row['sentence'].find(row['subject_entity']['word']) + len(row['subject_entity']['word'])-1,
'type': row['subject_entity']['type']
}, axis=1)

filtered_trans1['object_entity'] = filtered_trans1.apply(lambda row: {
'word': row['object_entity']['word'],
'start_idx': row['sentence'].find(row['object_entity']['word']),
'end_idx': row['sentence'].find(row['object_entity']['word']) + len(row['object_entity']['word'])-1,
'type': row['object_entity']['type']
}, axis=1)
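# Worked example of the index convention above (hypothetical sentence):
#   sentence = '이순신은 조선의 장군이다'; word = '이순신'
#   sentence.find(word)              -> 0   (start_idx: first occurrence)
#   0 + len(word) - 1                -> 2   (end_idx is inclusive)
#   sentence[start_idx:end_idx + 1] == word
# str.find returns -1 when the word is absent, but filter_row above guarantees both entity
# words are present; a repeated word is re-anchored to its first occurrence.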

filtered_trans1['id'] = range(1, len(filtered_trans1)+1)
filtered_trans1.reset_index(drop=True, inplace=True)
filtered_trans1.to_csv('../dataset/train/filtered_trans1.csv', index=False)

# %%
# keep only rows whose object entity type is POH

filtered1_trans_train_path = '../dataset/train/filtered_trans1.csv'
filtered1_trans_train = pd.read_csv(filtered1_trans_train_path)

smooth_trans = trans_train[(trans_train['label'] != "no_relation") & (trans_train['label'] != "org:top_members/employees") & (trans_train['label'] != "per:employee_of")]
print('length after smoothing', len(smooth_trans))
filtered1_trans_train['object_entity'] = filtered1_trans_train['object_entity'].apply(ast.literal_eval)
filtered2_trans = filtered1_trans_train[(filtered1_trans_train['object_entity'].apply(lambda x: x.get('type') == 'POH'))]
print("filtered2길이: ", len(filtered2_trans))

print('2 len', smooth_trans)
smooth_trans['id'] = range(1, len(smooth_trans)+1)
smooth_trans.reset_index(drop=True, inplace=True)
#smooth_trans.to_csv('../dataset/train/smooth_trans.csv', index=False)
filtered2_trans['id'] = range(1, len(filtered2_trans)+1)
filtered2_trans.reset_index(drop=True, inplace=True)
filtered2_trans.to_csv('../dataset/train/filtered_trans2.csv', index=False)

# %%
# back-translated (preprocessed) data + original data

trans_train_path = '../dataset/train/smooth_trans.csv'
trans_train_path = '../dataset/train/filtered_trans2.csv'
trans_train = pd.read_csv(trans_train_path)

augmented_df = pd.merge(train_df, smooth_trans, how='outer')
augmented_df = pd.merge(train_df, trans_train, how='outer')
augmented_df['id'] = range(1, len(augmented_df)+1)
augmented_df.reset_index(drop=True, inplace=True)
augmented_df.to_csv('../dataset/train/augmented_train.csv', index=False)
augmented_df.to_csv('../dataset/train/augmented_train2.csv', index=False)
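# Note: with no `on=` argument, pd.merge joins on every shared column, so this outer merge
# effectively appends the back-translated rows to the original training rows (only rows that
# are identical in all columns would collapse into one); `id` is then renumbered to stay unique.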
# %%
2 changes: 1 addition & 1 deletion code/data_analysis.py
@@ -14,7 +14,7 @@
test_path = '../dataset/test/test_data.csv'
test_df = pd.read_csv(test_path)

augmented_path = '../dataset/train/augmented_train.csv'
augmented_path = '../dataset/train/augmented_train2.csv'
augmented_df = pd.read_csv(augmented_path)

# %%
4 changes: 2 additions & 2 deletions code/inference.py
@@ -20,7 +20,7 @@ def inference(model, tokenized_sent, device):
After wrapping the test dataset in a DataLoader,
the model makes predictions batch_size samples at a time.
"""
dataloader = DataLoader(tokenized_sent, batch_size=16, shuffle=False)
dataloader = DataLoader(tokenized_sent, batch_size=32, shuffle=False)
model.eval()
output_pred = []
output_prob = []
@@ -57,7 +57,7 @@ def main(args):
P_CONFIG = {'prompt_kind' : 's_and_o',
'preprocess_method' : 'typed_entity_marker_punct',
'and_marker' : '와',
'add_question' : True,
'add_question' : False,
'only_sentence' : False}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
8 changes: 4 additions & 4 deletions code/train.py
@@ -20,12 +20,12 @@ def train():

# MODEL_NAME = "bert-base-uncased"
MODEL_NAME = "klue/roberta-large"
TRAIN_PATH = "../dataset/train/train.csv"
TRAIN_PATH = "../dataset/train/augmented_train1.csv"
LABEL_CNT = 30
P_CONFIG = {'prompt_kind' : 's_and_o', # ['s_sep_o', 's_and_o', 'quiz']
'preprocess_method' : 'typed_entity_marker_punct', # ['baseline_preprocessor', 'entity_mask', 'entity_marker', 'entity_marker_punct', 'typed_entity_marker', 'typed_entity_marker_punct']
'and_marker' : '와', # ['와', '그리고', '&', '[SEP]']
'add_question' : True, # append "sub_e 와 obj_e의 관계는 무엇입니까?" ("What is the relation between sub_e and obj_e?") to the end of the sentence
'add_question' : False, # append "sub_e 와 obj_e의 관계는 무엇입니까?" ("What is the relation between sub_e and obj_e?") to the end of the sentence
'only_sentence' : False} # True : (sentence) / False : (prompt + sentence)
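# Illustration (assumption; the preprocessor itself is not part of this diff): the
# "typed entity marker (punct)" scheme typically rewrites a sentence as, e.g.,
#   @ * PER * 이순신 @ 은 # ^ ORG ^ 조선 # 중기의 무신이다
# i.e. subject wrapped with "@ * <type> * ... @", object with "# ^ <type> ^ ... #";
# the exact markers used by this repository's preprocessor may differ.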


@@ -96,15 +96,15 @@ def train():
train_dataset=re_train_dataset, # training dataset
eval_dataset=re_dev_dataset, # evaluation dataset
compute_metrics=compute_metrics, # define metrics function
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)] # early_stopping
# callbacks = [EarlyStoppingCallback(early_stopping_patience=3)] # early_stopping
# if you do not want early stopping, just comment out the callbacks line above
)
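# Note (assumption about the unchanged TrainingArguments above): re-enabling
# EarlyStoppingCallback also requires evaluation and best-model tracking to be on, e.g.
#   evaluation_strategy='steps', load_best_model_at_end=True,
#   metric_for_best_model='<a key returned by compute_metrics>', greater_is_better=True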

# train model
trainer.train()
# code pushed to git
model_state_dict = model.state_dict()
torch.save({'model_state_dict' : model_state_dict}, './best_model/bestmodel.pth')
torch.save({'model_state_dict' : model_state_dict}, './best_model/augment1_bestmodel.pth')
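# Sketch: the checkpoint saved above can be restored later (e.g. for inference) with
#   ckpt = torch.load('./best_model/augment1_bestmodel.pth', map_location='cpu')
#   model.load_state_dict(ckpt['model_state_dict'])
# assuming the model object has been rebuilt with the same architecture and config.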

def main():
train()