
Commit

[exp]: back translation experiment
BJH9 committed Jan 10, 2024
1 parent 2624393 commit faf15a7
Showing 8 changed files with 114,360 additions and 19 deletions.
57 changes: 45 additions & 12 deletions code/augmentation.py
@@ -54,32 +54,65 @@ def google_ko2en2ko(ko_text, translator):
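# Illustration (assumption, not shown in this hunk): the body of google_ko2en2ko is defined
# above this point in the file. Assuming the googletrans Translator interface, a minimal
# ko -> en -> ko round trip could look like:
#   en_text = translator.translate(ko_text, src='ko', dest='en').text
#   ko_back = translator.translate(en_text, src='en', dest='ko').text
# The actual helper in this repository may add retries or rate-limit handling.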

#%%

# From the back-translated sentences, drop rows whose subject/object entity word no longer appears in the sentence, and adjust the subject_entity and object_entity indices to match the new sentence

def filter_row(row):
return row['subject_entity']['word'] in row['sentence'] and row['object_entity']['word'] in row['sentence']

print('original length', len(trans_train))

# check that the sentence contains both the subject entity and the object entity
# drop rows whose sentence does not contain them
trans_train['subject_entity'] = trans_train['subject_entity'].apply(ast.literal_eval)
trans_train['object_entity'] = trans_train['object_entity'].apply(ast.literal_eval)
trans_train = trans_train[trans_train.apply(filter_row, axis=1)]
print('after keeping only rows whose sentence contains both entities', len(trans_train))
filtered_trans1 = trans_train[trans_train.apply(filter_row, axis=1)]
print('after keeping only rows whose sentence contains both entities', len(filtered_trans1))


# to balance the label distribution, drop no_relation, org:top_members/employees, and per:employee_of
#filtered_trans2 = filtered_trans1[(filtered_trans1['label'] != "no_relation") & (trans_train['label'] != "org:top_members/employees") & (trans_train['label'] != "per:employee_of")]
#print('filter 후 길이', len(filtered_trans2))

# recompute start_idx and end_idx of subject_entity and object_entity for the back-translated sentence
filtered_trans1['subject_entity'] = filtered_trans1.apply(lambda row: {
'word': row['subject_entity']['word'],
'start_idx': row['sentence'].find(row['subject_entity']['word']),
'end_idx': row['sentence'].find(row['subject_entity']['word']) + len(row['subject_entity']['word'])-1,
'type': row['subject_entity']['type']
}, axis=1)

filtered_trans1['object_entity'] = filtered_trans1.apply(lambda row: {
'word': row['object_entity']['word'],
'start_idx': row['sentence'].find(row['object_entity']['word']),
'end_idx': row['sentence'].find(row['object_entity']['word']) + len(row['object_entity']['word'])-1,
'type': row['object_entity']['type']
}, axis=1)
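# Worked example of the index convention above (hypothetical sentence):
#   sentence = '이순신은 조선의 장군이다'; word = '이순신'
#   sentence.find(word)              -> 0   (start_idx: first occurrence)
#   0 + len(word) - 1                -> 2   (end_idx is inclusive)
#   sentence[start_idx:end_idx + 1] == word
# str.find returns -1 when the word is absent, but filter_row above guarantees both entity
# words are present; a repeated word is re-anchored to its first occurrence.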

filtered_trans1['id'] = range(1, len(filtered_trans1)+1)
filtered_trans1.reset_index(drop=True, inplace=True)
filtered_trans1.to_csv('../dataset/train/filtered_trans1.csv', index=False)

# %%
# keep only rows whose object entity type is POH

filtered1_trans_train_path = '../dataset/train/filtered_trans1.csv'
filtered1_trans_train = pd.read_csv(filtered1_trans_train_path)

smooth_trans = trans_train[(trans_train['label'] != "no_relation") & (trans_train['label'] != "org:top_members/employees") & (trans_train['label'] != "per:employee_of")]
print('length after smoothing', len(smooth_trans))
filtered1_trans_train['object_entity'] = filtered1_trans_train['object_entity'].apply(ast.literal_eval)
filtered2_trans = filtered1_trans_train[(filtered1_trans_train['object_entity'].apply(lambda x: x.get('type') == 'POH'))]
print("filtered2길이: ", len(filtered2_trans))

print('2 len', smooth_trans)
smooth_trans['id'] = range(1, len(smooth_trans)+1)
smooth_trans.reset_index(drop=True, inplace=True)
#smooth_trans.to_csv('../dataset/train/smooth_trans.csv', index=False)
filtered2_trans['id'] = range(1, len(filtered2_trans)+1)
filtered2_trans.reset_index(drop=True, inplace=True)
filtered2_trans.to_csv('../dataset/train/filtered_trans2.csv', index=False)

# %%
# back-translated (preprocessed) data + original data

trans_train_path = '../dataset/train/smooth_trans.csv'
trans_train_path = '../dataset/train/filtered_trans2.csv'
trans_train = pd.read_csv(trans_train_path)

augmented_df = pd.merge(train_df, smooth_trans, how='outer')
augmented_df = pd.merge(train_df, trans_train, how='outer')
augmented_df['id'] = range(1, len(augmented_df)+1)
augmented_df.reset_index(drop=True, inplace=True)
augmented_df.to_csv('../dataset/train/augmented_train.csv', index=False)
augmented_df.to_csv('../dataset/train/augmented_train2.csv', index=False)
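# Note: with no `on=` argument, pd.merge joins on every shared column, so this outer merge
# effectively appends the back-translated rows to the original training rows (only rows that
# are identical in all columns would collapse into one); `id` is then renumbered to stay unique.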
# %%
2 changes: 1 addition & 1 deletion code/data_analysis.py
@@ -14,7 +14,7 @@
test_path = '../dataset/test/test_data.csv'
test_df = pd.read_csv(test_path)

augmented_path = '../dataset/train/augmented_train.csv'
augmented_path = '../dataset/train/augmented_train2.csv'
augmented_df = pd.read_csv(augmented_path)

# %%
4 changes: 2 additions & 2 deletions code/inference.py
@@ -20,7 +20,7 @@ def inference(model, tokenized_sent, device):
After wrapping the test dataset in a DataLoader,
the model makes predictions batch_size samples at a time.
"""
dataloader = DataLoader(tokenized_sent, batch_size=16, shuffle=False)
dataloader = DataLoader(tokenized_sent, batch_size=32, shuffle=False)
model.eval()
output_pred = []
output_prob = []
@@ -57,7 +57,7 @@ def main(args):
P_CONFIG = {'prompt_kind' : 's_and_o',
'preprocess_method' : 'typed_entity_marker_punct',
'and_marker' : '와',
'add_question' : True,
'add_question' : False,
'only_sentence' : False}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
8 changes: 4 additions & 4 deletions code/train.py
@@ -20,12 +20,12 @@ def train():

# MODEL_NAME = "bert-base-uncased"
MODEL_NAME = "klue/roberta-large"
TRAIN_PATH = "../dataset/train/train.csv"
TRAIN_PATH = "../dataset/train/augmented_train1.csv"
LABEL_CNT = 30
P_CONFIG = {'prompt_kind' : 's_and_o', # ['s_sep_o', 's_and_o', 'quiz']
'preprocess_method' : 'typed_entity_marker_punct', # ['baseline_preprocessor', 'entity_mask', 'entity_marker', 'entity_marker_punct', 'typed_entity_marker', 'typed_entity_marker_punct']
'and_marker' : '와', # ['와', '그리고', '&', '[SEP]']
'add_question' : True, # append "sub_e 와 obj_e의 관계는 무엇입니까?" ("What is the relation between sub_e and obj_e?") to the end of the sentence
'add_question' : False, # append "sub_e 와 obj_e의 관계는 무엇입니까?" ("What is the relation between sub_e and obj_e?") to the end of the sentence
'only_sentence' : False} # True : (sentence) / False : (prompt + sentence)
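# Illustration (assumption; the preprocessor itself is not part of this diff): the
# "typed entity marker (punct)" scheme typically rewrites a sentence as, e.g.,
#   @ * PER * 이순신 @ 은 # ^ ORG ^ 조선 # 중기의 무신이다
# i.e. subject wrapped with "@ * <type> * ... @", object with "# ^ <type> ^ ... #";
# the exact markers used by this repository's preprocessor may differ.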


@@ -96,15 +96,15 @@ def train():
train_dataset=re_train_dataset, # training dataset
eval_dataset=re_dev_dataset, # evaluation dataset
compute_metrics=compute_metrics, # define metrics function
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)] # early_stopping
# callbacks = [EarlyStoppingCallback(early_stopping_patience=3)] # early_stopping
# if you do not want early stopping, just comment out the callbacks line above
)
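# Note (assumption about the unchanged TrainingArguments above): re-enabling
# EarlyStoppingCallback also requires evaluation and best-model tracking to be on, e.g.
#   evaluation_strategy='steps', load_best_model_at_end=True,
#   metric_for_best_model='<a key returned by compute_metrics>', greater_is_better=True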

# train model
trainer.train()
# code pushed to git
model_state_dict = model.state_dict()
torch.save({'model_state_dict' : model_state_dict}, './best_model/bestmodel.pth')
torch.save({'model_state_dict' : model_state_dict}, './best_model/augment1_bestmodel.pth')
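# Sketch: the checkpoint saved above can be restored later (e.g. for inference) with
#   ckpt = torch.load('./best_model/augment1_bestmodel.pth', map_location='cpu')
#   model.load_state_dict(ckpt['model_state_dict'])
# assuming the model object has been rebuilt with the same architecture and config.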

def main():
train()