
Commit

Merge pull request #3 from Yale-LILY/dev
updated README, added ACU deduplication for A2CU
yixinL7 authored Jul 3, 2024
2 parents 20cdd84 + 358a87f commit 25286e2
Showing 3 changed files with 18 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
+tmp.py
+autoacu/__pycache__
21 changes: 13 additions & 8 deletions README.md
@@ -42,16 +42,17 @@ Please note that to use A2CU, you may need to have a GPU with at least 16GB memo
Below is an example of using A2CU to evaluate the similarity between two text sequences.
```python
from autoacu import A2CU
-candidates, references = ["This is a test"], ["This is a test"]
+candidates = ["Real Madrid have fought off all the competition to win the 2023/24 UEFA Champions League after beating Borussia Dortmund 2-0 in the final at Wembley Stadium on 1 June."]
+references = ["On June 1st, Real Madrid emerged victorious in the 2023/24 UEFA Champions League, defeating Borussia Dortmund 2-0 in the final at Wembley Stadium, overcoming all competitors to claim the title."]
a2cu = A2CU(device=0) # the GPU device to use
recall_scores, prec_scores, f1_scores = a2cu.score(
    references=references,
    candidates=candidates,
    generation_batch_size=2, # the batch size for ACU generation
-    matching_batch_size=16 # the batch size for ACU matching
-    output_path=None # the path to save the evaluation results
-    recall_only=False # whether to only compute the recall score
-    acu_path=None # the path to save the generated ACUs
+    matching_batch_size=16, # the batch size for ACU matching
+    output_path=None, # the path to save the evaluation results
+    recall_only=False, # whether to only compute the recall score
+    acu_path=None, # the path to save the generated ACUs
)
print(f"Recall: {recall_scores[0]:.4f}, Precision {prec_scores[0]:.4f}, F1: {f1_scores[0]:.4f}")
```
@@ -61,13 +62,17 @@ The default model checkpoint for A3CU is [Yale-LILY/a3cu](https://huggingface.co
Below is an example of using A3CU to evaluate the similarity between two text sequences.
```python
from autoacu import A3CU
-candidates, references = ["This is a test"], ["This is a test"]
+candidates = ["Real Madrid have fought off all the competition to win the 2023/24 UEFA Champions League after beating Borussia Dortmund 2-0 in the final at Wembley Stadium on 1 June."]
+references = ["On June 1st, Real Madrid emerged victorious in the 2023/24 UEFA Champions League, defeating Borussia Dortmund 2-0 in the final at Wembley Stadium, overcoming all competitors to claim the title."]
a3cu = A3CU(device=0) # the GPU device to use
recall_scores, prec_scores, f1_scores = a3cu.score(
    references=references,
    candidates=candidates,
-    batch_size=16 # the batch size for ACU generation
-    output_path=None # the path to save the evaluation results
+    batch_size=16, # the batch size for ACU generation
+    output_path=None, # the path to save the evaluation results
)
print(f"Recall: {recall_scores[0]:.4f}, Precision {prec_scores[0]:.4f}, F1: {f1_scores[0]:.4f}")
```

+### Warning
+A2CU and A3CU may not work properly on short text sequences (e.g., less than 10 words) due to the limitations of the ACU generation model, especially for non-reducible text sequences.
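
As a practical aside on the warning added above, here is a minimal sketch of one way to screen out very short inputs before scoring; the `long_enough` helper and the 10-word threshold are illustrative assumptions, not part of this commit or of the AutoACU API.
```python
# Illustrative guard (not from this commit): skip texts that are likely too
# short for reliable ACU generation, following the caveat in the warning.
def long_enough(text: str, min_words: int = 10) -> bool:
    # 10 words mirrors the "less than 10 words" example given in the README.
    return len(text.split()) >= min_words

candidates = ["Real Madrid have fought off all the competition to win the 2023/24 UEFA Champions League after beating Borussia Dortmund 2-0 in the final at Wembley Stadium on 1 June."]
references = ["On June 1st, Real Madrid emerged victorious in the 2023/24 UEFA Champions League, defeating Borussia Dortmund 2-0 in the final at Wembley Stadium, overcoming all competitors to claim the title."]

if all(long_enough(t) for t in candidates + references):
    pass  # safe to call a2cu.score(...) or a3cu.score(...) as in the examples above
```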
6 changes: 3 additions & 3 deletions autoacu/a2cu.py
@@ -86,17 +86,17 @@ def data_loader():
input_ids=text_id,
attention_mask=input_mask,
max_length=512,
min_length=10,
num_beams=4,
length_penalty=1.0,
early_stopping=True,
)
dec = [self.generation_tok.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
for (i, x) in enumerate(dec):
+_acus = list(set(x.split("|||")))
print(json.dumps({
"acus": x.split("|||"),
"acus": _acus,
}), file=f, flush=True)
-acus.append(x.split("|||"))
+acus.append(_acus)
self.generation_model = self.generation_model.cpu()
return acus
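
For context on the deduplication change above, a minimal standalone sketch of its effect; the sample `generated` string is made up, assuming the generation model emits ACUs as a single `|||`-delimited sequence, as the surrounding code does.
```python
# Hypothetical model output containing a repeated ACU.
generated = "Real Madrid won the final|||The final was at Wembley|||Real Madrid won the final"

acus = generated.split("|||")       # 3 ACUs, one of them duplicated
unique_acus = list(set(acus))       # duplicates removed; note set() does not preserve order
print(len(acus), len(unique_acus))  # prints: 3 2
```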
