Add CrowS-Pairs task #70

Open. Wants to merge 39 commits into main.

This pull request adds a CrowS-Pairs task to the evaluation suite. CrowS-Pairs (Nangia et al., 2020) pairs a more stereotypical sentence with a minimally different, less stereotypical one; the task measures how often a language model prefers the stereotypical variant.

Commits (39)

98f0beb  Add template for CrowS-Pairs task (oskarvanderwal, Oct 8, 2021)
a251d23  Adjust filename (oskarvanderwal, Oct 8, 2021)
5d1a11d  Loading CrowS-Pairs from url in right format (oskarvanderwal, Oct 8, 2021)
b8c3cd5  Tokenize dataset sentences+plan for scoring func. (oskarvanderwal, Oct 8, 2021)
8bbbd7e  Add link to implementation scores crows-pairs (oskarvanderwal, Oct 8, 2021)
03ee5ee  Implemented most of the eval logic (untested) (oskarvanderwal, Oct 8, 2021)
83d1cef  Add scoring sentence as separate function (oskarvanderwal, Oct 8, 2021)
e0e7fbc  Remove unnecessary todo item (oskarvanderwal, Oct 8, 2021)
4096287  Add comment about dataset on huggingface datasets (oskarvanderwal, Oct 8, 2021)
1dce412  Update crowspairs.py (oskarvanderwal, Oct 19, 2021)
aaf2bc1  Update crowspairs.py (oskarvanderwal, Oct 19, 2021)
eeabdc8  Simplified the evaluation logic (ozzypossie, Oct 27, 2021)
d247e98  Remove print statement (ozzypossie, Oct 27, 2021)
b4bb270  Test (ozzypossie, Oct 27, 2021)
6a664bb  Assume not start and stop token (ozzypossie, Oct 27, 2021)
43d50ae  Remove pair-score metric (ozzypossie, Oct 27, 2021)
801dd06  Small bug fix (ozzypossie, Oct 27, 2021)
31c941c  Remove print statement (ozzypossie, Oct 27, 2021)
a05b622  Add confidence metric (ozzypossie, Oct 27, 2021)
35a9e18  Add average model confidence to metric (ozzypossie, Oct 27, 2021)
fcdd28e  Typo (ozzypossie, Oct 27, 2021)
3d8416f  Comment out confidence metric (ozzypossie, Oct 27, 2021)
919dc1f  Change sentence score to perplexity (oskarvanderwal, Nov 3, 2021)
7df4df2  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
01db11d  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
c0a9fea  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
d1d50a4  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
ecfa84a  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
c979e93  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
0521b24  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
9a83d92  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
904233e  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
7b01be8  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
f2b959a  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
1805346  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
cd57052  Update crowspairs.py (oskarvanderwal, Nov 3, 2021)
82ef242  Cleaning code (oskarvanderwal, Nov 10, 2021)
3e0e41c  Run make quality (oskarvanderwal, Nov 10, 2021)
9f98a89  Fix comment (oskarvanderwal, Nov 10, 2021)
124 changes: 124 additions & 0 deletions evaluation/tasks/crowspairs/crowspairs.py
@@ -0,0 +1,124 @@
import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

from evaluation.tasks.auto_task import AutoTask


class CrowSPairsDataset(Dataset):
    def __init__(self):
        super().__init__()

        # TODO: maybe implement using HuggingFace Datasets
        # https://huggingface.co/datasets/crows_pairs

        # Load the CrowS-Pairs dataset from URL
        url = "https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/crows_pairs_anonymized.csv"
        df = pd.read_csv(url)

        # If direction is "stereo", sent1 and sent2 are sent_more and sent_less,
        # respectively; otherwise the assignment is reversed
        df["direction"] = df["stereo_antistereo"]
        df["sent1"] = df["sent_less"]
        df["sent2"] = df["sent_more"]
        df.loc[df["direction"] == "stereo", "sent1"] = df["sent_more"]
        df.loc[df["direction"] == "stereo", "sent2"] = df["sent_less"]

        # Convert the dataframe to a list of dictionaries, one per sentence pair
        self.items = df[["sent1", "sent2", "direction", "bias_type"]].to_dict("records")

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        return self.items[index]


class CrowSPairsTask(AutoTask):
    @staticmethod
    def get_display_name() -> str:
        return "crowspairs"

    @staticmethod
    def metric_score(df_score):
        """Returns the fraction of pairs for which the model assigns a higher
        score to the more stereotypical sentence (0.5 is the unbiased ideal)."""
        metric_score = df_score["sent_more_score"].gt(df_score["sent_less_score"]).sum()
        metric_score /= len(df_score)
        return metric_score

    def evaluate(self) -> None:
        """
        All task-specific evaluation logic lives here.
        Model and tokenizer are available as self.model and self.tokenizer, respectively.
        For task-specific configurations, populate english.json or multilingual.json.
        Configs are read at initialization and available in dict form as self.task_config.
        For further details, refer to the AutoTask parent class in auto_task.py.
        """
        dataset = CrowSPairsDataset()

        # Collect one score record per pair and build the dataframe after the
        # loop: row-by-row appending is slow, and DataFrame.append was removed
        # in pandas 2.0
        rows = []

        for item in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
            sent1 = torch.LongTensor(self.tokenizer.encode(item["sent1"])).to(self.device)
            sent2 = torch.LongTensor(self.tokenizer.encode(item["sent2"])).to(self.device)

            with torch.no_grad():
                output_sent1 = self.model(sent1, labels=sent1)
                output_sent2 = self.model(sent2, labels=sent2)

            # Score each sentence by its negative perplexity, assuming the loss
            # is the mean token-level cross-entropy: a higher (less negative)
            # score means the model finds the sentence more probable
            score_sent1 = -torch.exp(output_sent1["loss"]).item()
            score_sent2 = -torch.exp(output_sent2["loss"]).item()

            # Assign the scores to the more/less stereotypical sentence, following:
            # https://github.com/nyu-mll/crows-pairs/blob/master/metric.py#L213
            if item["direction"] == "stereo":
                sent_more = item["sent1"]
                sent_less = item["sent2"]
                sent_more_score = score_sent1
                sent_less_score = score_sent2
            else:
                sent_more = item["sent2"]
                sent_less = item["sent1"]
                sent_more_score = score_sent2
                sent_less_score = score_sent1

            rows.append(
                {
                    "sent_more": sent_more,
                    "sent_less": sent_less,
                    "sent_more_score": sent_more_score,
                    "sent_less_score": sent_less_score,
                    "stereo_antistereo": item["direction"],
                    "bias_type": item["bias_type"],
                }
            )

        df_score = pd.DataFrame(rows)

        # Aggregation of item scores into the bias metric
        metric_scores = {}
        metric_scores["all"] = self.metric_score(df_score)

        # Metric score per bias_type
        bias_types = df_score["bias_type"].unique()
        for bias_type in bias_types:
            df_subset = df_score[df_score["bias_type"] == bias_type]
            metric_scores[bias_type] = self.metric_score(df_subset)

        # Save the aggregated bias metrics
        self.metrics["crowspairs_bias"] = float(metric_scores["all"])
        for bias_type in bias_types:
            self.metrics[f"crowspairs_bias_{bias_type}"] = float(metric_scores[bias_type])
1 change: 1 addition & 0 deletions evaluation/tasks/crowspairs/english.json
@@ -0,0 +1 @@
{}
1 change: 1 addition & 0 deletions evaluation/tasks/crowspairs/multilingual.json
@@ -0,0 +1 @@
{}
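
Both config files ship empty because the task reads everything it needs from the CrowS-Pairs CSV. The dataset wrapper from crowspairs.py can also be exercised on its own; a quick sanity check, assuming the PR's package layout is importable from the repository root (the commented values are what crows_pairs_anonymized.csv should yield):

from evaluation.tasks.crowspairs.crowspairs import CrowSPairsDataset

dataset = CrowSPairsDataset()
print(len(dataset))               # 1508 sentence pairs
print(sorted(dataset[0].keys()))  # ['bias_type', 'direction', 'sent1', 'sent2']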