From 5b1a5b408b087d207884c01931714199b80e44c1 Mon Sep 17 00:00:00 2001
From: nzteb
Date: Thu, 4 Jun 2020 15:50:55 +0200
Subject: [PATCH] Add preprocess functionality for wn11

---
 data/download_all.sh    |  20 +++++---
 data/preprocess.py      | 111 +++++++++++++++++++++++++++-------------
 kge/config-default.yaml |   8 ---
 3 files changed, 89 insertions(+), 50 deletions(-)

diff --git a/data/download_all.sh b/data/download_all.sh
index adba8b274..ea63e0d9d 100755
--- a/data/download_all.sh
+++ b/data/download_all.sh
@@ -204,19 +204,25 @@ fi
 # wn11
 if [ ! -d "$BASEDIR/wn11" ]; then
-    echo Downloading wikidata5m
+    echo Downloading wn11
     cd $BASEDIR
+    # TODO the upstream zip also contains a __MACOSX folder, which is extracted even on non-mac systems
+    # TODO download file from Mannheim server
     curl -O https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wordnet11.zip
-    unzip wn11.zip
-    mv wordnet11/wordnet11 wn11
-    rm -r wordnet11/
+    unzip wordnet11.zip
+    if [ -d "__MACOSX" ]; then
+        rm -r __MACOSX
+    fi
+    mv wordnet11 wn11
     mv wn11/dev.txt wn11/valid.txt
 else
-    echo wikidata5m already present
+    echo wn11 already present
 fi
+
 if [ ! -f "$BASEDIR/wn11/dataset.yaml" ]; then
-    python preprocess.py wikidata5m
+    python preprocess.py wn11 --triple_class
 else
-    echo wikidata5m already prepared
+    echo wn11 already prepared
 fi
+
diff --git a/data/preprocess.py b/data/preprocess.py
index 43da59d02..3d2627528 100755
--- a/data/preprocess.py
+++ b/data/preprocess.py
@@ -7,7 +7,7 @@

 During preprocessing, each distinct entity name and each distinct relation name
 is assigned an index (dense). The index-to-object mapping is stored in files
-"entity_map.del" and "relation_map.del", resp. The triples (as indexes) are stored in
+"entity_ids.del" and "relation_ids.del", resp. The triples (as indexes) are stored in
 files "train.del", "valid.del", and "test.del". Metadata information is stored in a file
 "dataset.yaml".
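
Note: the *.del files named in the docstring above are plain tab-separated
text. A minimal sketch of reading them back (file names from the docstring;
the reader code itself is illustrative and not part of this patch):

    # Illustrative only: load the outputs of preprocess.py.
    # entity_ids.del / relation_ids.del hold "<index>\t<name>" lines;
    # train.del / valid.del / test.del hold "<s>\t<p>\t<o>" index triples.
    def load_map(path):
        with open(path) as f:
            return dict(line.rstrip("\n").split("\t", 1) for line in f)

    entities = load_map("wn11/entity_ids.del")     # index (str) -> entity name
    relations = load_map("wn11/relation_ids.del")  # index (str) -> relation name
    with open("wn11/train.del") as f:
        triples = [tuple(int(x) for x in line.split("\t")) for line in f]
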
@@ -36,11 +36,7 @@ def store_map(symbol_map, filename):
     print(f"Preprocessing {args.folder}...")
     raw_split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"}
     split_files = {"train": "train.del", "valid": "valid.del", "test": "test.del"}
-    split_files_label = {
-        "train_label": "train_label.del",
-        "valid_label": "valid_label.del",
-        "test_label": "test_label.del",
-    }
+
     string_files = {
         "entity_strings": "entity_strings.del",
         "relation_strings": "relation_strings.del",
@@ -50,6 +46,15 @@ def store_map(symbol_map, filename):
         "valid_without_unseen": "valid_without_unseen.del",
         "test_without_unseen": "test_without_unseen.del",
     }
+
+    if args.triple_class:
+        split_files_negatives = {
+            "valid_negatives": "valid_negatives.del",
+            "test_negatives": "test_negatives.del"}
+        split_files_negatives_without_unseen = {
+            "valid_negatives_without_unseen": "valid_negatives_without_unseen.del",
+            "test_negatives_without_unseen": "test_negatives_without_unseen.del"}
+
     split_sizes = {}

     if args.order_sop:
@@ -106,6 +111,15 @@ def store_map(symbol_map, filename):
                 ),
                 "w",
             )
+            if args.triple_class:
+                split_negatives_wo_unseen = f"{split}_negatives_without_unseen"
+                f_negatives_wo_unseen = open(
+                    os.path.join(
+                        args.folder,
+                        split_files_negatives_without_unseen[split_negatives_wo_unseen]
+                    ),
+                    "w"
+                )
         else:
             split_without_unseen = split + "_sample"
             f_tr_sample = open(
@@ -118,9 +132,34 @@ def store_map(symbol_map, filename):
                 split_sizes["train"], split_sizes["valid"], False
             )
         with open(os.path.join(args.folder, filename), "w") as f:
-            size_unseen = 0
+            if args.triple_class and split in ["valid", "test"]:
+                split_negatives = f"{split}_negatives"
+                f_negatives = open(
+                    os.path.join(
+                        args.folder,
+                        split_files_negatives[split_negatives],
+                    ),
+                    "w",
+                )
+
+            if args.triple_class:
+                size_negatives = 0
+                size_negatives_unseen = 0
+                # positives; valid and test sizes have to be recalculated
+                size_positives = 0
+                size_positives_unseen = 0
+            else:
+                size_positives_unseen = 0
             for n, t in enumerate(raw[split]):
-                f.write(
+                if args.triple_class and split in ["valid", "test"] and int(t[3]) == -1:
+                    file_wrapper = f_negatives
+                    size_negatives += 1
+                elif args.triple_class and split in ["valid", "test"]:
+                    size_positives += 1
+                    file_wrapper = f
+                else:
+                    file_wrapper = f
+                file_wrapper.write(
                     str(entities[t[S]])
                     + "\t"
                     + str(relations[t[P]])
@@ -137,14 +176,22 @@ def store_map(symbol_map, filename):
                     + str(entities[t[O]])
                     + "\n"
                 )
-                size_unseen += 1
+                size_positives_unseen += 1
             elif (
                 split in ["valid", "test"]
                 and t[S] in entities_in_train
                 and t[O] in entities_in_train
                 and t[P] in relations_in_train
             ):
-                f_wo_unseen.write(
+
+                if args.triple_class and int(t[3]) == -1:
+                    file_wrapper = f_negatives_wo_unseen
+                    size_negatives_unseen += 1
+                else:
+                    file_wrapper = f_wo_unseen
+                    size_positives_unseen += 1
+
+                file_wrapper.write(
                     str(entities[t[S]])
                     + "\t"
                     + str(relations[t[P]])
@@ -152,27 +199,11 @@ def store_map(symbol_map, filename):
                     + str(entities[t[O]])
                     + "\n"
                 )
-                size_unseen += 1
-            without_unseen_sizes[split_without_unseen] = size_unseen
-    if args.triple_class:
-        for split, filename in split_files_label.items():
-            if split in ["valid", "test"]:
-                split_without_unseen = split + "_without_unseen"
-                f_wo_unseen = open(
-                    os.path.join(
-                        args.folder, split_files_without_unseen[split_without_unseen]
-                    ),
-                    "w",
-                )
-            with open(os.path.join(args.folder, filename), "w") as f:
-                for n, t in enumerate(raw[split]):
-                    f.write(t[4] + "\n")
-                    if (
-                        t[S] in entities_in_train
-                        and t[O] in entities_in_train
-                        and t[P] in relations_in_train
-                    ):
-                        f_wo_unseen.write(t[4] + "\n")
+            if args.triple_class and split in ["valid", "test"]:
+                without_unseen_sizes[split_negatives_wo_unseen] = size_negatives_unseen
+                split_sizes[split] = size_positives
+                split_sizes[split_negatives] = size_negatives
+            without_unseen_sizes[split_without_unseen] = size_positives_unseen

     # write config
     print("Writing dataset.yaml...")
@@ -193,10 +224,20 @@ def store_map(symbol_map, filename):
         dataset_config[f"files.{split}.type"] = "triples"
         dataset_config[f"files.{split}.size"] = without_unseen_sizes.get(split)
     if args.triple_class:
-        for split in split_files_label.keys():
-            dataset_config[f"files.{split}.filename"] = split_files_label.get(split)
-            dataset_config[f"files.{split}.type"] = "label"
-            dataset_config[f"files.{split}.size"] = split_sizes.get(split)
+        for split in split_files_negatives.keys():
+            dataset_config[f"files.{split}.filename"] = split_files_negatives.get(split)
+            dataset_config[f"files.{split}.type"] = "triples"
+            dataset_config[f"files.{split}.size"] = split_sizes[split]
+
+        for split in split_files_negatives_without_unseen.keys():
+            dataset_config[f"files.{split}.filename"] = split_files_negatives_without_unseen.get(
+                split)
+            dataset_config[f"files.{split}.type"] = "triples"
+            dataset_config[f"files.{split}.size"] = without_unseen_sizes[
+                split]
+
+
+
     for string in string_files.keys():
         if os.path.exists(os.path.join(args.folder, string_files[string])):
             dataset_config[f"files.{string}.filename"] = string_files.get(string)
diff --git a/kge/config-default.yaml b/kge/config-default.yaml
index c1a57ae76..08ca70681 100644
--- a/kge/config-default.yaml
+++ b/kge/config-default.yaml
@@ -393,15 +393,7 @@ valid:

   # Name of the trace entry that holds the validation metric (higher value is
   # better)
-<<<<<<< HEAD
-<<<<<<< HEAD
   metric: mean_reciprocal_rank_filtered_with_test
-=======
-  metric: mean_reciprocal_rank_filtered  # Accuracy for triple_classification
->>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train)
-=======
-  metric: mean_reciprocal_rank_filtered  # accuracy for triple_classification
->>>>>>> Moved sampling function to sampler.py, updated code documentation

   # If the above metric is not present in trace (e.g., because a custom metric
   # should be used), a Python expression to compute the metric. Can refer to
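
Note: the heart of the preprocess.py change is the label-based routing in the
main loop. With --triple_class, each raw wn11 valid/test line carries a fourth
column that is 1 for a positive and -1 for a negative triple; negatives are
written to separate {split}_negatives.del files, which dataset.yaml then
registers as files.valid_negatives.* / files.test_negatives.* entries of type
"triples". A standalone sketch of that routing (toy triples and simplified
bookkeeping, not the patch code itself):

    # Toy illustration of the --triple_class routing: split labeled raw
    # triples (s, p, o, label) into positives and negatives.
    raw_valid = [
        ("dog", "_type_of", "animal", "1"),
        ("dog", "_type_of", "mineral", "-1"),
    ]
    positives, negatives = [], []
    for t in raw_valid:
        if int(t[3]) == -1:          # label column: -1 marks a negative
            negatives.append(t[:3])  # would go to valid_negatives.del
        else:
            positives.append(t[:3])  # would go to valid.del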