From 49ed5ef2ae00ed240bc573527a9922da05d26b21 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Mon, 29 Jul 2019 14:32:34 +0200 Subject: [PATCH 01/19] Implemented triple classification --- examples/toy-complex-train-tripleclass.yaml | 16 ++ kge/job/__init__.py | 1 + kge/job/eval.py | 6 +- kge/job/triple_classification.py | 246 ++++++++++++++++++++ 4 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 examples/toy-complex-train-tripleclass.yaml create mode 100644 kge/job/triple_classification.py diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml new file mode 100644 index 000000000..864f2280f --- /dev/null +++ b/examples/toy-complex-train-tripleclass.yaml @@ -0,0 +1,16 @@ +job.type: train +dataset.name: toy +model: distmult +train: + optimizer: Adagrad + optimizer_args: + lr: 0.2 + weight_decay: 0.4e-7 +lookup_embedder.dim: 100 +#lookup_embedder.initialize: normal_ +lookup_embedder.initialize: xavier_uniform_ +eval.type: triple_classification +valid.metric: Accuracy +eval.thresholds: valid +eval.test: test + diff --git a/kge/job/__init__.py b/kge/job/__init__.py index c3bee8a37..de00c257b 100644 --- a/kge/job/__init__.py +++ b/kge/job/__init__.py @@ -9,3 +9,4 @@ from kge.job.ax_search import AxSearchJob from kge.job.entity_ranking import EntityRankingJob from kge.job.entity_pair_ranking import EntityPairRankingJob +from kge.job.triple_classification import TripleClassificationJob diff --git a/kge/job/eval.py b/kge/job/eval.py index 138726728..d97e838a9 100644 --- a/kge/job/eval.py +++ b/kge/job/eval.py @@ -72,7 +72,7 @@ def __init__(self, config, dataset, parent_job, model): @staticmethod def create(config, dataset, parent_job=None, model=None): """Factory method to create an evaluation job """ - from kge.job import EntityRankingJob, EntityPairRankingJob + from kge.job import EntityRankingJob, EntityPairRankingJob, TripleClassificationJob # create the job if config.get("eval.type") == "entity_ranking": @@ -81,6 +81,10 @@ def create(config, dataset, parent_job=None, model=None): return EntityPairRankingJob( config, dataset, parent_job=parent_job, model=model ) + elif config.get("eval.type") == "triple_classification": + return TripleClassificationJob( + config, dataset, parent_job=parent_job, model=model + ) else: raise ValueError("eval.type") diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py new file mode 100644 index 000000000..457f26836 --- /dev/null +++ b/kge/job/triple_classification.py @@ -0,0 +1,246 @@ +import time +import random + +import torch +from sklearn.metrics import accuracy_score, precision_score +from kge.job import EvaluationJob + + +class TripleClassificationJob(EvaluationJob): + """Triple classification evaluation protocol: + Testing model's ability to discriminate between true and false triples based on scores. Introduces a treshold for + each relation. Unseen triples will be predicted as True if the score is higher than the treshold. + Todo: Get rid of as many for loops as possible to make the evaluation faster!! 
+ """ + def __init__(self, config, dataset, parent_job, model): + super().__init__(config, dataset, parent_job, model) + self.threshold_data = self.config.get("eval.thresholds") + self.eval_data = self.config.get("eval.test") #Todo: Use eval.data and delete eval.test in configuration (didnt work for some reason) + self.is_prepared = False + + def _prepare(self): + """Load specified data.""" + + if self.is_prepared: + return + + # Set test dataset + if self.eval_data == "test": + self.eval = self.dataset.test + else: + self.eval = self.dataset.valid + + # Set dataset for which thresholds are found + if self.threshold_data == "valid": + self.threshold = self.dataset.valid + else: self.threshold = self.dataset.train + + # let the model add some hooks, if it wants to do so + self.model.prepare_job(self) + self.is_prepared = True + + def run(self): + """1. Generation of (corrupted) negative triples: + Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. + Allow only entities which appeared at the given position in the dataset + 2. Get scores for the corrupted datasets + 3. Find the best threshold for every relation by maximizing accuracy on validation data + 4. Classify triples in test data + 5. Compute Metrics for test data + 6. Trace & Log + """ + self._prepare() + + was_training = self.model.training #Todo-Question: Copied that from entity ranking but don't know if it is needed + self.model.eval() + + self.config.log("Starting triple classification...") + epoch_time = -time.time() + + # 1. Generate corrupted data. Output: triples, labels, labels per relation + self.config.log("Generate corrupted datasets...") + valid_corrupted, valid_labels, rel_valid_labels = self._generate_negatives(self.threshold) + test_corrupted, test_labels, rel_test_labels = self._generate_negatives(self.eval) + + # 2. Get scores for the new data. Relevant Output: Scores and scores per relation + self.config.log("Get scores for datasets...") + s_valid, p_valid, o_valid = valid_corrupted[:, 0], valid_corrupted[:, 1], valid_corrupted[:, 2] + valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) + rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} + + s_test, p_test, o_test = test_corrupted[:, 0], test_corrupted[:, 1], test_corrupted[:, 2] + test_scores = self.model.score_spo(s_test, p_test, o_test) + rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} + + # 3. Find the best thresholds for every relation and their accuracies on the valid data + self.config.log("Learning thresholds on " + self.threshold_data + " data.") + rel_thresholds, accuracies_valid = self.findThresholds(p_valid, rel_valid_scores, rel_valid_labels) + + # 4. Classification on test data. Output: predictions per relation and number of relations in test which are + # not included in valid + self.config.log("Evaluating on " + self.eval_data + " data.") + self.config.log("Predict...") + rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) + + # 5. Report Metrics on test data + self.config.log("Classification results:") + metrics = self._compute_metrics(rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) + + # 6. 
Trace & Log + + epoch_time += time.time() + # compute trace + trace_entry = dict( + type="triple_classification", + scope="epoch", + data_learn_thresholds=self.threshold_data, + data_evaluate=self.eval_data, + epoch=self.epoch, + size=2*len(self.eval), + epoch_time=epoch_time, + **metrics, + ) + for f in self.post_epoch_trace_hooks: + f(self, trace_entry) + + # if validation metric is not present, try to compute it + metric_name = self.config.get("valid.metric") + if metric_name not in trace_entry: + trace_entry[metric_name] = eval( + self.config.get("valid.metric_expr"), + None, + {"config": self.config, **trace_entry}, + ) + + # write out trace + trace_entry = self.trace(**trace_entry, echo=True, echo_prefix=" ", log=True) + + # reset model and return metrics + if was_training: + self.model.train() + self.config.log("Finished evaluating on " + self.eval_data + " data.") + + return trace_entry + # Todo-Question: Not sure if what is included in the trace is correct or enough. Feedback needed. + + def _generate_negatives(self, dataset): + # 1. Corrupt triples + labels = [] + corrupted = [] + for triple in dataset: + corrupted.append(triple) + labels.append(1) + # Random decision if sample subject(False) or object(True) + if bool(random.getrandbits(1))==True: + s = corrupted[-1][0] + p = corrupted[-1][1] + o = random.sample(list(dataset[:,2]), 1)[0] + # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset + while int(s)==int(o) \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train\ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid\ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: + o = random.sample(list(dataset[:,2]), 1)[0] + else: + s = random.sample(list(dataset[:,0]), 1)[0] + p = corrupted[-1][1] + o = corrupted[-1][2] + # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset + while int(s) == int(o) \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: + o = random.sample(list(dataset[:,0]), 1)[0] + + corrupted.append(torch.tensor([s, p, o], dtype=torch.int32)) + labels.append(0) + corrupted = torch.stack(corrupted) + + # TODO-Question: Would it make sense to use and modify util.sampler for that task? + # TODO-Question: Right now we allow only samples at the position where they appeared and only from the same dataset as specified. + # Would it make sense to allow to sample from all three available datasets? 
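One possible shape for such a sampling step, sketched here for illustration only (the helper name, its signature and the `known` set of true (s, p, o) tuples are assumptions, not code from this patch): replacements are drawn in bulk and only the rare collisions are re-drawn, which avoids most of the per-triple Python work and makes the "not a true triple" check explicit via a set of tuples.

import torch

def corrupt_once(dataset: torch.Tensor, known: set) -> torch.Tensor:
    """Corrupt each triple once by replacing either its subject or its object.

    dataset: (N, 3) integer tensor of true triples
    known:   set of all true (s, p, o) tuples from train/valid/test
    """
    n = dataset.size(0)
    corrupted = dataset.clone()
    subj_pool, obj_pool = dataset[:, 0].unique(), dataset[:, 2].unique()
    corrupt_subj = torch.rand(n) < 0.5  # per triple: corrupt subject or object?

    # draw all replacements at once
    corrupted[corrupt_subj, 0] = subj_pool[torch.randint(len(subj_pool), (int(corrupt_subj.sum()),))]
    corrupted[~corrupt_subj, 2] = obj_pool[torch.randint(len(obj_pool), (int((~corrupt_subj).sum()),))]

    # re-draw only the few corrupted triples that are still true somewhere or have s == o
    for i in range(n):
        s, p, o = (int(x) for x in corrupted[i])
        while s == o or (s, p, o) in known:
            if corrupt_subj[i]:
                s = int(subj_pool[torch.randint(len(subj_pool), (1,))])
            else:
                o = int(obj_pool[torch.randint(len(obj_pool), (1,))])
        corrupted[i, 0], corrupted[i, 2] = s, o
    return corrupted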
+ + # Save the labels per relation, since this will be needed frequently later + p = corrupted[:, 1] + rel_labels = {int(r): [labels[int((p == r).nonzero()[i])] + for i in range(len((p == r).nonzero()))] for r in p.unique()} + + return corrupted, labels, rel_labels + + def findThresholds(self, p, rel_scores, rel_labels): + # Initialize accuracies, thresholds (and predictions) + rel_accuracies = {int(r): -1 for r in p.unique()} + rel_thresholds = {int(r): 0 for r in p.unique()} +# rel_predictions = {int(r): 0 for r in p.unique()} + + # Find best thresholds + for r in p.unique(): + for t in rel_scores[int(r)]: + preds = torch.zeros(len((p == r).nonzero())) + for i in range(len(rel_scores[int(r)])): + if rel_scores[int(r)][i] >= t: + preds[i] = 1 + accuracy = accuracy_score(rel_labels[int(r)], preds) + if accuracy > rel_accuracies[int(r)]: + rel_accuracies[int(r)] = accuracy + rel_thresholds[int(r)] = float(t) + #rel_predictions[int(r)] = preds + + return rel_thresholds, rel_accuracies + + def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + + rel_predictions = {int(r):[0]*len(rel_scores[int(r)]) for r in p_test.unique()} + + # Set counter for triples for which the relation is not in valid data + not_in_eval = [] + for r in p_test.unique(): + # Check if relation which is in valid data also is in test data + if r in p_valid.unique(): + # Predict + for i in range(len(rel_scores[int(r)])): + if float(rel_scores[int(r)][i]) >= rel_thresholds[int(r)]: + rel_predictions[int(r)][i] = 1 + else: not_in_eval.append(r) + + return rel_predictions, not_in_eval + + def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval): + metrics = {} + + labels_in_test_list = [i + for r in p_test.unique() + for i in rel_test_labels[int(r)]] + + pred_list = [i + for r in p_test.unique() + for i in rel_predictions[int(r)]] + + + metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) + metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) + + precision_per_r = {} + accuracy_per_r = {} + for r in p_test.unique(): + precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + # Todo: Find out what the warning "UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. + # 'precision', 'predicted', average, warn_for)" is. + metrics["Accuracy_per_Relation"] = accuracy_per_r + + metrics["Precision_Per_Relation"] = precision_per_r + + # Since we evaluate on test data, only the relations in the test data which cannot be evaluated are counted here. + # In general we miss more than teh half of the existing relations for toy data, because they are not in test/valid. + metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) + + return metrics + + # TODO-Question: We optimized the tresholds only for one randomly corrupted sample of the data. + # Another sample would give (a little) different results due to a different threshold. + # I would probably optimize the thresholds for different samples and in the end take something like the mean of all + # thresholds as final threshold, but in the literature, it seems like they really corrupt the data only once. 
+ # Anyway for comparison of models, we have to pay attention to use the same data samples.Thus it might be better to + # create and save a dataset with negative labels and use always the same for all models. + # Any feedback on this? \ No newline at end of file From a6aec4fb637dbf99c06d4d4ba41aead75b3383a2 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Tue, 20 Aug 2019 12:26:30 +0200 Subject: [PATCH 02/19] got rid of unnecessary codelines, Improved classification time to ~15sec on fb15k, implemented an alternative way to find the thresholds, different slight changes --- examples/toy-transe-train-tripleclass.yaml | 58 ++++++ kge/job/triple_classification.py | 195 ++++++++++----------- 2 files changed, 155 insertions(+), 98 deletions(-) create mode 100644 examples/toy-transe-train-tripleclass.yaml diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml new file mode 100644 index 000000000..2119bc1ab --- /dev/null +++ b/examples/toy-transe-train-tripleclass.yaml @@ -0,0 +1,58 @@ +job: + device: cuda + type: train + +model: transe + +dataset: + name: fb15k + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 0.2 + max_epochs: 200 + optimizer: Adagrad + optimizer_args: + lr: 0.01 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_weight: 1.e-05 + l_norm: 1. diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 457f26836..643dacb9d 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,5 +1,7 @@ import time import random +import itertools +from copy import deepcopy import torch from sklearn.metrics import accuracy_score, precision_score @@ -10,34 +12,17 @@ class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol: Testing model's ability to discriminate between true and false triples based on scores. Introduces a treshold for each relation. Unseen triples will be predicted as True if the score is higher than the treshold. - Todo: Get rid of as many for loops as possible to make the evaluation faster!! + """ def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) - self.threshold_data = self.config.get("eval.thresholds") - self.eval_data = self.config.get("eval.test") #Todo: Use eval.data and delete eval.test in configuration (didnt work for some reason) - self.is_prepared = False - - def _prepare(self): - """Load specified data.""" - - if self.is_prepared: - return - - # Set test dataset - if self.eval_data == "test": - self.eval = self.dataset.test - else: - self.eval = self.dataset.valid - # Set dataset for which thresholds are found - if self.threshold_data == "valid": - self.threshold = self.dataset.valid - else: self.threshold = self.dataset.train + # 1. Generate corrupted data. 
Output: triples, labels, labels per relation + self.config.log("Generate corrupted datasets...") + # Create the corrupted triples while creating the evaluation Job to make sure that every epoch is evaluated on the same data + self.valid_corrupted, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) + self.test_corrupted, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) - # let the model add some hooks, if it wants to do so - self.model.prepare_job(self) - self.is_prepared = True def run(self): """1. Generation of (corrupted) negative triples: @@ -49,42 +34,38 @@ def run(self): 5. Compute Metrics for test data 6. Trace & Log """ - self._prepare() - was_training = self.model.training #Todo-Question: Copied that from entity ranking but don't know if it is needed + was_training = self.model.training self.model.eval() self.config.log("Starting triple classification...") epoch_time = -time.time() - # 1. Generate corrupted data. Output: triples, labels, labels per relation - self.config.log("Generate corrupted datasets...") - valid_corrupted, valid_labels, rel_valid_labels = self._generate_negatives(self.threshold) - test_corrupted, test_labels, rel_test_labels = self._generate_negatives(self.eval) + # 1. Generate corrupted data - already done # 2. Get scores for the new data. Relevant Output: Scores and scores per relation self.config.log("Get scores for datasets...") - s_valid, p_valid, o_valid = valid_corrupted[:, 0], valid_corrupted[:, 1], valid_corrupted[:, 2] + s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} - s_test, p_test, o_test = test_corrupted[:, 0], test_corrupted[:, 1], test_corrupted[:, 2] + s_test, p_test, o_test = self.test_corrupted[:, 0], self.test_corrupted[:, 1], self.test_corrupted[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} # 3. Find the best thresholds for every relation and their accuracies on the valid data - self.config.log("Learning thresholds on " + self.threshold_data + " data.") - rel_thresholds, accuracies_valid = self.findThresholds(p_valid, rel_valid_scores, rel_valid_labels) + self.config.log("Learning thresholds on validation data.") + rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) # 4. Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid - self.config.log("Evaluating on " + self.eval_data + " data.") + self.config.log("Evaluating on test data.") self.config.log("Predict...") - rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) + rel_predictions, not_in_eval = self.predict(rel_thresholds, test_scores, rel_test_scores, p_valid, p_test) # 5. Report Metrics on test data self.config.log("Classification results:") - metrics = self._compute_metrics(rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) + metrics = self._compute_metrics(self.rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) # 6. 
Trace & Log @@ -93,10 +74,10 @@ def run(self): trace_entry = dict( type="triple_classification", scope="epoch", - data_learn_thresholds=self.threshold_data, - data_evaluate=self.eval_data, + data_learn_thresholds="Valid", + data_evaluate="Test", epoch=self.epoch, - size=2*len(self.eval), + size=2*len(self.dataset.valid), epoch_time=epoch_time, **metrics, ) @@ -125,40 +106,33 @@ def run(self): def _generate_negatives(self, dataset): # 1. Corrupt triples - labels = [] - corrupted = [] - for triple in dataset: - corrupted.append(triple) - labels.append(1) - # Random decision if sample subject(False) or object(True) - if bool(random.getrandbits(1))==True: - s = corrupted[-1][0] - p = corrupted[-1][1] - o = random.sample(list(dataset[:,2]), 1)[0] - # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset - while int(s)==int(o) \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train\ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid\ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: - o = random.sample(list(dataset[:,2]), 1)[0] - else: - s = random.sample(list(dataset[:,0]), 1)[0] - p = corrupted[-1][1] - o = corrupted[-1][2] - # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset - while int(s) == int(o) \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: - o = random.sample(list(dataset[:,0]), 1)[0] - - corrupted.append(torch.tensor([s, p, o], dtype=torch.int32)) - labels.append(0) - corrupted = torch.stack(corrupted) - - # TODO-Question: Would it make sense to use and modify util.sampler for that task? - # TODO-Question: Right now we allow only samples at the position where they appeared and only from the same dataset as specified. - # Would it make sense to allow to sample from all three available datasets? + corrupted = dataset.repeat(1, 2).view(-1, 3) + labels = torch.as_tensor([1, 0] * len(dataset)) + + sample = torch.randint(0,2,(1,len(dataset))) + + # Random decision if sample subject(sample=nonzero) or object(sample=zero) + corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ + torch.as_tensor(random.sample( + list(map(int, dataset[:, 0])), len(corrupted[1::2][:, 0][sample.nonzero()[:, 1]])), dtype=torch.int32) + + corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ + torch.as_tensor(random.sample( + list(map(int, dataset[:, 2])), len(corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]])), dtype=torch.int32) + + # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset + for i in range(len(corrupted[1::2])): + while int(corrupted[1::2][i][0]) == int(corrupted[1::2][i][2]) \ + and corrupted[1::2][i] in self.dataset.train \ + and corrupted[1::2][i] in self.dataset.valid \ + and corrupted[1::2][i] in self.dataset.test: + if bool(random.getrandbits(1)) == True: + corrupted[1::2][i][2] = random.sample(list(dataset[:, 2]), 1)[0] + else: + corrupted[1::2][i][0] = random.sample(list(dataset[:, 0]), 1)[0] + + # TODO: Create a function in util.sampler for that task. Then: Allow to choose from which entities to sample + # (e.g. from test, train and valid entities instead of only valid. 
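As a starting point for such a util.sampler helper (name and signature are illustrative assumptions, not an existing function in the code base), the candidate pools could be built once from whichever splits are requested:

import torch

def build_entity_pools(splits, positions=(0, 2)):
    """Collect the entities seen at each triple position across the given splits.

    splits:    list of (N, 3) tensors, e.g. [train, valid, test]
    positions: which columns to build pools for (0 = subject, 2 = object)
    Returns a dict mapping position -> 1-D tensor of unique entity ids.
    """
    return {
        pos: torch.cat([split[:, pos] for split in splits]).unique()
        for pos in positions
    }

A corruption routine could then draw replacement subjects from pools[0] and replacement objects from pools[2], independent of which split is being corrupted.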
# Save the labels per relation, since this will be needed frequently later p = corrupted[:, 1] @@ -167,30 +141,62 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels - def findThresholds(self, p, rel_scores, rel_labels): + def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): + """Method 1: Threshold is always one of the scores""" # Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} -# rel_predictions = {int(r): 0 for r in p.unique()} - # Find best thresholds + valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + + for r in p.unique(): - for t in rel_scores[int(r)]: - preds = torch.zeros(len((p == r).nonzero())) - for i in range(len(rel_scores[int(r)])): - if rel_scores[int(r)][i] >= t: - preds[i] = 1 - accuracy = accuracy_score(rel_labels[int(r)], preds) + #Predict + current_rel = (valid_data[:, 1] == r) + true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + preds = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) + accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] + + rel_accuracies[int(r)] = max(accuracy) + # Todo: Sometimes different scores can be the largest. Add condition, that always the largest/smalles/something else score that gives the maximum accuracy is chosen + rel_thresholds[int(r)] = rel_scores[int(r)][accuracy.index(max(accuracy))] + + + """Method 2: Search for best threshold in an interval + https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py + # Initialize accuracies, thresholds (and predictions) + min_score = valid_scores.min() + max_score = valid_scores.max() + + rel_accuracies = {int(r): -1 for r in p.unique()} + rel_thresholds = {int(r): min_score for r in p.unique()} + + score = min_score + + # ORiginal implementation uses an interval 0.01, implemented for NTN model. 
In general the interval imo should depend on the range of the score values of the model + # Suggestion: float((max_score-min_score)/len(valid_scores)) + interval = float((max_score-min_score)/len(valid_scores)) + valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + + while(score<=max_score): + for r in p.unique(): + #Predict + current_rel = (valid_data[:, 1] == r) + true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) + accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) + if accuracy > rel_accuracies[int(r)]: rel_accuracies[int(r)] = accuracy - rel_thresholds[int(r)] = float(t) - #rel_predictions[int(r)] = preds + rel_thresholds[int(r)] = score.clone() + score += interval + """ return rel_thresholds, rel_accuracies - def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): - rel_predictions = {int(r):[0]*len(rel_scores[int(r)]) for r in p_test.unique()} + rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])) for r in p_test.unique()} # Set counter for triples for which the relation is not in valid data not_in_eval = [] @@ -216,7 +222,7 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no for r in p_test.unique() for i in rel_predictions[int(r)]] - + # Todo: Calculate accuracy and precision instead of using sklearn function metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) @@ -225,22 +231,15 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no for r in p_test.unique(): precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - # Todo: Find out what the warning "UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. - # 'precision', 'predicted', average, warn_for)" is. + metrics["Accuracy_per_Relation"] = accuracy_per_r metrics["Precision_Per_Relation"] = precision_per_r - # Since we evaluate on test data, only the relations in the test data which cannot be evaluated are counted here. - # In general we miss more than teh half of the existing relations for toy data, because they are not in test/valid. + metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) return metrics - # TODO-Question: We optimized the tresholds only for one randomly corrupted sample of the data. - # Another sample would give (a little) different results due to a different threshold. - # I would probably optimize the thresholds for different samples and in the end take something like the mean of all - # thresholds as final threshold, but in the literature, it seems like they really corrupt the data only once. - # Anyway for comparison of models, we have to pay attention to use the same data samples.Thus it might be better to - # create and save a dataset with negative labels and use always the same for all models. - # Any feedback on this? \ No newline at end of file + # TODO-Question: We optimized the thresholds only for one randomly corrupted sample of the data. + # Another sample would give (a little) different results. 
How can we ǵet reproduceable results? From af64a5476a9d6d0d89eccd47aead1012edd881f9 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Thu, 22 Aug 2019 17:03:49 +0200 Subject: [PATCH 03/19] Integrate _prepare function, delete conditions in generate_negatives which were not used in other implementations, include unique condition while sampling negatives from list to ensure same probability, Change find_thresholds so that the smallest score which gives the highest accuracy is used as threshold --- examples/toy-transe-train-tripleclass.yaml | 17 +-- kge/job/triple_classification.py | 155 +++++++++++---------- 2 files changed, 92 insertions(+), 80 deletions(-) diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index 2119bc1ab..5cf81a2b7 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -5,7 +5,7 @@ job: model: transe dataset: - name: fb15k + name: toy train: batch_size: 256 @@ -19,24 +19,21 @@ train: negative_sampling: num_negatives_o: 3 - num_negatives_p: 0 num_negatives_s: 3 sampling_type: uniform valid: early_stopping.patience: 5 every: 5 - filter_with_test: True metric: Accuracy eval: - batch_size: 512 type: triple_classification transe: class_name: TransE entity_embedder: - dim: 100 + dim: 128 initialize: uniform_ initialize_args: uniform_ : @@ -44,9 +41,11 @@ transe: sparse: false type: lookup_embedder regularize: l2 - regularize_weight: 1.e-05 + regularize_args: + weight: 1.e-05 + weighted: False relation_embedder: - dim: 100 + dim: 128 initialize: uniform_ initialize_args: uniform_ : @@ -54,5 +53,7 @@ transe: sparse: false type: lookup_embedder regularize: l2 - regularize_weight: 1.e-05 + regularize_args: + weight: 1.e-05 + weighted: False l_norm: 1. diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 643dacb9d..54a1185f6 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,7 +1,5 @@ import time import random -import itertools -from copy import deepcopy import torch from sklearn.metrics import accuracy_score, precision_score @@ -9,31 +7,49 @@ class TripleClassificationJob(EvaluationJob): - """Triple classification evaluation protocol: - Testing model's ability to discriminate between true and false triples based on scores. Introduces a treshold for - each relation. Unseen triples will be predicted as True if the score is higher than the treshold. - + """Triple classification evaluation protocol. + + Testing model's ability to discriminate between true and false triples based on scores. Introduces a threshold for + each relation. Unseen triples will be predicted as True if the score is higher than the threshold. Procedure: + + 1. Generation of (corrupted) negative triples: + Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. + Allow only entities which appeared at the given position in the dataset + 2. Get scores for the corrupted datasets + 3. Find the best threshold for every relation by maximizing accuracy on validation data + 4. Classify triples in test data + 5. Compute Metrics for test data + 6. 
Report metrics in Trace + # Todo: Check where it is necessary to add .to(self.device) to created tensors + # Todo: Change comments to fit the standard guidelines + # Todo: Find out if it makes sense to use a dataloader with the relations as batches with a _collate function + # Todo: Check all datatypes and make them consistent where possible """ + def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) + self.is_prepared = False + + def _prepare(self): + """Construct the datasets needed.""" - # 1. Generate corrupted data. Output: triples, labels, labels per relation + if self.is_prepared: + return + + # 1. Generate corrupted data self.config.log("Generate corrupted datasets...") # Create the corrupted triples while creating the evaluation Job to make sure that every epoch is evaluated on the same data self.valid_corrupted, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) self.test_corrupted, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + # let the model add some hooks, if it wants to do so + self.model.prepare_job(self) + self.is_prepared = True def run(self): - """1. Generation of (corrupted) negative triples: - Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. - Allow only entities which appeared at the given position in the dataset - 2. Get scores for the corrupted datasets - 3. Find the best threshold for every relation by maximizing accuracy on validation data - 4. Classify triples in test data - 5. Compute Metrics for test data - 6. Trace & Log - """ + """Runs the triple classification job.""" + + self._prepare() was_training = self.model.training self.model.eval() @@ -41,8 +57,6 @@ def run(self): self.config.log("Starting triple classification...") epoch_time = -time.time() - # 1. Generate corrupted data - already done - # 2. Get scores for the new data. 
Relevant Output: Scores and scores per relation self.config.log("Get scores for datasets...") s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] @@ -109,27 +123,20 @@ def _generate_negatives(self, dataset): corrupted = dataset.repeat(1, 2).view(-1, 3) labels = torch.as_tensor([1, 0] * len(dataset)) + # Random decision if sample subject(sample=nonzero) or object(sample=zero) sample = torch.randint(0,2,(1,len(dataset))) - # Random decision if sample subject(sample=nonzero) or object(sample=zero) + # Sample subjects from subjects which appeared in the dataset corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ - torch.as_tensor(random.sample( - list(map(int, dataset[:, 0])), len(corrupted[1::2][:, 0][sample.nonzero()[:, 1]])), dtype=torch.int32) + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32) + # Sample objects from objects which appeared in the dataset corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ - torch.as_tensor(random.sample( - list(map(int, dataset[:, 2])), len(corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]])), dtype=torch.int32) - - # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset - for i in range(len(corrupted[1::2])): - while int(corrupted[1::2][i][0]) == int(corrupted[1::2][i][2]) \ - and corrupted[1::2][i] in self.dataset.train \ - and corrupted[1::2][i] in self.dataset.valid \ - and corrupted[1::2][i] in self.dataset.test: - if bool(random.getrandbits(1)) == True: - corrupted[1::2][i][2] = random.sample(list(dataset[:, 2]), 1)[0] - else: - corrupted[1::2][i][0] = random.sample(list(dataset[:, 0]), 1)[0] + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32) + + # Todo: Add condition that corrupted triple!=original triple # TODO: Create a function in util.sampler for that task. Then: Allow to choose from which entities to sample # (e.g. from test, train and valid entities instead of only valid. @@ -142,8 +149,16 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): + # Todo: Check if methods are equivalent + #Todo-Question: Method 1 is what seems reasonable for me, Method 2 is the reimplementation of the NTNH Paper of Socher et al. 2013. + # Method 1 is much faster and delivers equally good results. Since the threshold entirely is determined by the valid_scores + # and is a cut between them, the best threshold in terms of valid data is any value between two specific score values. + # Thus I assume, that we can just use one of these score values as the threshold, since we can't know better anyway. + # Is this thought correct? + # If not and Method 2 has to be used, how can it be fastened up? + """Method 1: Threshold is always one of the scores""" - # Initialize accuracies, thresholds (and predictions) + #Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} @@ -158,40 +173,39 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] rel_accuracies[int(r)] = max(accuracy) - # Todo: Sometimes different scores can be the largest. 
Add condition, that always the largest/smalles/something else score that gives the maximum accuracy is chosen - rel_thresholds[int(r)] = rel_scores[int(r)][accuracy.index(max(accuracy))] + # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent with original implementation + rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))]) + + # #Method 2: Search for best threshold in an interval + # #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py + # # Initialize accuracies, thresholds (and predictions) + # min_score = valid_scores.min() + # max_score = valid_scores.max() + # + # rel_accuracies = {int(r): -1 for r in p.unique()} + # rel_thresholds = {int(r): min_score for r in p.unique()} + # + # score = min_score + # + # # ORiginal implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should depend on the range of the score values of the model + # # Suggestion: float((max_score-min_score)/len(valid_scores)) + # interval = float((max_score-min_score)/len(valid_scores)) + # valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + # + # while(score<=max_score): + # for r in p.unique(): + # #Predict + # current_rel = (valid_data[:, 1] == r) + # true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + # preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) + # accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) + # + # if accuracy > rel_accuracies[int(r)]: + # rel_accuracies[int(r)] = accuracy + # rel_thresholds[int(r)] = score.clone() + # + # score += interval - - """Method 2: Search for best threshold in an interval - https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py - # Initialize accuracies, thresholds (and predictions) - min_score = valid_scores.min() - max_score = valid_scores.max() - - rel_accuracies = {int(r): -1 for r in p.unique()} - rel_thresholds = {int(r): min_score for r in p.unique()} - - score = min_score - - # ORiginal implementation uses an interval 0.01, implemented for NTN model. 
In general the interval imo should depend on the range of the score values of the model - # Suggestion: float((max_score-min_score)/len(valid_scores)) - interval = float((max_score-min_score)/len(valid_scores)) - valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) - - while(score<=max_score): - for r in p.unique(): - #Predict - current_rel = (valid_data[:, 1] == r) - true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) - accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) - - if accuracy > rel_accuracies[int(r)]: - rel_accuracies[int(r)] = accuracy - rel_thresholds[int(r)] = score.clone() - - score += interval - """ return rel_thresholds, rel_accuracies def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): @@ -239,7 +253,4 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) - return metrics - - # TODO-Question: We optimized the thresholds only for one randomly corrupted sample of the data. - # Another sample would give (a little) different results. How can we ǵet reproduceable results? + return metrics \ No newline at end of file From c18e5f8ae6beb7829eb74dd9da5e6f67d6e82bf9 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Thu, 17 Oct 2019 15:36:39 +0200 Subject: [PATCH 04/19] Fixed some minor Todos, improved documentation --- kge/job/triple_classification.py | 111 ++++++++++++++++--------------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 54a1185f6..de2bb7c45 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -20,10 +20,10 @@ class TripleClassificationJob(EvaluationJob): 4. Classify triples in test data 5. Compute Metrics for test data 6. Report metrics in Trace - # Todo: Check where it is necessary to add .to(self.device) to created tensors # Todo: Change comments to fit the standard guidelines - # Todo: Find out if it makes sense to use a dataloader with the relations as batches with a _collate function # Todo: Check all datatypes and make them consistent where possible + # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function + # Todo: Make printing out predictions per relation optionally with recent additions in config_default """ def __init__(self, config, dataset, parent_job, model): @@ -48,16 +48,16 @@ def _prepare(self): def run(self): """Runs the triple classification job.""" - + self.config.log("Starting triple classification...") self._prepare() + # Todo Question: What is the purpose of was_training? It was in entity ranking and it already was_training = self.model.training self.model.eval() - self.config.log("Starting triple classification...") epoch_time = -time.time() - # 2. Get scores for the new data. Relevant Output: Scores and scores per relation + # 2. Get scores for the corrupted valid and test data self.config.log("Get scores for datasets...") s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) @@ -70,7 +70,7 @@ def run(self): # 3. 
Find the best thresholds for every relation and their accuracies on the valid data self.config.log("Learning thresholds on validation data.") rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) - + print(rel_thresholds) # 4. Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid self.config.log("Evaluating on test data.") @@ -116,30 +116,28 @@ def run(self): self.config.log("Finished evaluating on " + self.eval_data + " data.") return trace_entry - # Todo-Question: Not sure if what is included in the trace is correct or enough. Feedback needed. def _generate_negatives(self, dataset): # 1. Corrupt triples corrupted = dataset.repeat(1, 2).view(-1, 3) - labels = torch.as_tensor([1, 0] * len(dataset)) + labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) # Random decision if sample subject(sample=nonzero) or object(sample=zero) - sample = torch.randint(0,2,(1,len(dataset))) + sample = torch.randint(0,2,(1,len(dataset))).to(self.device) # Sample subjects from subjects which appeared in the dataset corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32) + list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) # Sample objects from objects which appeared in the dataset corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32) - - # Todo: Add condition that corrupted triple!=original triple + list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) - # TODO: Create a function in util.sampler for that task. Then: Allow to choose from which entities to sample - # (e.g. from test, train and valid entities instead of only valid. + # TODO: Create a function in util.sampler for that task. Optionally include: Allow to choose from which entities + # to sample (e.g. from test, train and valid entities instead of only valid; + # Add condition that corrupted triple!=original triple # Save the labels per relation, since this will be needed frequently later p = corrupted[:, 1] @@ -149,20 +147,25 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): - # Todo: Check if methods are equivalent - #Todo-Question: Method 1 is what seems reasonable for me, Method 2 is the reimplementation of the NTNH Paper of Socher et al. 2013. - # Method 1 is much faster and delivers equally good results. Since the threshold entirely is determined by the valid_scores - # and is a cut between them, the best threshold in terms of valid data is any value between two specific score values. - # Thus I assume, that we can just use one of these score values as the threshold, since we can't know better anyway. - # Is this thought correct? - # If not and Method 2 has to be used, how can it be fastened up? + #Todo-Question: Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the + # NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. Since the + # threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of + # valid data is any value between two specific score values. 
Thus I assume, that we can just use one of these + # score values as the threshold, since we can't know better anyway. Is this thought correct? + # The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The + # reason most likely is, that it is really a better threshold, since it is more based on the data than just the + # lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two + # methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and + # the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only + # important for us to make comparisons inside our framework possible, then I would prefer Method 1. """Method 1: Threshold is always one of the scores""" #Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} - valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + # Change the scores to be entries instead of separated lists the tensor + valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) for r in p.unique(): @@ -176,41 +179,42 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent with original implementation rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))]) - # #Method 2: Search for best threshold in an interval - # #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py - # # Initialize accuracies, thresholds (and predictions) - # min_score = valid_scores.min() - # max_score = valid_scores.max() - # - # rel_accuracies = {int(r): -1 for r in p.unique()} - # rel_thresholds = {int(r): min_score for r in p.unique()} - # - # score = min_score - # - # # ORiginal implementation uses an interval 0.01, implemented for NTN model. 
In general the interval imo should depend on the range of the score values of the model - # # Suggestion: float((max_score-min_score)/len(valid_scores)) - # interval = float((max_score-min_score)/len(valid_scores)) - # valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) - # - # while(score<=max_score): - # for r in p.unique(): - # #Predict - # current_rel = (valid_data[:, 1] == r) - # true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - # preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) - # accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) - # - # if accuracy > rel_accuracies[int(r)]: - # rel_accuracies[int(r)] = accuracy - # rel_thresholds[int(r)] = score.clone() - # - # score += interval +# #Method 2: Search for best threshold in an interval +# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py +# # Initialize accuracies, thresholds (and predictions) +# min_score = valid_scores.min() +# max_score = valid_scores.max() +# +# rel_accuracies = {int(r): -1 for r in p.unique()} +# rel_thresholds = {int(r): min_score for r in p.unique()} +# +# score = min_score +# +# # ORiginal implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should +# # depend on the range of the score values of the model +# # Suggestion: float((max_score-min_score)/len(valid_scores)) +# interval = 0.01#float((max_score-min_score)/len(valid_scores)) +# valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) +# +# while(score<=max_score): +# for r in p.unique(): +# #Predict +# current_rel = (valid_data[:, 1] == r) +# true_labels = valid_labels[current_rel.nonzero()].type(torch.int) +# preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) +# accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) +# +# if accuracy > rel_accuracies[int(r)]: +# rel_accuracies[int(r)] = accuracy +# rel_thresholds[int(r)] = score.clone() +# +# score += interval return rel_thresholds, rel_accuracies def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): - rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])) for r in p_test.unique()} + rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} # Set counter for triples for which the relation is not in valid data not_in_eval = [] @@ -236,7 +240,6 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no for r in p_test.unique() for i in rel_predictions[int(r)]] - # Todo: Calculate accuracy and precision instead of using sklearn function metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) From b90c9d5ac6d61f37d728175a74f7c5fae584e13a Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Thu, 17 Oct 2019 16:23:06 +0200 Subject: [PATCH 05/19] Make printing out predictions per relation optionally, delete unnecessary specifications in config files --- examples/toy-complex-train-tripleclass.yaml | 3 +-- examples/toy-transe-train-tripleclass.yaml | 1 + kge/job/triple_classification.py | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml 
b/examples/toy-complex-train-tripleclass.yaml index 864f2280f..db72f7bae 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -11,6 +11,5 @@ lookup_embedder.dim: 100 lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification valid.metric: Accuracy -eval.thresholds: valid -eval.test: test +eval.metrics_per.relation_type: False diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index 5cf81a2b7..c4b25dceb 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -29,6 +29,7 @@ valid: eval: type: triple_classification + metrics_per.relation_type: True transe: class_name: TransE diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index de2bb7c45..81ab663a8 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -70,7 +70,7 @@ def run(self): # 3. Find the best thresholds for every relation and their accuracies on the valid data self.config.log("Learning thresholds on validation data.") rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) - print(rel_thresholds) + # 4. Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid self.config.log("Evaluating on test data.") @@ -243,15 +243,16 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) - precision_per_r = {} - accuracy_per_r = {} - for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + if self.config.get("eval.metrics_per.relation_type"): + precision_per_r = {} + accuracy_per_r = {} + for r in p_test.unique(): + precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - metrics["Accuracy_per_Relation"] = accuracy_per_r + metrics["Accuracy_per_Relation"] = accuracy_per_r - metrics["Precision_Per_Relation"] = precision_per_r + metrics["Precision_Per_Relation"] = precision_per_r metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) From f9d8feb0de5295ee486deb6090fb0c5c28b8c103 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Fri, 18 Oct 2019 16:15:05 +0200 Subject: [PATCH 06/19] Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) --- examples/toy-complex-train-tripleclass.yaml | 2 +- examples/toy-transe-train-tripleclass.yaml | 2 +- kge/config-default.yaml | 4 + kge/job/triple_classification.py | 130 +++++++++++++------- 4 files changed, 89 insertions(+), 49 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index db72f7bae..148fce5af 100644 --- 
a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -11,5 +11,5 @@ lookup_embedder.dim: 100 lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification valid.metric: Accuracy -eval.metrics_per.relation_type: False +eval.metrics_per.relation: False diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index c4b25dceb..f0b89704f 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -29,7 +29,7 @@ valid: eval: type: triple_classification - metrics_per.relation_type: True + metrics_per.relation: False transe: class_name: TransE diff --git a/kge/config-default.yaml b/kge/config-default.yaml index 08ca70681..d83fb1518 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -393,7 +393,11 @@ valid: # Name of the trace entry that holds the validation metric (higher value is # better) +<<<<<<< HEAD metric: mean_reciprocal_rank_filtered_with_test +======= + metric: mean_reciprocal_rank_filtered # Accuracy for triple_classification +>>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) # If the above metric is not present in trace (e.g., because a custom metric # should be used), a Python expression to compute the metric. Can refer to diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 81ab663a8..f604576f6 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -5,91 +5,106 @@ from sklearn.metrics import accuracy_score, precision_score from kge.job import EvaluationJob +"""Daniel feedback: +- Gather other results as baseline +""" + + +""" +Since last commit: Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train), +""" + class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. - Testing model's ability to discriminate between true and false triples based on scores. Introduces a threshold for - each relation. Unseen triples will be predicted as True if the score is higher than the threshold. Procedure: - - 1. Generation of (corrupted) negative triples: - Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. - Allow only entities which appeared at the given position in the dataset - 2. Get scores for the corrupted datasets - 3. Find the best threshold for every relation by maximizing accuracy on validation data - 4. Classify triples in test data - 5. Compute Metrics for test data - 6. Report metrics in Trace + Testing model's ability to discriminate between true and false triples based on scores. First, negative (corrupted) + triples are generated by randomly corrupting each triple in the validation and test data. Then the scores for each + triple, produced by the model to evaluate, is retrieved. Afterwards a threshold is determined for each relation. + The best threshold for every relation is determined by maximizing the accuracy on validation data. 
The unseen + triples from the train data will then be predicted as True if the score is higher than the threshold of the + respective relation. The metrics include accuracy and precision on test data. IF necessary the accuracy/precision + per relation can be returned as well. + """ + # Todo: Change comments to fit the standard guidelines # Todo: Check all datatypes and make them consistent where possible # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function - # Todo: Make printing out predictions per relation optionally with recent additions in config_default - """ + def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.is_prepared = False def _prepare(self): - """Construct the datasets needed.""" + """Prepare the corrupted validation and test datasets. + + The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure + that every epoch is evaluated on the same data. + """ if self.is_prepared: return - # 1. Generate corrupted data - self.config.log("Generate corrupted datasets...") - # Create the corrupted triples while creating the evaluation Job to make sure that every epoch is evaluated on the same data - self.valid_corrupted, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) - self.test_corrupted, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + self.config.log("Generate datasets with corrupted and true triples...") + # Generate corrupted data + if self.eval_data == "test": + self.triples_valid, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + else: + self.triples_valid, self.valid_labels, self.rel_valid_label = self._generate_negatives(self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.valid) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) self.is_prepared = True def run(self): - """Runs the triple classification job.""" + """Runs the triple classification job and returns the trace.""" + self.config.log("Starting triple classification...") self._prepare() - # Todo Question: What is the purpose of was_training? It was in entity ranking and it already was_training = self.model.training self.model.eval() epoch_time = -time.time() - # 2. Get scores for the corrupted valid and test data + # Get scores for the corrupted valid and test data self.config.log("Get scores for datasets...") - s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] + s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} - s_test, p_test, o_test = self.test_corrupted[:, 0], self.test_corrupted[:, 1], self.test_corrupted[:, 2] + s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} - # 3. 
Find the best thresholds for every relation and their accuracies on the valid data + # Find the best thresholds for every relation and their accuracies on the valid data self.config.log("Learning thresholds on validation data.") - rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) + rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) - # 4. Classification on test data. Output: predictions per relation and number of relations in test which are + # Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid self.config.log("Evaluating on test data.") self.config.log("Predict...") rel_predictions, not_in_eval = self.predict(rel_thresholds, test_scores, rel_test_scores, p_valid, p_test) - # 5. Report Metrics on test data + # Report Metrics on test data self.config.log("Classification results:") metrics = self._compute_metrics(self.rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) - # 6. Trace & Log + # Trace & Log epoch_time += time.time() # compute trace trace_entry = dict( type="triple_classification", scope="epoch", - data_learn_thresholds="Valid", - data_evaluate="Test", + data_thresholds="Valid", + size_threshold_data = len(self.triples_valid), + data_evaluate=self.eval_data, + size_data_evaluate=len(self.triples_test), epoch=self.epoch, size=2*len(self.dataset.valid), epoch_time=epoch_time, @@ -118,7 +133,22 @@ def run(self): return trace_entry def _generate_negatives(self, dataset): - # 1. Corrupt triples + """Generates dataset with negative triples. + + Takes each triple of the specified dataset and randomly replaces either the subject or the object with another + subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same + position in the dataset. + + Returns: + corrupted: A new dataset with the original and corrupted triples. + + labels: A vector with labels for the corresponding triples in the dataset. + + rel_labels: A dictionary mapping relations to labels. The values contain as many 1,0-pairs as we have triples + for the relation in the specified dataset Example for 2 original triples: {9: [1, 0, 1, 0]} + """ + + # Create objects for the corrupted dataset and the corresponding labels corrupted = dataset.repeat(1, 2).view(-1, 3) labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) @@ -139,7 +169,7 @@ def _generate_negatives(self, dataset): # to sample (e.g. from test, train and valid entities instead of only valid; # Add condition that corrupted triple!=original triple - # Save the labels per relation, since this will be needed frequently later + # Save the labels per relation, since this will be needed frequently later on p = corrupted[:, 1] rel_labels = {int(r): [labels[int((p == r).nonzero()[i])] for i in range(len((p == r).nonzero()))] for r in p.unique()} @@ -147,19 +177,25 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): - #Todo-Question: Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the - # NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. 
Since the - # threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of - # valid data is any value between two specific score values. Thus I assume, that we can just use one of these - # score values as the threshold, since we can't know better anyway. Is this thought correct? - # The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The - # reason most likely is, that it is really a better threshold, since it is more based on the data than just the - # lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two - # methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and - # the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only - # important for us to make comparisons inside our framework possible, then I would prefer Method 1. - - """Method 1: Threshold is always one of the scores""" + """Find the best thresholds per relation by maximizing accuracy on validation data. + + Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the + NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. Since the + threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of + valid data is any value between two specific score values. Thus I assume, that we can just use one of these + score values as the threshold, since we can't know better anyway. Is this thought correct? + The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The + reason most likely is, that it is really a better threshold, since it is more based on the data than just the + lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two + methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and + the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only + important for us to make comparisons inside our framework possible, then I would prefer Method 1. 
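A minimal sketch of the Method 1 idea for a single relation, assuming scores and labels are 1-D tensors holding the scores and 0/1 labels of that relation's corrupted validation triples (the standalone helper and its names are illustrative, not part of this patch):

import torch

def best_threshold(scores: torch.Tensor, labels: torch.Tensor) -> float:
    # Entry [i, j] is True if triple j counts as positive under candidate threshold scores[i].
    predictions = scores.unsqueeze(0) >= scores.unsqueeze(1)
    # Accuracy of every candidate threshold against the true 0/1 labels.
    accuracies = (predictions == labels.bool().unsqueeze(0)).float().mean(dim=1)
    # Return the smallest score that reaches the best accuracy (same tie-breaking as in the patch).
    return scores[accuracies == accuracies.max()].min().item()

For example, best_threshold(torch.tensor([1.2, -0.5, 0.8, 0.1]), torch.tensor([1, 0, 1, 0])) returns 0.8; any threshold greater than 0.1 and at most 0.8 separates this relation's validation triples equally well, which is why simply reusing one of the observed scores loses nothing.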
+ + Returns: + rel_thresholds: Dictionary with thresholds per relation {relation: threshold} + """ + + #Method 1: Threshold is always one of the scores #Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} @@ -210,7 +246,7 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # # score += interval - return rel_thresholds, rel_accuracies + return rel_thresholds def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): From ef7a7f8184c279f13f7d6222db8849876e52bf4f Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Wed, 23 Oct 2019 17:54:18 +0200 Subject: [PATCH 07/19] vectorized prediction function, gt rid of unnecessary part in _compute_metrics, updated comment documentation, easier way to retrieve labels per relations in generate function, minor simplifications and error fixings --- examples/toy-complex-train-tripleclass.yaml | 1 + kge/job/triple_classification.py | 118 +++++++++++--------- 2 files changed, 66 insertions(+), 53 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index 148fce5af..fc9e03854 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -12,4 +12,5 @@ lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification valid.metric: Accuracy eval.metrics_per.relation: False +valid.every: 1 diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index f604576f6..568e0d08b 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -5,16 +5,6 @@ from sklearn.metrics import accuracy_score, precision_score from kge.job import EvaluationJob -"""Daniel feedback: -- Gather other results as baseline -""" - - -""" -Since last commit: Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train), -""" - - class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. @@ -27,7 +17,6 @@ class TripleClassificationJob(EvaluationJob): per relation can be returned as well. """ - # Todo: Change comments to fit the standard guidelines # Todo: Check all datatypes and make them consistent where possible # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function @@ -37,7 +26,7 @@ def __init__(self, config, dataset, parent_job, model): self.is_prepared = False def _prepare(self): - """Prepare the corrupted validation and test datasets. + """Prepare the corrupted validation and test data. The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure that every epoch is evaluated on the same data. 
@@ -46,8 +35,8 @@ def _prepare(self): if self.is_prepared: return - self.config.log("Generate datasets with corrupted and true triples...") - # Generate corrupted data + self.config.log("Generate data with corrupted and true triples...") + if self.eval_data == "test": self.triples_valid, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) @@ -60,7 +49,7 @@ def _prepare(self): self.is_prepared = True def run(self): - """Runs the triple classification job and returns the trace.""" + """Runs the triple classification job.""" self.config.log("Starting triple classification...") self._prepare() @@ -71,7 +60,7 @@ def run(self): epoch_time = -time.time() # Get scores for the corrupted valid and test data - self.config.log("Get scores for datasets...") + self.config.log("Compute scores for datasets used...") s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} @@ -80,21 +69,17 @@ def run(self): test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} - # Find the best thresholds for every relation and their accuracies on the valid data + # Find the best thresholds for every relation on validation data self.config.log("Learning thresholds on validation data.") rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) - # Classification on test data. Output: predictions per relation and number of relations in test which are - # not included in valid - self.config.log("Evaluating on test data.") - self.config.log("Predict...") - rel_predictions, not_in_eval = self.predict(rel_thresholds, test_scores, rel_test_scores, p_valid, p_test) + # Make prediction on the specified evaluation data + self.config.log("Evaluating on {} data.".format(self.eval_data)) + rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) - # Report Metrics on test data + # ComputeReport Metrics self.config.log("Classification results:") - metrics = self._compute_metrics(self.rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) - - # Trace & Log + metrics = self._compute_metrics(self.test_labels, self.rel_test_labels, rel_predictions, p_test, not_in_eval) epoch_time += time.time() # compute trace @@ -137,7 +122,8 @@ def _generate_negatives(self, dataset): Takes each triple of the specified dataset and randomly replaces either the subject or the object with another subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same - position in the dataset. + position in the dataset. The term corrupted dataset is used throughout the document for a dataset containing + both corrupted and original triples. Returns: corrupted: A new dataset with the original and corrupted triples. 
@@ -171,8 +157,8 @@ def _generate_negatives(self, dataset): # Save the labels per relation, since this will be needed frequently later on p = corrupted[:, 1] - rel_labels = {int(r): [labels[int((p == r).nonzero()[i])] - for i in range(len((p == r).nonzero()))] for r in p.unique()} + rel_labels = {int(r): labels[p == r] for r in p.unique()} + return corrupted, labels, rel_labels @@ -191,8 +177,15 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only important for us to make comparisons inside our framework possible, then I would prefer Method 1. + Args: + p: 1-D tensor containing the relations of the corrupted validation dataset. + valid_scores: 1D tensor containing the scores of all corrupted validation triples. + rel_scores: Dictionary containing the scores of the triples in a relation. + valid_labels: 1D tensor containing the labels of all corrupted validation triples. + valid_data: Dataset used. Should be corrupted validation dataset. + Returns: - rel_thresholds: Dictionary with thresholds per relation {relation: threshold} + rel_thresholds: Dictionary with thresholds per relation {relation: threshold}. """ #Method 1: Threshold is always one of the scores @@ -200,7 +193,7 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} - # Change the scores to be entries instead of separated lists the tensor + # Change the scores from a 2D to a 1D tensor valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) @@ -208,15 +201,18 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): #Predict current_rel = (valid_data[:, 1] == r) true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + # true_labels = valid_labels[current_rel] preds = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] - + # accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] rel_accuracies[int(r)] = max(accuracy) - # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent with original implementation - rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))]) + # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay as consistent + # as possible with original implementation + rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))])[0,0] # #Method 2: Search for best threshold in an interval -# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py +# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py + # or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py # # Initialize accuracies, thresholds (and predictions) # min_score = valid_scores.min() # max_score = valid_scores.max() @@ -226,7 +222,7 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # # score 
= min_score # -# # ORiginal implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should +# # Original implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should # # depend on the range of the score values of the model # # Suggestion: float((max_score-min_score)/len(valid_scores)) # interval = 0.01#float((max_score-min_score)/len(valid_scores)) @@ -248,38 +244,54 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): return rel_thresholds - def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): + def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + """Makes predictions on evaluation/test data. + + Parameters: + rel_thresholds: dictionary with relation thresholds, e.g. {1: 1.5}. + rel_scores: dictionary with scores of triples in each relation: + E.g. relation with four triples in it:, e.g. {1: [-2, 1, 2, 4]}. + + Returns: + rel_predictions: dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. + not_in_eval: list with relations that are in the test data, but not in the validation data. + """ rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} - # Set counter for triples for which the relation is not in valid data + # Set variable for relations which are not in valid data, but in test data not_in_eval = [] for r in p_test.unique(): - # Check if relation which is in valid data also is in test data - if r in p_valid.unique(): + if r in p_valid.unique(): # Check if relation which is in valid data also is in test data # Predict - for i in range(len(rel_scores[int(r)])): - if float(rel_scores[int(r)][i]) >= rel_thresholds[int(r)]: - rel_predictions[int(r)][i] = 1 + rel_predictions[int(r)] = rel_scores[int(r)][:, 0, 0] >= rel_thresholds[int(r)] else: not_in_eval.append(r) return rel_predictions, not_in_eval - def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval): - metrics = {} + def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval): + """Computes accuracy and precision metrics of predictions. - labels_in_test_list = [i - for r in p_test.unique() - for i in rel_test_labels[int(r)]] + Returns: + metrics: dictionary with the specified metrics accuracy and precision as keys. If spedified, metric per + relation are safed as dictionaries in the dictionary. 
+ E.g.: {accuracy: 0.9 + accuracy_per_relation: + {relation 1: 0.8} + {relation 2: 0.9} + } + """ + metrics = {} - pred_list = [i + # Create a list for all predicted labels, matching the shape of test_labels + pred_list = torch.tensor([i for r in p_test.unique() - for i in rel_predictions[int(r)]] + for i in rel_predictions[int(r)]], dtype=torch.int64) - metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) - metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) + metrics["Accuracy"] = float(accuracy_score(test_labels, pred_list)) + metrics["Precision"] = float(precision_score(test_labels, pred_list)) - if self.config.get("eval.metrics_per.relation_type"): + if self.config.get("eval.metrics_per.relation"): precision_per_r = {} accuracy_per_r = {} for r in p_test.unique(): From 1d929744fc9cc33d7217b38bf672201e135e4384 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Fri, 25 Oct 2019 16:19:16 +0200 Subject: [PATCH 08/19] Update --- examples/fb15k-transe-grid-tripleclass.yaml | 83 ++++++++++++++++++ examples/toy-rescal-train-tripleclass.yaml | 33 +++++++ examples/toy-transe-ax-tripleclass-real.yaml | 91 ++++++++++++++++++++ examples/toy-transe-ax-tripleclass.yaml | 91 ++++++++++++++++++++ examples/toy-transe-train-tripleclass.yaml | 4 +- kge/job/triple_classification.py | 11 +-- 6 files changed, 306 insertions(+), 7 deletions(-) create mode 100644 examples/fb15k-transe-grid-tripleclass.yaml create mode 100644 examples/toy-rescal-train-tripleclass.yaml create mode 100644 examples/toy-transe-ax-tripleclass-real.yaml create mode 100644 examples/toy-transe-ax-tripleclass.yaml diff --git a/examples/fb15k-transe-grid-tripleclass.yaml b/examples/fb15k-transe-grid-tripleclass.yaml new file mode 100644 index 000000000..98d3d0284 --- /dev/null +++ b/examples/fb15k-transe-grid-tripleclass.yaml @@ -0,0 +1,83 @@ +job: + device: cuda + type: search + +model: transe + +dataset: + name: fb15k + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 4.0 + max_epochs: 80 + optimizer: Adagrad + optimizer_args: + lr: 0.1 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + l_norm: 1. 
+ +search.type: grid +grid_search.parameters: + + train.optimizer_args.lr: [ 0.001, 0,005, 0.01, 0.1 ] + + transe.entity_embedder.dim: [20, 50, 100] + + train.batch_size: [30, 120, 480, 1920] + + train.loss_arg: [1, 2, 4] + + lookup_embedder.regularize_args.weight: [0.0, 0.001] + + + lookup_embedder.regularize_args.weight: +search.num_workers: 4 +train.num_workers: 4 +eval.num_workers: 4 + + + + diff --git a/examples/toy-rescal-train-tripleclass.yaml b/examples/toy-rescal-train-tripleclass.yaml new file mode 100644 index 000000000..293c62bbd --- /dev/null +++ b/examples/toy-rescal-train-tripleclass.yaml @@ -0,0 +1,33 @@ +job.type: train +dataset.name: toy +model: sd_rescal + +sd_rescal: + class_name: SparseDiagonalRescal + blocks: -1 + block_size: 1 + entity_embedder: + type: lookup_embedder + dim: 128 # determine automatically + dropout: 0.2 + relation_embedder: + type: lookup_embedder + dim: -1 # determine automatically + dropout: 0.2 + +valid: + early_stopping: + patience: 5 + every: 1 + filter_with_test: True + metric: Accuracy +train: + optimizer: Adagrad + optimizer_args: + lr: 0.1 + batch_size: 128 + max_epochs: 200 + +eval.type: triple_classification +eval.metrics_per.relation: False + diff --git a/examples/toy-transe-ax-tripleclass-real.yaml b/examples/toy-transe-ax-tripleclass-real.yaml new file mode 100644 index 000000000..25f25656c --- /dev/null +++ b/examples/toy-transe-ax-tripleclass-real.yaml @@ -0,0 +1,91 @@ +job: + device: cuda + type: search + +model: transe + +dataset: + name: toy + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 4.0 + max_epochs: 10 + optimizer: Adagrad + optimizer_args: + lr: 0.1 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + l_norm: 1. + +ax_search: + num_trials: 10 + num_sobol_trials: 20 + parameters: + - name: train.optimizer + type: fixed + value: Adagrad + - name: train.optimizer_args.lr + type: range + bounds: [0.001, 1.0] + - name: train.loss_arg + type: range + bounds: [0.0001, 10.0] + - name: transe.entity_embedder.normalize.p + type: choice + values: [-1., 2.] + is_numerical: False + is_ordered: False + - name: transe.relation_embedder.normalize.p + type: choice + values: [-1., 2.] 
+ is_numerical: False + is_ordered: False + - name: lookup_embedder.regularize_args.weight + type: range + bounds: [0.0, 0.001] + +search.num_workers: 4 +train.num_workers: 4 +eval.num_workers: 4 diff --git a/examples/toy-transe-ax-tripleclass.yaml b/examples/toy-transe-ax-tripleclass.yaml new file mode 100644 index 000000000..00485d590 --- /dev/null +++ b/examples/toy-transe-ax-tripleclass.yaml @@ -0,0 +1,91 @@ +job: + device: cuda + type: search + +model: transe + +dataset: + name: fb15k + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 4.0 + max_epochs: 80 + optimizer: Adagrad + optimizer_args: + lr: 0.1 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + l_norm: 1. + +ax_search: + num_trials: 30 + num_sobol_trials: 20 + parameters: + - name: train.optimizer + type: fixed + value: Adagrad + - name: train.optimizer_args.lr + type: range + bounds: [0.001, 1.0] + - name: train.loss_arg + type: range + bounds: [0.0001, 10.0] + - name: transe.entity_embedder.normalize.p + type: choice + values: [-1., 2.] + is_numerical: False + is_ordered: False + - name: transe.relation_embedder.normalize.p + type: choice + values: [-1., 2.] + is_numerical: False + is_ordered: False + - name: lookup_embedder.regularize_args.weight + type: range + bounds: [0.0, 0.001] + +search.num_workers: 4 +train.num_workers: 4 +eval.num_workers: 4 diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index f0b89704f..d63de83d2 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -5,7 +5,7 @@ job: model: transe dataset: - name: toy + name: fb15k train: batch_size: 256 @@ -24,7 +24,7 @@ negative_sampling: valid: early_stopping.patience: 5 - every: 5 + every: 1 metric: Accuracy eval: diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 568e0d08b..7d384e90b 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -17,13 +17,13 @@ class TripleClassificationJob(EvaluationJob): per relation can be returned as well. """ - # Todo: Check all datatypes and make them consistent where possible + # Todo: Check all tensor datatypes and make them consistent where possible # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) - self.is_prepared = False + self.valid_data_is_prepared = False def _prepare(self): """Prepare the corrupted validation and test data. @@ -32,7 +32,7 @@ def _prepare(self): that every epoch is evaluated on the same data. 
""" - if self.is_prepared: + if self.valid_data_is_prepared: return self.config.log("Generate data with corrupted and true triples...") @@ -46,7 +46,7 @@ def _prepare(self): # let the model add some hooks, if it wants to do so self.model.prepare_job(self) - self.is_prepared = True + self.valid_data_is_prepared = True def run(self): """Runs the triple classification job.""" @@ -59,12 +59,13 @@ def run(self): epoch_time = -time.time() - # Get scores for the corrupted valid and test data + # Get scores for the corrupted valid data self.config.log("Compute scores for datasets used...") s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} + # Get scores for the corrupted test data s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} From fc660a1a7a143cbae72ff1ba81d388261bad784d Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Mon, 28 Oct 2019 11:27:42 +0100 Subject: [PATCH 09/19] Moved sampling function to sampler.py, updated code documentation --- examples/fb15k-transe-grid-tripleclass.yaml | 83 --------- examples/toy-complex-train-tripleclass.yaml | 2 +- examples/toy-transe-ax-tripleclass-real.yaml | 91 ---------- examples/toy-transe-train-tripleclass.yaml | 4 +- kge/config-default.yaml | 4 + kge/job/triple_classification.py | 172 +++++++------------ kge/util/sampler.py | 53 ++++++ 7 files changed, 121 insertions(+), 288 deletions(-) delete mode 100644 examples/fb15k-transe-grid-tripleclass.yaml delete mode 100644 examples/toy-transe-ax-tripleclass-real.yaml diff --git a/examples/fb15k-transe-grid-tripleclass.yaml b/examples/fb15k-transe-grid-tripleclass.yaml deleted file mode 100644 index 98d3d0284..000000000 --- a/examples/fb15k-transe-grid-tripleclass.yaml +++ /dev/null @@ -1,83 +0,0 @@ -job: - device: cuda - type: search - -model: transe - -dataset: - name: fb15k - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 4.0 - max_epochs: 80 - optimizer: Adagrad - optimizer_args: - lr: 0.1 - type: negative_sampling - -negative_sampling: - num_negatives_o: 3 - num_negatives_p: 0 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 5 - filter_with_test: True - metric: Accuracy - -eval: - batch_size: 512 - type: triple_classification - -transe: - class_name: TransE - entity_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - relation_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - l_norm: 1. 
- -search.type: grid -grid_search.parameters: - - train.optimizer_args.lr: [ 0.001, 0,005, 0.01, 0.1 ] - - transe.entity_embedder.dim: [20, 50, 100] - - train.batch_size: [30, 120, 480, 1920] - - train.loss_arg: [1, 2, 4] - - lookup_embedder.regularize_args.weight: [0.0, 0.001] - - - lookup_embedder.regularize_args.weight: -search.num_workers: 4 -train.num_workers: 4 -eval.num_workers: 4 - - - - diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index fc9e03854..5512196a3 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -10,7 +10,7 @@ lookup_embedder.dim: 100 #lookup_embedder.initialize: normal_ lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification -valid.metric: Accuracy +valid.metric: accuracy eval.metrics_per.relation: False valid.every: 1 diff --git a/examples/toy-transe-ax-tripleclass-real.yaml b/examples/toy-transe-ax-tripleclass-real.yaml deleted file mode 100644 index 25f25656c..000000000 --- a/examples/toy-transe-ax-tripleclass-real.yaml +++ /dev/null @@ -1,91 +0,0 @@ -job: - device: cuda - type: search - -model: transe - -dataset: - name: toy - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 4.0 - max_epochs: 10 - optimizer: Adagrad - optimizer_args: - lr: 0.1 - type: negative_sampling - -negative_sampling: - num_negatives_o: 3 - num_negatives_p: 0 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 5 - filter_with_test: True - metric: Accuracy - -eval: - batch_size: 512 - type: triple_classification - -transe: - class_name: TransE - entity_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - relation_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - l_norm: 1. - -ax_search: - num_trials: 10 - num_sobol_trials: 20 - parameters: - - name: train.optimizer - type: fixed - value: Adagrad - - name: train.optimizer_args.lr - type: range - bounds: [0.001, 1.0] - - name: train.loss_arg - type: range - bounds: [0.0001, 10.0] - - name: transe.entity_embedder.normalize.p - type: choice - values: [-1., 2.] - is_numerical: False - is_ordered: False - - name: transe.relation_embedder.normalize.p - type: choice - values: [-1., 2.] 
- is_numerical: False - is_ordered: False - - name: lookup_embedder.regularize_args.weight - type: range - bounds: [0.0, 0.001] - -search.num_workers: 4 -train.num_workers: 4 -eval.num_workers: 4 diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index d63de83d2..e120d9762 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -5,7 +5,7 @@ job: model: transe dataset: - name: fb15k + name: toy train: batch_size: 256 @@ -25,7 +25,7 @@ negative_sampling: valid: early_stopping.patience: 5 every: 1 - metric: Accuracy + metric: accuracy eval: type: triple_classification diff --git a/kge/config-default.yaml b/kge/config-default.yaml index d83fb1518..c1a57ae76 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -393,11 +393,15 @@ valid: # Name of the trace entry that holds the validation metric (higher value is # better) +<<<<<<< HEAD <<<<<<< HEAD metric: mean_reciprocal_rank_filtered_with_test ======= metric: mean_reciprocal_rank_filtered # Accuracy for triple_classification >>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) +======= + metric: mean_reciprocal_rank_filtered # accuracy for triple_classification +>>>>>>> Moved sampling function to sampler.py, updated code documentation # If the above metric is not present in trace (e.g., because a custom metric # should be used), a Python expression to compute the metric. Can refer to diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 7d384e90b..95baf2457 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,9 +1,9 @@ import time -import random import torch from sklearn.metrics import accuracy_score, precision_score from kge.job import EvaluationJob +from kge.util.sampler import TripleClassificationSampler class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. @@ -13,14 +13,10 @@ class TripleClassificationJob(EvaluationJob): triple, produced by the model to evaluate, is retrieved. Afterwards a threshold is determined for each relation. The best threshold for every relation is determined by maximizing the accuracy on validation data. The unseen triples from the train data will then be predicted as True if the score is higher than the threshold of the - respective relation. The metrics include accuracy and precision on test data. IF necessary the accuracy/precision + respective relation. The metrics include accuracy and precision on test data. If necessary the accuracy/precision per relation can be returned as well. """ - # Todo: Check all tensor datatypes and make them consistent where possible - # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function - - def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.valid_data_is_prepared = False @@ -29,7 +25,9 @@ def _prepare(self): """Prepare the corrupted validation and test data. The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure - that every epoch is evaluated on the same data. + that every epoch is evaluated on the same data. 
For model selection, the thresholds are found for validation + data and the accuracy on validation data is used. For testing the thresholds are found for validation data and + evaluated on test data. """ if self.valid_data_is_prepared: @@ -38,11 +36,11 @@ def _prepare(self): self.config.log("Generate data with corrupted and true triples...") if self.eval_data == "test": - self.triples_valid, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + self.triples_valid, self.valid_labels, self.rel_valid_labels = TripleClassificationSampler.sample(self, self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.test) else: - self.triples_valid, self.valid_labels, self.rel_valid_label = self._generate_negatives(self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.valid) + self.triples_valid, self.valid_labels, self.rel_valid_label = TripleClassificationSampler.sample(self, self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.valid) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -59,13 +57,13 @@ def run(self): epoch_time = -time.time() - # Get scores for the corrupted valid data - self.config.log("Compute scores for datasets used...") + # Get scores and scores per relation for the corrupted valid data + self.config.log("Compute scores for validation and test datasets...") s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} - # Get scores for the corrupted test data + # Get scores and scores per relation for the corrupted test data s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} @@ -74,11 +72,11 @@ def run(self): self.config.log("Learning thresholds on validation data.") rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) - # Make prediction on the specified evaluation data + # Make prediction for the specified evaluation data self.config.log("Evaluating on {} data.".format(self.eval_data)) rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) - # ComputeReport Metrics + # Compute Metrics self.config.log("Classification results:") metrics = self._compute_metrics(self.test_labels, self.rel_test_labels, rel_predictions, p_test, not_in_eval) @@ -118,103 +116,53 @@ def run(self): return trace_entry - def _generate_negatives(self, dataset): - """Generates dataset with negative triples. - - Takes each triple of the specified dataset and randomly replaces either the subject or the object with another - subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same - position in the dataset. The term corrupted dataset is used throughout the document for a dataset containing - both corrupted and original triples. 
- - Returns: - corrupted: A new dataset with the original and corrupted triples. - - labels: A vector with labels for the corresponding triples in the dataset. - - rel_labels: A dictionary mapping relations to labels. The values contain as many 1,0-pairs as we have triples - for the relation in the specified dataset Example for 2 original triples: {9: [1, 0, 1, 0]} - """ - - # Create objects for the corrupted dataset and the corresponding labels - corrupted = dataset.repeat(1, 2).view(-1, 3) - labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) - - # Random decision if sample subject(sample=nonzero) or object(sample=zero) - sample = torch.randint(0,2,(1,len(dataset))).to(self.device) - - # Sample subjects from subjects which appeared in the dataset - corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) - - # Sample objects from objects which appeared in the dataset - corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) - - # TODO: Create a function in util.sampler for that task. Optionally include: Allow to choose from which entities - # to sample (e.g. from test, train and valid entities instead of only valid; - # Add condition that corrupted triple!=original triple - - # Save the labels per relation, since this will be needed frequently later on - p = corrupted[:, 1] - rel_labels = {int(r): labels[p == r] for r in p.unique()} - - - return corrupted, labels, rel_labels - def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): """Find the best thresholds per relation by maximizing accuracy on validation data. - Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the - NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. Since the - threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of - valid data is any value between two specific score values. Thus I assume, that we can just use one of these - score values as the threshold, since we can't know better anyway. Is this thought correct? - The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The - reason most likely is, that it is really a better threshold, since it is more based on the data than just the - lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two - methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and - the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only - important for us to make comparisons inside our framework possible, then I would prefer Method 1. + The thresholds are found for every relation by maximizing the accuracy on the validation data. For a given + relation, if the scores of all triple in the relation are sorted, the perfect threshold is always a cut between + two of the scores. This means, that multiple possible values can be defined as thresholds and give the highest + accuracy. To evaluate only as many possible thresholds as really necessary, the scores themselves are considered + as possible thresholds. This allows for a fast implementation. 
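The thresholds found this way are then applied to the held-out scores relation by relation. The following is only a rough sketch of that downstream step, with the per-relation dictionaries assumed to hold plain 1-D tensors rather than the exact shapes used in this patch:

import torch

def classify(rel_thresholds: dict, rel_scores: dict, rel_labels: dict):
    # Predict "true" whenever a triple's score reaches its relation's threshold;
    # relations without a learned threshold are skipped (the job counts them separately as not_in_eval).
    preds, labels = [], []
    for r, scores in rel_scores.items():
        if r not in rel_thresholds:
            continue
        preds.append(scores >= rel_thresholds[r])
        labels.append(rel_labels[r].bool())
    preds, labels = torch.cat(preds), torch.cat(labels)
    accuracy = (preds == labels).float().mean().item()
    precision = (preds & labels).sum().item() / max(int(preds.sum()), 1)
    return accuracy, precision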
Args: p: 1-D tensor containing the relations of the corrupted validation dataset. - valid_scores: 1D tensor containing the scores of all corrupted validation triples. + valid_scores: 2-D tensor containing the scores of all corrupted validation triples. rel_scores: Dictionary containing the scores of the triples in a relation. - valid_labels: 1D tensor containing the labels of all corrupted validation triples. - valid_data: Dataset used. Should be corrupted validation dataset. + valid_labels: 1-D tensor containing the labels of all corrupted validation triples. + valid_data: Dataset used. Should be the corrupted validation dataset. Returns: - rel_thresholds: Dictionary with thresholds per relation {relation: threshold}. + rel_thresholds: Dictionary with thresholds per relation {relation: thresholds}. + E.g.: {1: tensor(-2.0843, grad_fn=)} """ - #Method 1: Threshold is always one of the scores - #Initialize accuracies, thresholds (and predictions) + # Initialize accuracies and thresholds rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} - # Change the scores from a 2D to a 1D tensor + # Change the valid scores from a 2D to a 1D tensor valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) - for r in p.unique(): - #Predict - current_rel = (valid_data[:, 1] == r) + current_rel = (valid_data[:, 1] == r) # 0-1 vector for indexing triples of the current relation true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - # true_labels = valid_labels[current_rel] - preds = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) - accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] - # accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] + + # valid_scores[current_rel.nonzero()] and rel_scores[int(r)] both contain the scores of the current + # relation. In the comparison, every score is evaluated as possible threshold against all scores. + predictions = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) + + accuracy = [int(((true_labels==predictions[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] rel_accuracies[int(r)] = max(accuracy) - # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay as consistent - # as possible with original implementation + + # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent. rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))])[0,0] -# #Method 2: Search for best threshold in an interval -# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py - # or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py -# # Initialize accuracies, thresholds (and predictions) +# # Alternative implementation: Search for best threshold in an interval +# # Following https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or +# # https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py (reimplemented Socher et al. 
2013) +# +# # Initialize accuracies, thresholds and interval # min_score = valid_scores.min() # max_score = valid_scores.max() # @@ -223,10 +171,10 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # # score = min_score # -# # Original implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should -# # depend on the range of the score values of the model -# # Suggestion: float((max_score-min_score)/len(valid_scores)) -# interval = 0.01#float((max_score-min_score)/len(valid_scores)) +# # Original implementation uses an interval of 0.01, implemented for NTN model. In general the interval should +# # depend on the range of the score values of the model and be at least as large as teh smallest distance between +# # two of the sorted scores +# interval = 0.01 # valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) # # while(score<=max_score): @@ -234,8 +182,8 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # #Predict # current_rel = (valid_data[:, 1] == r) # true_labels = valid_labels[current_rel.nonzero()].type(torch.int) -# preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) -# accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) +# predictions = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) +# accuracy = int(((true_labels==predictions).sum(dim=0)))/len(true_labels) # # if accuracy > rel_accuracies[int(r)]: # rel_accuracies[int(r)] = accuracy @@ -249,13 +197,13 @@ def predict(self, rel_thresholds, rel_scores, p_valid, p_test): """Makes predictions on evaluation/test data. Parameters: - rel_thresholds: dictionary with relation thresholds, e.g. {1: 1.5}. - rel_scores: dictionary with scores of triples in each relation: + rel_thresholds: Dictionary with relation thresholds. + rel_scores: Dictionary with scores of triples in each relation: E.g. relation with four triples in it:, e.g. {1: [-2, 1, 2, 4]}. Returns: - rel_predictions: dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. - not_in_eval: list with relations that are in the test data, but not in the validation data. + rel_predictions: Dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. + not_in_eval: List with relations that are in the test data, but not in the validation data. """ rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} @@ -274,8 +222,8 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test """Computes accuracy and precision metrics of predictions. Returns: - metrics: dictionary with the specified metrics accuracy and precision as keys. If spedified, metric per - relation are safed as dictionaries in the dictionary. + metrics: dictionary with the specified metrics accuracy and precision as keys. If specified, metrics per + relation are stored as dictionaries in the dictionary. 
E.g.: {accuracy: 0.9 accuracy_per_relation: {relation 1: 0.8} @@ -289,21 +237,23 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test for r in p_test.unique() for i in rel_predictions[int(r)]], dtype=torch.int64) - metrics["Accuracy"] = float(accuracy_score(test_labels, pred_list)) - metrics["Precision"] = float(precision_score(test_labels, pred_list)) + metrics["accuracy"] = float(accuracy_score(test_labels, pred_list)) + metrics["precision"] = float(precision_score(test_labels, pred_list)) if self.config.get("eval.metrics_per.relation"): precision_per_r = {} accuracy_per_r = {} for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + precision_per_r[str(self.dataset.relations[int(r)])] = \ + float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + accuracy_per_r[str(self.dataset.relations[int(r)])] = \ + float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - metrics["Accuracy_per_Relation"] = accuracy_per_r + metrics["accuracy_per_relation"] = accuracy_per_r - metrics["Precision_Per_Relation"] = precision_per_r + metrics["precision_per_relation"] = precision_per_r - metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) + metrics["untested relations due to missing in evaluation data"] = len(not_in_eval) return metrics \ No newline at end of file diff --git a/kge/util/sampler.py b/kge/util/sampler.py index 5f1ab31b8..6cb5b3c6d 100644 --- a/kge/util/sampler.py +++ b/kge/util/sampler.py @@ -3,9 +3,13 @@ import random import torch +<<<<<<< HEAD from typing import Optional import numpy as np import numba +======= +import random +>>>>>>> Moved sampling function to sampler.py, updated code documentation SLOTS = [0, 1, 2] SLOT_STR = ["s", "p", "o"] @@ -353,3 +357,52 @@ def _sample(self, positive_triples: torch.Tensor, slot: int, num_samples: int): positive_triples.size(0) * num_samples, ).view(positive_triples.size(0), num_samples) return result + + +class TripleClassificationSampler(KgeNegativeSampler): + def __init__(self, config, configuration_key, dataset): + super().__init__(config, configuration_key, dataset) + + def sample(self, dataset): + """Generates dataset with positive and negative triples. + + Takes each triple of the specified dataset and randomly replaces either the subject or the object with another + subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same + position in the dataset. + + Returns: + corrupted: A new dataset with the original and corrupted triples. + + labels: A vector with labels for the corresponding triples in the dataset. + + rel_labels: A dictionary mapping relations to labels. + Example if we had two triples of relation 1 in the original dataset: {1: [1, 0, 1, 0]} + """ + + # Create objects for the corrupted dataset and the corresponding labels + corrupted = dataset.repeat(1, 2).view(-1, 3) + labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) + + # The sampling influences the results in the end. 
To compare models or parameters, the seeds should be fixed + if self.config.get("eval.triple_classification_random_seed"): + torch.manual_seed(5465456876546785) + random.seed(5465456876546785) + + # Random decision if sample subject(sample=nonzero) or object(sample=zero) + sample = torch.randint(0, 2, (1, len(dataset))).to(self.device) + + # Sample subjects from subjects which appeared in the dataset + corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) + + # Sample objects from objects which appeared in the dataset + corrupted[1::2][:, 2][(sample == 0).nonzero()[:, 1]] = \ + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) + + # Save the labels per relation, since this will be needed frequently later on + p = corrupted[:, 1] + rel_labels = {int(r): labels[p == r] for r in p.unique()} + + return corrupted, labels, rel_labels From 51c8ab2b7c6e7e7a66b094a7c007da5a06a60a4e Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Mon, 28 Oct 2019 11:40:29 +0100 Subject: [PATCH 10/19] final updates --- examples/toy-complex-train-tripleclass.yaml | 8 +- examples/toy-rescal-train-tripleclass.yaml | 33 -------- examples/toy-transe-ax-tripleclass.yaml | 91 --------------------- examples/toy-transe-train-tripleclass.yaml | 60 -------------- 4 files changed, 5 insertions(+), 187 deletions(-) delete mode 100644 examples/toy-rescal-train-tripleclass.yaml delete mode 100644 examples/toy-transe-ax-tripleclass.yaml delete mode 100644 examples/toy-transe-train-tripleclass.yaml diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index 5512196a3..582f8b72f 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -9,8 +9,10 @@ train: lookup_embedder.dim: 100 #lookup_embedder.initialize: normal_ lookup_embedder.initialize: xavier_uniform_ -eval.type: triple_classification +eval: + type: triple_classification + metrics_per.relation: False + triple_classification_random_seed: False valid.metric: accuracy -eval.metrics_per.relation: False -valid.every: 1 + diff --git a/examples/toy-rescal-train-tripleclass.yaml b/examples/toy-rescal-train-tripleclass.yaml deleted file mode 100644 index 293c62bbd..000000000 --- a/examples/toy-rescal-train-tripleclass.yaml +++ /dev/null @@ -1,33 +0,0 @@ -job.type: train -dataset.name: toy -model: sd_rescal - -sd_rescal: - class_name: SparseDiagonalRescal - blocks: -1 - block_size: 1 - entity_embedder: - type: lookup_embedder - dim: 128 # determine automatically - dropout: 0.2 - relation_embedder: - type: lookup_embedder - dim: -1 # determine automatically - dropout: 0.2 - -valid: - early_stopping: - patience: 5 - every: 1 - filter_with_test: True - metric: Accuracy -train: - optimizer: Adagrad - optimizer_args: - lr: 0.1 - batch_size: 128 - max_epochs: 200 - -eval.type: triple_classification -eval.metrics_per.relation: False - diff --git a/examples/toy-transe-ax-tripleclass.yaml b/examples/toy-transe-ax-tripleclass.yaml deleted file mode 100644 index 00485d590..000000000 --- a/examples/toy-transe-ax-tripleclass.yaml +++ /dev/null @@ -1,91 +0,0 @@ -job: - device: cuda - type: search - -model: transe - -dataset: - name: fb15k - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 4.0 - max_epochs: 80 - optimizer: Adagrad - optimizer_args: - lr: 0.1 - type: negative_sampling - 
-negative_sampling: - num_negatives_o: 3 - num_negatives_p: 0 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 5 - filter_with_test: True - metric: Accuracy - -eval: - batch_size: 512 - type: triple_classification - -transe: - class_name: TransE - entity_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - relation_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - l_norm: 1. - -ax_search: - num_trials: 30 - num_sobol_trials: 20 - parameters: - - name: train.optimizer - type: fixed - value: Adagrad - - name: train.optimizer_args.lr - type: range - bounds: [0.001, 1.0] - - name: train.loss_arg - type: range - bounds: [0.0001, 10.0] - - name: transe.entity_embedder.normalize.p - type: choice - values: [-1., 2.] - is_numerical: False - is_ordered: False - - name: transe.relation_embedder.normalize.p - type: choice - values: [-1., 2.] - is_numerical: False - is_ordered: False - - name: lookup_embedder.regularize_args.weight - type: range - bounds: [0.0, 0.001] - -search.num_workers: 4 -train.num_workers: 4 -eval.num_workers: 4 diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml deleted file mode 100644 index e120d9762..000000000 --- a/examples/toy-transe-train-tripleclass.yaml +++ /dev/null @@ -1,60 +0,0 @@ -job: - device: cuda - type: train - -model: transe - -dataset: - name: toy - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 0.2 - max_epochs: 200 - optimizer: Adagrad - optimizer_args: - lr: 0.01 - type: negative_sampling - -negative_sampling: - num_negatives_o: 3 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 1 - metric: accuracy - -eval: - type: triple_classification - metrics_per.relation: False - -transe: - class_name: TransE - entity_embedder: - dim: 128 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - weighted: False - relation_embedder: - dim: 128 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - weighted: False - l_norm: 1. From 1e609a781f0e8a1f7a967ea2a226b7d736498048 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Sun, 24 May 2020 01:32:02 +0200 Subject: [PATCH 11/19] Imporve and update code --- kge/job/triple_classification.py | 272 ++++++++++++++++++------------- kge/util/sampler.py | 53 ------ 2 files changed, 157 insertions(+), 168 deletions(-) diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 95baf2457..8611b8586 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -5,29 +5,35 @@ from kge.job import EvaluationJob from kge.util.sampler import TripleClassificationSampler + class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. - Testing model's ability to discriminate between true and false triples based on scores. First, negative (corrupted) - triples are generated by randomly corrupting each triple in the validation and test data. Then the scores for each - triple, produced by the model to evaluate, is retrieved. 
Afterwards a threshold is determined for each relation. - The best threshold for every relation is determined by maximizing the accuracy on validation data. The unseen - triples from the train data will then be predicted as True if the score is higher than the threshold of the - respective relation. The metrics include accuracy and precision on test data. If necessary the accuracy/precision - per relation can be returned as well. + Testing a model's ability to classify true and false triples based on + thresholding scores. First, negative (corrupted) triples are generated by + randomly corrupting each triple in the validation and test data. Then the + scores for each triple, produced by the model to evaluate, is retrieved. + Afterwards a threshold is determined for each relation. The best threshold + for every relation is determined by maximizing the accuracy on validation + data. The unseen triples from the train data will then be predicted as True + if the score is higher than the threshold of the respective relation. The + metrics include accuracy and precision on test data. If necessary the + accuracy/precision per relation can be returned as well. """ def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.valid_data_is_prepared = False + self.triple_classification_sampler = TripleClassificationSampler(config, "config_key", dataset) def _prepare(self): """Prepare the corrupted validation and test data. - The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure - that every epoch is evaluated on the same data. For model selection, the thresholds are found for validation - data and the accuracy on validation data is used. For testing the thresholds are found for validation data and - evaluated on test data. + The triples are corrupted only for the first evaluated epoch. Afterwards + is_prepared is set to true to make sure that every epoch is evaluated on + the same data. For model selection, the thresholds are found for validation + data and the accuracy on validation data is used. For testing the + thresholds are found for validation data and evaluated on test data. 
""" if self.valid_data_is_prepared: @@ -35,12 +41,28 @@ def _prepare(self): self.config.log("Generate data with corrupted and true triples...") - if self.eval_data == "test": - self.triples_valid, self.valid_labels, self.rel_valid_labels = TripleClassificationSampler.sample(self, self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.test) + if self.eval_split == "test": + ( + self.tune_data, + self.tune_labels, + self.rel_tune_labels, + ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ( + self.eval_data, + self.eval_labels, + self.rel_eval_labels, + ) = self.triple_classification_sampler.sample(self.dataset.split('test')) else: - self.triples_valid, self.valid_labels, self.rel_valid_label = TripleClassificationSampler.sample(self, self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.valid) + ( + self.tune_data, + self.tune_labels, + self.rel_tune_label, + ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ( + self.eval_data, + self.eval_labels, + self.rel_eval_labels, + ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -58,27 +80,45 @@ def run(self): epoch_time = -time.time() # Get scores and scores per relation for the corrupted valid data - self.config.log("Compute scores for validation and test datasets...") - s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] - valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) - rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} + self.config.log("Compute scores for tune and eval datasets...") + s_tune, p_tune, o_tune = ( + self.tune_data[:, 0], + self.tune_data[:, 1], + self.tune_data[:, 2], + ) + p_tune_unique = p_tune.unique() + tune_scores = self.model.score_spo(s_tune, p_tune, o_tune) + rel_tune_scores = { + r: tune_scores[(p_tune == r)] for r in p_tune_unique + } # Get scores and scores per relation for the corrupted test data - s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] - test_scores = self.model.score_spo(s_test, p_test, o_test) - rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} + s_eval, p_eval, o_eval = ( + self.eval_data[:, 0], + self.eval_data[:, 1], + self.eval_data[:, 2], + ) + p_eval_unique = p_eval.unique() + eval_scores = self.model.score_spo(s_eval, p_eval, o_eval) # Find the best thresholds for every relation on validation data - self.config.log("Learning thresholds on validation data.") - rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) + self.config.log("Tuning thresholds.") + rel_thresholds = self.findThresholds( + p_tune_unique, + tune_scores, + ) # Make prediction for the specified evaluation data - self.config.log("Evaluating on {} data.".format(self.eval_data)) - rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) + self.config.log("Evaluating on {} data.".format(self.eval_split)) + rel_predictions, not_in_eval = self.predict( + eval_scores, rel_thresholds, p_tune_unique, p_eval_unique + ) # Compute Metrics self.config.log("Classification results:") - metrics = 
self._compute_metrics(self.test_labels, self.rel_test_labels, rel_predictions, p_test, not_in_eval) + metrics = self._compute_metrics( + self.eval_labels, self.rel_eval_labels, rel_predictions, p_eval, not_in_eval + ) epoch_time += time.time() # compute trace @@ -86,11 +126,11 @@ def run(self): type="triple_classification", scope="epoch", data_thresholds="Valid", - size_threshold_data = len(self.triples_valid), - data_evaluate=self.eval_data, - size_data_evaluate=len(self.triples_test), + size_threshold_data=len(self.tune_data), + data_evaluate=self.eval_split, + size_data_evaluate=len(self.eval_data), epoch=self.epoch, - size=2*len(self.dataset.valid), + size=2 * len(self.dataset.valid), epoch_time=epoch_time, **metrics, ) @@ -116,109 +156,107 @@ def run(self): return trace_entry - def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): - """Find the best thresholds per relation by maximizing accuracy on validation data. + def findThresholds( + self, p_tune_unique, tune_scores + ): + """Find the best thresholds per relation by maximizing accuracy on + validation data. - The thresholds are found for every relation by maximizing the accuracy on the validation data. For a given - relation, if the scores of all triple in the relation are sorted, the perfect threshold is always a cut between - two of the scores. This means, that multiple possible values can be defined as thresholds and give the highest - accuracy. To evaluate only as many possible thresholds as really necessary, the scores themselves are considered - as possible thresholds. This allows for a fast implementation. + The thresholds are found for every relation by maximizing the accuracy on + the validation data. For a given relation, if the scores of all triple in + the relation are sorted, the perfect threshold is always a cut between two + of the scores. This means, that multiple possible values can be defined as + thresholds and give the highest accuracy. To evaluate only as many possible + thresholds as really necessary, the scores themselves are considered as + possible thresholds. This allows for a fast implementation. Args: - p: 1-D tensor containing the relations of the corrupted validation dataset. - valid_scores: 2-D tensor containing the scores of all corrupted validation triples. - rel_scores: Dictionary containing the scores of the triples in a relation. - valid_labels: 1-D tensor containing the labels of all corrupted validation triples. - valid_data: Dataset used. Should be the corrupted validation dataset. + + p_tune: 1-D tensor containing the relations of the corrupted validation + dataset. + + tune_scores: 2-D tensor containing the scores of all corrupted + validation triples. + + rel_tune_scores: Dictionary containing the scores of the triples in a + relation. + + tune_thresh_labels: 1-D tensor containing the labels of all corrupted + tuning triples. + + tune_data: Dataset used. Should be the corrupted validation dataset. Returns: - rel_thresholds: Dictionary with thresholds per relation {relation: thresholds}. + rel_thresholds: Dictionary with thresholds per relation + {relation: thresholds}. 
E.g.: {1: tensor(-2.0843, grad_fn=)} """ # Initialize accuracies and thresholds - rel_accuracies = {int(r): -1 for r in p.unique()} - rel_thresholds = {int(r): 0 for r in p.unique()} + rel_accuracies = {r: -1 for r in p_tune_unique} + rel_thresholds = {r: 0 for r in p_tune_unique} # Change the valid scores from a 2D to a 1D tensor - valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) - - for r in p.unique(): - current_rel = (valid_data[:, 1] == r) # 0-1 vector for indexing triples of the current relation - true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - - # valid_scores[current_rel.nonzero()] and rel_scores[int(r)] both contain the scores of the current - # relation. In the comparison, every score is evaluated as possible threshold against all scores. - predictions = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) - - accuracy = [int(((true_labels==predictions[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] - rel_accuracies[int(r)] = max(accuracy) - - # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent. - rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))])[0,0] - -# # Alternative implementation: Search for best threshold in an interval -# # Following https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or -# # https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py (reimplemented Socher et al. 2013) -# -# # Initialize accuracies, thresholds and interval -# min_score = valid_scores.min() -# max_score = valid_scores.max() -# -# rel_accuracies = {int(r): -1 for r in p.unique()} -# rel_thresholds = {int(r): min_score for r in p.unique()} -# -# score = min_score -# -# # Original implementation uses an interval of 0.01, implemented for NTN model. In general the interval should -# # depend on the range of the score values of the model and be at least as large as teh smallest distance between -# # two of the sorted scores -# interval = 0.01 -# valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) -# -# while(score<=max_score): -# for r in p.unique(): -# #Predict -# current_rel = (valid_data[:, 1] == r) -# true_labels = valid_labels[current_rel.nonzero()].type(torch.int) -# predictions = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) -# accuracy = int(((true_labels==predictions).sum(dim=0)))/len(true_labels) -# -# if accuracy > rel_accuracies[int(r)]: -# rel_accuracies[int(r)] = accuracy -# rel_thresholds[int(r)] = score.clone() -# -# score += interval + # tune_scores = torch.as_tensor( + # [float(tune_scores[i]) for i in range(len(tune_scores))] + # ).to(self.device) + + for r in p_tune_unique: + # 0-1 vector for indexing triples of the current relation + current_rel = ( + self.tune_data[:, 1] == r + ) + true_labels = self.tune_labels[current_rel] + + # tune_scores[current_rel] and rel_tune_scores[r] both + # contain the scores of the current relation. In the comparison, every + # score is evaluated as possible threshold against all scores. 
+ predictions = ( + tune_scores[current_rel].view(-1, 1) >= tune_scores[current_rel].view(1, -1) + ) + + accuracies = (predictions & true_labels).float().sum(dim=1) / true_labels.size(0) + rel_accuracies[r] = accuracies.max() + + # Choose the smallest score of the ones which give the maximum + # accuracy as threshold to stay consistent. + rel_thresholds[r] = (tune_scores[current_rel][rel_accuracies[r] >= tune_scores[current_rel]]).min() return rel_thresholds - def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + def predict(self, eval_scores, rel_thresholds, p_tune_unique, p_eval_unique): """Makes predictions on evaluation/test data. Parameters: rel_thresholds: Dictionary with relation thresholds. - rel_scores: Dictionary with scores of triples in each relation: - E.g. relation with four triples in it:, e.g. {1: [-2, 1, 2, 4]}. Returns: rel_predictions: Dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. not_in_eval: List with relations that are in the test data, but not in the validation data. """ - rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} - + rel_predictions = dict() # Set variable for relations which are not in valid data, but in test data not_in_eval = [] - for r in p_test.unique(): - if r in p_valid.unique(): # Check if relation which is in valid data also is in test data + for r in p_eval_unique: + if ( + r in p_tune_unique + ): # Check if relation which is in valid data also is in test data # Predict - rel_predictions[int(r)] = rel_scores[int(r)][:, 0, 0] >= rel_thresholds[int(r)] - else: not_in_eval.append(r) + current_rel = ( + self.eval_data[:, 1] == r + ) + rel_predictions[r] = ( + eval_scores[current_rel] >= rel_thresholds[r] + ) + else: + not_in_eval.append(r) return rel_predictions, not_in_eval - def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval): + def _compute_metrics( + self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval + ): """Computes accuracy and precision metrics of predictions. 
Returns: @@ -233,9 +271,10 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test metrics = {} # Create a list for all predicted labels, matching the shape of test_labels - pred_list = torch.tensor([i - for r in p_test.unique() - for i in rel_predictions[int(r)]], dtype=torch.int64) + pred_list = torch.tensor( + [i for r in p_test.unique() for i in rel_predictions[int(r)]], + dtype=torch.int64, + ) metrics["accuracy"] = float(accuracy_score(test_labels, pred_list)) metrics["precision"] = float(precision_score(test_labels, pred_list)) @@ -244,16 +283,19 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test precision_per_r = {} accuracy_per_r = {} for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = \ - float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - accuracy_per_r[str(self.dataset.relations[int(r)])] = \ - float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + precision_per_r[str(self.dataset.relations[int(r)])] = float( + precision_score(rel_test_labels[int(r)], rel_predictions[int(r)]) + ) + accuracy_per_r[str(self.dataset.relations[int(r)])] = float( + accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)]) + ) metrics["accuracy_per_relation"] = accuracy_per_r metrics["precision_per_relation"] = precision_per_r + metrics["untested relations due to missing in evaluation data"] = len( + not_in_eval + ) - metrics["untested relations due to missing in evaluation data"] = len(not_in_eval) - - return metrics \ No newline at end of file + return metrics diff --git a/kge/util/sampler.py b/kge/util/sampler.py index 6cb5b3c6d..5f1ab31b8 100644 --- a/kge/util/sampler.py +++ b/kge/util/sampler.py @@ -3,13 +3,9 @@ import random import torch -<<<<<<< HEAD from typing import Optional import numpy as np import numba -======= -import random ->>>>>>> Moved sampling function to sampler.py, updated code documentation SLOTS = [0, 1, 2] SLOT_STR = ["s", "p", "o"] @@ -357,52 +353,3 @@ def _sample(self, positive_triples: torch.Tensor, slot: int, num_samples: int): positive_triples.size(0) * num_samples, ).view(positive_triples.size(0), num_samples) return result - - -class TripleClassificationSampler(KgeNegativeSampler): - def __init__(self, config, configuration_key, dataset): - super().__init__(config, configuration_key, dataset) - - def sample(self, dataset): - """Generates dataset with positive and negative triples. - - Takes each triple of the specified dataset and randomly replaces either the subject or the object with another - subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same - position in the dataset. - - Returns: - corrupted: A new dataset with the original and corrupted triples. - - labels: A vector with labels for the corresponding triples in the dataset. - - rel_labels: A dictionary mapping relations to labels. - Example if we had two triples of relation 1 in the original dataset: {1: [1, 0, 1, 0]} - """ - - # Create objects for the corrupted dataset and the corresponding labels - corrupted = dataset.repeat(1, 2).view(-1, 3) - labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) - - # The sampling influences the results in the end. 
To compare models or parameters, the seeds should be fixed - if self.config.get("eval.triple_classification_random_seed"): - torch.manual_seed(5465456876546785) - random.seed(5465456876546785) - - # Random decision if sample subject(sample=nonzero) or object(sample=zero) - sample = torch.randint(0, 2, (1, len(dataset))).to(self.device) - - # Sample subjects from subjects which appeared in the dataset - corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) - - # Sample objects from objects which appeared in the dataset - corrupted[1::2][:, 2][(sample == 0).nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) - - # Save the labels per relation, since this will be needed frequently later on - p = corrupted[:, 1] - rel_labels = {int(r): labels[p == r] for r in p.unique()} - - return corrupted, labels, rel_labels From b0b7791414d034f80b6d5baae5afef1731b71ac3 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Sun, 24 May 2020 01:32:36 +0200 Subject: [PATCH 12/19] config --- examples/toy-complex-train.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/toy-complex-train.yaml b/examples/toy-complex-train.yaml index bfa3fba27..c7dcd3670 100644 --- a/examples/toy-complex-train.yaml +++ b/examples/toy-complex-train.yaml @@ -10,6 +10,10 @@ train: mode: max patience: 4 +eval.type: triple_classification + +valid.every: 1 + model: complex lookup_embedder: dim: 100 From 162ff38de079af1c7cb2d87a0473c91417c45bbb Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 01:02:47 +0200 Subject: [PATCH 13/19] TC works now for datasets without neg samples --- examples/toy-complex-train.yaml | 12 +- kge/job/triple_classification.py | 218 +++++++++++++++++-------------- 2 files changed, 128 insertions(+), 102 deletions(-) diff --git a/examples/toy-complex-train.yaml b/examples/toy-complex-train.yaml index c7dcd3670..be8a0de4b 100644 --- a/examples/toy-complex-train.yaml +++ b/examples/toy-complex-train.yaml @@ -1,5 +1,6 @@ job.type: train -dataset.name: toy +dataset.name: fb15k-237 +#dataset.name: fb15k train: optimizer: Adagrad @@ -9,17 +10,14 @@ train: lr_scheduler_args: mode: max patience: 4 + batch_size: 1024 eval.type: triple_classification valid.every: 1 +valid.metric: accuracy model: complex lookup_embedder: dim: 100 - regularize_weight: 0.8e-7 - initialize: normal_ - initialize_args: - normal_: - mean: 0.0 - std: 0.1 + regularize_weight: 0.0 diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 8611b8586..9e1bf70a9 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,9 +1,92 @@ import time import torch -from sklearn.metrics import accuracy_score, precision_score +from kge import Dataset, Config, Configurable +from kge.util.sampler import KgeUniformSampler from kge.job import EvaluationJob -from kge.util.sampler import TripleClassificationSampler + +SLOTS = [0, 1, 2] +SLOT_STR = ["s", "p", "o"] +S, P, O = SLOTS + + +class TripleClassificationSampler(Configurable): + def __init__(self, config: Config, configuration_key: str, dataset: Dataset): + super().__init__(config, configuration_key) + self.dataset = dataset + self._is_prepared = False + self.train_data = None + self.s_entities = None + self.o_entities = None + uni_sampler_config = config.clone() + # 
uni_sampler_config.set("negative_sampling.num_samples.s", self.get_option("num_samples.s")) + uni_sampler_config.set("negative_sampling.num_samples.s", 1) + uni_sampler_config.set("negative_sampling.filtering.s", True) + # uni_sampler_config.set("negative_sampling.num_samples.o", self.get_option("num_samples.o")) + uni_sampler_config.set("negative_sampling.num_samples.o", 1) + uni_sampler_config.set("negative_sampling.filtering.o", True) + self.uniform_sampler = KgeUniformSampler( + uni_sampler_config, "negative_sampling", dataset + ) + + def _prepare(self,): + train_data = self.dataset.split("train") + self.s_entities = train_data[:, S].unique().tolist() + self.o_entities = train_data[:, O].unique().tolist() + self._is_prepared = True + + def sample(self, positive_triples: torch.Tensor): + """Generates dataset with positive and negative triples. + + Takes each triple of the specified dataset and randomly replaces either the + subject or the object with another subject/object. Only allows a subject/object + to be sampled if it appeared as a subject/object at the same position in the dataset. + + Returns: + corrupted: A new dataset with the original and corrupted triples. + + labels: A vector with labels for the corresponding triples in the dataset. + + rel_labels: A dictionary mapping relations to labels. + Example if we had two triples of relation 1 in the original + dataset: {1: [1, 0, 1, 0]} + """ + + if not self._is_prepared: + self._prepare() + + # Create objects for the corrupted dataset and the corresponding labels + corrupted = positive_triples.repeat(1, 2).view(-1, 3) + labels = ( + torch.as_tensor([1, 0] * len(positive_triples)) + .type(torch.bool) + .to(self.config.get("job.device")) + ) + + # Random decision if sample subject(sample=nonzero) or object(sample=zero) + sample_subject = torch.randint(2, (len(positive_triples),)).type(torch.bool) + + # Sample subjects from subjects which appeared in the dataset + # corrupted[1::2][:, S][sample_subject] = torch.as_tensor( + # random.choice(self.s_entities) + # ) + corrupted[1::2, S][sample_subject] = self.uniform_sampler.sample( + corrupted[1::2][sample_subject], S, 1 + ).view(-1) + + # Sample objects from objects which appeared in the dataset + # corrupted[1::2][:, O][(sample_subject == False)] = torch.as_tensor( + # random.choice(self.o_entities) + # ) + corrupted[1::2, O][sample_subject == False] = self.uniform_sampler.sample( + corrupted[1::2][sample_subject == False], O, 1 + ).view(-1) + + # Save the labels per relation, since this will be needed frequently later on + p = corrupted[:, 1] + rel_labels = {int(r): labels[p == r] for r in p.unique()} + + return corrupted, labels, rel_labels class TripleClassificationJob(EvaluationJob): @@ -24,7 +107,9 @@ class TripleClassificationJob(EvaluationJob): def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.valid_data_is_prepared = False - self.triple_classification_sampler = TripleClassificationSampler(config, "config_key", dataset) + self.triple_classification_sampler = TripleClassificationSampler( + config, "triple_classification", dataset + ) def _prepare(self): """Prepare the corrupted validation and test data. 
@@ -46,23 +131,31 @@ def _prepare(self): self.tune_data, self.tune_labels, self.rel_tune_labels, - ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("valid").to(self.config.get("job.device")) + ) ( self.eval_data, self.eval_labels, self.rel_eval_labels, - ) = self.triple_classification_sampler.sample(self.dataset.split('test')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("test").to(self.config.get("job.device")) + ) else: ( self.tune_data, self.tune_labels, self.rel_tune_label, - ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("valid").to(self.config.get("job.device")) + ) ( self.eval_data, self.eval_labels, self.rel_eval_labels, - ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("valid").to(self.config.get("job.device")) + ) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -71,7 +164,6 @@ def _prepare(self): def run(self): """Runs the triple classification job.""" - self.config.log("Starting triple classification...") self._prepare() was_training = self.model.training @@ -80,7 +172,6 @@ def run(self): epoch_time = -time.time() # Get scores and scores per relation for the corrupted valid data - self.config.log("Compute scores for tune and eval datasets...") s_tune, p_tune, o_tune = ( self.tune_data[:, 0], self.tune_data[:, 1], @@ -88,9 +179,7 @@ def run(self): ) p_tune_unique = p_tune.unique() tune_scores = self.model.score_spo(s_tune, p_tune, o_tune) - rel_tune_scores = { - r: tune_scores[(p_tune == r)] for r in p_tune_unique - } + rel_tune_scores = {r: tune_scores[(p_tune == r)] for r in p_tune_unique} # Get scores and scores per relation for the corrupted test data s_eval, p_eval, o_eval = ( @@ -102,35 +191,22 @@ def run(self): eval_scores = self.model.score_spo(s_eval, p_eval, o_eval) # Find the best thresholds for every relation on validation data - self.config.log("Tuning thresholds.") - rel_thresholds = self.findThresholds( - p_tune_unique, - tune_scores, - ) + rel_thresholds = self.findThresholds(p_tune_unique, tune_scores,) # Make prediction for the specified evaluation data self.config.log("Evaluating on {} data.".format(self.eval_split)) - rel_predictions, not_in_eval = self.predict( + metrics, not_in_eval = self.predict( eval_scores, rel_thresholds, p_tune_unique, p_eval_unique ) - # Compute Metrics - self.config.log("Classification results:") - metrics = self._compute_metrics( - self.eval_labels, self.rel_eval_labels, rel_predictions, p_eval, not_in_eval - ) - epoch_time += time.time() # compute trace trace_entry = dict( type="triple_classification", scope="epoch", data_thresholds="Valid", - size_threshold_data=len(self.tune_data), data_evaluate=self.eval_split, - size_data_evaluate=len(self.eval_data), epoch=self.epoch, - size=2 * len(self.dataset.valid), epoch_time=epoch_time, **metrics, ) @@ -152,13 +228,11 @@ def run(self): # reset model and return metrics if was_training: self.model.train() - self.config.log("Finished evaluating on " + self.eval_data + " data.") + self.config.log("Finished evaluating on " + self.eval_split + " data.") return trace_entry - def findThresholds( - self, p_tune_unique, tune_scores - ): + def findThresholds(self, p_tune_unique, tune_scores): """Find the best thresholds per relation by maximizing 
accuracy on validation data. @@ -193,8 +267,7 @@ def findThresholds( """ # Initialize accuracies and thresholds - rel_accuracies = {r: -1 for r in p_tune_unique} - rel_thresholds = {r: 0 for r in p_tune_unique} + rel_thresholds = {r: -float("inf") for r in range(self.dataset.num_relations())} # Change the valid scores from a 2D to a 1D tensor # tune_scores = torch.as_tensor( @@ -203,24 +276,25 @@ def findThresholds( for r in p_tune_unique: # 0-1 vector for indexing triples of the current relation - current_rel = ( - self.tune_data[:, 1] == r - ) - true_labels = self.tune_labels[current_rel] + current_rel = self.tune_data[:, 1] == r + true_labels = self.tune_labels[current_rel].view(-1) # tune_scores[current_rel] and rel_tune_scores[r] both # contain the scores of the current relation. In the comparison, every # score is evaluated as possible threshold against all scores. predictions = ( - tune_scores[current_rel].view(-1, 1) >= tune_scores[current_rel].view(1, -1) - ) + tune_scores[current_rel].view(-1, 1) + >= tune_scores[current_rel].view(1, -1) + ).t() - accuracies = (predictions & true_labels).float().sum(dim=1) / true_labels.size(0) - rel_accuracies[r] = accuracies.max() + accuracies = (predictions == true_labels).float().sum(dim=1) + accuracies_max = accuracies.max() # Choose the smallest score of the ones which give the maximum # accuracy as threshold to stay consistent. - rel_thresholds[r] = (tune_scores[current_rel][rel_accuracies[r] >= tune_scores[current_rel]]).min() + rel_thresholds[r.item()] = tune_scores[current_rel][ + accuracies_max == accuracies + ].min() return rel_thresholds @@ -235,7 +309,7 @@ def predict(self, eval_scores, rel_thresholds, p_tune_unique, p_eval_unique): not_in_eval: List with relations that are in the test data, but not in the validation data. """ - rel_predictions = dict() + tptn = 0 # Set variable for relations which are not in valid data, but in test data not_in_eval = [] for r in p_eval_unique: @@ -243,59 +317,13 @@ def predict(self, eval_scores, rel_thresholds, p_tune_unique, p_eval_unique): r in p_tune_unique ): # Check if relation which is in valid data also is in test data # Predict - current_rel = ( - self.eval_data[:, 1] == r - ) - rel_predictions[r] = ( - eval_scores[current_rel] >= rel_thresholds[r] - ) + current_rel = self.eval_data[:, 1] == r + true_labels = self.eval_labels[current_rel] + predictions = eval_scores[current_rel] >= rel_thresholds[r.item()] + tptn += (predictions == true_labels).float().sum().item() else: not_in_eval.append(r) - return rel_predictions, not_in_eval - - def _compute_metrics( - self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval - ): - """Computes accuracy and precision metrics of predictions. - - Returns: - metrics: dictionary with the specified metrics accuracy and precision as keys. If specified, metrics per - relation are stored as dictionaries in the dictionary. 
- E.g.: {accuracy: 0.9 - accuracy_per_relation: - {relation 1: 0.8} - {relation 2: 0.9} - } - """ - metrics = {} - - # Create a list for all predicted labels, matching the shape of test_labels - pred_list = torch.tensor( - [i for r in p_test.unique() for i in rel_predictions[int(r)]], - dtype=torch.int64, - ) - - metrics["accuracy"] = float(accuracy_score(test_labels, pred_list)) - metrics["precision"] = float(precision_score(test_labels, pred_list)) - - if self.config.get("eval.metrics_per.relation"): - precision_per_r = {} - accuracy_per_r = {} - for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = float( - precision_score(rel_test_labels[int(r)], rel_predictions[int(r)]) - ) - accuracy_per_r[str(self.dataset.relations[int(r)])] = float( - accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)]) - ) - - metrics["accuracy_per_relation"] = accuracy_per_r - - metrics["precision_per_relation"] = precision_per_r - - metrics["untested relations due to missing in evaluation data"] = len( - not_in_eval - ) + metrics = dict(accuracy=tptn / self.eval_data.size(0)) - return metrics + return metrics, not_in_eval From 6344697f090e31222cd78857d08671575bcf64b5 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 01:54:40 +0200 Subject: [PATCH 14/19] Fix neg sampling with filtering for unseen sp, po in train --- kge/util/sampler.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kge/util/sampler.py b/kge/util/sampler.py index 5f1ab31b8..03f9f1ac5 100644 --- a/kge/util/sampler.py +++ b/kge/util/sampler.py @@ -279,7 +279,11 @@ def _filter_and_resample_fast( positives_index = numba.typed.Dict() for i in range(batch_size): pair = (pairs[i][0], pairs[i][1]) - positives_index[pair] = index.get(pair).numpy() + positives_index[pair] = ( + index.get(pair).numpy() + if pair in index + else torch.IntTensor([]).numpy() + ) negative_samples = negative_samples.numpy() KgeUniformSampler._filter_and_resample_numba( negative_samples, pairs, positives_index, batch_size, int(voc_size), From 4df0c3be5ca20db696468b567462d328f7072cb7 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 10:23:14 +0200 Subject: [PATCH 15/19] Remove uneeded stuff --- kge/job/triple_classification.py | 36 ++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 9e1bf70a9..2d5cce804 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -57,11 +57,7 @@ def sample(self, positive_triples: torch.Tensor): # Create objects for the corrupted dataset and the corresponding labels corrupted = positive_triples.repeat(1, 2).view(-1, 3) - labels = ( - torch.as_tensor([1, 0] * len(positive_triples)) - .type(torch.bool) - .to(self.config.get("job.device")) - ) + labels = torch.as_tensor([1, 0] * len(positive_triples)).type(torch.bool) # Random decision if sample subject(sample=nonzero) or object(sample=zero) sample_subject = torch.randint(2, (len(positive_triples),)).type(torch.bool) @@ -82,11 +78,10 @@ def sample(self, positive_triples: torch.Tensor): corrupted[1::2][sample_subject == False], O, 1 ).view(-1) - # Save the labels per relation, since this will be needed frequently later on - p = corrupted[:, 1] - rel_labels = {int(r): labels[p == r] for r in p.unique()} - - return corrupted, labels, rel_labels + return ( + corrupted.to(self.config.get("job.device")), + labels.to(self.config.get("job.device")), + ) class 
TripleClassificationJob(EvaluationJob): @@ -130,32 +125,20 @@ def _prepare(self): ( self.tune_data, self.tune_labels, - self.rel_tune_labels, - ) = self.triple_classification_sampler.sample( - self.dataset.split("valid").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("valid")) ( self.eval_data, self.eval_labels, - self.rel_eval_labels, - ) = self.triple_classification_sampler.sample( - self.dataset.split("test").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("test")) else: ( self.tune_data, self.tune_labels, - self.rel_tune_label, - ) = self.triple_classification_sampler.sample( - self.dataset.split("valid").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("valid")) ( self.eval_data, self.eval_labels, - self.rel_eval_labels, - ) = self.triple_classification_sampler.sample( - self.dataset.split("valid").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("valid")) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -179,7 +162,6 @@ def run(self): ) p_tune_unique = p_tune.unique() tune_scores = self.model.score_spo(s_tune, p_tune, o_tune) - rel_tune_scores = {r: tune_scores[(p_tune == r)] for r in p_tune_unique} # Get scores and scores per relation for the corrupted test data s_eval, p_eval, o_eval = ( From bb5a575be26321d236736debf21c1d5d3a455ee7 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 10:23:37 +0200 Subject: [PATCH 16/19] Init supporting TC datasets --- data/download_all.sh | 19 ++++++++++ data/preprocess.py | 83 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/data/download_all.sh b/data/download_all.sh index 6424f80bb..adba8b274 100755 --- a/data/download_all.sh +++ b/data/download_all.sh @@ -201,3 +201,22 @@ else echo wikidata5m already prepared fi + +# wn11 +if [ ! -d "$BASEDIR/wn11" ]; then + echo Downloading wikidata5m + cd $BASEDIR + curl -O https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wordnet11.zip + unzip wn11.zip + mv wordnet11/wordnet11 wn11 + rm -r wordnet11/ + mv wn11/dev.txt wn11/valid.txt +else + echo wikidata5m already present +fi +if [ ! 
-f "$BASEDIR/wn11/dataset.yaml" ]; then + python preprocess.py wikidata5m +else + echo wikidata5m already prepared +fi + diff --git a/data/preprocess.py b/data/preprocess.py index fa1c735db..43da59d02 100755 --- a/data/preprocess.py +++ b/data/preprocess.py @@ -19,6 +19,7 @@ import numpy as np from collections import OrderedDict + def store_map(symbol_map, filename): with open(filename, "w") as f: for symbol, index in symbol_map.items(): @@ -29,14 +30,26 @@ def store_map(symbol_map, filename): parser = argparse.ArgumentParser() parser.add_argument("folder", type=str) parser.add_argument("--order_sop", action="store_true") + parser.add_argument("--triple_class", action="store_true") args = parser.parse_args() print(f"Preprocessing {args.folder}...") raw_split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"} split_files = {"train": "train.del", "valid": "valid.del", "test": "test.del"} - string_files = {"entity_strings": "entity_strings.del", "relation_strings": "relation_strings.del"} - split_files_without_unseen = {"train_sample": "train_sample.del", "valid_without_unseen": "valid_without_unseen.del", - "test_without_unseen": "test_without_unseen.del"} + split_files_label = { + "train_label": "train_label.del", + "valid_label": "valid_label.del", + "test_label": "test_label.del", + } + string_files = { + "entity_strings": "entity_strings.del", + "relation_strings": "relation_strings.del", + } + split_files_without_unseen = { + "train_sample": "train_sample.del", + "valid_without_unseen": "valid_without_unseen.del", + "test_without_unseen": "test_without_unseen.del", + } split_sizes = {} if args.order_sop: @@ -73,7 +86,7 @@ def store_map(symbol_map, filename): if "train" in split: entities_in_train = entities.copy() relations_in_train = relations.copy() - + print(f"{len(relations)} distinct relations") print(f"{len(entities)} distinct entities") print("Writing relation and entity map...") @@ -87,13 +100,23 @@ def store_map(symbol_map, filename): for split, filename in split_files.items(): if split in ["valid", "test"]: split_without_unseen = split + "_without_unseen" - f_wo_unseen = open(os.path.join(args.folder, - split_files_without_unseen[split_without_unseen]), "w") + f_wo_unseen = open( + os.path.join( + args.folder, split_files_without_unseen[split_without_unseen] + ), + "w", + ) else: split_without_unseen = split + "_sample" - f_tr_sample = open(os.path.join(args.folder, - split_files_without_unseen[split_without_unseen]), "w") - train_sample = np.random.choice(split_sizes["train"], split_sizes["valid"], False) + f_tr_sample = open( + os.path.join( + args.folder, split_files_without_unseen[split_without_unseen] + ), + "w", + ) + train_sample = np.random.choice( + split_sizes["train"], split_sizes["valid"], False + ) with open(os.path.join(args.folder, filename), "w") as f: size_unseen = 0 for n, t in enumerate(raw[split]): @@ -115,8 +138,12 @@ def store_map(symbol_map, filename): + "\n" ) size_unseen += 1 - elif split in ["valid", "test"] and t[S] in entities_in_train and \ - t[O] in entities_in_train and t[P] in relations_in_train: + elif ( + split in ["valid", "test"] + and t[S] in entities_in_train + and t[O] in entities_in_train + and t[P] in relations_in_train + ): f_wo_unseen.write( str(entities[t[S]]) + "\t" @@ -127,15 +154,32 @@ def store_map(symbol_map, filename): ) size_unseen += 1 without_unseen_sizes[split_without_unseen] = size_unseen + if args.triple_class: + for split, filename in split_files_label.items(): + if split in ["valid", "test"]: + 
split_without_unseen = split + "_without_unseen" + f_wo_unseen = open( + os.path.join( + args.folder, split_files_without_unseen[split_without_unseen] + ), + "w", + ) + with open(os.path.join(args.folder, filename), "w") as f: + for n, t in enumerate(raw[split]): + f.write(t[4] + "\n") + if ( + t[S] in entities_in_train + and t[O] in entities_in_train + and t[P] in relations_in_train + ): + f_wo_unseen.write(t[4] + "\n") # write config print("Writing dataset.yaml...") dataset_config = dict( - name=args.folder, - num_entities=len(entities), - num_relations=len(relations), + name=args.folder, num_entities=len(entities), num_relations=len(relations), ) - for obj in [ "entity", "relation" ]: + for obj in ["entity", "relation"]: dataset_config[f"files.{obj}_ids.filename"] = f"{obj}_ids.del" dataset_config[f"files.{obj}_ids.type"] = "map" for split in split_files.keys(): @@ -143,9 +187,16 @@ def store_map(symbol_map, filename): dataset_config[f"files.{split}.type"] = "triples" dataset_config[f"files.{split}.size"] = split_sizes.get(split) for split in split_files_without_unseen.keys(): - dataset_config[f"files.{split}.filename"] = split_files_without_unseen.get(split) + dataset_config[f"files.{split}.filename"] = split_files_without_unseen.get( + split + ) dataset_config[f"files.{split}.type"] = "triples" dataset_config[f"files.{split}.size"] = without_unseen_sizes.get(split) + if args.triple_class: + for split in split_files_label.keys(): + dataset_config[f"files.{split}.filename"] = split_files_label.get(split) + dataset_config[f"files.{split}.type"] = "label" + dataset_config[f"files.{split}.size"] = split_sizes.get(split) for string in string_files.keys(): if os.path.exists(os.path.join(args.folder, string_files[string])): dataset_config[f"files.{string}.filename"] = string_files.get(string) From 2b9878024eda0e1bb8c86011ab5c97111dc1a5c1 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 10:23:57 +0200 Subject: [PATCH 17/19] config --- examples/toy-complex-train.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/toy-complex-train.yaml b/examples/toy-complex-train.yaml index be8a0de4b..59d8bc78b 100644 --- a/examples/toy-complex-train.yaml +++ b/examples/toy-complex-train.yaml @@ -1,4 +1,5 @@ job.type: train +#dataset.name: toy dataset.name: fb15k-237 #dataset.name: fb15k From 5b1a5b408b087d207884c01931714199b80e44c1 Mon Sep 17 00:00:00 2001 From: nzteb Date: Thu, 4 Jun 2020 15:50:55 +0200 Subject: [PATCH 18/19] Add preprocess functionality for wn11 --- data/download_all.sh | 20 +++++--- data/preprocess.py | 111 +++++++++++++++++++++++++++------------- kge/config-default.yaml | 8 --- 3 files changed, 89 insertions(+), 50 deletions(-) diff --git a/data/download_all.sh b/data/download_all.sh index adba8b274..ea63e0d9d 100755 --- a/data/download_all.sh +++ b/data/download_all.sh @@ -204,19 +204,25 @@ fi # wn11 if [ ! -d "$BASEDIR/wn11" ]; then - echo Downloading wikidata5m + echo Downloading wn11 cd $BASEDIR + # TODO this also creates a __MACOSX folder on non-mac in the zip + # TODO download file from mannheim server curl -O https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wordnet11.zip - unzip wn11.zip - mv wordnet11/wordnet11 wn11 - rm -r wordnet11/ + unzip wordnet11.zip + if [ -d "__MACOSX" ]; then + rm -r __MACOSX + fi + mv wordnet11 wn11 mv wn11/dev.txt wn11/valid.txt else - echo wikidata5m already present + echo wn11 already present fi + if [ ! 
-f "$BASEDIR/wn11/dataset.yaml" ]; then - python preprocess.py wikidata5m + python preprocess.py wn11 --triple_class else - echo wikidata5m already prepared + echo wn11 already prepared fi + diff --git a/data/preprocess.py b/data/preprocess.py index 43da59d02..3d2627528 100755 --- a/data/preprocess.py +++ b/data/preprocess.py @@ -7,7 +7,7 @@ During preprocessing, each distinct entity name and each distinct distinct relation name is assigned an index (dense). The index-to-object mapping is stored in files -"entity_map.del" and "relation_map.del", resp. The triples (as indexes) are stored in +"entity_ids.del" and "relation_ids.del", resp. The triples (as indexes) are stored in files "train.del", "valid.del", and "test.del". Metadata information is stored in a file "dataset.yaml". @@ -36,11 +36,7 @@ def store_map(symbol_map, filename): print(f"Preprocessing {args.folder}...") raw_split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"} split_files = {"train": "train.del", "valid": "valid.del", "test": "test.del"} - split_files_label = { - "train_label": "train_label.del", - "valid_label": "valid_label.del", - "test_label": "test_label.del", - } + string_files = { "entity_strings": "entity_strings.del", "relation_strings": "relation_strings.del", @@ -50,6 +46,15 @@ def store_map(symbol_map, filename): "valid_without_unseen": "valid_without_unseen.del", "test_without_unseen": "test_without_unseen.del", } + + if args.triple_class: + split_files_negatives = { + "valid_negatives": "valid_negatives.del", + "test_negatives": "test_negatives.del"} + split_files_negatives_without_unseen = { + "valid_negatives_without_unseen": "valid_negatives_without_unseen.del", + "test_negatives_without_unseen": "test_negatives_without_unseen.del"} + split_sizes = {} if args.order_sop: @@ -106,6 +111,15 @@ def store_map(symbol_map, filename): ), "w", ) + if args.triple_class: + split_negatives_wo_unseen = f"{split}_negatives_without_unseen" + f_negatives_wo_unseen = open( + os.path.join( + args.folder, + split_files_negatives_without_unseen[split_negatives_wo_unseen] + ), + "w" + ) else: split_without_unseen = split + "_sample" f_tr_sample = open( @@ -118,9 +132,34 @@ def store_map(symbol_map, filename): split_sizes["train"], split_sizes["valid"], False ) with open(os.path.join(args.folder, filename), "w") as f: - size_unseen = 0 + if args.triple_class and split in ["valid", "test"]: + split_negatives = f"{split}_negatives" + f_negatives = open( + os.path.join( + args.folder, + split_files_negatives[split_negatives], + ), + "w", + ) + + if args.triple_class: + size_negatives = 0 + size_negatives_unseen = 0 + # positives; valid and test sizes have to be recalculated + size_positives = 0 + size_positives_unseen = 0 + else: + size_positives_unseen = 0 for n, t in enumerate(raw[split]): - f.write( + if args.triple_class and split in ["valid", "test"] and int(t[3]) == -1: + file_wrapper = f_negatives + size_negatives += 1 + elif args.triple_class and split in ["valid", "test"]: + size_positives += 1 + file_wrapper = f + else: + file_wrapper = f + file_wrapper.write( str(entities[t[S]]) + "\t" + str(relations[t[P]]) @@ -137,14 +176,22 @@ def store_map(symbol_map, filename): + str(entities[t[O]]) + "\n" ) - size_unseen += 1 + size_positives_unseen += 1 elif ( split in ["valid", "test"] and t[S] in entities_in_train and t[O] in entities_in_train and t[P] in relations_in_train ): - f_wo_unseen.write( + + if args.triple_class and int(t[3]) == -1: + file_wrapper = f_negatives_wo_unseen + 
size_negatives_unseen += 1 + else: + file_wrapper = f_wo_unseen + size_positives_unseen += 1 + + file_wrapper.write( str(entities[t[S]]) + "\t" + str(relations[t[P]]) @@ -152,27 +199,11 @@ def store_map(symbol_map, filename): + str(entities[t[O]]) + "\n" ) - size_unseen += 1 - without_unseen_sizes[split_without_unseen] = size_unseen - if args.triple_class: - for split, filename in split_files_label.items(): - if split in ["valid", "test"]: - split_without_unseen = split + "_without_unseen" - f_wo_unseen = open( - os.path.join( - args.folder, split_files_without_unseen[split_without_unseen] - ), - "w", - ) - with open(os.path.join(args.folder, filename), "w") as f: - for n, t in enumerate(raw[split]): - f.write(t[4] + "\n") - if ( - t[S] in entities_in_train - and t[O] in entities_in_train - and t[P] in relations_in_train - ): - f_wo_unseen.write(t[4] + "\n") + if args.triple_class and split in ["valid", "test"]: + without_unseen_sizes[split_negatives_wo_unseen] = size_negatives_unseen + split_sizes[split] = size_positives + split_sizes[split_negatives] = size_negatives + without_unseen_sizes[split_without_unseen] = size_positives_unseen # write config print("Writing dataset.yaml...") @@ -193,10 +224,20 @@ def store_map(symbol_map, filename): dataset_config[f"files.{split}.type"] = "triples" dataset_config[f"files.{split}.size"] = without_unseen_sizes.get(split) if args.triple_class: - for split in split_files_label.keys(): - dataset_config[f"files.{split}.filename"] = split_files_label.get(split) - dataset_config[f"files.{split}.type"] = "label" - dataset_config[f"files.{split}.size"] = split_sizes.get(split) + for split in split_files_negatives.keys(): + dataset_config[f"files.{split}.filename"] = split_files_negatives.get(split) + dataset_config[f"files.{split}.type"] = "triples" + dataset_config[f"files.{split}.size"] = split_sizes[split] + + for split in split_files_negatives_without_unseen.keys(): + dataset_config[f"files.{split}.filename"] = split_files_negatives_without_unseen.get( + split) + dataset_config[f"files.{split}.type"] = "triples" + dataset_config[f"files.{split}.size"] = without_unseen_sizes[ + split] + + + for string in string_files.keys(): if os.path.exists(os.path.join(args.folder, string_files[string])): dataset_config[f"files.{string}.filename"] = string_files.get(string) diff --git a/kge/config-default.yaml b/kge/config-default.yaml index c1a57ae76..08ca70681 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -393,15 +393,7 @@ valid: # Name of the trace entry that holds the validation metric (higher value is # better) -<<<<<<< HEAD -<<<<<<< HEAD metric: mean_reciprocal_rank_filtered_with_test -======= - metric: mean_reciprocal_rank_filtered # Accuracy for triple_classification ->>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) -======= - metric: mean_reciprocal_rank_filtered # accuracy for triple_classification ->>>>>>> Moved sampling function to sampler.py, updated code documentation # If the above metric is not present in trace (e.g., because a custom metric # should be used), a Python expression to compute the metric. 
Can refer to From 8a4416f68dbf3348e10cc79cdf016a2495d5874f Mon Sep 17 00:00:00 2001 From: nzteb Date: Thu, 4 Jun 2020 18:10:23 +0200 Subject: [PATCH 19/19] Allow to use labels for triple classification from data --- examples/toy-complex-train-tripleclass.yaml | 11 +++-- kge/config-default.yaml | 10 +++- kge/job/triple_classification.py | 54 ++++++++++++++++++++- 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index 582f8b72f..d75cdd811 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -1,6 +1,6 @@ job.type: train -dataset.name: toy -model: distmult +dataset.name: wn11 +model: complex train: optimizer: Adagrad optimizer_args: @@ -11,8 +11,11 @@ lookup_embedder.dim: 100 lookup_embedder.initialize: xavier_uniform_ eval: type: triple_classification - metrics_per.relation: False - triple_classification_random_seed: False +triple_classification.random_seed: False +triple_classification.negatives_from: data + + valid.metric: accuracy +valid.every: 1 diff --git a/kge/config-default.yaml b/kge/config-default.yaml index 08ca70681..f06a8eea6 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -330,7 +330,7 @@ eval: # mean_reciprocal_rank_filtered_with_test. filter_with_test: True - # Type of evaluation (entity_ranking only at the moment) + # Type of evaluation (entity_ranking, triple_classification) type: entity_ranking # How to handle cases with ties between the correct answer and other answers, e.g., @@ -423,6 +423,14 @@ valid: ## EVALUATION ################################################################## +triple_classification: + random_seed: False + # How to obtain negative triple labels. Possible values are: + # - corruption: Create negatives by randomly corrupting existing triples (positives) + # - data : Obtain negative labels from the dataset. 
This assumes the data set + # contains the splits 'valid_negatives' and 'test_negatives' + negatives_from: corruption + ## HYPERPARAMETER SEARCH ####################################################### diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 2d5cce804..918b19520 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -20,7 +20,12 @@ def __init__(self, config: Config, configuration_key: str, dataset: Dataset): self.o_entities = None uni_sampler_config = config.clone() # uni_sampler_config.set("negative_sampling.num_samples.s", self.get_option("num_samples.s")) + # TODO this is redundant as uniform.sample() is called with "num_samples" here in self.sample() uni_sampler_config.set("negative_sampling.num_samples.s", 1) + # TODO maybe changing the API of KGEsampler.sample() to also accept a param "filter" + # as it is the case already with "num_samples" + # then we would not rely here on configuration options which actually + # belong to a training job uni_sampler_config.set("negative_sampling.filtering.s", True) # uni_sampler_config.set("negative_sampling.num_samples.o", self.get_option("num_samples.o")) uni_sampler_config.set("negative_sampling.num_samples.o", 1) @@ -31,6 +36,7 @@ def __init__(self, config: Config, configuration_key: str, dataset: Dataset): def _prepare(self,): train_data = self.dataset.split("train") + #TODO probably outdated as it refers to out-commented code self.s_entities = train_data[:, S].unique().tolist() self.o_entities = train_data[:, O].unique().tolist() self._is_prepared = True @@ -105,6 +111,20 @@ def __init__(self, config, dataset, parent_job, model): self.triple_classification_sampler = TripleClassificationSampler( config, "triple_classification", dataset ) + self.config.check( + "triple_classification.negatives_from", ["corruption", "data"] + ) + self.negatives_from = self.config.get("triple_classification.negatives_from") + if self.negatives_from == "data": + try: + self.config.get("dataset.files.valid_negatives.type") + self.config.get("dataset.files.test_negatives.type") + except: + raise KeyError( + "No splits test/valid_negatives found for the dataset. " + "Provide a dataset with splits valid_negatives and test_negatives " + "or run triple classification with negatives_from=corruption" + ) def _prepare(self): """Prepare the corrupted validation and test data. 
@@ -121,7 +141,8 @@ def _prepare(self):
 
         self.config.log("Generate data with corrupted and true triples...")
 
-        if self.eval_split == "test":
+        # TODO maybe should be generalized to allow for other splits as valid_wo_unseen
+        if self.eval_split == "test" and self.negatives_from == "corruption":
             (
                 self.tune_data,
                 self.tune_labels,
@@ -130,7 +151,25 @@ def _prepare(self):
                 self.eval_data,
                 self.eval_labels,
             ) = self.triple_classification_sampler.sample(self.dataset.split("test"))
-        else:
+
+        elif self.eval_split == "test" and self.negatives_from == "data":
+            positives_valid = self.dataset.split("valid")
+            negatives_valid = self.dataset.split("valid_negatives")
+            self.tune_data = torch.cat((positives_valid, negatives_valid)).to(self.device)
+            self.tune_labels = torch.cat(
+                (torch.ones(positives_valid.size(0)), torch.zeros(negatives_valid.size(0)))
+            ).to(self.device)
+
+            positives_test = self.dataset.split("test")
+            negatives_test = self.dataset.split("test_negatives")
+            self.eval_data = torch.cat((positives_test, negatives_test)).to(
+                self.device)
+            self.eval_labels = torch.cat(
+                (torch.ones(positives_test.size(0)),
+                 torch.zeros(negatives_test.size(0)))
+            ).to(self.device)
+
+        elif self.eval_split == "valid" and self.negatives_from == "corruption":
             (
                 self.tune_data,
                 self.tune_labels,
@@ -140,6 +179,17 @@ def _prepare(self):
                 self.eval_labels,
             ) = self.triple_classification_sampler.sample(self.dataset.split("valid"))
 
+        elif self.eval_split == "valid" and self.negatives_from == "data":
+            positives = self.dataset.split("valid")
+            negatives = self.dataset.split("valid_negatives")
+            self.tune_data = torch.cat((positives, negatives)).to(self.device)
+            self.tune_labels = torch.cat(
+                (torch.ones(positives.size(0)), torch.zeros(negatives.size(0)))
+            ).to(self.device)
+
+            self.eval_data = self.tune_data
+            self.eval_labels = self.tune_labels
+
         # let the model add some hooks, if it wants to do so
         self.model.prepare_job(self)
         self.valid_data_is_prepared = True
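
For reference, the evaluation protocol that findThresholds and predict implement after patch 13 (tune one score threshold per relation that maximizes accuracy on labeled tuning triples, then classify evaluation triples against those thresholds) can be sketched standalone as follows. This sketch is illustrative only and is not part of the patch series: the function names (tune_thresholds, classify) and the random toy scores merely stand in for the job's tensors and for model.score_spo(...).

    import torch


    def tune_thresholds(p_tune, tune_scores, tune_labels):
        """Per relation, pick the score threshold that maximizes accuracy on the
        tuning triples of that relation; candidate thresholds are the observed
        scores themselves, as in findThresholds."""
        thresholds = {}
        for r in p_tune.unique():
            mask = p_tune == r
            scores = tune_scores[mask]           # scores of this relation's triples
            labels = tune_labels[mask].bool()    # True = positive triple
            # predictions[i, j]: is triple j predicted positive with threshold scores[i]?
            predictions = scores.view(1, -1) >= scores.view(-1, 1)
            accuracies = (predictions == labels.view(1, -1)).float().mean(dim=1)
            best = accuracies == accuracies.max()
            # smallest score among the equally good candidates, for consistency
            thresholds[int(r)] = scores[best].min()
        return thresholds


    def classify(p_eval, eval_scores, eval_labels, thresholds):
        """Accuracy of thresholded scores on the evaluation triples; relations
        without a tuned threshold are skipped (cf. not_in_eval)."""
        correct, total = 0, 0
        for r in p_eval.unique():
            if int(r) not in thresholds:
                continue
            mask = p_eval == r
            pred = eval_scores[mask] >= thresholds[int(r)]
            correct += (pred == eval_labels[mask].bool()).sum().item()
            total += int(mask.sum())
        return correct / total


    # toy usage with random scores standing in for model.score_spo(...)
    if __name__ == "__main__":
        torch.manual_seed(0)
        p = torch.randint(0, 3, (200,))            # relation ids
        labels = torch.randint(0, 2, (200,))       # 1 = positive, 0 = negative
        scores = torch.randn(200) + labels.float() # positives tend to score higher
        thresholds = tune_thresholds(p, scores, labels)
        print("accuracy:", classify(p, scores, labels, thresholds))

Using the observed scores themselves as candidate thresholds, as findThresholds does, avoids the interval search over [min_score, max_score] from the commented-out alternative and needs no model-specific step size.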