From 49ed5ef2ae00ed240bc573527a9922da05d26b21 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Mon, 29 Jul 2019 14:32:34 +0200 Subject: [PATCH 01/19] Implemented triple classification --- examples/toy-complex-train-tripleclass.yaml | 16 ++ kge/job/__init__.py | 1 + kge/job/eval.py | 6 +- kge/job/triple_classification.py | 246 ++++++++++++++++++++ 4 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 examples/toy-complex-train-tripleclass.yaml create mode 100644 kge/job/triple_classification.py diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml new file mode 100644 index 000000000..864f2280f --- /dev/null +++ b/examples/toy-complex-train-tripleclass.yaml @@ -0,0 +1,16 @@ +job.type: train +dataset.name: toy +model: distmult +train: + optimizer: Adagrad + optimizer_args: + lr: 0.2 + weight_decay: 0.4e-7 +lookup_embedder.dim: 100 +#lookup_embedder.initialize: normal_ +lookup_embedder.initialize: xavier_uniform_ +eval.type: triple_classification +valid.metric: Accuracy +eval.thresholds: valid +eval.test: test + diff --git a/kge/job/__init__.py b/kge/job/__init__.py index c3bee8a37..de00c257b 100644 --- a/kge/job/__init__.py +++ b/kge/job/__init__.py @@ -9,3 +9,4 @@ from kge.job.ax_search import AxSearchJob from kge.job.entity_ranking import EntityRankingJob from kge.job.entity_pair_ranking import EntityPairRankingJob +from kge.job.triple_classification import TripleClassificationJob diff --git a/kge/job/eval.py b/kge/job/eval.py index 138726728..d97e838a9 100644 --- a/kge/job/eval.py +++ b/kge/job/eval.py @@ -72,7 +72,7 @@ def __init__(self, config, dataset, parent_job, model): @staticmethod def create(config, dataset, parent_job=None, model=None): """Factory method to create an evaluation job """ - from kge.job import EntityRankingJob, EntityPairRankingJob + from kge.job import EntityRankingJob, EntityPairRankingJob, TripleClassificationJob # create the job if config.get("eval.type") == "entity_ranking": @@ -81,6 +81,10 @@ def create(config, dataset, parent_job=None, model=None): return EntityPairRankingJob( config, dataset, parent_job=parent_job, model=model ) + elif config.get("eval.type") == "triple_classification": + return TripleClassificationJob( + config, dataset, parent_job=parent_job, model=model + ) else: raise ValueError("eval.type") diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py new file mode 100644 index 000000000..457f26836 --- /dev/null +++ b/kge/job/triple_classification.py @@ -0,0 +1,246 @@ +import time +import random + +import torch +from sklearn.metrics import accuracy_score, precision_score +from kge.job import EvaluationJob + + +class TripleClassificationJob(EvaluationJob): + """Triple classification evaluation protocol: + Testing model's ability to discriminate between true and false triples based on scores. Introduces a treshold for + each relation. Unseen triples will be predicted as True if the score is higher than the treshold. + Todo: Get rid of as many for loops as possible to make the evaluation faster!! 
+ """ + def __init__(self, config, dataset, parent_job, model): + super().__init__(config, dataset, parent_job, model) + self.threshold_data = self.config.get("eval.thresholds") + self.eval_data = self.config.get("eval.test") #Todo: Use eval.data and delete eval.test in configuration (didnt work for some reason) + self.is_prepared = False + + def _prepare(self): + """Load specified data.""" + + if self.is_prepared: + return + + # Set test dataset + if self.eval_data == "test": + self.eval = self.dataset.test + else: + self.eval = self.dataset.valid + + # Set dataset for which thresholds are found + if self.threshold_data == "valid": + self.threshold = self.dataset.valid + else: self.threshold = self.dataset.train + + # let the model add some hooks, if it wants to do so + self.model.prepare_job(self) + self.is_prepared = True + + def run(self): + """1. Generation of (corrupted) negative triples: + Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. + Allow only entities which appeared at the given position in the dataset + 2. Get scores for the corrupted datasets + 3. Find the best threshold for every relation by maximizing accuracy on validation data + 4. Classify triples in test data + 5. Compute Metrics for test data + 6. Trace & Log + """ + self._prepare() + + was_training = self.model.training #Todo-Question: Copied that from entity ranking but don't know if it is needed + self.model.eval() + + self.config.log("Starting triple classification...") + epoch_time = -time.time() + + # 1. Generate corrupted data. Output: triples, labels, labels per relation + self.config.log("Generate corrupted datasets...") + valid_corrupted, valid_labels, rel_valid_labels = self._generate_negatives(self.threshold) + test_corrupted, test_labels, rel_test_labels = self._generate_negatives(self.eval) + + # 2. Get scores for the new data. Relevant Output: Scores and scores per relation + self.config.log("Get scores for datasets...") + s_valid, p_valid, o_valid = valid_corrupted[:, 0], valid_corrupted[:, 1], valid_corrupted[:, 2] + valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) + rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} + + s_test, p_test, o_test = test_corrupted[:, 0], test_corrupted[:, 1], test_corrupted[:, 2] + test_scores = self.model.score_spo(s_test, p_test, o_test) + rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} + + # 3. Find the best thresholds for every relation and their accuracies on the valid data + self.config.log("Learning thresholds on " + self.threshold_data + " data.") + rel_thresholds, accuracies_valid = self.findThresholds(p_valid, rel_valid_scores, rel_valid_labels) + + # 4. Classification on test data. Output: predictions per relation and number of relations in test which are + # not included in valid + self.config.log("Evaluating on " + self.eval_data + " data.") + self.config.log("Predict...") + rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) + + # 5. Report Metrics on test data + self.config.log("Classification results:") + metrics = self._compute_metrics(rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) + + # 6. 
Trace & Log + + epoch_time += time.time() + # compute trace + trace_entry = dict( + type="triple_classification", + scope="epoch", + data_learn_thresholds=self.threshold_data, + data_evaluate=self.eval_data, + epoch=self.epoch, + size=2*len(self.eval), + epoch_time=epoch_time, + **metrics, + ) + for f in self.post_epoch_trace_hooks: + f(self, trace_entry) + + # if validation metric is not present, try to compute it + metric_name = self.config.get("valid.metric") + if metric_name not in trace_entry: + trace_entry[metric_name] = eval( + self.config.get("valid.metric_expr"), + None, + {"config": self.config, **trace_entry}, + ) + + # write out trace + trace_entry = self.trace(**trace_entry, echo=True, echo_prefix=" ", log=True) + + # reset model and return metrics + if was_training: + self.model.train() + self.config.log("Finished evaluating on " + self.eval_data + " data.") + + return trace_entry + # Todo-Question: Not sure if what is included in the trace is correct or enough. Feedback needed. + + def _generate_negatives(self, dataset): + # 1. Corrupt triples + labels = [] + corrupted = [] + for triple in dataset: + corrupted.append(triple) + labels.append(1) + # Random decision if sample subject(False) or object(True) + if bool(random.getrandbits(1))==True: + s = corrupted[-1][0] + p = corrupted[-1][1] + o = random.sample(list(dataset[:,2]), 1)[0] + # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset + while int(s)==int(o) \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train\ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid\ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: + o = random.sample(list(dataset[:,2]), 1)[0] + else: + s = random.sample(list(dataset[:,0]), 1)[0] + p = corrupted[-1][1] + o = corrupted[-1][2] + # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset + while int(s) == int(o) \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid \ + and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: + o = random.sample(list(dataset[:,0]), 1)[0] + + corrupted.append(torch.tensor([s, p, o], dtype=torch.int32)) + labels.append(0) + corrupted = torch.stack(corrupted) + + # TODO-Question: Would it make sense to use and modify util.sampler for that task? + # TODO-Question: Right now we allow only samples at the position where they appeared and only from the same dataset as specified. + # Would it make sense to allow to sample from all three available datasets? 
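One possible shape for such a sampling step, sketched here for illustration only (the helper name, its signature and the `known` set of true (s, p, o) tuples are assumptions, not code from this patch): replacements are drawn in bulk and only the rare collisions are re-drawn, which avoids most of the per-triple Python work and makes the "not a true triple" check explicit via a set of tuples.

import torch

def corrupt_once(dataset: torch.Tensor, known: set) -> torch.Tensor:
    """Corrupt each triple once by replacing either its subject or its object.

    dataset: (N, 3) integer tensor of true triples
    known:   set of all true (s, p, o) tuples from train/valid/test
    """
    n = dataset.size(0)
    corrupted = dataset.clone()
    subj_pool, obj_pool = dataset[:, 0].unique(), dataset[:, 2].unique()
    corrupt_subj = torch.rand(n) < 0.5  # per triple: corrupt subject or object?

    # draw all replacements at once
    corrupted[corrupt_subj, 0] = subj_pool[torch.randint(len(subj_pool), (int(corrupt_subj.sum()),))]
    corrupted[~corrupt_subj, 2] = obj_pool[torch.randint(len(obj_pool), (int((~corrupt_subj).sum()),))]

    # re-draw only the few corrupted triples that are still true somewhere or have s == o
    for i in range(n):
        s, p, o = (int(x) for x in corrupted[i])
        while s == o or (s, p, o) in known:
            if corrupt_subj[i]:
                s = int(subj_pool[torch.randint(len(subj_pool), (1,))])
            else:
                o = int(obj_pool[torch.randint(len(obj_pool), (1,))])
        corrupted[i, 0], corrupted[i, 2] = s, o
    return corrupted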
+ + # Save the labels per relation, since this will be needed frequently later + p = corrupted[:, 1] + rel_labels = {int(r): [labels[int((p == r).nonzero()[i])] + for i in range(len((p == r).nonzero()))] for r in p.unique()} + + return corrupted, labels, rel_labels + + def findThresholds(self, p, rel_scores, rel_labels): + # Initialize accuracies, thresholds (and predictions) + rel_accuracies = {int(r): -1 for r in p.unique()} + rel_thresholds = {int(r): 0 for r in p.unique()} +# rel_predictions = {int(r): 0 for r in p.unique()} + + # Find best thresholds + for r in p.unique(): + for t in rel_scores[int(r)]: + preds = torch.zeros(len((p == r).nonzero())) + for i in range(len(rel_scores[int(r)])): + if rel_scores[int(r)][i] >= t: + preds[i] = 1 + accuracy = accuracy_score(rel_labels[int(r)], preds) + if accuracy > rel_accuracies[int(r)]: + rel_accuracies[int(r)] = accuracy + rel_thresholds[int(r)] = float(t) + #rel_predictions[int(r)] = preds + + return rel_thresholds, rel_accuracies + + def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + + rel_predictions = {int(r):[0]*len(rel_scores[int(r)]) for r in p_test.unique()} + + # Set counter for triples for which the relation is not in valid data + not_in_eval = [] + for r in p_test.unique(): + # Check if relation which is in valid data also is in test data + if r in p_valid.unique(): + # Predict + for i in range(len(rel_scores[int(r)])): + if float(rel_scores[int(r)][i]) >= rel_thresholds[int(r)]: + rel_predictions[int(r)][i] = 1 + else: not_in_eval.append(r) + + return rel_predictions, not_in_eval + + def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval): + metrics = {} + + labels_in_test_list = [i + for r in p_test.unique() + for i in rel_test_labels[int(r)]] + + pred_list = [i + for r in p_test.unique() + for i in rel_predictions[int(r)]] + + + metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) + metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) + + precision_per_r = {} + accuracy_per_r = {} + for r in p_test.unique(): + precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + # Todo: Find out what the warning "UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. + # 'precision', 'predicted', average, warn_for)" is. + metrics["Accuracy_per_Relation"] = accuracy_per_r + + metrics["Precision_Per_Relation"] = precision_per_r + + # Since we evaluate on test data, only the relations in the test data which cannot be evaluated are counted here. + # In general we miss more than teh half of the existing relations for toy data, because they are not in test/valid. + metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) + + return metrics + + # TODO-Question: We optimized the tresholds only for one randomly corrupted sample of the data. + # Another sample would give (a little) different results due to a different threshold. + # I would probably optimize the thresholds for different samples and in the end take something like the mean of all + # thresholds as final threshold, but in the literature, it seems like they really corrupt the data only once. 
+ # Anyway for comparison of models, we have to pay attention to use the same data samples.Thus it might be better to + # create and save a dataset with negative labels and use always the same for all models. + # Any feedback on this? \ No newline at end of file From a6aec4fb637dbf99c06d4d4ba41aead75b3383a2 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Tue, 20 Aug 2019 12:26:30 +0200 Subject: [PATCH 02/19] got rid of unnecessary codelines, Improved classification time to ~15sec on fb15k, implemented an alternative way to find the thresholds, different slight changes --- examples/toy-transe-train-tripleclass.yaml | 58 ++++++ kge/job/triple_classification.py | 195 ++++++++++----------- 2 files changed, 155 insertions(+), 98 deletions(-) create mode 100644 examples/toy-transe-train-tripleclass.yaml diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml new file mode 100644 index 000000000..2119bc1ab --- /dev/null +++ b/examples/toy-transe-train-tripleclass.yaml @@ -0,0 +1,58 @@ +job: + device: cuda + type: train + +model: transe + +dataset: + name: fb15k + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 0.2 + max_epochs: 200 + optimizer: Adagrad + optimizer_args: + lr: 0.01 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_weight: 1.e-05 + l_norm: 1. diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 457f26836..643dacb9d 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,5 +1,7 @@ import time import random +import itertools +from copy import deepcopy import torch from sklearn.metrics import accuracy_score, precision_score @@ -10,34 +12,17 @@ class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol: Testing model's ability to discriminate between true and false triples based on scores. Introduces a treshold for each relation. Unseen triples will be predicted as True if the score is higher than the treshold. - Todo: Get rid of as many for loops as possible to make the evaluation faster!! + """ def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) - self.threshold_data = self.config.get("eval.thresholds") - self.eval_data = self.config.get("eval.test") #Todo: Use eval.data and delete eval.test in configuration (didnt work for some reason) - self.is_prepared = False - - def _prepare(self): - """Load specified data.""" - - if self.is_prepared: - return - - # Set test dataset - if self.eval_data == "test": - self.eval = self.dataset.test - else: - self.eval = self.dataset.valid - # Set dataset for which thresholds are found - if self.threshold_data == "valid": - self.threshold = self.dataset.valid - else: self.threshold = self.dataset.train + # 1. Generate corrupted data. 
Output: triples, labels, labels per relation + self.config.log("Generate corrupted datasets...") + # Create the corrupted triples while creating the evaluation Job to make sure that every epoch is evaluated on the same data + self.valid_corrupted, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) + self.test_corrupted, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) - # let the model add some hooks, if it wants to do so - self.model.prepare_job(self) - self.is_prepared = True def run(self): """1. Generation of (corrupted) negative triples: @@ -49,42 +34,38 @@ def run(self): 5. Compute Metrics for test data 6. Trace & Log """ - self._prepare() - was_training = self.model.training #Todo-Question: Copied that from entity ranking but don't know if it is needed + was_training = self.model.training self.model.eval() self.config.log("Starting triple classification...") epoch_time = -time.time() - # 1. Generate corrupted data. Output: triples, labels, labels per relation - self.config.log("Generate corrupted datasets...") - valid_corrupted, valid_labels, rel_valid_labels = self._generate_negatives(self.threshold) - test_corrupted, test_labels, rel_test_labels = self._generate_negatives(self.eval) + # 1. Generate corrupted data - already done # 2. Get scores for the new data. Relevant Output: Scores and scores per relation self.config.log("Get scores for datasets...") - s_valid, p_valid, o_valid = valid_corrupted[:, 0], valid_corrupted[:, 1], valid_corrupted[:, 2] + s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} - s_test, p_test, o_test = test_corrupted[:, 0], test_corrupted[:, 1], test_corrupted[:, 2] + s_test, p_test, o_test = self.test_corrupted[:, 0], self.test_corrupted[:, 1], self.test_corrupted[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} # 3. Find the best thresholds for every relation and their accuracies on the valid data - self.config.log("Learning thresholds on " + self.threshold_data + " data.") - rel_thresholds, accuracies_valid = self.findThresholds(p_valid, rel_valid_scores, rel_valid_labels) + self.config.log("Learning thresholds on validation data.") + rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) # 4. Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid - self.config.log("Evaluating on " + self.eval_data + " data.") + self.config.log("Evaluating on test data.") self.config.log("Predict...") - rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) + rel_predictions, not_in_eval = self.predict(rel_thresholds, test_scores, rel_test_scores, p_valid, p_test) # 5. Report Metrics on test data self.config.log("Classification results:") - metrics = self._compute_metrics(rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) + metrics = self._compute_metrics(self.rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) # 6. 
Trace & Log @@ -93,10 +74,10 @@ def run(self): trace_entry = dict( type="triple_classification", scope="epoch", - data_learn_thresholds=self.threshold_data, - data_evaluate=self.eval_data, + data_learn_thresholds="Valid", + data_evaluate="Test", epoch=self.epoch, - size=2*len(self.eval), + size=2*len(self.dataset.valid), epoch_time=epoch_time, **metrics, ) @@ -125,40 +106,33 @@ def run(self): def _generate_negatives(self, dataset): # 1. Corrupt triples - labels = [] - corrupted = [] - for triple in dataset: - corrupted.append(triple) - labels.append(1) - # Random decision if sample subject(False) or object(True) - if bool(random.getrandbits(1))==True: - s = corrupted[-1][0] - p = corrupted[-1][1] - o = random.sample(list(dataset[:,2]), 1)[0] - # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset - while int(s)==int(o) \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train\ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid\ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: - o = random.sample(list(dataset[:,2]), 1)[0] - else: - s = random.sample(list(dataset[:,0]), 1)[0] - p = corrupted[-1][1] - o = corrupted[-1][2] - # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset - while int(s) == int(o) \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.train \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.valid \ - and torch.tensor([s, p, o], dtype=torch.int32) in self.dataset.test: - o = random.sample(list(dataset[:,0]), 1)[0] - - corrupted.append(torch.tensor([s, p, o], dtype=torch.int32)) - labels.append(0) - corrupted = torch.stack(corrupted) - - # TODO-Question: Would it make sense to use and modify util.sampler for that task? - # TODO-Question: Right now we allow only samples at the position where they appeared and only from the same dataset as specified. - # Would it make sense to allow to sample from all three available datasets? + corrupted = dataset.repeat(1, 2).view(-1, 3) + labels = torch.as_tensor([1, 0] * len(dataset)) + + sample = torch.randint(0,2,(1,len(dataset))) + + # Random decision if sample subject(sample=nonzero) or object(sample=zero) + corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ + torch.as_tensor(random.sample( + list(map(int, dataset[:, 0])), len(corrupted[1::2][:, 0][sample.nonzero()[:, 1]])), dtype=torch.int32) + + corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ + torch.as_tensor(random.sample( + list(map(int, dataset[:, 2])), len(corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]])), dtype=torch.int32) + + # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset + for i in range(len(corrupted[1::2])): + while int(corrupted[1::2][i][0]) == int(corrupted[1::2][i][2]) \ + and corrupted[1::2][i] in self.dataset.train \ + and corrupted[1::2][i] in self.dataset.valid \ + and corrupted[1::2][i] in self.dataset.test: + if bool(random.getrandbits(1)) == True: + corrupted[1::2][i][2] = random.sample(list(dataset[:, 2]), 1)[0] + else: + corrupted[1::2][i][0] = random.sample(list(dataset[:, 0]), 1)[0] + + # TODO: Create a function in util.sampler for that task. Then: Allow to choose from which entities to sample + # (e.g. from test, train and valid entities instead of only valid. 
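As a starting point for such a util.sampler helper (name and signature are illustrative assumptions, not an existing function in the code base), the candidate pools could be built once from whichever splits are requested:

import torch

def build_entity_pools(splits, positions=(0, 2)):
    """Collect the entities seen at each triple position across the given splits.

    splits:    list of (N, 3) tensors, e.g. [train, valid, test]
    positions: which columns to build pools for (0 = subject, 2 = object)
    Returns a dict mapping position -> 1-D tensor of unique entity ids.
    """
    return {
        pos: torch.cat([split[:, pos] for split in splits]).unique()
        for pos in positions
    }

A corruption routine could then draw replacement subjects from pools[0] and replacement objects from pools[2], independent of which split is being corrupted.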
# Save the labels per relation, since this will be needed frequently later p = corrupted[:, 1] @@ -167,30 +141,62 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels - def findThresholds(self, p, rel_scores, rel_labels): + def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): + """Method 1: Threshold is always one of the scores""" # Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} -# rel_predictions = {int(r): 0 for r in p.unique()} - # Find best thresholds + valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + + for r in p.unique(): - for t in rel_scores[int(r)]: - preds = torch.zeros(len((p == r).nonzero())) - for i in range(len(rel_scores[int(r)])): - if rel_scores[int(r)][i] >= t: - preds[i] = 1 - accuracy = accuracy_score(rel_labels[int(r)], preds) + #Predict + current_rel = (valid_data[:, 1] == r) + true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + preds = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) + accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] + + rel_accuracies[int(r)] = max(accuracy) + # Todo: Sometimes different scores can be the largest. Add condition, that always the largest/smalles/something else score that gives the maximum accuracy is chosen + rel_thresholds[int(r)] = rel_scores[int(r)][accuracy.index(max(accuracy))] + + + """Method 2: Search for best threshold in an interval + https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py + # Initialize accuracies, thresholds (and predictions) + min_score = valid_scores.min() + max_score = valid_scores.max() + + rel_accuracies = {int(r): -1 for r in p.unique()} + rel_thresholds = {int(r): min_score for r in p.unique()} + + score = min_score + + # ORiginal implementation uses an interval 0.01, implemented for NTN model. 
In general the interval imo should depend on the range of the score values of the model + # Suggestion: float((max_score-min_score)/len(valid_scores)) + interval = float((max_score-min_score)/len(valid_scores)) + valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + + while(score<=max_score): + for r in p.unique(): + #Predict + current_rel = (valid_data[:, 1] == r) + true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) + accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) + if accuracy > rel_accuracies[int(r)]: rel_accuracies[int(r)] = accuracy - rel_thresholds[int(r)] = float(t) - #rel_predictions[int(r)] = preds + rel_thresholds[int(r)] = score.clone() + score += interval + """ return rel_thresholds, rel_accuracies - def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): - rel_predictions = {int(r):[0]*len(rel_scores[int(r)]) for r in p_test.unique()} + rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])) for r in p_test.unique()} # Set counter for triples for which the relation is not in valid data not_in_eval = [] @@ -216,7 +222,7 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no for r in p_test.unique() for i in rel_predictions[int(r)]] - + # Todo: Calculate accuracy and precision instead of using sklearn function metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) @@ -225,22 +231,15 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no for r in p_test.unique(): precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - # Todo: Find out what the warning "UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. - # 'precision', 'predicted', average, warn_for)" is. + metrics["Accuracy_per_Relation"] = accuracy_per_r metrics["Precision_Per_Relation"] = precision_per_r - # Since we evaluate on test data, only the relations in the test data which cannot be evaluated are counted here. - # In general we miss more than teh half of the existing relations for toy data, because they are not in test/valid. + metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) return metrics - # TODO-Question: We optimized the tresholds only for one randomly corrupted sample of the data. - # Another sample would give (a little) different results due to a different threshold. - # I would probably optimize the thresholds for different samples and in the end take something like the mean of all - # thresholds as final threshold, but in the literature, it seems like they really corrupt the data only once. - # Anyway for comparison of models, we have to pay attention to use the same data samples.Thus it might be better to - # create and save a dataset with negative labels and use always the same for all models. - # Any feedback on this? \ No newline at end of file + # TODO-Question: We optimized the thresholds only for one randomly corrupted sample of the data. + # Another sample would give (a little) different results. 
How can we ǵet reproduceable results? From af64a5476a9d6d0d89eccd47aead1012edd881f9 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Thu, 22 Aug 2019 17:03:49 +0200 Subject: [PATCH 03/19] Integrate _prepare function, delete conditions in generate_negatives which were not used in other implementations, include unique condition while sampling negatives from list to ensure same probability, Change find_thresholds so that the smallest score which gives the highest accuracy is used as threshold --- examples/toy-transe-train-tripleclass.yaml | 17 +-- kge/job/triple_classification.py | 155 +++++++++++---------- 2 files changed, 92 insertions(+), 80 deletions(-) diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index 2119bc1ab..5cf81a2b7 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -5,7 +5,7 @@ job: model: transe dataset: - name: fb15k + name: toy train: batch_size: 256 @@ -19,24 +19,21 @@ train: negative_sampling: num_negatives_o: 3 - num_negatives_p: 0 num_negatives_s: 3 sampling_type: uniform valid: early_stopping.patience: 5 every: 5 - filter_with_test: True metric: Accuracy eval: - batch_size: 512 type: triple_classification transe: class_name: TransE entity_embedder: - dim: 100 + dim: 128 initialize: uniform_ initialize_args: uniform_ : @@ -44,9 +41,11 @@ transe: sparse: false type: lookup_embedder regularize: l2 - regularize_weight: 1.e-05 + regularize_args: + weight: 1.e-05 + weighted: False relation_embedder: - dim: 100 + dim: 128 initialize: uniform_ initialize_args: uniform_ : @@ -54,5 +53,7 @@ transe: sparse: false type: lookup_embedder regularize: l2 - regularize_weight: 1.e-05 + regularize_args: + weight: 1.e-05 + weighted: False l_norm: 1. diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 643dacb9d..54a1185f6 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,7 +1,5 @@ import time import random -import itertools -from copy import deepcopy import torch from sklearn.metrics import accuracy_score, precision_score @@ -9,31 +7,49 @@ class TripleClassificationJob(EvaluationJob): - """Triple classification evaluation protocol: - Testing model's ability to discriminate between true and false triples based on scores. Introduces a treshold for - each relation. Unseen triples will be predicted as True if the score is higher than the treshold. - + """Triple classification evaluation protocol. + + Testing model's ability to discriminate between true and false triples based on scores. Introduces a threshold for + each relation. Unseen triples will be predicted as True if the score is higher than the threshold. Procedure: + + 1. Generation of (corrupted) negative triples: + Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. + Allow only entities which appeared at the given position in the dataset + 2. Get scores for the corrupted datasets + 3. Find the best threshold for every relation by maximizing accuracy on validation data + 4. Classify triples in test data + 5. Compute Metrics for test data + 6. 
Report metrics in Trace + # Todo: Check where it is necessary to add .to(self.device) to created tensors + # Todo: Change comments to fit the standard guidelines + # Todo: Find out if it makes sense to use a dataloader with the relations as batches with a _collate function + # Todo: Check all datatypes and make them consistent where possible """ + def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) + self.is_prepared = False + + def _prepare(self): + """Construct the datasets needed.""" - # 1. Generate corrupted data. Output: triples, labels, labels per relation + if self.is_prepared: + return + + # 1. Generate corrupted data self.config.log("Generate corrupted datasets...") # Create the corrupted triples while creating the evaluation Job to make sure that every epoch is evaluated on the same data self.valid_corrupted, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) self.test_corrupted, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + # let the model add some hooks, if it wants to do so + self.model.prepare_job(self) + self.is_prepared = True def run(self): - """1. Generation of (corrupted) negative triples: - Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. - Allow only entities which appeared at the given position in the dataset - 2. Get scores for the corrupted datasets - 3. Find the best threshold for every relation by maximizing accuracy on validation data - 4. Classify triples in test data - 5. Compute Metrics for test data - 6. Trace & Log - """ + """Runs the triple classification job.""" + + self._prepare() was_training = self.model.training self.model.eval() @@ -41,8 +57,6 @@ def run(self): self.config.log("Starting triple classification...") epoch_time = -time.time() - # 1. Generate corrupted data - already done - # 2. Get scores for the new data. 
Relevant Output: Scores and scores per relation self.config.log("Get scores for datasets...") s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] @@ -109,27 +123,20 @@ def _generate_negatives(self, dataset): corrupted = dataset.repeat(1, 2).view(-1, 3) labels = torch.as_tensor([1, 0] * len(dataset)) + # Random decision if sample subject(sample=nonzero) or object(sample=zero) sample = torch.randint(0,2,(1,len(dataset))) - # Random decision if sample subject(sample=nonzero) or object(sample=zero) + # Sample subjects from subjects which appeared in the dataset corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ - torch.as_tensor(random.sample( - list(map(int, dataset[:, 0])), len(corrupted[1::2][:, 0][sample.nonzero()[:, 1]])), dtype=torch.int32) + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32) + # Sample objects from objects which appeared in the dataset corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ - torch.as_tensor(random.sample( - list(map(int, dataset[:, 2])), len(corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]])), dtype=torch.int32) - - # Guarantee that s!=o and that the sampled triple is not a true triple of any other dataset - for i in range(len(corrupted[1::2])): - while int(corrupted[1::2][i][0]) == int(corrupted[1::2][i][2]) \ - and corrupted[1::2][i] in self.dataset.train \ - and corrupted[1::2][i] in self.dataset.valid \ - and corrupted[1::2][i] in self.dataset.test: - if bool(random.getrandbits(1)) == True: - corrupted[1::2][i][2] = random.sample(list(dataset[:, 2]), 1)[0] - else: - corrupted[1::2][i][0] = random.sample(list(dataset[:, 0]), 1)[0] + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32) + + # Todo: Add condition that corrupted triple!=original triple # TODO: Create a function in util.sampler for that task. Then: Allow to choose from which entities to sample # (e.g. from test, train and valid entities instead of only valid. @@ -142,8 +149,16 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): + # Todo: Check if methods are equivalent + #Todo-Question: Method 1 is what seems reasonable for me, Method 2 is the reimplementation of the NTNH Paper of Socher et al. 2013. + # Method 1 is much faster and delivers equally good results. Since the threshold entirely is determined by the valid_scores + # and is a cut between them, the best threshold in terms of valid data is any value between two specific score values. + # Thus I assume, that we can just use one of these score values as the threshold, since we can't know better anyway. + # Is this thought correct? + # If not and Method 2 has to be used, how can it be fastened up? + """Method 1: Threshold is always one of the scores""" - # Initialize accuracies, thresholds (and predictions) + #Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} @@ -158,40 +173,39 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] rel_accuracies[int(r)] = max(accuracy) - # Todo: Sometimes different scores can be the largest. 
Add condition, that always the largest/smalles/something else score that gives the maximum accuracy is chosen - rel_thresholds[int(r)] = rel_scores[int(r)][accuracy.index(max(accuracy))] + # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent with original implementation + rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))]) + + # #Method 2: Search for best threshold in an interval + # #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py + # # Initialize accuracies, thresholds (and predictions) + # min_score = valid_scores.min() + # max_score = valid_scores.max() + # + # rel_accuracies = {int(r): -1 for r in p.unique()} + # rel_thresholds = {int(r): min_score for r in p.unique()} + # + # score = min_score + # + # # ORiginal implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should depend on the range of the score values of the model + # # Suggestion: float((max_score-min_score)/len(valid_scores)) + # interval = float((max_score-min_score)/len(valid_scores)) + # valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + # + # while(score<=max_score): + # for r in p.unique(): + # #Predict + # current_rel = (valid_data[:, 1] == r) + # true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + # preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) + # accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) + # + # if accuracy > rel_accuracies[int(r)]: + # rel_accuracies[int(r)] = accuracy + # rel_thresholds[int(r)] = score.clone() + # + # score += interval - - """Method 2: Search for best threshold in an interval - https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py - # Initialize accuracies, thresholds (and predictions) - min_score = valid_scores.min() - max_score = valid_scores.max() - - rel_accuracies = {int(r): -1 for r in p.unique()} - rel_thresholds = {int(r): min_score for r in p.unique()} - - score = min_score - - # ORiginal implementation uses an interval 0.01, implemented for NTN model. 
In general the interval imo should depend on the range of the score values of the model - # Suggestion: float((max_score-min_score)/len(valid_scores)) - interval = float((max_score-min_score)/len(valid_scores)) - valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) - - while(score<=max_score): - for r in p.unique(): - #Predict - current_rel = (valid_data[:, 1] == r) - true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) - accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) - - if accuracy > rel_accuracies[int(r)]: - rel_accuracies[int(r)] = accuracy - rel_thresholds[int(r)] = score.clone() - - score += interval - """ return rel_thresholds, rel_accuracies def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): @@ -239,7 +253,4 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) - return metrics - - # TODO-Question: We optimized the thresholds only for one randomly corrupted sample of the data. - # Another sample would give (a little) different results. How can we ǵet reproduceable results? + return metrics \ No newline at end of file From c18e5f8ae6beb7829eb74dd9da5e6f67d6e82bf9 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Thu, 17 Oct 2019 15:36:39 +0200 Subject: [PATCH 04/19] Fixed some minor Todos, improved documentation --- kge/job/triple_classification.py | 111 ++++++++++++++++--------------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 54a1185f6..de2bb7c45 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -20,10 +20,10 @@ class TripleClassificationJob(EvaluationJob): 4. Classify triples in test data 5. Compute Metrics for test data 6. Report metrics in Trace - # Todo: Check where it is necessary to add .to(self.device) to created tensors # Todo: Change comments to fit the standard guidelines - # Todo: Find out if it makes sense to use a dataloader with the relations as batches with a _collate function # Todo: Check all datatypes and make them consistent where possible + # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function + # Todo: Make printing out predictions per relation optionally with recent additions in config_default """ def __init__(self, config, dataset, parent_job, model): @@ -48,16 +48,16 @@ def _prepare(self): def run(self): """Runs the triple classification job.""" - + self.config.log("Starting triple classification...") self._prepare() + # Todo Question: What is the purpose of was_training? It was in entity ranking and it already was_training = self.model.training self.model.eval() - self.config.log("Starting triple classification...") epoch_time = -time.time() - # 2. Get scores for the new data. Relevant Output: Scores and scores per relation + # 2. Get scores for the corrupted valid and test data self.config.log("Get scores for datasets...") s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) @@ -70,7 +70,7 @@ def run(self): # 3. 
Find the best thresholds for every relation and their accuracies on the valid data self.config.log("Learning thresholds on validation data.") rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) - + print(rel_thresholds) # 4. Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid self.config.log("Evaluating on test data.") @@ -116,30 +116,28 @@ def run(self): self.config.log("Finished evaluating on " + self.eval_data + " data.") return trace_entry - # Todo-Question: Not sure if what is included in the trace is correct or enough. Feedback needed. def _generate_negatives(self, dataset): # 1. Corrupt triples corrupted = dataset.repeat(1, 2).view(-1, 3) - labels = torch.as_tensor([1, 0] * len(dataset)) + labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) # Random decision if sample subject(sample=nonzero) or object(sample=zero) - sample = torch.randint(0,2,(1,len(dataset))) + sample = torch.randint(0,2,(1,len(dataset))).to(self.device) # Sample subjects from subjects which appeared in the dataset corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32) + list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) # Sample objects from objects which appeared in the dataset corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32) - - # Todo: Add condition that corrupted triple!=original triple + list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) - # TODO: Create a function in util.sampler for that task. Then: Allow to choose from which entities to sample - # (e.g. from test, train and valid entities instead of only valid. + # TODO: Create a function in util.sampler for that task. Optionally include: Allow to choose from which entities + # to sample (e.g. from test, train and valid entities instead of only valid; + # Add condition that corrupted triple!=original triple # Save the labels per relation, since this will be needed frequently later p = corrupted[:, 1] @@ -149,20 +147,25 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): - # Todo: Check if methods are equivalent - #Todo-Question: Method 1 is what seems reasonable for me, Method 2 is the reimplementation of the NTNH Paper of Socher et al. 2013. - # Method 1 is much faster and delivers equally good results. Since the threshold entirely is determined by the valid_scores - # and is a cut between them, the best threshold in terms of valid data is any value between two specific score values. - # Thus I assume, that we can just use one of these score values as the threshold, since we can't know better anyway. - # Is this thought correct? - # If not and Method 2 has to be used, how can it be fastened up? + #Todo-Question: Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the + # NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. Since the + # threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of + # valid data is any value between two specific score values. 
Thus I assume, that we can just use one of these + # score values as the threshold, since we can't know better anyway. Is this thought correct? + # The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The + # reason most likely is, that it is really a better threshold, since it is more based on the data than just the + # lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two + # methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and + # the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only + # important for us to make comparisons inside our framework possible, then I would prefer Method 1. """Method 1: Threshold is always one of the scores""" #Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} - valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) + # Change the scores to be entries instead of separated lists the tensor + valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) for r in p.unique(): @@ -176,41 +179,42 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent with original implementation rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))]) - # #Method 2: Search for best threshold in an interval - # #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py - # # Initialize accuracies, thresholds (and predictions) - # min_score = valid_scores.min() - # max_score = valid_scores.max() - # - # rel_accuracies = {int(r): -1 for r in p.unique()} - # rel_thresholds = {int(r): min_score for r in p.unique()} - # - # score = min_score - # - # # ORiginal implementation uses an interval 0.01, implemented for NTN model. 
In general the interval imo should depend on the range of the score values of the model - # # Suggestion: float((max_score-min_score)/len(valid_scores)) - # interval = float((max_score-min_score)/len(valid_scores)) - # valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]) - # - # while(score<=max_score): - # for r in p.unique(): - # #Predict - # current_rel = (valid_data[:, 1] == r) - # true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - # preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) - # accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) - # - # if accuracy > rel_accuracies[int(r)]: - # rel_accuracies[int(r)] = accuracy - # rel_thresholds[int(r)] = score.clone() - # - # score += interval +# #Method 2: Search for best threshold in an interval +# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py +# # Initialize accuracies, thresholds (and predictions) +# min_score = valid_scores.min() +# max_score = valid_scores.max() +# +# rel_accuracies = {int(r): -1 for r in p.unique()} +# rel_thresholds = {int(r): min_score for r in p.unique()} +# +# score = min_score +# +# # ORiginal implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should +# # depend on the range of the score values of the model +# # Suggestion: float((max_score-min_score)/len(valid_scores)) +# interval = 0.01#float((max_score-min_score)/len(valid_scores)) +# valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) +# +# while(score<=max_score): +# for r in p.unique(): +# #Predict +# current_rel = (valid_data[:, 1] == r) +# true_labels = valid_labels[current_rel.nonzero()].type(torch.int) +# preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) +# accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) +# +# if accuracy > rel_accuracies[int(r)]: +# rel_accuracies[int(r)] = accuracy +# rel_thresholds[int(r)] = score.clone() +# +# score += interval return rel_thresholds, rel_accuracies def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): - rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])) for r in p_test.unique()} + rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} # Set counter for triples for which the relation is not in valid data not_in_eval = [] @@ -236,7 +240,6 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no for r in p_test.unique() for i in rel_predictions[int(r)]] - # Todo: Calculate accuracy and precision instead of using sklearn function metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) From b90c9d5ac6d61f37d728175a74f7c5fae584e13a Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Thu, 17 Oct 2019 16:23:06 +0200 Subject: [PATCH 05/19] Make printing out predictions per relation optionally, delete unnecessary specifications in config files --- examples/toy-complex-train-tripleclass.yaml | 3 +-- examples/toy-transe-train-tripleclass.yaml | 1 + kge/job/triple_classification.py | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml 
b/examples/toy-complex-train-tripleclass.yaml index 864f2280f..db72f7bae 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -11,6 +11,5 @@ lookup_embedder.dim: 100 lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification valid.metric: Accuracy -eval.thresholds: valid -eval.test: test +eval.metrics_per.relation_type: False diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index 5cf81a2b7..c4b25dceb 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -29,6 +29,7 @@ valid: eval: type: triple_classification + metrics_per.relation_type: True transe: class_name: TransE diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index de2bb7c45..81ab663a8 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -70,7 +70,7 @@ def run(self): # 3. Find the best thresholds for every relation and their accuracies on the valid data self.config.log("Learning thresholds on validation data.") rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) - print(rel_thresholds) + # 4. Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid self.config.log("Evaluating on test data.") @@ -243,15 +243,16 @@ def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, no metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) - precision_per_r = {} - accuracy_per_r = {} - for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + if self.config.get("eval.metrics_per.relation_type"): + precision_per_r = {} + accuracy_per_r = {} + for r in p_test.unique(): + precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - metrics["Accuracy_per_Relation"] = accuracy_per_r + metrics["Accuracy_per_Relation"] = accuracy_per_r - metrics["Precision_Per_Relation"] = precision_per_r + metrics["Precision_Per_Relation"] = precision_per_r metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) From f9d8feb0de5295ee486deb6090fb0c5c28b8c103 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Fri, 18 Oct 2019 16:15:05 +0200 Subject: [PATCH 06/19] Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) --- examples/toy-complex-train-tripleclass.yaml | 2 +- examples/toy-transe-train-tripleclass.yaml | 2 +- kge/config-default.yaml | 4 + kge/job/triple_classification.py | 130 +++++++++++++------- 4 files changed, 89 insertions(+), 49 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index db72f7bae..148fce5af 100644 --- 
a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -11,5 +11,5 @@ lookup_embedder.dim: 100 lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification valid.metric: Accuracy -eval.metrics_per.relation_type: False +eval.metrics_per.relation: False diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index c4b25dceb..f0b89704f 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -29,7 +29,7 @@ valid: eval: type: triple_classification - metrics_per.relation_type: True + metrics_per.relation: False transe: class_name: TransE diff --git a/kge/config-default.yaml b/kge/config-default.yaml index 08ca70681..d83fb1518 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -393,7 +393,11 @@ valid: # Name of the trace entry that holds the validation metric (higher value is # better) +<<<<<<< HEAD metric: mean_reciprocal_rank_filtered_with_test +======= + metric: mean_reciprocal_rank_filtered # Accuracy for triple_classification +>>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) # If the above metric is not present in trace (e.g., because a custom metric # should be used), a Python expression to compute the metric. Can refer to diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 81ab663a8..f604576f6 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -5,91 +5,106 @@ from sklearn.metrics import accuracy_score, precision_score from kge.job import EvaluationJob +"""Daniel feedback: +- Gather other results as baseline +""" + + +""" +Since last commit: Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train), +""" + class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. - Testing model's ability to discriminate between true and false triples based on scores. Introduces a threshold for - each relation. Unseen triples will be predicted as True if the score is higher than the threshold. Procedure: - - 1. Generation of (corrupted) negative triples: - Corrupt each triple in valid and test data once to get equally amount of wrong and correct triples. - Allow only entities which appeared at the given position in the dataset - 2. Get scores for the corrupted datasets - 3. Find the best threshold for every relation by maximizing accuracy on validation data - 4. Classify triples in test data - 5. Compute Metrics for test data - 6. Report metrics in Trace + Testing model's ability to discriminate between true and false triples based on scores. First, negative (corrupted) + triples are generated by randomly corrupting each triple in the validation and test data. Then the scores for each + triple, produced by the model to evaluate, is retrieved. Afterwards a threshold is determined for each relation. + The best threshold for every relation is determined by maximizing the accuracy on validation data. 
The unseen + triples from the train data will then be predicted as True if the score is higher than the threshold of the + respective relation. The metrics include accuracy and precision on test data. IF necessary the accuracy/precision + per relation can be returned as well. + """ + # Todo: Change comments to fit the standard guidelines # Todo: Check all datatypes and make them consistent where possible # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function - # Todo: Make printing out predictions per relation optionally with recent additions in config_default - """ + def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.is_prepared = False def _prepare(self): - """Construct the datasets needed.""" + """Prepare the corrupted validation and test datasets. + + The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure + that every epoch is evaluated on the same data. + """ if self.is_prepared: return - # 1. Generate corrupted data - self.config.log("Generate corrupted datasets...") - # Create the corrupted triples while creating the evaluation Job to make sure that every epoch is evaluated on the same data - self.valid_corrupted, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) - self.test_corrupted, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + self.config.log("Generate datasets with corrupted and true triples...") + # Generate corrupted data + if self.eval_data == "test": + self.triples_valid, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + else: + self.triples_valid, self.valid_labels, self.rel_valid_label = self._generate_negatives(self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.valid) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) self.is_prepared = True def run(self): - """Runs the triple classification job.""" + """Runs the triple classification job and returns the trace.""" + self.config.log("Starting triple classification...") self._prepare() - # Todo Question: What is the purpose of was_training? It was in entity ranking and it already was_training = self.model.training self.model.eval() epoch_time = -time.time() - # 2. Get scores for the corrupted valid and test data + # Get scores for the corrupted valid and test data self.config.log("Get scores for datasets...") - s_valid, p_valid, o_valid = self.valid_corrupted[:, 0], self.valid_corrupted[:, 1], self.valid_corrupted[:, 2] + s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} - s_test, p_test, o_test = self.test_corrupted[:, 0], self.test_corrupted[:, 1], self.test_corrupted[:, 2] + s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} - # 3. 
Find the best thresholds for every relation and their accuracies on the valid data + # Find the best thresholds for every relation and their accuracies on the valid data self.config.log("Learning thresholds on validation data.") - rel_thresholds, accuracies_valid = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.valid_corrupted) + rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) - # 4. Classification on test data. Output: predictions per relation and number of relations in test which are + # Classification on test data. Output: predictions per relation and number of relations in test which are # not included in valid self.config.log("Evaluating on test data.") self.config.log("Predict...") rel_predictions, not_in_eval = self.predict(rel_thresholds, test_scores, rel_test_scores, p_valid, p_test) - # 5. Report Metrics on test data + # Report Metrics on test data self.config.log("Classification results:") metrics = self._compute_metrics(self.rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) - # 6. Trace & Log + # Trace & Log epoch_time += time.time() # compute trace trace_entry = dict( type="triple_classification", scope="epoch", - data_learn_thresholds="Valid", - data_evaluate="Test", + data_thresholds="Valid", + size_threshold_data = len(self.triples_valid), + data_evaluate=self.eval_data, + size_data_evaluate=len(self.triples_test), epoch=self.epoch, size=2*len(self.dataset.valid), epoch_time=epoch_time, @@ -118,7 +133,22 @@ def run(self): return trace_entry def _generate_negatives(self, dataset): - # 1. Corrupt triples + """Generates dataset with negative triples. + + Takes each triple of the specified dataset and randomly replaces either the subject or the object with another + subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same + position in the dataset. + + Returns: + corrupted: A new dataset with the original and corrupted triples. + + labels: A vector with labels for the corresponding triples in the dataset. + + rel_labels: A dictionary mapping relations to labels. The values contain as many 1,0-pairs as we have triples + for the relation in the specified dataset Example for 2 original triples: {9: [1, 0, 1, 0]} + """ + + # Create objects for the corrupted dataset and the corresponding labels corrupted = dataset.repeat(1, 2).view(-1, 3) labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) @@ -139,7 +169,7 @@ def _generate_negatives(self, dataset): # to sample (e.g. from test, train and valid entities instead of only valid; # Add condition that corrupted triple!=original triple - # Save the labels per relation, since this will be needed frequently later + # Save the labels per relation, since this will be needed frequently later on p = corrupted[:, 1] rel_labels = {int(r): [labels[int((p == r).nonzero()[i])] for i in range(len((p == r).nonzero()))] for r in p.unique()} @@ -147,19 +177,25 @@ def _generate_negatives(self, dataset): return corrupted, labels, rel_labels def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): - #Todo-Question: Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the - # NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. 
Since the - # threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of - # valid data is any value between two specific score values. Thus I assume, that we can just use one of these - # score values as the threshold, since we can't know better anyway. Is this thought correct? - # The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The - # reason most likely is, that it is really a better threshold, since it is more based on the data than just the - # lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two - # methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and - # the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only - # important for us to make comparisons inside our framework possible, then I would prefer Method 1. - - """Method 1: Threshold is always one of the scores""" + """Find the best thresholds per relation by maximizing accuracy on validation data. + + Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the + NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. Since the + threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of + valid data is any value between two specific score values. Thus I assume, that we can just use one of these + score values as the threshold, since we can't know better anyway. Is this thought correct? + The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The + reason most likely is, that it is really a better threshold, since it is more based on the data than just the + lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two + methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and + the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only + important for us to make comparisons inside our framework possible, then I would prefer Method 1. 
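A minimal sketch of the Method 1 idea for a single relation, assuming scores and labels are 1-D tensors holding the scores and 0/1 labels of that relation's corrupted validation triples (the standalone helper and its names are illustrative, not part of this patch):

import torch

def best_threshold(scores: torch.Tensor, labels: torch.Tensor) -> float:
    # Entry [i, j] is True if triple j counts as positive under candidate threshold scores[i].
    predictions = scores.unsqueeze(0) >= scores.unsqueeze(1)
    # Accuracy of every candidate threshold against the true 0/1 labels.
    accuracies = (predictions == labels.bool().unsqueeze(0)).float().mean(dim=1)
    # Return the smallest score that reaches the best accuracy (same tie-breaking as in the patch).
    return scores[accuracies == accuracies.max()].min().item()

For example, best_threshold(torch.tensor([1.2, -0.5, 0.8, 0.1]), torch.tensor([1, 0, 1, 0])) returns 0.8; any threshold greater than 0.1 and at most 0.8 separates this relation's validation triples equally well, which is why simply reusing one of the observed scores loses nothing.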
+ + Returns: + rel_thresholds: Dictionary with thresholds per relation {relation: threshold} + """ + + #Method 1: Threshold is always one of the scores #Initialize accuracies, thresholds (and predictions) rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} @@ -210,7 +246,7 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # # score += interval - return rel_thresholds, rel_accuracies + return rel_thresholds def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): From ef7a7f8184c279f13f7d6222db8849876e52bf4f Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Wed, 23 Oct 2019 17:54:18 +0200 Subject: [PATCH 07/19] vectorized prediction function, gt rid of unnecessary part in _compute_metrics, updated comment documentation, easier way to retrieve labels per relations in generate function, minor simplifications and error fixings --- examples/toy-complex-train-tripleclass.yaml | 1 + kge/job/triple_classification.py | 118 +++++++++++--------- 2 files changed, 66 insertions(+), 53 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index 148fce5af..fc9e03854 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -12,4 +12,5 @@ lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification valid.metric: Accuracy eval.metrics_per.relation: False +valid.every: 1 diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index f604576f6..568e0d08b 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -5,16 +5,6 @@ from sklearn.metrics import accuracy_score, precision_score from kge.job import EvaluationJob -"""Daniel feedback: -- Gather other results as baseline -""" - - -""" -Since last commit: Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train), -""" - - class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. @@ -27,7 +17,6 @@ class TripleClassificationJob(EvaluationJob): per relation can be returned as well. """ - # Todo: Change comments to fit the standard guidelines # Todo: Check all datatypes and make them consistent where possible # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function @@ -37,7 +26,7 @@ def __init__(self, config, dataset, parent_job, model): self.is_prepared = False def _prepare(self): - """Prepare the corrupted validation and test datasets. + """Prepare the corrupted validation and test data. The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure that every epoch is evaluated on the same data. 
@@ -46,8 +35,8 @@ def _prepare(self): if self.is_prepared: return - self.config.log("Generate datasets with corrupted and true triples...") - # Generate corrupted data + self.config.log("Generate data with corrupted and true triples...") + if self.eval_data == "test": self.triples_valid, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) @@ -60,7 +49,7 @@ def _prepare(self): self.is_prepared = True def run(self): - """Runs the triple classification job and returns the trace.""" + """Runs the triple classification job.""" self.config.log("Starting triple classification...") self._prepare() @@ -71,7 +60,7 @@ def run(self): epoch_time = -time.time() # Get scores for the corrupted valid and test data - self.config.log("Get scores for datasets...") + self.config.log("Compute scores for datasets used...") s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} @@ -80,21 +69,17 @@ def run(self): test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} - # Find the best thresholds for every relation and their accuracies on the valid data + # Find the best thresholds for every relation on validation data self.config.log("Learning thresholds on validation data.") rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) - # Classification on test data. Output: predictions per relation and number of relations in test which are - # not included in valid - self.config.log("Evaluating on test data.") - self.config.log("Predict...") - rel_predictions, not_in_eval = self.predict(rel_thresholds, test_scores, rel_test_scores, p_valid, p_test) + # Make prediction on the specified evaluation data + self.config.log("Evaluating on {} data.".format(self.eval_data)) + rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) - # Report Metrics on test data + # ComputeReport Metrics self.config.log("Classification results:") - metrics = self._compute_metrics(self.rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval) - - # Trace & Log + metrics = self._compute_metrics(self.test_labels, self.rel_test_labels, rel_predictions, p_test, not_in_eval) epoch_time += time.time() # compute trace @@ -137,7 +122,8 @@ def _generate_negatives(self, dataset): Takes each triple of the specified dataset and randomly replaces either the subject or the object with another subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same - position in the dataset. + position in the dataset. The term corrupted dataset is used throughout the document for a dataset containing + both corrupted and original triples. Returns: corrupted: A new dataset with the original and corrupted triples. 
@@ -171,8 +157,8 @@ def _generate_negatives(self, dataset): # Save the labels per relation, since this will be needed frequently later on p = corrupted[:, 1] - rel_labels = {int(r): [labels[int((p == r).nonzero()[i])] - for i in range(len((p == r).nonzero()))] for r in p.unique()} + rel_labels = {int(r): labels[p == r] for r in p.unique()} + return corrupted, labels, rel_labels @@ -191,8 +177,15 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only important for us to make comparisons inside our framework possible, then I would prefer Method 1. + Args: + p: 1-D tensor containing the relations of the corrupted validation dataset. + valid_scores: 1D tensor containing the scores of all corrupted validation triples. + rel_scores: Dictionary containing the scores of the triples in a relation. + valid_labels: 1D tensor containing the labels of all corrupted validation triples. + valid_data: Dataset used. Should be corrupted validation dataset. + Returns: - rel_thresholds: Dictionary with thresholds per relation {relation: threshold} + rel_thresholds: Dictionary with thresholds per relation {relation: threshold}. """ #Method 1: Threshold is always one of the scores @@ -200,7 +193,7 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} - # Change the scores to be entries instead of separated lists the tensor + # Change the scores from a 2D to a 1D tensor valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) @@ -208,15 +201,18 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): #Predict current_rel = (valid_data[:, 1] == r) true_labels = valid_labels[current_rel.nonzero()].type(torch.int) + # true_labels = valid_labels[current_rel] preds = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] - + # accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] rel_accuracies[int(r)] = max(accuracy) - # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent with original implementation - rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))]) + # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay as consistent + # as possible with original implementation + rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))])[0,0] # #Method 2: Search for best threshold in an interval -# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py +# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py + # or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py # # Initialize accuracies, thresholds (and predictions) # min_score = valid_scores.min() # max_score = valid_scores.max() @@ -226,7 +222,7 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # # score 
= min_score # -# # ORiginal implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should +# # Original implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should # # depend on the range of the score values of the model # # Suggestion: float((max_score-min_score)/len(valid_scores)) # interval = 0.01#float((max_score-min_score)/len(valid_scores)) @@ -248,38 +244,54 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): return rel_thresholds - def predict(self, rel_thresholds, test_scores, rel_scores, p_valid, p_test): + def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + """Makes predictions on evaluation/test data. + + Parameters: + rel_thresholds: dictionary with relation thresholds, e.g. {1: 1.5}. + rel_scores: dictionary with scores of triples in each relation: + E.g. relation with four triples in it:, e.g. {1: [-2, 1, 2, 4]}. + + Returns: + rel_predictions: dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. + not_in_eval: list with relations that are in the test data, but not in the validation data. + """ rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} - # Set counter for triples for which the relation is not in valid data + # Set variable for relations which are not in valid data, but in test data not_in_eval = [] for r in p_test.unique(): - # Check if relation which is in valid data also is in test data - if r in p_valid.unique(): + if r in p_valid.unique(): # Check if relation which is in valid data also is in test data # Predict - for i in range(len(rel_scores[int(r)])): - if float(rel_scores[int(r)][i]) >= rel_thresholds[int(r)]: - rel_predictions[int(r)][i] = 1 + rel_predictions[int(r)] = rel_scores[int(r)][:, 0, 0] >= rel_thresholds[int(r)] else: not_in_eval.append(r) return rel_predictions, not_in_eval - def _compute_metrics(self, rel_test_labels, rel_predictions, p_valid, p_test, not_in_eval): - metrics = {} + def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval): + """Computes accuracy and precision metrics of predictions. - labels_in_test_list = [i - for r in p_test.unique() - for i in rel_test_labels[int(r)]] + Returns: + metrics: dictionary with the specified metrics accuracy and precision as keys. If spedified, metric per + relation are safed as dictionaries in the dictionary. 
+ E.g.: {accuracy: 0.9 + accuracy_per_relation: + {relation 1: 0.8} + {relation 2: 0.9} + } + """ + metrics = {} - pred_list = [i + # Create a list for all predicted labels, matching the shape of test_labels + pred_list = torch.tensor([i for r in p_test.unique() - for i in rel_predictions[int(r)]] + for i in rel_predictions[int(r)]], dtype=torch.int64) - metrics["Accuracy"] = float(accuracy_score(labels_in_test_list, pred_list)) - metrics["Precision"] = float(precision_score(labels_in_test_list, pred_list)) + metrics["Accuracy"] = float(accuracy_score(test_labels, pred_list)) + metrics["Precision"] = float(precision_score(test_labels, pred_list)) - if self.config.get("eval.metrics_per.relation_type"): + if self.config.get("eval.metrics_per.relation"): precision_per_r = {} accuracy_per_r = {} for r in p_test.unique(): From 1d929744fc9cc33d7217b38bf672201e135e4384 Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Fri, 25 Oct 2019 16:19:16 +0200 Subject: [PATCH 08/19] Update --- examples/fb15k-transe-grid-tripleclass.yaml | 83 ++++++++++++++++++ examples/toy-rescal-train-tripleclass.yaml | 33 +++++++ examples/toy-transe-ax-tripleclass-real.yaml | 91 ++++++++++++++++++++ examples/toy-transe-ax-tripleclass.yaml | 91 ++++++++++++++++++++ examples/toy-transe-train-tripleclass.yaml | 4 +- kge/job/triple_classification.py | 11 +-- 6 files changed, 306 insertions(+), 7 deletions(-) create mode 100644 examples/fb15k-transe-grid-tripleclass.yaml create mode 100644 examples/toy-rescal-train-tripleclass.yaml create mode 100644 examples/toy-transe-ax-tripleclass-real.yaml create mode 100644 examples/toy-transe-ax-tripleclass.yaml diff --git a/examples/fb15k-transe-grid-tripleclass.yaml b/examples/fb15k-transe-grid-tripleclass.yaml new file mode 100644 index 000000000..98d3d0284 --- /dev/null +++ b/examples/fb15k-transe-grid-tripleclass.yaml @@ -0,0 +1,83 @@ +job: + device: cuda + type: search + +model: transe + +dataset: + name: fb15k + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 4.0 + max_epochs: 80 + optimizer: Adagrad + optimizer_args: + lr: 0.1 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + l_norm: 1. 
+ +search.type: grid +grid_search.parameters: + + train.optimizer_args.lr: [ 0.001, 0,005, 0.01, 0.1 ] + + transe.entity_embedder.dim: [20, 50, 100] + + train.batch_size: [30, 120, 480, 1920] + + train.loss_arg: [1, 2, 4] + + lookup_embedder.regularize_args.weight: [0.0, 0.001] + + + lookup_embedder.regularize_args.weight: +search.num_workers: 4 +train.num_workers: 4 +eval.num_workers: 4 + + + + diff --git a/examples/toy-rescal-train-tripleclass.yaml b/examples/toy-rescal-train-tripleclass.yaml new file mode 100644 index 000000000..293c62bbd --- /dev/null +++ b/examples/toy-rescal-train-tripleclass.yaml @@ -0,0 +1,33 @@ +job.type: train +dataset.name: toy +model: sd_rescal + +sd_rescal: + class_name: SparseDiagonalRescal + blocks: -1 + block_size: 1 + entity_embedder: + type: lookup_embedder + dim: 128 # determine automatically + dropout: 0.2 + relation_embedder: + type: lookup_embedder + dim: -1 # determine automatically + dropout: 0.2 + +valid: + early_stopping: + patience: 5 + every: 1 + filter_with_test: True + metric: Accuracy +train: + optimizer: Adagrad + optimizer_args: + lr: 0.1 + batch_size: 128 + max_epochs: 200 + +eval.type: triple_classification +eval.metrics_per.relation: False + diff --git a/examples/toy-transe-ax-tripleclass-real.yaml b/examples/toy-transe-ax-tripleclass-real.yaml new file mode 100644 index 000000000..25f25656c --- /dev/null +++ b/examples/toy-transe-ax-tripleclass-real.yaml @@ -0,0 +1,91 @@ +job: + device: cuda + type: search + +model: transe + +dataset: + name: toy + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 4.0 + max_epochs: 10 + optimizer: Adagrad + optimizer_args: + lr: 0.1 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + l_norm: 1. + +ax_search: + num_trials: 10 + num_sobol_trials: 20 + parameters: + - name: train.optimizer + type: fixed + value: Adagrad + - name: train.optimizer_args.lr + type: range + bounds: [0.001, 1.0] + - name: train.loss_arg + type: range + bounds: [0.0001, 10.0] + - name: transe.entity_embedder.normalize.p + type: choice + values: [-1., 2.] + is_numerical: False + is_ordered: False + - name: transe.relation_embedder.normalize.p + type: choice + values: [-1., 2.] 
+ is_numerical: False + is_ordered: False + - name: lookup_embedder.regularize_args.weight + type: range + bounds: [0.0, 0.001] + +search.num_workers: 4 +train.num_workers: 4 +eval.num_workers: 4 diff --git a/examples/toy-transe-ax-tripleclass.yaml b/examples/toy-transe-ax-tripleclass.yaml new file mode 100644 index 000000000..00485d590 --- /dev/null +++ b/examples/toy-transe-ax-tripleclass.yaml @@ -0,0 +1,91 @@ +job: + device: cuda + type: search + +model: transe + +dataset: + name: fb15k + +train: + batch_size: 256 + loss: margin_ranking + loss_arg: 4.0 + max_epochs: 80 + optimizer: Adagrad + optimizer_args: + lr: 0.1 + type: negative_sampling + +negative_sampling: + num_negatives_o: 3 + num_negatives_p: 0 + num_negatives_s: 3 + sampling_type: uniform + +valid: + early_stopping.patience: 5 + every: 5 + filter_with_test: True + metric: Accuracy + +eval: + batch_size: 512 + type: triple_classification + +transe: + class_name: TransE + entity_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + relation_embedder: + dim: 100 + initialize: uniform_ + initialize_args: + uniform_ : + a: -1.0 + sparse: false + type: lookup_embedder + regularize: l2 + regularize_args: + weight: 1.e-05 + l_norm: 1. + +ax_search: + num_trials: 30 + num_sobol_trials: 20 + parameters: + - name: train.optimizer + type: fixed + value: Adagrad + - name: train.optimizer_args.lr + type: range + bounds: [0.001, 1.0] + - name: train.loss_arg + type: range + bounds: [0.0001, 10.0] + - name: transe.entity_embedder.normalize.p + type: choice + values: [-1., 2.] + is_numerical: False + is_ordered: False + - name: transe.relation_embedder.normalize.p + type: choice + values: [-1., 2.] + is_numerical: False + is_ordered: False + - name: lookup_embedder.regularize_args.weight + type: range + bounds: [0.0, 0.001] + +search.num_workers: 4 +train.num_workers: 4 +eval.num_workers: 4 diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index f0b89704f..d63de83d2 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -5,7 +5,7 @@ job: model: transe dataset: - name: toy + name: fb15k train: batch_size: 256 @@ -24,7 +24,7 @@ negative_sampling: valid: early_stopping.patience: 5 - every: 5 + every: 1 metric: Accuracy eval: diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 568e0d08b..7d384e90b 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -17,13 +17,13 @@ class TripleClassificationJob(EvaluationJob): per relation can be returned as well. """ - # Todo: Check all datatypes and make them consistent where possible + # Todo: Check all tensor datatypes and make them consistent where possible # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) - self.is_prepared = False + self.valid_data_is_prepared = False def _prepare(self): """Prepare the corrupted validation and test data. @@ -32,7 +32,7 @@ def _prepare(self): that every epoch is evaluated on the same data. 
""" - if self.is_prepared: + if self.valid_data_is_prepared: return self.config.log("Generate data with corrupted and true triples...") @@ -46,7 +46,7 @@ def _prepare(self): # let the model add some hooks, if it wants to do so self.model.prepare_job(self) - self.is_prepared = True + self.valid_data_is_prepared = True def run(self): """Runs the triple classification job.""" @@ -59,12 +59,13 @@ def run(self): epoch_time = -time.time() - # Get scores for the corrupted valid and test data + # Get scores for the corrupted valid data self.config.log("Compute scores for datasets used...") s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} + # Get scores for the corrupted test data s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} From fc660a1a7a143cbae72ff1ba81d388261bad784d Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Mon, 28 Oct 2019 11:27:42 +0100 Subject: [PATCH 09/19] Moved sampling function to sampler.py, updated code documentation --- examples/fb15k-transe-grid-tripleclass.yaml | 83 --------- examples/toy-complex-train-tripleclass.yaml | 2 +- examples/toy-transe-ax-tripleclass-real.yaml | 91 ---------- examples/toy-transe-train-tripleclass.yaml | 4 +- kge/config-default.yaml | 4 + kge/job/triple_classification.py | 172 +++++++------------ kge/util/sampler.py | 53 ++++++ 7 files changed, 121 insertions(+), 288 deletions(-) delete mode 100644 examples/fb15k-transe-grid-tripleclass.yaml delete mode 100644 examples/toy-transe-ax-tripleclass-real.yaml diff --git a/examples/fb15k-transe-grid-tripleclass.yaml b/examples/fb15k-transe-grid-tripleclass.yaml deleted file mode 100644 index 98d3d0284..000000000 --- a/examples/fb15k-transe-grid-tripleclass.yaml +++ /dev/null @@ -1,83 +0,0 @@ -job: - device: cuda - type: search - -model: transe - -dataset: - name: fb15k - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 4.0 - max_epochs: 80 - optimizer: Adagrad - optimizer_args: - lr: 0.1 - type: negative_sampling - -negative_sampling: - num_negatives_o: 3 - num_negatives_p: 0 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 5 - filter_with_test: True - metric: Accuracy - -eval: - batch_size: 512 - type: triple_classification - -transe: - class_name: TransE - entity_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - relation_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - l_norm: 1. 
- -search.type: grid -grid_search.parameters: - - train.optimizer_args.lr: [ 0.001, 0,005, 0.01, 0.1 ] - - transe.entity_embedder.dim: [20, 50, 100] - - train.batch_size: [30, 120, 480, 1920] - - train.loss_arg: [1, 2, 4] - - lookup_embedder.regularize_args.weight: [0.0, 0.001] - - - lookup_embedder.regularize_args.weight: -search.num_workers: 4 -train.num_workers: 4 -eval.num_workers: 4 - - - - diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index fc9e03854..5512196a3 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -10,7 +10,7 @@ lookup_embedder.dim: 100 #lookup_embedder.initialize: normal_ lookup_embedder.initialize: xavier_uniform_ eval.type: triple_classification -valid.metric: Accuracy +valid.metric: accuracy eval.metrics_per.relation: False valid.every: 1 diff --git a/examples/toy-transe-ax-tripleclass-real.yaml b/examples/toy-transe-ax-tripleclass-real.yaml deleted file mode 100644 index 25f25656c..000000000 --- a/examples/toy-transe-ax-tripleclass-real.yaml +++ /dev/null @@ -1,91 +0,0 @@ -job: - device: cuda - type: search - -model: transe - -dataset: - name: toy - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 4.0 - max_epochs: 10 - optimizer: Adagrad - optimizer_args: - lr: 0.1 - type: negative_sampling - -negative_sampling: - num_negatives_o: 3 - num_negatives_p: 0 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 5 - filter_with_test: True - metric: Accuracy - -eval: - batch_size: 512 - type: triple_classification - -transe: - class_name: TransE - entity_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - relation_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - l_norm: 1. - -ax_search: - num_trials: 10 - num_sobol_trials: 20 - parameters: - - name: train.optimizer - type: fixed - value: Adagrad - - name: train.optimizer_args.lr - type: range - bounds: [0.001, 1.0] - - name: train.loss_arg - type: range - bounds: [0.0001, 10.0] - - name: transe.entity_embedder.normalize.p - type: choice - values: [-1., 2.] - is_numerical: False - is_ordered: False - - name: transe.relation_embedder.normalize.p - type: choice - values: [-1., 2.] 
- is_numerical: False - is_ordered: False - - name: lookup_embedder.regularize_args.weight - type: range - bounds: [0.0, 0.001] - -search.num_workers: 4 -train.num_workers: 4 -eval.num_workers: 4 diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml index d63de83d2..e120d9762 100644 --- a/examples/toy-transe-train-tripleclass.yaml +++ b/examples/toy-transe-train-tripleclass.yaml @@ -5,7 +5,7 @@ job: model: transe dataset: - name: fb15k + name: toy train: batch_size: 256 @@ -25,7 +25,7 @@ negative_sampling: valid: early_stopping.patience: 5 every: 1 - metric: Accuracy + metric: accuracy eval: type: triple_classification diff --git a/kge/config-default.yaml b/kge/config-default.yaml index d83fb1518..c1a57ae76 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -393,11 +393,15 @@ valid: # Name of the trace entry that holds the validation metric (higher value is # better) +<<<<<<< HEAD <<<<<<< HEAD metric: mean_reciprocal_rank_filtered_with_test ======= metric: mean_reciprocal_rank_filtered # Accuracy for triple_classification >>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) +======= + metric: mean_reciprocal_rank_filtered # accuracy for triple_classification +>>>>>>> Moved sampling function to sampler.py, updated code documentation # If the above metric is not present in trace (e.g., because a custom metric # should be used), a Python expression to compute the metric. Can refer to diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 7d384e90b..95baf2457 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,9 +1,9 @@ import time -import random import torch from sklearn.metrics import accuracy_score, precision_score from kge.job import EvaluationJob +from kge.util.sampler import TripleClassificationSampler class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. @@ -13,14 +13,10 @@ class TripleClassificationJob(EvaluationJob): triple, produced by the model to evaluate, is retrieved. Afterwards a threshold is determined for each relation. The best threshold for every relation is determined by maximizing the accuracy on validation data. The unseen triples from the train data will then be predicted as True if the score is higher than the threshold of the - respective relation. The metrics include accuracy and precision on test data. IF necessary the accuracy/precision + respective relation. The metrics include accuracy and precision on test data. If necessary the accuracy/precision per relation can be returned as well. """ - # Todo: Check all tensor datatypes and make them consistent where possible - # Todo: Stick to torch functions: Calculate accuracy and precision instead of using sklearn function - - def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.valid_data_is_prepared = False @@ -29,7 +25,9 @@ def _prepare(self): """Prepare the corrupted validation and test data. The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure - that every epoch is evaluated on the same data. + that every epoch is evaluated on the same data. 
For model selection, the thresholds are found for validation + data and the accuracy on validation data is used. For testing the thresholds are found for validation data and + evaluated on test data. """ if self.valid_data_is_prepared: @@ -38,11 +36,11 @@ def _prepare(self): self.config.log("Generate data with corrupted and true triples...") if self.eval_data == "test": - self.triples_valid, self.valid_labels, self.rel_valid_labels = self._generate_negatives(self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.test) + self.triples_valid, self.valid_labels, self.rel_valid_labels = TripleClassificationSampler.sample(self, self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.test) else: - self.triples_valid, self.valid_labels, self.rel_valid_label = self._generate_negatives(self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = self._generate_negatives(self.dataset.valid) + self.triples_valid, self.valid_labels, self.rel_valid_label = TripleClassificationSampler.sample(self, self.dataset.valid) + self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.valid) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -59,13 +57,13 @@ def run(self): epoch_time = -time.time() - # Get scores for the corrupted valid data - self.config.log("Compute scores for datasets used...") + # Get scores and scores per relation for the corrupted valid data + self.config.log("Compute scores for validation and test datasets...") s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} - # Get scores for the corrupted test data + # Get scores and scores per relation for the corrupted test data s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] test_scores = self.model.score_spo(s_test, p_test, o_test) rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} @@ -74,11 +72,11 @@ def run(self): self.config.log("Learning thresholds on validation data.") rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) - # Make prediction on the specified evaluation data + # Make prediction for the specified evaluation data self.config.log("Evaluating on {} data.".format(self.eval_data)) rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) - # ComputeReport Metrics + # Compute Metrics self.config.log("Classification results:") metrics = self._compute_metrics(self.test_labels, self.rel_test_labels, rel_predictions, p_test, not_in_eval) @@ -118,103 +116,53 @@ def run(self): return trace_entry - def _generate_negatives(self, dataset): - """Generates dataset with negative triples. - - Takes each triple of the specified dataset and randomly replaces either the subject or the object with another - subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same - position in the dataset. The term corrupted dataset is used throughout the document for a dataset containing - both corrupted and original triples. 
- - Returns: - corrupted: A new dataset with the original and corrupted triples. - - labels: A vector with labels for the corresponding triples in the dataset. - - rel_labels: A dictionary mapping relations to labels. The values contain as many 1,0-pairs as we have triples - for the relation in the specified dataset Example for 2 original triples: {9: [1, 0, 1, 0]} - """ - - # Create objects for the corrupted dataset and the corresponding labels - corrupted = dataset.repeat(1, 2).view(-1, 3) - labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) - - # Random decision if sample subject(sample=nonzero) or object(sample=zero) - sample = torch.randint(0,2,(1,len(dataset))).to(self.device) - - # Sample subjects from subjects which appeared in the dataset - corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) - - # Sample objects from objects which appeared in the dataset - corrupted[1::2][:, 2][(sample==0).nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) - - # TODO: Create a function in util.sampler for that task. Optionally include: Allow to choose from which entities - # to sample (e.g. from test, train and valid entities instead of only valid; - # Add condition that corrupted triple!=original triple - - # Save the labels per relation, since this will be needed frequently later on - p = corrupted[:, 1] - rel_labels = {int(r): labels[p == r] for r in p.unique()} - - - return corrupted, labels, rel_labels - def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): """Find the best thresholds per relation by maximizing accuracy on validation data. - Method 1 is what seems the most reasonable for me, Method 2 is the reimplementation of the - NTN Paper of Socher et al. 2013. Method 1 is much faster and delivers equally good results. Since the - threshold entirely is determined by the valid_scores and is a cut between them, the best threshold in terms of - valid data is any value between two specific score values. Thus I assume, that we can just use one of these - score values as the threshold, since we can't know better anyway. Is this thought correct? - The two methods are not equivalent. Method 1 leads to slightly (~0.01) better result in terms of accuracy. The - reason most likely is, that it is really a better threshold, since it is more based on the data than just the - lowest arbitrary threshold that produces the best accuracy. If we would have infinite valid triples, the two - methods would be equivalent. Nevertheless, since other Triple Classification papers probably used Method 2 and - the goal is evaluation, we maybe should stick to Method 2 to make comparisons to others possible. If it is only - important for us to make comparisons inside our framework possible, then I would prefer Method 1. + The thresholds are found for every relation by maximizing the accuracy on the validation data. For a given + relation, if the scores of all triple in the relation are sorted, the perfect threshold is always a cut between + two of the scores. This means, that multiple possible values can be defined as thresholds and give the highest + accuracy. To evaluate only as many possible thresholds as really necessary, the scores themselves are considered + as possible thresholds. This allows for a fast implementation. 
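The thresholds found this way are then applied to the held-out scores relation by relation. The following is only a rough sketch of that downstream step, with the per-relation dictionaries assumed to hold plain 1-D tensors rather than the exact shapes used in this patch:

import torch

def classify(rel_thresholds: dict, rel_scores: dict, rel_labels: dict):
    # Predict "true" whenever a triple's score reaches its relation's threshold;
    # relations without a learned threshold are skipped (the job counts them separately as not_in_eval).
    preds, labels = [], []
    for r, scores in rel_scores.items():
        if r not in rel_thresholds:
            continue
        preds.append(scores >= rel_thresholds[r])
        labels.append(rel_labels[r].bool())
    preds, labels = torch.cat(preds), torch.cat(labels)
    accuracy = (preds == labels).float().mean().item()
    precision = (preds & labels).sum().item() / max(int(preds.sum()), 1)
    return accuracy, precision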
Args: p: 1-D tensor containing the relations of the corrupted validation dataset. - valid_scores: 1D tensor containing the scores of all corrupted validation triples. + valid_scores: 2-D tensor containing the scores of all corrupted validation triples. rel_scores: Dictionary containing the scores of the triples in a relation. - valid_labels: 1D tensor containing the labels of all corrupted validation triples. - valid_data: Dataset used. Should be corrupted validation dataset. + valid_labels: 1-D tensor containing the labels of all corrupted validation triples. + valid_data: Dataset used. Should be the corrupted validation dataset. Returns: - rel_thresholds: Dictionary with thresholds per relation {relation: threshold}. + rel_thresholds: Dictionary with thresholds per relation {relation: thresholds}. + E.g.: {1: tensor(-2.0843, grad_fn=)} """ - #Method 1: Threshold is always one of the scores - #Initialize accuracies, thresholds (and predictions) + # Initialize accuracies and thresholds rel_accuracies = {int(r): -1 for r in p.unique()} rel_thresholds = {int(r): 0 for r in p.unique()} - # Change the scores from a 2D to a 1D tensor + # Change the valid scores from a 2D to a 1D tensor valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) - for r in p.unique(): - #Predict - current_rel = (valid_data[:, 1] == r) + current_rel = (valid_data[:, 1] == r) # 0-1 vector for indexing triples of the current relation true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - # true_labels = valid_labels[current_rel] - preds = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) - accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] - # accuracy = [int(((true_labels==preds[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] + + # valid_scores[current_rel.nonzero()] and rel_scores[int(r)] both contain the scores of the current + # relation. In the comparison, every score is evaluated as possible threshold against all scores. + predictions = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) + + accuracy = [int(((true_labels==predictions[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] rel_accuracies[int(r)] = max(accuracy) - # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay as consistent - # as possible with original implementation + + # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent. rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))])[0,0] -# #Method 2: Search for best threshold in an interval -# #https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py - # or https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py -# # Initialize accuracies, thresholds (and predictions) +# # Alternative implementation: Search for best threshold in an interval +# # Following https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or +# # https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py (reimplemented Socher et al. 
2013) +# +# # Initialize accuracies, thresholds and interval # min_score = valid_scores.min() # max_score = valid_scores.max() # @@ -223,10 +171,10 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # # score = min_score # -# # Original implementation uses an interval 0.01, implemented for NTN model. In general the interval imo should -# # depend on the range of the score values of the model -# # Suggestion: float((max_score-min_score)/len(valid_scores)) -# interval = 0.01#float((max_score-min_score)/len(valid_scores)) +# # Original implementation uses an interval of 0.01, implemented for NTN model. In general the interval should +# # depend on the range of the score values of the model and be at least as large as teh smallest distance between +# # two of the sorted scores +# interval = 0.01 # valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) # # while(score<=max_score): @@ -234,8 +182,8 @@ def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): # #Predict # current_rel = (valid_data[:, 1] == r) # true_labels = valid_labels[current_rel.nonzero()].type(torch.int) -# preds = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) -# accuracy = int(((true_labels==preds).sum(dim=0)))/len(true_labels) +# predictions = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) +# accuracy = int(((true_labels==predictions).sum(dim=0)))/len(true_labels) # # if accuracy > rel_accuracies[int(r)]: # rel_accuracies[int(r)] = accuracy @@ -249,13 +197,13 @@ def predict(self, rel_thresholds, rel_scores, p_valid, p_test): """Makes predictions on evaluation/test data. Parameters: - rel_thresholds: dictionary with relation thresholds, e.g. {1: 1.5}. - rel_scores: dictionary with scores of triples in each relation: + rel_thresholds: Dictionary with relation thresholds. + rel_scores: Dictionary with scores of triples in each relation: E.g. relation with four triples in it:, e.g. {1: [-2, 1, 2, 4]}. Returns: - rel_predictions: dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. - not_in_eval: list with relations that are in the test data, but not in the validation data. + rel_predictions: Dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. + not_in_eval: List with relations that are in the test data, but not in the validation data. """ rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} @@ -274,8 +222,8 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test """Computes accuracy and precision metrics of predictions. Returns: - metrics: dictionary with the specified metrics accuracy and precision as keys. If spedified, metric per - relation are safed as dictionaries in the dictionary. + metrics: dictionary with the specified metrics accuracy and precision as keys. If specified, metrics per + relation are stored as dictionaries in the dictionary. 
E.g.: {accuracy: 0.9 accuracy_per_relation: {relation 1: 0.8} @@ -289,21 +237,23 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test for r in p_test.unique() for i in rel_predictions[int(r)]], dtype=torch.int64) - metrics["Accuracy"] = float(accuracy_score(test_labels, pred_list)) - metrics["Precision"] = float(precision_score(test_labels, pred_list)) + metrics["accuracy"] = float(accuracy_score(test_labels, pred_list)) + metrics["precision"] = float(precision_score(test_labels, pred_list)) if self.config.get("eval.metrics_per.relation"): precision_per_r = {} accuracy_per_r = {} for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - accuracy_per_r[str(self.dataset.relations[int(r)])] = float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + precision_per_r[str(self.dataset.relations[int(r)])] = \ + float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + accuracy_per_r[str(self.dataset.relations[int(r)])] = \ + float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - metrics["Accuracy_per_Relation"] = accuracy_per_r + metrics["accuracy_per_relation"] = accuracy_per_r - metrics["Precision_Per_Relation"] = precision_per_r + metrics["precision_per_relation"] = precision_per_r - metrics["Untested relations due to missing in evaluation data"] = len(not_in_eval) + metrics["untested relations due to missing in evaluation data"] = len(not_in_eval) return metrics \ No newline at end of file diff --git a/kge/util/sampler.py b/kge/util/sampler.py index 5f1ab31b8..6cb5b3c6d 100644 --- a/kge/util/sampler.py +++ b/kge/util/sampler.py @@ -3,9 +3,13 @@ import random import torch +<<<<<<< HEAD from typing import Optional import numpy as np import numba +======= +import random +>>>>>>> Moved sampling function to sampler.py, updated code documentation SLOTS = [0, 1, 2] SLOT_STR = ["s", "p", "o"] @@ -353,3 +357,52 @@ def _sample(self, positive_triples: torch.Tensor, slot: int, num_samples: int): positive_triples.size(0) * num_samples, ).view(positive_triples.size(0), num_samples) return result + + +class TripleClassificationSampler(KgeNegativeSampler): + def __init__(self, config, configuration_key, dataset): + super().__init__(config, configuration_key, dataset) + + def sample(self, dataset): + """Generates dataset with positive and negative triples. + + Takes each triple of the specified dataset and randomly replaces either the subject or the object with another + subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same + position in the dataset. + + Returns: + corrupted: A new dataset with the original and corrupted triples. + + labels: A vector with labels for the corresponding triples in the dataset. + + rel_labels: A dictionary mapping relations to labels. + Example if we had two triples of relation 1 in the original dataset: {1: [1, 0, 1, 0]} + """ + + # Create objects for the corrupted dataset and the corresponding labels + corrupted = dataset.repeat(1, 2).view(-1, 3) + labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) + + # The sampling influences the results in the end. 
To compare models or parameters, the seeds should be fixed + if self.config.get("eval.triple_classification_random_seed"): + torch.manual_seed(5465456876546785) + random.seed(5465456876546785) + + # Random decision if sample subject(sample=nonzero) or object(sample=zero) + sample = torch.randint(0, 2, (1, len(dataset))).to(self.device) + + # Sample subjects from subjects which appeared in the dataset + corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) + + # Sample objects from objects which appeared in the dataset + corrupted[1::2][:, 2][(sample == 0).nonzero()[:, 1]] = \ + torch.as_tensor(random.choice( + list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) + + # Save the labels per relation, since this will be needed frequently later on + p = corrupted[:, 1] + rel_labels = {int(r): labels[p == r] for r in p.unique()} + + return corrupted, labels, rel_labels From 51c8ab2b7c6e7e7a66b094a7c007da5a06a60a4e Mon Sep 17 00:00:00 2001 From: Andrej Tschalzev Date: Mon, 28 Oct 2019 11:40:29 +0100 Subject: [PATCH 10/19] final updates --- examples/toy-complex-train-tripleclass.yaml | 8 +- examples/toy-rescal-train-tripleclass.yaml | 33 -------- examples/toy-transe-ax-tripleclass.yaml | 91 --------------------- examples/toy-transe-train-tripleclass.yaml | 60 -------------- 4 files changed, 5 insertions(+), 187 deletions(-) delete mode 100644 examples/toy-rescal-train-tripleclass.yaml delete mode 100644 examples/toy-transe-ax-tripleclass.yaml delete mode 100644 examples/toy-transe-train-tripleclass.yaml diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index 5512196a3..582f8b72f 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -9,8 +9,10 @@ train: lookup_embedder.dim: 100 #lookup_embedder.initialize: normal_ lookup_embedder.initialize: xavier_uniform_ -eval.type: triple_classification +eval: + type: triple_classification + metrics_per.relation: False + triple_classification_random_seed: False valid.metric: accuracy -eval.metrics_per.relation: False -valid.every: 1 + diff --git a/examples/toy-rescal-train-tripleclass.yaml b/examples/toy-rescal-train-tripleclass.yaml deleted file mode 100644 index 293c62bbd..000000000 --- a/examples/toy-rescal-train-tripleclass.yaml +++ /dev/null @@ -1,33 +0,0 @@ -job.type: train -dataset.name: toy -model: sd_rescal - -sd_rescal: - class_name: SparseDiagonalRescal - blocks: -1 - block_size: 1 - entity_embedder: - type: lookup_embedder - dim: 128 # determine automatically - dropout: 0.2 - relation_embedder: - type: lookup_embedder - dim: -1 # determine automatically - dropout: 0.2 - -valid: - early_stopping: - patience: 5 - every: 1 - filter_with_test: True - metric: Accuracy -train: - optimizer: Adagrad - optimizer_args: - lr: 0.1 - batch_size: 128 - max_epochs: 200 - -eval.type: triple_classification -eval.metrics_per.relation: False - diff --git a/examples/toy-transe-ax-tripleclass.yaml b/examples/toy-transe-ax-tripleclass.yaml deleted file mode 100644 index 00485d590..000000000 --- a/examples/toy-transe-ax-tripleclass.yaml +++ /dev/null @@ -1,91 +0,0 @@ -job: - device: cuda - type: search - -model: transe - -dataset: - name: fb15k - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 4.0 - max_epochs: 80 - optimizer: Adagrad - optimizer_args: - lr: 0.1 - type: negative_sampling - 
-negative_sampling: - num_negatives_o: 3 - num_negatives_p: 0 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 5 - filter_with_test: True - metric: Accuracy - -eval: - batch_size: 512 - type: triple_classification - -transe: - class_name: TransE - entity_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - relation_embedder: - dim: 100 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - l_norm: 1. - -ax_search: - num_trials: 30 - num_sobol_trials: 20 - parameters: - - name: train.optimizer - type: fixed - value: Adagrad - - name: train.optimizer_args.lr - type: range - bounds: [0.001, 1.0] - - name: train.loss_arg - type: range - bounds: [0.0001, 10.0] - - name: transe.entity_embedder.normalize.p - type: choice - values: [-1., 2.] - is_numerical: False - is_ordered: False - - name: transe.relation_embedder.normalize.p - type: choice - values: [-1., 2.] - is_numerical: False - is_ordered: False - - name: lookup_embedder.regularize_args.weight - type: range - bounds: [0.0, 0.001] - -search.num_workers: 4 -train.num_workers: 4 -eval.num_workers: 4 diff --git a/examples/toy-transe-train-tripleclass.yaml b/examples/toy-transe-train-tripleclass.yaml deleted file mode 100644 index e120d9762..000000000 --- a/examples/toy-transe-train-tripleclass.yaml +++ /dev/null @@ -1,60 +0,0 @@ -job: - device: cuda - type: train - -model: transe - -dataset: - name: toy - -train: - batch_size: 256 - loss: margin_ranking - loss_arg: 0.2 - max_epochs: 200 - optimizer: Adagrad - optimizer_args: - lr: 0.01 - type: negative_sampling - -negative_sampling: - num_negatives_o: 3 - num_negatives_s: 3 - sampling_type: uniform - -valid: - early_stopping.patience: 5 - every: 1 - metric: accuracy - -eval: - type: triple_classification - metrics_per.relation: False - -transe: - class_name: TransE - entity_embedder: - dim: 128 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - weighted: False - relation_embedder: - dim: 128 - initialize: uniform_ - initialize_args: - uniform_ : - a: -1.0 - sparse: false - type: lookup_embedder - regularize: l2 - regularize_args: - weight: 1.e-05 - weighted: False - l_norm: 1. From 1e609a781f0e8a1f7a967ea2a226b7d736498048 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Sun, 24 May 2020 01:32:02 +0200 Subject: [PATCH 11/19] Imporve and update code --- kge/job/triple_classification.py | 272 ++++++++++++++++++------------- kge/util/sampler.py | 53 ------ 2 files changed, 157 insertions(+), 168 deletions(-) diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 95baf2457..8611b8586 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -5,29 +5,35 @@ from kge.job import EvaluationJob from kge.util.sampler import TripleClassificationSampler + class TripleClassificationJob(EvaluationJob): """Triple classification evaluation protocol. - Testing model's ability to discriminate between true and false triples based on scores. First, negative (corrupted) - triples are generated by randomly corrupting each triple in the validation and test data. Then the scores for each - triple, produced by the model to evaluate, is retrieved. 
Afterwards a threshold is determined for each relation. - The best threshold for every relation is determined by maximizing the accuracy on validation data. The unseen - triples from the train data will then be predicted as True if the score is higher than the threshold of the - respective relation. The metrics include accuracy and precision on test data. If necessary the accuracy/precision - per relation can be returned as well. + Testing a model's ability to classify true and false triples based on + thresholding scores. First, negative (corrupted) triples are generated by + randomly corrupting each triple in the validation and test data. Then the + scores for each triple, produced by the model to evaluate, is retrieved. + Afterwards a threshold is determined for each relation. The best threshold + for every relation is determined by maximizing the accuracy on validation + data. The unseen triples from the train data will then be predicted as True + if the score is higher than the threshold of the respective relation. The + metrics include accuracy and precision on test data. If necessary the + accuracy/precision per relation can be returned as well. """ def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.valid_data_is_prepared = False + self.triple_classification_sampler = TripleClassificationSampler(config, "config_key", dataset) def _prepare(self): """Prepare the corrupted validation and test data. - The triples are corrupted only for the first evaluated epoch. Afterwards is_prepared is set to true to make sure - that every epoch is evaluated on the same data. For model selection, the thresholds are found for validation - data and the accuracy on validation data is used. For testing the thresholds are found for validation data and - evaluated on test data. + The triples are corrupted only for the first evaluated epoch. Afterwards + is_prepared is set to true to make sure that every epoch is evaluated on + the same data. For model selection, the thresholds are found for validation + data and the accuracy on validation data is used. For testing the + thresholds are found for validation data and evaluated on test data. 
""" if self.valid_data_is_prepared: @@ -35,12 +41,28 @@ def _prepare(self): self.config.log("Generate data with corrupted and true triples...") - if self.eval_data == "test": - self.triples_valid, self.valid_labels, self.rel_valid_labels = TripleClassificationSampler.sample(self, self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.test) + if self.eval_split == "test": + ( + self.tune_data, + self.tune_labels, + self.rel_tune_labels, + ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ( + self.eval_data, + self.eval_labels, + self.rel_eval_labels, + ) = self.triple_classification_sampler.sample(self.dataset.split('test')) else: - self.triples_valid, self.valid_labels, self.rel_valid_label = TripleClassificationSampler.sample(self, self.dataset.valid) - self.triples_test, self.test_labels, self.rel_test_labels = TripleClassificationSampler.sample(self, self.dataset.valid) + ( + self.tune_data, + self.tune_labels, + self.rel_tune_label, + ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ( + self.eval_data, + self.eval_labels, + self.rel_eval_labels, + ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -58,27 +80,45 @@ def run(self): epoch_time = -time.time() # Get scores and scores per relation for the corrupted valid data - self.config.log("Compute scores for validation and test datasets...") - s_valid, p_valid, o_valid = self.triples_valid[:, 0], self.triples_valid[:, 1], self.triples_valid[:, 2] - valid_scores = self.model.score_spo(s_valid, p_valid, o_valid) - rel_valid_scores = {int(r): valid_scores[(p_valid == r).nonzero(),:] for r in p_valid.unique()} + self.config.log("Compute scores for tune and eval datasets...") + s_tune, p_tune, o_tune = ( + self.tune_data[:, 0], + self.tune_data[:, 1], + self.tune_data[:, 2], + ) + p_tune_unique = p_tune.unique() + tune_scores = self.model.score_spo(s_tune, p_tune, o_tune) + rel_tune_scores = { + r: tune_scores[(p_tune == r)] for r in p_tune_unique + } # Get scores and scores per relation for the corrupted test data - s_test, p_test, o_test = self.triples_test[:, 0], self.triples_test[:, 1], self.triples_test[:, 2] - test_scores = self.model.score_spo(s_test, p_test, o_test) - rel_test_scores = {int(r): test_scores[(p_test == r).nonzero(),:] for r in p_test.unique()} + s_eval, p_eval, o_eval = ( + self.eval_data[:, 0], + self.eval_data[:, 1], + self.eval_data[:, 2], + ) + p_eval_unique = p_eval.unique() + eval_scores = self.model.score_spo(s_eval, p_eval, o_eval) # Find the best thresholds for every relation on validation data - self.config.log("Learning thresholds on validation data.") - rel_thresholds = self.findThresholds(p_valid, valid_scores, rel_valid_scores, self.valid_labels, self.triples_valid) + self.config.log("Tuning thresholds.") + rel_thresholds = self.findThresholds( + p_tune_unique, + tune_scores, + ) # Make prediction for the specified evaluation data - self.config.log("Evaluating on {} data.".format(self.eval_data)) - rel_predictions, not_in_eval = self.predict(rel_thresholds, rel_test_scores, p_valid, p_test) + self.config.log("Evaluating on {} data.".format(self.eval_split)) + rel_predictions, not_in_eval = self.predict( + eval_scores, rel_thresholds, p_tune_unique, p_eval_unique + ) # Compute Metrics self.config.log("Classification results:") - metrics = 
self._compute_metrics(self.test_labels, self.rel_test_labels, rel_predictions, p_test, not_in_eval) + metrics = self._compute_metrics( + self.eval_labels, self.rel_eval_labels, rel_predictions, p_eval, not_in_eval + ) epoch_time += time.time() # compute trace @@ -86,11 +126,11 @@ def run(self): type="triple_classification", scope="epoch", data_thresholds="Valid", - size_threshold_data = len(self.triples_valid), - data_evaluate=self.eval_data, - size_data_evaluate=len(self.triples_test), + size_threshold_data=len(self.tune_data), + data_evaluate=self.eval_split, + size_data_evaluate=len(self.eval_data), epoch=self.epoch, - size=2*len(self.dataset.valid), + size=2 * len(self.dataset.valid), epoch_time=epoch_time, **metrics, ) @@ -116,109 +156,107 @@ def run(self): return trace_entry - def findThresholds(self, p, valid_scores, rel_scores, valid_labels, valid_data): - """Find the best thresholds per relation by maximizing accuracy on validation data. + def findThresholds( + self, p_tune_unique, tune_scores + ): + """Find the best thresholds per relation by maximizing accuracy on + validation data. - The thresholds are found for every relation by maximizing the accuracy on the validation data. For a given - relation, if the scores of all triple in the relation are sorted, the perfect threshold is always a cut between - two of the scores. This means, that multiple possible values can be defined as thresholds and give the highest - accuracy. To evaluate only as many possible thresholds as really necessary, the scores themselves are considered - as possible thresholds. This allows for a fast implementation. + The thresholds are found for every relation by maximizing the accuracy on + the validation data. For a given relation, if the scores of all triple in + the relation are sorted, the perfect threshold is always a cut between two + of the scores. This means, that multiple possible values can be defined as + thresholds and give the highest accuracy. To evaluate only as many possible + thresholds as really necessary, the scores themselves are considered as + possible thresholds. This allows for a fast implementation. Args: - p: 1-D tensor containing the relations of the corrupted validation dataset. - valid_scores: 2-D tensor containing the scores of all corrupted validation triples. - rel_scores: Dictionary containing the scores of the triples in a relation. - valid_labels: 1-D tensor containing the labels of all corrupted validation triples. - valid_data: Dataset used. Should be the corrupted validation dataset. + + p_tune: 1-D tensor containing the relations of the corrupted validation + dataset. + + tune_scores: 2-D tensor containing the scores of all corrupted + validation triples. + + rel_tune_scores: Dictionary containing the scores of the triples in a + relation. + + tune_thresh_labels: 1-D tensor containing the labels of all corrupted + tuning triples. + + tune_data: Dataset used. Should be the corrupted validation dataset. Returns: - rel_thresholds: Dictionary with thresholds per relation {relation: thresholds}. + rel_thresholds: Dictionary with thresholds per relation + {relation: thresholds}. 
E.g.: {1: tensor(-2.0843, grad_fn=)} """ # Initialize accuracies and thresholds - rel_accuracies = {int(r): -1 for r in p.unique()} - rel_thresholds = {int(r): 0 for r in p.unique()} + rel_accuracies = {r: -1 for r in p_tune_unique} + rel_thresholds = {r: 0 for r in p_tune_unique} # Change the valid scores from a 2D to a 1D tensor - valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) - - for r in p.unique(): - current_rel = (valid_data[:, 1] == r) # 0-1 vector for indexing triples of the current relation - true_labels = valid_labels[current_rel.nonzero()].type(torch.int) - - # valid_scores[current_rel.nonzero()] and rel_scores[int(r)] both contain the scores of the current - # relation. In the comparison, every score is evaluated as possible threshold against all scores. - predictions = (valid_scores[current_rel.nonzero()] >= rel_scores[int(r)]).type(torch.int) - - accuracy = [int(((true_labels==predictions[i]).sum(dim=0)))/len(true_labels) for i in range(len(rel_scores[int(r)]))] - rel_accuracies[int(r)] = max(accuracy) - - # Choose the smallest score of the ones which give the maximum accuracy as threshold to stay consistent. - rel_thresholds[int(r)] = min(rel_scores[int(r)][list(filter(lambda x: accuracy[x] == max(accuracy), range(len(accuracy))))])[0,0] - -# # Alternative implementation: Search for best threshold in an interval -# # Following https://github.com/siddharth-agrawal/Neural-Tensor-Network/blob/master/neuralTensorNetwork.py or -# # https://github.com/dddoss/tensorflow-socher-ntn/blob/master/code/ntn_eval.py (reimplemented Socher et al. 2013) -# -# # Initialize accuracies, thresholds and interval -# min_score = valid_scores.min() -# max_score = valid_scores.max() -# -# rel_accuracies = {int(r): -1 for r in p.unique()} -# rel_thresholds = {int(r): min_score for r in p.unique()} -# -# score = min_score -# -# # Original implementation uses an interval of 0.01, implemented for NTN model. In general the interval should -# # depend on the range of the score values of the model and be at least as large as teh smallest distance between -# # two of the sorted scores -# interval = 0.01 -# valid_scores = torch.as_tensor([float(valid_scores[i]) for i in range(len(valid_scores))]).to(self.device) -# -# while(score<=max_score): -# for r in p.unique(): -# #Predict -# current_rel = (valid_data[:, 1] == r) -# true_labels = valid_labels[current_rel.nonzero()].type(torch.int) -# predictions = (valid_scores[current_rel.nonzero()] >= score).type(torch.int) -# accuracy = int(((true_labels==predictions).sum(dim=0)))/len(true_labels) -# -# if accuracy > rel_accuracies[int(r)]: -# rel_accuracies[int(r)] = accuracy -# rel_thresholds[int(r)] = score.clone() -# -# score += interval + # tune_scores = torch.as_tensor( + # [float(tune_scores[i]) for i in range(len(tune_scores))] + # ).to(self.device) + + for r in p_tune_unique: + # 0-1 vector for indexing triples of the current relation + current_rel = ( + self.tune_data[:, 1] == r + ) + true_labels = self.tune_labels[current_rel] + + # tune_scores[current_rel] and rel_tune_scores[r] both + # contain the scores of the current relation. In the comparison, every + # score is evaluated as possible threshold against all scores. 
+ predictions = ( + tune_scores[current_rel].view(-1, 1) >= tune_scores[current_rel].view(1, -1) + ) + + accuracies = (predictions & true_labels).float().sum(dim=1) / true_labels.size(0) + rel_accuracies[r] = accuracies.max() + + # Choose the smallest score of the ones which give the maximum + # accuracy as threshold to stay consistent. + rel_thresholds[r] = (tune_scores[current_rel][rel_accuracies[r] >= tune_scores[current_rel]]).min() return rel_thresholds - def predict(self, rel_thresholds, rel_scores, p_valid, p_test): + def predict(self, eval_scores, rel_thresholds, p_tune_unique, p_eval_unique): """Makes predictions on evaluation/test data. Parameters: rel_thresholds: Dictionary with relation thresholds. - rel_scores: Dictionary with scores of triples in each relation: - E.g. relation with four triples in it:, e.g. {1: [-2, 1, 2, 4]}. Returns: rel_predictions: Dictionary with predictions for the triples in a relation, e.g. {1: [0, 0, 1, 1]}. not_in_eval: List with relations that are in the test data, but not in the validation data. """ - rel_predictions = {int(r): torch.as_tensor([0]*len(rel_scores[int(r)])).to(self.device) for r in p_test.unique()} - + rel_predictions = dict() # Set variable for relations which are not in valid data, but in test data not_in_eval = [] - for r in p_test.unique(): - if r in p_valid.unique(): # Check if relation which is in valid data also is in test data + for r in p_eval_unique: + if ( + r in p_tune_unique + ): # Check if relation which is in valid data also is in test data # Predict - rel_predictions[int(r)] = rel_scores[int(r)][:, 0, 0] >= rel_thresholds[int(r)] - else: not_in_eval.append(r) + current_rel = ( + self.eval_data[:, 1] == r + ) + rel_predictions[r] = ( + eval_scores[current_rel] >= rel_thresholds[r] + ) + else: + not_in_eval.append(r) return rel_predictions, not_in_eval - def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval): + def _compute_metrics( + self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval + ): """Computes accuracy and precision metrics of predictions. 
Returns: @@ -233,9 +271,10 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test metrics = {} # Create a list for all predicted labels, matching the shape of test_labels - pred_list = torch.tensor([i - for r in p_test.unique() - for i in rel_predictions[int(r)]], dtype=torch.int64) + pred_list = torch.tensor( + [i for r in p_test.unique() for i in rel_predictions[int(r)]], + dtype=torch.int64, + ) metrics["accuracy"] = float(accuracy_score(test_labels, pred_list)) metrics["precision"] = float(precision_score(test_labels, pred_list)) @@ -244,16 +283,19 @@ def _compute_metrics(self, test_labels, rel_test_labels, rel_predictions, p_test precision_per_r = {} accuracy_per_r = {} for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = \ - float(precision_score(rel_test_labels[int(r)], rel_predictions[int(r)])) - accuracy_per_r[str(self.dataset.relations[int(r)])] = \ - float(accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)])) + precision_per_r[str(self.dataset.relations[int(r)])] = float( + precision_score(rel_test_labels[int(r)], rel_predictions[int(r)]) + ) + accuracy_per_r[str(self.dataset.relations[int(r)])] = float( + accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)]) + ) metrics["accuracy_per_relation"] = accuracy_per_r metrics["precision_per_relation"] = precision_per_r + metrics["untested relations due to missing in evaluation data"] = len( + not_in_eval + ) - metrics["untested relations due to missing in evaluation data"] = len(not_in_eval) - - return metrics \ No newline at end of file + return metrics diff --git a/kge/util/sampler.py b/kge/util/sampler.py index 6cb5b3c6d..5f1ab31b8 100644 --- a/kge/util/sampler.py +++ b/kge/util/sampler.py @@ -3,13 +3,9 @@ import random import torch -<<<<<<< HEAD from typing import Optional import numpy as np import numba -======= -import random ->>>>>>> Moved sampling function to sampler.py, updated code documentation SLOTS = [0, 1, 2] SLOT_STR = ["s", "p", "o"] @@ -357,52 +353,3 @@ def _sample(self, positive_triples: torch.Tensor, slot: int, num_samples: int): positive_triples.size(0) * num_samples, ).view(positive_triples.size(0), num_samples) return result - - -class TripleClassificationSampler(KgeNegativeSampler): - def __init__(self, config, configuration_key, dataset): - super().__init__(config, configuration_key, dataset) - - def sample(self, dataset): - """Generates dataset with positive and negative triples. - - Takes each triple of the specified dataset and randomly replaces either the subject or the object with another - subject/object. Only allows a subject/object to be sampled if it appeared as a subject/object at the same - position in the dataset. - - Returns: - corrupted: A new dataset with the original and corrupted triples. - - labels: A vector with labels for the corresponding triples in the dataset. - - rel_labels: A dictionary mapping relations to labels. - Example if we had two triples of relation 1 in the original dataset: {1: [1, 0, 1, 0]} - """ - - # Create objects for the corrupted dataset and the corresponding labels - corrupted = dataset.repeat(1, 2).view(-1, 3) - labels = torch.as_tensor([1, 0] * len(dataset)).to(self.device) - - # The sampling influences the results in the end. 
To compare models or parameters, the seeds should be fixed - if self.config.get("eval.triple_classification_random_seed"): - torch.manual_seed(5465456876546785) - random.seed(5465456876546785) - - # Random decision if sample subject(sample=nonzero) or object(sample=zero) - sample = torch.randint(0, 2, (1, len(dataset))).to(self.device) - - # Sample subjects from subjects which appeared in the dataset - corrupted[1::2][:, 0][sample.nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 0].unique()))))), dtype=torch.int32).to(self.device) - - # Sample objects from objects which appeared in the dataset - corrupted[1::2][:, 2][(sample == 0).nonzero()[:, 1]] = \ - torch.as_tensor(random.choice( - list(map(int, list(map(int, dataset[:, 2].unique()))))), dtype=torch.int32).to(self.device) - - # Save the labels per relation, since this will be needed frequently later on - p = corrupted[:, 1] - rel_labels = {int(r): labels[p == r] for r in p.unique()} - - return corrupted, labels, rel_labels From b0b7791414d034f80b6d5baae5afef1731b71ac3 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Sun, 24 May 2020 01:32:36 +0200 Subject: [PATCH 12/19] config --- examples/toy-complex-train.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/toy-complex-train.yaml b/examples/toy-complex-train.yaml index bfa3fba27..c7dcd3670 100644 --- a/examples/toy-complex-train.yaml +++ b/examples/toy-complex-train.yaml @@ -10,6 +10,10 @@ train: mode: max patience: 4 +eval.type: triple_classification + +valid.every: 1 + model: complex lookup_embedder: dim: 100 From 162ff38de079af1c7cb2d87a0473c91417c45bbb Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 01:02:47 +0200 Subject: [PATCH 13/19] TC works now for datasets without neg samples --- examples/toy-complex-train.yaml | 12 +- kge/job/triple_classification.py | 218 +++++++++++++++++-------------- 2 files changed, 128 insertions(+), 102 deletions(-) diff --git a/examples/toy-complex-train.yaml b/examples/toy-complex-train.yaml index c7dcd3670..be8a0de4b 100644 --- a/examples/toy-complex-train.yaml +++ b/examples/toy-complex-train.yaml @@ -1,5 +1,6 @@ job.type: train -dataset.name: toy +dataset.name: fb15k-237 +#dataset.name: fb15k train: optimizer: Adagrad @@ -9,17 +10,14 @@ train: lr_scheduler_args: mode: max patience: 4 + batch_size: 1024 eval.type: triple_classification valid.every: 1 +valid.metric: accuracy model: complex lookup_embedder: dim: 100 - regularize_weight: 0.8e-7 - initialize: normal_ - initialize_args: - normal_: - mean: 0.0 - std: 0.1 + regularize_weight: 0.0 diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 8611b8586..9e1bf70a9 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -1,9 +1,92 @@ import time import torch -from sklearn.metrics import accuracy_score, precision_score +from kge import Dataset, Config, Configurable +from kge.util.sampler import KgeUniformSampler from kge.job import EvaluationJob -from kge.util.sampler import TripleClassificationSampler + +SLOTS = [0, 1, 2] +SLOT_STR = ["s", "p", "o"] +S, P, O = SLOTS + + +class TripleClassificationSampler(Configurable): + def __init__(self, config: Config, configuration_key: str, dataset: Dataset): + super().__init__(config, configuration_key) + self.dataset = dataset + self._is_prepared = False + self.train_data = None + self.s_entities = None + self.o_entities = None + uni_sampler_config = config.clone() + # 
uni_sampler_config.set("negative_sampling.num_samples.s", self.get_option("num_samples.s")) + uni_sampler_config.set("negative_sampling.num_samples.s", 1) + uni_sampler_config.set("negative_sampling.filtering.s", True) + # uni_sampler_config.set("negative_sampling.num_samples.o", self.get_option("num_samples.o")) + uni_sampler_config.set("negative_sampling.num_samples.o", 1) + uni_sampler_config.set("negative_sampling.filtering.o", True) + self.uniform_sampler = KgeUniformSampler( + uni_sampler_config, "negative_sampling", dataset + ) + + def _prepare(self,): + train_data = self.dataset.split("train") + self.s_entities = train_data[:, S].unique().tolist() + self.o_entities = train_data[:, O].unique().tolist() + self._is_prepared = True + + def sample(self, positive_triples: torch.Tensor): + """Generates dataset with positive and negative triples. + + Takes each triple of the specified dataset and randomly replaces either the + subject or the object with another subject/object. Only allows a subject/object + to be sampled if it appeared as a subject/object at the same position in the dataset. + + Returns: + corrupted: A new dataset with the original and corrupted triples. + + labels: A vector with labels for the corresponding triples in the dataset. + + rel_labels: A dictionary mapping relations to labels. + Example if we had two triples of relation 1 in the original + dataset: {1: [1, 0, 1, 0]} + """ + + if not self._is_prepared: + self._prepare() + + # Create objects for the corrupted dataset and the corresponding labels + corrupted = positive_triples.repeat(1, 2).view(-1, 3) + labels = ( + torch.as_tensor([1, 0] * len(positive_triples)) + .type(torch.bool) + .to(self.config.get("job.device")) + ) + + # Random decision if sample subject(sample=nonzero) or object(sample=zero) + sample_subject = torch.randint(2, (len(positive_triples),)).type(torch.bool) + + # Sample subjects from subjects which appeared in the dataset + # corrupted[1::2][:, S][sample_subject] = torch.as_tensor( + # random.choice(self.s_entities) + # ) + corrupted[1::2, S][sample_subject] = self.uniform_sampler.sample( + corrupted[1::2][sample_subject], S, 1 + ).view(-1) + + # Sample objects from objects which appeared in the dataset + # corrupted[1::2][:, O][(sample_subject == False)] = torch.as_tensor( + # random.choice(self.o_entities) + # ) + corrupted[1::2, O][sample_subject == False] = self.uniform_sampler.sample( + corrupted[1::2][sample_subject == False], O, 1 + ).view(-1) + + # Save the labels per relation, since this will be needed frequently later on + p = corrupted[:, 1] + rel_labels = {int(r): labels[p == r] for r in p.unique()} + + return corrupted, labels, rel_labels class TripleClassificationJob(EvaluationJob): @@ -24,7 +107,9 @@ class TripleClassificationJob(EvaluationJob): def __init__(self, config, dataset, parent_job, model): super().__init__(config, dataset, parent_job, model) self.valid_data_is_prepared = False - self.triple_classification_sampler = TripleClassificationSampler(config, "config_key", dataset) + self.triple_classification_sampler = TripleClassificationSampler( + config, "triple_classification", dataset + ) def _prepare(self): """Prepare the corrupted validation and test data. 
@@ -46,23 +131,31 @@ def _prepare(self): self.tune_data, self.tune_labels, self.rel_tune_labels, - ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("valid").to(self.config.get("job.device")) + ) ( self.eval_data, self.eval_labels, self.rel_eval_labels, - ) = self.triple_classification_sampler.sample(self.dataset.split('test')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("test").to(self.config.get("job.device")) + ) else: ( self.tune_data, self.tune_labels, self.rel_tune_label, - ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("valid").to(self.config.get("job.device")) + ) ( self.eval_data, self.eval_labels, self.rel_eval_labels, - ) = self.triple_classification_sampler.sample(self.dataset.split('valid')) + ) = self.triple_classification_sampler.sample( + self.dataset.split("valid").to(self.config.get("job.device")) + ) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -71,7 +164,6 @@ def _prepare(self): def run(self): """Runs the triple classification job.""" - self.config.log("Starting triple classification...") self._prepare() was_training = self.model.training @@ -80,7 +172,6 @@ def run(self): epoch_time = -time.time() # Get scores and scores per relation for the corrupted valid data - self.config.log("Compute scores for tune and eval datasets...") s_tune, p_tune, o_tune = ( self.tune_data[:, 0], self.tune_data[:, 1], @@ -88,9 +179,7 @@ def run(self): ) p_tune_unique = p_tune.unique() tune_scores = self.model.score_spo(s_tune, p_tune, o_tune) - rel_tune_scores = { - r: tune_scores[(p_tune == r)] for r in p_tune_unique - } + rel_tune_scores = {r: tune_scores[(p_tune == r)] for r in p_tune_unique} # Get scores and scores per relation for the corrupted test data s_eval, p_eval, o_eval = ( @@ -102,35 +191,22 @@ def run(self): eval_scores = self.model.score_spo(s_eval, p_eval, o_eval) # Find the best thresholds for every relation on validation data - self.config.log("Tuning thresholds.") - rel_thresholds = self.findThresholds( - p_tune_unique, - tune_scores, - ) + rel_thresholds = self.findThresholds(p_tune_unique, tune_scores,) # Make prediction for the specified evaluation data self.config.log("Evaluating on {} data.".format(self.eval_split)) - rel_predictions, not_in_eval = self.predict( + metrics, not_in_eval = self.predict( eval_scores, rel_thresholds, p_tune_unique, p_eval_unique ) - # Compute Metrics - self.config.log("Classification results:") - metrics = self._compute_metrics( - self.eval_labels, self.rel_eval_labels, rel_predictions, p_eval, not_in_eval - ) - epoch_time += time.time() # compute trace trace_entry = dict( type="triple_classification", scope="epoch", data_thresholds="Valid", - size_threshold_data=len(self.tune_data), data_evaluate=self.eval_split, - size_data_evaluate=len(self.eval_data), epoch=self.epoch, - size=2 * len(self.dataset.valid), epoch_time=epoch_time, **metrics, ) @@ -152,13 +228,11 @@ def run(self): # reset model and return metrics if was_training: self.model.train() - self.config.log("Finished evaluating on " + self.eval_data + " data.") + self.config.log("Finished evaluating on " + self.eval_split + " data.") return trace_entry - def findThresholds( - self, p_tune_unique, tune_scores - ): + def findThresholds(self, p_tune_unique, tune_scores): """Find the best thresholds per relation by maximizing 
accuracy on validation data. @@ -193,8 +267,7 @@ def findThresholds( """ # Initialize accuracies and thresholds - rel_accuracies = {r: -1 for r in p_tune_unique} - rel_thresholds = {r: 0 for r in p_tune_unique} + rel_thresholds = {r: -float("inf") for r in range(self.dataset.num_relations())} # Change the valid scores from a 2D to a 1D tensor # tune_scores = torch.as_tensor( @@ -203,24 +276,25 @@ def findThresholds( for r in p_tune_unique: # 0-1 vector for indexing triples of the current relation - current_rel = ( - self.tune_data[:, 1] == r - ) - true_labels = self.tune_labels[current_rel] + current_rel = self.tune_data[:, 1] == r + true_labels = self.tune_labels[current_rel].view(-1) # tune_scores[current_rel] and rel_tune_scores[r] both # contain the scores of the current relation. In the comparison, every # score is evaluated as possible threshold against all scores. predictions = ( - tune_scores[current_rel].view(-1, 1) >= tune_scores[current_rel].view(1, -1) - ) + tune_scores[current_rel].view(-1, 1) + >= tune_scores[current_rel].view(1, -1) + ).t() - accuracies = (predictions & true_labels).float().sum(dim=1) / true_labels.size(0) - rel_accuracies[r] = accuracies.max() + accuracies = (predictions == true_labels).float().sum(dim=1) + accuracies_max = accuracies.max() # Choose the smallest score of the ones which give the maximum # accuracy as threshold to stay consistent. - rel_thresholds[r] = (tune_scores[current_rel][rel_accuracies[r] >= tune_scores[current_rel]]).min() + rel_thresholds[r.item()] = tune_scores[current_rel][ + accuracies_max == accuracies + ].min() return rel_thresholds @@ -235,7 +309,7 @@ def predict(self, eval_scores, rel_thresholds, p_tune_unique, p_eval_unique): not_in_eval: List with relations that are in the test data, but not in the validation data. """ - rel_predictions = dict() + tptn = 0 # Set variable for relations which are not in valid data, but in test data not_in_eval = [] for r in p_eval_unique: @@ -243,59 +317,13 @@ def predict(self, eval_scores, rel_thresholds, p_tune_unique, p_eval_unique): r in p_tune_unique ): # Check if relation which is in valid data also is in test data # Predict - current_rel = ( - self.eval_data[:, 1] == r - ) - rel_predictions[r] = ( - eval_scores[current_rel] >= rel_thresholds[r] - ) + current_rel = self.eval_data[:, 1] == r + true_labels = self.eval_labels[current_rel] + predictions = eval_scores[current_rel] >= rel_thresholds[r.item()] + tptn += (predictions == true_labels).float().sum().item() else: not_in_eval.append(r) - return rel_predictions, not_in_eval - - def _compute_metrics( - self, test_labels, rel_test_labels, rel_predictions, p_test, not_in_eval - ): - """Computes accuracy and precision metrics of predictions. - - Returns: - metrics: dictionary with the specified metrics accuracy and precision as keys. If specified, metrics per - relation are stored as dictionaries in the dictionary. 
- E.g.: {accuracy: 0.9 - accuracy_per_relation: - {relation 1: 0.8} - {relation 2: 0.9} - } - """ - metrics = {} - - # Create a list for all predicted labels, matching the shape of test_labels - pred_list = torch.tensor( - [i for r in p_test.unique() for i in rel_predictions[int(r)]], - dtype=torch.int64, - ) - - metrics["accuracy"] = float(accuracy_score(test_labels, pred_list)) - metrics["precision"] = float(precision_score(test_labels, pred_list)) - - if self.config.get("eval.metrics_per.relation"): - precision_per_r = {} - accuracy_per_r = {} - for r in p_test.unique(): - precision_per_r[str(self.dataset.relations[int(r)])] = float( - precision_score(rel_test_labels[int(r)], rel_predictions[int(r)]) - ) - accuracy_per_r[str(self.dataset.relations[int(r)])] = float( - accuracy_score(rel_test_labels[int(r)], rel_predictions[int(r)]) - ) - - metrics["accuracy_per_relation"] = accuracy_per_r - - metrics["precision_per_relation"] = precision_per_r - - metrics["untested relations due to missing in evaluation data"] = len( - not_in_eval - ) + metrics = dict(accuracy=tptn / self.eval_data.size(0)) - return metrics + return metrics, not_in_eval From 6344697f090e31222cd78857d08671575bcf64b5 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 01:54:40 +0200 Subject: [PATCH 14/19] Fix neg sampling with filtering for unseen sp, po in train --- kge/util/sampler.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kge/util/sampler.py b/kge/util/sampler.py index 5f1ab31b8..03f9f1ac5 100644 --- a/kge/util/sampler.py +++ b/kge/util/sampler.py @@ -279,7 +279,11 @@ def _filter_and_resample_fast( positives_index = numba.typed.Dict() for i in range(batch_size): pair = (pairs[i][0], pairs[i][1]) - positives_index[pair] = index.get(pair).numpy() + positives_index[pair] = ( + index.get(pair).numpy() + if pair in index + else torch.IntTensor([]).numpy() + ) negative_samples = negative_samples.numpy() KgeUniformSampler._filter_and_resample_numba( negative_samples, pairs, positives_index, batch_size, int(voc_size), From 4df0c3be5ca20db696468b567462d328f7072cb7 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 10:23:14 +0200 Subject: [PATCH 15/19] Remove uneeded stuff --- kge/job/triple_classification.py | 36 ++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 9e1bf70a9..2d5cce804 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -57,11 +57,7 @@ def sample(self, positive_triples: torch.Tensor): # Create objects for the corrupted dataset and the corresponding labels corrupted = positive_triples.repeat(1, 2).view(-1, 3) - labels = ( - torch.as_tensor([1, 0] * len(positive_triples)) - .type(torch.bool) - .to(self.config.get("job.device")) - ) + labels = torch.as_tensor([1, 0] * len(positive_triples)).type(torch.bool) # Random decision if sample subject(sample=nonzero) or object(sample=zero) sample_subject = torch.randint(2, (len(positive_triples),)).type(torch.bool) @@ -82,11 +78,10 @@ def sample(self, positive_triples: torch.Tensor): corrupted[1::2][sample_subject == False], O, 1 ).view(-1) - # Save the labels per relation, since this will be needed frequently later on - p = corrupted[:, 1] - rel_labels = {int(r): labels[p == r] for r in p.unique()} - - return corrupted, labels, rel_labels + return ( + corrupted.to(self.config.get("job.device")), + labels.to(self.config.get("job.device")), + ) class 
TripleClassificationJob(EvaluationJob): @@ -130,32 +125,20 @@ def _prepare(self): ( self.tune_data, self.tune_labels, - self.rel_tune_labels, - ) = self.triple_classification_sampler.sample( - self.dataset.split("valid").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("valid")) ( self.eval_data, self.eval_labels, - self.rel_eval_labels, - ) = self.triple_classification_sampler.sample( - self.dataset.split("test").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("test")) else: ( self.tune_data, self.tune_labels, - self.rel_tune_label, - ) = self.triple_classification_sampler.sample( - self.dataset.split("valid").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("valid")) ( self.eval_data, self.eval_labels, - self.rel_eval_labels, - ) = self.triple_classification_sampler.sample( - self.dataset.split("valid").to(self.config.get("job.device")) - ) + ) = self.triple_classification_sampler.sample(self.dataset.split("valid")) # let the model add some hooks, if it wants to do so self.model.prepare_job(self) @@ -179,7 +162,6 @@ def run(self): ) p_tune_unique = p_tune.unique() tune_scores = self.model.score_spo(s_tune, p_tune, o_tune) - rel_tune_scores = {r: tune_scores[(p_tune == r)] for r in p_tune_unique} # Get scores and scores per relation for the corrupted test data s_eval, p_eval, o_eval = ( From bb5a575be26321d236736debf21c1d5d3a455ee7 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 10:23:37 +0200 Subject: [PATCH 16/19] Init supporting TC datasets --- data/download_all.sh | 19 ++++++++++ data/preprocess.py | 83 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/data/download_all.sh b/data/download_all.sh index 6424f80bb..adba8b274 100755 --- a/data/download_all.sh +++ b/data/download_all.sh @@ -201,3 +201,22 @@ else echo wikidata5m already prepared fi + +# wn11 +if [ ! -d "$BASEDIR/wn11" ]; then + echo Downloading wikidata5m + cd $BASEDIR + curl -O https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wordnet11.zip + unzip wn11.zip + mv wordnet11/wordnet11 wn11 + rm -r wordnet11/ + mv wn11/dev.txt wn11/valid.txt +else + echo wikidata5m already present +fi +if [ ! 
-f "$BASEDIR/wn11/dataset.yaml" ]; then + python preprocess.py wikidata5m +else + echo wikidata5m already prepared +fi + diff --git a/data/preprocess.py b/data/preprocess.py index fa1c735db..43da59d02 100755 --- a/data/preprocess.py +++ b/data/preprocess.py @@ -19,6 +19,7 @@ import numpy as np from collections import OrderedDict + def store_map(symbol_map, filename): with open(filename, "w") as f: for symbol, index in symbol_map.items(): @@ -29,14 +30,26 @@ def store_map(symbol_map, filename): parser = argparse.ArgumentParser() parser.add_argument("folder", type=str) parser.add_argument("--order_sop", action="store_true") + parser.add_argument("--triple_class", action="store_true") args = parser.parse_args() print(f"Preprocessing {args.folder}...") raw_split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"} split_files = {"train": "train.del", "valid": "valid.del", "test": "test.del"} - string_files = {"entity_strings": "entity_strings.del", "relation_strings": "relation_strings.del"} - split_files_without_unseen = {"train_sample": "train_sample.del", "valid_without_unseen": "valid_without_unseen.del", - "test_without_unseen": "test_without_unseen.del"} + split_files_label = { + "train_label": "train_label.del", + "valid_label": "valid_label.del", + "test_label": "test_label.del", + } + string_files = { + "entity_strings": "entity_strings.del", + "relation_strings": "relation_strings.del", + } + split_files_without_unseen = { + "train_sample": "train_sample.del", + "valid_without_unseen": "valid_without_unseen.del", + "test_without_unseen": "test_without_unseen.del", + } split_sizes = {} if args.order_sop: @@ -73,7 +86,7 @@ def store_map(symbol_map, filename): if "train" in split: entities_in_train = entities.copy() relations_in_train = relations.copy() - + print(f"{len(relations)} distinct relations") print(f"{len(entities)} distinct entities") print("Writing relation and entity map...") @@ -87,13 +100,23 @@ def store_map(symbol_map, filename): for split, filename in split_files.items(): if split in ["valid", "test"]: split_without_unseen = split + "_without_unseen" - f_wo_unseen = open(os.path.join(args.folder, - split_files_without_unseen[split_without_unseen]), "w") + f_wo_unseen = open( + os.path.join( + args.folder, split_files_without_unseen[split_without_unseen] + ), + "w", + ) else: split_without_unseen = split + "_sample" - f_tr_sample = open(os.path.join(args.folder, - split_files_without_unseen[split_without_unseen]), "w") - train_sample = np.random.choice(split_sizes["train"], split_sizes["valid"], False) + f_tr_sample = open( + os.path.join( + args.folder, split_files_without_unseen[split_without_unseen] + ), + "w", + ) + train_sample = np.random.choice( + split_sizes["train"], split_sizes["valid"], False + ) with open(os.path.join(args.folder, filename), "w") as f: size_unseen = 0 for n, t in enumerate(raw[split]): @@ -115,8 +138,12 @@ def store_map(symbol_map, filename): + "\n" ) size_unseen += 1 - elif split in ["valid", "test"] and t[S] in entities_in_train and \ - t[O] in entities_in_train and t[P] in relations_in_train: + elif ( + split in ["valid", "test"] + and t[S] in entities_in_train + and t[O] in entities_in_train + and t[P] in relations_in_train + ): f_wo_unseen.write( str(entities[t[S]]) + "\t" @@ -127,15 +154,32 @@ def store_map(symbol_map, filename): ) size_unseen += 1 without_unseen_sizes[split_without_unseen] = size_unseen + if args.triple_class: + for split, filename in split_files_label.items(): + if split in ["valid", "test"]: + 
split_without_unseen = split + "_without_unseen" + f_wo_unseen = open( + os.path.join( + args.folder, split_files_without_unseen[split_without_unseen] + ), + "w", + ) + with open(os.path.join(args.folder, filename), "w") as f: + for n, t in enumerate(raw[split]): + f.write(t[4] + "\n") + if ( + t[S] in entities_in_train + and t[O] in entities_in_train + and t[P] in relations_in_train + ): + f_wo_unseen.write(t[4] + "\n") # write config print("Writing dataset.yaml...") dataset_config = dict( - name=args.folder, - num_entities=len(entities), - num_relations=len(relations), + name=args.folder, num_entities=len(entities), num_relations=len(relations), ) - for obj in [ "entity", "relation" ]: + for obj in ["entity", "relation"]: dataset_config[f"files.{obj}_ids.filename"] = f"{obj}_ids.del" dataset_config[f"files.{obj}_ids.type"] = "map" for split in split_files.keys(): @@ -143,9 +187,16 @@ def store_map(symbol_map, filename): dataset_config[f"files.{split}.type"] = "triples" dataset_config[f"files.{split}.size"] = split_sizes.get(split) for split in split_files_without_unseen.keys(): - dataset_config[f"files.{split}.filename"] = split_files_without_unseen.get(split) + dataset_config[f"files.{split}.filename"] = split_files_without_unseen.get( + split + ) dataset_config[f"files.{split}.type"] = "triples" dataset_config[f"files.{split}.size"] = without_unseen_sizes.get(split) + if args.triple_class: + for split in split_files_label.keys(): + dataset_config[f"files.{split}.filename"] = split_files_label.get(split) + dataset_config[f"files.{split}.type"] = "label" + dataset_config[f"files.{split}.size"] = split_sizes.get(split) for string in string_files.keys(): if os.path.exists(os.path.join(args.folder, string_files[string])): dataset_config[f"files.{string}.filename"] = string_files.get(string) From 2b9878024eda0e1bb8c86011ab5c97111dc1a5c1 Mon Sep 17 00:00:00 2001 From: samuelbroscheit Date: Mon, 25 May 2020 10:23:57 +0200 Subject: [PATCH 17/19] config --- examples/toy-complex-train.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/toy-complex-train.yaml b/examples/toy-complex-train.yaml index be8a0de4b..59d8bc78b 100644 --- a/examples/toy-complex-train.yaml +++ b/examples/toy-complex-train.yaml @@ -1,4 +1,5 @@ job.type: train +#dataset.name: toy dataset.name: fb15k-237 #dataset.name: fb15k From 5b1a5b408b087d207884c01931714199b80e44c1 Mon Sep 17 00:00:00 2001 From: nzteb Date: Thu, 4 Jun 2020 15:50:55 +0200 Subject: [PATCH 18/19] Add preprocess functionality for wn11 --- data/download_all.sh | 20 +++++--- data/preprocess.py | 111 +++++++++++++++++++++++++++------------- kge/config-default.yaml | 8 --- 3 files changed, 89 insertions(+), 50 deletions(-) diff --git a/data/download_all.sh b/data/download_all.sh index adba8b274..ea63e0d9d 100755 --- a/data/download_all.sh +++ b/data/download_all.sh @@ -204,19 +204,25 @@ fi # wn11 if [ ! -d "$BASEDIR/wn11" ]; then - echo Downloading wikidata5m + echo Downloading wn11 cd $BASEDIR + # TODO this also creates a __MACOSX folder on non-mac in the zip + # TODO download file from mannheim server curl -O https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wordnet11.zip - unzip wn11.zip - mv wordnet11/wordnet11 wn11 - rm -r wordnet11/ + unzip wordnet11.zip + if [ -d "__MACOSX" ]; then + rm -r __MACOSX + fi + mv wordnet11 wn11 mv wn11/dev.txt wn11/valid.txt else - echo wikidata5m already present + echo wn11 already present fi + if [ ! 
-f "$BASEDIR/wn11/dataset.yaml" ]; then - python preprocess.py wikidata5m + python preprocess.py wn11 --triple_class else - echo wikidata5m already prepared + echo wn11 already prepared fi + diff --git a/data/preprocess.py b/data/preprocess.py index 43da59d02..3d2627528 100755 --- a/data/preprocess.py +++ b/data/preprocess.py @@ -7,7 +7,7 @@ During preprocessing, each distinct entity name and each distinct distinct relation name is assigned an index (dense). The index-to-object mapping is stored in files -"entity_map.del" and "relation_map.del", resp. The triples (as indexes) are stored in +"entity_ids.del" and "relation_ids.del", resp. The triples (as indexes) are stored in files "train.del", "valid.del", and "test.del". Metadata information is stored in a file "dataset.yaml". @@ -36,11 +36,7 @@ def store_map(symbol_map, filename): print(f"Preprocessing {args.folder}...") raw_split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"} split_files = {"train": "train.del", "valid": "valid.del", "test": "test.del"} - split_files_label = { - "train_label": "train_label.del", - "valid_label": "valid_label.del", - "test_label": "test_label.del", - } + string_files = { "entity_strings": "entity_strings.del", "relation_strings": "relation_strings.del", @@ -50,6 +46,15 @@ def store_map(symbol_map, filename): "valid_without_unseen": "valid_without_unseen.del", "test_without_unseen": "test_without_unseen.del", } + + if args.triple_class: + split_files_negatives = { + "valid_negatives": "valid_negatives.del", + "test_negatives": "test_negatives.del"} + split_files_negatives_without_unseen = { + "valid_negatives_without_unseen": "valid_negatives_without_unseen.del", + "test_negatives_without_unseen": "test_negatives_without_unseen.del"} + split_sizes = {} if args.order_sop: @@ -106,6 +111,15 @@ def store_map(symbol_map, filename): ), "w", ) + if args.triple_class: + split_negatives_wo_unseen = f"{split}_negatives_without_unseen" + f_negatives_wo_unseen = open( + os.path.join( + args.folder, + split_files_negatives_without_unseen[split_negatives_wo_unseen] + ), + "w" + ) else: split_without_unseen = split + "_sample" f_tr_sample = open( @@ -118,9 +132,34 @@ def store_map(symbol_map, filename): split_sizes["train"], split_sizes["valid"], False ) with open(os.path.join(args.folder, filename), "w") as f: - size_unseen = 0 + if args.triple_class and split in ["valid", "test"]: + split_negatives = f"{split}_negatives" + f_negatives = open( + os.path.join( + args.folder, + split_files_negatives[split_negatives], + ), + "w", + ) + + if args.triple_class: + size_negatives = 0 + size_negatives_unseen = 0 + # positives; valid and test sizes have to be recalculated + size_positives = 0 + size_positives_unseen = 0 + else: + size_positives_unseen = 0 for n, t in enumerate(raw[split]): - f.write( + if args.triple_class and split in ["valid", "test"] and int(t[3]) == -1: + file_wrapper = f_negatives + size_negatives += 1 + elif args.triple_class and split in ["valid", "test"]: + size_positives += 1 + file_wrapper = f + else: + file_wrapper = f + file_wrapper.write( str(entities[t[S]]) + "\t" + str(relations[t[P]]) @@ -137,14 +176,22 @@ def store_map(symbol_map, filename): + str(entities[t[O]]) + "\n" ) - size_unseen += 1 + size_positives_unseen += 1 elif ( split in ["valid", "test"] and t[S] in entities_in_train and t[O] in entities_in_train and t[P] in relations_in_train ): - f_wo_unseen.write( + + if args.triple_class and int(t[3]) == -1: + file_wrapper = f_negatives_wo_unseen + 
size_negatives_unseen += 1 + else: + file_wrapper = f_wo_unseen + size_positives_unseen += 1 + + file_wrapper.write( str(entities[t[S]]) + "\t" + str(relations[t[P]]) @@ -152,27 +199,11 @@ def store_map(symbol_map, filename): + str(entities[t[O]]) + "\n" ) - size_unseen += 1 - without_unseen_sizes[split_without_unseen] = size_unseen - if args.triple_class: - for split, filename in split_files_label.items(): - if split in ["valid", "test"]: - split_without_unseen = split + "_without_unseen" - f_wo_unseen = open( - os.path.join( - args.folder, split_files_without_unseen[split_without_unseen] - ), - "w", - ) - with open(os.path.join(args.folder, filename), "w") as f: - for n, t in enumerate(raw[split]): - f.write(t[4] + "\n") - if ( - t[S] in entities_in_train - and t[O] in entities_in_train - and t[P] in relations_in_train - ): - f_wo_unseen.write(t[4] + "\n") + if args.triple_class and split in ["valid", "test"]: + without_unseen_sizes[split_negatives_wo_unseen] = size_negatives_unseen + split_sizes[split] = size_positives + split_sizes[split_negatives] = size_negatives + without_unseen_sizes[split_without_unseen] = size_positives_unseen # write config print("Writing dataset.yaml...") @@ -193,10 +224,20 @@ def store_map(symbol_map, filename): dataset_config[f"files.{split}.type"] = "triples" dataset_config[f"files.{split}.size"] = without_unseen_sizes.get(split) if args.triple_class: - for split in split_files_label.keys(): - dataset_config[f"files.{split}.filename"] = split_files_label.get(split) - dataset_config[f"files.{split}.type"] = "label" - dataset_config[f"files.{split}.size"] = split_sizes.get(split) + for split in split_files_negatives.keys(): + dataset_config[f"files.{split}.filename"] = split_files_negatives.get(split) + dataset_config[f"files.{split}.type"] = "triples" + dataset_config[f"files.{split}.size"] = split_sizes[split] + + for split in split_files_negatives_without_unseen.keys(): + dataset_config[f"files.{split}.filename"] = split_files_negatives_without_unseen.get( + split) + dataset_config[f"files.{split}.type"] = "triples" + dataset_config[f"files.{split}.size"] = without_unseen_sizes[ + split] + + + for string in string_files.keys(): if os.path.exists(os.path.join(args.folder, string_files[string])): dataset_config[f"files.{string}.filename"] = string_files.get(string) diff --git a/kge/config-default.yaml b/kge/config-default.yaml index c1a57ae76..08ca70681 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -393,15 +393,7 @@ valid: # Name of the trace entry that holds the validation metric (higher value is # better) -<<<<<<< HEAD -<<<<<<< HEAD metric: mean_reciprocal_rank_filtered_with_test -======= - metric: mean_reciprocal_rank_filtered # Accuracy for triple_classification ->>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train) -======= - metric: mean_reciprocal_rank_filtered # accuracy for triple_classification ->>>>>>> Moved sampling function to sampler.py, updated code documentation # If the above metric is not present in trace (e.g., because a custom metric # should be used), a Python expression to compute the metric. 
Can refer to From 8a4416f68dbf3348e10cc79cdf016a2495d5874f Mon Sep 17 00:00:00 2001 From: nzteb Date: Thu, 4 Jun 2020 18:10:23 +0200 Subject: [PATCH 19/19] Allow to use labels for triple classification from data --- examples/toy-complex-train-tripleclass.yaml | 11 +++-- kge/config-default.yaml | 10 +++- kge/job/triple_classification.py | 54 ++++++++++++++++++++- 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/examples/toy-complex-train-tripleclass.yaml b/examples/toy-complex-train-tripleclass.yaml index 582f8b72f..d75cdd811 100644 --- a/examples/toy-complex-train-tripleclass.yaml +++ b/examples/toy-complex-train-tripleclass.yaml @@ -1,6 +1,6 @@ job.type: train -dataset.name: toy -model: distmult +dataset.name: wn11 +model: complex train: optimizer: Adagrad optimizer_args: @@ -11,8 +11,11 @@ lookup_embedder.dim: 100 lookup_embedder.initialize: xavier_uniform_ eval: type: triple_classification - metrics_per.relation: False - triple_classification_random_seed: False +triple_classification.random_seed: False +triple_classification.negatives_from: data + + valid.metric: accuracy +valid.every: 1 diff --git a/kge/config-default.yaml b/kge/config-default.yaml index 08ca70681..f06a8eea6 100644 --- a/kge/config-default.yaml +++ b/kge/config-default.yaml @@ -330,7 +330,7 @@ eval: # mean_reciprocal_rank_filtered_with_test. filter_with_test: True - # Type of evaluation (entity_ranking only at the moment) + # Type of evaluation (entity_ranking, triple_classification) type: entity_ranking # How to handle cases with ties between the correct answer and other answers, e.g., @@ -423,6 +423,14 @@ valid: ## EVALUATION ################################################################## +triple_classification: + random_seed: False + # How to obtain negative triple labels. Possible values are: + # - corruption: Create negatives by randomly corrupting existing triples (positives) + # - data : Obtain negative labels from the dataset. 
This assumes the data set + # contains the splits 'valid_negatives' and 'test_negatives' + negatives_from: corruption + ## HYPERPARAMETER SEARCH ####################################################### diff --git a/kge/job/triple_classification.py b/kge/job/triple_classification.py index 2d5cce804..918b19520 100644 --- a/kge/job/triple_classification.py +++ b/kge/job/triple_classification.py @@ -20,7 +20,12 @@ def __init__(self, config: Config, configuration_key: str, dataset: Dataset): self.o_entities = None uni_sampler_config = config.clone() # uni_sampler_config.set("negative_sampling.num_samples.s", self.get_option("num_samples.s")) + # TODO this is redundant as uniform.sample() is called with "num_samples" here in self.sample() uni_sampler_config.set("negative_sampling.num_samples.s", 1) + # TODO maybe changing the API of KGEsampler.sample() to also accept a param "filter" + # as it is the case already with "num_samples" + # then we would not rely here on configuration options which actually + # belong to a training job uni_sampler_config.set("negative_sampling.filtering.s", True) # uni_sampler_config.set("negative_sampling.num_samples.o", self.get_option("num_samples.o")) uni_sampler_config.set("negative_sampling.num_samples.o", 1) @@ -31,6 +36,7 @@ def __init__(self, config: Config, configuration_key: str, dataset: Dataset): def _prepare(self,): train_data = self.dataset.split("train") + #TODO probably outdated as it refers to out-commented code self.s_entities = train_data[:, S].unique().tolist() self.o_entities = train_data[:, O].unique().tolist() self._is_prepared = True @@ -105,6 +111,20 @@ def __init__(self, config, dataset, parent_job, model): self.triple_classification_sampler = TripleClassificationSampler( config, "triple_classification", dataset ) + self.config.check( + "triple_classification.negatives_from", ["corruption", "data"] + ) + self.negatives_from = self.config.get("triple_classification.negatives_from") + if self.negatives_from == "data": + try: + self.config.get("dataset.files.valid_negatives.type") + self.config.get("dataset.files.test_negatives.type") + except: + raise KeyError( + "No splits test/valid_negatives found for the dataset. " + "Provide a dataset with splits valid_negatives and test_negatives " + "or run triple classification with negatives_from=corruption" + ) def _prepare(self): """Prepare the corrupted validation and test data. 
@@ -121,7 +141,8 @@ def _prepare(self):
 
         self.config.log("Generate data with corrupted and true triples...")
 
-        if self.eval_split == "test":
+        # TODO maybe should be generalized to allow for other splits as valid_wo_unseen
+        if self.eval_split == "test" and self.negatives_from == "corruption":
             (
                 self.tune_data,
                 self.tune_labels,
@@ -130,7 +151,25 @@ def _prepare(self):
                 self.eval_data,
                 self.eval_labels,
             ) = self.triple_classification_sampler.sample(self.dataset.split("test"))
-        else:
+
+        elif self.eval_split == "test" and self.negatives_from == "data":
+            positives_valid = self.dataset.split("valid")
+            negatives_valid = self.dataset.split("valid_negatives")
+            self.tune_data = torch.cat((positives_valid, negatives_valid)).to(self.device)
+            self.tune_labels = torch.cat(
+                (torch.ones(positives_valid.size(0)), torch.zeros(negatives_valid.size(0)))
+            ).to(self.device)
+
+            positives_test = self.dataset.split("test")
+            negatives_test = self.dataset.split("test_negatives")
+            self.eval_data = torch.cat((positives_test, negatives_test)).to(
+                self.device)
+            self.eval_labels = torch.cat(
+                (torch.ones(positives_test.size(0)),
+                 torch.zeros(negatives_test.size(0)))
+            ).to(self.device)
+
+        elif self.eval_split == "valid" and self.negatives_from == "corruption":
             (
                 self.tune_data,
                 self.tune_labels,
@@ -140,6 +179,17 @@ def _prepare(self):
                 self.eval_labels,
             ) = self.triple_classification_sampler.sample(self.dataset.split("valid"))
 
+        elif self.eval_split == "valid" and self.negatives_from == "data":
+            positives = self.dataset.split("valid")
+            negatives = self.dataset.split("valid_negatives")
+            self.tune_data = torch.cat((positives, negatives)).to(self.device)
+            self.tune_labels = torch.cat(
+                (torch.ones(positives.size(0)), torch.zeros(negatives.size(0)))
+            ).to(self.device)
+
+            self.eval_data = self.tune_data
+            self.eval_labels = self.tune_labels
+
         # let the model add some hooks, if it wants to do so
         self.model.prepare_job(self)
         self.valid_data_is_prepared = True
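
For reference, the evaluation protocol that findThresholds and predict implement after patch 13 (tune one score threshold per relation that maximizes accuracy on labeled tuning triples, then classify evaluation triples against those thresholds) can be sketched standalone as follows. This sketch is illustrative only and is not part of the patch series: the function names (tune_thresholds, classify) and the random toy scores merely stand in for the job's tensors and for model.score_spo(...).

    import torch


    def tune_thresholds(p_tune, tune_scores, tune_labels):
        """Per relation, pick the score threshold that maximizes accuracy on the
        tuning triples of that relation; candidate thresholds are the observed
        scores themselves, as in findThresholds."""
        thresholds = {}
        for r in p_tune.unique():
            mask = p_tune == r
            scores = tune_scores[mask]           # scores of this relation's triples
            labels = tune_labels[mask].bool()    # True = positive triple
            # predictions[i, j]: is triple j predicted positive with threshold scores[i]?
            predictions = scores.view(1, -1) >= scores.view(-1, 1)
            accuracies = (predictions == labels.view(1, -1)).float().mean(dim=1)
            best = accuracies == accuracies.max()
            # smallest score among the equally good candidates, for consistency
            thresholds[int(r)] = scores[best].min()
        return thresholds


    def classify(p_eval, eval_scores, eval_labels, thresholds):
        """Accuracy of thresholded scores on the evaluation triples; relations
        without a tuned threshold are skipped (cf. not_in_eval)."""
        correct, total = 0, 0
        for r in p_eval.unique():
            if int(r) not in thresholds:
                continue
            mask = p_eval == r
            pred = eval_scores[mask] >= thresholds[int(r)]
            correct += (pred == eval_labels[mask].bool()).sum().item()
            total += int(mask.sum())
        return correct / total


    # toy usage with random scores standing in for model.score_spo(...)
    if __name__ == "__main__":
        torch.manual_seed(0)
        p = torch.randint(0, 3, (200,))            # relation ids
        labels = torch.randint(0, 2, (200,))       # 1 = positive, 0 = negative
        scores = torch.randn(200) + labels.float() # positives tend to score higher
        thresholds = tune_thresholds(p, scores, labels)
        print("accuracy:", classify(p, scores, labels, thresholds))

Using the observed scores themselves as candidate thresholds, as findThresholds does, avoids the interval search over [min_score, max_score] from the commented-out alternative and needs no model-specific step size.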