From 5b1a5b408b087d207884c01931714199b80e44c1 Mon Sep 17 00:00:00 2001
From: nzteb
Date: Thu, 4 Jun 2020 15:50:55 +0200
Subject: [PATCH] Add preprocess functionality for wn11

---
 data/download_all.sh    |  20 +++++---
 data/preprocess.py      | 111 +++++++++++++++++++++++++++-------------
 kge/config-default.yaml |   8 ---
 3 files changed, 89 insertions(+), 50 deletions(-)

diff --git a/data/download_all.sh b/data/download_all.sh
index adba8b274..ea63e0d9d 100755
--- a/data/download_all.sh
+++ b/data/download_all.sh
@@ -204,19 +204,25 @@ fi
 # wn11
 if [ ! -d "$BASEDIR/wn11" ]; then
-    echo Downloading wikidata5m
+    echo Downloading wn11
     cd $BASEDIR
+    # TODO the upstream zip also contains a __MACOSX folder, which is extracted even on non-mac systems
+    # TODO download file from Mannheim server
     curl -O https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wordnet11.zip
-    unzip wn11.zip
-    mv wordnet11/wordnet11 wn11
-    rm -r wordnet11/
+    unzip wordnet11.zip
+    if [ -d "__MACOSX" ]; then
+        rm -r __MACOSX
+    fi
+    mv wordnet11 wn11
     mv wn11/dev.txt wn11/valid.txt
 else
-    echo wikidata5m already present
+    echo wn11 already present
 fi
+
 if [ ! -f "$BASEDIR/wn11/dataset.yaml" ]; then
-    python preprocess.py wikidata5m
+    python preprocess.py wn11 --triple_class
 else
-    echo wikidata5m already prepared
+    echo wn11 already prepared
 fi
+
diff --git a/data/preprocess.py b/data/preprocess.py
index 43da59d02..3d2627528 100755
--- a/data/preprocess.py
+++ b/data/preprocess.py
@@ -7,7 +7,7 @@

 During preprocessing, each distinct entity name and each distinct relation name
 is assigned an index (dense). The index-to-object mapping is stored in files
-"entity_map.del" and "relation_map.del", resp. The triples (as indexes) are stored in
+"entity_ids.del" and "relation_ids.del", resp. The triples (as indexes) are stored in
 files "train.del", "valid.del", and "test.del". Metadata information is stored in a file
 "dataset.yaml".
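
Note: the *.del files named in the docstring above are plain tab-separated
text. A minimal sketch of reading them back (file names from the docstring;
the reader code itself is illustrative and not part of this patch):

    # Illustrative only: load the outputs of preprocess.py.
    # entity_ids.del / relation_ids.del hold "<index>\t<name>" lines;
    # train.del / valid.del / test.del hold "<s>\t<p>\t<o>" index triples.
    def load_map(path):
        with open(path) as f:
            return dict(line.rstrip("\n").split("\t", 1) for line in f)

    entities = load_map("wn11/entity_ids.del")     # index (str) -> entity name
    relations = load_map("wn11/relation_ids.del")  # index (str) -> relation name
    with open("wn11/train.del") as f:
        triples = [tuple(int(x) for x in line.split("\t")) for line in f]
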
@@ -36,11 +36,7 @@ def store_map(symbol_map, filename):
     print(f"Preprocessing {args.folder}...")
     raw_split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"}
     split_files = {"train": "train.del", "valid": "valid.del", "test": "test.del"}
-    split_files_label = {
-        "train_label": "train_label.del",
-        "valid_label": "valid_label.del",
-        "test_label": "test_label.del",
-    }
+
     string_files = {
         "entity_strings": "entity_strings.del",
         "relation_strings": "relation_strings.del",
@@ -50,6 +46,15 @@ def store_map(symbol_map, filename):
         "valid_without_unseen": "valid_without_unseen.del",
         "test_without_unseen": "test_without_unseen.del",
     }
+
+    if args.triple_class:
+        split_files_negatives = {
+            "valid_negatives": "valid_negatives.del",
+            "test_negatives": "test_negatives.del"}
+        split_files_negatives_without_unseen = {
+            "valid_negatives_without_unseen": "valid_negatives_without_unseen.del",
+            "test_negatives_without_unseen": "test_negatives_without_unseen.del"}
+
     split_sizes = {}

     if args.order_sop:
@@ -106,6 +111,15 @@ def store_map(symbol_map, filename):
                 ),
                 "w",
             )
+            if args.triple_class:
+                split_negatives_wo_unseen = f"{split}_negatives_without_unseen"
+                f_negatives_wo_unseen = open(
+                    os.path.join(
+                        args.folder,
+                        split_files_negatives_without_unseen[split_negatives_wo_unseen]
+                    ),
+                    "w"
+                )
         else:
             split_without_unseen = split + "_sample"
             f_tr_sample = open(
@@ -118,9 +132,34 @@ def store_map(symbol_map, filename):
                 split_sizes["train"], split_sizes["valid"], False
             )
         with open(os.path.join(args.folder, filename), "w") as f:
-            size_unseen = 0
+            if args.triple_class and split in ["valid", "test"]:
+                split_negatives = f"{split}_negatives"
+                f_negatives = open(
+                    os.path.join(
+                        args.folder,
+                        split_files_negatives[split_negatives],
+                    ),
+                    "w",
+                )
+
+            if args.triple_class:
+                size_negatives = 0
+                size_negatives_unseen = 0
+                # positives; valid and test sizes have to be recalculated
+                size_positives = 0
+                size_positives_unseen = 0
+            else:
+                size_positives_unseen = 0
             for n, t in enumerate(raw[split]):
-                f.write(
+                if args.triple_class and split in ["valid", "test"] and int(t[3]) == -1:
+                    file_wrapper = f_negatives
+                    size_negatives += 1
+                elif args.triple_class and split in ["valid", "test"]:
+                    size_positives += 1
+                    file_wrapper = f
+                else:
+                    file_wrapper = f
+                file_wrapper.write(
                     str(entities[t[S]])
                     + "\t"
                     + str(relations[t[P]])
@@ -137,14 +176,22 @@ def store_map(symbol_map, filename):
                     + str(entities[t[O]])
                     + "\n"
                 )
-                size_unseen += 1
+                size_positives_unseen += 1
             elif (
                 split in ["valid", "test"]
                 and t[S] in entities_in_train
                 and t[O] in entities_in_train
                 and t[P] in relations_in_train
             ):
-                f_wo_unseen.write(
+
+                if args.triple_class and int(t[3]) == -1:
+                    file_wrapper = f_negatives_wo_unseen
+                    size_negatives_unseen += 1
+                else:
+                    file_wrapper = f_wo_unseen
+                    size_positives_unseen += 1
+
+                file_wrapper.write(
                     str(entities[t[S]])
                     + "\t"
                     + str(relations[t[P]])
@@ -152,27 +199,11 @@ def store_map(symbol_map, filename):
                     + str(entities[t[O]])
                     + "\n"
                 )
-                size_unseen += 1
-            without_unseen_sizes[split_without_unseen] = size_unseen
-    if args.triple_class:
-        for split, filename in split_files_label.items():
-            if split in ["valid", "test"]:
-                split_without_unseen = split + "_without_unseen"
-                f_wo_unseen = open(
-                    os.path.join(
-                        args.folder, split_files_without_unseen[split_without_unseen]
-                    ),
-                    "w",
-                )
-            with open(os.path.join(args.folder, filename), "w") as f:
-                for n, t in enumerate(raw[split]):
-                    f.write(t[4] + "\n")
-                    if (
-                        t[S] in entities_in_train
-                        and t[O] in entities_in_train
-                        and t[P] in relations_in_train
-                    ):
-                        f_wo_unseen.write(t[4] + "\n")
+            if args.triple_class and split in ["valid", "test"]:
+                without_unseen_sizes[split_negatives_wo_unseen] = size_negatives_unseen
+                split_sizes[split] = size_positives
+                split_sizes[split_negatives] = size_negatives
+            without_unseen_sizes[split_without_unseen] = size_positives_unseen

     # write config
     print("Writing dataset.yaml...")
@@ -193,10 +224,20 @@ def store_map(symbol_map, filename):
         dataset_config[f"files.{split}.type"] = "triples"
         dataset_config[f"files.{split}.size"] = without_unseen_sizes.get(split)
     if args.triple_class:
-        for split in split_files_label.keys():
-            dataset_config[f"files.{split}.filename"] = split_files_label.get(split)
-            dataset_config[f"files.{split}.type"] = "label"
-            dataset_config[f"files.{split}.size"] = split_sizes.get(split)
+        for split in split_files_negatives.keys():
+            dataset_config[f"files.{split}.filename"] = split_files_negatives.get(split)
+            dataset_config[f"files.{split}.type"] = "triples"
+            dataset_config[f"files.{split}.size"] = split_sizes[split]
+
+        for split in split_files_negatives_without_unseen.keys():
+            dataset_config[f"files.{split}.filename"] = split_files_negatives_without_unseen.get(
+                split)
+            dataset_config[f"files.{split}.type"] = "triples"
+            dataset_config[f"files.{split}.size"] = without_unseen_sizes[
+                split]
+
+
+
     for string in string_files.keys():
         if os.path.exists(os.path.join(args.folder, string_files[string])):
             dataset_config[f"files.{string}.filename"] = string_files.get(string)
diff --git a/kge/config-default.yaml b/kge/config-default.yaml
index c1a57ae76..08ca70681 100644
--- a/kge/config-default.yaml
+++ b/kge/config-default.yaml
@@ -393,15 +393,7 @@ valid:

   # Name of the trace entry that holds the validation metric (higher value is
   # better)
-<<<<<<< HEAD
-<<<<<<< HEAD
   metric: mean_reciprocal_rank_filtered_with_test
-=======
-  metric: mean_reciprocal_rank_filtered  # Accuracy for triple_classification
->>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train)
-=======
-  metric: mean_reciprocal_rank_filtered  # accuracy for triple_classification
->>>>>>> Moved sampling function to sampler.py, updated code documentation

   # If the above metric is not present in trace (e.g., because a custom metric
   # should be used), a Python expression to compute the metric. Can refer to
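
Note: the heart of the preprocess.py change is the label-based routing in the
main loop. With --triple_class, each raw wn11 valid/test line carries a fourth
column that is 1 for a positive and -1 for a negative triple; negatives are
written to separate {split}_negatives.del files, which dataset.yaml then
registers as files.valid_negatives.* / files.test_negatives.* entries of type
"triples". A standalone sketch of that routing (toy triples and simplified
bookkeeping, not the patch code itself):

    # Toy illustration of the --triple_class routing: split labeled raw
    # triples (s, p, o, label) into positives and negatives.
    raw_valid = [
        ("dog", "_type_of", "animal", "1"),
        ("dog", "_type_of", "mineral", "-1"),
    ]
    positives, negatives = [], []
    for t in raw_valid:
        if int(t[3]) == -1:          # label column: -1 marks a negative
            negatives.append(t[:3])  # would go to valid_negatives.del
        else:
            positives.append(t[:3])  # would go to valid.del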