Add preprocess functionality for wn11
Nzteb committed Jun 4, 2020
1 parent 2b98780 commit 5b1a5b4
Showing 3 changed files with 89 additions and 50 deletions.
data/download_all.sh: 20 changes (13 additions, 7 deletions)

@@ -204,19 +204,25 @@ fi

 # wn11
 if [ ! -d "$BASEDIR/wn11" ]; then
-  echo Downloading wikidata5m
+  echo Downloading wn11
   cd $BASEDIR
+  # TODO this also creates a __MACOSX folder on non-mac in the zip
+  # TODO download file from mannheim server
   curl -O https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wordnet11.zip
-  unzip wn11.zip
-  mv wordnet11/wordnet11 wn11
-  rm -r wordnet11/
+  unzip wordnet11.zip
+  if [ -d "__MACOSX" ]; then
+    rm -r __MACOSX
+  fi
+  mv wordnet11 wn11
+  mv wn11/dev.txt wn11/valid.txt
 else
-  echo wikidata5m already present
+  echo wn11 already present
 fi
 
 if [ ! -f "$BASEDIR/wn11/dataset.yaml" ]; then
-  python preprocess.py wikidata5m
+  python preprocess.py wn11 --triple_class
 else
-  echo wikidata5m already prepared
+  echo wn11 already prepared
 fi


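A note on the data this fetches: WN11 is a triple-classification dataset, so its evaluation splits carry truth labels. Assuming the usual WordNet11 distribution (the AmpliGraph archive the curl above downloads), train.txt holds tab-separated positive triples only, while valid.txt (renamed from dev.txt above) and test.txt carry a fourth tab-separated column holding 1 for true and -1 for corrupted triples, roughly like this (entity and relation names are illustrative):

    __genus_xylomelum_1    _type_of    __dicot_genus_1    1
    __genus_xylomelum_1    _type_of    __city_1           -1

The new --triple_class flag passed to preprocess.py tells it to expect and exploit this fourth column, as the next file shows.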
data/preprocess.py: 111 changes (76 additions, 35 deletions)

@@ -7,7 +7,7 @@
 During preprocessing, each distinct entity name and each distinct relation name
 is assigned an index (dense). The index-to-object mapping is stored in files
-"entity_map.del" and "relation_map.del", resp. The triples (as indexes) are stored in
+"entity_ids.del" and "relation_ids.del", resp. The triples (as indexes) are stored in
 files "train.del", "valid.del", and "test.del". Metadata information is stored in a file
 "dataset.yaml".
@@ -36,11 +36,7 @@ def store_map(symbol_map, filename):
     print(f"Preprocessing {args.folder}...")
     raw_split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"}
     split_files = {"train": "train.del", "valid": "valid.del", "test": "test.del"}
-    split_files_label = {
-        "train_label": "train_label.del",
-        "valid_label": "valid_label.del",
-        "test_label": "test_label.del",
-    }
+
     string_files = {
         "entity_strings": "entity_strings.del",
         "relation_strings": "relation_strings.del",
@@ -50,6 +46,15 @@ def store_map(symbol_map, filename):
         "valid_without_unseen": "valid_without_unseen.del",
         "test_without_unseen": "test_without_unseen.del",
     }
+
+    if args.triple_class:
+        split_files_negatives = {
+            "valid_negatives": "valid_negatives.del",
+            "test_negatives": "test_negatives.del"}
+        split_files_negatives_without_unseen = {
+            "valid_negatives_without_unseen": "valid_negatives_without_unseen.del",
+            "test_negatives_without_unseen": "test_negatives_without_unseen.del"}
+
     split_sizes = {}
 
     if args.order_sop:
@@ -106,6 +111,15 @@ def store_map(symbol_map, filename):
                 ),
                 "w",
             )
+            if args.triple_class:
+                split_negatives_wo_unseen = f"{split}_negatives_without_unseen"
+                f_negatives_wo_unseen = open(
+                    os.path.join(
+                        args.folder,
+                        split_files_negatives_without_unseen[split_negatives_wo_unseen]
+                    ),
+                    "w"
+                )
         else:
             split_without_unseen = split + "_sample"
             f_tr_sample = open(
@@ -118,9 +132,34 @@ def store_map(symbol_map, filename):
                 split_sizes["train"], split_sizes["valid"], False
             )
         with open(os.path.join(args.folder, filename), "w") as f:
-            size_unseen = 0
+            if args.triple_class and split in ["valid", "test"]:
+                split_negatives = f"{split}_negatives"
+                f_negatives = open(
+                    os.path.join(
+                        args.folder,
+                        split_files_negatives[split_negatives],
+                    ),
+                    "w",
+                )
+
+            if args.triple_class:
+                size_negatives = 0
+                size_negatives_unseen = 0
+                # positives; valid and test sizes have to be recalculated
+                size_positives = 0
+                size_positives_unseen = 0
+            else:
+                size_positives_unseen = 0
             for n, t in enumerate(raw[split]):
-                f.write(
+                if args.triple_class and split in ["valid", "test"] and int(t[3]) == -1:
+                    file_wrapper = f_negatives
+                    size_negatives += 1
+                elif args.triple_class and split in ["valid", "test"]:
+                    size_positives += 1
+                    file_wrapper = f
+                else:
+                    file_wrapper = f
+                file_wrapper.write(
                     str(entities[t[S]])
                     + "\t"
                     + str(relations[t[P]])
@@ -137,42 +176,34 @@ def store_map(symbol_map, filename):
                     + str(entities[t[O]])
                     + "\n"
                 )
-                size_unseen += 1
+                size_positives_unseen += 1
             elif (
                 split in ["valid", "test"]
                 and t[S] in entities_in_train
                 and t[O] in entities_in_train
                 and t[P] in relations_in_train
             ):
-                f_wo_unseen.write(
+
+                if args.triple_class and int(t[3]) == -1:
+                    file_wrapper = f_negatives_wo_unseen
+                    size_negatives_unseen += 1
+                else:
+                    file_wrapper = f_wo_unseen
+                    size_positives_unseen += 1
+
+                file_wrapper.write(
                     str(entities[t[S]])
                     + "\t"
                     + str(relations[t[P]])
                     + "\t"
                     + str(entities[t[O]])
                     + "\n"
                 )
-                size_unseen += 1
-            without_unseen_sizes[split_without_unseen] = size_unseen
-    if args.triple_class:
-        for split, filename in split_files_label.items():
-            if split in ["valid", "test"]:
-                split_without_unseen = split + "_without_unseen"
-                f_wo_unseen = open(
-                    os.path.join(
-                        args.folder, split_files_without_unseen[split_without_unseen]
-                    ),
-                    "w",
-                )
-            with open(os.path.join(args.folder, filename), "w") as f:
-                for n, t in enumerate(raw[split]):
-                    f.write(t[4] + "\n")
-                    if (
-                        t[S] in entities_in_train
-                        and t[O] in entities_in_train
-                        and t[P] in relations_in_train
-                    ):
-                        f_wo_unseen.write(t[4] + "\n")
+            if args.triple_class and split in ["valid", "test"]:
+                without_unseen_sizes[split_negatives_wo_unseen] = size_negatives_unseen
+                split_sizes[split] = size_positives
+                split_sizes[split_negatives] = size_negatives
+            without_unseen_sizes[split_without_unseen] = size_positives_unseen
 
     # write config
     print("Writing dataset.yaml...")
@@ -193,10 +224,20 @@ def store_map(symbol_map, filename):
             dataset_config[f"files.{split}.type"] = "triples"
             dataset_config[f"files.{split}.size"] = without_unseen_sizes.get(split)
     if args.triple_class:
-        for split in split_files_label.keys():
-            dataset_config[f"files.{split}.filename"] = split_files_label.get(split)
-            dataset_config[f"files.{split}.type"] = "label"
-            dataset_config[f"files.{split}.size"] = split_sizes.get(split)
+        for split in split_files_negatives.keys():
+            dataset_config[f"files.{split}.filename"] = split_files_negatives.get(split)
+            dataset_config[f"files.{split}.type"] = "triples"
+            dataset_config[f"files.{split}.size"] = split_sizes[split]
+
+        for split in split_files_negatives_without_unseen.keys():
+            dataset_config[f"files.{split}.filename"] = split_files_negatives_without_unseen.get(
+                split)
+            dataset_config[f"files.{split}.type"] = "triples"
+            dataset_config[f"files.{split}.size"] = without_unseen_sizes[
+                split]
+
+
+
     for string in string_files.keys():
         if os.path.exists(os.path.join(args.folder, string_files[string])):
             dataset_config[f"files.{string}.filename"] = string_files.get(string)
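Net effect of the changes to this file: with --triple_class, each labeled split is fanned out into positives (valid.del, test.del), negatives (valid_negatives.del, test_negatives.del), and *_without_unseen variants restricted to entities and relations seen in train, and every file is registered in dataset.yaml. Following the dataset_config assignments above, the entries for the valid split should come out along these lines (sizes are placeholders for the counts collected during the write pass):

    files.valid.filename: valid.del
    files.valid.type: triples
    files.valid.size: <size_positives>
    files.valid_negatives.filename: valid_negatives.del
    files.valid_negatives.type: triples
    files.valid_negatives.size: <size_negatives>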
kge/config-default.yaml: 8 changes (0 additions, 8 deletions)

@@ -393,15 +393,7 @@ valid:

   # Name of the trace entry that holds the validation metric (higher value is
   # better)
-<<<<<<< HEAD
-<<<<<<< HEAD
   metric: mean_reciprocal_rank_filtered_with_test
-=======
-  metric: mean_reciprocal_rank_filtered # Accuracy for triple_classification
->>>>>>> Improved in-code documentation, removed accuracy output from get_thresholds, added comments for triple classification specification in default file, Included specification of evaluating on either test or valid data depending on the task (Test or validation during train)
-=======
-  metric: mean_reciprocal_rank_filtered # accuracy for triple_classification
->>>>>>> Moved sampling function to sampler.py, updated code documentation
 
   # If the above metric is not present in trace (e.g., because a custom metric
   # should be used), a Python expression to compute the metric. Can refer to
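This last hunk only removes leftover merge-conflict markers, settling on mean_reciprocal_rank_filtered_with_test as the default validation metric. The deleted comments suggest that triple-classification runs track accuracy instead; a user config for wn11 would then presumably override the default roughly like so (a sketch; the metric name is taken from the removed comment, not confirmed elsewhere in this diff):

    valid:
      metric: accuracy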
