-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcompile-evaluation-vocabulary.py
66 lines (58 loc) · 2.42 KB
/
compile-evaluation-vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
import gzip
import argparse
import random
import io
import os
import sys
from read_write import read_word_vectors
from read_write import gzopen
import subprocess
import eval_wordsim
import eval_translate
import eval_qvec
import eval_parse
import eval_classify
import os
def get_vocab(eval_dirs):
  """Collect the union of word types needed by all evaluation datasets.

  Args:
    eval_dirs: iterable of dataset directory paths; each path must start with
      a known 'eval-data/...' prefix that identifies the evaluation task.

  Returns:
    A set of the word types relevant to any of the given datasets.

  Raises:
    ValueError: if a directory does not exist, or its prefix is not one of
      the recognized task prefixes.
  """
  vocab = set()
  for dir_path in eval_dirs:
    if not os.path.isdir(dir_path):
      # Raise instead of print + 'assert False': asserts are stripped under
      # python -O, which would silently skip this validation.
      raise ValueError('the following data directory in the tasks_datasets file does not exist: {}'.format(dir_path))
    # Dispatch on the directory prefix to the task-specific vocabulary reader.
    if dir_path.startswith('eval-data/wordsim'):
      vocab |= eval_wordsim.get_relevant_word_types(dir_path + '/annotated_word_pairs')
    elif dir_path.startswith('eval-data/word_translation'):
      # [0] selects the first element of the pair returned by eval_translate
      # (presumably the source-side vocabulary — TODO confirm in eval_translate).
      vocab |= eval_translate.get_relevant_word_types(dir_path + '/dictionary')[0]
    elif dir_path.startswith('eval-data/qvec'):
      vocab |= eval_qvec.get_relevant_word_types(dir_path + '/semantic_classes')
    elif dir_path.startswith('eval-data/parsing'):
      vocab |= eval_parse.get_relevant_word_types(dir_path + '/train_treebank', dir_path + '/test_treebank')
    elif dir_path.startswith('eval-data/classification'):
      vocab |= eval_classify.get_relevant_word_types(dir_path + '/document-representations/data/idfs/idf.all')
    else:
      raise ValueError('unrecognized evaluation data directory prefix: {}'.format(dir_path))
  return vocab
def get_eval_dirs(tasks_datasets_filename):
  """Read the tasks_datasets config file and return the dataset directories.

  Each non-empty, non-comment line must contain exactly four
  whitespace-separated fields; the second field is the path of the
  evaluation data directory.

  Args:
    tasks_datasets_filename: path to the tasks_datasets description file.

  Returns:
    A list of dataset directory paths, one per valid line, in file order.

  Raises:
    ValueError: if a line does not have exactly four fields.
  """
  eval_dirs = []
  # 'with' guarantees the file handle is closed (the original leaked it).
  with open(tasks_datasets_filename) as tasks_file:
    for line in tasks_file:
      stripped = line.strip()
      # Skip blank lines and '#' comment lines.
      if not stripped or stripped.startswith('#'):
        continue
      splits = stripped.split()
      # Raise instead of assert: asserts are stripped under python -O.
      if len(splits) != 4:
        raise ValueError('expected 4 whitespace-separated fields, found {} in line: {}'.format(len(splits), stripped))
      eval_dirs.append(splits[1])
  return eval_dirs
def main(argv):
  """Compile the evaluation vocabulary and write it to a file.

  Command-line flags (read by argparse from sys.argv; *argv* is unused):
    -tasks_datasets: (input) file describing the available evaluation
      tasks and their data directories.
    -vocab: (output) file to which the evaluation vocabulary is written,
      one word type per line, utf8-encoded.
  """
  # parse/validate arguments
  argparser = argparse.ArgumentParser()
  argparser.add_argument("-tasks_datasets", help="(input) a file that describes the available evaluation tasks and their data, e.g., https://github.com/wammar/multilingual-embeddings-eval-portal/blob/master/tasks_datasets")
  argparser.add_argument("-vocab", help="(output) write the evaluation vocabulary to this file.")
  args = argparser.parse_args()
  # read the tasks_datasets file
  eval_dirs = get_eval_dirs(args.tasks_datasets)
  # collect vocabulary from evaluation scripts
  vocab = get_vocab(eval_dirs)
  # write vocabulary to file, one word type per line
  with io.open(args.vocab, encoding='utf8', mode='w') as vocab_file:
    vocab_file.write('\n'.join(vocab))
  # parenthesized single-arg print is valid under both Python 2 and Python 3
  # (the original Py2-only print statement is a SyntaxError under Python 3)
  print('evaluation vocabulary size is {}, written to {}'.format(len(vocab), args.vocab))
# Script entry point; argv is passed along, but argparse reads sys.argv itself.
if __name__ == '__main__':
  main(sys.argv)