#!/usr/bin/env python
import argparse
import os
import unicodedata
from collections import Counter

import numpy as np


def is_number(s):
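    """Return True if s looks numeric after stripping common separators."""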
    s = s.replace(',', '')  # 10,000 -> 10000
    s = s.replace(':', '')  # 5:30 -> 530
    s = s.replace('-', '')  # 17-08 -> 1708
    s = s.replace('/', '')  # 17/08/1992 -> 17081992
    try:
        float(s)
        return True
    except ValueError:
        pass
    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False


def process_conll(in_path, out_path, lower=True, clean=True, p=0.1):
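    """Count the words, tags, and labels in a CoNLL file and write one
    frequency-sorted vocabulary file per field.

    Words that occur only once are mapped to <num> (if numeric and clean
    is True) or to <unk> with probability p; the mapping is written next
    to the original word.
    """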
    word_counts = Counter()
    tag_counts = Counter()
    label_counts = Counter()
    with open(in_path, 'r') as f:
        for line in f:
            fields = line.split()
            if fields:
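                # CoNLL columns (0-indexed): fields[1] = word form,
                # fields[3] = POS tag, fields[7] = dependency label.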
                word = fields[1].lower() if lower else fields[1]
                tag = fields[3]
                label = fields[7]
                word_counts.update([word])
                tag_counts.update([tag])
                label_counts.update([label])
    with open(out_path + '.words.txt', 'w') as f:
        for word, count in word_counts.most_common():
            processed = word
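            # Singleton handling: numbers become <num> (when clean is True),
            # the rest become <unk> with probability p.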
            if count == 1:
                if is_number(word) and clean:
                    processed = '<num>'
                elif np.random.random() < p:
                    processed = '<unk>'
            print('{} {} {}'.format(word, processed, count), file=f)
    with open(out_path + '.tags.txt', 'w') as f:
        for tag, count in tag_counts.most_common():
            print('{} {}'.format(tag, count), file=f)
    with open(out_path + '.labels.txt', 'w') as f:
        for label, count in label_counts.most_common():
            print('{} {}'.format(label, count), file=f)


def compare_vocabulary(train_path, dev_path, test_path, out_dir='vocab'):
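    """Compare the train/dev/test vocabularies: write per-split word and
    token counts to a CSV in out_dir, plus the list of dev words that
    never occur in training.
    """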
    train_vocab = dict()
    dev_vocab = dict()
    test_vocab = dict()

    def read_vocab(path, vocab):
        # Each line of a .words.txt file is: word processed_form count.
        with open(path, 'r') as f:
            for line in f:
                word, _, count = line.split()
                vocab[word] = int(count)

    read_vocab(train_path, train_vocab)
    read_vocab(dev_path, dev_vocab)
    read_vocab(test_path, test_vocab)
    nwords_train = len(train_vocab)
    ntokens_train = sum(train_vocab.values())
    nwords_dev = len(dev_vocab)
    ntokens_dev = sum(dev_vocab.values())
    nwords_test = len(test_vocab)
    ntokens_test = sum(test_vocab.values())
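    # Dev words that never occur in the training vocabulary.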
    unseen_words = list(set(dev_vocab) - set(train_vocab))
    num_unseen_tokens = sum(dev_vocab[w] for w in unseen_words)
    with open(os.path.join(out_dir, 'data-statistics.csv'), 'w') as g:
        print('dataset,nwords,ntokens', file=g)
        print('train,{},{}'.format(nwords_train, ntokens_train), file=g)
        print('dev,{},{}'.format(nwords_dev, ntokens_dev), file=g)
        print('test,{},{}'.format(nwords_test, ntokens_test), file=g)
        print('unseen,{},{}'.format(len(unseen_words), num_unseen_tokens), file=g)
    with open(os.path.join(out_dir, 'unseen.txt'), 'w') as f:
        for word in unseen_words:
            print('{} {}'.format(word, dev_vocab[word]), file=f)


def main(args):
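    """Build vocabularies for the train/dev/test CoNLL files and compare them."""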
    data = os.path.expanduser(args.data)
    train_conll_path = os.path.join(data, 'train.conll')
    dev_conll_path = os.path.join(data, 'dev.conll')
    test_conll_path = os.path.join(data, 'test.conll')
    # Make sure the output directory exists before writing into it.
    os.makedirs(args.out, exist_ok=True)
    train_vocab_path = os.path.join(args.out, 'train')
    dev_vocab_path = os.path.join(args.out, 'dev')
    test_vocab_path = os.path.join(args.out, 'test')
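    # Train: no <num> mapping (clean=False) and <unk> for half of the
    # singletons (p=0.5); dev/test: <num> mapping only, no <unk> (p=0.0).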
    process_conll(train_conll_path, train_vocab_path, p=0.5, clean=False)
    process_conll(dev_conll_path, dev_vocab_path, p=0.0)
    process_conll(test_conll_path, test_vocab_path, p=0.0)
    compare_vocabulary(
        train_vocab_path + '.words.txt',
        dev_vocab_path + '.words.txt',
        test_vocab_path + '.words.txt',
        out_dir=args.out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', default='~/data/ptb-stanford')
    parser.add_argument('--out', default='vocab')
    args = parser.parse_args()
    main(args)