antonym-joint_retrofit.py
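
# Joint retrofitting of word vectors to a semantic lexicon, modified so that
# listed neighbours (antonyms) are repelled rather than attracted: each
# neighbour vector enters the update with a negative sign (see retrofit()).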
import argparse
import gzip
import math
import re
import sys
from copy import deepcopy

import numpy as np

isNumber = re.compile(r'\d+.*')

def norm_word(word):
    if isNumber.search(word.lower()):
        return '---num---'
    elif re.sub(r'\W+', '', word) == '':
        return '---punc---'
    else:
        return word.lower()
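
# For illustration, norm_word maps tokens as follows (note: this helper is
# kept from the original retrofitting script and is not called below):
#   norm_word('42nd') -> '---num---'
#   norm_word('!!')   -> '---punc---'
#   norm_word('Dog')  -> 'dog'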

''' Read all the word vectors and normalize them '''
def read_word_vecs(filename):
    wordVectors = {}
    # gzip files must be opened in text mode so lines come back as str
    if filename.endswith('.gz'):
        fileObject = gzip.open(filename, 'rt')
    else:
        fileObject = open(filename, 'r')
    for line in fileObject:
        tokens = line.strip().lower().split()
        word = tokens[0]
        wordVectors[word] = np.array(tokens[1:], dtype=float)
        ''' normalize weight vector '''
        wordVectors[word] /= math.sqrt((wordVectors[word] ** 2).sum() + 1e-6)
    fileObject.close()
    sys.stderr.write("Vectors read from: " + filename + " \n")
    return wordVectors
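
# Expected vector-file format, one entry per line:
#   word v1 v2 ... vd
# Each vector is L2-normalized on read; the 1e-6 term guards against
# division by zero for all-zero vectors.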

''' Write word vectors to file '''
def print_word_vecs(wordVectors, outFileName):
    sys.stderr.write('\nWriting down the vectors in ' + outFileName + '\n')
    outFile = open(outFileName, 'w')
    for word, values in wordVectors.items():
        outFile.write(word + ' ')
        for val in values:
            outFile.write('%.4f' % (val) + ' ')
        outFile.write('\n')
    outFile.close()

''' Read the PPDB word relations as a dictionary '''
def read_lexicon(filename, wordVecs):
    # wordVecs is unused here; the parameter is kept for interface compatibility
    lexicon = {}
    for line in open(filename, 'r'):
        words = line.lower().strip().split()
        lexicon[words[0]] = [word for word in words[1:]]
    return lexicon
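
# Judging by how retrofit() parses the entries, each lexicon line looks like:
#   target%sense neighbour%sense#weight neighbour%sense#weight ...
# e.g. 'renew%0 revoke%0#0.6 cancel%1#0.4' (weights here are illustrative).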

''' Retrofit word vectors to a lexicon '''
def retrofit(wordVecs, lexicon, numIters, w1, w2, w3):
    newWordVecs = deepcopy(wordVecs)
    wvVocab = set(newWordVecs.keys())
    loopVocab = set()
    for w in lexicon.keys():
        if w.split('%')[0] in wvVocab:
            loopVocab.add(w)
    for it in range(numIters):
        # loop through every node also in the ontology (else just use the data estimate)
        wvVocab = set(newWordVecs.keys())
        for word in loopVocab:
            wordNeighbours = []
            weightNeighbours = []
            numNeighbours = 0.0
            for w in lexicon[word]:
                # neighbour entries look like 'renew%0#0.6': word%sense#weight
                weight = float(w.split('#')[1])
                if w.split('#')[0] in wvVocab:
                    # sense-tagged form is in the vocabulary
                    numNeighbours += weight
                    weightNeighbours.append(weight * w2)
                    wordNeighbours.append(w.split('#')[0])
                elif w.split('%')[0] in wvVocab:
                    # fall back to the base form of the word
                    numNeighbours += weight
                    weightNeighbours.append(weight * w3)
                    wordNeighbours.append(w.split('%')[0])
            # no neighbours - pass, i.e. keep the data estimate
            if numNeighbours == 0:
                continue
            # the weight of the data estimate is the total neighbour weight
            newVec = w1 * numNeighbours * wordVecs[word.split('%')[0]]
            numNeighbours *= w1
            for (ppWord, weight) in zip(wordNeighbours, weightNeighbours):
                # antonym neighbours are repelled: subtract their vectors
                newVec += newWordVecs[ppWord] * weight * -1
                numNeighbours += weight
            tmp_vec = newVec / numNeighbours
            # update only if the vector is new or has moved by more than 0.1
            if word.split('#')[0] not in newWordVecs:
                newWordVecs[word.split('#')[0]] = tmp_vec
            elif np.linalg.norm(tmp_vec - newWordVecs[word.split('#')[0]]) > 0.1:
                newWordVecs[word.split('#')[0]] = tmp_vec
    return newWordVecs
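
# The update above is a sign-flipped variant of the usual retrofitting rule.
# With beta_j the lexicon weight of neighbour j and s_j = w2 (sense-tagged
# match) or w3 (base-form match), each pass computes:
#   q_i <- ( w1 * (sum_j beta_j) * q_hat_i  -  sum_j s_j * beta_j * q_j )
#          / ( w1 * (sum_j beta_j) + sum_j s_j * beta_j )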

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, default=None, help="Input word vecs")
    parser.add_argument("-a", "--lexicon", type=str, default=None, help="Lexicon file name")
    parser.add_argument("-o", "--output", type=str, help="Output word vecs")
    parser.add_argument("-n", "--numiter", type=int, default=5, help="Num iterations")
    parser.add_argument("-w1", "--w1", type=float, default=1.0, help="Alpha weight")
    parser.add_argument("-w2", "--w2", type=float, default=1.0, help="Beta weight")
    parser.add_argument("-w3", "--w3", type=float, default=1.0, help="Gamma weight")
    args = parser.parse_args()

    wordVecs = read_word_vecs(args.input)
    lexicon = read_lexicon(args.lexicon, wordVecs)
    numIter = int(args.numiter)
    outFileName = args.output

    ''' Enrich the word vectors using ppdb and print the enriched vectors '''
    print_word_vecs(retrofit(wordVecs, lexicon, numIter, args.w1, args.w2, args.w3), outFileName)
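
# Example invocation (file names are illustrative):
#   python antonym-joint_retrofit.py -i vectors.txt -a antonym_lexicon.txt \
#       -o retrofitted_vectors.txt -n 10 -w1 1.0 -w2 1.0 -w3 1.0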