"""
A simple word game powered by word embeddings.
"""
import re
import random
import numpy as np
import h5py
import wordfreq
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import PorterStemmer
from sanic import Sanic
from sanic.response import json


def standardized(term):
    """
    Break a term into underscore-separated lowercase words, dropping a
    leading 'to' (as in verb infinitives) and replacing digits with '#'.
    """
    tokens = wordfreq.tokenize(term.replace('_', ' '), 'xx')
    # Guard against empty input (e.g. a term that is all punctuation)
    if tokens and tokens[0] == 'to':
        tokens = tokens[1:]
    return replace_numbers('_'.join(tokens))
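# e.g. standardized("To Catch 22") -> "catch_##": tokens are lowercased,
# the leading "to" is dropped, and the adjacent digits become '#' signs.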


# These regexes are compiled once at module level rather than on every call
DOUBLE_DIGIT_RE = re.compile(r'[0-9][0-9]')
DIGIT_RE = re.compile(r'[0-9]')


def replace_numbers(s):
    """
    Replace digits with '#' in any term where two adjacent digits appear.
    This operation is applied to text that passes through word2vec, so we
    must match it to look terms up.
    """
    if DOUBLE_DIGIT_RE.search(s):
        return DIGIT_RE.sub('#', s)
    else:
        return s
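# e.g. replace_numbers("catch_22") -> "catch_##", while "b2b" is unchanged
# because it never has two digits in a row.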


# Load word embeddings. Run create_data_file.py to create the
# embeddings.h5 file.
with h5py.File("words/embeddings.h5", "r") as f:
    mat = f['mat']['vecs'][:]
    words = [word.decode('utf-8') for word in f['mat']['words'][:]]
index = {word: i for i, word in enumerate(words)}
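# NOTE: the similarity math below uses raw dot products between rows of
# `mat`, which assumes create_data_file.py stored unit-length vectors so
# that each dot product is a cosine similarity.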

# Word stemming
ls = LancasterStemmer()
ps = PorterStemmer()
stemmers = [
    ls.stem, ps.stem, lambda x: ps.stem(ls.stem(x)),
    lambda x: ls.stem(ps.stem(x))
]

# Acceptable first words
with open('words/acceptable_first_words.txt') as f:
    first_words = set(map(str.strip, f.readlines()))

# Unacceptable words to say
with open('words/bad_words.txt') as f:
    bad_words = set(map(str.strip, f.readlines()))


def lexicallyRelated(word1, word2):
    """
    Determine whether two words might be lexically related to one another:
    either some stemmer maps both to the same stem, or one word is a
    prefix of the other.
    """
    return (any(stem(word1) == stem(word2) for stem in stemmers)
            or word1.startswith(word2) or word2.startswith(word1))
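# e.g. lexicallyRelated("connected", "connecting") is True (both Porter-stem
# to "connect"), and lexicallyRelated("sun", "sunshine") is True because of
# the prefix check.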


def canUse(candidate, past):
    """
    Check whether a candidate word is OK for the computer to play.
    """
    candidateFrequency = wordfreq.zipf_frequency(candidate, "en", wordlist="large")
    candidateRootFrequency = max(
        candidateFrequency,
        wordfreq.zipf_frequency(ps.stem(candidate), "en", wordlist="large"))
    # Reject words that are too infrequent or too frequent (like "a" or "the")
    if candidateFrequency < 2.3 or candidateFrequency > 6:
        return False
    # Mostly, this rejects '#'-containing words
    if not candidate.isalpha():
        return False
    # Reject profanity and other unacceptable words
    if candidate in bad_words:
        return False
    # Reject anything lexically related to a word we've already used
    if any(lexicallyRelated(candidate, w) for w in past):
        return False
    # Reject a relatively infrequent candidate that is, in aggregate, too
    # similar to the last three rounds (six words) of play.
    if (sum(similarityScore(x, candidate) for x in past[-6:]) > 2
            and candidateRootFrequency < 3.2):
        return False
    # Otherwise, we're OK!
    return True
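# The zipf scale is the base-10 log of a word's frequency per billion words,
# so the 2.3-6.0 band spans roughly 0.2 to 1000 occurrences per million:
# very common words like "the" (zipf ~ 7.7) are rejected, as are rare words
# the player is unlikely to know.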


# Precompute the frequency of every word in the vocabulary
frequencies = np.array(
    [wordfreq.zipf_frequency(x, 'en', wordlist="large") for x in words])


def nextWord(prevWord1, prevWord2, past):
    """
    Pick the word most related to both previous words.

    The further apart the two words are from each other, the less frequency
    should matter; it should not drown out subtle connections to both words.
    TODO: Improve this and use the fact that zipf frequency is logarithmic.
    """
    currentCloseness = similarityScore(prevWord1, prevWord2)
    if currentCloseness < .09:
        freq_modifier = 1000.0
    elif currentCloseness < .25:
        freq_modifier = 200.0
    else:
        freq_modifier = 80.0
    i1, i2 = index[prevWord1], index[prevWord2]
    # Score every word by the product of its similarities to both previous
    # words, plus a bonus that favors frequent words.
    wordScores = (np.dot(mat, mat[i1, :]) * np.dot(mat, mat[i2, :])
                  + frequencies / freq_modifier)
    # Walk the candidates from best to worst and take the first usable one
    bestWords = (words[i] for i in reversed(np.argsort(wordScores)))
    bestWord = next(word for word in bestWords if canUse(word, past))
    print(prevWord1, prevWord2, bestWord)
    return bestWord
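# Rough magnitudes: with unit-length vectors the similarity product is at
# most 1 and usually much smaller, while zipf frequencies run about 2-8,
# so the 80-1000 divisors scale the frequency bonus down to a comparable
# range instead of letting it dominate.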


def similarityScore(w1, w2):
    """Return the dot product of the two words' embedding vectors."""
    return np.dot(mat[index[w1], :], mat[index[w2], :])

app = Sanic(__name__)
app.static("/", "./frontend/dist/index.html")
app.static("/static", "./frontend/dist/static/")
@app.route("/first_word")
async def first_word(request):
w = random.choice(list(index))
while w not in first_words or wordfreq.zipf_frequency(w, 'en', wordlist="large") < 3.5:
w = random.choice(list(index))
return json({"word": w})
@app.route("/next_word", methods=['POST'])
async def next_word(request):
"""
1. Do I know the word? If not, send back "unknownWord: true"
2. Are the words the same? If so, send back "unknownWord: false, victory: true"
3. Otherwise, send back unknownWord: false, victory: false, nextWord: word, simScore: simScore
"""
w1 = standardized(request.json['word1'])
w2 = standardized(request.json['word2'])
past = map(standardized, request.json['past'])
past = list(past) + [w1, w2]
if w1 not in words:
return json({"unknownWord": True})
if len(past) > 2:
with open('wordlog.csv', 'a+') as f:
f.write(','.join(past[-4:]) + "\n")
if ps.stem(w1) == ps.stem(w2):
return json({"unknownWord": False, "victory": True})
else:
return json({
"unknownWord": False,
"victory": False,
"nextWord": nextWord(w1, w2, past),
"simScore": float(similarityScore(w1, w2))
})
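# Example round, assuming the server is running locally (the response values
# are illustrative, not actual output):
#
#   curl -X POST http://localhost:8000/next_word \
#        -H 'Content-Type: application/json' \
#        -d '{"word1": "ocean", "word2": "ship", "past": []}'
#
#   => {"unknownWord": false, "victory": false, "nextWord": "...", "simScore": ...}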

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)