Commit: corpus prediction

pasqLisena committed Apr 8, 2020
1 parent a5189ad commit 1bb0a4b
Showing 21 changed files with 35,370 additions and 944 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -136,3 +136,4 @@ data/*
tests/

!data/data.txt
!data/20ng*.txt
26 changes: 22 additions & 4 deletions app/builtin/abstract_model.py
@@ -9,7 +9,7 @@ def __init__(self):
def load(self):
"""
Load the model and any dependencies.
Can also not be implemented.
Implementation not mandatory.
"""
pass

@@ -21,7 +21,16 @@ def predict(self, doc, topn=5):
"""
raise NotImplementedError

def train(self, datapath='/app/data/data.txt'):
def predict_corpus(self, datapath='/data/data.txt'):
if self.model is None:
self.load()

with open(datapath, "r") as datafile:
text = [line.rstrip() for line in datafile if line.strip()]  # skip blank lines

return [self.predict(t) for t in text]

def train(self, datapath='/data/data.txt'):
"""
datapath: path to training data text file
"""
@@ -38,6 +47,14 @@ def get_raw_topics(self):
def topics(self):
raise NotImplementedError

def get_corpus_predictions(self):
"""
Return the predictions computed on the training corpus.
Predictions are not recomputed here; they are read back from the training results.
"""
raise NotImplementedError


def coherence(self, datapath='/app/data/data.txt', coherence='c_v'):
""" Get coherence of model topics """
if self.model is None:
@@ -52,14 +69,15 @@ def coherence(self, datapath='/app/data/data.txt', coherence='c_v'):
while True:
try:
coherence_model = gensim.models.coherencemodel.CoherenceModel(topics=topic_words, texts=text,
dictionary=dictionary, coherence=coherence)
dictionary=dictionary,
coherence=coherence)
coherence_per_topic = coherence_model.get_coherence_per_topic()

for i in range(len(topic_words)):
json_topics[str(i)][coherence] = coherence_per_topic[i]

json_topics[coherence] = np.nanmean(coherence_per_topic)
json_topics[coherence+'_std'] = np.nanstd(coherence_per_topic)
json_topics[coherence + '_std'] = np.nanstd(coherence_per_topic)

break

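The new predict_corpus helper lazily loads the model, reads one document per non-empty line of the input file, and runs predict on each. A minimal sketch of how a concrete subclass would be used (the class name SomeModel and the printed weights are illustrative assumptions, not part of this commit):

# Sketch only: SomeModel stands for any concrete AbstractModel subclass.
model = SomeModel()
# One prediction per non-empty line of the file; each prediction is a
# topn-long list of {topic: weight} mappings sorted by descending weight.
predictions = model.predict_corpus('/data/data.txt')
print(predictions[0])  # e.g. [{3: 0.41}, {17: 0.22}, ...]
# get_corpus_predictions, by contrast, reads back what training already
# computed instead of re-running inference.
training_topics = model.get_corpus_predictions()
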
24 changes: 23 additions & 1 deletion app/builtin/gsdmm_model.py
@@ -82,7 +82,29 @@ def train(self,
def predict(self, doc, topn=5):
if self.model is None:
self.load()

# gsdmm works on short text
# given the preprocessing, the input contains no punctuation nor stopwords
# we keep only the first 7 words, joined back with spaces
doc = ' '.join(doc.split()[:7])

results = list(enumerate(self.model.score(doc)))
results = [{topic: weight} for topic, weight in sorted(results, key=lambda kv: kv[1], reverse=True)[:topn]]
return results

def get_corpus_predictions(self):
if self.model is None:
self.load()

# gsdmm does not save the training corpus predictions
# however, re-scoring an 11k-document corpus is very fast

with open('../data/20ng.txt', "r") as datafile:
docs = [line.rstrip() for line in datafile if line.strip()]  # skip blank lines

scores = [self.model.score(' '.join(doc.split()[:7])) for doc in docs]

topics = [[(topic, score) for topic, score in enumerate(doc)] for doc in scores]
topics = [sorted(doc, key=lambda t: -t[1]) for doc in topics]

return topics
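
For illustration, the short-text truncation used by both methods above behaves as follows (sample sentence invented):

# Illustration of the truncation applied before scoring (sentence invented).
doc = 'machine learning topic models need short clean text for scoring'
print(' '.join(doc.split()[:7]))
# -> 'machine learning topic models need short clean'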
11 changes: 8 additions & 3 deletions app/builtin/lda_model.py
@@ -37,12 +37,11 @@ def predict(self, doc, topn=5):
results = [{topic: weight} for topic, weight in
sorted(doc_topic_dist, key=lambda kv: kv[1], reverse=True)[:topn]]

print(results)
return results

# Train the model
def train(self,
datapath=ROOT+'/data/data.txt',
datapath=ROOT + '/data/data.txt',
num_topics=35,
alpha=50,
random_seed=5,
@@ -119,7 +118,7 @@ def topics(self):
return json_topics

# Get weighted similarity of topic words and tags
def evaluate(self, datapath=ROOT+'/data/data.txt', tagspath=ROOT+'/data/tags.txt', topn=5):
def evaluate(self, datapath=ROOT + '/data/data.txt', tagspath=ROOT + '/data/tags.txt', topn=5):
# Load a KeyedVector model using a pre-trained word2vec
word2vecmodel = gensim.models.KeyedVectors.load(W2V_PATH, mmap='r')
# Load vocabulary
@@ -191,3 +190,9 @@ def evaluate(self, datapath=ROOT+'/data/data.txt', tagspath=ROOT+'/data/tags.txt

# Return score
return score

def get_corpus_predictions(self):
if self.model is None:
self.load()

return list(self.model.load_document_topics())
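
Since load_document_topics() streams the per-document topic distributions saved at training time, the corpus predictions come back without re-running inference. A small sketch of consuming the result (model construction and the printed values are assumptions):

# Sketch: iterate the stored per-document distributions (no re-inference).
# 'model' is an already-constructed LDA wrapper instance (construction omitted).
corpus_topics = model.get_corpus_predictions()
for doc_id, doc_topics in enumerate(corpus_topics[:3]):
    # doc_topics is a list of (topic_id, weight) pairs for one document
    best_topic, weight = max(doc_topics, key=lambda t: t[1])
    print(doc_id, best_topic, round(weight, 3))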
58 changes: 20 additions & 38 deletions app/builtin/lftm_model.py
@@ -16,13 +16,14 @@
MODEL_ROOT = ROOT + '/models/lftm'
TOP_WORDS = MODEL_ROOT + '/TEDLFLDA.topWords'
PARAS_PATH = MODEL_ROOT + '/TEDLFLDA.paras'
THETA_PATH_MODEL = MODEL_ROOT + '/TEDLFLDA.theta'
DATA_GLOVE = MODEL_ROOT + '/data_glove.txt'

DATA_ROOT = ROOT + '/data/lftm' # these files are regenerated at each prediction
DOC_PATH = DATA_ROOT + '/doc.txt'
DATA_GLOVE = DATA_ROOT + '/data_glove.txt'
THETA_PATH = DATA_ROOT + '/TEDLFLDAinf.theta'

os.makedirs('/data/lftm', exist_ok=True)
os.makedirs(ROOT + '/data/lftm', exist_ok=True)

W2V_BIN = DATA_ROOT + '/word2vec.bin'

@@ -38,12 +39,7 @@ def remove_tokens(x, tok2remove):
class LftmModel(AbstractModel):

# Perform Inference
def predict(self,
doc,
initer=500,
niter=0,
topn=10,
name='TEDLFLDAinf'):
def predict(self, doc, initer=500, niter=0, topn=10, name='TEDLFLDAinf'):
"""
doc: the document on which to make the inference
initer: initial sampling iterations to separate the counts for the latent feature component and the Dirichlet multinomial component
@@ -60,18 +56,9 @@ def predict(self,
f.write(doc)

# Perform Inference
completedProc = subprocess.run(
'java -jar {} -model {} -paras {} -corpus {} -initers {} -niters {} -twords '
'{} -name {} -sstep {}'.format(
LFTM_JAR,
'LFLDAinf',
PARAS_PATH,
DOC_PATH,
str(initer),
str(niter),
str(topn),
name,
'0'), shell=True)
proc = 'java -jar {} -model {} -paras {} -corpus {} -initers {} -niters {} -twords {} -name {} -sstep {}' \
.format(LFTM_JAR, 'LFLDAinf', PARAS_PATH, DOC_PATH, str(initer), str(niter), str(topn), name, '0')
completedProc = subprocess.run(proc, shell=True)

# os.system('mv /app/data/TEDLFLDAinf.* /app/models/lftm/')

@@ -131,23 +118,10 @@ def train(self,
completedProc = subprocess.run(
'java -jar {} -model {} -corpus {} -vectors {} -ntopics {} -alpha {} -beta {}'
' -lambda {} -initers {} -niters {} -twords {} -name {} -sstep {}'.format(
LFTM_JAR,
'LFLDA',
DATA_GLOVE,
GLOVE_TXT,
str(ntopics),
str(alpha),
str(beta),
str(_lambda),
str(initer),
str(niter),
str(topn),
'TEDLFLDA',
'0'), shell=True)

print(completedProc.returncode)
LFTM_JAR, 'LFLDA', DATA_GLOVE, GLOVE_TXT, str(ntopics), str(alpha),
str(beta), str(_lambda), str(initer), str(niter), str(topn), 'TEDLFLDA', '0'), shell=True)

return 'success'
return 'success' if completedProc.returncode == 0 else ('error %d' % completedProc.returncode)

def get_raw_topics(self):
json_topics = {}
@@ -200,7 +174,6 @@ def evaluate(self, tagspath=DATA_ROOT + '/tags.txt', topn=5):
if not tags:
continue

print('doc', num_doc)
doc_score = 0
topic_weights = 0
# Iterate over the top topics
@@ -224,11 +197,20 @@ def evaluate(self, tagspath=DATA_ROOT + '/tags.txt', topn=5):
doc_score += topic_weight * topic_score
topic_weights += topic_weight
doc_score /= topic_weights
print('doc score', doc_score)

score += doc_score

score /= (num_doc + 1)

print(score)
return score

def get_corpus_predictions(self):
with open(THETA_PATH_MODEL, "r") as file:
doc_topic_dist = [line.strip().split() for line in file]

topics = [[(i, float(score)) for i, score in enumerate(doc)]
for doc in doc_topic_dist]

topics = [sorted(doc, key=lambda t: -t[1]) for doc in topics]
return topics
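
The parsing above assumes the .theta file holds one space-separated row of topic probabilities per training document. A toy check of that logic (file contents invented):

# Toy check of the theta parsing above (file contents invented).
sample = '0.10 0.70 0.20\n0.05 0.15 0.80'
rows = [line.strip().split() for line in sample.split('\n')]
topics = [sorted([(i, float(s)) for i, s in enumerate(row)], key=lambda t: -t[1])
          for row in rows]
print(topics[0])  # -> [(1, 0.7), (2, 0.2), (0, 0.1)]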
8 changes: 7 additions & 1 deletion app/builtin/ntm_model.py
@@ -46,9 +46,15 @@ def topics(self):

return json_topics

def get_corpus_predictions(self):
if self.model is None:
self.load()

return [self.model.get_document_topics(i) for i in range(len(self.model.get_docvecs()))]

# Train the model
def train(self,
datapath='/app/data/data.txt',
datapath='/data/data.txt',
n_topics=35,
batch_size=1024 * 6,
n_epochs=20,
12 changes: 12 additions & 0 deletions app/project.py
@@ -45,6 +45,18 @@ def predict(model_type):
# Return results and elapsed time
return jsonify({'time': dur, 'results': results}), 200

@app.route('/api/<string:model_type>/predict_corpus', methods=['POST'])
def predict_corpus(model_type):
start = time.time()

# Load the model
model = models[model_type]()
# Perform Inference
results = model.predict_corpus(request.json['datapath'])
dur = time.time() - start
# Return results and elapsed time
return jsonify({'time': dur, 'results': results}), 200
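
A request against the new endpoint might look like the following (host, port, and the lda model key are assumptions, not part of this commit):

# Example client call; host, port and model key are assumed.
import requests

resp = requests.post('http://localhost:5000/api/lda/predict_corpus',
                     json={'datapath': '/data/data.txt'})
body = resp.json()
print(body['time'], len(body['results']))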


#################################################################
# TAGS #
2 changes: 1 addition & 1 deletion app/requirements.txt
@@ -7,6 +7,6 @@ SCIPY
SCIKIT-LEARN
GENSIM
KERAS==2.3.1
TENSORFLOW==1.14
TENSORFLOW==1.15.2
swagger-ui-py
flask-cors