Commit: corpus prediction

pasqLisena committed Apr 8, 2020
1 parent a5189ad commit 1bb0a4b
Showing 21 changed files with 35,370 additions and 944 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -136,3 +136,4 @@ data/*
tests/

!data/data.txt
!data/20ng*.txt
26 changes: 22 additions & 4 deletions app/builtin/abstract_model.py
@@ -9,7 +9,7 @@ def __init__(self):
def load(self):
"""
Load the model and any dependencies.
Can also not be implemented.
Implementation not mandatory.
"""
pass

@@ -21,7 +21,16 @@ def predict(self, doc, topn=5):
"""
raise NotImplementedError

def train(self, datapath='/app/data/data.txt'):
def predict_corpus(self, datapath='/data/data.txt'):
if self.model is None:
self.load()

with open(datapath, "r") as datafile:
text = [line.rstrip() for line in datafile if line.strip()]  # skip blank lines

return [self.predict(t) for t in text]

def train(self, datapath='/data/data.txt'):
"""
datapath: path to training data text file
"""
@@ -38,6 +47,14 @@ def get_raw_topics(self):
def topics(self):
raise NotImplementedError

def get_corpus_predictions(self):
"""
Return the predictions computed on the training corpus.
Predictions are not recomputed here; they are read back from the training results.
"""
raise NotImplementedError


def coherence(self, datapath='/app/data/data.txt', coherence='c_v'):
""" Get coherence of model topics """
if self.model is None:
@@ -52,14 +69,15 @@ def coherence(self, datapath='/app/data/data.txt', coherence='c_v'):
while True:
try:
coherence_model = gensim.models.coherencemodel.CoherenceModel(topics=topic_words, texts=text,
dictionary=dictionary, coherence=coherence)
dictionary=dictionary,
coherence=coherence)
coherence_per_topic = coherence_model.get_coherence_per_topic()

for i in range(len(topic_words)):
json_topics[str(i)][coherence] = coherence_per_topic[i]

json_topics[coherence] = np.nanmean(coherence_per_topic)
json_topics[coherence+'_std'] = np.nanstd(coherence_per_topic)
json_topics[coherence + '_std'] = np.nanstd(coherence_per_topic)

break

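The new predict_corpus helper lazily loads the model, reads one document per non-empty line of the input file, and runs predict on each. A minimal sketch of how a concrete subclass would be used (the class name SomeModel and the printed weights are illustrative assumptions, not part of this commit):

# Sketch only: SomeModel stands for any concrete AbstractModel subclass.
model = SomeModel()
# One prediction per non-empty line of the file; each prediction is a
# topn-long list of {topic: weight} mappings sorted by descending weight.
predictions = model.predict_corpus('/data/data.txt')
print(predictions[0])  # e.g. [{3: 0.41}, {17: 0.22}, ...]
# get_corpus_predictions, by contrast, reads back what training already
# computed instead of re-running inference.
training_topics = model.get_corpus_predictions()
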
24 changes: 23 additions & 1 deletion app/builtin/gsdmm_model.py
@@ -82,7 +82,29 @@ def train(self,
def predict(self, doc, topn=5):
if self.model is None:
self.load()

# gsdmm works on short text
# given the preprocessing, the input contains no punctuation nor stopwords
# we keep only the first 7 words, joined back with spaces
doc = ' '.join(doc.split()[:7])

results = list(enumerate(self.model.score(doc)))
results = [{topic: weight} for topic, weight in sorted(results, key=lambda kv: kv[1], reverse=True)[:topn]]
return results

def get_corpus_predictions(self):
if self.model is None:
self.load()

# gsdmm does not save the training corpus predictions
# however, re-scoring an 11k-document corpus is very fast

with open('../data/20ng.txt', "r") as datafile:
docs = [line.rstrip() for line in datafile if line.strip()]  # skip blank lines

scores = [self.model.score(' '.join(doc.split()[:7])) for doc in docs]

topics = [[(topic, score) for topic, score in enumerate(doc)] for doc in scores]
topics = [sorted(doc, key=lambda t: -t[1]) for doc in topics]

return topics
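
For illustration, the short-text truncation used by both methods above behaves as follows (sample sentence invented):

# Illustration of the truncation applied before scoring (sentence invented).
doc = 'machine learning topic models need short clean text for scoring'
print(' '.join(doc.split()[:7]))
# -> 'machine learning topic models need short clean'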
11 changes: 8 additions & 3 deletions app/builtin/lda_model.py
@@ -37,12 +37,11 @@ def predict(self, doc, topn=5):
results = [{topic: weight} for topic, weight in
sorted(doc_topic_dist, key=lambda kv: kv[1], reverse=True)[:topn]]

print(results)
return results

# Train the model
def train(self,
datapath=ROOT+'/data/data.txt',
datapath=ROOT + '/data/data.txt',
num_topics=35,
alpha=50,
random_seed=5,
@@ -119,7 +118,7 @@ def topics(self):
return json_topics

# Get weighted similarity of topic words and tags
def evaluate(self, datapath=ROOT+'/data/data.txt', tagspath=ROOT+'/data/tags.txt', topn=5):
def evaluate(self, datapath=ROOT + '/data/data.txt', tagspath=ROOT + '/data/tags.txt', topn=5):
# Load a KeyedVector model using a pre-trained word2vec
word2vecmodel = gensim.models.KeyedVectors.load(W2V_PATH, mmap='r')
# Load vocabulary
@@ -191,3 +190,9 @@ def evaluate(self, datapath=ROOT+'/data/data.txt', tagspath=ROOT+'/data/tags.txt

# Return score
return score

def get_corpus_predictions(self):
if self.model is None:
self.load()

return list(self.model.load_document_topics())
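
Since load_document_topics() streams the per-document topic distributions saved at training time, the corpus predictions come back without re-running inference. A small sketch of consuming the result (model construction and the printed values are assumptions):

# Sketch: iterate the stored per-document distributions (no re-inference).
# 'model' is an already-constructed LDA wrapper instance (construction omitted).
corpus_topics = model.get_corpus_predictions()
for doc_id, doc_topics in enumerate(corpus_topics[:3]):
    # doc_topics is a list of (topic_id, weight) pairs for one document
    best_topic, weight = max(doc_topics, key=lambda t: t[1])
    print(doc_id, best_topic, round(weight, 3))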
58 changes: 20 additions & 38 deletions app/builtin/lftm_model.py
@@ -16,13 +16,14 @@
MODEL_ROOT = ROOT + '/models/lftm'
TOP_WORDS = MODEL_ROOT + '/TEDLFLDA.topWords'
PARAS_PATH = MODEL_ROOT + '/TEDLFLDA.paras'
THETA_PATH_MODEL = MODEL_ROOT + '/TEDLFLDA.theta'
DATA_GLOVE = MODEL_ROOT + '/data_glove.txt'

DATA_ROOT = ROOT + '/data/lftm' # these files are regenerated at each prediction
DOC_PATH = DATA_ROOT + '/doc.txt'
DATA_GLOVE = DATA_ROOT + '/data_glove.txt'
THETA_PATH = DATA_ROOT + '/TEDLFLDAinf.theta'

os.makedirs('/data/lftm', exist_ok=True)
os.makedirs(ROOT + '/data/lftm', exist_ok=True)

W2V_BIN = DATA_ROOT + '/word2vec.bin'

@@ -38,12 +39,7 @@ def remove_tokens(x, tok2remove):
class LftmModel(AbstractModel):

# Perform Inference
def predict(self,
doc,
initer=500,
niter=0,
topn=10,
name='TEDLFLDAinf'):
def predict(self, doc, initer=500, niter=0, topn=10, name='TEDLFLDAinf'):
"""
doc: the document on which to make the inference
initer: initial sampling iterations to separate the counts for the latent feature component and the Dirichlet multinomial component
@@ -60,18 +56,9 @@ def predict(self,
f.write(doc)

# Perform Inference
completedProc = subprocess.run(
'java -jar {} -model {} -paras {} -corpus {} -initers {} -niters {} -twords '
'{} -name {} -sstep {}'.format(
LFTM_JAR,
'LFLDAinf',
PARAS_PATH,
DOC_PATH,
str(initer),
str(niter),
str(topn),
name,
'0'), shell=True)
proc = 'java -jar {} -model {} -paras {} -corpus {} -initers {} -niters {} -twords {} -name {} -sstep {}' \
.format(LFTM_JAR, 'LFLDAinf', PARAS_PATH, DOC_PATH, str(initer), str(niter), str(topn), name, '0')
completedProc = subprocess.run(proc, shell=True)

# os.system('mv /app/data/TEDLFLDAinf.* /app/models/lftm/')

@@ -131,23 +118,10 @@ def train(self,
completedProc = subprocess.run(
'java -jar {} -model {} -corpus {} -vectors {} -ntopics {} -alpha {} -beta {}'
' -lambda {} -initers {} -niters {} -twords {} -name {} -sstep {}'.format(
LFTM_JAR,
'LFLDA',
DATA_GLOVE,
GLOVE_TXT,
str(ntopics),
str(alpha),
str(beta),
str(_lambda),
str(initer),
str(niter),
str(topn),
'TEDLFLDA',
'0'), shell=True)

print(completedProc.returncode)
LFTM_JAR, 'LFLDA', DATA_GLOVE, GLOVE_TXT, str(ntopics), str(alpha),
str(beta), str(_lambda), str(initer), str(niter), str(topn), 'TEDLFLDA', '0'), shell=True)

return 'success'
return 'success' if completedProc.returncode == 0 else ('error %d' % completedProc.returncode)

def get_raw_topics(self):
json_topics = {}
@@ -200,7 +174,6 @@ def evaluate(self, tagspath=DATA_ROOT + '/tags.txt', topn=5):
if not tags:
continue

print('doc', num_doc)
doc_score = 0
topic_weights = 0
# Iterate over the top topics
@@ -224,11 +197,20 @@ def evaluate(self, tagspath=DATA_ROOT + '/tags.txt', topn=5):
doc_score += topic_weight * topic_score
topic_weights += topic_weight
doc_score /= topic_weights
print('doc score', doc_score)

score += doc_score

score /= (num_doc + 1)

print(score)
return score

def get_corpus_predictions(self):
with open(THETA_PATH_MODEL, "r") as file:
doc_topic_dist = [line.strip().split() for line in file]

topics = [[(i, float(score)) for i, score in enumerate(doc)]
for doc in doc_topic_dist]

topics = [sorted(doc, key=lambda t: -t[1]) for doc in topics]
return topics
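
The parsing above assumes the .theta file holds one space-separated row of topic probabilities per training document. A toy check of that logic (file contents invented):

# Toy check of the theta parsing above (file contents invented).
sample = '0.10 0.70 0.20\n0.05 0.15 0.80'
rows = [line.strip().split() for line in sample.split('\n')]
topics = [sorted([(i, float(s)) for i, s in enumerate(row)], key=lambda t: -t[1])
          for row in rows]
print(topics[0])  # -> [(1, 0.7), (2, 0.2), (0, 0.1)]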
8 changes: 7 additions & 1 deletion app/builtin/ntm_model.py
@@ -46,9 +46,15 @@ def topics(self):

return json_topics

def get_corpus_predictions(self):
if self.model is None:
self.load()

return [self.model.get_document_topics(i) for i in range(len(self.model.get_docvecs()))]

# Train the model
def train(self,
datapath='/app/data/data.txt',
datapath='/data/data.txt',
n_topics=35,
batch_size=1024 * 6,
n_epochs=20,
12 changes: 12 additions & 0 deletions app/project.py
@@ -45,6 +45,18 @@ def predict(model_type):
# Return results and elapsed time
return jsonify({'time': dur, 'results': results}), 200

@app.route('/api/<string:model_type>/predict_corpus', methods=['POST'])
def predict_corpus(model_type):
start = time.time()

# Load the model
model = models[model_type]()
# Perform Inference
results = model.predict_corpus(request.json['datapath'])
dur = time.time() - start
# Return results and elapsed time
return jsonify({'time': dur, 'results': results}), 200
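
A request against the new endpoint might look like the following (host, port, and the lda model key are assumptions, not part of this commit):

# Example client call; host, port and model key are assumed.
import requests

resp = requests.post('http://localhost:5000/api/lda/predict_corpus',
                     json={'datapath': '/data/data.txt'})
body = resp.json()
print(body['time'], len(body['results']))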


#################################################################
# TAGS #
2 changes: 1 addition & 1 deletion app/requirements.txt
@@ -7,6 +7,6 @@ SCIPY
SCIKIT-LEARN
GENSIM
KERAS==2.3.1
TENSORFLOW==1.14
TENSORFLOW==1.15.2
swagger-ui-py
flask-cors