getSimProjects.py
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
# English stop-word list from NLTK (requires a one-time nltk.download('stopwords'))
stoplist = stopwords.words('english')
# Create the dictionary from the README file.
# This pre-processing only removes common English stop words (which carry
# little information) and words that appear in a single document; other
# tweaks may be needed to improve data quality.
def getDict(f):
    # f is an iterable of README documents, one document per line
    dictionary = corpora.Dictionary(line.lower().split() for line in f)
    # remove stop words and words that appear in only one document
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
                if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()  # reassign token ids to remove gaps
    return dictionary
# Transform the corpus into bag-of-words (BoW) format
def getBOW(dictionary, f):
    # f is the same iterable of README documents, one per line
    return [dictionary.doc2bow(line.lower().split()) for line in f]
# Assumption: the README corpus lives in a plain-text file, one document per
# line ('readmes.txt' is a hypothetical name, since the source never defines
# f). Reading it into a list lets both helpers iterate over it.
with open('readmes.txt') as fh:
    f = fh.readlines()

dictionary = getDict(f)
corpusBOW = getBOW(dictionary, f)
corpusTfidf = models.TfidfModel(corpusBOW)  # train the tf-idf model
# Index the tf-idf corpus for similarity queries; num_features must cover
# every token id, so use the dictionary size instead of a hard-coded value
index = similarities.SparseMatrixSimilarity(corpusTfidf[corpusBOW],
                                            num_features=len(dictionary))
testQuery="hello world"
testQueryBow = dictionary.doc2bow(testQuery.lower().split())
testQueryTfidf = corpusTfidf[testQueryBow]
sims = index[testQueryTfidf]
print(list(enumerate(sims)))
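# A minimal follow-up sketch (assumes the setup above, not in the original
# script): rank documents by similarity so the most similar projects come first
ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(ranked[:5])  # top 5 (document index, similarity score) pairs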