search.py
import gensim
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from utils import preprocess_text

# Pre-computed artifacts: the preprocessed Q&A corpus, one embedding per title,
# and the pretrained word2vec model.
data = pd.read_csv('./ML_Module/models/Preprocessed_data.csv')
all_title_embeddings = pd.read_csv('./ML_Module/models/title_embeddings.csv').values
w2v_model = gensim.models.word2vec.Word2Vec.load('./ML_Module/models/SO_word2vec_embeddings.bin')

def question_to_vec(question, embeddings, dim=300):
    """Average the word vectors of every in-vocabulary word in `question`.

    Returns a zero vector of length `dim` when no word is found in the embeddings.
    """
    question_embedding = np.zeros(dim)
    valid_words = 0
    for word in question.split(' '):
        if word in embeddings:
            valid_words += 1
            question_embedding += embeddings[word]
    if valid_words > 0:
        return question_embedding / valid_words
    return question_embedding

def searchresults(search_string, num_results):
    search_string = preprocess_text(search_string)
    # Use the model's KeyedVectors (.wv) so the lookup also works on gensim 4.x.
    search_vect = np.array([question_to_vec(search_string, w2v_model.wv)])
    search_results = []
    cosine_similarities = pd.Series(cosine_similarity(search_vect, all_title_embeddings)[0])
    # Optional re-ranking by question score and sentiment:
    # cosine_similarities = cosine_similarities * (0.4 * data.overall_scores + 0.1 * data.sentiment_polarity)
    for i, j in cosine_similarities.nlargest(int(num_results)).items():
        # Build a short snippet from the first 200 characters of the question body;
        # words that appear in the query take their own branch (presumably a hook
        # for highlighting), although both branches append the word unchanged.
        output = ''
        for t in data.question_content[i][:200].split():
            if t.lower() in search_string:
                output += ' ' + str(t)
            else:
                output += ' ' + str(t)
        search_results.append({
            'Title': str(data.original_title[i]),
            'url': str(data.question_url[i]),
            'Id': str(i),
            'answer': str(data.answers_content[i]),
            'Tags': str(data.tags[i]),
            'similarity_score': str(j)[:5],
            'votes': str(data.overall_scores[i]),
            'Body': str(output),
        })
    return search_results
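

# A minimal usage sketch. Assumption: the module is normally imported by a web
# layer that supplies the query; the query string and result count below are
# made up purely for illustration.
if __name__ == '__main__':
    results = searchresults('convert string to datetime in python', 5)
    for r in results:
        print(r['similarity_score'], r['Title'], r['url'])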