forked from shubham16394/Text-Similarity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbm25.py
39 lines (33 loc) · 1.55 KB
/
bm25.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import scipy.sparse as sp
import numpy as np
def compute_corpus_term_idfs(corpus_features, norm_corpus):
    """Compute smoothed inverse document frequencies for every feature column.

    Parameters
    ----------
    corpus_features : sparse matrix or array-like
        Feature matrix of the corpus; anything ``scipy.sparse.csc_matrix``
        can be built from. Each column is treated as one term and each row
        as one document (presumably a bag-of-words count matrix).
    norm_corpus : sized collection
        The corpus itself; only ``len(norm_corpus)`` is used, as the number
        of documents.

    Returns
    -------
    numpy.ndarray
        One value per column: ``1 + log((1 + n_docs) / (1 + df))`` where
        ``df`` is the number of rows holding a non-zero value in the column.
    """
    # Work on a private CSC copy so the caller's matrix is never modified.
    term_columns = sp.csc_matrix(corpus_features, copy=True)
    # The indptr differences count *stored* entries per column. Merge
    # duplicate entries and drop explicitly stored zeros first, otherwise
    # they would be counted as documents containing the term.
    term_columns.sum_duplicates()
    term_columns.eliminate_zeros()
    dfs = np.diff(term_columns.indptr)
    dfs = 1 + dfs  # to smoothen idf later (also avoids division by zero)
    total_docs = 1 + len(norm_corpus)
    idfs = 1.0 + np.log(float(total_docs) / dfs)
    return idfs
def compute_bm25_similarity(doc_features, corpus_features,
                            corpus_doc_lengths, avg_doc_length,
                            term_idfs, k1=1.5, b=0.75):
    """Score every corpus document against one query document with BM25.

    Parameters
    ----------
    doc_features : sparse matrix or array-like
        Feature vector of the query document. Only its first row is used,
        and it is reduced to a presence indicator (values ``>= 1`` become 1).
    corpus_features : sparse matrix or array-like
        Term-frequency matrix of the corpus, one row per document and one
        column per term.
    corpus_doc_lengths : array-like
        Length of every corpus document, in the same order as the rows of
        ``corpus_features``.
    avg_doc_length : float
        Average document length used for length normalisation.
    term_idfs : array-like
        One IDF weight per term column.
    k1 : float, optional
        Term-frequency saturation parameter.
    b : float, optional
        Strength of the document-length normalisation.

    Returns
    -------
    numpy.ndarray
        One BM25 score per corpus document.
    """
    # get corpus bag of words features as a dense array
    if hasattr(corpus_features, "toarray"):
        corpus_features = corpus_features.toarray()
    else:
        corpus_features = np.asarray(corpus_features)
    # convert query document features to binary features
    # this is to keep a note of which terms exist per document
    if hasattr(doc_features, "toarray"):
        doc_features = doc_features.toarray()[0]
    else:
        # copy first: the binarisation below must not touch the caller's data
        doc_features = np.atleast_2d(np.array(doc_features))[0]
    doc_features[doc_features >= 1] = 1
    # compute the document idf scores for present terms
    doc_idfs = doc_features * term_idfs
    # compute numerator expression in BM25 equation
    numerator_coeff = corpus_features * (k1 + 1)
    numerator = np.multiply(doc_idfs, numerator_coeff)
    # compute denominator expression in BM25 equation
    # (asanyarray lets a plain Python list of lengths be divided by a
    # plain Python number, which would otherwise raise TypeError)
    length_ratio = np.asanyarray(corpus_doc_lengths) / avg_doc_length
    denominator_coeff = k1 * (1 - b + (b * length_ratio))
    # one coefficient per document, as a column to broadcast across terms
    denominator_coeff = np.vstack(denominator_coeff)
    denominator = corpus_features + denominator_coeff
    # A term with zero frequency in a document whose length coefficient is
    # also zero (k1 == 0, or b == 1 with an empty document) gives 0/0.
    # Such a term contributes nothing instead of turning the score into NaN.
    undefined = (numerator == 0) & (denominator == 0)
    with np.errstate(invalid="ignore"):
        term_scores = np.divide(numerator, denominator)
    term_scores[undefined] = 0
    # compute the BM25 score combining the above equations
    bm25_scores = np.sum(term_scores, axis=1)
    return bm25_scores