forked from shubham16394/Text-Similarity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
22 lines (17 loc) · 1013 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def build_feature_matrix(documents, feature_type='frequency',
ngram_range=(1, 1), min_df=0.0, max_df=1.0):
feature_type = feature_type.lower().strip()
if feature_type == 'binary':
vectorizer = CountVectorizer(binary=True, min_df=min_df,
max_df=max_df, ngram_range=ngram_range)
elif feature_type == 'frequency':
vectorizer = CountVectorizer(binary=False, min_df=min_df,
max_df=max_df, ngram_range=ngram_range)
elif feature_type == 'tfidf':
vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
ngram_range=ngram_range)
else:
raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
feature_matrix = vectorizer.fit_transform(documents).astype(float)
return vectorizer, feature_matrix