#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Make sure to run data_reader.py before proceeding.
"""
import json
import os
import pickle

import numpy
import psycopg2
import wget
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from zope.interface import implementer, Interface

# Shared PostgreSQL connection; autocommit so schema changes and updates
# apply immediately.
con = psycopg2.connect(
    user="postgres", host="localhost", dbname="olx_data", password="postgres")
con.autocommit = True
cur = con.cursor()


class Preprocessor(object):
    """
    Basic text preprocessing
    """

    def __init__(self):
        # Standard English stop words, extended with marketplace filler
        # words ('condition', 'great', 'selling', ...).
        self.stop_words = \
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
'into', 'through', 'during', 'before', 'after', 'above', 'below',
'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
'under', 'again', 'further', 'then', 'once', 'here', 'there',
'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll',
'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn',
'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn',
'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn',
'condition', 'great', 'good', 'selling', 'sale', 'service',
'urgent', 'price', 'viewing', 'quality']

    def preprocessed(self, str_input):
        """
        Preprocesses an input string: replaces every non-letter character
        with a space, lower-cases the text, and filters out stop words and
        words shorter than three characters
        :param str_input: input string
        :return: preprocessed string
        """
        # Replace every non-alphabetic character with a space.
        cleaned = "".join(
            char if char.isalpha() else " " for char in str_input)
        words = cleaned.strip().lower().split(" ")
        filtered_words = [
            word for word in words if (word not in self.stop_words
                                       and len(word) > 2)]
        return " ".join(filtered_words)


class IFeatureExtractor(Interface):

    def get_vector_feature(text_feature, *args):
        """
        Calculates a vector feature from a text feature
        """


class FeatureExtractorBase(Preprocessor):
    """
    Base class for feature extraction
    """

    def __init__(self):
        super(FeatureExtractorBase, self).__init__()
        self.base_path = os.path.dirname(os.path.realpath(__file__))
        # n-gram lengths mined from listing titles.
        self.n_gram_lens = (1, 2, 3, 4)
        # An n-gram is kept only when the top per-category count exceeds
        # threshold_max_count and its own count is at least
        # threshold_count * max_count (see find_most_frequently_n_grams).
        self.threshold_max_count = 2
        self.threshold_count = 0.5
        # Minimum number of feature words before falling back to the full
        # title and description.
        self.min_feature_len = 3

    def find_most_frequently_n_grams(self, data_set):
        """
        Finds the most frequent n-grams in the whole dataset
        :param data_set: type of data set (train or test)
        :return: most frequent n-grams keyed by n-gram length and category
        """
        query = "SELECT item_id, listing_title, category_l3_name_en FROM " \
                "samples_%s;" % data_set
        cur.execute(query)
        res = cur.fetchall()
        print("Will find most frequent n-grams...")
        # Group the preprocessed titles by level-3 category.
        text_data_by_categories = {}
        for row in res:
            item_id, listing_title, category_l3_name_en = row
            text_data_by_categories.setdefault(
                category_l3_name_en, []).append(
                self.preprocessed(listing_title))
        # Count every n-gram per length and per category.
        n_grams_by_count = {}
        for n_gram_len in self.n_gram_lens:
            n_grams_by_categories = {}
            for category, text_data in text_data_by_categories.items():
                n_grams_by_frequency = {}
                for sentence in text_data:
                    for n_gram in ngrams(sentence.split(), n_gram_len):
                        n_grams_by_frequency[n_gram] = \
                            n_grams_by_frequency.get(n_gram, 0) + 1
                n_grams_by_categories[category] = n_grams_by_frequency
            n_grams_by_count[n_gram_len] = n_grams_by_categories
        # Keep only the n-grams whose count is close enough to the
        # per-category maximum (see the thresholds set in __init__).
        most_frequently_n_grams_by_count = {}
        for n_gram_len, values in n_grams_by_count.items():
            most_frequently_n_grams_by_cat = {}
            for category, n_grams_by_frequency in values.items():
                most_frequently_n_grams_by_cat[category] = []
                counts_list = list(n_grams_by_frequency.values())
                max_count = max(counts_list) if counts_list else 0
                for n_gram, count in n_grams_by_frequency.items():
                    if (max_count > self.threshold_max_count and
                            count >= self.threshold_count * max_count):
                        # Stored as a deduplicated word list: the
                        # downstream comparison is set-based, so word
                        # order is not needed.
                        most_frequently_n_grams_by_cat[category].append(
                            list(set(n_gram)))
            most_frequently_n_grams_by_count[n_gram_len] = \
                most_frequently_n_grams_by_cat
        return most_frequently_n_grams_by_count
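
    # Illustrative shape of the returned mapping (hypothetical data):
    #   {2: {"Mobile Phones": [["iphone", "6s"], ["galaxy", "samsung"]],
    #        "Sofas": [["seater", "sofa"]]}}
    # Keys are ints here; after the JSON round-trip in
    # get_features_for_train() they become strings.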

    def extract_features_per_item(self, res, fr_n_grams):
        """
        Extracts text features from item info given the most frequent
        n-grams
        :param res: item info
        :param fr_n_grams: most frequent n-grams
        :return: set of text features
        """
        features = set()
        listing_title, listing_description, \
            listing_price, category_sk, category_l1_name_en, \
            category_l2_name_en, category_l3_name_en, listing_latitude, \
            listing_longitude = res
        preprocessed_title = self.preprocessed(listing_title).split(" ")
        for n_gram_len in self.n_gram_lens:
            n_grams = ngrams(preprocessed_title, n_gram_len)
            for n_gram in n_grams:
                n_gram_set = set(n_gram)
                # Keys are strings because fr_n_grams is round-tripped
                # through JSON.
                n_grams_by_category = fr_n_grams[str(n_gram_len)]
                if category_l3_name_en not in n_grams_by_category:
                    continue
                for elem in n_grams_by_category[category_l3_name_en]:
                    if set(elem) == n_gram_set:
                        features = features.union(n_gram_set)
        # Fall back to the full title, then the description, when too few
        # words matched a frequent n-gram.
        if len(features) < self.min_feature_len:
            features = features.union(set(preprocessed_title))
        if len(features) < self.min_feature_len:
            features = features.union(
                set(self.preprocessed(listing_description).split(" ")))
        return features
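
    # For a hypothetical title, the fallback chain works as follows: words
    # covered by a frequent n-gram of the item's category are collected
    # first; if fewer than min_feature_len survive, the whole preprocessed
    # title is merged in, and finally the preprocessed description.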
def extract_text_features_per_item_db(self, item_id, fr_n_grams, data_set):
"""
        Extracts text features for an item id given the most frequent
        n-grams
        :param item_id: input item id
        :param fr_n_grams: most frequent n-grams
        :param data_set: type of data set (train or test)
        :return: set of text features
"""
query = "SELECT listing_title, listing_description, listing_price, " \
"category_sk, category_l1_name_en, category_l2_name_en, " \
"category_l3_name_en, listing_latitude, listing_longitude " \
"FROM samples_%s WHERE item_id=%%s;" % data_set
cur.execute(query, (item_id,))
res = cur.fetchone()
if not res:
raise AttributeError(
"No data for item %s in the database" % item_id)
return self.extract_features_per_item(res, fr_n_grams)
def extract_features_per_item_info(self, item_info, fr_n_grams, *args):
"""
        Extracts the text feature and vector feature from item info given
        the most frequent n-grams
        :param item_info: input item info
        :param fr_n_grams: most frequent n-grams
        :param args: arguments for get_vector_feature(), which must be
            implemented in a child class
        :return: text feature string, vector feature
"""
item_features = self.extract_features_per_item(item_info, fr_n_grams)
text_feature = " ".join(item_features)
vector_feature = self.get_vector_feature(text_feature, *args)
return text_feature, vector_feature
def extract_features(self, fr_n_grams, data_set, *args):
"""
        Extracts text and vector features for the whole dataset and stores
        them per item id in the PostgreSQL database
        :param fr_n_grams: most frequent n-grams
        :param data_set: type of data set (train or test)
        :param args: arguments for get_vector_feature(), which must be
            implemented in a child class
"""
print("Will extract features for %s" % data_set)
query = "SELECT EXISTS (SELECT 1 FROM information_schema.columns " \
"WHERE table_name ='samples_%s' AND " \
"column_name='text_feature');" % data_set
cur.execute(query)
row = cur.fetchone()
if not row[0]:
query = "ALTER TABLE samples_%s ADD COLUMN text_feature character " \
"varying(500)" % data_set
cur.execute(query)
query = "ALTER TABLE samples_%s ADD COLUMN vector_feature double " \
"precision[]" % data_set
cur.execute(query)
query = "SELECT item_id FROM samples_%s;" % data_set
cur.execute(query)
res = cur.fetchall()
item_count = 0
for row in res:
if not row:
continue
item_id, = row
item_count += 1
print("Item id: %s. Item count: %s" % (item_id, item_count))
item_features = self.extract_text_features_per_item_db(
item_id, fr_n_grams, data_set)
text_feature = " ".join(item_features)
vector_feature = self.get_vector_feature(text_feature, *args)
query = "UPDATE samples_%s SET text_feature = %%s, vector_feature " \
"= %%s WHERE item_id = %%s;" % data_set
cur.execute(query, (text_feature, vector_feature, item_id))
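
    # After a run, every row of samples_<data_set> carries the two new
    # columns, e.g. with hypothetical values:
    #   text_feature   = 'apple iphone 64gb'
    #   vector_feature = {0.12, -0.03, ...}  (a double precision[] array)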


@implementer(IFeatureExtractor)
class FeatureExtractorMI(FeatureExtractorBase):
"""
Category-feature mutual information based feature extractor
"""

    @staticmethod
    def get_categories(data_set):
        """
        Collects the distinct level-1/2/3 category names in the dataset
        :param data_set: type of data set (train or test)
        :return: three sets of distinct category names
        """
query = "SELECT category_l1_name_en,category_l2_name_en, " \
"category_l3_name_en FROM samples_%s;" % data_set
cur.execute(query)
res = cur.fetchall()
distinct_categories_l1_name_en = set()
distinct_categories_l2_name_en = set()
distinct_categories_l3_name_en = set()
for row in res:
category_l1_name_en, category_l2_name_en, category_l3_name_en = row
distinct_categories_l1_name_en.add(category_l1_name_en)
distinct_categories_l2_name_en.add(category_l2_name_en)
distinct_categories_l3_name_en.add(category_l3_name_en)
return distinct_categories_l1_name_en, distinct_categories_l2_name_en, \
distinct_categories_l3_name_en

    @staticmethod
    def get_distinct_features(
            path_to_features, path_to_distinct_features):
        """
        Loads per-item features and pickles the list of distinct ones
        :param path_to_features: pickle with features per seller and item
        :param path_to_distinct_features: output pickle path
        """
        with open(path_to_features, "rb") as fin:
            features = pickle.load(fin)
        distinct_features = set()
        for seller_id, seller_items in features.items():
            for item_id, item_features in seller_items.items():
                for feature in item_features:
                    distinct_features.add(feature)
        distinct_features = list(distinct_features)
        with open(path_to_distinct_features, "wb") as fout:
            pickle.dump(distinct_features, fout)

    def get_matrix(
            self, path_to_features, path_to_distinct_features, data_set):
        """
        Builds the feature/category co-occurrence count matrix
        :param path_to_features: pickle with features per seller and item
        :param path_to_distinct_features: pickle with distinct features
        :param data_set: type of data set (train or test)
        :return: count matrix of shape (features, categories)
        """
        categories_l1, categories_l2, categories_l3 = self.get_categories(
            data_set)
        distinct_categories = list(
            categories_l1 | categories_l2 | categories_l3)
        with open(path_to_features, "rb") as fin:
            features = pickle.load(fin)
        with open(path_to_distinct_features, "rb") as fin:
            distinct_features = pickle.load(fin)
        matrix = numpy.zeros(
            [len(distinct_features), len(distinct_categories)])
        for seller_id, seller_items in features.items():
            for item_id, item_features in seller_items.items():
                # One lookup per item; the categories are shared by all of
                # the item's features.
                query = "SELECT category_l1_name_en, category_l2_name_en, " \
                        "category_l3_name_en FROM samples_%s WHERE " \
                        "item_id=%%s;" % data_set
                cur.execute(query, (item_id,))
                res = cur.fetchone()
                category_l1_name_en, category_l2_name_en, \
                    category_l3_name_en = res
                for feature in item_features:
                    feature_index = distinct_features.index(feature)
                    for cat in (category_l1_name_en, category_l2_name_en,
                                category_l3_name_en):
                        category_index = distinct_categories.index(cat)
                        matrix[feature_index][category_index] += 1
        return matrix
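
    # Hypothetical next step, not implemented here: normalise the count
    # matrix to a joint distribution p(f, c); each cell then contributes
    #   p(f, c) * log(p(f, c) / (p(f) * p(c)))
    # to the feature/category mutual information.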
@staticmethod
def get_vector_feature(text_feature, *args):
# TODO: implement
pass


@implementer(IFeatureExtractor)
class FeatureExtractorW2V(FeatureExtractorBase):
"""
Word2vec based feature extractor
"""

    def __init__(self):
        super(FeatureExtractorW2V, self).__init__()
        # Dimensionality of the fastText wiki.en word vectors.
        self.vector_length = 300
self.path_to_ngrams = os.path.join(
self.base_path, "most_frequently_n_grams.json")
self.path_to_features_classes = os.path.join(
self.base_path, "features_classes.pickle")
self.path_to_word_tfidf = os.path.join(
self.base_path, "word_tfidf.pickle")

    @staticmethod
    def get_vector_from_word(nlp_model, word, word2tfidf):
        """
        Calculates the vector for a given word, weighted by its tf-idf
        :param nlp_model: word2vec model
        :param word: input word
        :param word2tfidf: word-to-tf-idf dictionary
        :return: output vector, or None if the word is out of vocabulary
        """
        try:
            vec = nlp_model[word]
        except KeyError:
            print("Word not in word2vec vocabulary: %s" % word)
            return None
        if not word2tfidf or word not in word2tfidf:
            idf = 0
            print("Word without tf-idf weight: %s" % word)
        else:
            idf = word2tfidf[word]
        return vec * idf

    @staticmethod
    def get_word2tfidf(features_classes):
        """
        Calculates the word-to-tf-idf dictionary
        :param features_classes: all text features
        :return: word-to-tf-idf dictionary
        """
        tfidf = TfidfVectorizer()
        tfidf.fit_transform(features_classes)
        # get_feature_names() was removed in scikit-learn 1.2 in favour of
        # get_feature_names_out().
        if hasattr(tfidf, "get_feature_names_out"):
            feature_names = tfidf.get_feature_names_out()
        else:
            feature_names = tfidf.get_feature_names()
        return dict(zip(feature_names, tfidf.idf_))
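
    # Illustrative output for a hypothetical two-document corpus
    # ["iphone case", "iphone"]: {"iphone": 1.0, "case": ~1.41}. With the
    # default smoothed idf, idf(w) = ln((1 + n) / (1 + df(w))) + 1, so
    # rarer words receive larger weights.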

    @staticmethod
    def load_model_from_vec():
        """
        Loads the word2vec model from .vec format, downloading the fastText
        wiki.en vectors first when they are missing locally
        :return: word2vec model
        """
        # Imported lazily: gensim is only needed when no pickled model is
        # available yet.
        from gensim.models.keyedvectors import KeyedVectors
basepath = os.path.dirname(os.path.abspath(__file__))
path_to_model = os.path.join(basepath, 'wiki.en.vec')
if not os.access(path_to_model, os.R_OK):
wget.download(
"https://s3-us-west-1.amazonaws.com/fasttext-vectors/"
"wiki.en.vec", path_to_model)
lan_model = KeyedVectors.load_word2vec_format(
path_to_model, binary=False)
return lan_model

    def get_vector(self, text, nlp_model, word2tfidf):
        """
        Calculates the mean vector for a given text
        :param text: input text
        :param nlp_model: word2vec model
        :param word2tfidf: word-to-tf-idf dictionary
        :return: output vector
        """
        words = text.split(" ")
        word_vecs = numpy.zeros([len(words), self.vector_length])
        for index, word in enumerate(words):
            word_vec = self.get_vector_from_word(
                nlp_model, word, word2tfidf)
            if word_vec is not None:
                word_vecs[index] = word_vec
        return list(word_vecs.mean(axis=0))
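
    # The text vector is the idf-weighted average of the word vectors:
    #   v(text) = (1 / |words|) * sum over words w of idf(w) * w2v(w)
    # Out-of-vocabulary words contribute zero rows but still count in the
    # denominator.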
def get_all_txt_features(self, fr_n_grams, data_set):
"""
        Finds all text features in the dataset
        :param fr_n_grams: most frequent n-grams
        :param data_set: type of data set (train or test)
        :return: all text features
"""
query = "SELECT listing_title, listing_description, listing_price, " \
"category_sk, category_l1_name_en, category_l2_name_en, " \
"category_l3_name_en, listing_latitude, listing_longitude " \
"FROM samples_%s;" % data_set
cur.execute(query)
res = cur.fetchall()
features_classes = []
for row in res:
if not row:
continue
item_features = self.extract_features_per_item(
row, fr_n_grams)
item_features_text = " ".join(item_features)
features_classes.append(item_features_text)
return features_classes

    def get_vector_feature(self, text_feature, *args):
        """
        Calculates the vector feature; args carry the word2vec model and
        the word-to-tf-idf dictionary
        """
        nlp_model, word2tfidf = args
        return self.get_vector(text_feature, nlp_model, word2tfidf)
def load_model_from_pickle(self):
"""
Loads word2vec model from .pickle format (much faster than from .vec)
:return: word2vec model
"""
print("Will load nlp model. May take some time...")
basepath = os.path.dirname(os.path.abspath(__file__))
path_to_model = os.path.join(basepath, "nlp_model.pickle")
if os.access(path_to_model, os.F_OK):
with open(path_to_model, "rb") as fin:
nlp_model = pickle.load(fin)
else:
nlp_model = self.load_model_from_vec()
with open(path_to_model, "wb") as fout:
pickle.dump(nlp_model, fout)
print("Loaded")
return nlp_model
def get_features_for_test(self):
"""
Extracts features for test set
"""
with open(self.path_to_ngrams, "r") as fin:
frequently_n_grams = json.load(fin)
with open(self.path_to_word_tfidf, "rb") as fin:
word_tfidf = pickle.load(fin)
nlp_model_ = self.load_model_from_pickle()
self.extract_features(
frequently_n_grams, "test", nlp_model_,
word_tfidf)

    def get_features_for_train(self):
        """
        Extracts features for the train set
        """
        set_of_data = "train"
        frequently_n_grams = self.find_most_frequently_n_grams(
            set_of_data)
        # Round-trip through JSON so the n-gram length keys become strings,
        # matching the str() lookup in extract_features_per_item() and the
        # file later read by get_features_for_test().
        with open(self.path_to_ngrams, "w") as fout:
            json.dump(frequently_n_grams, fout)
        with open(self.path_to_ngrams, "r") as fin:
            frequently_n_grams = json.load(fin)
        feat_classes = self.get_all_txt_features(
            frequently_n_grams, set_of_data)
        word_tfidf = self.get_word2tfidf(feat_classes)
        with open(self.path_to_word_tfidf, "wb") as fout:
            pickle.dump(word_tfidf, fout)
        nlp_model_ = self.load_model_from_pickle()
        self.extract_features(
            frequently_n_grams, set_of_data, nlp_model_,
            word_tfidf)
if __name__ == "__main__":
feature_extractor = FeatureExtractorW2V()
feature_extractor.get_features_for_train()
feature_extractor.get_features_for_test()