-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProductSafetyRiskModel.py
592 lines (408 loc) · 26.6 KB
/
ProductSafetyRiskModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
import nltk
nltk.download("stopwords")
import pandas as pd
import numpy as np
import json
import re
import xlsxwriter
from langdetect import detect
from nltk.stem import *
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_union
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn import metrics
import random
import pickle
import tornado.ioloop
import tornado.httpserver
import tornado.web
# Web application that routes requests to proper handlers
class Application(tornado.web.Application):
    """Tornado application that maps the root URL to ``MainHandler``."""

    def __init__(self):
        # Single route: every request to "/" is served by MainHandler.
        routes = [(r"/", MainHandler)]
        # No application-level settings are configured.
        options = {}
        super(Application, self).__init__(routes, **options)
# # Language detection, and drop rows with unsupported languages
def detectLanguage(sentence):
    """Return the language reported by ``detect`` for *sentence*.

    The value is converted with ``str`` first, so non-string cells can be
    passed in. Any exception raised during conversion or detection is
    swallowed on purpose and reported as the sentinel ``'unsupported'``,
    which callers use to discard the row.
    """
    # TODO: change the result into the right format for later use.
    try:
        language = detect(str(sentence))
    except Exception:
        language = 'unsupported'
    return language
def drop_column(dataframe, column_name):
    """Remove *column_name* from *dataframe* in place.

    The frame passed in is mutated. Because the drop is done in place,
    pandas returns ``None`` and so does this function.
    """
    outcome = dataframe.drop(labels=[column_name], axis=1, inplace=True)
    return outcome
# Drop rows whose text is in an unsupported language before removing special characters in the name
def dropRowsWithUnsupportedLanguage(dataframe, columnName, detector=None):
    """Return a copy of *dataframe* without rows in an unsupported language.

    Args:
        dataframe: Frame to filter. It is not modified.
        columnName: Column whose text is passed to the language detector.
        detector: Optional callable taking one cell value and returning a
            language tag. Rows for which it returns ``'unsupported'`` are
            dropped. Defaults to ``detectLanguage``.

    Returns:
        A new DataFrame that keeps the original index labels of the
        surviving rows.

    Rows are dropped by index *label*. The labels collected while iterating
    used to be fed back through ``dataframe.index[...]`` as if they were
    positions, which removed the wrong rows (or raised ``IndexError``) as
    soon as the index was not a contiguous 0..n-1 range, e.g. after
    filtering. If the index contains duplicate labels, every row sharing a
    flagged label is removed.
    """
    if detector is None:
        detector = detectLanguage

    rowsToBeDropped = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for index, row in dataframe[columnName].items():
        if detector(row) == 'unsupported':
            rowsToBeDropped.append(index)
            print(index)
            print(row)
    print(rowsToBeDropped)
    return dataframe.drop(rowsToBeDropped, axis=0)
# # Removing special characters
# Replacing the special characters with space, evaluate with and without. Sometimes having special characters makes sense
# Eg: iPhone-6S
def removeSpecialCharacters(sentence):
    """Replace special characters in *sentence* with spaces.

    Every character that is not an ASCII letter, an ASCII digit, a space, a
    newline or a period is replaced by a single space, so the length of the
    text is unchanged (``"iPhone-6S"`` becomes ``"iPhone 6S"``).

    Args:
        sentence: The text to clean; must be a ``str``.

    Returns:
        The cleaned string.
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence "\.", which newer Python versions warn about. Inside a
    # character class a bare "." is already literal, so the pattern matches
    # exactly the same characters.
    return re.sub(r'[^a-zA-Z0-9 \n.]', ' ', sentence)
# # Dataframe column data extractor
class DataFrameColumnExtracter(TransformerMixin, BaseEstimator):
    """Pipeline step that selects one column from the input by key.

    The class name is left unchanged: the fitted model is stored with
    pickle, which looks classes up by name when loading.
    """

    def __init__(self, column):
        # Key passed to ``X[...]`` in transform().
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; *y* is ignored."""
        selected = X[self.column]
        return selected
class MainHandler(tornado.web.RequestHandler):
    """Score risky products with the saved model and write ``results.xlsx``.

    ``POST /`` takes two request arguments:

    * ``test_file``    - path of an Excel workbook with the product rows
      (columns ``ASIN``, ``Merchant Brand Name``, ``title``,
      ``item_type_keyword``, ``country``, ``Retail or not Retail``).
    * ``results_file`` - path of an Excel workbook with an ``IS_RISKY``
      column. Its rows are assumed to be in the same order as ``test_file``;
      the two are joined purely by row position.

    Only rows flagged ``IS_RISKY == 'Y'`` are scored. The handler responds
    with ``File was saved`` after writing the workbook.

    How the saved model was produced (the training code used to live here as
    commented-out code; it is in version control history):

    * Rows without ``SEVERITY`` were dropped, and every ASIN seen with more
      than one severity was raised to the highest one (HIGH over MEDIUM).
    * Identifier, score, seller-level and review-level columns were dropped:
      ASIN, Brand, REVIEW_ID, AUTHOR_ID, SCORE, TT_NUMBER, PSDB_ID,
      Sub_Category, CCR, Rating, Seller ID, vulnerable Score, Rating Score,
      Country Score, "Severity Score ", CCR Score, Total Score, PSDB_TABLE,
      MARKETPLACE_ID, DATE_OF_CONCERN, INJURY, ID.
    * The text columns were cleaned exactly as ``_normalise_text`` does.
    * ``train_test_split(test_size=0.20, random_state=42)``.
    * One ``DataFrameColumnExtracter -> CountVectorizer(ngram_range=(1, 2),
      stop_words='english') -> TfidfTransformer(use_idf=True)`` pipeline per
      feature column, combined with ``make_union``, followed by
      ``RandomForestClassifier(n_estimators=500, max_depth=None,
      min_samples_split=2, random_state=42)``.
    * The fitted pipeline was pickled to ``MODEL_PATH``.
    """

    # Pickled, already fitted scikit-learn pipeline.
    MODEL_PATH = '/Users/lisirui/Desktop/PytonF/builtModels/fitted_model.ser'
    # Workbook written by every request (relative to the working directory).
    OUTPUT_PATH = 'results.xlsx'
    # Input column names -> names the model was trained with.
    COLUMN_RENAMES = {'title': 'Item Name',
                      'item_type_keyword': 'Item_type',
                      'country': 'Country of Origin',
                      'Retail or not Retail': 'Retail or not retail'}
    # Columns fed to the model, in the order they appear in the export.
    FEATURE_COLUMNS = ['Merchant Brand Name', 'Item Name', 'Item_type',
                       'Country of Origin', 'Retail or not retail']
    # Text columns whose empty / non-text cells become the string 'missing'.
    MISSING_TEXT_COLUMNS = ('Merchant Brand Name', 'Item_type',
                            'Country of Origin')
    # Zero-based worksheet column (Excel column L) holding RISK_LEVEL.
    RISK_LEVEL_COLUMN = 11
    # {0} is the 1-based Excel row. G = PREDICTED, H = prediction-HIGH,
    # J = prediction-MEDIUM in the exported sheet layout.
    RISK_LEVEL_FORMULA = 'IF(G{0}="HIGH", IF(H{0} >= 0.7, 5, 4), IF(G{0} = "MEDIUM", IF(J{0} >= 0.7, 3, 2), 1))'

    def post(self):
        """Score the uploaded products and save the results workbook."""
        # NOTE(review): both arguments are used verbatim as server-side file
        # paths, so any client can make the server open arbitrary files.
        # Restrict them to a known directory before exposing this endpoint.
        results_file = self.get_argument("results_file")
        test_file = self.get_argument("test_file")

        pipeline = self._load_pipeline()
        print("Training Data Loaded!")

        candidates, asin_column = self._load_risky_products(test_file,
                                                            results_file)
        scored_rows = dropRowsWithUnsupportedLanguage(candidates, 'Item Name')

        # The export is built from exactly the rows that get scored. It used
        # to be built from the rows *before* the language filter, so a single
        # dropped row made the prediction columns the wrong length.
        export_excel_file = scored_rows.copy()

        features = self._normalise_text(scored_rows)
        predicted = pipeline.predict(features)
        predicted_proba = pipeline.predict_proba(features)

        export_excel_file['PREDICTED'] = pd.Series(
            predicted, index=export_excel_file.index)
        # Assumes the classifier's classes are ordered HIGH, LOW, MEDIUM
        # (scikit-learn sorts class labels) - TODO confirm against
        # pipeline.classes_.
        export_excel_file['prediction-HIGH'] = predicted_proba[:, 0]
        export_excel_file['prediction-LOW'] = predicted_proba[:, 1]
        export_excel_file['prediction-MEDIUM'] = predicted_proba[:, 2]
        # Look the ASIN up by the row's own label. The previous
        # ``asin_column[index-1]`` returned the ASIN of the preceding row and
        # raised KeyError for label 0.
        export_excel_file['ASIN'] = asin_column.loc[
            export_excel_file.index].values

        self._write_results(export_excel_file, len(predicted))
        self.write('File was saved')

    @classmethod
    def _load_pipeline(cls):
        """Load the fitted pipeline from ``MODEL_PATH``."""
        # SECURITY: pickle.load executes code embedded in the file. This is
        # acceptable only because the path is fixed and under our control;
        # never point it at a client-supplied file.
        # Unpickling needs DataFrameColumnExtracter to be importable under the
        # module name it had when the model was saved.
        with open(cls.MODEL_PATH, 'rb') as f:
            return pickle.load(f)

    @classmethod
    def _load_risky_products(cls, test_file, results_file):
        """Read both workbooks and return ``(risky_rows, asin_column)``.

        ``risky_rows`` holds only the model's feature columns for rows marked
        ``IS_RISKY == 'Y'``; ``asin_column`` is the ASIN of every input row,
        indexed like the unfiltered input.
        """
        products = pd.read_excel(test_file)
        risk_flags = pd.read_excel(results_file)
        # Positional join: assumes both workbooks list the ASINs in the same
        # order.
        products = pd.concat([products, risk_flags['IS_RISKY']], axis=1)
        asin_column = products['ASIN']

        risky = products[products['IS_RISKY'] == 'Y']
        risky = risky.rename(columns=cls.COLUMN_RENAMES)
        risky = risky[cls.FEATURE_COLUMNS].copy()
        # Plain assignment instead of chained ``inplace=True`` calls on a
        # column selection, which are not guaranteed to update the frame.
        retail = risky['Retail or not retail']
        retail = retail.replace(to_replace=['Not Retail'], value='Not_Retail')
        risky['Retail or not retail'] = retail.fillna(value="Not_Retail")
        return risky, asin_column

    @staticmethod
    def _text_or_missing(value):
        """Return ``'missing'`` for numeric/boolean cells, else *value*."""
        # Parts of the dataset are not clean: text columns also contain
        # numbers and booleans.
        if isinstance(value, (bool, int, float, np.number, np.bool_)):
            return 'missing'
        return value

    @classmethod
    def _normalise_text(cls, rows):
        """Return a copy of *rows* cleaned the same way as the training data.

        ``Item Name`` has special characters replaced by spaces and every
        space-separated token stemmed, with the stems joined by commas. The
        other text columns get ``'missing'`` for empty or non-text cells.
        """
        features = rows.copy()

        # str() turns non-text cells into text before cleaning.
        cleaned = [removeSpecialCharacters(str(name))
                   for name in features['Item Name']]
        # Needs the NLTK stopwords corpus, downloaded at import time.
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        features['Item Name'] = [
            ",".join([stemmer.stem(kw) for kw in name.split(" ")])
            for name in cleaned
        ]

        for column in cls.MISSING_TEXT_COLUMNS:
            filled = features[column].fillna(value="missing")
            features[column] = filled.map(cls._text_or_missing)
        return features

    @classmethod
    def _write_results(cls, export_excel_file, row_count):
        """Write the export frame plus a RISK_LEVEL formula column."""
        # The context manager closes (and thereby saves) the workbook;
        # ExcelWriter.save() no longer exists in pandas 2.
        with pd.ExcelWriter(cls.OUTPUT_PATH, engine='xlsxwriter') as writer:
            export_excel_file.to_excel(writer, sheet_name='Sheet1')
            workbook = writer.book
            worksheet = writer.sheets['Sheet1']
            bold = workbook.add_format({'bold': True})
            worksheet.write_string(0, cls.RISK_LEVEL_COLUMN, 'RISK_LEVEL',
                                   bold)
            for data_row in range(row_count):
                # Worksheet row 0 is the header, so data row i sits at
                # zero-based worksheet row i + 1, which Excel calls row i + 2.
                # The formula previously used i + 1 and therefore read the
                # row above its own.
                worksheet.write_formula(
                    data_row + 1, cls.RISK_LEVEL_COLUMN,
                    cls.RISK_LEVEL_FORMULA.format(data_row + 2))
# Create application, set up server for it listening to port 9023, start IO loop
def main():
    """Serve ``Application`` over HTTP on port 9023 until the loop stops."""
    # xheaders=True is passed through to Tornado's HTTPServer unchanged.
    server = tornado.httpserver.HTTPServer(Application(), xheaders=True)
    server.listen(9023)
    # Blocks here: runs the IO loop of the current thread.
    tornado.ioloop.IOLoop.current().start()
# Start the server only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
'''
# # GridSearchCV for finding best parameters
rf_filter = my_rf_filter(threshold='mean')
clf = RandomForestClassifier(n_jobs=-1, random_state=42, oob_score=False)
# Grid search is an approach to parameter tuning that will methodically build and evaluate a model for each combination of algorithm parameters specified in a grid.
# Choose lesser number of combination of parameters at a time and experiment. Else it take forever to run.
# Grid search parameters
# rf_n_estimators = [10, 50, 500, 1000]
# rf_max_features = ['auto', 'sqrt', 'log2']
# rf_max_depth = [None, 3, 5, 10, 20]
# rf_min_samples_split = [2, 3, 10]
# rf_min_samples_leaf = [1, 3, 10]
# rf_bootstrap = [True, False]
# rf_criterion = ["gini", "entropy"]
rf_n_estimators = [10, 50, 500, 1000]
rf_max_features = ['auto']
rf_max_depth = [None]
rf_min_samples_split = [2]
rf_min_samples_leaf = [3]
rf_bootstrap = [False]
rf_criterion = ["gini"]
# rff_transform = ["median", "mean"] # Search the threshold parameters
estimator = GridSearchCV(pipeline,
cv = 3,
param_grid = dict(clf__n_estimators = rf_n_estimators,
clf__max_features = rf_max_features,
clf__max_depth = rf_max_depth,
clf__min_samples_split = rf_min_samples_split,
clf__min_samples_leaf = rf_min_samples_leaf,
clf__bootstrap = rf_bootstrap,
clf__criterion = rf_criterion))
estimator.get_params().keys()
estimator.fit(X_train, y_train)
# # Running on all ML Algos
classifier_map = {"Decision Tree " : DecisionTreeClassifier(random_state=0),
"Bagging Classifier " : BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5),
"Random Forest Classifier " : RandomForestClassifier(n_estimators=500, max_depth=None,min_samples_split=2, random_state=42),
"AdaBoost" : AdaBoostClassifier(n_estimators=100),
"SVC " : SVC(gamma=2, C=1),
"Logistic Regression " : linear_model.LogisticRegression(C=1e5)}
for clf_key, clf_value in classifier_map.iteritems():
pipeline = Pipeline([
('union', feature_union),
('clf', clf_value),])
pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
score = pipeline.score(X_test,y_test)
print("Accuracy after running with %s algorithm is %f" % (clf_key,score))
f = open('report_compliance_risk_model.txt','a')
f.write("Accuracy after running with %s algorithm is %f" % (clf_key,score))
f.write("\n")
print(metrics.classification_report(y_test, predicted,target_names=['LOW','MEDIUM','HIGH']))
f.write(metrics.classification_report(y_test, predicted,target_names=['LOW','MEDIUM','HIGH']))
f.write("\n")
print(pd.crosstab(y_test, predicted, rownames=['True'], colnames=['Predicted'], margins=True))
f.write("\n\n\n")
f.close()
'''