-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathblending.py
105 lines (91 loc) · 3.62 KB
/
blending.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn import svm
from xgboost import XGBClassifier
from xgboost import plot_importance
#from matplotlib import pyplot
def blender(t,s,e):
    """Train a blended (stacked) classifier and report its hold-out metrics.

    The CSV at ``t`` is read without a header row. Columns ``s`` up to but
    not including ``e`` are the features and column ``e`` is the label.
    Features are passed through ``Normalizer`` and split 70/30. Four base
    classifiers (logistic regression, Gaussian naive Bayes, decision tree,
    random forest) are fitted on the 70% part, and column 1 of each model's
    ``predict_proba`` on the 30% part becomes one meta-feature.

    The meta-features are then split in half: an ``MLPClassifier`` is fitted
    on the first half and scored on the second half, so the reported metrics
    come from rows the meta-learner never saw during fitting.

    Side effects: prints the confusion matrix, accuracy, precision, recall
    and f-score, and stores them in the module-level globals ``cm``,
    ``accuracy``, ``precision``, ``recall`` and ``f1``.

    Args:
        t: Path (or anything ``pandas.read_csv`` accepts) of the training CSV.
        s: Index of the first feature column (inclusive).
        e: Index one past the last feature column; also the label column.

    Returns:
        The fitted ``MLPClassifier`` meta-learner. Its inputs are the stacked
        base-model probabilities, not raw features; the base models
        themselves are not returned.

    Note:
        The metrics use ``average="binary"`` and ``predict_proba(...)[:, 1]``,
        so the label column is assumed to hold exactly two classes.
    """
    # Local import kept inside the function, as in the original code.
    from sklearn.neural_network import MLPClassifier

    # Must precede any assignment to these names inside the function.
    global accuracy,recall,precision,f1,cm

    seed = 1337
    test_size = 0.30

    frame = pd.read_csv(t, header=None)
    features = np.array(Normalizer().fit_transform(frame.iloc[:, s:e]))
    labels = np.array(frame.iloc[:, e])

    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=test_size, random_state=seed)

    # Level-0 learners. The tree-based ones are seeded so repeated runs on
    # the same file produce the same meta-features.
    base_models = [
        LogisticRegression(),
        GaussianNB(),
        DecisionTreeClassifier(random_state=seed),
        RandomForestClassifier(random_state=seed),
    ]

    # One column per base model: its column-1 probability on the hold-out
    # rows, which the base models were not fitted on.
    meta_features = np.column_stack([
        model.fit(x_train, y_train).predict_proba(x_holdout)[:, 1]
        for model in base_models
    ])

    # Fitting and scoring the meta-learner on the same rows would report
    # training-set performance, so half of the hold-out meta-features is
    # reserved purely for evaluation.
    meta_train, meta_eval, y_meta_train, y_meta_eval = train_test_split(
        meta_features, y_holdout, test_size=0.5, random_state=seed)

    modelnn = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500,
                            random_state=seed)
    modelnn.fit(meta_train, y_meta_train)

    expected = y_meta_eval
    predicted = modelnn.predict(meta_eval)

    accuracy = accuracy_score(expected, predicted)
    recall = recall_score(expected, predicted, average="binary")
    precision = precision_score(expected, predicted, average="binary")
    f1 = f1_score(expected, predicted, average="binary")
    cm = metrics.confusion_matrix(expected, predicted)

    print(cm)
    print("Accuracy")
    print("%.3f" %accuracy)
    print("precision")
    print("%.3f" %precision)
    print("recall")
    print("%.3f" %recall)
    print("f-score")
    print("%.3f" %f1)
    return modelnn
if __name__ == "__main__":
    # Command-line entry point: parse the CSV path and the column range,
    # then run blender(); with no --train path, show the usage text instead.
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='neural network based weighted blender')
    parser.add_argument('-t', '--train', default=None, help='specify path of train data')
    # blender() slices features as [start:end) and reads column `end` as the
    # label, so the help text says so rather than calling it "excluded".
    parser.add_argument('-e', '--end', type=int, default=10, help='specify the end column of the train dataset; feature columns stop just before this index and this column itself is used as the label')
    parser.add_argument('-s', '--start', type=int, default=0, help='specify the start column of the train dataset; this is the first feature column (inclusive)')
    args = parser.parse_args()

    if args.train:
        blender(args.train, args.start, args.end)
    else:
        parser.print_help()