-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlogistic_regression.py
65 lines (58 loc) · 2.79 KB
/
logistic_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from analytics_vidhya import bow,train, np, test,tfidf,wordvec_df, docvec_df
# --- Logistic regression on Bag-of-Words features ---------------------------
# A single estimator instance is created here and re-fitted for each feature
# set further down in this script.
lreg = LogisticRegression()

# `bow` holds train and test rows stacked together; rows before 31962 are
# taken as the training part and the rest as the test part.
# NOTE(review): 31962 is presumably len(train) -- confirm against the data.
train_bow = bow[:31962, :]
test_bow = bow[31962:, :]

# Hold out 30% of the training rows for validation. The fixed random_state
# makes the split reproducible; ytrain/yvalid (and their indexes) are reused
# by the other feature-set sections of this script.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow, train['label'], random_state=42, test_size=0.3
)
print('>>>>>>>>>>>>>>LOGISTIC REGRESSION<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

# Train the model on the BoW training split.
lreg.fit(xtrain_bow, ytrain)

# predict_proba returns one column per class; column 1 is thresholded at 0.3
# instead of the default 0.5 cut-off: probability >= 0.3 -> 1, otherwise 0.
prediction = lreg.predict_proba(xvalid_bow)
prediction_int = prediction[:, 1] >= 0.3
# Builtin `int` replaces `np.int`, an alias that was deprecated in NumPy 1.20
# and removed in 1.24 (using it raises AttributeError on current NumPy).
prediction_int = prediction_int.astype(int)
score_f1 = f1_score(yvalid, prediction_int)  # F1 score on the validation set
print(f'F1 score for logistic regression on the BOW features{score_f1}')

# Predict on the test rows with the same 0.3 threshold and write a
# submission file containing the `id` and predicted `label` columns.
test_pred = lreg.predict_proba(test_bow)
test_pred_int = test_pred[:, 1] >= 0.3
test_pred_int = test_pred_int.astype(int)
test['label'] = test_pred_int
submission = test[['id', 'label']]
# NOTE: the 'Submissions' directory must already exist; to_csv does not
# create missing parent directories.
submission.to_csv('Submissions/sub_lreg_bow.csv', index=False)
# --- Logistic regression on TF-IDF features ---------------------------------
# Same train/test row split as the BoW section (first 31962 rows = train).
train_tfidf = tfidf[:31962, :]
test_tfidf = tfidf[31962:, :]

# Reuse the row selection from the earlier train/validation split so the
# scores of the different feature sets are computed on the same rows.
xtrain_tfidf = train_tfidf[ytrain.index]
xvalid_tfidf = train_tfidf[yvalid.index]

# Re-fit the shared estimator on the TF-IDF training rows.
lreg.fit(xtrain_tfidf, ytrain)

# Threshold column 1 of the predicted probabilities at 0.3
# (probability >= 0.3 -> 1, otherwise 0).
prediction = lreg.predict_proba(xvalid_tfidf)
prediction_int = prediction[:, 1] >= 0.3
# Builtin `int` replaces `np.int`, an alias that was deprecated in NumPy 1.20
# and removed in 1.24 (using it raises AttributeError on current NumPy).
prediction_int = prediction_int.astype(int)
tf_idf_f1_score = f1_score(yvalid, prediction_int)  # F1 score on the validation set
print(f'F1 score for logistic regression on the TF-IDF features{tf_idf_f1_score}')
# --- Logistic regression on Word2Vec features -------------------------------
# Same train/test row split as the other feature sets (first 31962 rows = train).
train_w2v = wordvec_df.iloc[:31962, :]
test_w2v = wordvec_df.iloc[31962:, :]

# Reuse the row selection from the earlier train/validation split.
# NOTE(review): .iloc is positional, so this treats the index labels of
# ytrain/yvalid as row positions -- correct only if `train` has a default
# 0..n-1 index; confirm against the data loading code.
xtrain_w2v = train_w2v.iloc[ytrain.index, :]
xvalid_w2v = train_w2v.iloc[yvalid.index, :]

# Re-fit the shared estimator on the Word2Vec training rows.
lreg.fit(xtrain_w2v, ytrain)

# Threshold column 1 of the predicted probabilities at 0.3
# (probability >= 0.3 -> 1, otherwise 0).
prediction = lreg.predict_proba(xvalid_w2v)
prediction_int = prediction[:, 1] >= 0.3
# Builtin `int` replaces `np.int`, an alias that was deprecated in NumPy 1.20
# and removed in 1.24 (using it raises AttributeError on current NumPy).
prediction_int = prediction_int.astype(int)
word2vec_f1_score = f1_score(yvalid, prediction_int)  # F1 score on the validation set
print(f'F1 score for logistic regression on the Word2Vec features{word2vec_f1_score}')
# --- Logistic regression on Doc2Vec features --------------------------------
# Same train/test row split as the other feature sets (first 31962 rows = train).
train_d2v = docvec_df.iloc[:31962, :]
test_d2v = docvec_df.iloc[31962:, :]

# Reuse the row selection from the earlier train/validation split.
# NOTE(review): .iloc is positional, so this treats the index labels of
# ytrain/yvalid as row positions -- correct only if `train` has a default
# 0..n-1 index; confirm against the data loading code.
xtrain_d2v = train_d2v.iloc[ytrain.index, :]
xvalid_d2v = train_d2v.iloc[yvalid.index, :]

# Re-fit the shared estimator on the Doc2Vec training rows.
lreg.fit(xtrain_d2v, ytrain)

# Threshold column 1 of the predicted probabilities at 0.3
# (probability >= 0.3 -> 1, otherwise 0).
prediction = lreg.predict_proba(xvalid_d2v)
prediction_int = prediction[:, 1] >= 0.3
# Builtin `int` replaces `np.int`, an alias that was deprecated in NumPy 1.20
# and removed in 1.24 (using it raises AttributeError on current NumPy).
prediction_int = prediction_int.astype(int)
doc2vec_f1_score = f1_score(yvalid, prediction_int)  # F1 score on the validation set
print(f'F1 score for logistic regression on the Doc2Vec features{doc2vec_f1_score}')