-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSentiment Analysis pre-processing added.py
138 lines (88 loc) · 4.12 KB
/
Sentiment Analysis pre-processing added.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Read the raw positive / negative review corpora.
# Context managers guarantee both file handles are closed once the text has
# been read; the bare open(...).read() form left them open until garbage
# collection.
with open("C:\\Users\\Raora\\Downloads\\pos-1.txt", "r") as pos_file:
    short_pos = pos_file.read()
with open("C:\\Users\\Raora\\Downloads\\neg-1.txt", "r") as neg_file:
    short_neg = neg_file.read()

# Every line of each corpus is treated as one document and paired with its
# label: (text, "pos") or (text, "neg").
documents = [(r, "pos") for r in short_pos.split('\n')]
documents.extend((r, "neg") for r in short_neg.split('\n'))

##print(documents[:5])
print("Length of documents: " + str(len(documents)))
import nltk
##nltk.download('punkt')
# !pip install autocorrect
from nltk.tokenize import word_tokenize
import spacy
# Shared spaCy pipeline used for stop-word filtering and lemmatisation.
nlp = spacy.load("en_core_web_sm")

# Collects the preprocessed words before they are counted.
all_words = []


def preprocessing(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens in *text*.

    The text is parsed once to discard stop words and non-alphabetic tokens
    (the survivors are lower-cased), then the surviving words are joined with
    single spaces and parsed a second time to read off their lemmas.
    """
    kept_words = []
    for tok in nlp(text):
        if tok.is_alpha and not tok.is_stop:
            kept_words.append(tok.text.lower())
    cleaned_text = ' '.join(kept_words)
    return [tok.lemma_ for tok in nlp(cleaned_text)]
# Reduce both corpora to lemmas (preprocessing() drops stop words and
# non-alphabetic tokens).
short_pos_words = preprocessing(short_pos)
short_neg_words = preprocessing(short_neg)
print(len(short_pos_words))
print(len(short_neg_words))

# Count how often each (lower-cased) word occurs across both corpora.
all_words.extend(w.lower() for w in short_pos_words)
all_words.extend(w.lower() for w in short_neg_words)
all_words = nltk.FreqDist(all_words)
print("Length of BOW: " + str(len(all_words)))

# Number of vocabulary words used as classifier features.
lof = 500
# Keep the `lof` MOST FREQUENT words. Slicing FreqDist.keys() returns words in
# first-seen order, which just picks whatever appears earliest in the positive
# corpus instead of the most common vocabulary.
word_features = [word for word, _count in all_words.most_common(lof)]
print("Length of features: " + str(len(word_features)))
def find_features(document):
    """Map each word in ``word_features`` to whether it occurs in *document*.

    Args:
        document: Text of a single review.

    Returns:
        dict: ``{word: bool}`` with one entry per word in the module-level
        ``word_features`` list.
    """
    # word_features holds lower-cased words, so the document tokens are
    # lower-cased as well; otherwise capitalised occurrences never match.
    # A set gives O(1) membership tests instead of scanning the token list
    # once per feature word.
    words = {token.lower() for token in word_tokenize(document)}
    return {w: (w in words) for w in word_features}
# Turn every labelled document into a (feature-dict, label) example.
featuresets = []
for rev, category in documents:
    featuresets.append((find_features(rev), category))
print("Length of feature sets: " + str(len(featuresets)))

import random
import pickle

# Shuffle in place so the "pos" and "neg" examples are no longer grouped.
random.shuffle(featuresets)
##print(featuresets[0])
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
# Train/test split, expressed as percentages of the labelled examples.
ratio1 = 80  # share used for training
ratio2 = 20  # share held out for testing (what remains after ratio1)

# Size the split by the number of examples, not by `lof` (the feature count),
# and cut both slices at the same index so that no example is used for both
# training and testing.
split_index = len(featuresets) * ratio1 // 100
training_set = featuresets[:split_index]
testing_set = featuresets[split_index:]
# Baseline: NLTK's own Naive Bayes classifier, scored on the held-out set.
classifier = nltk.NaiveBayesClassifier.train(training_set)
nb_accuracy = nltk.classify.accuracy(classifier, testing_set)
print("Original Naive Bayes Algo accuracy percent:", nb_accuracy * 100)

# Disabled snippets kept for reference: inspecting the most informative
# features, and saving / reloading the trained classifier with pickle.
##print(classifier.show_most_informative_features(15))
##save_classifier = open("naivebayes.pickle","wb")
##pickle.dump(classifier, save_classifier)
##save_classifier.close()
##classifier_f = open("naivebayes.pickle", "rb")
##classifier = pickle.load(classifier_f)
##classifier_f.close()
##
##
##print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
##classifier.show_most_informative_features(15)
def _fit_and_score(estimator, message):
    """Wrap *estimator* for NLTK, train it, print its test accuracy, return it.

    *message* is printed in front of the accuracy (as a percentage) measured
    on the module-level ``testing_set``.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(message, (nltk.classify.accuracy(wrapped, testing_set))*100)
    return wrapped


# Train and score each scikit-learn model in turn on the same split.
MNB_classifier = _fit_and_score(
    MultinomialNB(), "MNB_classifier accuracy percent:")
BernoulliNB_classifier = _fit_and_score(
    BernoulliNB(), "BernoulliNB_classifier accuracy percent:")
LogisticRegression_classifier = _fit_and_score(
    LogisticRegression(), "LogisticRegression_classifier accuracy percent:")
SGDClassifier_classifier = _fit_and_score(
    SGDClassifier(), "SGDClassifier_classifier accuracy percent:")
SVC_classifier = _fit_and_score(
    SVC(), "SVC_classifier accuracy percent:")
LinearSVC_classifier = _fit_and_score(
    LinearSVC(), "LinearSVC_classifier accuracy percent:")
NuSVC_classifier = _fit_and_score(
    NuSVC(), "NuSVC_classifier accuracy percent:")