This repository has been archived by the owner on Sep 3, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrainer.py
55 lines (43 loc) · 2.27 KB
/
trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from pickle import dump
from copy import deepcopy
import os
from nltk.classify import apply_features, accuracy
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from statistics import load_pickle
from text_processing.corpus import print_corpus_info, get_training_documents
from feature_extractor.bag_of_words import binary_bag_of_words, counted_bag_of_words
def trainer(train_documents, test_documents, bag_of_words, classifier_object, type, name):
    """Train one classifier, pickle it, and print its accuracy on the test set.

    Args:
        train_documents: Documents used for training; handed to
            ``apply_features`` together with ``bag_of_words``
            (presumably ``(document, label)`` pairs -- confirm against caller).
        test_documents: Documents used for the accuracy report, processed
            the same way as ``train_documents``.
        bag_of_words: Feature-extraction callable applied to each document.
        classifier_object: Estimator (or pipeline) wrapped by
            ``SklearnClassifier``.
        type: Forwarded as the second positional argument of
            ``SklearnClassifier`` (its ``dtype``). The name shadows the
            builtin but is kept so existing callers keep working.
        name: Base name of the pickle, written to ``Classifiers/<name>.pickle``.

    Returns:
        None. The trained classifier is only persisted to disk.
    """
    print("Training {0} ...".format(name))
    train_set = apply_features(bag_of_words, train_documents)
    test_set = apply_features(bag_of_words, test_documents)

    classifier = SklearnClassifier(classifier_object, type)
    classifier.train(train_set)

    # Make sure the output directory exists so this function also works when
    # it is called without main() having created it first.
    output_dir = "Classifiers"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    with open(os.path.join(output_dir, name + ".pickle"), 'wb') as file_handler:
        dump(classifier, file_handler)
    print("Done")

    # Evaluation happens after the model is saved, so the pickle is on disk
    # even if scoring fails.
    print("Accuracy {0}".format(accuracy(classifier, test_set)))
    print("Done")
def main():
    """Train every configured algorithm on three feature encodings.

    Prints the corpus information, makes sure the ``Classifiers`` directory
    exists, loads the pickled train/test documents from it, and then calls
    ``trainer`` three times per algorithm: binary bag of words (``bool``),
    counted bag of words (``int``), and counted bag of words fed through a
    ``TfidfTransformer`` pipeline (``float``).
    """
    print ("Welcome to trainer ! \n")
    print_corpus_info()

    classifiers_dir = os.getcwd() + "/Classifiers"
    if not os.path.isdir(classifiers_dir):
        os.mkdir("Classifiers")

    # Alternative kept from the original; presumably rebuilds and saves the
    # train/test split instead of loading it -- confirm before enabling.
    #train_documents, test_documents = get_training_documents(cut_off=0.75, save=True)
    train_documents = load_pickle("Classifiers/train_feature_set.pickle")
    test_documents = load_pickle("Classifiers/test_feature_set.pickle")

    algorithms = [
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        LinearSVC(),
        LogisticRegression(),
        BernoulliNB(),
    ]

    for algorithm in algorithms:
        class_name = algorithm.__class__.__name__

        # Each variant gets its own copy of the estimator, so no run reuses
        # an object that another run has already trained.
        tfidf_pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                   ('nb', deepcopy(algorithm))])
        variants = (
            (binary_bag_of_words, deepcopy(algorithm), bool, "_bool"),
            (counted_bag_of_words, deepcopy(algorithm), int, "_int"),
            (counted_bag_of_words, tfidf_pipeline, float, "_tfidf"),
        )
        for extractor, estimator, dtype, suffix in variants:
            trainer(train_documents, test_documents, extractor, estimator,
                    dtype, class_name + suffix)
# Run the training workflow only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()