# This file is part of the PAN 2018 contest submission code.
# It is meant to be run on the respective task data for training and testing;
# see https://pan.webis.de/clef18/pan18-web/author-identification.html
# To run this software from the command line:
#   python Run.py -c folder_with_PAN_files -o folder_with_answers
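#
# Pipeline: for each problem in the corpus the script tags the texts, builds
# character/word n-gram statistics, vectorises the documents, trains a
# per-problem neural classifier, and writes the answers as JSON files.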
import os
import torch
import codecs
import time
import random
import argparse
import json
from torch.autograd import Variable
from sklearn.preprocessing import LabelEncoder
from random import shuffle
from nltk import ngrams
from ModelComp import ArcBinaryClassifier, Encoder, BinaryMeasurer, Parafac
from memory_profiler import profile
from scipy import stats
from collections import Counter, defaultdict
import ModelComp
from sklearn import svm
from sklearn.decomposition import PCA

# Module-wide CUDA switch, propagated to ModelComp so both modules agree.
use_cuda = False
ModelComp.use_cuda = use_cuda

# Select a non-interactive matplotlib backend before pyplot is imported.
import matplotlib
matplotlib.use('Agg')
matplotlib.rcParams.update({'font.size': 6})
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from utils import (get_corpus_info, get_problem_truth, tagging_problem,
                   create_char_ngrams_stat, filter_problem_corpus,
                   create_ngrams_and_splitgrams, stats_for_ngrams_and_skipgrams,
                   vectorise_problem_corpus, compute_mean_and_std,
                   make_authors_list, define_model, define_optimiser,
                   training, testing, save_tools, load_tools, as_minutes,
                   cluster_test, set_sizes)


def main():
    time_start = time.time()
    parser = argparse.ArgumentParser(description='Lukasz Gagala PAN18')
    parser.add_argument('-c', type=str,
                        help='Path to the input data set')
    parser.add_argument('-r', type=str,
                        help='Input run (not used by this script)')
    parser.add_argument('-o', type=str,
                        help='Output directory')
    parser.add_argument('-freq1', type=int, default=1000,
                        help='First n most frequent items, default: 1000')
    parser.add_argument('-freq2', type=int, default=300,
                        help='Number of most frequent lemmas added to PoS-tags, default: 300')
    args = parser.parse_args()
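
    # Discover every problem in the corpus along with its language, then
    # solve each one independently.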
    problem_names, problem_languages = get_corpus_info(args.c)
    for nb, problem in enumerate(problem_names):
        problem_solving(nb, problem, problem_languages, args, time_start)


def problem_solving(nb, problem, problem_languages, args, time_start):
    if problem_languages[nb] == "pl":
        pass  # placeholder: no special handling for Polish
    print(problem)
    local_path = get_problem_truth(args.c, problem)
    print(local_path)
    # Tag the problem's texts in the problem's language.
    problem_collection, number_of_texts = tagging_problem(local_path, problem_languages[nb])
    print('tagged')
    authors = make_authors_list(problem_collection)
    print('authors defined')
    freq1 = args.freq1
    freq2 = args.freq2
    training_set_size, test_set_size = set_sizes(problem_collection)
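
    # Re-seed the RNG so each problem run starts from a fresh random state.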
    random.seed(time.time())
    # Collect character n-gram statistics, filter the corpus down to the
    # retained items, then build n-grams and skip-grams over the result.
    trunc_words1, trunc_words2 = create_char_ngrams_stat(problem_collection, freq2, problem_languages[nb])
    problem_collection = filter_problem_corpus(problem_collection, trunc_words1, trunc_words2, problem_languages[nb])
    problem_collection, nb_categories = create_ngrams_and_splitgrams(problem_collection)
    words_encoder, words_num = stats_for_ngrams_and_skipgrams(problem_collection, nb_categories, freq1)
    # Assumption: freq1 is the intended frequency cutoff for vectorisation.
    freq_feature, words_num = vectorise_problem_corpus(problem_collection, words_encoder, words_num, freq1, number_of_texts)
    freq_feature_form_norm, network_sizes = compute_mean_and_std(freq_feature, problem_collection, words_num)
    model_test = define_model(network_sizes, len(authors), len(words_encoder))
    optimiser_test = define_optimiser(model_test)
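
    # NLLLoss drives the author-classification objective; MSELoss is
    # instantiated here but not passed to training() below.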
    nll_loss = torch.nn.NLLLoss()
    if use_cuda:
        nll_loss = nll_loss.cuda()
    mseloss = torch.nn.MSELoss()
    if use_cuda:
        mseloss = mseloss.cuda()
    model = training(model_test, training_set_size,
                     problem_collection, authors, nll_loss, optimiser_test,
                     freq_feature_form_norm)
    print('after training')
    result = testing(problem_collection, model, authors, freq_feature_form_norm)
    print('after testing')
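
    # Write this problem's answers as JSON into the output directory.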
    with open(os.path.join(args.o, 'answers-{}.json'.format(problem)), 'w') as outfile:
        json.dump(result, outfile)
    time_now = time.time()
    timing = time_now - time_start
    print(as_minutes(timing))
    print('problem {} finished'.format(problem))
    return


if __name__ == '__main__':
    main()