#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita ([email protected])
# Arturo Montejo-Ráez ([email protected])
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import json
import os
from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import argparse
## ----------------------------------------------------------------------------
##
## Read command line arguments
##
parser = argparse.ArgumentParser(description='PAN 2018 author identification based on POS vectors')
parser.add_argument('-i', '--input', type=str, help='input directory')
parser.add_argument('-o', '--output', type=str, help='output directory')
parser.add_argument('-n', '--ngramsize', type=int, help='maximum n-gram size', choices=[1,2,3,4], default=2)
parser.add_argument('-f', '--idf', action='store_true', help='apply inverse document frequency', default=False)
parser.add_argument('-x', '--axis', type=int, choices=[0,1], default=1, help='apply L2 normalization by sample (1, default) or by feature (0)')
args = parser.parse_args()
INPUT_DIR, OUTPUT_DIR = args.input, args.output
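#
# Example invocation (the paths are illustrative only, not part of the script):
#
#   ./classify_postf.py -i /path/to/pan18-collection -o /path/to/answers -n 3 --idf
#
# The input directory must contain collection-info.json plus one sub-directory per
# problem; one answers-<problem>.json file is written to the output directory.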
## ----------------------------------------------------------------------------
##
## Load the complexity analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
'en': ComplexityEnglish(),
'sp': ComplexitySpanish(),
'fr': ComplexityFrench(),
'pl': ComplexityPolish(),
'it': ComplexityItalian()
}
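# The dictionary keys must match the language codes given in collection-info.json
# (e.g. 'sp' is the code used for Spanish).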
## ----------------------------------------------------------------------------
##
## Corpus loading (both train and test data sets)
##
postf = pd.DataFrame()
labels = {}
labels_cand = []
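# labels maps each problem-name + candidate-name string to an integer class id;
# labels_cand is the inverse list (class id -> problem-name + candidate-name).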
#
# Iterate over all problems in the collection
#
print('Loading collection-info.json file from', INPUT_DIR, flush=True)
with open(INPUT_DIR + '/collection-info.json', 'r') as f:
    collectionInfo = json.load(f)
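# collection-info.json is expected to contain one entry per problem, each with at
# least a 'problem-name' and a 'language' field (both are used below).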
for problem in collectionInfo:
print('\n\nProblem: ', problem['problem-name'], flush=True)
print('Language: ', problem['language'], flush=True)
    #
    # Load the complexity analyzer class for the problem's language
    #
complexityText = mlComplexityText[problem['language']]
    #
    # Iterate over all candidates
    #
print("Loading problem data...\n", flush=True)
    with open(INPUT_DIR + '/' + problem['problem-name'] + '/problem-info.json', 'r') as problem_info_fhd:
        problem_info = json.load(problem_info_fhd)
    #
    # Read the texts of known authorship (TRAINING TEXTS)
    #
print("Loading training data")
for candidate in problem_info['candidate-authors']:
print('Candidate: ', candidate['author-name'], flush=True)
files = os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']))
probcand = problem['problem-name'] + candidate['author-name']
        if probcand not in labels:
            labels[probcand] = len(labels)
            labels_cand.append(probcand)
        #
        # Process all texts by this candidate
        #
for i, nameFile in enumerate(files):
print('Reading text file: ', nameFile, flush=True)
            with open(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name'], nameFile), 'r') as fhnd:
                postags = complexityText.getPOS(fhnd.read())
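            # Flatten the per-sentence POS tag lists returned by getPOS into one space-separated string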
postags = " ".join([" ".join(p) for p in postags])
dfi = pd.DataFrame({'Pos': postags}, index=[i])
dfi['problem'] = problem['problem-name']
dfi['language'] = problem['language']
dfi['candidate'] = candidate['author-name']
dfi['label'] = labels[probcand]
dfi['filename'] = nameFile
            postf = pd.concat([postf, dfi])
    #
    # If a ground-truth file exists, read it to learn the true author of each unknown text
    #
unknown_candidates = False
if os.path.isfile(INPUT_DIR +'/'+ problem['problem-name'] + '/ground-truth.json'):
print("Reading ground truth...", flush=True)
        with open(INPUT_DIR + '/' + problem['problem-name'] + '/ground-truth.json', 'r') as fhnd:
            ground_truth = json.load(fhnd)
unknown_candidates = {}
for item in ground_truth['ground_truth']:
unknown_candidates[item['unknown-text']] = item['true-author']
    #
    # Iterate over the unlabelled files (TEST TEXTS)
    #
print("Loading test data", flush=True)
for i, unknown_file in enumerate(os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder']))):
print("Analyzing file", unknown_file, flush=True)
        with open(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder'], unknown_file), 'r') as fhnd:
            postags = complexityText.getPOS(fhnd.read())
postags = " ".join([" ".join(p) for p in postags])
dfi = pd.DataFrame({'Pos': postags}, index=[i])
dfi['problem'] = problem['problem-name']
dfi['language'] = problem['language']
        if unknown_candidates and unknown_candidates.get(unknown_file):
probcand = problem['problem-name'] + unknown_candidates[unknown_file]
dfi['candidate'] = unknown_candidates[unknown_file]
dfi['label'] = labels[probcand]
else:
dfi['candidate'] = None
dfi['label'] = None
dfi['filename'] = unknown_file
        postf = pd.concat([postf, dfi])
## ----------------------------------------------------------------------------
##
## Training and classification
##
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
from sklearn import svm
clf = svm.LinearSVC(C=1)
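# The same linear SVM (C=1) object is re-fitted from scratch for each problem below.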
for problem in set(postf['problem']):
answers = []
print('------- Training and classifying ', problem, flush=True)
    #
    # Build the vector space model
    #
tfidfVectorizer = TfidfVectorizer(ngram_range=(1, args.ngramsize), use_idf=args.idf)
postf['POStfidf'] = list(tfidfVectorizer.fit_transform(postf['Pos']))
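    # Each 'Pos' entry (a space-separated sequence of POS tags) becomes a TF-IDF
    # vector over POS n-grams of size 1..args.ngramsize. Note that the vectorizer
    # is fitted on the POS strings of the whole collection, not only this problem.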
    #
    # For training, take the texts of known authorship
    #
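    # Known texts are assumed to be named "known..."; the word boundary (\b) in the
    # pattern prevents "unknown..." files from matching here.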
train = postf[postf['filename'].str.contains(r"\bknown", regex=True)]
train = train.loc[train['problem'] == problem]
train = train.dropna(axis=1, how='any')
train_target = train['label']
train_data = np.array(list(train['POStfidf'].apply(lambda x: x.toarray()[0])))
    #
    # For testing, take the unknown texts
    #
test = postf[postf['filename'].str.contains(r"\bunknown", regex=True)]
test = test.loc[test['problem'] == problem]
test = test.dropna(axis=1, how='any')
test_data = np.array(list(test['POStfidf'].apply(lambda x: x.toarray()[0])))
    #
    # Normalize
    #
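    # Train and test vectors are stacked before L2 normalization so that, with
    # --axis 0 (per feature), both splits are scaled by the same factors; with the
    # default --axis 1 each row is simply scaled to unit length.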
data = pd.concat([pd.DataFrame(train_data), pd.DataFrame(test_data)])
data = pd.DataFrame(preprocessing.normalize(data, norm='l2', axis=args.axis))
train_data = data.iloc[:train_data.shape[0],:]
test_data = data.iloc[train_data.shape[0]:,:]
    # Train on the texts with known candidates and predict the unknown ones
y_pred = clf.fit(train_data, train_target).predict(test_data)
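    # Map each predicted integer label back to its problem-name + candidate-name
    # string and keep only the candidate part (the suffix starting at "candidate").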
for index, row in test.iterrows():
probcand = labels_cand[y_pred[index]]
answers.append({
'unknown-text': row['filename'],
'predicted-author': probcand[probcand.find("candidate"):],
})
with open(OUTPUT_DIR + '/answers-' + problem +'.json', 'w') as file:
json.dump(answers, file, indent=4)
print("done!")