-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathComplexityItalian.py
366 lines (288 loc) · 13.8 KB
/
ComplexityItalian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# -*- coding: utf-8 -*-
import sys
sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
import freeling
import os
import re
from functools import reduce
import numpy as np
import scipy.stats
import math
class ComplexityItalian():
    """Text-complexity metrics for Italian, computed over a FreeLing analysis.

    __init__ builds the FreeLing pipeline (tokenizer, sentence splitter,
    morphological analyzer, HMM tagger); textProcessing() runs it on a text
    and stores the tagged sentences, which the metric methods then consume.

    The metric methods share state set by earlier calls:
      * punctuationMarks() sets N_words (needed by sentenceComplexity,
        autoReadability, mureadability and readability);
      * mureadability() sets N_syllables and N_letters (needed by
        readability()).
    calcMetrics() / calcMetricsExtend() invoke them in a safe order.
    """

    def __init__(self, lang='it'):
        """Build the FreeLing analyzers for *lang* (default Italian).

        Modify FREELINGDIR to point at your FreeLing installation.
        """
        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
        DATA = FREELINGDIR + "/data/"
        self.DATA = DATA
        self.lang = lang
        freeling.util_init_locale("default")

        # Language identifier (not used by the metrics; kept because the
        # original class created it).
        self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

        # Options for the morphological analyzer.  Default values are OK
        # except for the data files.
        op = freeling.maco_options(lang)
        op.set_data_files(
            "",
            self.DATA + "common/punct.dat",
            self.DATA + self.lang + "/dicc.src",
            self.DATA + self.lang + "/afixos.dat",
            "",
            self.DATA + self.lang + "/locucions.dat",
            self.DATA + self.lang + "/np.dat",
            "",
            self.DATA + self.lang + "/probabilitats.dat")

        # Create the analyzers.
        self.tk = freeling.tokenizer(self.DATA + self.lang + "/tokenizer.dat")
        self.sp = freeling.splitter(self.DATA + self.lang + "/splitter.dat")
        self.mf = freeling.maco(op)
        # Select which morphological submodules are used
        # (default: all created submodules are used).
        self.mf.set_active_options(False, True, True, True,
                                   True, True, False, True,
                                   True, True, True, True)
        # PoS tagger and sense annotator.
        self.tg = freeling.hmm_tagger(self.DATA + self.lang + "/tagger.dat", True, 2)
        self.sen = freeling.senses(DATA + lang + "/senses.dat")

        # config is a list of booleans enabling/disabling each metric, in
        # the same order as self.metricsIt.  If config is None, every
        # supported metric is computed.
        self.config = [True, True, True, True, True, True]
        self.metricsIt = ['AVERAGE PUNCTUATION MARKS', 'SCI', 'ARI', 'MU',
                          'FLESCH-VACA', 'GULPEASE']
        # Same scheme for the extended metrics.
        self.configExtend = [True, True, True, True, True]
        self.metricsItExtend = ['MEAN WORDS', 'STD WORDS', 'COMPLEX SENTENCES',
                                'MEAN SYLLABLES', 'STD SYLLABLES']

    def textProcessing(self, text):
        """Tokenize, split, morphologically analyze and PoS-tag *text*.

        Stores the tagged sentence list in self.sentences and its length in
        self.N_sentences; returns (sentences, N_sentences).
        """
        # Normalize non-breaking spaces and drop double quotes before analysis.
        text = text.replace(u'\xa0', u' ').replace('"', '')
        sid = self.sp.open_session()
        tokens = self.tk.tokenize(text)
        ls = self.sp.split(sid, tokens, True)
        ls = self.mf.analyze(ls)
        ls = self.tg.analyze(ls)
        self.sentences = ls
        self.N_sentences = len(ls)
        self.sp.close_session(sid)
        return self.sentences, self.N_sentences

    def punctuationMarks(self):
        """Separate punctuation tokens (FreeLing tag 'F...') from words.

        Sets N_words, per-sentence word-count mean/std, N_punctuation and
        the punctuation list.  Returns (punctuation_over_words, mean_words,
        std_words, N_punctuation, punctuation, N_words).
        """
        punctuation = []
        words_per_sentence = []
        for sentence in self.sentences:
            forms = []
            for w in sentence:
                # FreeLing punctuation tags all start with 'F'.
                if re.match(r'F.*', w.get_tag()):
                    punctuation.append(w.get_form())
                else:
                    forms.append(w.get_form())
            words_per_sentence.append(len(forms))

        self.N_words = sum(words_per_sentence)
        # Guard the empty-text case: np.mean([]) / np.std([]) would be NaN.
        self.mean_words = float(np.mean(words_per_sentence)) if words_per_sentence else 0.0
        self.std_words = float(np.std(words_per_sentence)) if words_per_sentence else 0.0
        self.N_punctuation = len(punctuation)
        self.punctuation = punctuation
        if self.N_words == 0:
            self.punctuation_over_words = 0
        else:
            self.punctuation_over_words = self.N_punctuation / self.N_words
        return (self.punctuation_over_words, self.mean_words, self.std_words,
                self.N_punctuation, self.punctuation, self.N_words)

    def sentenceComplexity(self):
        """Sentence Complexity Index: SCI = (ASL + CS) / 2.

        ASL = words per sentence; CS = fraction of "complex" sentences,
        i.e. sentences containing at least one pair of consecutive verbs
        (tag 'V...').  Requires textProcessing() and punctuationMarks()
        (for N_words) to have run.  Returns (SCI, CS, N_cs, ASL).
        """
        N_cs = 0
        for sentence in self.sentences:
            previous_is_verb = False
            verb_pairs = 0
            # NOTE(review): the original iterated one level deeper (over each
            # word's analysis list); here the tagger-selected tag of each word
            # is used, consistent with punctuationMarks() and autoReadability().
            for w in sentence:
                if re.match(r'V.*', w.get_tag()):
                    if previous_is_verb:
                        verb_pairs += 1
                        previous_is_verb = False
                    else:
                        previous_is_verb = True
                else:
                    previous_is_verb = False
            if verb_pairs > 0:
                N_cs += 1
        self.N_cs = N_cs
        if self.N_sentences == 0:
            # Degenerate (empty) input: report zeros instead of dividing by 0.
            self.ASL = self.CS = self.SCI = 0
        else:
            self.ASL = self.N_words / self.N_sentences
            self.CS = self.N_cs / self.N_sentences
            self.SCI = (self.ASL + self.CS) / 2
        return self.SCI, self.CS, self.N_cs, self.ASL

    def autoReadability(self):
        """Automated Readability Index.

        ARI = 4.71 * chars/words + 0.5 * words/sentences - 21.43.
        Requires punctuationMarks() (N_words) to have run.
        Returns (ARI, N_charac, listwords).
        """
        # NOTE(review): the original tested tags against '\r\n.*', which can
        # never match a PoS tag, so punctuation marks were counted as word
        # characters.  Punctuation (tag 'F...') is now skipped, consistent
        # with the rest of the class.
        listwords = []
        for sentence in self.sentences:
            for w in sentence:
                if not re.match(r'F.*', w.get_tag()):
                    listwords.append(w.get_form())
        self.listwords = listwords
        self.N_charac = sum(len(form) for form in listwords)
        if self.N_words == 0 or self.N_sentences == 0:
            self.ARI = 0
        else:
            self.ARI = (4.71 * self.N_charac / self.N_words
                        + 0.5 * self.N_words / self.N_sentences
                        - 21.43)
        return self.ARI, self.N_charac, self.listwords

    def mureadability(self):
        """Mu readability: mu = (n/(n-1)) * (mean len / var len) * 100.

        Syllables are approximated by counting vowels; words with three or
        more vowels are counted in N_syllables3.  Requires
        punctuationMarks() (N_words) to have run.  Returns
        (mu, mean_syllables, std_syllables, N_syllables, N_syllables3,
        letters, N_letters, vecletters).
        """
        # NOTE(review): the original matched only unaccented lowercase
        # vowels; accented Italian vowels and uppercase are now included.
        vowels = set('aeiouyàèéìòù')
        N_syllables3 = 0
        syllables_per_sentence = []
        for sentence in self.sentences:
            sentence_syllables = 0
            for w in sentence:
                if re.match(r'F.*', w.get_tag()):
                    continue  # skip punctuation
                count = sum(1 for ch in w.get_form().lower() if ch in vowels)
                sentence_syllables += count
                if count >= 3:
                    N_syllables3 += 1
            syllables_per_sentence.append(sentence_syllables)
        self.N_syllables = sum(syllables_per_sentence)
        self.N_syllables3 = N_syllables3
        self.mean_syllables = float(np.mean(syllables_per_sentence)) if syllables_per_sentence else 0.0
        self.std_syllables = float(np.std(syllables_per_sentence)) if syllables_per_sentence else 0.0

        # Word-length statistics over alphabetic words (accented included).
        letters = []
        vecletters = []
        for sentence in self.sentences:
            for w in sentence:
                if re.match(r'F.*', w.get_tag()):
                    continue
                form = w.get_form()
                if re.match(r'[a-zA-Z]|à|è|é|ì|ò|ù|á|ó|í|ú', form):
                    letters.append(form)
                    vecletters.append(len(form))
        self.letters = letters
        self.N_letters = sum(vecletters)
        self.vecletters = vecletters

        varianza = float(np.var(vecletters)) if vecletters else 0.0
        if self.N_words > 1 and varianza > 0:
            mean_len = self.N_letters / self.N_words
            mu = (self.N_words / (self.N_words - 1)) * (mean_len / varianza) * 100
        else:
            # Degenerate text (fewer than 2 words, or all words the same
            # length): the formula is undefined, report 0.
            mu = 0
        self.mu = mu
        return (self.mu, self.mean_syllables, self.std_syllables,
                self.N_syllables, self.N_syllables3, self.letters,
                self.N_letters, self.vecletters)

    def readability(self):
        """Flesch-Vaca and Gulpease readability indices for Italian.

        Requires punctuationMarks() (N_words), textProcessing()
        (N_sentences) and mureadability() (N_syllables, N_letters).
        Returns (fleschvacareadability, gulpeasereadability).
        """
        if self.N_words == 0 or self.N_sentences == 0:
            self.fleschvacareadability = 0
            self.gulpeasereadability = 0
        else:
            self.fleschvacareadability = (206
                                          - 65 * (self.N_syllables / self.N_words)
                                          - (self.N_words / self.N_sentences))
            self.gulpeasereadability = (89
                                        - 10 * (self.N_letters / self.N_words)
                                        + 300 * (self.N_sentences / self.N_words))
        return self.fleschvacareadability, self.gulpeasereadability

    def calcMetrics(self, text):
        """Compute the complexity metrics enabled in self.config for *text*.

        Returns a dict keyed by metric name.  If self.config is None, all
        metrics in self.metricsIt are computed.  punctuationMarks() always
        runs first because every other metric needs N_words; mureadability()
        is run before the Flesch-Vaca/Gulpease formulas when MU itself is
        disabled, since they need N_syllables / N_letters.
        """
        self.textProcessing(text)
        metrics = {}
        # Word counts are a prerequisite of every metric below.
        punctuationmarks = self.punctuationMarks()
        mu_done = False
        readability = None
        for i, metric in enumerate(self.metricsIt):
            # (The original used `config == None or config[i] and ...`, whose
            # precedence recomputed every metric on each iteration.)
            if self.config is not None and not self.config[i]:
                continue
            if metric == 'AVERAGE PUNCTUATION MARKS':
                metrics[metric] = punctuationmarks[0]
            elif metric == 'SCI':
                self.sentenceComplexity()
                metrics[metric] = self.SCI
            elif metric == 'ARI':
                metrics[metric] = self.autoReadability()[0]
            elif metric == 'MU':
                metrics[metric] = self.mureadability()[0]
                mu_done = True
            elif metric in ('FLESCH-VACA', 'GULPEASE'):
                if not mu_done:
                    self.mureadability()  # sets N_syllables / N_letters
                    mu_done = True
                if readability is None:
                    readability = self.readability()
                metrics[metric] = readability[0] if metric == 'FLESCH-VACA' else readability[1]
        return metrics

    def getPOS(self, text):
        """Return, per sentence, the list of PoS tags of its words."""
        self.textProcessing(text)
        self.pos_sentences = [[w.get_tag() for w in sentence.get_words()]
                              for sentence in self.sentences]
        return self.pos_sentences

    def calcMetricsExtend(self, text):
        """Compute the extended metrics enabled in self.configExtend.

        Returns a dict keyed by metric name.  If self.configExtend is None,
        all metrics in self.metricsItExtend are computed.
        """
        self.textProcessing(text)
        metricsExtend = {}
        # Word statistics are needed by MEAN/STD WORDS and (via N_words)
        # by the other extended metrics.
        punctuationmarks = self.punctuationMarks()
        mureadability = None
        for i, metric in enumerate(self.metricsItExtend):
            if self.configExtend is not None and not self.configExtend[i]:
                continue
            if metric == 'MEAN WORDS':
                metricsExtend[metric] = punctuationmarks[1]
            elif metric == 'STD WORDS':
                metricsExtend[metric] = punctuationmarks[2]
            elif metric == 'COMPLEX SENTENCES':
                metricsExtend[metric] = self.sentenceComplexity()[1]
            elif metric in ('MEAN SYLLABLES', 'STD SYLLABLES'):
                # Compute the syllable statistics once for both metrics
                # (the original called mureadability() twice).
                if mureadability is None:
                    mureadability = self.mureadability()
                metricsExtend[metric] = mureadability[1] if metric == 'MEAN SYLLABLES' else mureadability[2]
        return metricsExtend