-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevalFunctions.py
174 lines (166 loc) · 5.4 KB
/
evalFunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import sys
import os
import nltk, string, json
from nltk.util import ngrams
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
# Removal-Funktion
def removal(x, fv):
    """Filter n-gram tuples, keeping only 'interesting' ones.

    A tuple from *x* is kept when BOTH hold:
      1. it contains at least one non-stop word, and
      2. the filter list *fv* is empty, or at least one of its words
         occurs in *fv*.

    Parameters:
        x:  iterable of word tuples (e.g. bigrams from nltk.util.ngrams).
        fv: list of filter words (lowercased); an empty list disables
            condition 2.

    Returns:
        list of the tuples that passed both checks.

    BUG FIX: the original never reset ``count`` to 0 between the stop-word
    pass and the filter-value pass, so ``count`` was already 1 and the
    second pass (``count = count or 1``) was a no-op — the *fv* filter was
    dead code. Both conditions are now evaluated independently.
    """
    stop_words = set(stopwords.words('english'))
    res = []
    for pair in x:
        # Condition 1: at least one word that is not an English stop word.
        has_content = any(word not in stop_words for word in pair)
        # Condition 2: empty fv accepts everything, otherwise require a hit.
        passes_filter = len(fv) == 0 or any(word in fv for word in pair)
        if has_content and passes_filter:
            res.append(pair)
    return res
# Rekursive Matrix-Funktion
def createWordMatrix(word, seq, lseq, unigram, i):
    """Append co-occurrence pairs (word, seq[j]) to *unigram* in place.

    For every index j in [i, lseq], appends ``word`` followed by ``seq[j]``
    to *unigram*, skipping positions where ``seq[j] == word``.

    Parameters:
        word:    the anchor word being paired.
        seq:     tokenized sentence (list of words).
        lseq:    last index to pair with (inclusive).
        unigram: output list, mutated in place.
        i:       first index to pair with (inclusive).

    Returns:
        the same *unigram* list (also mutated in place).

    BUG FIX: the original recursed once per index, which raises
    RecursionError on long sentences and loops forever when ``lseq < i``.
    The iterative loop below produces the identical append order
    (indices i, i+1, ..., lseq) and is safe for any input.
    """
    for j in range(i, lseq + 1):
        if word != seq[j]:
            unigram.append(word)
            unigram.append(seq[j])
    return unigram
# Erstellen eines Unigrams und ein tokenisierter Text
def unigram(sents):
    """Build a flat unigram list and a tokenized text from sentences.

    Each sentence is stripped of ASCII and typographic punctuation,
    lowercased, and tokenized with nltk.

    Parameters:
        sents: iterable of sentence strings.

    Returns:
        (toktext, unigram) where toktext is a list of token lists
        (one per sentence) and unigram is all tokens flattened in order.
    """
    # Typographic punctuation not covered by string.punctuation.
    extra_punct = {'“', '”', '–', '’', '‘', '—', '…'}
    drop = set(string.punctuation) | extra_punct
    flat_tokens = []
    tokenized_sents = []
    for raw in sents:
        cleaned = ''.join(ch for ch in raw if ch not in drop).lower()
        tokens = nltk.word_tokenize(cleaned)
        flat_tokens.extend(tokens)
        tokenized_sents.append(tokens)
    return tokenized_sents, flat_tokens
# Erstellung eines Matrix-Unigrams
def unigramMatrix(sents: list):
    """Build a flat co-occurrence list for name tokens.

    For every token that appears in the names filter list, pairs it with
    every other token of its sentence via createWordMatrix.

    Parameters:
        sents: list of raw sentence strings.

    Returns:
        flat list of alternating (name, co-occurring word) entries.
    """
    matrix = []
    # Only the names list is requested; the other filter lists stay empty.
    l_names = getFilterValues(1, 0, 0, 0, 0)[0]
    for sentence in sents:
        tokens = nltk.word_tokenize(sentence)
        last = len(tokens) - 1
        for pos, token in enumerate(tokens):
            # NOTE(review): l_names entries are lowercased by
            # getFilterValues, but tokens keep their original case here —
            # capitalized names may never match. Confirm intended casing.
            if token in l_names:
                createWordMatrix(token, tokens, last, matrix, pos)
    return matrix
def ngramFilter(unigram):
    """Keep adjectives, adverbs, and known names from a token list.

    POS-tags the tokens and keeps those tagged as adjective/adverb or
    listed as keys in Data/pos/names.json, dropping immediate repeats.

    Parameters:
        unigram: list of word tokens.

    Returns:
        filtered list of tokens (order preserved, no adjacent duplicates).
    """
    tagged = nltk.pos_tag(unigram)
    # Load the name keys once; strip soft hyphens before parsing.
    with open(os.path.join('Data', 'pos', 'names.json'), 'r', encoding='utf-8') as f:
        names = list(json.loads(f.read().replace('\xad', '')).keys())
    keep_tags = {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}  # adjectives + adverbs
    filtered = []
    for token, tag in tagged:
        if tag in keep_tags or token in names:
            # Skip a token identical to the one just kept.
            if not filtered or token != filtered[-1]:
                filtered.append(token)
    return filtered
def initNgrams(unigram):
    """Compute unigram and bigram frequency distributions.

    Parameters:
        unigram: flat list of word tokens.

    Returns:
        (freq_uni, bigrams) where freq_uni is an nltk.FreqDist over the
        filtered unigrams and bigrams is the full most_common list of the
        bigram FreqDist.

    BUG FIX: the original bound the ENTIRE 6-tuple returned by
    getFilterValues to ``fv``, so ``word in fv`` inside removal() compared
    strings against lists and could never match. The combined filter word
    list is the tuple's last element.
    """
    bigram = []
    # getFilterValues returns (names, adverbs, adjectives, female, male, res);
    # only the combined list `res` is the filter input for removal().
    *_, fv = getFilterValues(1, 0, 0, 0, 0)
    # Build the bigrams before filtering, as in the original pipeline.
    bigram.extend(list(ngrams(unigram, 2)))
    # Drop stop-word-only / non-matching n-grams.
    unigram = removal(unigram, fv)
    freq_uni = nltk.FreqDist(unigram)
    freq_bi = nltk.FreqDist(bigram)
    return freq_uni, freq_bi.most_common(len(freq_bi))
def getWordFrequency(unigram):
    """Plot a bar chart of the 100 most frequent words, colored by POS.

    Parameters:
        unigram: nltk.FreqDist (or collections.Counter) of words.

    Returns:
        None; displays a matplotlib figure as a side effect.

    BUG FIXES vs. original:
      * nltk.pos_tag was run over the FreqDist's iteration order but
        indexed by the most_common() position, so colors did not belong
        to the bars they were drawn on. Now exactly the plotted words
        are tagged, in plot order.
      * JJ* (adjective) tags received the adverb color variable and RB*
        (adverb) tags the adjective one, contradicting the legend order
        ['Nomen','Adverb','Adjektiv','Andere']; the mapping is corrected.
    """
    c_noun = "green"
    c_adv = "yellow"
    c_adj = "red"
    c_none = "blue"
    # Legend labels (German: noun, adverb, adjective, other).
    # NOTE(review): defined but never rendered on the figure, as before.
    legend = ['Nomen', 'Adverb', 'Adjektiv', 'Andere']
    top = unigram.most_common(100)
    words = [w for w, _ in top]
    counts = [c for _, c in top]
    # Tag the plotted words themselves so tags align with the bars.
    tagged = nltk.pos_tag(words)
    color = []
    for _, tag in tagged:
        if tag in ('NN', 'NNS', 'NNP'):
            color.append(c_noun)
        elif tag in ('JJ', 'JJR', 'JJS'):
            color.append(c_adj)   # adjectives -> red
        elif tag in ('RB', 'RBR', 'RBS'):
            color.append(c_adv)   # adverbs -> yellow
        else:
            color.append(c_none)
    plt.figure(figsize=(20, 5))
    plt.bar(words, counts, color=color)
    plt.title("Wordfrequencies")
    plt.xlabel("Word")
    plt.ylabel("Frequency")
    plt.xticks(rotation=90)
    plt.show()
# Funktion zum Ermitteln von Filter-werten
def getFilterValues(n: bool, adv: bool, adj: bool, female_n: bool, male_n: bool):
    """Load the requested word-filter lists from Data/pos/*.json.

    Each flag selects one JSON file whose keys (lowercased) become a
    filter list; unselected categories yield empty lists.

    Parameters:
        n:        load names.json.
        adv:      load adverbs.json.
        adj:      load adjectives.json.
        female_n: load female.json.
        male_n:   load male.json.

    Returns:
        (names, adverbs, adjectives, female, male, res) — the five
        category lists plus their concatenation in that order.

    Refactor: the original repeated the identical load-and-lower stanza
    five times (and shadowed the parameter ``n`` with the parsed JSON);
    the shared logic now lives in one helper, output unchanged.
    """
    def _load_keys(filename):
        # JSON files map words to values; only the lowercased keys are used.
        # Soft hyphens (U+00AD) are stripped before parsing.
        with open(os.path.join('Data', 'pos', filename), 'r', encoding='utf-8') as f:
            content = f.read().replace('\xad', '')
        return [key.lower() for key in json.loads(content).keys()]

    names = _load_keys('names.json') if n == 1 else []
    adverbs = _load_keys('adverbs.json') if adv == 1 else []
    adjectives = _load_keys('adjectives.json') if adj == 1 else []
    female = _load_keys('female.json') if female_n == 1 else []
    male = _load_keys('male.json') if male_n == 1 else []
    res = names + adverbs + adjectives + female + male
    return names, adverbs, adjectives, female, male, res