-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_utils.py
69 lines (59 loc) · 1.82 KB
/
index_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# Data preprocessing
def f_clean(i):
    """Normalize raw text into a list of stemmed, stopword-free tokens.

    Pipeline: turn escaped newlines into spaces, lowercase, strip ASCII
    punctuation, tokenize with ``word_tokenize``, drop English stopwords,
    then stem each remaining token with ``PorterStemmer``.

    Args:
        i: The raw text to clean (a ``str``).

    Returns:
        The cleaned tokens, in their original order.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))

    # "\\n" is the two-character sequence backslash + "n" (an escaped newline
    # left in the text). Replace it with a space rather than nothing, so the
    # words on either side are not glued together into a single token.
    text = i.replace("\\n", " ").lower()
    text = text.translate(str.maketrans("", "", string.punctuation))

    stemmer = PorterStemmer()
    stemmed = [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    # Defensive filter for tokens that are pure punctuation. ASCII punctuation
    # was already stripped above, so this is expected to remove little or
    # nothing -- NOTE(review): confirm before dropping it.
    punctuation = set(string.punctuation) | {"''"}
    return [token for token in stemmed if token not in punctuation]
# Build the vocabulary (term -> {film_id: count}) and return the token count of intro + plot for the row
def create_voc(row, vocabulary):
    """Add one film's term counts to ``vocabulary`` and return its length.

    The row's ``'intro'`` and ``'plot'`` fields are joined, cleaned with
    ``f_clean``, and every resulting token is counted under the row's
    ``'film_id'``.

    Args:
        row: Mapping-like record providing ``'film_id'``, ``'intro'`` and
            ``'plot'``.
        vocabulary: Dict of ``term -> {film_id: occurrences}``; updated in
            place.

    Returns:
        The number of cleaned tokens for this film, or ``0`` if the text
        could not be processed (the error is printed with the film id).
    """
    film_id = row['film_id']
    try:
        # Join with a space so the last word of the intro and the first word
        # of the plot do not fuse into one bogus token.
        text = f_clean(row['intro'] + " " + row['plot'])
    except Exception as e:
        # Best effort: report the problematic film and treat it as empty
        # instead of aborting the whole indexing pass.
        print(film_id, e)
        return 0

    for w in text:
        postings = vocabulary.setdefault(w, {})
        postings[film_id] = postings.get(film_id, 0) + 1
    return len(text)
# Map each key of the vocabulary to an integer ID
def map_voc(voc):
    """Assign a consecutive integer ID to every key of ``voc``.

    Args:
        voc: Dict whose keys are the vocabulary terms.

    Returns:
        A new dict ``term -> id``; IDs start at 0 and follow the iteration
        order of ``voc``.
    """
    return {term: term_id for term_id, term in enumerate(voc)}
# Now we define a function that calculates the TF-IDF weights
def invertx_voc(voc, total_docs=30000, doc_lengths=None):
    """Convert raw term counts into TF-IDF weights.

    For every term, ``idf = log(total_docs / number_of_films_with_term)``;
    for every film containing the term, ``tf = count / film_length``. The
    stored weight is ``tf * idf``.

    Args:
        voc: Dict of ``term -> {film_id: occurrences}``.
        total_docs: Corpus size used in the IDF numerator. Defaults to the
            previously hard-coded value of 30000.
        doc_lengths: Optional dict ``film_id -> token count``. When omitted
            it is built from the ``'film_id'`` and ``'len_text'`` columns of
            the module-level ``df_film``.

    Returns:
        A new dict ``term -> {film_id: tf_idf}`` with the same keys as
        ``voc``.

    Raises:
        KeyError: If a film in ``voc`` has no known length.
        ZeroDivisionError: If a term has an empty postings dict or a film
            has length 0.
    """
    if not voc:
        return {}

    if doc_lengths is None:
        # Build the lookup once instead of filtering the DataFrame for every
        # (term, film) pair. setdefault keeps the first row per film_id.
        doc_lengths = {}
        for fid, n_tokens in zip(df_film['film_id'], df_film['len_text']):
            doc_lengths.setdefault(fid, n_tokens)

    new_voc = {}
    for term, postings in voc.items():
        idf = math.log(total_docs / len(postings))
        new_voc[term] = {
            film: (count / doc_lengths[int(film)]) * idf
            for film, count in postings.items()
        }
    return new_voc