forked from SwastikUdupa/TF-IDF-hadoop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreducer3.py
25 lines (25 loc) · 810 Bytes
/
reducer3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#!/usr/bin/env python
import sys
import math
dict_word = {}
dict_word_doc = {}
D = 12
for line in sys.stdin:
word_in_doc = line.strip().split("=")[0]
freq = line.strip().split("=")[1]
#freq is the frequency of the word in that document
dict_word_doc[word_in_doc] = freq
word = line.strip().split(",")[0]
#used to determine in how many docs we are having this word
if word in dict_word:
dict_word[word] += 1
else:
dict_word[word] = 1
for element in dict_word_doc:
document = element.split(",")[1]
word = element.split(",")[0]
d = dict_word[word]
tf = math.log(float(D)/float(d))
freq = float(dict_word_doc[element])
tfidf = tf*float(freq)
sys.stdout.write("{0}@{1},{2},{3},{4}\n".format(word, document, tf, freq, tfidf))