forked from msr-ds3/coursework
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenron_naive_bayes.sh
executable file
·75 lines (63 loc) · 1.78 KB
/
enron_naive_bayes.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
#
# file: enron_naive_bayes.sh
#
# description: trains a simple one-word naive bayes spam filter using
# enron email data
#
# usage: ./enron_naive_bayes.sh <word>
#
# requirements:
# curl, tar, bc
#
# author: jake hofman (gmail: jhofman)
#
# how to use the code: exactly one non-empty argument, the word whose
# presence is used as the single feature of the classifier
if [ "$#" -ne 1 ] || [ -z "$1" ]; then
  # a usage error is a failure: report it on stderr and exit non-zero so
  # that calling scripts can tell the run did not happen
  echo "usage: enron_naive_bayes.sh <word>" >&2
  exit 1
fi
word=$1
# if the file doesn't exist, download from the web
if ! [ -e enron1.tar.gz ]; then
  # -f makes curl fail on an HTTP error instead of saving the server's
  # error page under the archive name; a failed download is removed so the
  # next run retries instead of trusting a partial or bogus archive
  if ! curl -f -O -L 'http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/enron1.tar.gz'; then
    rm -f -- enron1.tar.gz
    echo "error: failed to download enron1.tar.gz" >&2
    exit 1
  fi
fi

# if the directory doesn't exist, uncompress the .tar.gz
if ! [ -d enron1 ]; then
  if ! tar zxvf enron1.tar.gz; then
    # enron1 did not exist before this attempt, so anything there now is a
    # partial extraction; remove it so the next run extracts again
    rm -rf -- enron1
    echo "error: failed to extract enron1.tar.gz" >&2
    exit 1
  fi
fi
# change into enron1; stop if that fails, otherwise everything below would
# silently run against whatever directory we happen to be in
cd enron1 || { echo "error: cannot change into enron1" >&2; exit 1; }

# count_txt <dir>: print the number of .txt files directly inside <dir>.
# The shell expands the glob itself, so the result does not depend on
# parsing ls output or on the argument-length limit of external commands.
count_txt() {
  local n=0 f
  for f in "$1"/*.txt; do
    # an unmatched glob is left as the literal pattern; -e filters it out
    if [ -e "$f" ]; then
      n=$((n + 1))
    fi
  done
  echo "$n"
}

# get counts of total spam, ham, and overall msgs
Nspam=$(count_txt spam)
Nham=$(count_txt ham)
Ntot=$((Nspam + Nham))
echo "$Nspam spam examples"
echo "$Nham ham examples"

# both classes are used as denominators later on, so neither may be empty
if [ "$Nspam" -eq 0 ] || [ "$Nham" -eq 0 ]; then
  echo "error: need at least one spam and one ham message in enron1" >&2
  exit 1
fi
# get counts containing word in spam and ham classes
# grep -i ignores case and -l prints each matching file name once, so wc -l
# counts messages rather than matching lines. "$word" is quoted and follows
# "--" so that a word with spaces, glob characters, or a leading dash reaches
# grep as one pattern and is never parsed as an option. $(( )) strips the
# padding that some wc implementations put around the number.
Nword_spam=$(( $(grep -il -- "$word" spam/*.txt | wc -l) ))
Nword_ham=$(( $(grep -il -- "$word" ham/*.txt | wc -l) ))
echo "$Nword_spam spam examples containing $word"
echo "$Nword_ham ham examples containing $word"
# calculate probabilities using bash calculator "bc"

# every estimate below divides by a class size, so both must be non-zero
if (( Nspam == 0 || Nham == 0 )); then
  echo "error: spam and ham counts must both be non-zero" >&2
  exit 1
fi

# class priors, shown to 4 decimal places
Pspam=$(echo "scale=4; $Nspam / ($Nspam+$Nham)" | bc)
Pham=$(echo "scale=4; 1-$Pspam" | bc)
echo
echo "estimated P(spam) = $Pspam"
echo "estimated P(ham) = $Pham"

# likelihood of seeing the word within each class
Pword_spam=$(echo "scale=4; $Nword_spam / $Nspam" | bc)
Pword_ham=$(echo "scale=4; $Nword_ham / $Nham" | bc)
echo "estimated P($word|spam) = $Pword_spam"
echo "estimated P($word|ham) = $Pword_ham"

# bayes' rule: P(spam|word) = P(word|spam) P(spam) / P(word), where
# P(word) = P(word|spam) P(spam) + P(word|ham) P(ham).
# The joint terms are recomputed from the raw counts at a higher scale
# instead of multiplying the 4-digit values printed above: bc truncates, so
# for a rare word the product of already-truncated numbers can collapse to 0.
Pspam_word=$(echo "scale=10; ($Nword_spam / $Nspam) * ($Nspam / ($Nspam+$Nham))" | bc)
Pham_word=$(echo "scale=10; ($Nword_ham / $Nham) * ($Nham / ($Nspam+$Nham))" | bc)
Pword=$(echo "scale=10; $Pspam_word+$Pham_word" | bc)
echo
if (( Nword_spam + Nword_ham == 0 )); then
  # the word occurs in no message, so P(word) = 0 and the posterior is 0/0;
  # say so instead of letting bc fail with a divide-by-zero error
  echo "P(spam|$word) = undefined ($word appears in no messages)"
else
  Pspam_word=$(echo "scale=4; $Pspam_word / $Pword" | bc)
  echo "P(spam|$word) = $Pspam_word"
fi

# return to the original directory
cd ..