Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spam attempt #5

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
layout python3
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,4 @@ docs/_build/
# PyBuilder
target/

.direnv/
385 changes: 385 additions & 0 deletions Spam.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,385 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#Bayesian Classifier"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn import linear_model\n",
"from sklearn.cross_validation import train_test_split \n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>0.64</th>\n",
" <th>0.64.1</th>\n",
" <th>0.1</th>\n",
" <th>0.32</th>\n",
" <th>0.2</th>\n",
" <th>0.3</th>\n",
" <th>0.4</th>\n",
" <th>0.5</th>\n",
" <th>0.6</th>\n",
" <th>...</th>\n",
" <th>0.40</th>\n",
" <th>0.41</th>\n",
" <th>0.42</th>\n",
" <th>0.778</th>\n",
" <th>0.43</th>\n",
" <th>0.44</th>\n",
" <th>3.756</th>\n",
" <th>61</th>\n",
" <th>278</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.21</td>\n",
" <td>0.28</td>\n",
" <td>0.50</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.28</td>\n",
" <td>0.21</td>\n",
" <td>0.07</td>\n",
" <td>0.00</td>\n",
" <td>0.94</td>\n",
" <td>...</td>\n",
" <td>0.00</td>\n",
" <td>0.132</td>\n",
" <td>0</td>\n",
" <td>0.372</td>\n",
" <td>0.180</td>\n",
" <td>0.048</td>\n",
" <td>5.114</td>\n",
" <td>101</td>\n",
" <td>1028</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.06</td>\n",
" <td>0.00</td>\n",
" <td>0.71</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.19</td>\n",
" <td>0.19</td>\n",
" <td>0.12</td>\n",
" <td>0.64</td>\n",
" <td>0.25</td>\n",
" <td>...</td>\n",
" <td>0.01</td>\n",
" <td>0.143</td>\n",
" <td>0</td>\n",
" <td>0.276</td>\n",
" <td>0.184</td>\n",
" <td>0.010</td>\n",
" <td>9.821</td>\n",
" <td>485</td>\n",
" <td>2259</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0.31</td>\n",
" <td>0.63</td>\n",
" <td>0.31</td>\n",
" <td>0.63</td>\n",
" <td>...</td>\n",
" <td>0.00</td>\n",
" <td>0.137</td>\n",
" <td>0</td>\n",
" <td>0.137</td>\n",
" <td>0.000</td>\n",
" <td>0.000</td>\n",
" <td>3.537</td>\n",
" <td>40</td>\n",
" <td>191</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0.31</td>\n",
" <td>0.63</td>\n",
" <td>0.31</td>\n",
" <td>0.63</td>\n",
" <td>...</td>\n",
" <td>0.00</td>\n",
" <td>0.135</td>\n",
" <td>0</td>\n",
" <td>0.135</td>\n",
" <td>0.000</td>\n",
" <td>0.000</td>\n",
" <td>3.537</td>\n",
" <td>40</td>\n",
" <td>191</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1.85</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>...</td>\n",
" <td>0.00</td>\n",
" <td>0.223</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" <td>0.000</td>\n",
" <td>0.000</td>\n",
" <td>3.000</td>\n",
" <td>15</td>\n",
" <td>54</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 58 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n",
"0 0.21 0.28 0.50 0 0 0.28 0.21 0.07 0.00 0.94 ... 0.00 \n",
"1 0.06 0.00 0.71 0 0 0.19 0.19 0.12 0.64 0.25 ... 0.01 \n",
"2 0.00 0.00 0.00 0 0 0.00 0.31 0.63 0.31 0.63 ... 0.00 \n",
"3 0.00 0.00 0.00 0 0 0.00 0.31 0.63 0.31 0.63 ... 0.00 \n",
"4 0.00 0.00 0.00 0 0 0.00 0.00 1.85 0.00 0.00 ... 0.00 \n",
"\n",
" 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n",
"0 0.132 0 0.372 0.180 0.048 5.114 101 1028 1 \n",
"1 0.143 0 0.276 0.184 0.010 9.821 485 2259 1 \n",
"2 0.137 0 0.137 0.000 0.000 3.537 40 191 1 \n",
"3 0.135 0 0.135 0.000 0.000 3.537 40 191 1 \n",
"4 0.223 0 0.000 0.000 0.000 3.000 15 54 1 \n",
"\n",
"[5 rows x 58 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spam_data = pd.read_csv('spambase.data.csv')\n",
"spam_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##Determining the test and training data from the original dataset\n",
"###60% of data is training and 40% is testing"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"spam_train_x, spam_test_x = train_test_split(spam_data, test_size=.40)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##Training data"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"spam_train_y = spam_train_x[spam_train_x.columns[-1]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##Testing data"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"spam_test_y = spam_test_x[spam_test_x.columns[-1]]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"classifier = MultinomialNB()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.fit(spam_train_x, spam_train_y)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.85434782608695647"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.score(spam_test_x, spam_test_y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### The spam detector seems to be accurate. It has an r**2 score of .854."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Binary file added spambase (1).zip
Binary file not shown.
Binary file added spambase (1)/.DS_Store
Binary file not shown.
Loading