Skip to content

Commit

Permalink
update code of cross validation with lemmatization preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
sli41 committed Apr 25, 2017
1 parent a7330d4 commit 0fbfdf5
Show file tree
Hide file tree
Showing 8 changed files with 977 additions and 229 deletions.
213 changes: 99 additions & 114 deletions code/.ipynb_checkpoints/EM_NB_text_classification_v2-checkpoint.ipynb

Large diffs are not rendered by default.

358 changes: 314 additions & 44 deletions code/.ipynb_checkpoints/EM_NB_text_classification_v3-checkpoint.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 0
}
235 changes: 210 additions & 25 deletions code/EM_NB_text_classification_v2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"collapsed": false
},
Expand All @@ -32,7 +32,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"collapsed": true
},
Expand All @@ -45,11 +45,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(11314, 26747) (7532, 26747)\n"
]
}
],
"source": [
"# Convert all text data into tf-idf vectors \n",
"vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)\n",
Expand All @@ -61,11 +69,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(5657, 26747) (5657, 26747)\n"
]
}
],
"source": [
"# Divide train data set into labeled and unlabeled data sets\n",
"n_train_data = train_vec.shape[0]\n",
Expand All @@ -76,42 +92,110 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train Naive Bayes classifier (imported) \n",
"# using labeled data set only\n",
"nb_clf = MultinomialNB(alpha=1)\n",
"nb_clf = MultinomialNB(alpha=1e-2)\n",
"nb_clf.fit(X_l, y_l)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial expected log likelihood = -2954689.899\n",
"\n",
"EM iteration #1\n",
"\tExpected log likelihood = -2798528.882\n",
"EM iteration #2\n",
"\tExpected log likelihood = -2798484.321\n",
"EM iteration #3\n",
"\tExpected log likelihood = -2798484.321\n"
]
},
{
"data": {
"text/plain": [
"<Semi_EM_NB.Semi_EM_MultinomialNB instance at 0x117b2ac68>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train Naive Bayes classifier (imported) \n",
"# using both labeled and unlabeled data set\n",
"em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-8, max_iter=30) # semi supervised EM based Naive Bayes classifier\n",
"# em_nb_clf.fit(X_l, y_l, X_u)\n",
"em_nb_clf.fit_x(X_l, y_l, X_u)\n",
"em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2) # semi supervised EM based Naive Bayes classifier\n",
"em_nb_clf.fit(X_l, y_l, X_u)\n",
"# em_nb_clf.fit_with_clustering(X_l, y_l, X_u)\n",
"# em_nb_clf.partial_fit(X_l, y_l, X_u)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" alt.atheism 0.49 0.41 0.45 319\n",
" comp.graphics 0.61 0.66 0.63 389\n",
" comp.os.ms-windows.misc 0.61 0.54 0.57 394\n",
"comp.sys.ibm.pc.hardware 0.58 0.64 0.61 392\n",
" comp.sys.mac.hardware 0.69 0.62 0.66 385\n",
" comp.windows.x 0.78 0.73 0.75 395\n",
" misc.forsale 0.75 0.70 0.72 390\n",
" rec.autos 0.73 0.68 0.70 396\n",
" rec.motorcycles 0.73 0.68 0.70 398\n",
" rec.sport.baseball 0.88 0.78 0.82 397\n",
" rec.sport.hockey 0.58 0.92 0.71 399\n",
" sci.crypt 0.72 0.73 0.72 396\n",
" sci.electronics 0.62 0.52 0.56 393\n",
" sci.med 0.82 0.74 0.78 396\n",
" sci.space 0.73 0.74 0.74 394\n",
" soc.religion.christian 0.57 0.80 0.67 398\n",
" talk.politics.guns 0.53 0.67 0.59 364\n",
" talk.politics.mideast 0.80 0.79 0.79 376\n",
" talk.politics.misc 0.59 0.41 0.49 310\n",
" talk.religion.misc 0.34 0.22 0.26 251\n",
"\n",
" avg / total 0.67 0.66 0.66 7532\n",
"\n",
"0.661975570898\n"
]
}
],
"source": [
"# Evaluate original NB classifier using test data set\n",
"pred = nb_clf.predict(test_vec)\n",
Expand All @@ -122,11 +206,44 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" alt.atheism 0.55 0.41 0.47 319\n",
" comp.graphics 0.62 0.67 0.64 389\n",
" comp.os.ms-windows.misc 0.65 0.49 0.56 394\n",
"comp.sys.ibm.pc.hardware 0.56 0.66 0.61 392\n",
" comp.sys.mac.hardware 0.68 0.63 0.66 385\n",
" comp.windows.x 0.77 0.74 0.76 395\n",
" misc.forsale 0.75 0.69 0.72 390\n",
" rec.autos 0.72 0.70 0.71 396\n",
" rec.motorcycles 0.75 0.70 0.73 398\n",
" rec.sport.baseball 0.93 0.80 0.86 397\n",
" rec.sport.hockey 0.58 0.94 0.71 399\n",
" sci.crypt 0.73 0.73 0.73 396\n",
" sci.electronics 0.64 0.52 0.58 393\n",
" sci.med 0.85 0.76 0.80 396\n",
" sci.space 0.73 0.77 0.75 394\n",
" soc.religion.christian 0.54 0.87 0.67 398\n",
" talk.politics.guns 0.54 0.72 0.62 364\n",
" talk.politics.mideast 0.80 0.79 0.80 376\n",
" talk.politics.misc 0.59 0.41 0.49 310\n",
" talk.religion.misc 0.42 0.12 0.19 251\n",
"\n",
" avg / total 0.68 0.67 0.66 7532\n",
"\n",
"0.672331386086\n"
]
}
],
"source": [
"# Evaluate semi-supervised EM NB classifier using test data set\n",
"pred = em_nb_clf.predict(test_vec)\n",
Expand All @@ -137,7 +254,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {
"collapsed": false
},
Expand All @@ -154,33 +271,101 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alt.atheism: bobby say atheism bible just religion don think people god\n",
"comp.graphics: know file 3d program does software files thanks image graphics\n",
"comp.os.ms-windows.misc: using thanks driver use program drivers files dos file windows\n",
"comp.sys.ibm.pc.hardware: dos thanks monitor ide pc card bus controller scsi drive\n",
"comp.sys.mac.hardware: know lc software use problem does thanks drive apple mac\n",
"comp.windows.x: application xterm using display thanks x11r5 widget motif server window\n",
"misc.forsale: drive interested email condition price new offer shipping 00 sale\n",
"rec.autos: ford oil just don new like engine dealer cars car\n",
"rec.motorcycles: riding dog like bmw just bikes motorcycle ride dod bike\n",
"rec.sport.baseball: think pitching braves games hit runs game baseball year team\n",
"rec.sport.hockey: like year games play nhl players season hockey team game\n",
"sci.crypt: people use nsa escrow keys government chip clipper encryption key\n",
"sci.electronics: ve electronics good used amp know does circuit like use\n",
"sci.med: chastity n3jxp skepticism patients pitt banks geb gordon edu msg\n",
"sci.space: lunar shuttle spacecraft earth like launch moon orbit nasa space\n",
"soc.religion.christian: christian people faith believe christ bible church christians jesus god\n",
"talk.politics.guns: government weapon right firearms law don people weapons guns gun\n",
"talk.politics.mideast: arabs turkey arab people turkish armenian armenians jews israeli israel\n",
"talk.politics.misc: think drugs did tax clinton just president don government people\n",
"talk.religion.misc: just know morality koresh don christian objective people jesus god\n"
]
}
],
"source": [
"show_topK(nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by original NB classifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alt.atheism: islam say atheists religion just atheism don people think god\n",
"comp.graphics: 3d program looking file hi know image files thanks graphics\n",
"comp.os.ms-windows.misc: ftp card thanks use driver drivers files dos file windows\n",
"comp.sys.ibm.pc.hardware: monitor thanks disk pc ide controller bus scsi card drive\n",
"comp.sys.mac.hardware: quadra know monitor simms use problem thanks drive apple mac\n",
"comp.windows.x: x11r5 program windows use application thanks widget motif server window\n",
"misc.forsale: asking sell price new email condition offer shipping 00 sale\n",
"rec.autos: don ford new good dealer just like engine cars car\n",
"rec.motorcycles: don helmet like riding motorcycle just ride bikes dod bike\n",
"rec.sport.baseball: players braves pitching hit runs games game baseball team year\n",
"rec.sport.hockey: teams year nhl season players play games hockey team game\n",
"sci.crypt: people escrow use nsa keys government chip clipper encryption key\n",
"sci.electronics: electronics thanks used voltage does know like use circuit power\n",
"sci.med: skepticism cadre banks dsl n3jxp chastity pitt gordon geb msg\n",
"sci.space: just lunar like earth shuttle launch moon orbit nasa space\n",
"soc.religion.christian: faith believe christ christians people christian bible church jesus god\n",
"talk.politics.guns: right just government law fbi don weapons people guns gun\n",
"talk.politics.mideast: war said arab turkish armenians people armenian jews israeli israel\n",
"talk.politics.misc: state know clinton think president just tax don government people\n",
"talk.religion.misc: think know christian don morality kent objective people jesus god\n"
]
}
],
"source": [
"show_topK(em_nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by semisupervised EM NB classifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-3.16001007 -2.96389519 -2.95367364 -2.95367364 -2.97422231 -2.94691686\n",
" -2.96389519 -2.94691686 -2.94020542 -2.94020542 -2.93686652 -2.94355551\n",
" -2.95367364 -2.94691686 -2.95028954 -2.93686652 -3.0311772 -2.99874192\n",
" -3.18961054 -3.40420703] [-3.21130337 -2.98465718 -3.03853017 -2.9040767 -3.00229433 -2.93022198\n",
" -2.99874192 -2.94860178 -2.94523477 -2.99343687 -2.70840381 -2.94355551\n",
" -3.00585939 -2.97076807 -2.99167476 -2.80876652 -2.98291046 -2.98116678\n",
" -3.22900294 -3.61676847]\n"
]
}
],
"source": [
"print nb_clf.class_log_prior_, em_nb_clf.clf.class_log_prior_"
]
Expand Down
Loading

0 comments on commit 0fbfdf5

Please sign in to comment.