Skip to content

Commit

Permalink
update code of result comparison between original and semi-supervised…
Browse files Browse the repository at this point in the history
… EM NB classifier
  • Loading branch information
sli41 committed Apr 23, 2017
1 parent 5fd40f7 commit 0c4ca4f
Show file tree
Hide file tree
Showing 10 changed files with 755 additions and 484 deletions.
204 changes: 112 additions & 92 deletions code/.ipynb_checkpoints/EM_NB_text_classification_v1-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
"outputs": [],
"source": [
"# Convert all text data into tf-idf vectors\n",
"vectorizer = TfidfVectorizer()\n",
"vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)\n",
"train_vec = vectorizer.fit_transform(train_Xy.data)\n",
"test_vec = vectorizer.transform(test_Xy.data)"
]
Expand All @@ -59,14 +59,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(3394, 101631) (7920, 101631)\n"
"(2262, 26747) (9052, 26747)\n"
]
}
],
"source": [
"# Divide train data set into labeled and unlabeled data sets\n",
"n_train_data = train_vec.shape[0]\n",
"split_ratio = 0.3 # labeled vs unlabeled\n",
"split_ratio = 0.2 # fraction of the training set kept as labeled; the remainder is treated as unlabeled\n",
"X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio)\n",
"print X_l.shape, X_u.shape"
]
Expand All @@ -93,7 +93,8 @@
"# Train Naive Bayes classifier (imported) \n",
"# using only the labeled subset (X_l, y_l)\n",
"clf = MultinomialNB(alpha=1e-8)\n",
"clf.fit(X_l, y_l)"
"clf.fit(X_l, y_l)\n",
"# clf.fit(train_vec, train_Xy.target)"
]
},
{
Expand All @@ -109,28 +110,28 @@
"text": [
" precision recall f1-score support\n",
"\n",
" alt.atheism 0.42 0.39 0.41 319\n",
" comp.graphics 0.53 0.56 0.54 389\n",
" comp.os.ms-windows.misc 0.30 0.36 0.33 394\n",
"comp.sys.ibm.pc.hardware 0.49 0.62 0.55 392\n",
" comp.sys.mac.hardware 0.77 0.35 0.48 385\n",
" comp.windows.x 0.61 0.78 0.68 395\n",
" misc.forsale 0.68 0.62 0.65 390\n",
" rec.autos 0.80 0.43 0.56 396\n",
" rec.motorcycles 0.69 0.57 0.62 398\n",
" rec.sport.baseball 0.90 0.68 0.77 397\n",
" rec.sport.hockey 0.73 0.86 0.79 399\n",
" sci.crypt 0.55 0.70 0.61 396\n",
" sci.electronics 0.58 0.35 0.43 393\n",
" sci.med 0.66 0.69 0.68 396\n",
" sci.space 0.62 0.65 0.63 394\n",
" soc.religion.christian 0.57 0.70 0.62 398\n",
" talk.politics.guns 0.40 0.64 0.49 364\n",
" talk.politics.mideast 0.57 0.76 0.65 376\n",
" talk.politics.misc 0.46 0.36 0.40 310\n",
" talk.religion.misc 0.32 0.16 0.22 251\n",
" alt.atheism 0.45 0.31 0.37 319\n",
" comp.graphics 0.61 0.29 0.39 389\n",
" comp.os.ms-windows.misc 0.53 0.30 0.39 394\n",
"comp.sys.ibm.pc.hardware 0.54 0.51 0.53 392\n",
" comp.sys.mac.hardware 0.65 0.36 0.46 385\n",
" comp.windows.x 0.43 0.77 0.56 395\n",
" misc.forsale 0.73 0.51 0.60 390\n",
" rec.autos 0.43 0.70 0.53 396\n",
" rec.motorcycles 0.78 0.33 0.46 398\n",
" rec.sport.baseball 0.85 0.62 0.71 397\n",
" rec.sport.hockey 0.85 0.83 0.84 399\n",
" sci.crypt 0.39 0.72 0.50 396\n",
" sci.electronics 0.40 0.43 0.42 393\n",
" sci.med 0.62 0.64 0.63 396\n",
" sci.space 0.58 0.64 0.61 394\n",
" soc.religion.christian 0.59 0.69 0.63 398\n",
" talk.politics.guns 0.51 0.48 0.49 364\n",
" talk.politics.mideast 0.46 0.82 0.59 376\n",
" talk.politics.misc 0.42 0.29 0.34 310\n",
" talk.religion.misc 0.30 0.18 0.22 251\n",
"\n",
" avg / total 0.59 0.57 0.57 7532\n",
" avg / total 0.56 0.53 0.52 7532\n",
"\n"
]
}
Expand All @@ -152,46 +153,46 @@
"name": "stdout",
"output_type": "stream",
"text": [
"array([[126, 0, 9, 2, 2, 4, 1, 0, 5, 1, 6, 8, 1,\n",
" 6, 9, 69, 17, 24, 6, 23],\n",
" [ 5, 218, 31, 14, 7, 51, 4, 1, 0, 1, 3, 18, 3,\n",
" 7, 11, 4, 6, 3, 2, 0],\n",
" [ 4, 43, 141, 74, 1, 62, 10, 2, 5, 0, 2, 20, 0,\n",
" 6, 10, 2, 3, 3, 5, 1],\n",
" [ 0, 24, 32, 244, 10, 25, 21, 1, 2, 0, 1, 14, 9,\n",
" 6, 1, 0, 1, 0, 1, 0],\n",
" [ 0, 32, 38, 69, 133, 16, 20, 5, 0, 2, 3, 12, 16,\n",
" 7, 16, 2, 8, 1, 4, 1],\n",
" [ 1, 21, 18, 8, 3, 307, 4, 1, 0, 0, 2, 8, 4,\n",
" 6, 5, 0, 5, 1, 1, 0],\n",
" [ 0, 8, 13, 32, 12, 6, 241, 8, 4, 4, 15, 0, 9,\n",
" 6, 7, 1, 15, 4, 5, 0],\n",
" [ 3, 3, 28, 4, 0, 1, 13, 172, 54, 3, 5, 9, 19,\n",
" 7, 18, 3, 35, 6, 10, 3],\n",
" [ 6, 4, 15, 2, 1, 2, 11, 8, 226, 5, 13, 13, 14,\n",
" 8, 7, 3, 31, 12, 11, 6],\n",
" [ 5, 2, 17, 1, 1, 3, 0, 1, 2, 270, 44, 6, 1,\n",
" 7, 3, 7, 15, 5, 3, 4],\n",
" [ 6, 2, 10, 1, 0, 0, 1, 0, 2, 4, 342, 7, 1,\n",
" 1, 3, 3, 3, 8, 5, 0],\n",
" [ 3, 7, 22, 2, 0, 5, 3, 2, 3, 2, 6, 277, 4,\n",
" 3, 7, 4, 20, 12, 10, 4],\n",
" [ 2, 30, 16, 38, 1, 14, 19, 7, 13, 1, 4, 39, 136,\n",
" 23, 26, 1, 15, 5, 3, 0],\n",
" [ 10, 4, 15, 4, 0, 1, 1, 2, 3, 0, 2, 2, 5,\n",
" 273, 10, 12, 18, 20, 11, 3],\n",
" [ 7, 9, 19, 0, 1, 3, 3, 1, 3, 2, 3, 16, 6,\n",
" 14, 256, 4, 18, 19, 8, 2],\n",
" [ 34, 2, 15, 0, 0, 2, 1, 0, 1, 1, 2, 3, 0,\n",
" 3, 2, 278, 10, 16, 5, 23],\n",
" [ 7, 0, 12, 0, 1, 0, 1, 1, 2, 1, 5, 18, 2,\n",
" 6, 8, 7, 232, 31, 19, 11],\n",
" [ 17, 1, 6, 0, 0, 0, 0, 0, 0, 1, 2, 14, 0,\n",
" 1, 0, 20, 18, 285, 8, 3],\n",
" [ 16, 0, 9, 0, 0, 1, 1, 2, 1, 2, 5, 18, 2,\n",
" 12, 10, 7, 82, 26, 111, 5],\n",
" [ 46, 3, 7, 0, 0, 3, 0, 0, 3, 1, 4, 4, 1,\n",
" 10, 5, 65, 24, 21, 13, 41]])\n"
"array([[ 99, 1, 0, 2, 1, 1, 3, 11, 1, 1, 2, 14, 1,\n",
" 10, 18, 57, 8, 43, 19, 27],\n",
" [ 3, 113, 15, 17, 8, 110, 5, 7, 3, 2, 2, 51, 15,\n",
" 18, 10, 3, 1, 5, 1, 0],\n",
" [ 0, 9, 120, 52, 8, 118, 2, 16, 2, 1, 0, 26, 4,\n",
" 7, 16, 4, 1, 2, 5, 1],\n",
" [ 0, 7, 36, 201, 18, 42, 8, 12, 0, 0, 0, 19, 36,\n",
" 5, 4, 2, 0, 1, 1, 0],\n",
" [ 1, 10, 16, 44, 138, 33, 11, 23, 1, 0, 1, 30, 54,\n",
" 7, 7, 2, 0, 5, 2, 0],\n",
" [ 1, 14, 9, 7, 1, 306, 2, 8, 0, 4, 0, 26, 5,\n",
" 5, 6, 0, 1, 0, 0, 0],\n",
" [ 1, 7, 3, 30, 16, 15, 197, 30, 3, 0, 2, 14, 28,\n",
" 15, 15, 1, 3, 6, 3, 1],\n",
" [ 4, 1, 2, 0, 3, 7, 4, 277, 8, 5, 6, 14, 20,\n",
" 7, 12, 2, 7, 13, 4, 0],\n",
" [ 9, 3, 1, 0, 3, 5, 5, 82, 132, 7, 6, 27, 33,\n",
" 10, 10, 8, 14, 27, 12, 4],\n",
" [ 3, 1, 2, 0, 0, 7, 6, 23, 4, 245, 31, 14, 2,\n",
" 10, 11, 4, 8, 12, 9, 5],\n",
" [ 7, 0, 1, 2, 0, 2, 2, 10, 2, 9, 332, 6, 0,\n",
" 2, 2, 3, 4, 12, 2, 1],\n",
" [ 6, 2, 5, 1, 0, 7, 2, 20, 0, 2, 1, 285, 9,\n",
" 5, 7, 4, 9, 23, 5, 3],\n",
" [ 5, 7, 8, 14, 10, 27, 12, 21, 2, 2, 4, 66, 170,\n",
" 13, 16, 2, 4, 7, 2, 1],\n",
" [ 9, 1, 2, 1, 2, 3, 3, 24, 1, 1, 1, 14, 11,\n",
" 255, 17, 13, 11, 16, 9, 2],\n",
" [ 3, 5, 2, 1, 0, 9, 5, 19, 2, 2, 1, 17, 21,\n",
" 13, 252, 5, 5, 21, 10, 1],\n",
" [ 13, 2, 0, 0, 1, 4, 0, 14, 1, 2, 0, 8, 0,\n",
" 3, 6, 273, 4, 30, 8, 29],\n",
" [ 8, 2, 0, 0, 1, 3, 1, 19, 2, 3, 0, 37, 7,\n",
" 9, 3, 6, 174, 48, 21, 20],\n",
" [ 8, 0, 0, 0, 0, 2, 1, 7, 0, 2, 2, 17, 1,\n",
" 1, 1, 7, 8, 310, 7, 2],\n",
" [ 13, 0, 2, 0, 1, 1, 0, 8, 3, 0, 0, 35, 0,\n",
" 11, 12, 9, 65, 53, 91, 6],\n",
" [ 29, 1, 1, 1, 0, 3, 1, 9, 3, 1, 1, 14, 3,\n",
" 6, 11, 59, 16, 40, 8, 44]])\n"
]
}
],
Expand All @@ -210,7 +211,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.572092405736\n"
"0.532926181625\n"
]
}
],
Expand All @@ -220,29 +221,11 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<type 'numpy.ndarray'> <type 'numpy.ndarray'> <type 'fortran'>\n"
]
},
{
"data": {
"text/plain": [
"(20, 7920)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# from scipy.linalg import get_blas_funcs\n",
"# b_w_d = (X_u > 0).T.toarray()\n",
Expand All @@ -255,21 +238,58 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
"source": [
"# Print the K highest-weighted (most informative) features for each category\n",
"import numpy as np\n",
"def show_topK(classifier, vectorizer, categories, K=10):\n",
" feature_names = np.asarray(vectorizer.get_feature_names())\n",
" for i, category in enumerate(categories):\n",
" topK = np.argsort(classifier.coef_[i])[-K:]\n",
" print(\"%s: %s\" % (category, \" \".join(feature_names[topK])))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alt.atheism: talking islamic satan atheist evidence claim cheat does hillary moral freewill objective morality just people think islam don atheism god\n",
"comp.graphics: images unix code don software using video windows help looking format use know program 3d image files file thanks graphics\n",
"comp.os.ms-windows.misc: like hi drivers ms use using know microsoft problem program does mail card thanks ftp dos ax files file windows\n",
"comp.sys.ibm.pc.hardware: board irq just does computer bios 486 monitor cards isa card use pc drives thanks ide bus controller scsi drive\n",
"comp.sys.mac.hardware: mail disk lc cable fpu keyboard monitor speed problem serial c650 adb card know use thanks drive does mac apple\n",
"comp.windows.x: windows widgets mouse mit ncd error clients manager running use using display xterm does file hi thanks motif server window\n",
"misc.forsale: monitor best floppy 10 50 includes excellent asking used new sell drive price email 00 interested shipping condition offer sale\n",
"rec.autos: vw tires know autos auto price new honda don good like speed saturn just oil ford dealer engine cars car\n",
"rec.motorcycles: ll dwi right bmw really don fall know favorite think road riding honda like ride just motorcycle helmet dod bike\n",
"rec.sport.baseball: record phillies braves did stadium player baseball hit pitching just league think runs home year jewish team games game players\n",
"rec.sport.hockey: just captain traded playoffs teams pick pittsburgh pens playoff nhl roger games leafs year season players play game hockey team\n",
"sci.crypt: secret encrypted law number use don secure people algorithm crypto phone security government escrow nsa keys clipper encryption chip key\n",
"sci.electronics: detector switch want company audio circuit ground chips number line data output information like pin know does radar electronics use\n",
"sci.med: doctor pain surrender pitt banks skepticism intellect shameful geb cadre chastity n3jxp dsl know does gordon food medical patients msg\n",
"sci.space: astro launched know long don centaur rocket idea think program launch just like lunar earth satellite orbit moon nasa space\n",
"soc.religion.christian: truth reason christian heaven does satan christianity love believe sin think people church life christians christ faith bible jesus god\n",
"talk.politics.guns: used public don amendment waco know government just weapon time firearms law fbi weapons crime batf people right guns gun\n",
"talk.politics.mideast: going genocide turks rights like american war palestinians said killed arabs palestinian turkish people armenian arab armenians jews israeli israel\n",
"talk.politics.misc: rights let money gay like state just time make health clinton did think know government don jobs president tax people\n",
"talk.religion.misc: objective life frank values does lds good evidence children believe think know said people mormons don bible jesus christian god\n"
]
}
],
"source": [
"show_topK(clf, vectorizer, train_Xy.target_names, K=20)"
]
},
{
"cell_type": "code",
Expand Down
Loading

0 comments on commit 0c4ca4f

Please sign in to comment.