Skip to content

Commit

Permalink
set up code file for text preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
sli41 committed Apr 23, 2017
1 parent b8da887 commit 5fd40f7
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 17 deletions.
174 changes: 157 additions & 17 deletions code/.ipynb_checkpoints/EM_NB_text_classification_v2-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"collapsed": false
},
Expand All @@ -60,7 +60,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(1131, 101631) (10183, 101631)\n"
"(3394, 101631) (7920, 101631)\n"
]
}
],
Expand All @@ -83,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"collapsed": false
},
Expand All @@ -92,9 +92,28 @@
"name": "stdout",
"output_type": "stream",
"text": [
"-12932118.6543\n",
"1\n"
"-8851790.55769\n",
"1\n",
"-6437159.6717\n",
"2\n",
"-6436536.34232\n",
"3\n",
"-6436334.42252\n",
"4\n",
"-6436331.61876\n",
"5\n",
"-6436331.61876\n"
]
},
{
"data": {
"text/plain": [
"<Semi_EM_NB.Semi_EM_MultinomialNB instance at 0x7f919684e440>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
Expand All @@ -106,11 +125,43 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" alt.atheism 0.61 0.18 0.28 319\n",
" comp.graphics 0.66 0.52 0.58 389\n",
" comp.os.ms-windows.misc 0.54 0.38 0.44 394\n",
"comp.sys.ibm.pc.hardware 0.53 0.53 0.53 392\n",
" comp.sys.mac.hardware 0.81 0.39 0.52 385\n",
" comp.windows.x 0.77 0.65 0.71 395\n",
" misc.forsale 0.74 0.57 0.65 390\n",
" rec.autos 0.76 0.55 0.64 396\n",
" rec.motorcycles 0.73 0.43 0.54 398\n",
" rec.sport.baseball 0.94 0.67 0.78 397\n",
" rec.sport.hockey 0.88 0.85 0.86 399\n",
" sci.crypt 0.23 0.85 0.36 396\n",
" sci.electronics 0.58 0.42 0.49 393\n",
" sci.med 0.67 0.73 0.70 396\n",
" sci.space 0.63 0.69 0.66 394\n",
" soc.religion.christian 0.48 0.82 0.61 398\n",
" talk.politics.guns 0.58 0.30 0.40 364\n",
" talk.politics.mideast 0.46 0.80 0.58 376\n",
" talk.politics.misc 0.54 0.30 0.39 310\n",
" talk.religion.misc 0.28 0.11 0.16 251\n",
"\n",
" avg / total 0.63 0.55 0.56 7532\n",
"\n"
]
}
],
"source": [
"# Evaluate NB classifier using test data set\n",
"pred = clf.predict(test_vec)\n",
Expand All @@ -119,56 +170,145 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"array([[ 59, 1, 0, 3, 0, 0, 0, 1, 3, 1, 3, 38, 1,\n",
" 3, 9, 117, 4, 44, 8, 24],\n",
" [ 0, 204, 23, 12, 3, 29, 4, 0, 2, 1, 0, 80, 9,\n",
" 5, 12, 2, 1, 1, 0, 1],\n",
" [ 1, 26, 149, 56, 2, 26, 6, 2, 2, 0, 0, 94, 3,\n",
" 8, 10, 4, 1, 2, 2, 0],\n",
" [ 0, 14, 40, 208, 11, 5, 16, 0, 1, 0, 2, 57, 32,\n",
" 1, 1, 0, 0, 1, 3, 0],\n",
" [ 1, 13, 26, 56, 149, 5, 18, 2, 0, 0, 2, 74, 21,\n",
" 6, 11, 1, 0, 0, 0, 0],\n",
" [ 0, 24, 13, 4, 1, 257, 3, 0, 0, 0, 0, 72, 2,\n",
" 9, 5, 2, 1, 2, 0, 0],\n",
" [ 1, 2, 8, 29, 12, 2, 224, 12, 6, 4, 5, 38, 12,\n",
" 7, 13, 2, 4, 8, 1, 0],\n",
" [ 1, 1, 0, 0, 1, 0, 10, 216, 25, 2, 2, 84, 13,\n",
" 6, 11, 5, 2, 12, 3, 2],\n",
" [ 2, 1, 2, 1, 0, 2, 6, 38, 173, 0, 5, 73, 14,\n",
" 14, 13, 6, 4, 32, 9, 3],\n",
" [ 2, 2, 0, 0, 2, 0, 3, 0, 2, 267, 21, 41, 0,\n",
" 16, 6, 13, 2, 15, 5, 0],\n",
" [ 0, 0, 1, 0, 0, 0, 1, 1, 2, 5, 339, 33, 0,\n",
" 1, 1, 4, 0, 6, 5, 0],\n",
" [ 0, 1, 7, 0, 0, 2, 0, 0, 0, 1, 1, 338, 6,\n",
" 4, 11, 2, 3, 14, 3, 3],\n",
" [ 1, 8, 4, 21, 2, 5, 10, 5, 9, 0, 0, 115, 167,\n",
" 19, 18, 1, 1, 6, 1, 0],\n",
" [ 1, 3, 2, 2, 0, 0, 1, 2, 1, 0, 0, 38, 4,\n",
" 289, 10, 19, 3, 14, 5, 2],\n",
" [ 0, 4, 1, 1, 0, 1, 0, 4, 0, 1, 1, 55, 5,\n",
" 12, 271, 6, 1, 22, 8, 1],\n",
" [ 3, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0,\n",
" 4, 2, 327, 1, 13, 4, 17],\n",
" [ 3, 0, 0, 1, 0, 0, 1, 0, 1, 0, 3, 109, 0,\n",
" 10, 9, 20, 110, 70, 14, 13],\n",
" [ 2, 0, 1, 0, 0, 0, 0, 0, 3, 1, 2, 36, 0,\n",
" 1, 2, 24, 0, 301, 3, 0],\n",
" [ 3, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 64, 0,\n",
" 8, 6, 22, 43, 63, 94, 3],\n",
" [ 16, 3, 0, 1, 0, 0, 0, 0, 5, 1, 1, 36, 1,\n",
" 7, 6, 98, 10, 33, 6, 27]])\n"
]
}
],
"source": [
"pprint(metrics.confusion_matrix(test_Xy.target, pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.553505045141\n"
]
}
],
"source": [
"print(metrics.accuracy_score(test_Xy.target, pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"scipy.sparse.csr.csr_matrix"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b_w_d = X_u>0\n",
"type(b_w_d)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"<7920x7920 sparse matrix of type '<type 'numpy.bool_'>'\n",
"\twith 56642993 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b_w_d.dot(b_w_d.transpose())"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(6788,)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.concatenate((y_l, y_l), axis=0).shape"
]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
34 changes: 34 additions & 0 deletions code/EM_NB_text_classification_v3.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

0 comments on commit 5fd40f7

Please sign in to comment.