diff --git a/code/.ipynb_checkpoints/EM_NB_text_classification_v2-checkpoint.ipynb b/code/.ipynb_checkpoints/EM_NB_text_classification_v2-checkpoint.ipynb index ce975be..8f50f76 100644 --- a/code/.ipynb_checkpoints/EM_NB_text_classification_v2-checkpoint.ipynb +++ b/code/.ipynb_checkpoints/EM_NB_text_classification_v2-checkpoint.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -60,7 +60,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(1131, 101631) (10183, 101631)\n" + "(3394, 101631) (7920, 101631)\n" ] } ], @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -92,9 +92,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "-12932118.6543\n", - "1\n" + "-8851790.55769\n", + "1\n", + "-6437159.6717\n", + "2\n", + "-6436536.34232\n", + "3\n", + "-6436334.42252\n", + "4\n", + "-6436331.61876\n", + "5\n", + "-6436331.61876\n" ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -106,11 +125,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " alt.atheism 0.61 0.18 0.28 319\n", + " comp.graphics 0.66 0.52 0.58 389\n", + " comp.os.ms-windows.misc 0.54 0.38 0.44 394\n", + "comp.sys.ibm.pc.hardware 0.53 0.53 0.53 392\n", + " comp.sys.mac.hardware 0.81 0.39 0.52 385\n", + " comp.windows.x 0.77 0.65 0.71 395\n", + " misc.forsale 0.74 0.57 0.65 390\n", + " rec.autos 0.76 0.55 0.64 396\n", + " rec.motorcycles 0.73 0.43 0.54 398\n", + " rec.sport.baseball 0.94 0.67 0.78 397\n", + " rec.sport.hockey 0.88 0.85 0.86 399\n", + " sci.crypt 0.23 0.85 0.36 396\n", + " sci.electronics 0.58 0.42 0.49 393\n", + " sci.med 0.67 0.73 0.70 396\n", + " sci.space 0.63 0.69 0.66 394\n", + " soc.religion.christian 0.48 0.82 0.61 398\n", + " talk.politics.guns 0.58 0.30 0.40 364\n", + " talk.politics.mideast 0.46 0.80 0.58 376\n", + " talk.politics.misc 0.54 0.30 0.39 310\n", + " talk.religion.misc 0.28 0.11 0.16 251\n", + "\n", + " avg / total 0.63 0.55 0.56 7532\n", + "\n" + ] + } + ], "source": [ "# Evaluate NB classifier using test data set\n", "pred = clf.predict(test_vec)\n", @@ -119,33 +170,99 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array([[ 59, 1, 0, 3, 0, 0, 0, 1, 3, 1, 3, 38, 1,\n", + " 3, 9, 117, 4, 44, 8, 24],\n", + " [ 0, 204, 23, 12, 3, 29, 4, 0, 2, 1, 0, 80, 9,\n", + " 5, 12, 2, 1, 1, 0, 1],\n", + " [ 1, 26, 149, 56, 2, 26, 6, 2, 2, 0, 0, 94, 3,\n", + " 8, 10, 4, 1, 2, 2, 0],\n", + " [ 0, 14, 40, 208, 11, 5, 16, 0, 1, 0, 2, 57, 32,\n", + " 1, 1, 0, 0, 1, 3, 0],\n", + " [ 1, 13, 26, 56, 149, 5, 18, 2, 0, 0, 2, 74, 21,\n", + " 6, 11, 1, 0, 0, 0, 0],\n", + " [ 0, 24, 13, 4, 1, 257, 3, 0, 0, 0, 0, 72, 2,\n", + " 9, 5, 2, 1, 2, 0, 0],\n", + " [ 1, 2, 8, 29, 12, 2, 224, 12, 6, 4, 5, 38, 12,\n", + " 7, 13, 2, 4, 8, 1, 0],\n", + " [ 1, 1, 0, 0, 1, 0, 10, 216, 25, 2, 2, 84, 13,\n", + " 6, 11, 5, 2, 12, 3, 2],\n", + " [ 2, 1, 2, 1, 0, 2, 6, 38, 173, 0, 5, 73, 14,\n", + " 14, 13, 6, 4, 32, 9, 3],\n", + " [ 2, 2, 0, 0, 2, 0, 3, 0, 2, 267, 21, 41, 0,\n", + " 16, 6, 13, 2, 15, 5, 0],\n", + " [ 0, 0, 1, 0, 0, 0, 1, 1, 2, 5, 339, 33, 0,\n", + " 1, 1, 4, 0, 6, 5, 0],\n", + " [ 0, 1, 7, 0, 0, 2, 0, 0, 0, 1, 1, 338, 6,\n", + " 4, 11, 2, 3, 14, 3, 3],\n", + " [ 1, 8, 4, 21, 2, 5, 10, 5, 9, 0, 0, 115, 167,\n", + " 19, 18, 1, 1, 6, 1, 0],\n", + " [ 1, 3, 2, 2, 0, 0, 1, 2, 1, 0, 0, 38, 4,\n", + " 289, 10, 19, 3, 14, 5, 2],\n", + " [ 0, 4, 1, 1, 0, 1, 0, 4, 0, 1, 1, 55, 5,\n", + " 12, 271, 6, 1, 22, 8, 1],\n", + " [ 3, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0,\n", + " 4, 2, 327, 1, 13, 4, 17],\n", + " [ 3, 0, 0, 1, 0, 0, 1, 0, 1, 0, 3, 109, 0,\n", + " 10, 9, 20, 110, 70, 14, 13],\n", + " [ 2, 0, 1, 0, 0, 0, 0, 0, 3, 1, 2, 36, 0,\n", + " 1, 2, 24, 0, 301, 3, 0],\n", + " [ 3, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 64, 0,\n", + " 8, 6, 22, 43, 63, 94, 3],\n", + " [ 16, 3, 0, 1, 0, 0, 0, 0, 5, 1, 1, 36, 1,\n", + " 7, 6, 98, 10, 33, 6, 27]])\n" + ] + } + ], "source": [ "pprint(metrics.confusion_matrix(test_Xy.target, pred))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.553505045141\n" + ] + } + ], "source": [ "print(metrics.accuracy_score(test_Xy.target, pred))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "scipy.sparse.csr.csr_matrix" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "b_w_d = X_u>0\n", "type(b_w_d)" @@ -153,22 +270,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "<7920x7920 sparse matrix of type ''\n", + "\twith 56642993 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "b_w_d.dot(b_w_d.transpose())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(6788,)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "np.concatenate((y_l, y_l), axis=0).shape" ] diff --git a/code/.ipynb_checkpoints/EM_NB_text_classification_v3-checkpoint.ipynb b/code/.ipynb_checkpoints/EM_NB_text_classification_v3-checkpoint.ipynb new file mode 100644 index 0000000..7068054 --- /dev/null +++ b/code/.ipynb_checkpoints/EM_NB_text_classification_v3-checkpoint.ipynb @@ -0,0 +1,34 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/code/EM_NB_text_classification_v3.ipynb b/code/EM_NB_text_classification_v3.ipynb new file mode 100644 index 0000000..7068054 --- /dev/null +++ b/code/EM_NB_text_classification_v3.ipynb @@ -0,0 +1,34 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}