"""One-class anomaly detection on a small MNIST subset.

Script form of the notebook ``ML_anomaly_detection.ipynb``; each ``# %%``
marker below starts what was one notebook cell.  Three detectors are fitted on
images of the digit 4 (the normal class) and scored against a handful of
0/7/9 images (the anomalies): an Isolation Forest, a linear One-Class SVM and
a one-hidden-layer one-class neural network (OC-NN).
"""
# The commit that added the notebook also added a .gitignore listing
# .ipynb_checkpoints and .DS_Store.

# %%
import numpy as np

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, average_precision_score

import tensorflow as tf

import matplotlib.pyplot as plt

from keras.datasets import mnist

# %%
# Random state handed to the Isolation Forest.
rng = np.random.RandomState(42)


# %%
def prepare_mnist_mlfetch():
    """Build the normal / anomalous image sets from the MNIST training split.

    Returns:
        ``[normal, anomalies]`` where ``normal`` holds the first 220 images
        labelled 4 and ``anomalies`` holds the first 5 zeros, 3 sevens and
        3 nines (concatenated in that order).  Every image is flattened to a
        single row.
    """
    (images, labels), _ = mnist.load_data()

    normal = images[labels == 4][:220]
    anomalies = np.concatenate(
        (
            images[labels == 0][:5],
            images[labels == 7][:3],
            images[labels == 9][:3],
        ),
        axis=0,
    )

    # One row per image so the arrays can be fed to the estimators below.
    normal = np.reshape(normal, (len(normal), -1))
    anomalies = np.reshape(anomalies, (len(anomalies), -1))

    return [normal, anomalies]


# %%
def sklearn_IsolationForest(data_train, data_test):
    """Fit an Isolation Forest on ``data_train`` and score both sets.

    Args:
        data_train: samples the forest is fitted on.
        data_test: samples that are only scored.

    Returns:
        ``[pos_decisionScore, neg_decisionScore]``: the forest's
        ``decision_function`` values for ``data_train`` and ``data_test``
        (the lower, the more abnormal).
    """
    clf = IsolationForest(max_samples=50, random_state=rng)
    clf.fit(data_train)
    # decision_function, not predict: predict collapses every sample to a hard
    # +1/-1 label, which gives ROC/PR a single threshold and the histogram
    # two bars.
    pos_decisionScore = clf.decision_function(data_train)
    neg_decisionScore = clf.decision_function(data_test)

    return [pos_decisionScore, neg_decisionScore]


# %%
def sklearn_OCSVM_linear(data_train, data_test, nu):
    """Fit a linear-kernel One-Class SVM on ``data_train`` and score both sets.

    Args:
        data_train: samples the SVM is fitted on.
        data_test: samples that are only scored.
        nu: passed straight through to ``OneClassSVM``.

    Returns:
        ``[pos_decisionScore, neg_decisionScore]``: ``decision_function``
        values for ``data_train`` and ``data_test``.
    """
    ocSVM = OneClassSVM(nu=nu, kernel='linear')
    ocSVM.fit(data_train)
    pos_decisionScore = ocSVM.decision_function(data_train)
    neg_decisionScore = ocSVM.decision_function(data_test)

    return [pos_decisionScore, neg_decisionScore]


# %%
def au_roc(y_true, y_score):
    """Print and return ``roc_auc_score(y_true, y_score)``."""
    roc_score = roc_auc_score(y_true, y_score)
    print('ROC score: {0:0.4f}'.format(roc_score))

    return roc_score


# %%
def au_prc(y_true, y_score):
    """Print and return ``average_precision_score(y_true, y_score)``."""
    average_precision = average_precision_score(y_true, y_score)
    print('Average precision-recall score: {0:0.4f}'.format(average_precision))

    return average_precision


# %%
def plot_score_histograms(pos_scores, neg_scores, title):
    """Show overlaid 25-bin histograms of normal and anomaly scores."""
    plt.hist(pos_scores, bins=25, label='Normal')
    plt.hist(neg_scores, bins=25, label='Anomaly')
    plt.title(title)
    plt.legend(loc='upper right')
    plt.show()


# %%
def tf_OneClass_NN_sigmoid(data_train, data_test, nu=0.04, h_size=32,
                           n_epochs=30, learning_rate=0.0001):
    """Train a one-hidden-layer one-class network and score both sets.

    The network score is ``sigmoid(X @ w) @ V``.  Each epoch takes one
    full-batch gradient-descent step on

        0.5 * sum(w**2) + 0.5 * sum(V**2)
            + (1 / nu) * mean(max(0, r - score)) - r

    with ``r`` held fixed, then resets ``r`` to the ``100 * nu``-th percentile
    of the training scores.  Written against the TensorFlow 1.x graph/session
    API.

    Args:
        data_train: 2-D array the network is trained on.
        data_test: 2-D array with the same number of columns; only scored.
        nu: weight of the hinge term and the quantile used for ``r``.
        h_size: number of hidden units.
        n_epochs: number of gradient steps.
        learning_rate: step size of the gradient-descent optimizer.

    Returns:
        ``[pos_decisionScore, neg_decisionScore]``: network scores minus the
        final ``r`` for ``data_train`` and ``data_test``; each has one column
        and one row per sample.

    Raises:
        ValueError: if ``nu`` is not in ``(0, 1]``.
    """
    if not 0 < nu <= 1:
        raise ValueError("nu must be in (0, 1], got %r" % (nu,))

    RANDOM_SEED = 42

    tf.reset_default_graph()
    tf.set_random_seed(RANDOM_SEED)

    x_size = data_train.shape[1]  # number of input features
    y_size = 1                    # the network emits a single score

    def init_weights(shape):
        """Return a variable initialised from a narrow zero-mean normal."""
        return tf.Variable(tf.random_normal(shape, mean=0, stddev=0.00001))

    def nnScore(X, w, V):
        """Score of every row of ``X``: sigmoid hidden layer, linear output."""
        hidden = tf.sigmoid(tf.matmul(tf.cast(X, tf.float32), w))
        return tf.matmul(hidden, V)

    def ocnn_obj(scores, nu, w, V, r):
        """One-class objective for the given scores and fixed ``r``."""
        term1 = 0.5 * tf.reduce_sum(w ** 2)
        term2 = 0.5 * tf.reduce_sum(V ** 2)
        # Genuine hinge: only rows scoring below r are penalised.  An identity
        # here turns the term into a linear reward for ever larger scores.
        term3 = 1 / nu * tf.reduce_mean(tf.nn.relu(r - scores))
        term4 = -r

        return term1 + term2 + term3 + term4

    # Graph inputs: the data batch and the current value of r.  r is fed, not
    # learned, so it never receives a gradient update.
    X = tf.placeholder(tf.float32, shape=[None, x_size])
    r = tf.placeholder(tf.float32, shape=())

    w_1 = init_weights((x_size, h_size))
    w_2 = init_weights((h_size, y_size))

    # Built once and re-run through the placeholder, so the graph does not
    # grow with every epoch.
    scores = nnScore(X, w_1, w_2)
    cost = ocnn_obj(scores, nu, w_1, w_2, r)
    updates = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

    rvalue = 0.1
    # The context manager closes the session even if training raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(n_epochs):
            sess.run(updates, feed_dict={X: data_train, r: rvalue})
            train_scores = sess.run(scores, feed_dict={X: data_train})
            rvalue = np.percentile(train_scores, q=100 * nu)
            print("Epoch = %d, r = %f" % (epoch + 1, rvalue))

        arrayTrain = sess.run(scores, feed_dict={X: data_train})
        arrayTest = sess.run(scores, feed_dict={X: data_test})
    print("Session Closed!!!")

    rstar = rvalue
    pos_decisionScore = arrayTrain - rstar
    neg_decisionScore = arrayTest - rstar

    return [pos_decisionScore, neg_decisionScore]


# %%
# Load the data before anything that is sized from it.
[X_train, X_test] = prepare_mnist_mlfetch()

# %%
# Ground truth for the concatenated (train, test) scores:
# 1 for the normal rows, 0 for the anomalies.
y_true_pos = np.ones(X_train.shape[0])
y_true_neg = np.zeros(X_test.shape[0])
y_true = np.concatenate((y_true_pos, y_true_neg))

# %%
[y_scores_pos_IF, y_scores_neg_IF] = sklearn_IsolationForest(X_train, X_test)

y_score_IF = np.concatenate((y_scores_pos_IF, y_scores_neg_IF))

isolation_forest_roc = au_roc(y_true, y_score_IF)
isolation_forest_prc = au_prc(y_true, y_score_IF)

# %%
nu = 0.04

# %%
[y_scores_pos_OCSVM_linear, y_scores_neg_OCSVM_linear] = sklearn_OCSVM_linear(
    X_train, X_test, nu
)

y_score_OCSVM_linear = np.concatenate(
    (y_scores_pos_OCSVM_linear, y_scores_neg_OCSVM_linear)
)

OCSVM_linear_roc = au_roc(y_true, y_score_OCSVM_linear)
OCSVM_linear_prc = au_prc(y_true, y_score_OCSVM_linear)

# %%
plot_score_histograms(
    y_scores_pos_OCSVM_linear, y_scores_neg_OCSVM_linear, "ocsvm: linear"
)

# %%
plot_score_histograms(y_scores_pos_IF, y_scores_neg_IF, "Isolation forest")

# %%
[y_scores_pos_tf_OneClass_NN_sigmoid, y_scores_neg_tf_OneClass_NN_sigmoid] = (
    tf_OneClass_NN_sigmoid(X_train, X_test, nu=nu)
)

y_score_tf_OneClass_NN_sigmoid = np.concatenate(
    (y_scores_pos_tf_OneClass_NN_sigmoid, y_scores_neg_tf_OneClass_NN_sigmoid)
)

tf_OneClass_NN_sigmoid_roc = au_roc(y_true, y_score_tf_OneClass_NN_sigmoid)
tf_OneClass_NN_sigmoid_prc = au_prc(y_true, y_score_tf_OneClass_NN_sigmoid)

# %%
plot_score_histograms(
    y_scores_pos_tf_OneClass_NN_sigmoid,
    y_scores_neg_tf_OneClass_NN_sigmoid,
    "OneClass NN sigmoid",
)