forked from ugik/notebooks
Commit 450d675 (1 parent: 34e0aa9)
ugik committed Sep 30, 2017
Showing 12 changed files with 1,284 additions and 60 deletions.
4 binary files not shown.
@@ -0,0 +1,271 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# things we need for NLP\n",
    "import nltk\n",
    "from nltk.stem.lancaster import LancasterStemmer\n",
    "stemmer = LancasterStemmer()\n",
    "\n",
    "# things we need for Tensorflow\n",
    "import numpy as np\n",
    "import tflearn\n",
    "import tensorflow as tf\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import our chat-bot intents file\n",
    "import json\n",
    "with open('intents_Rx.json') as json_data:\n",
    "    intents = json.load(json_data)"
   ]
  },
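  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A minimal illustrative cell (not part of the original commit):\n",
    "# intents_Rx.json itself is not shown in this diff, but the loop in the\n",
    "# next cell assumes a structure along these lines (hypothetical example):\n",
    "example_intents = {\n",
    "    \"intents\": [\n",
    "        {\"tag\": \"greeting\",\n",
    "         \"patterns\": [\"Hi\", \"How are you\", \"Is anyone there?\"],\n",
    "         \"responses\": [\"Hello, thanks for visiting\"]}\n",
    "    ]\n",
    "}"
   ]
  },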
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17 documents\n",
      "5 classes ['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n",
      "40 unique stemmed words [\"'m\", \"'s\", 'a', 'anyon', 'ar', 'buy', 'bye', 'can', 'cheap', 'cheapest', 'coupon', 'day', 'deal', 'find', 'for', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'i', 'is', 'lat', 'less', 'look', 'me', 'med', 'money', 'see', 'send', 'thank', 'that', 'the', 'ther', 'to', 'want', 'what', 'wher', 'you']\n"
     ]
    }
   ],
   "source": [
    "words = []\n",
    "classes = []\n",
    "documents = []\n",
    "ignore_words = ['?']\n",
    "# loop through each sentence in our intents patterns\n",
    "for intent in intents['intents']:\n",
    "    for pattern in intent['patterns']:\n",
    "        # tokenize each word in the sentence\n",
    "        w = nltk.word_tokenize(pattern)\n",
    "        # add to our words list\n",
    "        words.extend(w)\n",
    "        # add to documents in our corpus\n",
    "        documents.append((w, intent['tag']))\n",
    "        # add to our classes list\n",
    "        if intent['tag'] not in classes:\n",
    "            classes.append(intent['tag'])\n",
    "\n",
    "# stem and lower each word and remove duplicates\n",
    "words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n",
    "words = sorted(list(set(words)))\n",
    "\n",
    "# remove duplicates\n",
    "classes = sorted(list(set(classes)))\n",
    "\n",
    "print(len(documents), \"documents\")\n",
    "print(len(classes), \"classes\", classes)\n",
    "print(len(words), \"unique stemmed words\", words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# create our training data\n",
    "training = []\n",
    "output = []\n",
    "# create an empty array for our output\n",
    "output_empty = [0] * len(classes)\n",
    "\n",
    "# training set, bag of words for each sentence\n",
    "for doc in documents:\n",
    "    # initialize our bag of words\n",
    "    bag = []\n",
    "    # list of tokenized words for the pattern\n",
    "    pattern_words = doc[0]\n",
    "    # stem each word\n",
    "    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]\n",
    "    # create our bag of words array\n",
    "    for w in words:\n",
    "        bag.append(1 if w in pattern_words else 0)\n",
    "\n",
    "    # output is a '0' for each tag and '1' for current tag\n",
    "    output_row = list(output_empty)\n",
    "    output_row[classes.index(doc[1])] = 1\n",
    "\n",
    "    training.append([bag, output_row])\n",
    "\n",
    "# shuffle our features and turn into np.array\n",
    "random.shuffle(training)\n",
    "training = np.array(training)\n",
    "\n",
    "# create train and test lists\n",
    "train_x = list(training[:,0])\n",
    "train_y = list(training[:,1])"
   ]
  },
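  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A small sanity check added for illustration (not part of the original commit):\n",
    "# each training example is a bag-of-words vector over the stemmed vocabulary,\n",
    "# and each label is a one-hot vector over the intent classes.\n",
    "print(len(train_x), \"training examples\")\n",
    "print(len(train_x[0]), \"input features (one per stemmed word)\")\n",
    "print(len(train_y[0]), \"output classes\")"
   ]
  },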
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Step: 2999 | total loss: \u001b[1m\u001b[32m0.15651\u001b[0m\u001b[0m | time: 0.006s\n",
      "| Adam | epoch: 1000 | loss: 0.15651 - acc: 0.9794 -- iter: 16/17\n",
      "Training Step: 3000 | total loss: \u001b[1m\u001b[32m0.14101\u001b[0m\u001b[0m | time: 0.008s\n",
      "| Adam | epoch: 1000 | loss: 0.14101 - acc: 0.9815 -- iter: 17/17\n",
      "--\n",
      "INFO:tensorflow:/home/gk/gensim/notebooks/Rxmodel.tflearn is not in all_model_checkpoint_paths. Manually adding it.\n"
     ]
    }
   ],
   "source": [
    "# reset underlying graph data\n",
    "tf.reset_default_graph()\n",
    "# Build neural network\n",
    "net = tflearn.input_data(shape=[None, len(train_x[0])])\n",
    "net = tflearn.fully_connected(net, 8)\n",
    "net = tflearn.fully_connected(net, 8)\n",
    "net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')\n",
    "net = tflearn.regression(net)\n",
    "\n",
    "# Define model and setup tensorboard\n",
    "model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')\n",
    "# Start training (apply gradient descent algorithm)\n",
    "model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)\n",
    "model.save('Rxmodel.tflearn')"
   ]
  },
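  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A minimal sketch added for illustration (not part of the original commit):\n",
    "# the weights saved above can be restored into the same graph definition\n",
    "# in a later session with tflearn's load().\n",
    "model.load('Rxmodel.tflearn')"
   ]
  },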
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clean_up_sentence(sentence):\n",
    "    # tokenize the pattern\n",
    "    sentence_words = nltk.word_tokenize(sentence)\n",
    "    # stem each word\n",
    "    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]\n",
    "    return sentence_words\n",
    "\n",
    "# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence\n",
    "def bow(sentence, words, show_details=False):\n",
    "    # tokenize the pattern\n",
    "    sentence_words = clean_up_sentence(sentence)\n",
    "    # bag of words\n",
    "    bag = [0] * len(words)\n",
    "    for s in sentence_words:\n",
    "        for i, w in enumerate(words):\n",
    "            if w == s:\n",
    "                bag[i] = 1\n",
    "                if show_details:\n",
    "                    print(\"found in bag: %s\" % w)\n",
    "\n",
    "    return np.array(bag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0]\n",
      "['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n"
     ]
    }
   ],
   "source": [
    "p = bow(\"hello\", words)\n",
    "print(p)\n",
    "print(classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[4.3407872851730644e-09, 0.009914605878293514, 0.9880092740058899, 0.0020757599268108606, 3.3042027780538774e-07]]\n"
     ]
    }
   ],
   "source": [
    "print(model.predict([p]))"
   ]
  },
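  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A small sketch added for illustration (not part of the original commit):\n",
    "# predict() returns one probability per class, in the same order as `classes`,\n",
    "# so pairing and sorting them makes the prediction easier to read.\n",
    "probs = model.predict([p])[0]\n",
    "ranked = sorted(zip(classes, probs), key=lambda x: x[1], reverse=True)\n",
    "print(ranked)"
   ]
  },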
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# save all of our data structures\n",
    "import pickle\n",
    "pickle.dump({'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y}, open(\"Rx_training_data\", \"wb\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}