Rx chat-bot
ugik committed Sep 30, 2017
1 parent 34e0aa9 commit 450d675
Showing 12 changed files with 1,284 additions and 60 deletions.
Binary file added Rx_training_data
Binary file not shown.
Binary file added Rxmodel.tflearn.data-00000-of-00001
Binary file not shown.
Binary file added Rxmodel.tflearn.index
Binary file not shown.
Binary file added Rxmodel.tflearn.meta
Binary file not shown.
271 changes: 271 additions & 0 deletions Tensorflow Rx chat-bot (build model).ipynb
@@ -0,0 +1,271 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# things we need for NLP\n",
"import nltk\n",
"from nltk.stem.lancaster import LancasterStemmer\n",
"stemmer = LancasterStemmer()\n",
"\n",
"# things we need for Tensorflow\n",
"import numpy as np\n",
"import tflearn\n",
"import tensorflow as tf\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# import our chat-bot intents file\n",
"import json\n",
"with open('intents_Rx.json') as json_data:\n",
" intents = json.load(json_data)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17 documents\n",
"5 classes ['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n",
"40 unique stemmed words [\"'m\", \"'s\", 'a', 'anyon', 'ar', 'buy', 'bye', 'can', 'cheap', 'cheapest', 'coupon', 'day', 'deal', 'find', 'for', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'i', 'is', 'lat', 'less', 'look', 'me', 'med', 'money', 'see', 'send', 'thank', 'that', 'the', 'ther', 'to', 'want', 'what', 'wher', 'you']\n"
]
}
],
"source": [
"words = []\n",
"classes = []\n",
"documents = []\n",
"ignore_words = ['?']\n",
"# loop through each sentence in our intents patterns\n",
"for intent in intents['intents']:\n",
" for pattern in intent['patterns']:\n",
" # tokenize each word in the sentence\n",
" w = nltk.word_tokenize(pattern)\n",
" # add to our words list\n",
" words.extend(w)\n",
" # add to documents in our corpus\n",
" documents.append((w, intent['tag']))\n",
" # add to our classes list\n",
" if intent['tag'] not in classes:\n",
" classes.append(intent['tag'])\n",
"\n",
"# stem and lower each word and remove duplicates\n",
"words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n",
"words = sorted(list(set(words)))\n",
"\n",
"# remove duplicates\n",
"classes = sorted(list(set(classes)))\n",
"\n",
"print (len(documents), \"documents\")\n",
"print (len(classes), \"classes\", classes)\n",
"print (len(words), \"unique stemmed words\", words)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# create our training data\n",
"training = []\n",
"output = []\n",
"# create an empty array for our output\n",
"output_empty = [0] * len(classes)\n",
"\n",
"# training set, bag of words for each sentence\n",
"for doc in documents:\n",
" # initialize our bag of words\n",
" bag = []\n",
" # list of tokenized words for the pattern\n",
" pattern_words = doc[0]\n",
" # stem each word\n",
" pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]\n",
" # create our bag of words array\n",
" for w in words:\n",
" bag.append(1) if w in pattern_words else bag.append(0)\n",
"\n",
" # output is '0' for each tag and '1' for the current tag\n",
" output_row = list(output_empty)\n",
" output_row[classes.index(doc[1])] = 1\n",
"\n",
" training.append([bag, output_row])\n",
"\n",
"# shuffle our features and turn into np.array\n",
"random.shuffle(training)\n",
"training = np.array(training)\n",
"\n",
"# create train and test lists\n",
"train_x = list(training[:,0])\n",
"train_y = list(training[:,1])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training Step: 2999 | total loss: \u001b[1m\u001b[32m0.15651\u001b[0m\u001b[0m | time: 0.006s\n",
"| Adam | epoch: 1000 | loss: 0.15651 - acc: 0.9794 -- iter: 16/17\n",
"Training Step: 3000 | total loss: \u001b[1m\u001b[32m0.14101\u001b[0m\u001b[0m | time: 0.008s\n",
"| Adam | epoch: 1000 | loss: 0.14101 - acc: 0.9815 -- iter: 17/17\n",
"--\n",
"INFO:tensorflow:/home/gk/gensim/notebooks/Rxmodel.tflearn is not in all_model_checkpoint_paths. Manually adding it.\n"
]
}
],
"source": [
"# reset underlying graph data\n",
"tf.reset_default_graph()\n",
"# Build neural network\n",
"net = tflearn.input_data(shape=[None, len(train_x[0])])\n",
"net = tflearn.fully_connected(net, 8)\n",
"net = tflearn.fully_connected(net, 8)\n",
"net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')\n",
"net = tflearn.regression(net)\n",
"\n",
"# Define model and setup tensorboard\n",
"model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')\n",
"# Start training (apply gradient descent algorithm)\n",
"model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)\n",
"model.save('Rxmodel.tflearn')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def clean_up_sentence(sentence):\n",
" # tokenize the pattern\n",
" sentence_words = nltk.word_tokenize(sentence)\n",
" # stem each word\n",
" sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]\n",
" return sentence_words\n",
"\n",
"# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence\n",
"def bow(sentence, words, show_details=False):\n",
" # tokenize the pattern\n",
" sentence_words = clean_up_sentence(sentence)\n",
" # bag of words\n",
" bag = [0]*len(words) \n",
" for s in sentence_words:\n",
" for i,w in enumerate(words):\n",
" if w == s: \n",
" bag[i] = 1\n",
" if show_details:\n",
" print (\"found in bag: %s\" % w)\n",
"\n",
" return(np.array(bag))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0]\n",
"['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n"
]
}
],
"source": [
"p = bow(\"hello\", words)\n",
"print (p)\n",
"print (classes)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[4.3407872851730644e-09, 0.009914605878293514, 0.9880092740058899, 0.0020757599268108606, 3.3042027780538774e-07]]\n"
]
}
],
"source": [
"print(model.predict([p]))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# save all of our data structures\n",
"import pickle\n",
"pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( \"Rx_training_data\", \"wb\" ) )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}