forked from ugik/notebooks
Commit 450d675 (1 parent: 34e0aa9)
ugik committed Sep 30, 2017
Showing 12 changed files with 1,284 additions and 60 deletions.
4 binary files not shown.
@@ -0,0 +1,271 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# things we need for NLP\n",
    "import nltk\n",
    "from nltk.stem.lancaster import LancasterStemmer\n",
    "stemmer = LancasterStemmer()\n",
    "\n",
    "# things we need for Tensorflow\n",
    "import numpy as np\n",
    "import tflearn\n",
    "import tensorflow as tf\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import our chat-bot intents file\n",
    "import json\n",
    "with open('intents_Rx.json') as json_data:\n",
    "    intents = json.load(json_data)"
   ]
  },
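  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A minimal illustrative cell (not part of the original commit):\n",
    "# intents_Rx.json itself is not shown in this diff, but the loop in the\n",
    "# next cell assumes a structure along these lines (hypothetical example):\n",
    "example_intents = {\n",
    "    \"intents\": [\n",
    "        {\"tag\": \"greeting\",\n",
    "         \"patterns\": [\"Hi\", \"How are you\", \"Is anyone there?\"],\n",
    "         \"responses\": [\"Hello, thanks for visiting\"]}\n",
    "    ]\n",
    "}"
   ]
  },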
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17 documents\n",
      "5 classes ['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n",
      "40 unique stemmed words [\"'m\", \"'s\", 'a', 'anyon', 'ar', 'buy', 'bye', 'can', 'cheap', 'cheapest', 'coupon', 'day', 'deal', 'find', 'for', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'i', 'is', 'lat', 'less', 'look', 'me', 'med', 'money', 'see', 'send', 'thank', 'that', 'the', 'ther', 'to', 'want', 'what', 'wher', 'you']\n"
     ]
    }
   ],
   "source": [
    "words = []\n",
    "classes = []\n",
    "documents = []\n",
    "ignore_words = ['?']\n",
    "# loop through each sentence in our intents patterns\n",
    "for intent in intents['intents']:\n",
    "    for pattern in intent['patterns']:\n",
    "        # tokenize each word in the sentence\n",
    "        w = nltk.word_tokenize(pattern)\n",
    "        # add to our words list\n",
    "        words.extend(w)\n",
    "        # add to documents in our corpus\n",
    "        documents.append((w, intent['tag']))\n",
    "        # add to our classes list\n",
    "        if intent['tag'] not in classes:\n",
    "            classes.append(intent['tag'])\n",
    "\n",
    "# stem and lower each word and remove duplicates\n",
    "words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n",
    "words = sorted(list(set(words)))\n",
    "\n",
    "# remove duplicates\n",
    "classes = sorted(list(set(classes)))\n",
    "\n",
    "print(len(documents), \"documents\")\n",
    "print(len(classes), \"classes\", classes)\n",
    "print(len(words), \"unique stemmed words\", words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# create our training data\n",
    "training = []\n",
    "output = []\n",
    "# create an empty array for our output\n",
    "output_empty = [0] * len(classes)\n",
    "\n",
    "# training set, bag of words for each sentence\n",
    "for doc in documents:\n",
    "    # initialize our bag of words\n",
    "    bag = []\n",
    "    # list of tokenized words for the pattern\n",
    "    pattern_words = doc[0]\n",
    "    # stem each word\n",
    "    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]\n",
    "    # create our bag of words array\n",
    "    for w in words:\n",
    "        bag.append(1 if w in pattern_words else 0)\n",
    "\n",
    "    # output is a '0' for each tag and '1' for current tag\n",
    "    output_row = list(output_empty)\n",
    "    output_row[classes.index(doc[1])] = 1\n",
    "\n",
    "    training.append([bag, output_row])\n",
    "\n",
    "# shuffle our features and turn into np.array\n",
    "random.shuffle(training)\n",
    "training = np.array(training)\n",
    "\n",
    "# create train and test lists\n",
    "train_x = list(training[:,0])\n",
    "train_y = list(training[:,1])"
   ]
  },
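  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A small sanity check added for illustration (not part of the original commit):\n",
    "# each training example is a bag-of-words vector over the stemmed vocabulary,\n",
    "# and each label is a one-hot vector over the intent classes.\n",
    "print(len(train_x), \"training examples\")\n",
    "print(len(train_x[0]), \"input features (one per stemmed word)\")\n",
    "print(len(train_y[0]), \"output classes\")"
   ]
  },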
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Step: 2999 | total loss: \u001b[1m\u001b[32m0.15651\u001b[0m\u001b[0m | time: 0.006s\n",
      "| Adam | epoch: 1000 | loss: 0.15651 - acc: 0.9794 -- iter: 16/17\n",
      "Training Step: 3000 | total loss: \u001b[1m\u001b[32m0.14101\u001b[0m\u001b[0m | time: 0.008s\n",
      "| Adam | epoch: 1000 | loss: 0.14101 - acc: 0.9815 -- iter: 17/17\n",
      "--\n",
      "INFO:tensorflow:/home/gk/gensim/notebooks/Rxmodel.tflearn is not in all_model_checkpoint_paths. Manually adding it.\n"
     ]
    }
   ],
   "source": [
    "# reset underlying graph data\n",
    "tf.reset_default_graph()\n",
    "# Build neural network\n",
    "net = tflearn.input_data(shape=[None, len(train_x[0])])\n",
    "net = tflearn.fully_connected(net, 8)\n",
    "net = tflearn.fully_connected(net, 8)\n",
    "net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')\n",
    "net = tflearn.regression(net)\n",
    "\n",
    "# Define model and setup tensorboard\n",
    "model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')\n",
    "# Start training (apply gradient descent algorithm)\n",
    "model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)\n",
    "model.save('Rxmodel.tflearn')"
   ]
  },
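  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A minimal sketch added for illustration (not part of the original commit):\n",
    "# the weights saved above can be restored into the same graph definition\n",
    "# in a later session with tflearn's load().\n",
    "model.load('Rxmodel.tflearn')"
   ]
  },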
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clean_up_sentence(sentence):\n",
    "    # tokenize the pattern\n",
    "    sentence_words = nltk.word_tokenize(sentence)\n",
    "    # stem each word\n",
    "    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]\n",
    "    return sentence_words\n",
    "\n",
    "# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence\n",
    "def bow(sentence, words, show_details=False):\n",
    "    # tokenize the pattern\n",
    "    sentence_words = clean_up_sentence(sentence)\n",
    "    # bag of words\n",
    "    bag = [0] * len(words)\n",
    "    for s in sentence_words:\n",
    "        for i, w in enumerate(words):\n",
    "            if w == s:\n",
    "                bag[i] = 1\n",
    "                if show_details:\n",
    "                    print(\"found in bag: %s\" % w)\n",
    "\n",
    "    return np.array(bag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0]\n",
      "['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n"
     ]
    }
   ],
   "source": [
    "p = bow(\"hello\", words)\n",
    "print(p)\n",
    "print(classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[4.3407872851730644e-09, 0.009914605878293514, 0.9880092740058899, 0.0020757599268108606, 3.3042027780538774e-07]]\n"
     ]
    }
   ],
   "source": [
    "print(model.predict([p]))"
   ]
  },
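  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A small sketch added for illustration (not part of the original commit):\n",
    "# predict() returns one probability per class, in the same order as `classes`,\n",
    "# so pairing and sorting them makes the prediction easier to read.\n",
    "probs = model.predict([p])[0]\n",
    "ranked = sorted(zip(classes, probs), key=lambda x: x[1], reverse=True)\n",
    "print(ranked)"
   ]
  },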
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# save all of our data structures\n",
    "import pickle\n",
    "pickle.dump({'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y}, open(\"Rx_training_data\", \"wb\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}