diff --git a/bigram_model.ipynb b/bigram_model.ipynb
new file mode 100644
index 0000000..2b58745
--- /dev/null
+++ b/bigram_model.ipynb
@@ -0,0 +1,596 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from pathlib import Path\n",
+ "import string\n",
+ "from functools import reduce\n",
+ "from math import log\n",
+ "import itertools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Enter smoothing or no smoothing.\n",
+ "smoothing = 1\n",
+ "filename = \"train_corpus.txt\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Loads file\n",
+ "# input - filename.txt \n",
+ "# returns a list of sentences seperated by newline in the main corpus/text. \n",
+ "def load_file(filename):\n",
+ " with open(filename) as f:\n",
+ " lines = [line.rstrip() for line in f]\n",
+ " print(\"No of sentences in Corpus: \"+str(len(lines)))\n",
+ " return lines"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenizes the sentences meaning split the sentences into words seperated by the \"white sapce\".\n",
+ "# input - List of sentences\n",
+ "# returns a list of lists of each sentence being tokenized.\n",
+ "def tokenize_sentence(lines):\n",
+ " lines = [i.strip(\"''\").split(\" \") for i in lines] \n",
+ " print(\"No of sentences in Corpus: \"+str(len(lines)))\n",
+ " return lines"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prepare the data for training the bigram model.\n",
+ "# remove punctuations -print(string.punctuation) ---- !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ----\n",
+ "# remove empty strings.\n",
+ "# lower case all the words\n",
+ "# add at the beginning and at the end of every sentence in the corpus.\n",
+ "# input - list of lists of words obtained from \"tokenize_sentence\" function.\n",
+ "# returns - list of lists\n",
+ "def prep_data(lines):\n",
+ " for i in range(len(lines)):\n",
+ " lines[i] = [''.join(c for c in s if c not in string.punctuation) for s in lines[i]] # remove punctuations\n",
+ " lines[i] = [s for s in lines[i] if s] # removes empty strings\n",
+ " lines[i] = [word.lower() for word in lines[i]] # lower case\n",
+ " lines[i] += [''] # Append at the end of each sentence in the corpus\n",
+ " lines[i].insert(0, '') # Append at the beginning of each sentence in the corpus\n",
+ " print(\"No of sentences in Corpus: \"+str(len(lines)))\n",
+ " return lines"
+ ]
+ },
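+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A quick sanity check of the preprocessing on a toy sentence (not from the corpus):\n",
+ "# punctuation is stripped, words are lower-cased, and <s>/</s> pad the sentence.\n",
+ "demo = tokenize_sentence([\"Call me Ishmael .\"])\n",
+ "prep_data(demo)"
+ ]
+ },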
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No of sentences in Corpus: 10059\n",
+ "No of sentences in Corpus: 10059\n"
+ ]
+ }
+ ],
+ "source": [
+ "dataset = load_file(filename)\n",
+ "dataset = tokenize_sentence(dataset)\n",
+ "dataset = prep_data(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the vocabulary file of the dataset.\n",
+ "def vocabulary(dataset):\n",
+ " dataset_vocab = set(itertools.chain.from_iterable(dataset))\n",
+ " # remove and from the vocabulary of the dataset\n",
+ " dataset_vocab.remove('')\n",
+ " dataset_vocab.remove('')\n",
+ " dataset_vocab = list(dataset_vocab)\n",
+ " dataset_vocab.append('')\n",
+ " dataset_vocab.append('')\n",
+ " return dataset_vocab\n",
+ "\n",
+ "dataset_vocab = vocabulary(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "17141"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(dataset_vocab)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Counts the no. of times a word repeats (frequency of each word) in the corpus.\n",
+ "# input - list of lists of words obtained from \"prep_data\"\n",
+ "# returns - a dictionary defined as {word:frequency} for words of the corpus including and .\n",
+ "def freq_of_unique_words(lines):\n",
+ " bag_of_words = list(itertools.chain.from_iterable(lines)) # change the nested list to one single list\n",
+ " corpus_word_count = 0 # No of words in the corpus excluding and .\n",
+ " #count the no. of times a word repeats in the corpus\n",
+ " count = {}\n",
+ " for word in bag_of_words:\n",
+ " if word in count :\n",
+ " count[word] += 1\n",
+ " else:\n",
+ " count[word] = 1\n",
+ " if word != '' and word != '':\n",
+ " corpus_word_count +=1\n",
+ " \n",
+ " unique_word_count = len(count) - 2 # number of unique words in the corpus excluding and \n",
+ " \n",
+ " #print(\"!!! IT IS EXCLUDING AND !!!\")\n",
+ " print(\"No of unique words in corpus : \"+ str(unique_word_count))\n",
+ " print(\"No of words in corpus: \"+ str(corpus_word_count))\n",
+ " \n",
+ " return count"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No of unique words in corpus : 17139\n",
+ "No of words in corpus: 218619\n"
+ ]
+ }
+ ],
+ "source": [
+ "unique_word_frequency = freq_of_unique_words(dataset)\n",
+ "#len(unique_word_frequency)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Computes the bigram frequncies\n",
+ "# Bigram frequncies means the number of times a word appears after a given word in the corpus.\n",
+ "# inputs:\n",
+ "# lines - list of lists obtained from \"prep_data\".\n",
+ "# count - dictionary obtained from \"freq_of_unique_words\".\n",
+ "# returns - dictionary of bigram frequencies {(word|given word): count(word|given word)} --- count(word|given word)~int.\n",
+ "def compute_bigram_frequencies(lines):\n",
+ " bigram_frequencies = dict() \n",
+ " #unique_bigrams = set()\n",
+ " for sentence in lines:\n",
+ " given_word = None\n",
+ " for word in sentence:\n",
+ " if given_word != None:\n",
+ " bigram_frequencies[(given_word, word)] = bigram_frequencies.get((given_word, word),0) + 1\n",
+ "# if(previous_word!='' and word!=''):\n",
+ "# unique_bigrams.add((previous_word,word))\n",
+ " given_word = word\n",
+ " #The number of bigram_frquencies in the corpus \n",
+ " #print(len(bigram_frequencies))\n",
+ " return bigram_frequencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bigram_frequencies = compute_bigram_frequencies(dataset)\n",
+ "#print(bigram_frequencies)\n",
+ "bigram_unique_word_count = len(unique_word_frequency)\n",
+ "# print(\"\\n\"+\"No of words in bigram: \"+str(bigram_unique_word_count))"
+ ]
+ },
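+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Peek at the ten most frequent bigrams as a quick sanity check of the counts\n",
+ "# (sorting the full dict is assumed to be cheap at this corpus size).\n",
+ "sorted(bigram_frequencies.items(), key=lambda kv: kv[1], reverse=True)[:10]"
+ ]
+ },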
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Calculating bigram probability\n",
+ "# bigram probability means P(word|given word) = count(word|given word)/ count(given word).\n",
+ "# if count(word|given word) or count(given word) is 0 then probability is 0.\n",
+ "# input bigram_frquencies and count obtained from \"freq_of_unique_words\".\n",
+ "# returns dictionary of bigram probabilities {(word|given word): probabilty} --- probability is a float value.\n",
+ "def compute_bigram_probabilities(bigram_frequencies,count):\n",
+ " bigram_probabilities = dict() \n",
+ " for key in bigram_frequencies:\n",
+ " numerator = bigram_frequencies.get(key)\n",
+ " denominator = count.get(key[0]) # count.get(key[0]) will get the frequency of \"given word\" in the corpus.\n",
+ " if (numerator ==0 or denominator==0):\n",
+ " bigram_probabilities[key] = 0\n",
+ " else:\n",
+ " bigram_probabilities[key] = float(numerator)/float(denominator)\n",
+ " return bigram_probabilities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "bigram_probabilities = compute_bigram_probabilities(bigram_frequencies,unique_word_frequency)\n",
+ "#bigram_probabilities"
+ ]
+ },
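+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check: in an unsmoothed bigram model the probabilities out of any given\n",
+ "# word should sum to 1, since every token except a sentence-final </s> is\n",
+ "# followed by exactly one token. ('the' is assumed to occur in the corpus.)\n",
+ "print(sum(p for (w1, _), p in bigram_probabilities.items() if w1 == 'the'))"
+ ]
+ },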
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bigram frequncies of the test sentence computed using the bigram frequencies of the training data.\n",
+ "# add-one smoothing if 1, no smoothing if 0 ----- smoothing\n",
+ "def compute_bigram_count_test_sentence(given_word,word,smoothing):\n",
+ " if smoothing==0:\n",
+ " return 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))\n",
+ " elif smoothing == 1:\n",
+ " return 1 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))+1"
+ ]
+ },
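+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Spot-check one bigram with and without add-one smoothing; the smoothed count\n",
+ "# is simply the raw count plus one. ('of the' is assumed to occur in the corpus.)\n",
+ "print(compute_bigram_count_test_sentence('of', 'the', 0),\n",
+ "      compute_bigram_count_test_sentence('of', 'the', 1))"
+ ]
+ },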
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A table showing the bigram counts for test sentence.\n",
+ "def print_bigram_freq_test_sentence(test_sentence_vocab,smoothing):\n",
+ " print(\"A table showing the bigram counts for test sentence.\"+\"\\nsmoothing =\"+str(smoothing))\n",
+ " print(\"\\t\\t\\t\", end=\"\")\n",
+ " for word in test_sentence_vocab:\n",
+ " if word != '':\n",
+ " print(word, end=\"\\t\\t\")\n",
+ " print(\"\")\n",
+ " for given_word in test_sentence_vocab:\n",
+ " if given_word != '':\n",
+ " if(smoothing==1):\n",
+ " print(unique_word_frequency.get(given_word)+bigram_unique_word_count, end =\"\\t\")\n",
+ " elif(smoothing==0):\n",
+ " print(unique_word_frequency.get(given_word), end =\"\\t\")\n",
+ " print(given_word, end=\"\\t\\t\")\n",
+ " for word in test_sentence_vocab:\n",
+ " if word !='':\n",
+ " print(\"{0:}\".format(compute_bigram_count_test_sentence(given_word,word,smoothing)), end=\"\\t\\t\")\n",
+ " print(\"\")\n",
+ " print(\"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bigram probabilities of the test sentence computed using the bigram probabilities of the training data.\n",
+ "# add-one smoothing if 1, no smoothing if 0 ---- smoothing\n",
+ "def compute_bigram_prob_test_sentence(given_word,word,smoothing):\n",
+ " bigram_freq = 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))\n",
+ " uni_freq = 0 if unique_word_frequency.get((given_word))==None else unique_word_frequency.get((given_word))\n",
+ " if smoothing==0:\n",
+ " return 0 if bigram_probabilities.get((given_word,word))==None else bigram_probabilities.get((given_word,word))\n",
+ " elif smoothing == 1:\n",
+ " numerator = bigram_freq+1\n",
+ " denominator = uni_freq+bigram_unique_word_count\n",
+ " return 0.0 if numerator == 0 or denominator == 0 else float(numerator) / float(denominator)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A table showing the bigram probabilities for test sentence.\n",
+ "def print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing):\n",
+ " print(\"A table showing the bigram probabilities for test sentence\"+\"\\nsmoothing =\"+str(smoothing))\n",
+ " print(\"\\t\\t\", end=\"\")\n",
+ " for word in test_sentence_vocab:\n",
+ " if word != '':\n",
+ " print(word, end=\"\\t\\t\")\n",
+ " print(\"\")\n",
+ " for given_word in test_sentence_vocab:\n",
+ " if given_word != '':\n",
+ " print(given_word, end=\"\\t\\t\")\n",
+ " for word in test_sentence_vocab:\n",
+ " if word !='':\n",
+ " print(\"{0:.5f}\".format(compute_bigram_prob_test_sentence(given_word,word,smoothing)), end=\"\\t\\t\")\n",
+ " print(\"\")\n",
+ " print(\"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Print the probability of the test sentence\n",
+ "# for add-one smoothing if 1, no smoothing if 0\n",
+ "def compute_prob_test_sentence(sentence,smoothing):\n",
+ " test_sent_prob = 0\n",
+ " \n",
+ " if(smoothing == 0):\n",
+ " given_word = None\n",
+ " for word in sentence:\n",
+ " if given_word!=None:\n",
+ " if bigram_probabilities.get((given_word,word))==0 or bigram_probabilities.get((given_word,word))== None:\n",
+ " return 0\n",
+ " else:\n",
+ " test_sent_prob+=log((bigram_probabilities.get((given_word,word),0)),10)\n",
+ " given_word = word\n",
+ " \n",
+ " elif(smoothing ==1):\n",
+ " given_word = None\n",
+ " for word in sentence:\n",
+ " if given_word!=None:\n",
+ " bigram_freq = 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))\n",
+ " uni_freq = 0 if unique_word_frequency.get((given_word))==None else unique_word_frequency.get((given_word))\n",
+ " numerator = bigram_freq+1\n",
+ " denominator = uni_freq+bigram_unique_word_count\n",
+ " probability = 0 if numerator==0 or denominator ==0 else float(numerator)/float(denominator)\n",
+ " if(probability==0):\n",
+ " return 0\n",
+ " test_sent_prob +=log(probability,10)\n",
+ " given_word = word\n",
+ " \n",
+ " return 10**test_sent_prob"
+ ]
+ },
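+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A small extension (a sketch, not part of the original assignment): per-word\n",
+ "# perplexity PP = P(sentence)^(-1/N), where N is the number of bigram\n",
+ "# transitions in the padded sentence. Only meaningful when the probability is\n",
+ "# non-zero, i.e. with add-one smoothing enabled.\n",
+ "def perplexity(sentence, smoothing):\n",
+ "    prob = compute_prob_test_sentence(sentence, smoothing)\n",
+ "    n = len(sentence) - 1 # bigram transitions\n",
+ "    return float('inf') if prob == 0 else prob ** (-1.0 / n)"
+ ]
+ },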
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Test sentence here\n",
+ "test_sentences = [['upon this the captain started , and eagerly desired to know more .'],['thus , because no man can follow another into these halls .']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "!!!!!!!!!!The test Sentence is!!!!!!!!!!\n",
+ "['upon this the captain started , and eagerly desired to know more .']\n",
+ "A table showing the bigram counts for test sentence.\n",
+ "smoothing =1\n",
+ "\t\t\tand\t\teagerly\t\tthis\t\tto\t\tdesired\t\tupon\t\tthe\t\tstarted\t\tcaptain\t\tknow\t\tmore\t\t\t\t\n",
+ "23571\tand\t\t1\t\t3\t\t45\t\t31\t\t1\t\t3\t\t371\t\t2\t\t6\t\t2\t\t46\t\t1\t\t\n",
+ "17149\teagerly\t\t2\t\t1\t\t1\t\t1\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t\n",
+ "18535\tthis\t\t10\t\t1\t\t2\t\t8\t\t1\t\t1\t\t25\t\t1\t\t2\t\t1\t\t1\t\t26\t\t\n",
+ "21766\tto\t\t24\t\t1\t\t51\t\t2\t\t1\t\t2\t\t730\t\t1\t\t3\t\t16\t\t2\t\t27\t\t\n",
+ "17152\tdesired\t\t1\t\t1\t\t1\t\t3\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t\n",
+ "17707\tupon\t\t1\t\t1\t\t24\t\t1\t\t1\t\t1\t\t221\t\t1\t\t2\t\t1\t\t1\t\t5\t\t\n",
+ "31572\tthe\t\t1\t\t1\t\t1\t\t1\t\t4\t\t1\t\t1\t\t1\t\t98\t\t1\t\t51\t\t2\t\t\n",
+ "17168\tstarted\t\t2\t\t1\t\t1\t\t6\t\t1\t\t1\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t\n",
+ "17470\tcaptain\t\t12\t\t1\t\t3\t\t9\t\t1\t\t2\t\t4\t\t2\t\t1\t\t1\t\t1\t\t11\t\t\n",
+ "17293\tknow\t\t3\t\t1\t\t1\t\t2\t\t1\t\t1\t\t9\t\t1\t\t2\t\t1\t\t2\t\t8\t\t\n",
+ "17649\tmore\t\t25\t\t1\t\t1\t\t12\t\t1\t\t3\t\t15\t\t2\t\t1\t\t1\t\t1\t\t26\t\t\n",
+ "27200\t\t\t409\t\t1\t\t116\t\t61\t\t1\t\t27\t\t630\t\t2\t\t23\t\t3\t\t8\t\t27\t\t\n",
+ "\n",
+ "A table showing the bigram probabilities for test sentence\n",
+ "smoothing =1\n",
+ "\t\tand\t\teagerly\t\tthis\t\tto\t\tdesired\t\tupon\t\tthe\t\tstarted\t\tcaptain\t\tknow\t\tmore\t\t\t\t\n",
+ "and\t\t0.00004\t\t0.00013\t\t0.00191\t\t0.00132\t\t0.00004\t\t0.00013\t\t0.01574\t\t0.00008\t\t0.00025\t\t0.00008\t\t0.00195\t\t0.00004\t\t\n",
+ "eagerly\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n",
+ "this\t\t0.00054\t\t0.00005\t\t0.00011\t\t0.00043\t\t0.00005\t\t0.00005\t\t0.00135\t\t0.00005\t\t0.00011\t\t0.00005\t\t0.00005\t\t0.00140\t\t\n",
+ "to\t\t0.00110\t\t0.00005\t\t0.00234\t\t0.00009\t\t0.00005\t\t0.00009\t\t0.03354\t\t0.00005\t\t0.00014\t\t0.00074\t\t0.00009\t\t0.00124\t\t\n",
+ "desired\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n",
+ "upon\t\t0.00006\t\t0.00006\t\t0.00136\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.01248\t\t0.00006\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00028\t\t\n",
+ "the\t\t0.00003\t\t0.00003\t\t0.00003\t\t0.00003\t\t0.00013\t\t0.00003\t\t0.00003\t\t0.00003\t\t0.00310\t\t0.00003\t\t0.00162\t\t0.00006\t\t\n",
+ "started\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00035\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n",
+ "captain\t\t0.00069\t\t0.00006\t\t0.00017\t\t0.00052\t\t0.00006\t\t0.00011\t\t0.00023\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00063\t\t\n",
+ "know\t\t0.00017\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00052\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00012\t\t0.00046\t\t\n",
+ "more\t\t0.00142\t\t0.00006\t\t0.00006\t\t0.00068\t\t0.00006\t\t0.00017\t\t0.00085\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00147\t\t\n",
+ "\t\t0.01504\t\t0.00004\t\t0.00426\t\t0.00224\t\t0.00004\t\t0.00099\t\t0.02316\t\t0.00007\t\t0.00085\t\t0.00011\t\t0.00029\t\t0.00099\t\t\n",
+ "\n",
+ "The probability of the sentence under the trained model\n",
+ "smoothing =1\n",
+ "1.01571679763017e-23\n",
+ "!!!!!!!!!!The test Sentence is!!!!!!!!!!\n",
+ "['thus , because no man can follow another into these halls .']\n",
+ "A table showing the bigram counts for test sentence.\n",
+ "smoothing =1\n",
+ "\t\t\thalls\t\tcan\t\tanother\t\tno\t\tthese\t\tthus\t\tbecause\t\tman\t\tinto\t\tfollow\t\t\t\t\n",
+ "17144\thalls\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t\n",
+ "17377\tcan\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t3\t\t10\t\t\n",
+ "17256\tanother\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t1\t\t4\t\t\n",
+ "17727\tno\t\t1\t\t1\t\t1\t\t19\t\t1\t\t1\t\t2\t\t10\t\t1\t\t1\t\t18\t\t\n",
+ "17544\tthese\t\t2\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t3\t\t\n",
+ "17274\tthus\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t1\t\t1\t\t1\t\t2\t\t\n",
+ "17233\tbecause\t\t1\t\t1\t\t1\t\t3\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t\n",
+ "17668\tman\t\t1\t\t6\t\t1\t\t1\t\t3\t\t2\t\t1\t\t2\t\t3\t\t1\t\t63\t\t\n",
+ "17664\tinto\t\t1\t\t1\t\t1\t\t1\t\t4\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t\n",
+ "17161\tfollow\t\t1\t\t1\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t6\t\t\n",
+ "27200\t\t\t1\t\t17\t\t8\t\t102\t\t22\t\t27\t\t19\t\t12\t\t3\t\t1\t\t27\t\t\n",
+ "\n",
+ "A table showing the bigram probabilities for test sentence\n",
+ "smoothing =1\n",
+ "\t\thalls\t\tcan\t\tanother\t\tno\t\tthese\t\tthus\t\tbecause\t\tman\t\tinto\t\tfollow\t\t\t\t\n",
+ "halls\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t\n",
+ "can\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00058\t\t\n",
+ "another\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00023\t\t\n",
+ "no\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00107\t\t0.00006\t\t0.00006\t\t0.00011\t\t0.00056\t\t0.00006\t\t0.00006\t\t0.00102\t\t\n",
+ "these\t\t0.00011\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t\n",
+ "thus\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t\n",
+ "because\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n",
+ "man\t\t0.00006\t\t0.00034\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00011\t\t0.00006\t\t0.00011\t\t0.00017\t\t0.00006\t\t0.00357\t\t\n",
+ "into\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00023\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00011\t\t\n",
+ "follow\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00035\t\t\n",
+ "\t\t0.00004\t\t0.00063\t\t0.00029\t\t0.00375\t\t0.00081\t\t0.00099\t\t0.00070\t\t0.00044\t\t0.00011\t\t0.00004\t\t0.00099\t\t\n",
+ "\n",
+ "The probability of the sentence under the trained model\n",
+ "smoothing =1\n",
+ "1.0762093761487005e-21\n"
+ ]
+ }
+ ],
+ "source": [
+ "for i in range (len(test_sentences)):\n",
+ " test_sentence = test_sentences[i]\n",
+ " print(\"!!!!!!!!!!The test Sentence is!!!!!!!!!!\")\n",
+ " print(test_sentence)\n",
+ " test_sentence = tokenize_sentence(test_sentence)\n",
+ " test_sentence = prep_data(test_sentence)\n",
+ "\n",
+ " # Vocabulary of test sentence\n",
+ " test_sentence_vocab = vocabulary(test_sentence)\n",
+ "\n",
+ " test_sentence = list(itertools.chain.from_iterable(test_sentence))\n",
+ " #test_sentence\n",
+ "\n",
+ " # A table showing the bigram counts for test sentence.\n",
+ " print_bigram_freq_test_sentence(test_sentence_vocab,smoothing)\n",
+ "\n",
+ " # A table showing the bigram probabilities for test sentence.\n",
+ " print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing)\n",
+ "\n",
+ " # The probability of the sentence under the trained model\n",
+ " print(\"The probability of the sentence under the trained model\"+\"\\nsmoothing =\"+str(smoothing))\n",
+ " print(compute_prob_test_sentence(test_sentence,0))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/results_no_smoothing.txt b/results_no_smoothing.txt
new file mode 100644
index 0000000..e4552df
--- /dev/null
+++ b/results_no_smoothing.txt
@@ -0,0 +1,72 @@
+!!!!!!!!!!The test Sentence is!!!!!!!!!!
+['upon this the captain started , and eagerly desired to know more .']
+A table showing the bigram counts for test sentence.
+smoothing =0
+ know this and more to upon the desired started eagerly captain </s>
+152 know 0 0 2 1 1 0 8 0 0 0 1 7
+1394 this 0 1 9 0 7 0 24 0 0 0 1 25
+6430 and 1 44 0 45 30 2 370 0 1 2 5 0
+508 more 0 0 24 0 11 2 14 0 1 0 0 25
+4625 to 15 50 23 1 1 1 729 0 0 0 2 26
+566 upon 0 23 0 0 0 0 220 0 0 0 1 4
+14431 the 0 0 0 50 0 0 0 3 0 0 97 1
+11 desired 0 0 0 0 2 0 0 0 0 0 0 0
+27 started 0 0 1 0 5 0 1 0 0 0 0 0
+8 eagerly 0 0 1 0 0 0 0 1 0 0 0 0
+329 captain 0 2 11 0 8 1 3 0 1 0 0 10
+10059 <s> 2 115 408 7 60 26 629 0 1 0 22 26
+
+A table showing the bigram probabilities for test sentence
+smoothing =0
+ know this and more to upon the desired started eagerly captain </s>
+know 0.00000 0.00000 0.01316 0.00658 0.00658 0.00000 0.05263 0.00000 0.00000 0.00000 0.00658 0.04605
+this 0.00000 0.00072 0.00646 0.00000 0.00502 0.00000 0.01722 0.00000 0.00000 0.00000 0.00072 0.01793
+and 0.00016 0.00684 0.00000 0.00700 0.00467 0.00031 0.05754 0.00000 0.00016 0.00031 0.00078 0.00000
+more 0.00000 0.00000 0.04724 0.00000 0.02165 0.00394 0.02756 0.00000 0.00197 0.00000 0.00000 0.04921
+to 0.00324 0.01081 0.00497 0.00022 0.00022 0.00022 0.15762 0.00000 0.00000 0.00000 0.00043 0.00562
+upon 0.00000 0.04064 0.00000 0.00000 0.00000 0.00000 0.38869 0.00000 0.00000 0.00000 0.00177 0.00707
+the 0.00000 0.00000 0.00000 0.00346 0.00000 0.00000 0.00000 0.00021 0.00000 0.00000 0.00672 0.00007
+desired 0.00000 0.00000 0.00000 0.00000 0.18182 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
+started 0.00000 0.00000 0.03704 0.00000 0.18519 0.00000 0.03704 0.00000 0.00000 0.00000 0.00000 0.00000
+eagerly 0.00000 0.00000 0.12500 0.00000 0.00000 0.00000 0.00000 0.12500 0.00000 0.00000 0.00000 0.00000
+captain 0.00000 0.00608 0.03343 0.00000 0.02432 0.00304 0.00912 0.00000 0.00304 0.00000 0.00000 0.03040
+<s> 0.00020 0.01143 0.04056 0.00070 0.00596 0.00258 0.06253 0.00000 0.00010 0.00000 0.00219 0.00258
+
+The probability of the sentence under the trained model
+smoothing =0
+1.01571679763017e-23
+!!!!!!!!!!The test Sentence is!!!!!!!!!!
+['thus , because no man can follow another into these halls .']
+A table showing the bigram counts for test sentence.
+smoothing =0
+ halls man these because can follow into thus no another </s>
+3 halls 0 0 0 0 0 0 0 0 0 0 1
+527 man 0 1 2 0 5 0 2 1 0 0 62
+403 these 1 0 0 0 1 0 0 0 0 0 2
+92 because 0 0 1 0 0 0 0 0 2 0 0
+236 can 0 0 0 0 0 2 0 0 0 0 9
+20 follow 0 0 0 0 0 0 0 0 0 1 5
+523 into 0 0 3 0 0 0 0 0 0 0 1
+133 thus 0 0 0 1 0 0 0 0 0 0 1
+586 no 0 9 0 1 0 0 0 0 18 0 17
+115 another 0 0 0 0 0 0 1 0 0 0 3
+10059 <s> 0 11 21 18 16 0 2 26 101 7 26
+
+A table showing the bigram probabilities for test sentence
+smoothing =0
+ halls man these because can follow into thus no another </s>
+halls 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.33333
+man 0.00000 0.00190 0.00380 0.00000 0.00949 0.00000 0.00380 0.00190 0.00000 0.00000 0.11765
+these 0.00248 0.00000 0.00000 0.00000 0.00248 0.00000 0.00000 0.00000 0.00000 0.00000 0.00496
+because 0.00000 0.00000 0.01087 0.00000 0.00000 0.00000 0.00000 0.00000 0.02174 0.00000 0.00000
+can 0.00000 0.00000 0.00000 0.00000 0.00000 0.00847 0.00000 0.00000 0.00000 0.00000 0.03814
+follow 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.05000 0.25000
+into 0.00000 0.00000 0.00574 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00191
+thus 0.00000 0.00000 0.00000 0.00752 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00752
+no 0.00000 0.01536 0.00000 0.00171 0.00000 0.00000 0.00000 0.00000 0.03072 0.00000 0.02901
+another 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00870 0.00000 0.00000 0.00000 0.02609
+<s> 0.00000 0.00109 0.00209 0.00179 0.00159 0.00000 0.00020 0.00258 0.01004 0.00070 0.00258
+
+The probability of the sentence under the trained model
+smoothing =0
+1.0762093761487005e-21
diff --git a/results_add_one_smoothing.txt b/results_add_one_smoothing.txt
new file mode 100644
index 0000000..0d8cc52
--- /dev/null
+++ b/results_add_one_smoothing.txt
@@ -0,0 +1,72 @@
+!!!!!!!!!!The test Sentence is!!!!!!!!!!
+['upon this the captain started , and eagerly desired to know more .']
+A table showing the bigram counts for test sentence.
+smoothing =1
+ started desired more eagerly to captain this and know the upon </s>
+17168 started 1 1 1 1 6 1 1 2 1 2 1 1
+17152 desired 1 1 1 1 3 1 1 1 1 1 1 1
+17649 more 2 1 1 1 12 1 1 25 1 15 3 26
+17149 eagerly 1 2 1 1 1 1 1 2 1 1 1 1
+21766 to 1 1 2 1 2 3 51 24 16 730 2 27
+17470 captain 2 1 1 1 9 1 3 12 1 4 2 11
+18535 this 1 1 1 1 8 2 2 10 1 25 1 26
+23571 and 2 1 46 3 31 6 45 1 2 371 3 1
+17293 know 1 1 2 1 2 2 1 3 1 9 1 8
+31572 the 1 4 51 1 1 98 1 1 1 1 1 2
+17707 upon 1 1 1 1 1 2 24 1 1 221 1 5
+27200 <s> 2 1 8 1 61 23 116 409 3 630 27 27
+
+A table showing the bigram probabilities for test sentence
+smoothing =1
+ started desired more eagerly to captain this and know the upon </s>
+started 0.00006 0.00006 0.00006 0.00006 0.00035 0.00006 0.00006 0.00012 0.00006 0.00012 0.00006 0.00006
+desired 0.00006 0.00006 0.00006 0.00006 0.00017 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006
+more 0.00011 0.00006 0.00006 0.00006 0.00068 0.00006 0.00006 0.00142 0.00006 0.00085 0.00017 0.00147
+eagerly 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006
+to 0.00005 0.00005 0.00009 0.00005 0.00009 0.00014 0.00234 0.00110 0.00074 0.03354 0.00009 0.00124
+captain 0.00011 0.00006 0.00006 0.00006 0.00052 0.00006 0.00017 0.00069 0.00006 0.00023 0.00011 0.00063
+this 0.00005 0.00005 0.00005 0.00005 0.00043 0.00011 0.00011 0.00054 0.00005 0.00135 0.00005 0.00140
+and 0.00008 0.00004 0.00195 0.00013 0.00132 0.00025 0.00191 0.00004 0.00008 0.01574 0.00013 0.00004
+know 0.00006 0.00006 0.00012 0.00006 0.00012 0.00012 0.00006 0.00017 0.00006 0.00052 0.00006 0.00046
+the 0.00003 0.00013 0.00162 0.00003 0.00003 0.00310 0.00003 0.00003 0.00003 0.00003 0.00003 0.00006
+upon 0.00006 0.00006 0.00006 0.00006 0.00006 0.00011 0.00136 0.00006 0.00006 0.01248 0.00006 0.00028
+<s> 0.00007 0.00004 0.00029 0.00004 0.00224 0.00085 0.00426 0.01504 0.00011 0.02316 0.00099 0.00099
+
+The probability of the sentence under the trained model
+smoothing =1
+1.01571679763017e-23
+!!!!!!!!!!The test Sentence is!!!!!!!!!!
+['thus , because no man can follow another into these halls .']
+A table showing the bigram counts for test sentence.
+smoothing =1
+ thus because into these halls follow no can another man </s>
+17274 thus 1 2 1 1 1 1 1 1 1 1 2
+17233 because 1 1 1 2 1 1 3 1 1 1 1
+17664 into 1 1 1 4 1 1 1 1 1 1 2
+17544 these 1 1 1 1 2 1 1 2 1 1 3
+17144 halls 1 1 1 1 1 1 1 1 1 1 2
+17161 follow 1 1 1 1 1 1 1 1 2 1 6
+17727 no 1 2 1 1 1 1 19 1 1 10 18
+17377 can 1 1 1 1 1 3 1 1 1 1 10
+17256 another 1 1 2 1 1 1 1 1 1 1 4
+17668 man 2 1 3 3 1 1 1 6 1 2 63
+27200 <s> 27 19 3 22 1 1 102 17 8 12 27
+
+A table showing the bigram probabilities for test sentence
+smoothing =1
+ thus because into these halls follow no can another man </s>
+thus 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012
+because 0.00006 0.00006 0.00006 0.00012 0.00006 0.00006 0.00017 0.00006 0.00006 0.00006 0.00006
+into 0.00006 0.00006 0.00006 0.00023 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00011
+these 0.00006 0.00006 0.00006 0.00006 0.00011 0.00006 0.00006 0.00011 0.00006 0.00006 0.00017
+halls 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012
+follow 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012 0.00006 0.00035
+no 0.00006 0.00011 0.00006 0.00006 0.00006 0.00006 0.00107 0.00006 0.00006 0.00056 0.00102
+can 0.00006 0.00006 0.00006 0.00006 0.00006 0.00017 0.00006 0.00006 0.00006 0.00006 0.00058
+another 0.00006 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00023
+man 0.00011 0.00006 0.00017 0.00017 0.00006 0.00006 0.00006 0.00034 0.00006 0.00011 0.00357
+<s> 0.00099 0.00070 0.00011 0.00081 0.00004 0.00004 0.00375 0.00063 0.00029 0.00044 0.00099
+
+The probability of the sentence under the trained model
+smoothing =1
+1.0762093761487005e-21