diff --git a/bigram_model.ipynb b/bigram_model.ipynb
new file mode 100644
index 0000000..2b58745
--- /dev/null
+++ b/bigram_model.ipynb
@@ -0,0 +1,533 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from pathlib import Path\n",
+    "import string\n",
+    "from functools import reduce\n",
+    "from math import log\n",
+    "import itertools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1 = add-one smoothing, 0 = no smoothing.\n",
+    "smoothing = 1\n",
+    "filename = \"train_corpus.txt\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loads the file.\n",
+    "# input - filename.txt\n",
+    "# returns a list of sentences, one per newline in the main corpus/text.\n",
+    "def load_file(filename):\n",
+    "    with open(filename) as f:\n",
+    "        lines = [line.rstrip() for line in f]\n",
+    "    print(\"No of sentences in Corpus: \"+str(len(lines)))\n",
+    "    return lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tokenizes the sentences, i.e. splits each sentence into words on white space.\n",
+    "# input - list of sentences\n",
+    "# returns a list of lists, one per tokenized sentence.\n",
+    "def tokenize_sentence(lines):\n",
+    "    lines = [i.strip(\"''\").split(\" \") for i in lines]\n",
+    "    print(\"No of sentences in Corpus: \"+str(len(lines)))\n",
+    "    return lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prepares the data for training the bigram model:\n",
+    "# remove punctuation - print(string.punctuation) ---- !\"#$%&'()*+,-./:;<=>?@[\\\\]^_`{|}~ ----\n",
+    "# remove empty strings,\n",
+    "# lower-case all the words,\n",
+    "# add <s> at the beginning and </s> at the end of every sentence in the corpus.\n",
+    "# input - list of lists of words obtained from \"tokenize_sentence\".\n",
+    "# returns - list of lists\n",
+    "def prep_data(lines):\n",
+    "    for i in range(len(lines)):\n",
+    "        lines[i] = [''.join(c for c in s if c not in string.punctuation) for s in lines[i]] # remove punctuation\n",
+    "        lines[i] = [s for s in lines[i] if s] # remove empty strings\n",
+    "        lines[i] = [word.lower() for word in lines[i]] # lower case\n",
+    "        lines[i] += ['</s>'] # Append </s> at the end of each sentence in the corpus\n",
+    "        lines[i].insert(0, '<s>') # Insert <s> at the beginning of each sentence in the corpus\n",
+    "    print(\"No of sentences in Corpus: \"+str(len(lines)))\n",
+    "    return lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No of sentences in Corpus: 10059\n",
+      "No of sentences in Corpus: 10059\n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset = load_file(filename)\n",
+    "dataset = tokenize_sentence(dataset)\n",
+    "dataset = prep_data(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Builds the vocabulary of the dataset, with <s> and </s> placed at the end.\n",
+    "def vocabulary(dataset):\n",
+    "    dataset_vocab = set(itertools.chain.from_iterable(dataset))\n",
+    "    # remove <s> and </s> from the vocabulary of the dataset\n",
+    "    dataset_vocab.remove('<s>')\n",
+    "    dataset_vocab.remove('</s>')\n",
+    "    dataset_vocab = list(dataset_vocab)\n",
+    "    dataset_vocab.append('<s>')\n",
+    "    dataset_vocab.append('</s>')\n",
+    "    return dataset_vocab\n",
+    "\n",
+    "dataset_vocab = vocabulary(dataset)"
+   ]
+  },
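+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sanity check (a sketch, not part of the original run): trace one\n",
+    "# made-up sentence through the pipeline above. tokenize_sentence and prep_data\n",
+    "# both take a list of sentences, so a one-sentence corpus is a one-element list.\n",
+    "sample = [\"Upon this, the captain started!\"]\n",
+    "sample = tokenize_sentence(sample)  # [['Upon', 'this,', 'the', 'captain', 'started!']]\n",
+    "sample = prep_data(sample)          # [['<s>', 'upon', 'this', 'the', 'captain', 'started', '</s>']]\n",
+    "print(sample)"
+   ]
+  },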
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "17141"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(dataset_vocab)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Counts the number of times each word occurs (its frequency) in the corpus.\n",
+    "# input - list of lists of words obtained from \"prep_data\"\n",
+    "# returns - a dictionary {word: frequency} over the words of the corpus, including <s> and </s>.\n",
+    "def freq_of_unique_words(lines):\n",
+    "    bag_of_words = list(itertools.chain.from_iterable(lines)) # flatten the nested list into one single list\n",
+    "    corpus_word_count = 0 # no of words in the corpus excluding <s> and </s>\n",
+    "    # count the no. of times a word repeats in the corpus\n",
+    "    count = {}\n",
+    "    for word in bag_of_words:\n",
+    "        if word in count:\n",
+    "            count[word] += 1\n",
+    "        else:\n",
+    "            count[word] = 1\n",
+    "        if word != '<s>' and word != '</s>':\n",
+    "            corpus_word_count += 1\n",
+    "    \n",
+    "    unique_word_count = len(count) - 2 # number of unique words in the corpus excluding <s> and </s>\n",
+    "    \n",
+    "    print(\"No of unique words in corpus : \"+ str(unique_word_count))\n",
+    "    print(\"No of words in corpus: \"+ str(corpus_word_count))\n",
+    "    \n",
+    "    return count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No of unique words in corpus : 17139\n",
+      "No of words in corpus: 218619\n"
+     ]
+    }
+   ],
+   "source": [
+    "unique_word_frequency = freq_of_unique_words(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Computes the bigram frequencies, i.e. the number of times each word appears\n",
+    "# after a given word in the corpus.\n",
+    "# input - lines - list of lists obtained from \"prep_data\".\n",
+    "# returns - dictionary of bigram frequencies {(given word, word): count(given word, word)} --- the count is an int.\n",
+    "def compute_bigram_frequencies(lines):\n",
+    "    bigram_frequencies = dict()\n",
+    "    for sentence in lines:\n",
+    "        given_word = None\n",
+    "        for word in sentence:\n",
+    "            if given_word != None:\n",
+    "                bigram_frequencies[(given_word, word)] = bigram_frequencies.get((given_word, word),0) + 1\n",
+    "            given_word = word\n",
+    "    return bigram_frequencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bigram_frequencies = compute_bigram_frequencies(dataset)\n",
+    "# vocabulary size V, used as the add-one smoothing denominator term below\n",
+    "bigram_unique_word_count = len(unique_word_frequency)"
+   ]
+  },
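+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch (not part of the original run): peek at the five most\n",
+    "# frequent bigrams as a sanity check on compute_bigram_frequencies. Plain\n",
+    "# Python sorting over the dict items, nothing model-specific.\n",
+    "top5 = sorted(bigram_frequencies.items(), key=lambda kv: kv[1], reverse=True)[:5]\n",
+    "for bigram, freq in top5:\n",
+    "    print(bigram, freq)"
+   ]
+  },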
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculates the bigram probabilities:\n",
+    "# P(word | given word) = count(given word, word) / count(given word).\n",
+    "# If count(given word, word) or count(given word) is 0, the probability is 0.\n",
+    "# inputs - bigram_frequencies and the count dictionary obtained from \"freq_of_unique_words\".\n",
+    "# returns - dictionary of bigram probabilities {(given word, word): probability} --- probability is a float.\n",
+    "def compute_bigram_probabilities(bigram_frequencies,count):\n",
+    "    bigram_probabilities = dict()\n",
+    "    for key in bigram_frequencies:\n",
+    "        numerator = bigram_frequencies.get(key)\n",
+    "        denominator = count.get(key[0]) # count.get(key[0]) is the frequency of the \"given word\" in the corpus\n",
+    "        if (numerator == 0 or denominator == 0):\n",
+    "            bigram_probabilities[key] = 0\n",
+    "        else:\n",
+    "            bigram_probabilities[key] = float(numerator)/float(denominator)\n",
+    "    return bigram_probabilities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "bigram_probabilities = compute_bigram_probabilities(bigram_frequencies,unique_word_frequency)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bigram count of (given_word, word) for the test sentence, looked up in the\n",
+    "# bigram frequencies of the training data.\n",
+    "# smoothing --- add-one smoothing if 1, no smoothing if 0\n",
+    "def compute_bigram_count_test_sentence(given_word,word,smoothing):\n",
+    "    if smoothing==0:\n",
+    "        return 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))\n",
+    "    elif smoothing == 1:\n",
+    "        return 1 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))+1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prints a table of the bigram counts for the test sentence.\n",
+    "# <s> is skipped as a column (nothing precedes it) and </s> as a row (nothing follows it).\n",
+    "def print_bigram_freq_test_sentence(test_sentence_vocab,smoothing):\n",
+    "    print(\"A table showing the bigram counts for test sentence.\"+\"\\nsmoothing =\"+str(smoothing))\n",
+    "    print(\"\\t\\t\\t\", end=\"\")\n",
+    "    for word in test_sentence_vocab:\n",
+    "        if word != '<s>':\n",
+    "            print(word, end=\"\\t\\t\")\n",
+    "    print(\"\")\n",
+    "    for given_word in test_sentence_vocab:\n",
+    "        if given_word != '</s>':\n",
+    "            if(smoothing==1):\n",
+    "                print(unique_word_frequency.get(given_word)+bigram_unique_word_count, end=\"\\t\")\n",
+    "            elif(smoothing==0):\n",
+    "                print(unique_word_frequency.get(given_word), end=\"\\t\")\n",
+    "            print(given_word, end=\"\\t\\t\")\n",
+    "            for word in test_sentence_vocab:\n",
+    "                if word != '<s>':\n",
+    "                    print(\"{0:}\".format(compute_bigram_count_test_sentence(given_word,word,smoothing)), end=\"\\t\\t\")\n",
+    "            print(\"\")\n",
+    "    print(\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bigram probability of (given_word, word) for the test sentence, computed from\n",
+    "# the training counts.\n",
+    "# smoothing --- add-one smoothing if 1, no smoothing if 0\n",
+    "def compute_bigram_prob_test_sentence(given_word,word,smoothing):\n",
+    "    bigram_freq = 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))\n",
+    "    uni_freq = 0 if unique_word_frequency.get((given_word))==None else unique_word_frequency.get((given_word))\n",
+    "    if smoothing==0:\n",
+    "        return 0 if bigram_probabilities.get((given_word,word))==None else bigram_probabilities.get((given_word,word))\n",
+    "    elif smoothing == 1:\n",
+    "        # add-one: (count(given word, word) + 1) / (count(given word) + V)\n",
+    "        numerator = bigram_freq+1\n",
+    "        denominator = uni_freq+bigram_unique_word_count\n",
+    "        return 0.0 if numerator == 0 or denominator == 0 else float(numerator) / float(denominator)"
+   ]
+  },
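+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Worked example (a sketch, not part of the original run): add-one smoothing\n",
+    "# for a single bigram, P_add1(w | v) = (count(v, w) + 1) / (count(v) + V),\n",
+    "# where V = bigram_unique_word_count. The word pair here is arbitrary.\n",
+    "p = compute_bigram_prob_test_sentence('the', 'captain', 1)\n",
+    "print(\"P(captain | the) with add-one smoothing:\", p)"
+   ]
+  },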
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prints a table of the bigram probabilities for the test sentence.\n",
+    "def print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing):\n",
+    "    print(\"A table showing the bigram probabilities for test sentence\"+\"\\nsmoothing =\"+str(smoothing))\n",
+    "    print(\"\\t\\t\", end=\"\")\n",
+    "    for word in test_sentence_vocab:\n",
+    "        if word != '<s>':\n",
+    "            print(word, end=\"\\t\\t\")\n",
+    "    print(\"\")\n",
+    "    for given_word in test_sentence_vocab:\n",
+    "        if given_word != '</s>':\n",
+    "            print(given_word, end=\"\\t\\t\")\n",
+    "            for word in test_sentence_vocab:\n",
+    "                if word != '<s>':\n",
+    "                    print(\"{0:.5f}\".format(compute_bigram_prob_test_sentence(given_word,word,smoothing)), end=\"\\t\\t\")\n",
+    "            print(\"\")\n",
+    "    print(\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Computes the probability of the test sentence under the trained model.\n",
+    "# Sums log10 probabilities and exponentiates at the end to avoid underflow.\n",
+    "# smoothing --- add-one smoothing if 1, no smoothing if 0\n",
+    "def compute_prob_test_sentence(sentence,smoothing):\n",
+    "    test_sent_prob = 0\n",
+    "    \n",
+    "    if(smoothing == 0):\n",
+    "        given_word = None\n",
+    "        for word in sentence:\n",
+    "            if given_word!=None:\n",
+    "                if bigram_probabilities.get((given_word,word))==0 or bigram_probabilities.get((given_word,word))==None:\n",
+    "                    return 0\n",
+    "                else:\n",
+    "                    test_sent_prob += log(bigram_probabilities.get((given_word,word),0),10)\n",
+    "            given_word = word\n",
+    "    \n",
+    "    elif(smoothing == 1):\n",
+    "        given_word = None\n",
+    "        for word in sentence:\n",
+    "            if given_word!=None:\n",
+    "                bigram_freq = 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))\n",
+    "                uni_freq = 0 if unique_word_frequency.get((given_word))==None else unique_word_frequency.get((given_word))\n",
+    "                numerator = bigram_freq+1\n",
+    "                denominator = uni_freq+bigram_unique_word_count\n",
+    "                probability = 0 if numerator==0 or denominator==0 else float(numerator)/float(denominator)\n",
+    "                if(probability==0):\n",
+    "                    return 0\n",
+    "                test_sent_prob += log(probability,10)\n",
+    "            given_word = word\n",
+    "    \n",
+    "    return 10**test_sent_prob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test sentences here\n",
+    "test_sentences = [['upon this the captain started , and eagerly desired to know more .'],['thus , because no man can follow another into these halls .']]"
+   ]
+  },
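+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch (not part of the original run): why compute_prob_test_sentence\n",
+    "# works in log space. Multiplying many probabilities < 1 underflows quickly;\n",
+    "# summing log10 values and exponentiating once at the end is numerically safer.\n",
+    "p1, p2 = 0.05, 0.002  # made-up bigram probabilities\n",
+    "print(10 ** (log(p1, 10) + log(p2, 10)))  # same as p1 * p2 == 1e-04\n",
+    "print(p1 * p2)"
+   ]
+  },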
"21766\tto\t\t24\t\t1\t\t51\t\t2\t\t1\t\t2\t\t730\t\t1\t\t3\t\t16\t\t2\t\t27\t\t\n", + "17152\tdesired\t\t1\t\t1\t\t1\t\t3\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t\n", + "17707\tupon\t\t1\t\t1\t\t24\t\t1\t\t1\t\t1\t\t221\t\t1\t\t2\t\t1\t\t1\t\t5\t\t\n", + "31572\tthe\t\t1\t\t1\t\t1\t\t1\t\t4\t\t1\t\t1\t\t1\t\t98\t\t1\t\t51\t\t2\t\t\n", + "17168\tstarted\t\t2\t\t1\t\t1\t\t6\t\t1\t\t1\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t\n", + "17470\tcaptain\t\t12\t\t1\t\t3\t\t9\t\t1\t\t2\t\t4\t\t2\t\t1\t\t1\t\t1\t\t11\t\t\n", + "17293\tknow\t\t3\t\t1\t\t1\t\t2\t\t1\t\t1\t\t9\t\t1\t\t2\t\t1\t\t2\t\t8\t\t\n", + "17649\tmore\t\t25\t\t1\t\t1\t\t12\t\t1\t\t3\t\t15\t\t2\t\t1\t\t1\t\t1\t\t26\t\t\n", + "27200\t\t\t409\t\t1\t\t116\t\t61\t\t1\t\t27\t\t630\t\t2\t\t23\t\t3\t\t8\t\t27\t\t\n", + "\n", + "A table showing the bigram probabilities for test sentence\n", + "smoothing =1\n", + "\t\tand\t\teagerly\t\tthis\t\tto\t\tdesired\t\tupon\t\tthe\t\tstarted\t\tcaptain\t\tknow\t\tmore\t\t\t\t\n", + "and\t\t0.00004\t\t0.00013\t\t0.00191\t\t0.00132\t\t0.00004\t\t0.00013\t\t0.01574\t\t0.00008\t\t0.00025\t\t0.00008\t\t0.00195\t\t0.00004\t\t\n", + "eagerly\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n", + "this\t\t0.00054\t\t0.00005\t\t0.00011\t\t0.00043\t\t0.00005\t\t0.00005\t\t0.00135\t\t0.00005\t\t0.00011\t\t0.00005\t\t0.00005\t\t0.00140\t\t\n", + "to\t\t0.00110\t\t0.00005\t\t0.00234\t\t0.00009\t\t0.00005\t\t0.00009\t\t0.03354\t\t0.00005\t\t0.00014\t\t0.00074\t\t0.00009\t\t0.00124\t\t\n", + "desired\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n", + "upon\t\t0.00006\t\t0.00006\t\t0.00136\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.01248\t\t0.00006\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00028\t\t\n", + "the\t\t0.00003\t\t0.00003\t\t0.00003\t\t0.00003\t\t0.00013\t\t0.00003\t\t0.00003\t\t0.00003\t\t0.00310\t\t0.00003\t\t0.00162\t\t0.00006\t\t\n", + "started\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00035\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n", + "captain\t\t0.00069\t\t0.00006\t\t0.00017\t\t0.00052\t\t0.00006\t\t0.00011\t\t0.00023\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00063\t\t\n", + "know\t\t0.00017\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00052\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00012\t\t0.00046\t\t\n", + "more\t\t0.00142\t\t0.00006\t\t0.00006\t\t0.00068\t\t0.00006\t\t0.00017\t\t0.00085\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00147\t\t\n", + "\t\t0.01504\t\t0.00004\t\t0.00426\t\t0.00224\t\t0.00004\t\t0.00099\t\t0.02316\t\t0.00007\t\t0.00085\t\t0.00011\t\t0.00029\t\t0.00099\t\t\n", + "\n", + "The probability of the sentence under the trained model\n", + "smoothing =1\n", + "1.01571679763017e-23\n", + "!!!!!!!!!!The test Sentence is!!!!!!!!!!\n", + "['thus , because no man can follow another into these halls .']\n", + "A table showing the bigram counts for test sentence.\n", + "smoothing =1\n", + "\t\t\thalls\t\tcan\t\tanother\t\tno\t\tthese\t\tthus\t\tbecause\t\tman\t\tinto\t\tfollow\t\t\t\t\n", + "17144\thalls\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t\n", + "17377\tcan\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t3\t\t10\t\t\n", + "17256\tanother\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t1\t\t4\t\t\n", + "17727\tno\t\t1\t\t1\t\t1\t\t19\t\t1\t\t1\t\t2\t\t10\t\t1\t\t1\t\t18\t\t\n", + 
"17544\tthese\t\t2\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t3\t\t\n", + "17274\tthus\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t1\t\t1\t\t1\t\t2\t\t\n", + "17233\tbecause\t\t1\t\t1\t\t1\t\t3\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t\n", + "17668\tman\t\t1\t\t6\t\t1\t\t1\t\t3\t\t2\t\t1\t\t2\t\t3\t\t1\t\t63\t\t\n", + "17664\tinto\t\t1\t\t1\t\t1\t\t1\t\t4\t\t1\t\t1\t\t1\t\t1\t\t1\t\t2\t\t\n", + "17161\tfollow\t\t1\t\t1\t\t2\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t1\t\t6\t\t\n", + "27200\t\t\t1\t\t17\t\t8\t\t102\t\t22\t\t27\t\t19\t\t12\t\t3\t\t1\t\t27\t\t\n", + "\n", + "A table showing the bigram probabilities for test sentence\n", + "smoothing =1\n", + "\t\thalls\t\tcan\t\tanother\t\tno\t\tthese\t\tthus\t\tbecause\t\tman\t\tinto\t\tfollow\t\t\t\t\n", + "halls\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t\n", + "can\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00058\t\t\n", + "another\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00023\t\t\n", + "no\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00107\t\t0.00006\t\t0.00006\t\t0.00011\t\t0.00056\t\t0.00006\t\t0.00006\t\t0.00102\t\t\n", + "these\t\t0.00011\t\t0.00011\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t\n", + "thus\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00012\t\t\n", + "because\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t\n", + "man\t\t0.00006\t\t0.00034\t\t0.00006\t\t0.00006\t\t0.00017\t\t0.00011\t\t0.00006\t\t0.00011\t\t0.00017\t\t0.00006\t\t0.00357\t\t\n", + "into\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00023\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00011\t\t\n", + "follow\t\t0.00006\t\t0.00006\t\t0.00012\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00006\t\t0.00035\t\t\n", + "\t\t0.00004\t\t0.00063\t\t0.00029\t\t0.00375\t\t0.00081\t\t0.00099\t\t0.00070\t\t0.00044\t\t0.00011\t\t0.00004\t\t0.00099\t\t\n", + "\n", + "The probability of the sentence under the trained model\n", + "smoothing =1\n", + "1.0762093761487005e-21\n" + ] + } + ], + "source": [ + "for i in range (len(test_sentences)):\n", + " test_sentence = test_sentences[i]\n", + " print(\"!!!!!!!!!!The test Sentence is!!!!!!!!!!\")\n", + " print(test_sentence)\n", + " test_sentence = tokenize_sentence(test_sentence)\n", + " test_sentence = prep_data(test_sentence)\n", + "\n", + " # Vocabulary of test sentence\n", + " test_sentence_vocab = vocabulary(test_sentence)\n", + "\n", + " test_sentence = list(itertools.chain.from_iterable(test_sentence))\n", + " #test_sentence\n", + "\n", + " # A table showing the bigram counts for test sentence.\n", + " print_bigram_freq_test_sentence(test_sentence_vocab,smoothing)\n", + "\n", + " # A table showing the bigram probabilities for test sentence.\n", + " print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing)\n", + "\n", + " # The probability of the sentence under the trained model\n", + " print(\"The probability of the sentence under the trained model\"+\"\\nsmoothing =\"+str(smoothing))\n", + " print(compute_prob_test_sentence(test_sentence,0))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/results_no_smoothing.txt b/results_no_smoothing.txt new file mode 100644 index 0000000..e4552df --- /dev/null +++ b/results_no_smoothing.txt @@ -0,0 +1,72 @@ +!!!!!!!!!!The test Sentence is!!!!!!!!!! +['upon this the captain started , and eagerly desired to know more .'] +A table showing the bigram counts for test sentence. +smoothing =0 + know this and more to upon the desired started eagerly captain +152 know 0 0 2 1 1 0 8 0 0 0 1 7 +1394 this 0 1 9 0 7 0 24 0 0 0 1 25 +6430 and 1 44 0 45 30 2 370 0 1 2 5 0 +508 more 0 0 24 0 11 2 14 0 1 0 0 25 +4625 to 15 50 23 1 1 1 729 0 0 0 2 26 +566 upon 0 23 0 0 0 0 220 0 0 0 1 4 +14431 the 0 0 0 50 0 0 0 3 0 0 97 1 +11 desired 0 0 0 0 2 0 0 0 0 0 0 0 +27 started 0 0 1 0 5 0 1 0 0 0 0 0 +8 eagerly 0 0 1 0 0 0 0 1 0 0 0 0 +329 captain 0 2 11 0 8 1 3 0 1 0 0 10 +10059 2 115 408 7 60 26 629 0 1 0 22 26 + +A table showing the bigram probabilities for test sentence +smoothing =0 + know this and more to upon the desired started eagerly captain +know 0.00000 0.00000 0.01316 0.00658 0.00658 0.00000 0.05263 0.00000 0.00000 0.00000 0.00658 0.04605 +this 0.00000 0.00072 0.00646 0.00000 0.00502 0.00000 0.01722 0.00000 0.00000 0.00000 0.00072 0.01793 +and 0.00016 0.00684 0.00000 0.00700 0.00467 0.00031 0.05754 0.00000 0.00016 0.00031 0.00078 0.00000 +more 0.00000 0.00000 0.04724 0.00000 0.02165 0.00394 0.02756 0.00000 0.00197 0.00000 0.00000 0.04921 +to 0.00324 0.01081 0.00497 0.00022 0.00022 0.00022 0.15762 0.00000 0.00000 0.00000 0.00043 0.00562 +upon 0.00000 0.04064 0.00000 0.00000 0.00000 0.00000 0.38869 0.00000 0.00000 0.00000 0.00177 0.00707 +the 0.00000 0.00000 0.00000 0.00346 0.00000 0.00000 0.00000 0.00021 0.00000 0.00000 0.00672 0.00007 +desired 0.00000 0.00000 0.00000 0.00000 0.18182 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 +started 0.00000 0.00000 0.03704 0.00000 0.18519 0.00000 0.03704 0.00000 0.00000 0.00000 0.00000 0.00000 +eagerly 0.00000 0.00000 0.12500 0.00000 0.00000 0.00000 0.00000 0.12500 0.00000 0.00000 0.00000 0.00000 +captain 0.00000 0.00608 0.03343 0.00000 0.02432 0.00304 0.00912 0.00000 0.00304 0.00000 0.00000 0.03040 + 0.00020 0.01143 0.04056 0.00070 0.00596 0.00258 0.06253 0.00000 0.00010 0.00000 0.00219 0.00258 + +The probability of the sentence under the trained model +smoothing =0 +1.01571679763017e-23 +!!!!!!!!!!The test Sentence is!!!!!!!!!! +['thus , because no man can follow another into these halls .'] +A table showing the bigram counts for test sentence. 
+smoothing =0
+ halls man these because can follow into thus no another </s>
+3 halls 0 0 0 0 0 0 0 0 0 0 1
+527 man 0 1 2 0 5 0 2 1 0 0 62
+403 these 1 0 0 0 1 0 0 0 0 0 2
+92 because 0 0 1 0 0 0 0 0 2 0 0
+236 can 0 0 0 0 0 2 0 0 0 0 9
+20 follow 0 0 0 0 0 0 0 0 0 1 5
+523 into 0 0 3 0 0 0 0 0 0 0 1
+133 thus 0 0 0 1 0 0 0 0 0 0 1
+586 no 0 9 0 1 0 0 0 0 18 0 17
+115 another 0 0 0 0 0 0 1 0 0 0 3
+10059 <s> 0 11 21 18 16 0 2 26 101 7 26
+
+A table showing the bigram probabilities for test sentence
+smoothing =0
+ halls man these because can follow into thus no another </s>
+halls 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.33333
+man 0.00000 0.00190 0.00380 0.00000 0.00949 0.00000 0.00380 0.00190 0.00000 0.00000 0.11765
+these 0.00248 0.00000 0.00000 0.00000 0.00248 0.00000 0.00000 0.00000 0.00000 0.00000 0.00496
+because 0.00000 0.00000 0.01087 0.00000 0.00000 0.00000 0.00000 0.00000 0.02174 0.00000 0.00000
+can 0.00000 0.00000 0.00000 0.00000 0.00000 0.00847 0.00000 0.00000 0.00000 0.00000 0.03814
+follow 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.05000 0.25000
+into 0.00000 0.00000 0.00574 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00191
+thus 0.00000 0.00000 0.00000 0.00752 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00752
+no 0.00000 0.01536 0.00000 0.00171 0.00000 0.00000 0.00000 0.00000 0.03072 0.00000 0.02901
+another 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00870 0.00000 0.00000 0.00000 0.02609
+<s> 0.00000 0.00109 0.00209 0.00179 0.00159 0.00000 0.00020 0.00258 0.01004 0.00070 0.00258
+
+The probability of the sentence under the trained model
+smoothing =0
+1.0762093761487005e-21
diff --git a/results_add_one_smoothing.txt b/results_add_one_smoothing.txt
new file mode 100644
index 0000000..0d8cc52
--- /dev/null
+++ b/results_add_one_smoothing.txt
@@ -0,0 +1,72 @@
+!!!!!!!!!!The test Sentence is!!!!!!!!!!
+['upon this the captain started , and eagerly desired to know more .']
+A table showing the bigram counts for test sentence.
+smoothing =1
+ started desired more eagerly to captain this and know the upon </s>
+17168 started 1 1 1 1 6 1 1 2 1 2 1 1
+17152 desired 1 1 1 1 3 1 1 1 1 1 1 1
+17649 more 2 1 1 1 12 1 1 25 1 15 3 26
+17149 eagerly 1 2 1 1 1 1 1 2 1 1 1 1
+21766 to 1 1 2 1 2 3 51 24 16 730 2 27
+17470 captain 2 1 1 1 9 1 3 12 1 4 2 11
+18535 this 1 1 1 1 8 2 2 10 1 25 1 26
+23571 and 2 1 46 3 31 6 45 1 2 371 3 1
+17293 know 1 1 2 1 2 2 1 3 1 9 1 8
+31572 the 1 4 51 1 1 98 1 1 1 1 1 2
+17707 upon 1 1 1 1 1 2 24 1 1 221 1 5
+27200 <s> 2 1 8 1 61 23 116 409 3 630 27 27
+
+A table showing the bigram probabilities for test sentence
+smoothing =1
+ started desired more eagerly to captain this and know the upon </s>
+started 0.00006 0.00006 0.00006 0.00006 0.00035 0.00006 0.00006 0.00012 0.00006 0.00012 0.00006 0.00006
+desired 0.00006 0.00006 0.00006 0.00006 0.00017 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006
+more 0.00011 0.00006 0.00006 0.00006 0.00068 0.00006 0.00006 0.00142 0.00006 0.00085 0.00017 0.00147
+eagerly 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006
+to 0.00005 0.00005 0.00009 0.00005 0.00009 0.00014 0.00234 0.00110 0.00074 0.03354 0.00009 0.00124
+captain 0.00011 0.00006 0.00006 0.00006 0.00052 0.00006 0.00017 0.00069 0.00006 0.00023 0.00011 0.00063
+this 0.00005 0.00005 0.00005 0.00005 0.00043 0.00011 0.00011 0.00054 0.00005 0.00135 0.00005 0.00140
+and 0.00008 0.00004 0.00195 0.00013 0.00132 0.00025 0.00191 0.00004 0.00008 0.01574 0.00013 0.00004
+know 0.00006 0.00006 0.00012 0.00006 0.00012 0.00012 0.00006 0.00017 0.00006 0.00052 0.00006 0.00046
+the 0.00003 0.00013 0.00162 0.00003 0.00003 0.00310 0.00003 0.00003 0.00003 0.00003 0.00003 0.00006
+upon 0.00006 0.00006 0.00006 0.00006 0.00006 0.00011 0.00136 0.00006 0.00006 0.01248 0.00006 0.00028
+<s> 0.00007 0.00004 0.00029 0.00004 0.00224 0.00085 0.00426 0.01504 0.00011 0.02316 0.00099 0.00099
+
+The probability of the sentence under the trained model
+smoothing =1
+1.01571679763017e-23
+!!!!!!!!!!The test Sentence is!!!!!!!!!!
+['thus , because no man can follow another into these halls .']
+A table showing the bigram counts for test sentence.
+smoothing =1
+ thus because into these halls follow no can another man </s>
+17274 thus 1 2 1 1 1 1 1 1 1 1 2
+17233 because 1 1 1 2 1 1 3 1 1 1 1
+17664 into 1 1 1 4 1 1 1 1 1 1 2
+17544 these 1 1 1 1 2 1 1 2 1 1 3
+17144 halls 1 1 1 1 1 1 1 1 1 1 2
+17161 follow 1 1 1 1 1 1 1 1 2 1 6
+17727 no 1 2 1 1 1 1 19 1 1 10 18
+17377 can 1 1 1 1 1 3 1 1 1 1 10
+17256 another 1 1 2 1 1 1 1 1 1 1 4
+17668 man 2 1 3 3 1 1 1 6 1 2 63
+27200 <s> 27 19 3 22 1 1 102 17 8 12 27
+
+A table showing the bigram probabilities for test sentence
+smoothing =1
+ thus because into these halls follow no can another man </s>
+thus 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012
+because 0.00006 0.00006 0.00006 0.00012 0.00006 0.00006 0.00017 0.00006 0.00006 0.00006 0.00006
+into 0.00006 0.00006 0.00006 0.00023 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00011
+these 0.00006 0.00006 0.00006 0.00006 0.00011 0.00006 0.00006 0.00011 0.00006 0.00006 0.00017
+halls 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012
+follow 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00012 0.00006 0.00035
+no 0.00006 0.00011 0.00006 0.00006 0.00006 0.00006 0.00107 0.00006 0.00006 0.00056 0.00102
+can 0.00006 0.00006 0.00006 0.00006 0.00006 0.00017 0.00006 0.00006 0.00006 0.00006 0.00058
+another 0.00006 0.00006 0.00012 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 0.00023
+man 0.00011 0.00006 0.00017 0.00017 0.00006 0.00006 0.00006 0.00034 0.00006 0.00011 0.00357
+<s> 0.00099 0.00070 0.00011 0.00081 0.00004 0.00004 0.00375 0.00063 0.00029 0.00044 0.00099
+
+The probability of the sentence under the trained model
+smoothing =1
+1.0762093761487005e-21