-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
perplexity and min delay experiments
- Loading branch information
1 parent
fb9f254
commit 4c1ae88
Showing
2 changed files
with
373 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Populating the interactive namespace from numpy and matplotlib\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%pylab inline\n", | ||
"from nltk.corpus import brown" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"docs = []\n", | ||
"word_to_count = {}\n", | ||
"for fileid in brown.fileids():\n", | ||
" doc = []\n", | ||
" for word in brown.words(fileids=fileid):\n", | ||
" word = word.lower()\n", | ||
" if word not in word_to_count:\n", | ||
" word_to_count[word] = 0\n", | ||
" word_to_count[word] += 1\n", | ||
" doc.append(word)\n", | ||
" docs.append(doc)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"word_to_id = {w: i for i, (w, c) in enumerate(sorted(word_to_count.items(), key=lambda x: -x[1]))}\n", | ||
"docs_ids = []\n", | ||
"for doc in docs:\n", | ||
" doc_ids = []\n", | ||
" for word in doc:\n", | ||
" doc_ids.append(word_to_id[word])\n", | ||
" docs_ids.append(doc_ids)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"doc_train = docs_ids[:450]\n", | ||
"doc_eval = docs_ids[450:]\n", | ||
"n = 4\n", | ||
"\n", | ||
"ngram_to_count = {}\n", | ||
"for doc in doc_train:\n", | ||
" for i in xrange(len(doc)):\n", | ||
" ngram = tuple(doc[i:i + n])\n", | ||
" if len(ngram) < n:\n", | ||
" break\n", | ||
" if ngram not in ngram_to_count:\n", | ||
" ngram_to_count[ngram] = 0\n", | ||
" ngram_to_count[ngram] += 1" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"ngram_posterior = {}\n", | ||
"for ngram, count in ngram_to_count.items():\n", | ||
" context = ngram[:n - 1]\n", | ||
" target = ngram[n - 1]\n", | ||
" if context not in ngram_posterior:\n", | ||
" ngram_posterior[context] = {}\n", | ||
" if target not in ngram_posterior[context]:\n", | ||
" ngram_posterior[context][target] = 0\n", | ||
" ngram_posterior[context][target] += count" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"18.5403201473\n", | ||
"381234.761692\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"eps = 1e-6\n", | ||
"log_prob_cum = 0.0\n", | ||
"nsamples = 0\n", | ||
"for doc in doc_eval:\n", | ||
" for i in xrange(len(doc)):\n", | ||
" ngram = tuple(doc[i:i + n])\n", | ||
" if len(ngram) < n:\n", | ||
" break\n", | ||
" context = ngram[:n - 1]\n", | ||
" target = ngram[n - 1]\n", | ||
" if context in ngram_posterior:\n", | ||
" distn = ngram_posterior[context]\n", | ||
" total_samples = float(sum(distn.values()))\n", | ||
" if target in distn:\n", | ||
" p = distn[target] / total_samples\n", | ||
" else:\n", | ||
" p = eps\n", | ||
" else:\n", | ||
" p = eps\n", | ||
" log_prob_cum += np.log2(p)\n", | ||
" nsamples += 1\n", | ||
"negative_log_prob = -log_prob_cum / float(nsamples)\n", | ||
"print negative_log_prob\n", | ||
"print np.exp2(negative_log_prob)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python [Root]", | ||
"language": "python", | ||
"name": "Python [Root]" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.