Skip to content

Commit

Permalink
perplexity and min delay experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisdonahue committed Nov 28, 2016
1 parent fb9f254 commit 4c1ae88
Show file tree
Hide file tree
Showing 2 changed files with 373 additions and 0 deletions.
168 changes: 168 additions & 0 deletions 16_11_22 brown trigram perplexity.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline\n",
"from nltk.corpus import brown"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"docs = []\n",
"word_to_count = {}\n",
"for fileid in brown.fileids():\n",
" doc = []\n",
" for word in brown.words(fileids=fileid):\n",
" word = word.lower()\n",
" if word not in word_to_count:\n",
" word_to_count[word] = 0\n",
" word_to_count[word] += 1\n",
" doc.append(word)\n",
" docs.append(doc)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"word_to_id = {w: i for i, (w, c) in enumerate(sorted(word_to_count.items(), key=lambda x: -x[1]))}\n",
"docs_ids = []\n",
"for doc in docs:\n",
" doc_ids = []\n",
" for word in doc:\n",
" doc_ids.append(word_to_id[word])\n",
" docs_ids.append(doc_ids)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"doc_train = docs_ids[:450]\n",
"doc_eval = docs_ids[450:]\n",
"n = 4\n",
"\n",
"ngram_to_count = {}\n",
"for doc in doc_train:\n",
" for i in xrange(len(doc)):\n",
" ngram = tuple(doc[i:i + n])\n",
" if len(ngram) < n:\n",
" break\n",
" if ngram not in ngram_to_count:\n",
" ngram_to_count[ngram] = 0\n",
" ngram_to_count[ngram] += 1"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ngram_posterior = {}\n",
"for ngram, count in ngram_to_count.items():\n",
" context = ngram[:n - 1]\n",
" target = ngram[n - 1]\n",
" if context not in ngram_posterior:\n",
" ngram_posterior[context] = {}\n",
" if target not in ngram_posterior[context]:\n",
" ngram_posterior[context][target] = 0\n",
" ngram_posterior[context][target] += count"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"18.5403201473\n",
"381234.761692\n"
]
}
],
"source": [
"eps = 1e-6\n",
"log_prob_cum = 0.0\n",
"nsamples = 0\n",
"for doc in doc_eval:\n",
" for i in xrange(len(doc)):\n",
" ngram = tuple(doc[i:i + n])\n",
" if len(ngram) < n:\n",
" break\n",
" context = ngram[:n - 1]\n",
" target = ngram[n - 1]\n",
" if context in ngram_posterior:\n",
" distn = ngram_posterior[context]\n",
" total_samples = float(sum(distn.values()))\n",
" if target in distn:\n",
" p = distn[target] / total_samples\n",
" else:\n",
" p = eps\n",
" else:\n",
" p = eps\n",
" log_prob_cum += np.log2(p)\n",
" nsamples += 1\n",
"negative_log_prob = -log_prob_cum / float(nsamples)\n",
"print negative_log_prob\n",
"print np.exp2(negative_log_prob)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
205 changes: 205 additions & 0 deletions 16_11_28 blocksize.ipynb

Large diffs are not rendered by default.

0 comments on commit 4c1ae88

Please sign in to comment.