
Commit

initial commit of word language model
adamlerer committed Oct 17, 2016
1 parent 764ac3b commit 6fde116
Showing 3 changed files with 365 additions and 0 deletions.
53 changes: 53 additions & 0 deletions word_language_model/data.py
@@ -0,0 +1,53 @@
########################################
# Data Fetching Script for PTB
########################################

import torch
import os.path

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def addword(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            # The new word's index is its position in idx2word.
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def ntokens(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dic = Dictionary()
        self.train = self._loadfile(os.path.join(path, 'train.txt'))
        self.valid = self._loadfile(os.path.join(path, 'valid.txt'))
        self.test = self._loadfile(os.path.join(path, 'test.txt'))

    # Tokenize a text file into a flat LongTensor of word indices.
    def _loadfile(self, path):
        assert os.path.exists(path)
        # First pass: add words to the dictionary and count tokens.
        tokens = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dic.addword(word)
                    tokens += 1

        # Second pass: map every word to its index.
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dic.word2idx[word]
                    token += 1

        # Final dataset.
        return ids
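A minimal, hypothetical usage sketch (not part of the commit): Corpus expects a directory containing whitespace-tokenized train.txt, valid.txt, and test.txt, and exposes each split as a flat LongTensor of word indices plus a shared Dictionary. The data path below is assumed.

import data

corpus = data.Corpus('./data/penn')                 # assumed data directory
print(corpus.dic.ntokens())                         # vocabulary size, '<eos>' included
print(corpus.train.size(0))                         # number of training tokens
print(corpus.dic.idx2word[int(corpus.train[0])])    # first training word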
222 changes: 222 additions & 0 deletions word_language_model/main.py
@@ -0,0 +1,222 @@
###############################################################################
# Language Modeling on Penn Tree Bank
###############################################################################

import argparse
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable

from rnn_modules import *
import data

parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

# Data parameters
parser.add_argument('-data' , type=str, default='./data/penn', help='Location of the data corpus' )
# Model parameters.
parser.add_argument('-model' , type=str, default='RNN' , help='Type of recurrent net. RNN, LSTM, or GRU.' )
parser.add_argument('-emsize' , type=int, default=200 , help='Size of word embeddings' )
parser.add_argument('-nhid' , type=int, default=200 , help='Number of hidden units per layer.' )
# Optimization parameters.
parser.add_argument('-lr' , type=float, default=20 , help='Initial learning rate.' )
parser.add_argument('-clip' , type=float, default=0.5 , help='Gradient clipping.' )
parser.add_argument('-maxepoch' , type=int, default=6 , help='Upper epoch limit.' )
parser.add_argument('-batchsize' , type=int, default=20 , help='Batch size.' )
parser.add_argument('-bptt' , type=int, default=20 , help='Sequence length.' )
# Device parameters.
parser.add_argument('-seed' , type=int, default=1111 , help='Random seed.' )
parser.add_argument('-cuda' , action='store_true' , help='Use CUDA.' )
# Misc parameters.
parser.add_argument('-reportint' , type=int, default=1000 , help='Report interval.' )
parser.add_argument('-save' , type=str, default='model.pt' , help='Path to save the final model.' )
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
# If a CUDA device is available but not requested, warn the user.
if torch.cuda.is_available() and not args.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with -cuda")

###############################################################################
## LOAD DATA
###############################################################################

corpus = data.Corpus(args.data)

def batchify(data, bsz, bptt):
    # Trim the corpus so it splits evenly into bsz streams of nbatch * bptt steps,
    # then lay it out as an (nbatch * bptt) x bsz matrix: each column is one
    # contiguous stream of text, and each row is one time step across the batch.
    nbatch = int(math.floor(data.size(0) / bsz / bptt))
    data = data.narrow(0, 0, nbatch * bptt * bsz)
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data

train = batchify(corpus.train, args.batchsize, args.bptt)
valid = batchify(corpus.valid, 10, 1)
test = batchify(corpus.test, 10, 1)
# Truncate the training and validation sets (presumably to keep runs short).
train = train[:10000]
valid = valid[:100]

bptt = args.bptt
bsz = args.batchsize

###############################################################################
# MAKE MODEL
###############################################################################

initrange = 0.1

class RNNModel(nn.Container):
    """A container module with an encoder, an RNN (one of several flavors),
    and a decoder. Runs one RNN step at a time.
    """

    @staticmethod
    def name2module(name):
        if name == 'RNN':
            return RNN
        elif name == 'LSTM':
            return LSTM
        elif name == 'GRU':
            return GRU
        else:
            raise ValueError("Unknown RNN module: " + name)

    def __init__(self, rnnType, ntoken, ninp, nhid):
        rnnModule = RNNModel.name2module(rnnType)
        super(RNNModel, self).__init__(
            encoder=nn.sparse.Embedding(ntoken, ninp),
            rnn=rnnModule(ninp, nhid),
            decoder=nn.Linear(nhid, ntoken),
        )

        # FIXME: is this better than the standard init? probably
        # FIXME: we need better reset_parameters methods in stdlib
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def __call__(self, hidden, input):
        emb = self.encoder(input)
        hidden, output = self.rnn(hidden, emb)
        decoded = self.decoder(output)
        return hidden, decoded

    def initHidden(self, bsz):
        return self.rnn.initHidden(bsz)

model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

########################################
# TRAINING
########################################

lr = args.lr
clip = args.clip
reportinterval = args.reportint

# Perform the forward pass only.
def evaluate(model, data, criterion):
    loss = 0
    hidden = model.initHidden(data.size(1))
    # Loop over the evaluation data, predicting each next token.
    for i in range(0, data.size(0) - 1):
        hidden, output = model(hidden, Variable(data[i], requires_grad=False))
        loss += criterion(output, Variable(data[i+1], requires_grad=False)).data[0]
    return loss / data.size(0)

# Simple gradient clipping, using the total norm of the gradient.
def clipGradient(model, clip):
    totalnorm = 0
    for p in model.parameters():
        modulenorm = p.grad.norm()
        totalnorm += modulenorm ** 2
    totalnorm = math.sqrt(totalnorm)
    # Returns a scale factor for the learning rate that caps the update norm at clip.
    return min(1, clip / (totalnorm + 1e-6))

# Between bptt intervals, we want to maintain the hidden state data
# but don't want to backprop gradients across bptt intervals.
# So we have to rewrap the hidden state in a fresh Variable.
def repackageHidden(h):
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackageHidden(v) for v in h)
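# Hypothetical illustration (not in the original file) of why the rewrap matters:
#
#   hidden = model.initHidden(bsz)       # fresh Variables with no history
#   ...                                  # run bptt steps, call loss.backward()
#   hidden = repackageHidden(hidden)     # same values, detached from the old graph
#
# Without it, each new bptt window would stay attached to the previous graph, so
# backward() would reach back to the start of the epoch and memory would keep growing.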

# Loop over epochs.
prev_loss = None
for epoch in range(1, args.maxepoch+1):
    # Start each epoch with a fresh hidden state.
    hidden = model.initHidden(bsz)
    model.zero_grad()

    # Loop over the training data.
    loss = 0
    total_loss = 0
    i = 0
    start_time = epoch_start_time = time.time()
    while i < train.size(0) - 1:
        hidden, output = model(hidden, Variable(train[i], requires_grad=False))
        loss += criterion(output, Variable(train[i+1], requires_grad=False))
        i += 1

        if i % bptt == 0:
            loss.backward()

            clipped_lr = lr * clipGradient(model, clip)
            for p in model.parameters():
                p.data.sub_(p.grad.mul(clipped_lr))

            hidden = repackageHidden(hidden)
            model.zero_grad()
            total_loss += loss.data[0]
            loss = 0

        if i % reportinterval == 0:
            cur_loss = total_loss / reportinterval
            elapsed = time.time() - start_time
            print(
                ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                 + 'train loss {:5.2f} | train ppl {:8.2f}').format(
                    epoch, i, train.size(0), lr, elapsed * 1000 / reportinterval * bsz,
                    cur_loss, math.exp(cur_loss)
                ))
            total_loss = 0
            start_time = time.time()

    val_loss = evaluate(model, valid, criterion)

    print(
        '| end of epoch {:3d} | ms/batch {:5.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time) * 1000 / train.size(0), val_loss, math.exp(val_loss)
        ))

    # Anneal the learning rate: if validation loss stopped improving, cut it by 4x.
    if prev_loss and val_loss > prev_loss:
        lr = lr / 4

    prev_loss = val_loss

# Run on test data.
test_loss = evaluate(model, test, criterion)
print(
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)
    ))

if args.save != '':
    with open(args.save, 'wb') as f:
        torch.save(model, f)
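To make the batchify layout concrete, here is a small, self-contained sketch (not part of the commit; the CUDA transfer is omitted and batchify_demo is a hypothetical stand-in) showing how a toy stream of 14 token ids with batchsize=2 and bptt=3 becomes a (6, 2) matrix whose columns are contiguous text streams. In the training loop, train[i] is then the batch of inputs at step i and train[i+1] its targets.

import math
import torch

def batchify_demo(data, bsz, bptt):
    # Same reshaping as batchify() above, minus the args.cuda branch.
    nbatch = int(math.floor(data.size(0) / bsz / bptt))
    data = data.narrow(0, 0, nbatch * bptt * bsz)
    return data.view(bsz, -1).t().contiguous()

tokens = torch.LongTensor(list(range(14)))   # pretend word indices 0..13
batched = batchify_demo(tokens, 2, 3)
print(batched.size())   # (6, 2): 12 tokens kept, the trailing 2 dropped
# Column 0 holds tokens 0..5 and column 1 holds tokens 6..11, so each column
# reads as an unbroken piece of text and each row is one time step across the batch.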
90 changes: 90 additions & 0 deletions word_language_model/rnn_modules.py
@@ -0,0 +1,90 @@
###############################################################################
# Various RNN Modules
###############################################################################

import torch
import torch.nn as nn
from torch.autograd import Variable

# FIXME: add CUDNN

class RNN(nn.Container):

    def __init__(self, ninp, nhid):
        super(RNN, self).__init__(
            i2h=nn.Linear(ninp, nhid),
            h2h=nn.Linear(nhid, nhid),
            sigmoid=nn.Sigmoid(),
        )
        self.ninp = ninp
        self.nhid = nhid

    def __call__(self, hidden, input):
        nexth = self.sigmoid(self.h2h(hidden) + self.i2h(input))
        return nexth, nexth

    def initHidden(self, bsz):
        return Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_())


class LSTM(nn.Container):

    def __init__(self, ninp, nhid):
        super(LSTM, self).__init__(
            i2h=nn.Linear(ninp, 4 * nhid),
            h2h=nn.Linear(nhid, 4 * nhid),
            sigmoid=nn.Sigmoid(),
            tanh=nn.Tanh(),
        )
        self.ninp = ninp
        self.nhid = nhid

    def __call__(self, hidden, input):
        c, h = hidden
        # All four gates come out of one pair of matrix multiplies,
        # then get split into (batch, nhid) slices.
        gates = self.h2h(h) + self.i2h(input)
        gates = gates.view(input.size(0), 4, self.nhid).transpose(0, 1)

        ingate = self.sigmoid(gates[0])
        cellgate = self.tanh(gates[1])
        forgetgate = self.sigmoid(gates[2])
        outgate = self.sigmoid(gates[3])

        nextc = (forgetgate * c) + (ingate * cellgate)
        nexth = outgate * self.tanh(nextc)

        return (nextc, nexth), nexth

    def initHidden(self, bsz):
        return (Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_()),
                Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_()))


class GRU(nn.Container):

    def __init__(self, ninp, nhid):
        super(GRU, self).__init__(
            i2h=nn.Linear(ninp, 3 * nhid),
            h2h=nn.Linear(nhid, 3 * nhid),
            sigmoid=nn.Sigmoid(),
            tanh=nn.Tanh(),
        )
        self.ninp = ninp
        self.nhid = nhid

    def __call__(self, hidden, input):
        # Split the three gates into (batch, nhid) slices, matching the LSTM above.
        gi = self.i2h(input).view(input.size(0), 3, self.nhid).transpose(0, 1)
        gh = self.h2h(hidden).view(input.size(0), 3, self.nhid).transpose(0, 1)

        resetgate = self.sigmoid(gi[0] + gh[0])
        updategate = self.sigmoid(gi[1] + gh[1])

        output = self.tanh(gi[2] + resetgate * gh[2])
        nexth = hidden + updategate * (output - hidden)

        return nexth, output

    def initHidden(self, bsz):
        return Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_())
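A minimal, hypothetical sketch (not part of the commit, and assuming the Variable-based autograd API of this era) of the calling convention these modules share: each takes (hidden, input) for a single time step and returns (next_hidden, output), with initHidden supplying the zero state. The sizes below are arbitrary.

import torch
from torch.autograd import Variable
from rnn_modules import LSTM

bsz, ninp, nhid = 4, 10, 20            # assumed sizes for illustration
rnn = LSTM(ninp, nhid)

hidden = rnn.initHidden(bsz)           # (c, h) pair of zero states
emb = Variable(torch.randn(bsz, ninp)) # one time step of embeddings
hidden, output = rnn(hidden, emb)      # advance the LSTM by one step
print(output.size())                   # (4, 20), i.e. (bsz, nhid)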


