Window-based model for NER implemented
imraviagrawal committed Nov 1, 2017
Root commit (0 parents): 3bc3398
Showing 40 changed files with 727,226 additions and 0 deletions.
Binary file added assignment3.pdf
55,044 changes: 55,044 additions & 0 deletions data/dev.conll

50,350 changes: 50,350 additions & 0 deletions data/test.masked

9,985 changes: 9,985 additions & 0 deletions data/tiny.conll

219,554 changes: 219,554 additions & 0 deletions data/train.conll

100,232 changes: 100,232 additions & 0 deletions data/vocab.txt

100,232 changes: 100,232 additions & 0 deletions data/wordVectors.txt

181 changes: 181 additions & 0 deletions data_util.py
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Utility functions to process data.
"""
import os
import pickle
import logging
from collections import Counter

import numpy as np
from util import read_conll, one_hot, window_iterator, ConfusionMatrix, load_word_vector_mapping
from defs import LBLS, NONE, LMAP, NUM, UNK, EMBED_SIZE

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)


FDIM = 4
P_CASE = "CASE:"
CASES = ["aa", "AA", "Aa", "aA"]
START_TOKEN = "<s>"
END_TOKEN = "</s>"

def casing(word):
    if len(word) == 0: return word

    # all lowercase
    if word.islower(): return "aa"
    # all uppercase
    elif word.isupper(): return "AA"
    # starts with capital
    elif word[0].isupper(): return "Aa"
    # has non-initial capital
    else: return "aA"

def normalize(word):
    """
    Normalize words that are numbers or have casing.
    """
    if word.isdigit(): return NUM
    else: return word.lower()

def featurize(embeddings, word):
    """
    Featurize a word given embeddings.
    """
    case = casing(word)
    word = normalize(word)
    case_mapping = {c: one_hot(FDIM, i) for i, c in enumerate(CASES)}
    wv = embeddings.get(word, embeddings[UNK])
    fv = case_mapping[case]
    return np.hstack((wv, fv))
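
# Illustrative example (not part of the original file): with a toy embedding
# dict such as {"hello": np.zeros(EMBED_SIZE), UNK: np.zeros(EMBED_SIZE)},
# featurize(embeddings, "Hello") normalizes "Hello" to "hello", looks up its
# EMBED_SIZE-dim vector and appends the one-hot casing feature for "Aa",
# yielding a vector of length EMBED_SIZE + FDIM (54 here), assuming util.one_hot
# returns a length-FDIM vector.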

def evaluate(model, X, Y):
    cm = ConfusionMatrix(labels=LBLS)
    Y_ = model.predict(X)
    for i in range(Y.shape[0]):
        y, y_ = np.argmax(Y[i]), np.argmax(Y_[i])
        cm.update(y, y_)
    cm.print_table()
    return cm.summary()

class ModelHelper(object):
    """
    This helper takes care of preprocessing data, constructing embeddings, etc.
    """
    def __init__(self, tok2id, max_length):
        self.tok2id = tok2id
        self.START = [tok2id[START_TOKEN], tok2id[P_CASE + "aa"]]
        self.END = [tok2id[END_TOKEN], tok2id[P_CASE + "aa"]]
        self.max_length = max_length

    def vectorize_example(self, sentence, labels=None):
        sentence_ = [[self.tok2id.get(normalize(word), self.tok2id[UNK]), self.tok2id[P_CASE + casing(word)]] for word in sentence]
        if labels:
            labels_ = [LBLS.index(l) for l in labels]
            return sentence_, labels_
        else:
            return sentence_, [LBLS[-1] for _ in sentence]

    def vectorize(self, data):
        return [self.vectorize_example(sentence, labels) for sentence, labels in data]

    @classmethod
    def build(cls, data):
        # Preprocess data to construct an embedding.
        # Reserve 0 for the special NIL token.
        tok2id = build_dict((normalize(word) for sentence, _ in data for word in sentence), offset=1, max_words=10000)
        tok2id.update(build_dict([P_CASE + c for c in CASES], offset=len(tok2id)))
        tok2id.update(build_dict([START_TOKEN, END_TOKEN, UNK], offset=len(tok2id)))
        assert sorted(tok2id.items(), key=lambda t: t[1])[0][1] == 1
        logger.info("Built dictionary for %d features.", len(tok2id))

        max_length = max(len(sentence) for sentence, _ in data)

        return cls(tok2id, max_length)

    def save(self, path):
        # Make sure the directory exists.
        if not os.path.exists(path):
            os.makedirs(path)
        # Save the tok2id map; pickle requires binary mode under Python 3.
        with open(os.path.join(path, "features.pkl"), "wb") as f:
            pickle.dump([self.tok2id, self.max_length], f)

    @classmethod
    def load(cls, path):
        # Make sure the directory and the saved features exist.
        assert os.path.exists(path) and os.path.exists(os.path.join(path, "features.pkl"))
        # Load the tok2id map.
        with open(os.path.join(path, "features.pkl"), "rb") as f:
            tok2id, max_length = pickle.load(f)
        return cls(tok2id, max_length)
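
# Illustrative usage (not in the original file): given toy CoNLL-style data such
# as [(["John", "lives", "in", "Paris"], ["PER", "O", "O", "LOC"])],
# ModelHelper.build(data) assigns ids to the normalized tokens plus the CASE:*,
# <s>, </s> and UUUNKKK markers, and vectorize_example(sentence, labels) maps
# each word to a [word_id, casing_id] pair and each label to its index in LBLS.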

def load_and_preprocess_data(args):
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))
    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    helper = ModelHelper.build(train)

    # now process all the input data.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)

    return helper, train_data, dev_data, train, dev

def load_embeddings(args, helper):
    embeddings = np.array(np.random.randn(len(helper.tok2id) + 1, EMBED_SIZE), dtype=np.float32)
    embeddings[0] = 0.
    for word, vec in load_word_vector_mapping(args.vocab, args.vectors).items():
        word = normalize(word)
        if word in helper.tok2id:
            embeddings[helper.tok2id[word]] = vec
    logger.info("Initialized embeddings.")

    return embeddings

def build_dict(words, max_words=None, offset=0):
    cnt = Counter(words)
    if max_words:
        words = cnt.most_common(max_words)
    else:
        words = cnt.most_common()
    return {word: offset + i for i, (word, _) in enumerate(words)}
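
# For illustration (not in the original file): given the definition above,
# build_dict(["the", "cat", "the"], offset=1) returns {"the": 1, "cat": 2},
# since tokens are ranked by frequency and ids start at the given offset.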


def get_chunks(seq, default=LBLS.index(NONE)):
    """Breaks an input such as 4 4 4 0 0 4 0 into chunks (0, 3, 5), (0, 6, 7)."""
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk.
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None
        # End of a chunk + start of a chunk!
        elif tok != default:
            if chunk_type is None:
                chunk_type, chunk_start = tok, i
            elif tok != chunk_type:
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok, i
        else:
            pass
    # End condition.
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)
    return chunks

def test_get_chunks():
    assert get_chunks([4, 4, 4, 0, 0, 4, 1, 2, 4, 3], 4) == [(0, 3, 5), (1, 6, 7), (2, 7, 8), (3, 9, 10)]
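
For reference, a minimal sketch (not part of this commit) of how get_chunks can be used to score predictions against gold labels at the entity level; the helper name evaluate_chunks and the toy label sequences below are illustrative only:

def evaluate_chunks(gold_seq, pred_seq):
    # Convert label-index sequences into (type, start, end) spans and count exact matches.
    gold = set(get_chunks(gold_seq))
    pred = set(get_chunks(pred_seq))
    correct = len(gold & pred)
    precision = correct / len(pred) if pred else 0.0
    recall = correct / len(gold) if gold else 0.0
    return precision, recall

# e.g. evaluate_chunks([0, 0, 4, 1], [0, 4, 4, 1]) == (0.5, 0.5): the ORG span
# (1, 3, 4) matches, while the predicted PER span (0, 0, 1) misses the gold
# span (0, 0, 2).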
Binary file added data_util.pyc
21 changes: 21 additions & 0 deletions defs.py
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Common definitions for NER
"""

from util import one_hot

LBLS = [
    "PER",
    "ORG",
    "LOC",
    "MISC",
    "O",
]
NONE = "O"
LMAP = {k: one_hot(5,i) for i, k in enumerate(LBLS)}
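# e.g. LMAP["PER"] is one_hot(5, 0), an indicator vector for the PER class
# (assuming util.one_hot(n, i) returns a length-n vector with a one at index i).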
NUM = "NNNUMMM"
UNK = "UUUNKKK"

EMBED_SIZE = 50
Binary file added defs.pyc
18 changes: 18 additions & 0 deletions make_submission.sh
@@ -0,0 +1,18 @@
#!/bin/bash
echo "Creating submission file."
PYFILES=`ls *.py`
CONLL_FILES="window_predictions.conll rnn_predictions.conll gru_predictions.conll"

CONLL=
for conll in $CONLL_FILES; do
if [ -e $conll ]; then
CONLL="$CONLL $conll"
else
echo "WARNING: Could not find $conll in this directory. If you
have generated it, please move it from the appropriate folder in
results/"
fi
done;

zip submissions.zip $PYFILES $CONLL
echo "Done."
110 changes: 110 additions & 0 deletions model.py
@@ -0,0 +1,110 @@
class Model(object):
    """Abstracts a TensorFlow graph for a learning task.
    We use various Model classes as the usual abstraction to encapsulate TensorFlow
    computational graphs. Each algorithm you will construct in this homework will
    inherit from a Model object.
    """
    def add_placeholders(self):
        """Adds placeholder variables to the TensorFlow computational graph.
        TensorFlow uses placeholder variables to represent locations in a
        computational graph where data is inserted. These placeholders are used as
        inputs by the rest of the model building and will be fed data during
        training.
        See for more information:
        https://www.tensorflow.org/versions/r0.7/api_docs/python/io_ops.html#placeholders
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def create_feed_dict(self, inputs_batch, labels_batch=None):
        """Creates the feed_dict for one step of training.
        A feed_dict takes the form of:
        feed_dict = {
            <placeholder>: <tensor of values to be passed for placeholder>,
            ....
        }
        If labels_batch is None, then no labels are added to feed_dict.
        Hint: The keys for the feed_dict should be a subset of the placeholder
        tensors created in add_placeholders.
        Args:
            inputs_batch: A batch of input data.
            labels_batch: A batch of label data.
        Returns:
            feed_dict: The feed dictionary mapping from placeholders to values.
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_prediction_op(self):
        """Implements the core of the model that transforms a batch of input data into predictions.
        Returns:
            pred: A tensor of shape (batch_size, n_classes)
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_loss_op(self, pred):
        """Adds Ops for the loss function to the computational graph.
        Args:
            pred: A tensor of shape (batch_size, n_classes)
        Returns:
            loss: A 0-d tensor (scalar) output
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_training_op(self, loss):
        """Sets up the training Ops.
        Creates an optimizer and applies the gradients to all trainable variables.
        The Op returned by this function is what must be passed to
        sess.run() to train the model. See
        https://www.tensorflow.org/versions/r0.7/api_docs/python/train.html#Optimizer
        for more information.
        Args:
            loss: Loss tensor (a scalar).
        Returns:
            train_op: The Op for training.
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def train_on_batch(self, sess, inputs_batch, labels_batch):
        """Perform one step of gradient descent on the provided batch of data.
        Args:
            sess: tf.Session()
            inputs_batch: np.ndarray of shape (n_samples, n_features)
            labels_batch: np.ndarray of shape (n_samples, n_classes)
        Returns:
            loss: loss over the batch (a scalar)
        """
        feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch)
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
        return loss

    def predict_on_batch(self, sess, inputs_batch):
        """Make predictions for the provided batch of data.
        Args:
            sess: tf.Session()
            inputs_batch: np.ndarray of shape (n_samples, n_features)
        Returns:
            predictions: np.ndarray of shape (n_samples, n_classes)
        """
        feed = self.create_feed_dict(inputs_batch)
        predictions = sess.run(self.pred, feed_dict=feed)
        return predictions

    def build(self):
        self.add_placeholders()
        self.pred = self.add_prediction_op()
        self.loss = self.add_loss_op(self.pred)
        self.train_op = self.add_training_op(self.loss)
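
To make the interface above concrete, here is a minimal, hypothetical subclass (not part of this commit) showing how the abstract methods fit together for a plain softmax classifier, assuming the TensorFlow 1.x API used in this era; the class name SoftmaxModel and the arguments n_features, n_classes and lr are illustrative:

import tensorflow as tf

class SoftmaxModel(Model):  # Model as defined above
    def __init__(self, n_features, n_classes, lr=0.01):
        self.n_features = n_features
        self.n_classes = n_classes
        self.lr = lr
        self.build()

    def add_placeholders(self):
        # Inputs and labels are fed in at train/predict time via create_feed_dict.
        self.input_placeholder = tf.placeholder(tf.float32, [None, self.n_features])
        self.labels_placeholder = tf.placeholder(tf.float32, [None, self.n_classes])

    def create_feed_dict(self, inputs_batch, labels_batch=None):
        feed_dict = {self.input_placeholder: inputs_batch}
        if labels_batch is not None:
            feed_dict[self.labels_placeholder] = labels_batch
        return feed_dict

    def add_prediction_op(self):
        # A single affine layer followed by a softmax over the classes.
        W = tf.Variable(tf.zeros([self.n_features, self.n_classes]))
        b = tf.Variable(tf.zeros([self.n_classes]))
        return tf.nn.softmax(tf.matmul(self.input_placeholder, W) + b)

    def add_loss_op(self, pred):
        # Cross-entropy loss averaged over the batch.
        return tf.reduce_mean(-tf.reduce_sum(self.labels_placeholder * tf.log(pred), axis=1))

    def add_training_op(self, loss):
        return tf.train.GradientDescentOptimizer(self.lr).minimize(loss)

Training would then call train_on_batch(sess, inputs, labels) inside a tf.Session() after running tf.global_variables_initializer().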
Binary file added model.pyc
(The remaining files in this commit are not rendered here.)