Commit 3bc3398 (0 parents). Showing 40 changed files with 727,226 additions and 0 deletions.
(Several binary files and large diffs in this commit are not rendered in this view.)
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Utility functions to process data.
"""
import os
import pickle
import logging
from collections import Counter

import numpy as np
from util import read_conll, one_hot, window_iterator, ConfusionMatrix, load_word_vector_mapping
from defs import LBLS, NONE, LMAP, NUM, UNK, EMBED_SIZE

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)


FDIM = 4
P_CASE = "CASE:"
CASES = ["aa", "AA", "Aa", "aA"]
START_TOKEN = "<s>"
END_TOKEN = "</s>"

def casing(word):
    if len(word) == 0: return word

    # all lowercase
    if word.islower(): return "aa"
    # all uppercase
    elif word.isupper(): return "AA"
    # starts with capital
    elif word[0].isupper(): return "Aa"
    # has non-initial capital
    else: return "aA"

def normalize(word):
    """
    Normalize words that are numbers or have casing.
    """
    if word.isdigit(): return NUM
    else: return word.lower()

def featurize(embeddings, word):
    """
    Featurize a word given embeddings.
    """
    case = casing(word)
    word = normalize(word)
    case_mapping = {c: one_hot(FDIM, i) for i, c in enumerate(CASES)}
    wv = embeddings.get(word, embeddings[UNK])
    fv = case_mapping[case]
    return np.hstack((wv, fv))
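
# Illustrative note (not part of the original file): with the 50-dimensional
# vectors used here (EMBED_SIZE = 50 in defs) and FDIM = 4, featurize()
# concatenates the word vector with a 4-d one-hot case vector, so each token
# becomes a 54-d feature vector. For example, featurize(embeddings, "Paris")
# looks up embeddings["paris"] and appends the one-hot for the "Aa" case.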

def evaluate(model, X, Y):
    cm = ConfusionMatrix(labels=LBLS)
    Y_ = model.predict(X)
    for i in range(Y.shape[0]):
        y, y_ = np.argmax(Y[i]), np.argmax(Y_[i])
        cm.update(y, y_)
    cm.print_table()
    return cm.summary()

class ModelHelper(object):
    """
    This helper takes care of preprocessing data, constructing embeddings, etc.
    """
    def __init__(self, tok2id, max_length):
        self.tok2id = tok2id
        self.START = [tok2id[START_TOKEN], tok2id[P_CASE + "aa"]]
        self.END = [tok2id[END_TOKEN], tok2id[P_CASE + "aa"]]
        self.max_length = max_length

    def vectorize_example(self, sentence, labels=None):
        sentence_ = [[self.tok2id.get(normalize(word), self.tok2id[UNK]), self.tok2id[P_CASE + casing(word)]] for word in sentence]
        if labels:
            labels_ = [LBLS.index(l) for l in labels]
            return sentence_, labels_
        else:
            return sentence_, [LBLS[-1] for _ in sentence]

    def vectorize(self, data):
        return [self.vectorize_example(sentence, labels) for sentence, labels in data]

    @classmethod
    def build(cls, data):
        # Preprocess data to construct an embedding.
        # Reserve 0 for the special NIL token.
        tok2id = build_dict((normalize(word) for sentence, _ in data for word in sentence), offset=1, max_words=10000)
        tok2id.update(build_dict([P_CASE + c for c in CASES], offset=len(tok2id)))
        tok2id.update(build_dict([START_TOKEN, END_TOKEN, UNK], offset=len(tok2id)))
        assert sorted(tok2id.items(), key=lambda t: t[1])[0][1] == 1
        logger.info("Built dictionary for %d features.", len(tok2id))

        max_length = max(len(sentence) for sentence, _ in data)

        return cls(tok2id, max_length)

    def save(self, path):
        # Make sure the directory exists.
        if not os.path.exists(path):
            os.makedirs(path)
        # Save the tok2id map (pickle requires a binary-mode file under Python 3).
        with open(os.path.join(path, "features.pkl"), "wb") as f:
            pickle.dump([self.tok2id, self.max_length], f)

    @classmethod
    def load(cls, path):
        # Make sure the directory exists.
        assert os.path.exists(path) and os.path.exists(os.path.join(path, "features.pkl"))
        # Load the tok2id map.
        with open(os.path.join(path, "features.pkl"), "rb") as f:
            tok2id, max_length = pickle.load(f)
        return cls(tok2id, max_length)

def load_and_preprocess_data(args):
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))
    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    helper = ModelHelper.build(train)

    # now process all the input data.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)

    return helper, train_data, dev_data, train, dev

def load_embeddings(args, helper):
    embeddings = np.array(np.random.randn(len(helper.tok2id) + 1, EMBED_SIZE), dtype=np.float32)
    embeddings[0] = 0.
    for word, vec in load_word_vector_mapping(args.vocab, args.vectors).items():
        word = normalize(word)
        if word in helper.tok2id:
            embeddings[helper.tok2id[word]] = vec
    logger.info("Initialized embeddings.")

    return embeddings
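
# Illustrative note (not part of the original file): the returned matrix has
# shape (len(helper.tok2id) + 1, EMBED_SIZE), i.e. (vocabulary size + 1, 50).
# Row 0 stays all zeros because id 0 is reserved for the NIL/padding token;
# rows for words found in the pretrained mapping are overwritten with their vectors.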

def build_dict(words, max_words=None, offset=0):
    cnt = Counter(words)
    if max_words:
        words = cnt.most_common(max_words)
    else:
        words = cnt.most_common()
    return {word: offset + i for i, (word, _) in enumerate(words)}
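
# Illustrative example (hypothetical input, not part of the original file):
#   build_dict(["the", "cat", "the"], offset=1) == {"the": 1, "cat": 2}
# i.e. ids are assigned in order of descending frequency, starting at `offset`.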


def get_chunks(seq, default=LBLS.index(NONE)):
    """Breaks input of 4 4 4 0 0 4 0 -> (0, 3, 5), (0, 6, 7)"""
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk.
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None
        # Start of a chunk, or end of a chunk + start of a new one.
        elif tok != default:
            if chunk_type is None:
                chunk_type, chunk_start = tok, i
            elif tok != chunk_type:
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok, i
        else:
            pass
    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)
    return chunks

def test_get_chunks():
    assert get_chunks([4, 4, 4, 0, 0, 4, 1, 2, 4, 3], 4) == [(0, 3, 5), (1, 6, 7), (2, 7, 8), (3, 9, 10)]
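
For illustration (not part of the commit), here is how get_chunks groups a label sequence into (type, start, end) spans, using LBLS from the defs module imported above, where LBLS.index("O") == 4, 0 is "PER" and 1 is "ORG":

    # label sequence "PER O O ORG ORG O", written as label indices
    get_chunks([0, 4, 4, 1, 1, 4], default=4)
    # -> [(0, 0, 1), (1, 3, 5)]: a PER chunk covering token 0 and an ORG chunk
    #    covering tokens 3-4 (the end index is exclusive)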
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Common definitions for NER
"""

from util import one_hot

LBLS = [
    "PER",
    "ORG",
    "LOC",
    "MISC",
    "O",
    ]
NONE = "O"
LMAP = {k: one_hot(5, i) for i, k in enumerate(LBLS)}
NUM = "NNNUMMM"
UNK = "UUUNKKK"

EMBED_SIZE = 50
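
As an illustration (not part of the commit), assuming util.one_hot(n, i) returns a length-n one-hot vector with a 1 at position i (util is not shown in this view), LMAP maps each label to its one-hot encoding:

    LMAP["PER"]   # e.g. [1, 0, 0, 0, 0]
    LMAP["O"]     # e.g. [0, 0, 0, 0, 1]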
@@ -0,0 +1,18 @@
#!/bin/bash
echo "Creating submission file."
PYFILES=`ls *.py`
CONLL_FILES="window_predictions.conll rnn_predictions.conll gru_predictions.conll"

CONLL=
for conll in $CONLL_FILES; do
    if [ -e $conll ]; then
        CONLL="$CONLL $conll"
    else
        echo "WARNING: Could not find $conll in this directory. If you
have generated it, please move it from the appropriate folder in
results/"
    fi
done;

zip submissions.zip $PYFILES $CONLL
echo "Done."
@@ -0,0 +1,110 @@
class Model(object):
    """Abstracts a Tensorflow graph for a learning task.
    We use various Model classes as usual abstractions to encapsulate tensorflow
    computational graphs. Each algorithm you will construct in this homework will
    inherit from a Model object.
    """
    def add_placeholders(self):
        """Adds placeholder variables to the tensorflow computational graph.
        Tensorflow uses placeholder variables to represent locations in a
        computational graph where data is inserted. These placeholders are used as
        inputs by the rest of the model building and will be fed data during
        training.
        See for more information:
        https://www.tensorflow.org/versions/r0.7/api_docs/python/io_ops.html#placeholders
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def create_feed_dict(self, inputs_batch, labels_batch=None):
        """Creates the feed_dict for one step of training.
        A feed_dict takes the form of:
        feed_dict = {
            <placeholder>: <tensor of values to be passed for placeholder>,
            ....
        }
        If labels_batch is None, then no labels are added to feed_dict.
        Hint: The keys for the feed_dict should be a subset of the placeholder
        tensors created in add_placeholders.
        Args:
            inputs_batch: A batch of input data.
            labels_batch: A batch of label data.
        Returns:
            feed_dict: The feed dictionary mapping from placeholders to values.
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_prediction_op(self):
        """Implements the core of the model that transforms a batch of input data into predictions.
        Returns:
            pred: A tensor of shape (batch_size, n_classes)
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_loss_op(self, pred):
        """Adds Ops for the loss function to the computational graph.
        Args:
            pred: A tensor of shape (batch_size, n_classes)
        Returns:
            loss: A 0-d tensor (scalar) output
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_training_op(self, loss):
        """Sets up the training Ops.
        Creates an optimizer and applies the gradients to all trainable variables.
        The Op returned by this function is what must be passed to sess.run()
        to train the model. See
        https://www.tensorflow.org/versions/r0.7/api_docs/python/train.html#Optimizer
        for more information.
        Args:
            loss: Loss tensor (a scalar).
        Returns:
            train_op: The Op for training.
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def train_on_batch(self, sess, inputs_batch, labels_batch):
        """Perform one step of gradient descent on the provided batch of data.
        Args:
            sess: tf.Session()
            inputs_batch: np.ndarray of shape (n_samples, n_features)
            labels_batch: np.ndarray of shape (n_samples, n_classes)
        Returns:
            loss: loss over the batch (a scalar)
        """
        feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch)
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
        return loss

    def predict_on_batch(self, sess, inputs_batch):
        """Make predictions for the provided batch of data.
        Args:
            sess: tf.Session()
            inputs_batch: np.ndarray of shape (n_samples, n_features)
        Returns:
            predictions: np.ndarray of shape (n_samples, n_classes)
        """
        feed = self.create_feed_dict(inputs_batch)
        predictions = sess.run(self.pred, feed_dict=feed)
        return predictions

    def build(self):
        self.add_placeholders()
        self.pred = self.add_prediction_op()
        self.loss = self.add_loss_op(self.pred)
        self.train_op = self.add_training_op(self.loss)
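
To make the interface concrete, here is a minimal sketch of a subclass (not part of the commit). It assumes a TF1-style API, as in the r0.7 docs referenced above; the class name SoftmaxModel and the sizes n_features / n_classes are hypothetical and chosen only for illustration:

    import tensorflow as tf

    class SoftmaxModel(Model):
        """Hypothetical example: a single softmax layer over the input features."""
        def __init__(self, n_features, n_classes, lr=0.1):
            self.n_features, self.n_classes, self.lr = n_features, n_classes, lr
            self.build()

        def add_placeholders(self):
            # Inputs and labels are fed at run time through these placeholders.
            self.input_placeholder = tf.placeholder(tf.float32, shape=(None, self.n_features))
            self.labels_placeholder = tf.placeholder(tf.float32, shape=(None, self.n_classes))

        def create_feed_dict(self, inputs_batch, labels_batch=None):
            feed_dict = {self.input_placeholder: inputs_batch}
            if labels_batch is not None:
                feed_dict[self.labels_placeholder] = labels_batch
            return feed_dict

        def add_prediction_op(self):
            # A single affine layer; returns logits of shape (batch_size, n_classes).
            W = tf.Variable(tf.zeros((self.n_features, self.n_classes)))
            b = tf.Variable(tf.zeros((self.n_classes,)))
            return tf.matmul(self.input_placeholder, W) + b

        def add_loss_op(self, pred):
            # Mean cross-entropy over the batch (a scalar).
            return tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=self.labels_placeholder))

        def add_training_op(self, loss):
            return tf.train.GradientDescentOptimizer(self.lr).minimize(loss)

Training would then repeatedly call model.train_on_batch(sess, inputs_batch, labels_batch) inside a tf.Session(), exactly as the base class defines it.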