Commit 3bc3398 (0 parents). Showing 40 changed files with 727,226 additions and 0 deletions.
(Several binary files and large diffs in this commit are not rendered in this view.)
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Utility functions to process data.
"""
import os
import pickle
import logging
from collections import Counter

import numpy as np
from util import read_conll, one_hot, window_iterator, ConfusionMatrix, load_word_vector_mapping
from defs import LBLS, NONE, LMAP, NUM, UNK, EMBED_SIZE

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)


FDIM = 4
P_CASE = "CASE:"
CASES = ["aa", "AA", "Aa", "aA"]
START_TOKEN = "<s>"
END_TOKEN = "</s>"

def casing(word):
    if len(word) == 0: return word

    # all lowercase
    if word.islower(): return "aa"
    # all uppercase
    elif word.isupper(): return "AA"
    # starts with capital
    elif word[0].isupper(): return "Aa"
    # has non-initial capital
    else: return "aA"

def normalize(word):
    """
    Normalize words that are numbers or have casing.
    """
    if word.isdigit(): return NUM
    else: return word.lower()

def featurize(embeddings, word):
    """
    Featurize a word given embeddings.
    """
    case = casing(word)
    word = normalize(word)
    case_mapping = {c: one_hot(FDIM, i) for i, c in enumerate(CASES)}
    wv = embeddings.get(word, embeddings[UNK])
    fv = case_mapping[case]
    return np.hstack((wv, fv))
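
# Illustrative note (not part of the original file): with the 50-dimensional
# vectors used here (EMBED_SIZE = 50 in defs) and FDIM = 4, featurize()
# concatenates the word vector with a 4-d one-hot case vector, so each token
# becomes a 54-d feature vector. For example, featurize(embeddings, "Paris")
# looks up embeddings["paris"] and appends the one-hot for the "Aa" case.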

def evaluate(model, X, Y):
    cm = ConfusionMatrix(labels=LBLS)
    Y_ = model.predict(X)
    for i in range(Y.shape[0]):
        y, y_ = np.argmax(Y[i]), np.argmax(Y_[i])
        cm.update(y, y_)
    cm.print_table()
    return cm.summary()

class ModelHelper(object):
    """
    This helper takes care of preprocessing data, constructing embeddings, etc.
    """
    def __init__(self, tok2id, max_length):
        self.tok2id = tok2id
        self.START = [tok2id[START_TOKEN], tok2id[P_CASE + "aa"]]
        self.END = [tok2id[END_TOKEN], tok2id[P_CASE + "aa"]]
        self.max_length = max_length

    def vectorize_example(self, sentence, labels=None):
        sentence_ = [[self.tok2id.get(normalize(word), self.tok2id[UNK]), self.tok2id[P_CASE + casing(word)]] for word in sentence]
        if labels:
            labels_ = [LBLS.index(l) for l in labels]
            return sentence_, labels_
        else:
            return sentence_, [LBLS[-1] for _ in sentence]

    def vectorize(self, data):
        return [self.vectorize_example(sentence, labels) for sentence, labels in data]

    @classmethod
    def build(cls, data):
        # Preprocess data to construct an embedding.
        # Reserve 0 for the special NIL token.
        tok2id = build_dict((normalize(word) for sentence, _ in data for word in sentence), offset=1, max_words=10000)
        tok2id.update(build_dict([P_CASE + c for c in CASES], offset=len(tok2id)))
        tok2id.update(build_dict([START_TOKEN, END_TOKEN, UNK], offset=len(tok2id)))
        assert sorted(tok2id.items(), key=lambda t: t[1])[0][1] == 1
        logger.info("Built dictionary for %d features.", len(tok2id))

        max_length = max(len(sentence) for sentence, _ in data)

        return cls(tok2id, max_length)

    def save(self, path):
        # Make sure the directory exists.
        if not os.path.exists(path):
            os.makedirs(path)
        # Save the tok2id map (pickle requires a binary-mode file under Python 3).
        with open(os.path.join(path, "features.pkl"), "wb") as f:
            pickle.dump([self.tok2id, self.max_length], f)

    @classmethod
    def load(cls, path):
        # Make sure the directory exists.
        assert os.path.exists(path) and os.path.exists(os.path.join(path, "features.pkl"))
        # Load the tok2id map.
        with open(os.path.join(path, "features.pkl"), "rb") as f:
            tok2id, max_length = pickle.load(f)
        return cls(tok2id, max_length)

def load_and_preprocess_data(args):
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))
    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    helper = ModelHelper.build(train)

    # now process all the input data.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)

    return helper, train_data, dev_data, train, dev

def load_embeddings(args, helper):
    embeddings = np.array(np.random.randn(len(helper.tok2id) + 1, EMBED_SIZE), dtype=np.float32)
    embeddings[0] = 0.
    for word, vec in load_word_vector_mapping(args.vocab, args.vectors).items():
        word = normalize(word)
        if word in helper.tok2id:
            embeddings[helper.tok2id[word]] = vec
    logger.info("Initialized embeddings.")

    return embeddings
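
# Illustrative note (not part of the original file): the returned matrix has
# shape (len(helper.tok2id) + 1, EMBED_SIZE), i.e. (vocabulary size + 1, 50).
# Row 0 stays all zeros because id 0 is reserved for the NIL/padding token;
# rows for words found in the pretrained mapping are overwritten with their vectors.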

def build_dict(words, max_words=None, offset=0):
    cnt = Counter(words)
    if max_words:
        words = cnt.most_common(max_words)
    else:
        words = cnt.most_common()
    return {word: offset + i for i, (word, _) in enumerate(words)}
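
# Illustrative example (hypothetical input, not part of the original file):
#   build_dict(["the", "cat", "the"], offset=1) == {"the": 1, "cat": 2}
# i.e. ids are assigned in order of descending frequency, starting at `offset`.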


def get_chunks(seq, default=LBLS.index(NONE)):
    """Breaks input of 4 4 4 0 0 4 0 -> (0, 3, 5), (0, 6, 7)"""
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk.
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None
        # Start of a chunk, or end of a chunk + start of a new one.
        elif tok != default:
            if chunk_type is None:
                chunk_type, chunk_start = tok, i
            elif tok != chunk_type:
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok, i
        else:
            pass
    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)
    return chunks

def test_get_chunks():
    assert get_chunks([4, 4, 4, 0, 0, 4, 1, 2, 4, 3], 4) == [(0, 3, 5), (1, 6, 7), (2, 7, 8), (3, 9, 10)]
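
For illustration (not part of the commit), here is how get_chunks groups a label sequence into (type, start, end) spans, using LBLS from the defs module imported above, where LBLS.index("O") == 4, 0 is "PER" and 1 is "ORG":

    # label sequence "PER O O ORG ORG O", written as label indices
    get_chunks([0, 4, 4, 1, 1, 4], default=4)
    # -> [(0, 0, 1), (1, 3, 5)]: a PER chunk covering token 0 and an ORG chunk
    #    covering tokens 3-4 (the end index is exclusive)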
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Common definitions for NER
"""

from util import one_hot

LBLS = [
    "PER",
    "ORG",
    "LOC",
    "MISC",
    "O",
    ]
NONE = "O"
LMAP = {k: one_hot(5, i) for i, k in enumerate(LBLS)}
NUM = "NNNUMMM"
UNK = "UUUNKKK"

EMBED_SIZE = 50
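
As an illustration (not part of the commit), assuming util.one_hot(n, i) returns a length-n one-hot vector with a 1 at position i (util is not shown in this view), LMAP maps each label to its one-hot encoding:

    LMAP["PER"]   # e.g. [1, 0, 0, 0, 0]
    LMAP["O"]     # e.g. [0, 0, 0, 0, 1]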
@@ -0,0 +1,18 @@
#!/bin/bash
echo "Creating submission file."
PYFILES=`ls *.py`
CONLL_FILES="window_predictions.conll rnn_predictions.conll gru_predictions.conll"

CONLL=
for conll in $CONLL_FILES; do
    if [ -e $conll ]; then
        CONLL="$CONLL $conll"
    else
        echo "WARNING: Could not find $conll in this directory. If you
have generated it, please move it from the appropriate folder in
results/"
    fi
done;

zip submissions.zip $PYFILES $CONLL
echo "Done."
@@ -0,0 +1,110 @@
class Model(object):
    """Abstracts a Tensorflow graph for a learning task.
    We use various Model classes as usual abstractions to encapsulate tensorflow
    computational graphs. Each algorithm you will construct in this homework will
    inherit from a Model object.
    """
    def add_placeholders(self):
        """Adds placeholder variables to the tensorflow computational graph.
        Tensorflow uses placeholder variables to represent locations in a
        computational graph where data is inserted. These placeholders are used as
        inputs by the rest of the model building and will be fed data during
        training.
        See for more information:
        https://www.tensorflow.org/versions/r0.7/api_docs/python/io_ops.html#placeholders
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def create_feed_dict(self, inputs_batch, labels_batch=None):
        """Creates the feed_dict for one step of training.
        A feed_dict takes the form of:
        feed_dict = {
            <placeholder>: <tensor of values to be passed for placeholder>,
            ....
        }
        If labels_batch is None, then no labels are added to feed_dict.
        Hint: The keys for the feed_dict should be a subset of the placeholder
        tensors created in add_placeholders.
        Args:
            inputs_batch: A batch of input data.
            labels_batch: A batch of label data.
        Returns:
            feed_dict: The feed dictionary mapping from placeholders to values.
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_prediction_op(self):
        """Implements the core of the model that transforms a batch of input data into predictions.
        Returns:
            pred: A tensor of shape (batch_size, n_classes)
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_loss_op(self, pred):
        """Adds Ops for the loss function to the computational graph.
        Args:
            pred: A tensor of shape (batch_size, n_classes)
        Returns:
            loss: A 0-d tensor (scalar) output
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def add_training_op(self, loss):
        """Sets up the training Ops.
        Creates an optimizer and applies the gradients to all trainable variables.
        The Op returned by this function is what must be passed to sess.run()
        to train the model. See
        https://www.tensorflow.org/versions/r0.7/api_docs/python/train.html#Optimizer
        for more information.
        Args:
            loss: Loss tensor (a scalar).
        Returns:
            train_op: The Op for training.
        """
        raise NotImplementedError("Each Model must re-implement this method.")

    def train_on_batch(self, sess, inputs_batch, labels_batch):
        """Perform one step of gradient descent on the provided batch of data.
        Args:
            sess: tf.Session()
            inputs_batch: np.ndarray of shape (n_samples, n_features)
            labels_batch: np.ndarray of shape (n_samples, n_classes)
        Returns:
            loss: loss over the batch (a scalar)
        """
        feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch)
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
        return loss

    def predict_on_batch(self, sess, inputs_batch):
        """Make predictions for the provided batch of data.
        Args:
            sess: tf.Session()
            inputs_batch: np.ndarray of shape (n_samples, n_features)
        Returns:
            predictions: np.ndarray of shape (n_samples, n_classes)
        """
        feed = self.create_feed_dict(inputs_batch)
        predictions = sess.run(self.pred, feed_dict=feed)
        return predictions

    def build(self):
        self.add_placeholders()
        self.pred = self.add_prediction_op()
        self.loss = self.add_loss_op(self.pred)
        self.train_op = self.add_training_op(self.loss)
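
To make the interface concrete, here is a minimal sketch of a subclass (not part of the commit). It assumes a TF1-style API, as in the r0.7 docs referenced above; the class name SoftmaxModel and the sizes n_features / n_classes are hypothetical and chosen only for illustration:

    import tensorflow as tf

    class SoftmaxModel(Model):
        """Hypothetical example: a single softmax layer over the input features."""
        def __init__(self, n_features, n_classes, lr=0.1):
            self.n_features, self.n_classes, self.lr = n_features, n_classes, lr
            self.build()

        def add_placeholders(self):
            # Inputs and labels are fed at run time through these placeholders.
            self.input_placeholder = tf.placeholder(tf.float32, shape=(None, self.n_features))
            self.labels_placeholder = tf.placeholder(tf.float32, shape=(None, self.n_classes))

        def create_feed_dict(self, inputs_batch, labels_batch=None):
            feed_dict = {self.input_placeholder: inputs_batch}
            if labels_batch is not None:
                feed_dict[self.labels_placeholder] = labels_batch
            return feed_dict

        def add_prediction_op(self):
            # A single affine layer; returns logits of shape (batch_size, n_classes).
            W = tf.Variable(tf.zeros((self.n_features, self.n_classes)))
            b = tf.Variable(tf.zeros((self.n_classes,)))
            return tf.matmul(self.input_placeholder, W) + b

        def add_loss_op(self, pred):
            # Mean cross-entropy over the batch (a scalar).
            return tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=self.labels_placeholder))

        def add_training_op(self, loss):
            return tf.train.GradientDescentOptimizer(self.lr).minimize(loss)

Training would then repeatedly call model.train_on_batch(sess, inputs_batch, labels_batch) inside a tf.Session(), exactly as the base class defines it.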