research/lexnet_nc/lexnet_common.py

# Copyright 2017, 2018 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Common stuff used with LexNET."""
# pylint: disable=bad-whitespace

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
from sklearn import metrics
import tensorflow as tf

# Part-of-speech tags that may appear on path nodes. A tag's position in this
# list is its integer ID, so the order must not change.
POSTAGS = [
    'PAD', 'VERB', 'CONJ', 'NOUN', 'PUNCT', 'ADP', 'ADJ',
    'DET', 'ADV', 'PART', 'NUM', 'X', 'INTJ', 'SYM',
]

# Reverse lookup: tag string -> integer ID (its index in POSTAGS).
POSTAG_TO_ID = dict(zip(POSTAGS, range(len(POSTAGS))))

# Dependency labels that may appear on path edges. A label's position in this
# list is its integer ID, so the order must not change.
DEPLABELS = [
    'PAD', 'UNK', 'ROOT', 'abbrev', 'acomp', 'advcl',
    'advmod', 'agent', 'amod', 'appos', 'attr', 'aux',
    'auxpass', 'cc', 'ccomp', 'complm', 'conj', 'cop',
    'csubj', 'csubjpass', 'dep', 'det', 'dobj', 'expl',
    'infmod', 'iobj', 'mark', 'mwe', 'nc', 'neg',
    'nn', 'npadvmod', 'nsubj', 'nsubjpass', 'num', 'number',
    'p', 'parataxis', 'partmod', 'pcomp', 'pobj', 'poss',
    'preconj', 'predet', 'prep', 'prepc', 'prt', 'ps',
    'purpcl', 'quantmod', 'rcmod', 'ref', 'rel', 'suffix',
    'title', 'tmod', 'xcomp', 'xsubj',
]

# Reverse lookup: label string -> integer ID (its index in DEPLABELS).
DEPLABEL_TO_ID = dict(zip(DEPLABELS, range(len(DEPLABELS))))

# Direction codes that may appear in the paths; each character is one code and
# its position in the string is its integer ID.
DIRS = '_^V<>'
DIR_TO_ID = {direction: index for index, direction in enumerate(DIRS)}


def load_word_embeddings(embedding_filename):
  """Reads a pretrained embedding matrix and prepends special-token rows.

  Four randomly initialized rows, standing for the <PAD>, <UNK>, <X>, and <Y>
  tokens, are placed in front of the pretrained rows, so every pretrained
  row index is shifted by four in the returned matrix.

  Args:
    embedding_filename: filename of the binary NPY data

  Returns:
    The word embeddings matrix as float32, with four more rows than the file.
  """
  pretrained = np.load(embedding_filename)

  # One random vector per special token: <PAD>, <UNK>, <X>, <Y>.
  num_special = 4
  special_rows = np.random.normal(
      0, 0.1, (num_special, pretrained.shape[1]))

  stacked = np.concatenate((special_rows, pretrained), axis=0)
  return stacked.astype(np.float32)


def full_evaluation(model, session, instances, labels, set_name, classes):
  """Prints a full evaluation on the current set.

  Prints the overall performance (precision, recall and F1, averaged over
  classes weighted by support), a classification report (per class
  performance), and a confusion matrix whose rows are normalized to
  percentages.

  Label IDs are treated as indices into `classes`. The same explicit list of
  class IDs is passed to both the classification report and the confusion
  matrix, so the class names stay matched to the right IDs even when some
  class does not occur in this set.

  Args:
    model: The currently trained path-based model.
    session: The current TensorFlow session.
    instances: The current set instances.
    labels: The current set labels.
    set_name: The current set name (train/validation/test).
    classes: The class label names.

  Returns:
    The model's prediction for the given instances.
  """

  # Predict the labels
  pred = model.predict(session, instances)

  # Print the performance
  precision, recall, f1, _ = metrics.precision_recall_fscore_support(
      labels, pred, average='weighted')

  print('%s set: Precision: %.3f, Recall: %.3f, F1: %.3f' % (
      set_name, precision, recall, f1))

  # Explicit class IDs shared by the report and the confusion matrix.
  class_ids = list(range(len(classes)))

  # Print a classification report. Without `labels`, scikit-learn infers the
  # classes from the data and `target_names` no longer lines up with them
  # whenever a class is missing from both `labels` and `pred`.
  print('%s classification report:' % set_name)
  print(metrics.classification_report(
      labels, pred, labels=class_ids, target_names=classes))

  # Print the confusion matrix
  print('%s confusion matrix:' % set_name)
  cm = metrics.confusion_matrix(labels, pred, labels=class_ids)
  # A class with no gold instances has an all-zero row; 0/0 yields NaN, which
  # print_cm renders as 0. Only the floating-point warning is suppressed here.
  with np.errstate(divide='ignore', invalid='ignore'):
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
  print_cm(cm, labels=classes)
  return pred


def print_cm(cm, labels):
  """Pretty print for confusion matrices.

  Adapted from: https://gist.github.com/zachguo/10296432.

  The row-label column and every class column are `columnwidth` characters
  wide. Class names longer than that are truncated, in the header and in the
  row labels alike, so the header stays aligned with the values below it.
  NaN cells are printed as 0 and other values are truncated to integers.

  Args:
    cm: The confusion matrix, indexed as cm[i, j] (row i, column j).
    labels: The class names.
  """
  columnwidth = 10
  short_labels = [label[:columnwidth].rjust(columnwidth) for label in labels]

  # Header: a blank cell above the row-label column, then one cell per class.
  header = ' ' * columnwidth + ' '
  header += ''.join(' %s ' % label for label in short_labels)
  print(header)

  # Rows: each value cell has exactly the same width as its header cell.
  for i, row_label in enumerate(short_labels):
    row = row_label + ' '
    for j in range(len(short_labels)):
      value = 0 if np.isnan(cm[i, j]) else int(cm[i, j])
      row += ' %*d ' % (columnwidth, value)
    print(row)


def load_all_labels(records):
  """Parses serialized TensorFlow examples and extracts only their labels.

  Args:
    records: a record list with TensorFlow examples.

  Returns:
    The labels: the 'rel_id' feature of every example, with the trailing
    length-1 dimension squeezed away.
  """
  feature_spec = {
      'rel_id': tf.FixedLenFeature([1], dtype=tf.int64),
  }
  parsed = tf.parse_example(records, feature_spec)

  # Each 'rel_id' is parsed with shape [1]; drop that last axis.
  return tf.squeeze(parsed['rel_id'], [-1])


def load_all_pairs(records):
  """Parses serialized TensorFlow examples and extracts only the word pairs.

  Args:
    records: a record list with TensorFlow examples.

  Returns:
    The word pairs: the string-valued 'pair' feature of every example.
  """
  feature_spec = {
      'pair': tf.FixedLenFeature([1], dtype=tf.string)
  }
  parsed = tf.parse_example(records, feature_spec)
  return parsed['pair']


def write_predictions(pairs, labels, predictions, classes, predictions_file):
  """Saves gold and predicted relations for each word pair as TSV lines.

  Each output line holds four tab-separated fields: the first word, the
  second word, the gold relation name and the predicted relation name.

  Args:
    pairs: the word pairs (list of tuple of two strings).
    labels: the gold-standard labels for these pairs (array of rel ID).
    predictions: the predicted labels for these pairs (array of rel ID).
    classes: a list of relation names.
    predictions_file: where to save the predictions.
  """
  with open(predictions_file, 'w') as f_out:
    for (w1, w2), gold_id, pred_id in zip(pairs, labels, predictions):
      # Relation IDs index directly into the list of relation names.
      fields = [w1, w2, classes[gold_id], classes[pred_id]]
      f_out.write('%s\n' % '\t'.join(fields))