Skip to content

Commit

Permalink
Added support for re-training the models
Browse files Browse the repository at this point in the history
  • Loading branch information
marenger committed Mar 14, 2017
1 parent ec3f394 commit 53a2b60
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 27 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ Run the tool with the command
## Output

The output is a file with cues and scopes encoded in the CD format. The name of the file is the same as the name of the input file, but with the extension `.neg` instead of the original file extension.

## Training your own models
To re-train the cue and scope learners with a new dataset, run the tool with the command:
`python negtool.py -m retraining`

You will be asked to provide the filename of the training file and a test file. Both files need to be in CoNLL-X format.
3 changes: 1 addition & 2 deletions feature_extraction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
from sklearn.feature_extraction import DictVectorizer
import sklearn.metrics as metrics
from pystruct.models import ChainCRF
from pystruct.learners import FrankWolfeSSVM
Expand Down Expand Up @@ -69,7 +68,7 @@ def extract_labels_cue(sentence_dicts, cue_lexicon, affixal_cue_lexicon):
for sent in sentence_dicts:
for key, value in sent.iteritems():
if isinstance(key, int):
if not known_cue_word(value[3].lower(), cue_lexicon, affixal_cue_lexicon):
if not_known_cue_word(value[3].lower(), cue_lexicon, affixal_cue_lexicon):
continue
if any(cue_position == key for (cue, cue_position, cue_type) in sent['cues']) or any(mw_pos == key for (mw_cue, mw_pos) in sent['mw_cues']):
labels.append(1)
Expand Down
91 changes: 70 additions & 21 deletions negtool.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from pystruct.utils import SaveLogger
import pickle
import sys
import argparse
from os import remove, path
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from pystruct.models import ChainCRF, BinaryClf
from pystruct.learners import FrankWolfeSSVM, NSlackSSVM
from pystruct.utils import SaveLogger

from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from file_writing import *
from read_labelled_data import read_file

def load_cue_learner():
"""
Expand All @@ -31,6 +35,26 @@ def load_scope_learner():
scope_vectorizer = joblib.load("objectfiles/scope_vectorizer.pkl")
return scope_ssvm, scope_vectorizer

def train_cue_learner(sentence_dicts, C_value):
    """
    Train a binary SSVM cue classifier on the given sentence dictionaries.

    Builds the cue lexicons from the training data, extracts cue features,
    vectorizes them, and fits an n-slack SSVM with a binary model.

    Returns the fitted learner, the fitted vectorizer, and the two cue
    lexicons (word cues and affixal cues) extracted from the data.
    """
    cue_lexicon, affixal_cue_lexicon = get_cue_lexicon(sentence_dicts)
    updated_dicts, instances, labels = extract_features_cue(
        sentence_dicts, cue_lexicon, affixal_cue_lexicon, 'training')
    cue_vectorizer = DictVectorizer()
    # NOTE: toarray() densifies the feature matrix; fine for modest corpora.
    feature_vectors = cue_vectorizer.fit_transform(instances).toarray()
    cue_ssvm = NSlackSSVM(BinaryClf(), C=C_value, batch_size=-1)
    cue_ssvm.fit(feature_vectors, np.asarray(labels))
    return cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon

def train_scope_learner(sentence_dicts, C_value):
    """
    Train a chain-CRF scope resolver on the given sentence dictionaries.

    Extracts scope features, vectorizes them, regroups the flat feature
    matrix into per-sentence sequences, and fits a Frank-Wolfe SSVM over
    a linear-chain CRF.

    Returns the fitted learner and the fitted vectorizer.
    """
    updated_dicts, instances, labels, sentence_splits = extract_features_scope(
        sentence_dicts, 'training')
    scope_vectorizer = DictVectorizer()
    # NOTE: toarray() densifies the feature matrix; fine for modest corpora.
    feature_vectors = scope_vectorizer.fit_transform(instances).toarray()
    # Regroup flat instances/labels into one (X, y) pair per sentence.
    X_train, y_train = make_splits(feature_vectors, labels, sentence_splits)
    scope_ssvm = FrankWolfeSSVM(model=ChainCRF(), C=C_value, max_iter=10)
    scope_ssvm.fit(X_train, y_train)
    return scope_ssvm, scope_vectorizer

def run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, mode):
"""
Reads the file with the input data, extracts features for cue detection,
Expand Down Expand Up @@ -58,26 +82,51 @@ def run_scope_learner(scope_ssvm, scope_vectorizer, filename, mode):

if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed", type=str, choices=['raw','parsed'])
argparser.add_argument('-f', '--filename', help="input file", type=str)
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed or retraining", type=str, choices=['raw','parsed', 'retraining'])
argparser.add_argument('-f', '--filename', help="input file", type=str, nargs='?')
argparser.add_argument('-d', '--directory', help="absolute path to corenlp directory. needs to be provided in raw mode", type=str, nargs='?')
args = argparser.parse_args()

filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
if args.mode == 'retraining':
training_file = raw_input("Enter file name of training file: ")
test_file = raw_input("Enter file name of test file: ")
sentence_dicts = read_file(training_file)
print "Setning 0:"
print sentence_dicts[0]['cues']
print sentence_dicts[0]['scopes']
print ""
print "Setning 1:"
print sentence_dicts[1]['cues']
print sentence_dicts[1]['scopes']
print ""
print "Setning 2:"
print sentence_dicts[2]['cues']
print sentence_dicts[2]['scopes']
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = train_cue_learner(sentence_dicts, 0.20)
print affixal_cue_lexicon
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, test_file, 'parsed')

scope_ssvm, scope_vectorizer = train_scope_learner(sentence_dicts, 0.20)
cue_file = test_file.split(".")[0] + "_cues.neg"
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, 'parsed')
remove(cue_file)
else:
filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"

cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
9 changes: 5 additions & 4 deletions read_labelled_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def read_file(filename):
Read input file and make dictionaries for each sentence.
Used for training with the CD dataset.
"""
with open(filename, 'r') as infile1:
with open(filename, 'r') as infile:
sentence = {}
cues = []
mw_cues = []
Expand All @@ -20,7 +20,7 @@ def read_file(filename):
cue_offset = upper_limit - 5
instances = []

for line in infile1:
for line in infile:
token_dict = {}
tokens = line.split()
#check for sentence end
Expand Down Expand Up @@ -49,6 +49,7 @@ def read_file(filename):
instances.append(sentence)
sentence = {}
counter = 0
cue_counter = 0
prev_cue_column = -1
cues = []
mw_cues = []
Expand All @@ -67,14 +68,14 @@ def read_file(filename):
if cues[-1][2] == 'm':
mw_cues.append([cues[-1][0],cues[-1][1]])
mw_cues.append([tokens[i], counter])
elif tokens[i] != tokens[3]:
elif tokens[i] != tokens[1]:
cues.append([tokens[i], counter, 'a'])
prev_cue_column = i
else:
cues.append([tokens[i], counter, 's'])
prev_cue_column = i
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-1) % 3 == 0:
cue_counter = (i-upper_limit+2)/3
cue_counter = (i-upper_limit+2)/3 - 1
if cue_counter in scopes:
scopes[cue_counter].append([tokens[i], counter])
else:
Expand Down

0 comments on commit 53a2b60

Please sign in to comment.