Skip to content

Commit

Permalink
Moved training to a separate file
Browse files Browse the repository at this point in the history
  • Loading branch information
marenger committed Mar 19, 2017
1 parent 53a2b60 commit 11a54a8
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 123 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ Run the tool with the command

## Output

The output is a file with cues and scopes encoded in the CD format. The name of the file is the same as the name of the input file, but with the extension `.neg` instead of the original file extension.
The output is a file where the first 8 columns are identical to the input file, and the following columns include cues and scopes encoded in the CD format. The column for events is included with the symbol "_". The name of the file is the same as the name of the input file, but with the extension `.neg` instead of the original file extension.

## Training your own models
To re-train the cue- and scope learner with a new dataset, run the tool with the command:
`python negtool.py -m retraining`
You can train your own cue and/or scope model with a new dataset. The dataset needs to be in CoNLL-X format with cues and scopes encoded in the CD format starting at column 9. Training is done by running

You will be asked to provide the filename of the training file and a test file. Both files need to be on CoNLL_X format.
`python train.py -m <model to train> -tf <training file> -cp <cue regularisation> -sp <scope regularisation>`

For the -m option, the program accepts either cue, scope or all. The default value for both the cue regularisation parameter and the scope regularisation parameter is 0.20.
85 changes: 19 additions & 66 deletions negtool.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
from os import remove, path
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from pystruct.models import ChainCRF, BinaryClf
from pystruct.learners import FrankWolfeSSVM, NSlackSSVM
from pystruct.utils import SaveLogger

from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from file_writing import *
from read_labelled_data import read_file

def load_cue_learner():
"""
Expand All @@ -35,26 +32,6 @@ def load_scope_learner():
scope_vectorizer = joblib.load("objectfiles/scope_vectorizer.pkl")
return scope_ssvm, scope_vectorizer

def train_cue_learner(sentence_dicts, C_value):
    """
    Trains the negation cue classifier.

    Builds the cue lexicons from the labelled sentences, extracts cue
    features, vectorises them and fits a binary SSVM classifier.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted classifier, fitted DictVectorizer,
        cue lexicon, affixal cue lexicon)
    """
    cue_lexicon, affixal_cue_lexicon = get_cue_lexicon(sentence_dicts)
    # cue_sentence_dicts is returned by the extractor but not needed here
    cue_sentence_dicts, cue_instances, cue_labels = extract_features_cue(sentence_dicts, cue_lexicon, affixal_cue_lexicon, 'training')
    vectorizer = DictVectorizer()
    # Dense matrix required by the pystruct learner
    fvs = vectorizer.fit_transform(cue_instances).toarray()
    model = BinaryClf()
    cue_ssvm = NSlackSSVM(model, C=C_value, batch_size=-1)
    cue_ssvm.fit(fvs, np.asarray(cue_labels))
    return cue_ssvm, vectorizer, cue_lexicon, affixal_cue_lexicon

def train_scope_learner(sentence_dicts, C_value):
    """
    Trains the negation scope sequence labeller.

    Extracts scope features, vectorises them, regroups the flat instance
    list back into per-sentence sequences and fits a chain-CRF SSVM.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted scope model, fitted DictVectorizer)
    """
    # scope_sentence_dicts is returned by the extractor but not needed here
    scope_sentence_dicts, scope_instances, scope_labels, sentence_splits = extract_features_scope(sentence_dicts, 'training')
    vectorizer = DictVectorizer()
    fvs = vectorizer.fit_transform(scope_instances).toarray()
    # Regroup flat feature vectors/labels into per-sentence sequences
    X_train, y_train = make_splits(fvs, scope_labels, sentence_splits)
    model = ChainCRF()
    scope_ssvm = FrankWolfeSSVM(model=model, C=C_value, max_iter=10)
    scope_ssvm.fit(X_train, y_train)
    return scope_ssvm, vectorizer

def run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, mode):
"""
Reads the file with the input data, extracts features for cue detection,
Expand Down Expand Up @@ -82,51 +59,27 @@ def run_scope_learner(scope_ssvm, scope_vectorizer, filename, mode):

if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed or retraining", type=str, choices=['raw','parsed', 'retraining'])
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed or retraining", type=str, choices=['raw','parsed'])
argparser.add_argument('-f', '--filename', help="input file", type=str, nargs='?')
argparser.add_argument('-d', '--directory', help="absolute path to corenlp directory. needs to be provided in raw mode", type=str, nargs='?')
args = argparser.parse_args()

if args.mode == 'retraining':
training_file = raw_input("Enter file name of training file: ")
test_file = raw_input("Enter file name of test file: ")
sentence_dicts = read_file(training_file)
print "Setning 0:"
print sentence_dicts[0]['cues']
print sentence_dicts[0]['scopes']
print ""
print "Setning 1:"
print sentence_dicts[1]['cues']
print sentence_dicts[1]['scopes']
print ""
print "Setning 2:"
print sentence_dicts[2]['cues']
print sentence_dicts[2]['scopes']
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = train_cue_learner(sentence_dicts, 0.20)
print affixal_cue_lexicon
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, test_file, 'parsed')

scope_ssvm, scope_vectorizer = train_scope_learner(sentence_dicts, 0.20)
cue_file = test_file.split(".")[0] + "_cues.neg"
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, 'parsed')
remove(cue_file)
else:
filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"
filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"

cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
65 changes: 65 additions & 0 deletions train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import pickle
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from pystruct.models import ChainCRF, BinaryClf
from pystruct.learners import FrankWolfeSSVM, NSlackSSVM
from pystruct.utils import SaveLogger

from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from file_writing import *
from read_labelled_data import read_file

def train_cue_learner(sentence_dicts, C_value):
    """
    Trains the negation cue classifier.

    Builds the cue lexicons from the labelled sentences, extracts cue
    features, vectorises them and fits a binary SSVM classifier.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted classifier, fitted DictVectorizer,
        cue lexicon, affixal cue lexicon)
    """
    cue_lexicon, affixal_cue_lexicon = get_cue_lexicon(sentence_dicts)
    # The extractor also returns updated sentence dicts; they are unused here.
    _, instances, labels = extract_features_cue(
        sentence_dicts, cue_lexicon, affixal_cue_lexicon, 'training')
    vectorizer = DictVectorizer()
    # pystruct learners want a dense feature matrix
    feature_matrix = vectorizer.fit_transform(instances).toarray()
    classifier = NSlackSSVM(BinaryClf(), C=C_value, batch_size=-1)
    classifier.fit(feature_matrix, np.asarray(labels))
    return classifier, vectorizer, cue_lexicon, affixal_cue_lexicon

def train_scope_learner(sentence_dicts, C_value):
    """
    Trains the negation scope sequence labeller.

    Extracts scope features, vectorises them, regroups the flat instance
    list back into per-sentence sequences and fits a chain-CRF SSVM.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted scope model, fitted DictVectorizer)
    """
    # The extractor also returns updated sentence dicts; they are unused here.
    _, instances, labels, splits = extract_features_scope(sentence_dicts, 'training')
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(instances).toarray()
    # Regroup the flat instance/label arrays into one sequence per sentence.
    X_train, y_train = make_splits(feature_matrix, labels, splits)
    crf = FrankWolfeSSVM(model=ChainCRF(), C=C_value, max_iter=10)
    crf.fit(X_train, y_train)
    return crf, vectorizer

def save_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename):
    """
    Persists the trained cue model and its helper objects to disk.

    Four files are written in the current directory, each suffixed with
    *filename* (the basename of the training file).

    :param cue_ssvm: trained cue classifier
    :param cue_vectorizer: fitted DictVectorizer for cue features
    :param cue_lexicon: lexicon of word cues
    :param affixal_cue_lexicon: lexicon of affixal cues
    :param filename: suffix used in the output file names
    """
    # Use context managers so the file handles are closed even if
    # pickling fails (the original left them open).
    with open("cue_model_%s.pkl" % filename, "wb") as model_file:
        pickle.dump(cue_ssvm, model_file)
    joblib.dump(cue_vectorizer, "cue_vectorizer_%s.pkl" % filename)
    with open("cue_lexicon_%s.pkl" % filename, "wb") as lexicon_file:
        pickle.dump(cue_lexicon, lexicon_file)
    with open("affixal_cue_lexicon_%s.pkl" % filename, "wb") as affixal_file:
        pickle.dump(affixal_cue_lexicon, affixal_file)

def save_scope_learner(scope_ssvm, scope_vectorizer, filename):
    """
    Persists the trained scope model and its vectorizer to disk.

    :param scope_ssvm: trained scope sequence model
    :param scope_vectorizer: fitted DictVectorizer for scope features
    :param filename: suffix used in the output file names
    """
    # Use a context manager so the file handle is closed even if
    # pickling fails (the original left it open).
    with open("scope_model_%s.pkl" % filename, "wb") as model_file:
        pickle.dump(scope_ssvm, model_file)
    joblib.dump(scope_vectorizer, "scope_vectorizer_%s.pkl" % filename)

if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument('-m', '--model', help="model to train. Either cue, scope or all", type=str, choices=['cue', 'scope', 'all'])
argparser.add_argument('-tf', '--trainingfile', help="filename of training file", type=str)
argparser.add_argument('-cp', '--cueparameter', help="regularisation parameter for the cue model", type=float, nargs="?", default=0.20)
argparser.add_argument('-sp', '--scopeparameter', help="regularisation parameter for the scope model", type=float, nargs="?", default=0.20)
args = argparser.parse_args()

print "lese inn setninger"
sentence_dicts = read_file(args.trainingfile)
filename = args.trainingfile.split(".")[0]
if args.model == 'cue' or args.model == 'all':
print "trener cue"
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = train_cue_learner(sentence_dicts, args.scopeparameter)
save_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename)

if args.model == 'scope' or args.model == 'all':
print "trener scope"
scope_ssvm, scope_vectorizer = train_scope_learner(sentence_dicts, 0.20)
save_scope_learner(scope_ssvm, scope_vectorizer, filename)

53 changes: 0 additions & 53 deletions train_models.py

This file was deleted.

0 comments on commit 11a54a8

Please sign in to comment.