Skip to content

Commit

Permalink
Moved training to a separate file
Browse files Browse the repository at this point in the history
  • Loading branch information
marenger committed Mar 19, 2017
1 parent 53a2b60 commit 11a54a8
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 123 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ Run the tool with the command

## Output

The output is a file with cues and scopes encoded in the CD format. The name of the file is the same as the name of the input file, but with the extension `.neg` instead of the original file extension.
The output is a file where the first 8 columns are identical to the input file, and the following columns include cues and scopes encoded in the CD format. The column for events is included with the symbol "_". The name of the file is the same as the name of the input file, but with the extension `.neg` instead of the original file extension.

## Training your own models
To re-train the cue- and scope learner with a new dataset, run the tool with the command:
`python negtool.py -m retraining`
You can train your own cue and/or scope model with a new dataset. The dataset needs to be in CoNLL-X format with cues and scopes encoded in the CD format starting at column 9. Training is done by running

You will be asked to provide the filename of the training file and a test file. Both files need to be on CoNLL_X format.
`python train.py -m <model to train> -tf <training file> -cp <cue regularisation> -sp <scope regularisation>`

For the -m option, the program accepts either cue, scope or all. The default value for both the cue regularisation parameter and the scope regularisation parameter is 0.20.
85 changes: 19 additions & 66 deletions negtool.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
from os import remove, path
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from pystruct.models import ChainCRF, BinaryClf
from pystruct.learners import FrankWolfeSSVM, NSlackSSVM
from pystruct.utils import SaveLogger

from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from file_writing import *
from read_labelled_data import read_file

def load_cue_learner():
"""
Expand All @@ -35,26 +32,6 @@ def load_scope_learner():
scope_vectorizer = joblib.load("objectfiles/scope_vectorizer.pkl")
return scope_ssvm, scope_vectorizer

def train_cue_learner(sentence_dicts, C_value):
    """
    Trains the negation cue classifier.

    Builds the cue lexicons from the labelled sentences, extracts cue
    features, vectorises them and fits a binary SSVM classifier.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted classifier, fitted DictVectorizer,
        cue lexicon, affixal cue lexicon)
    """
    cue_lexicon, affixal_cue_lexicon = get_cue_lexicon(sentence_dicts)
    # cue_sentence_dicts is returned by the extractor but not needed here
    cue_sentence_dicts, cue_instances, cue_labels = extract_features_cue(sentence_dicts, cue_lexicon, affixal_cue_lexicon, 'training')
    vectorizer = DictVectorizer()
    # Dense matrix required by the pystruct learner
    fvs = vectorizer.fit_transform(cue_instances).toarray()
    model = BinaryClf()
    cue_ssvm = NSlackSSVM(model, C=C_value, batch_size=-1)
    cue_ssvm.fit(fvs, np.asarray(cue_labels))
    return cue_ssvm, vectorizer, cue_lexicon, affixal_cue_lexicon

def train_scope_learner(sentence_dicts, C_value):
    """
    Trains the negation scope sequence labeller.

    Extracts scope features, vectorises them, regroups the flat instance
    list back into per-sentence sequences and fits a chain-CRF SSVM.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted scope model, fitted DictVectorizer)
    """
    # scope_sentence_dicts is returned by the extractor but not needed here
    scope_sentence_dicts, scope_instances, scope_labels, sentence_splits = extract_features_scope(sentence_dicts, 'training')
    vectorizer = DictVectorizer()
    fvs = vectorizer.fit_transform(scope_instances).toarray()
    # Regroup flat feature vectors/labels into per-sentence sequences
    X_train, y_train = make_splits(fvs, scope_labels, sentence_splits)
    model = ChainCRF()
    scope_ssvm = FrankWolfeSSVM(model=model, C=C_value, max_iter=10)
    scope_ssvm.fit(X_train, y_train)
    return scope_ssvm, vectorizer

def run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, mode):
"""
Reads the file with the input data, extracts features for cue detection,
Expand Down Expand Up @@ -82,51 +59,27 @@ def run_scope_learner(scope_ssvm, scope_vectorizer, filename, mode):

if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed or retraining", type=str, choices=['raw','parsed', 'retraining'])
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed or retraining", type=str, choices=['raw','parsed'])
argparser.add_argument('-f', '--filename', help="input file", type=str, nargs='?')
argparser.add_argument('-d', '--directory', help="absolute path to corenlp directory. needs to be provided in raw mode", type=str, nargs='?')
args = argparser.parse_args()

if args.mode == 'retraining':
training_file = raw_input("Enter file name of training file: ")
test_file = raw_input("Enter file name of test file: ")
sentence_dicts = read_file(training_file)
print "Setning 0:"
print sentence_dicts[0]['cues']
print sentence_dicts[0]['scopes']
print ""
print "Setning 1:"
print sentence_dicts[1]['cues']
print sentence_dicts[1]['scopes']
print ""
print "Setning 2:"
print sentence_dicts[2]['cues']
print sentence_dicts[2]['scopes']
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = train_cue_learner(sentence_dicts, 0.20)
print affixal_cue_lexicon
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, test_file, 'parsed')

scope_ssvm, scope_vectorizer = train_scope_learner(sentence_dicts, 0.20)
cue_file = test_file.split(".")[0] + "_cues.neg"
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, 'parsed')
remove(cue_file)
else:
filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"
filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"

cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
65 changes: 65 additions & 0 deletions train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import pickle
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from pystruct.models import ChainCRF, BinaryClf
from pystruct.learners import FrankWolfeSSVM, NSlackSSVM
from pystruct.utils import SaveLogger

from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from file_writing import *
from read_labelled_data import read_file

def train_cue_learner(sentence_dicts, C_value):
    """
    Trains the negation cue classifier.

    Builds the cue lexicons from the labelled sentences, extracts cue
    features, vectorises them and fits a binary SSVM classifier.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted classifier, fitted DictVectorizer,
        cue lexicon, affixal cue lexicon)
    """
    cue_lexicon, affixal_cue_lexicon = get_cue_lexicon(sentence_dicts)
    # The extractor also returns updated sentence dicts; they are unused here.
    _, instances, labels = extract_features_cue(
        sentence_dicts, cue_lexicon, affixal_cue_lexicon, 'training')
    vectorizer = DictVectorizer()
    # pystruct learners want a dense feature matrix
    feature_matrix = vectorizer.fit_transform(instances).toarray()
    classifier = NSlackSSVM(BinaryClf(), C=C_value, batch_size=-1)
    classifier.fit(feature_matrix, np.asarray(labels))
    return classifier, vectorizer, cue_lexicon, affixal_cue_lexicon

def train_scope_learner(sentence_dicts, C_value):
    """
    Trains the negation scope sequence labeller.

    Extracts scope features, vectorises them, regroups the flat instance
    list back into per-sentence sequences and fits a chain-CRF SSVM.

    :param sentence_dicts: list of sentence dictionaries read from a
        labelled training file
    :param C_value: regularisation parameter C for the SSVM learner
    :return: tuple (fitted scope model, fitted DictVectorizer)
    """
    # The extractor also returns updated sentence dicts; they are unused here.
    _, instances, labels, splits = extract_features_scope(sentence_dicts, 'training')
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(instances).toarray()
    # Regroup the flat instance/label arrays into one sequence per sentence.
    X_train, y_train = make_splits(feature_matrix, labels, splits)
    crf = FrankWolfeSSVM(model=ChainCRF(), C=C_value, max_iter=10)
    crf.fit(X_train, y_train)
    return crf, vectorizer

def save_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename):
    """
    Persists the trained cue model and its helper objects to disk.

    Four files are written in the current directory, each suffixed with
    *filename* (the basename of the training file).

    :param cue_ssvm: trained cue classifier
    :param cue_vectorizer: fitted DictVectorizer for cue features
    :param cue_lexicon: lexicon of word cues
    :param affixal_cue_lexicon: lexicon of affixal cues
    :param filename: suffix used in the output file names
    """
    # Use context managers so the file handles are closed even if
    # pickling fails (the original left them open).
    with open("cue_model_%s.pkl" % filename, "wb") as model_file:
        pickle.dump(cue_ssvm, model_file)
    joblib.dump(cue_vectorizer, "cue_vectorizer_%s.pkl" % filename)
    with open("cue_lexicon_%s.pkl" % filename, "wb") as lexicon_file:
        pickle.dump(cue_lexicon, lexicon_file)
    with open("affixal_cue_lexicon_%s.pkl" % filename, "wb") as affixal_file:
        pickle.dump(affixal_cue_lexicon, affixal_file)

def save_scope_learner(scope_ssvm, scope_vectorizer, filename):
    """
    Persists the trained scope model and its vectorizer to disk.

    :param scope_ssvm: trained scope sequence model
    :param scope_vectorizer: fitted DictVectorizer for scope features
    :param filename: suffix used in the output file names
    """
    # Use a context manager so the file handle is closed even if
    # pickling fails (the original left it open).
    with open("scope_model_%s.pkl" % filename, "wb") as model_file:
        pickle.dump(scope_ssvm, model_file)
    joblib.dump(scope_vectorizer, "scope_vectorizer_%s.pkl" % filename)

if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument('-m', '--model', help="model to train. Either cue, scope or all", type=str, choices=['cue', 'scope', 'all'])
argparser.add_argument('-tf', '--trainingfile', help="filename of training file", type=str)
argparser.add_argument('-cp', '--cueparameter', help="regularisation parameter for the cue model", type=float, nargs="?", default=0.20)
argparser.add_argument('-sp', '--scopeparameter', help="regularisation parameter for the scope model", type=float, nargs="?", default=0.20)
args = argparser.parse_args()

print "lese inn setninger"
sentence_dicts = read_file(args.trainingfile)
filename = args.trainingfile.split(".")[0]
if args.model == 'cue' or args.model == 'all':
print "trener cue"
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = train_cue_learner(sentence_dicts, args.scopeparameter)
save_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename)

if args.model == 'scope' or args.model == 'all':
print "trener scope"
scope_ssvm, scope_vectorizer = train_scope_learner(sentence_dicts, 0.20)
save_scope_learner(scope_ssvm, scope_vectorizer, filename)

53 changes: 0 additions & 53 deletions train_models.py

This file was deleted.

0 comments on commit 11a54a8

Please sign in to comment.