Skip to content

Commit

Permalink
Added support for re-training the models
Browse files Browse the repository at this point in the history
  • Loading branch information
marenger committed Mar 14, 2017
1 parent ec3f394 commit 53a2b60
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 27 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ Run the tool with the command
## Output

The output is a file with cues and scopes encoded in the CD format. The name of the file is the same as the name of the input file, but with the extension `.neg` instead of the original file extension.

## Training your own models
To re-train the cue and scope learners with a new dataset, run the tool with the command:
`python negtool.py -m retraining`

You will be asked to provide the filename of the training file and a test file. Both files need to be in CoNLL-X format.
3 changes: 1 addition & 2 deletions feature_extraction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
from sklearn.feature_extraction import DictVectorizer
import sklearn.metrics as metrics
from pystruct.models import ChainCRF
from pystruct.learners import FrankWolfeSSVM
Expand Down Expand Up @@ -69,7 +68,7 @@ def extract_labels_cue(sentence_dicts, cue_lexicon, affixal_cue_lexicon):
for sent in sentence_dicts:
for key, value in sent.iteritems():
if isinstance(key, int):
if not known_cue_word(value[3].lower(), cue_lexicon, affixal_cue_lexicon):
if not_known_cue_word(value[3].lower(), cue_lexicon, affixal_cue_lexicon):
continue
if any(cue_position == key for (cue, cue_position, cue_type) in sent['cues']) or any(mw_pos == key for (mw_cue, mw_pos) in sent['mw_cues']):
labels.append(1)
Expand Down
91 changes: 70 additions & 21 deletions negtool.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from pystruct.utils import SaveLogger
import pickle
import sys
import argparse
from os import remove, path
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from pystruct.models import ChainCRF, BinaryClf
from pystruct.learners import FrankWolfeSSVM, NSlackSSVM
from pystruct.utils import SaveLogger

from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from file_writing import *
from read_labelled_data import read_file

def load_cue_learner():
"""
Expand All @@ -31,6 +35,26 @@ def load_scope_learner():
scope_vectorizer = joblib.load("objectfiles/scope_vectorizer.pkl")
return scope_ssvm, scope_vectorizer

def train_cue_learner(sentence_dicts, C_value):
    """
    Train a binary SSVM cue classifier on the given sentence dictionaries.

    Builds the cue lexicons from the training data, extracts cue features,
    vectorizes them, and fits an n-slack SSVM with a binary model.

    Returns the fitted learner, the fitted vectorizer, and the two cue
    lexicons (word cues and affixal cues) extracted from the data.
    """
    cue_lexicon, affixal_cue_lexicon = get_cue_lexicon(sentence_dicts)
    updated_dicts, instances, labels = extract_features_cue(
        sentence_dicts, cue_lexicon, affixal_cue_lexicon, 'training')
    cue_vectorizer = DictVectorizer()
    # NOTE: toarray() densifies the feature matrix; fine for modest corpora.
    feature_vectors = cue_vectorizer.fit_transform(instances).toarray()
    cue_ssvm = NSlackSSVM(BinaryClf(), C=C_value, batch_size=-1)
    cue_ssvm.fit(feature_vectors, np.asarray(labels))
    return cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon

def train_scope_learner(sentence_dicts, C_value):
    """
    Train a chain-CRF scope resolver on the given sentence dictionaries.

    Extracts scope features, vectorizes them, regroups the flat feature
    matrix into per-sentence sequences, and fits a Frank-Wolfe SSVM over
    a linear-chain CRF.

    Returns the fitted learner and the fitted vectorizer.
    """
    updated_dicts, instances, labels, sentence_splits = extract_features_scope(
        sentence_dicts, 'training')
    scope_vectorizer = DictVectorizer()
    # NOTE: toarray() densifies the feature matrix; fine for modest corpora.
    feature_vectors = scope_vectorizer.fit_transform(instances).toarray()
    # Regroup flat instances/labels into one (X, y) pair per sentence.
    X_train, y_train = make_splits(feature_vectors, labels, sentence_splits)
    scope_ssvm = FrankWolfeSSVM(model=ChainCRF(), C=C_value, max_iter=10)
    scope_ssvm.fit(X_train, y_train)
    return scope_ssvm, scope_vectorizer

def run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, mode):
"""
Reads the file with the input data, extracts features for cue detection,
Expand Down Expand Up @@ -58,26 +82,51 @@ def run_scope_learner(scope_ssvm, scope_vectorizer, filename, mode):

if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed", type=str, choices=['raw','parsed'])
argparser.add_argument('-f', '--filename', help="input file", type=str)
argparser.add_argument('-m', '--mode', help="program mode. either raw or parsed or retraining", type=str, choices=['raw','parsed', 'retraining'])
argparser.add_argument('-f', '--filename', help="input file", type=str, nargs='?')
argparser.add_argument('-d', '--directory', help="absolute path to corenlp directory. needs to be provided in raw mode", type=str, nargs='?')
args = argparser.parse_args()

filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
if args.mode == 'retraining':
training_file = raw_input("Enter file name of training file: ")
test_file = raw_input("Enter file name of test file: ")
sentence_dicts = read_file(training_file)
print "Setning 0:"
print sentence_dicts[0]['cues']
print sentence_dicts[0]['scopes']
print ""
print "Setning 1:"
print sentence_dicts[1]['cues']
print sentence_dicts[1]['scopes']
print ""
print "Setning 2:"
print sentence_dicts[2]['cues']
print sentence_dicts[2]['scopes']
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = train_cue_learner(sentence_dicts, 0.20)
print affixal_cue_lexicon
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, test_file, 'parsed')

scope_ssvm, scope_vectorizer = train_scope_learner(sentence_dicts, 0.20)
cue_file = test_file.split(".")[0] + "_cues.neg"
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, 'parsed')
remove(cue_file)
else:
filename = args.filename
if not path.isfile(filename):
print "ERROR: File does not exist. Program will exit"
sys.exit(1)
if args.mode == 'raw':
path_to_corenlp = args.directory
if args.directory == None:
path_to_corenlp = raw_input("Absolute path to CoreNLP directory:")
elif not path.exists(args.directory):
path_to_corenlp = raw_input("ERROR: You specified the wrong path. Please specify the right path:")
run_corenlp(path_to_corenlp, args.filename)
filename = args.filename + ".conll"

cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = load_cue_learner()
run_cue_learner(cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon, filename, args.mode)
cue_file = filename.split(".")[0] + "_cues.neg"
scope_ssvm, scope_vectorizer = load_scope_learner()
run_scope_learner(scope_ssvm, scope_vectorizer, cue_file, args.mode)
remove(cue_file)
9 changes: 5 additions & 4 deletions read_labelled_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def read_file(filename):
Read input file and make dictionaries for each sentence.
Used for training with the CD dataset.
"""
with open(filename, 'r') as infile1:
with open(filename, 'r') as infile:
sentence = {}
cues = []
mw_cues = []
Expand All @@ -20,7 +20,7 @@ def read_file(filename):
cue_offset = upper_limit - 5
instances = []

for line in infile1:
for line in infile:
token_dict = {}
tokens = line.split()
#check for sentence end
Expand Down Expand Up @@ -49,6 +49,7 @@ def read_file(filename):
instances.append(sentence)
sentence = {}
counter = 0
cue_counter = 0
prev_cue_column = -1
cues = []
mw_cues = []
Expand All @@ -67,14 +68,14 @@ def read_file(filename):
if cues[-1][2] == 'm':
mw_cues.append([cues[-1][0],cues[-1][1]])
mw_cues.append([tokens[i], counter])
elif tokens[i] != tokens[3]:
elif tokens[i] != tokens[1]:
cues.append([tokens[i], counter, 'a'])
prev_cue_column = i
else:
cues.append([tokens[i], counter, 's'])
prev_cue_column = i
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-1) % 3 == 0:
cue_counter = (i-upper_limit+2)/3
cue_counter = (i-upper_limit+2)/3 - 1
if cue_counter in scopes:
scopes[cue_counter].append([tokens[i], counter])
else:
Expand Down

0 comments on commit 53a2b60

Please sign in to comment.