Renamed files, cleanup
marenger committed Jan 24, 2017
1 parent ad03abb commit 41ded79
Showing 6 changed files with 22 additions and 67 deletions.
27 changes: 2 additions & 25 deletions input_output.py → file_reading.py
@@ -73,8 +73,6 @@ def read_cuepredicted_data(filename, mode):
sentence = {}
cues = []
mw_cues = []
scopes = {}
events = {}
line_counter = 0
counter = 0
cue_counter = 0
@@ -93,16 +91,8 @@ def read_cuepredicted_data(filename, mode):
sentence[key]['head-pos'] = sentence[head_index][5]
else:
sentence[key]['head-pos'] = sentence[key][5]

if len(scopes) != len(cues):
for i in range(len(cues)):
if not i in scopes:
scopes[i] = []

sentence['cues'] = cues
sentence['mw_cues'] = mw_cues
sentence['scopes'] = scopes
sentence['events'] = events
if len(cues) > 0:
sentence['neg'] = True
else:
@@ -114,8 +104,6 @@ def read_cuepredicted_data(filename, mode):
prev_cue_column = -1
cues = []
mw_cues = []
scopes = {}
events = {}
line_counter += 1
continue

@@ -138,27 +126,16 @@ def read_cuepredicted_data(filename, mode):
else:
cues.append([tokens[i], counter, 's'])
prev_cue_column = i
#scope column
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-1) % 3 == 0:
cue_counter = (i-upper_limit+2)/3
if cue_counter in scopes:
scopes[cue_counter].append([tokens[i], counter])
else:
scopes[cue_counter] = [[tokens[i], counter]]
#event column
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-2) % 3 == 0:
cue_counter = (i-upper_limit+3)/3
events[cue_counter] = tokens[i]

if mode == 'raw':
token_dict['head'] = tokens[5]
token_dict['deprel'] = tokens[6]
else:
token_dict[5] = tokens[4] #record only the pos-tag, not cpos-tag for conll-x data
token_dict[5] = tokens[4] #for conll-x data: record only the pos-tag, not cpos-tag
token_dict['head'] = tokens[6]
token_dict['deprel'] = tokens[7]

sentence[counter] = token_dict
counter += 1
line_counter += 1
return instances
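
A minimal sketch (mine, not code from this commit) of the column arithmetic behind the modulo tests above: in the CD-style format, each negation instance appends a (cue, scope, event) column triple after the fixed per-token columns, and upper_limit / cue_offset mirror the values set in read_labelled_data.py below. After this commit, read_cuepredicted_data keeps only the cue test.

# Sketch of the assumed column layout; values taken from read_labelled_data.py.
upper_limit = 7                          # last fixed per-token column (assumed)
cue_offset = upper_limit - 5

def column_kind(i):
    # Columns up to upper_limit hold word, lemma, pos, head, deprel, etc.
    if i <= upper_limit:
        return 'token'
    if (i - cue_offset) % 3 == 0:        # cue column of some negation
        return 'cue'
    if (i - cue_offset - 1) % 3 == 0:    # scope column
        return 'scope'
    return 'event'                       # remaining case: event column

print([column_kind(i) for i in range(8, 14)])
# -> ['cue', 'scope', 'event', 'cue', 'scope', 'event']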

File renamed without changes.
4 changes: 2 additions & 2 deletions negtool.py
@@ -5,10 +5,10 @@
from os import remove, path
from sklearn.externals import joblib

from input_output import *
from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from evaluation import *
from file_writing import *

def load_cue_learner():
"""
51 changes: 15 additions & 36 deletions dev/data_processing.py → read_labelled_data.py
@@ -1,11 +1,11 @@
import numpy as np

def read_file(filename, conll_filename):
def read_file(filename):
"""
Read input file and make dictionaries for each sentence.
Used for training with the CD dataset.
"""
with open(filename, 'r') as infile1, open(conll_filename) as infile2:
with open(filename, 'r') as infile1:
sentence = {}
cues = []
mw_cues = []
@@ -15,13 +15,14 @@ def read_file(filename, conll_filename):
counter = 0
cue_counter = 0
prev_cue_column = -1
lower_limit = 3
upper_limit = 7
cue_offset = upper_limit - 5
instances = []

for line in infile1:
conll_line = infile2.readline()
token_dict = {}
tokens = line.split()
conll_tokens = conll_line.split()
#check for sentence end
if len(tokens) == 0:
for key in sentence:
@@ -45,8 +46,6 @@ def read_file(filename, conll_filename):
sentence['neg'] = True
else:
sentence['neg'] = False

#yield sentence
instances.append(sentence)
sentence = {}
counter = 0
@@ -59,9 +58,9 @@ def read_file(filename, conll_filename):
continue

for i in range(len(tokens)):
if tokens[i] != "_" and i < 6:
token_dict[i] = tokens[i]
elif tokens[i] != "***" and tokens[i] != "_" and i > 6 and (i-1) % 3 == 0:
if tokens[i] != "_" and i < lower_limit:
token_dict[i+2] = tokens[i]
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset) % 3 == 0:
if i == prev_cue_column:
cues[-1][2] = 'm'
prev_cue_column = i
@@ -74,40 +73,20 @@ def read_file(filename, conll_filename):
else:
cues.append([tokens[i], counter, 's'])
prev_cue_column = i
elif tokens[i] != "***" and tokens[i] != "_" and i > 6 and (i-2) % 3 == 0:
cue_counter = (i-8)/3
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-1) % 3 == 0:
cue_counter = (i-upper_limit+2)/3
if cue_counter in scopes:
scopes[cue_counter].append([tokens[i], counter])
else:
scopes[cue_counter] = [[tokens[i], counter]]
elif tokens[i] != "***" and tokens[i] != "_" and i > 6 and (i-3) % 3 == 0:
cue_counter = (i-9)/3
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-2) % 3 == 0:
cue_counter = (i-upper_limit+3)/3
events[cue_counter] = tokens[i]
token_dict['head'] = conll_tokens[6]
token_dict['deprel'] = conll_tokens[7]
token_dict[5] = tokens[4]
token_dict['head'] = tokens[6]
token_dict['deprel'] = tokens[7]
sentence[counter] = token_dict
counter += 1
line_counter += 1
return instances

if __name__ == '__main__':
cue_counter = 0
scope_counter = 0
event_counter = 0
sentence_counter = 0
negsent_counter = 0
ex_sent = None
for sentence in read_file("../data/gold/cdd.txt", "../data/cdd_parsed.txt"):
cue_counter += len(sentence['cues'])
scope_counter += len(sentence['scopes'])
event_counter += len(sentence['events'])
sentence_counter += 1
if sentence['neg']:
negsent_counter += 1

print "Number of sentences:", sentence_counter
print "Number of negated sentences:", negsent_counter
print "Number of cues:", cue_counter
print "Number of scopes:", scope_counter
print "Number of events:", event_counter

6 changes: 3 additions & 3 deletions dev/main_program.py → train_models.py
@@ -8,7 +8,7 @@
from data_processing import read_file
from feature_extraction import extract_features_scope, extract_features_cue
from utils import make_splits, get_cue_lexicon, print_cue_lexicons, make_complete_labelarray
from evaluation import *
from file_writing import *

def cue_detection(C_value, train_file, train_file_parsed, config='training'):
""" Extract sentence dictionaries, lexicons and features, then train the cue model"""
@@ -36,7 +36,7 @@ def scope_resolution(C_value, state_value, train_file, train_file_parsed, config

def save_cue_learner(train_file, train_file_parsed):
"""
Saves the cue learner object, the cue vectorizer, the cue lexicon
Save the cue learner object, the cue vectorizer, the cue lexicon
and the affixal cue lexicon to files
"""
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = cue_detection(0.20, train_file, train_file_parsed)
@@ -46,7 +46,7 @@ def save_cue_learner(train_file, train_file_parsed):
pickle.dump(affixal_cue_lexicon, open("affixal_cue_lexicon.pkl", "wb"))

def save_scope_learner(train_file, train_file_parsed):
"""Saves the scope learner object and the scope vectorizer object to files"""
"""Save the scope learner object and the scope vectorizer object to files"""
scope_ssvm, scope_vectorizer = scope_resolution(0.10, 10, train_file, train_file_parsed)
pickle.dump(scope_ssvm, open("scope_model.pkl", "wb"))
joblib.dump(scope_vectorizer, 'scope_vectorizer.pkl')
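
For reference, a sketch of reading these artifacts back, roughly what negtool.py's load_cue_learner would do; the cue file names are assumptions by analogy with the scope ones shown above:

import pickle
from sklearn.externals import joblib

cue_ssvm = pickle.load(open("cue_model.pkl", "rb"))    # assumed file name
cue_vectorizer = joblib.load("cue_vectorizer.pkl")     # assumed file name
scope_ssvm = pickle.load(open("scope_model.pkl", "rb"))
scope_vectorizer = joblib.load("scope_vectorizer.pkl")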
1 change: 0 additions & 1 deletion utils.py
@@ -176,7 +176,6 @@ def make_splits(X, y, splits):
y_train.append(np.asarray(y[i:(i + offset)]))
i += offset
j += 1
print "Number of training instances:", len(X_train)
return np.asarray(X_train), np.asarray(y_train)

def convert_to_IO(y):
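
The removed line was only a progress print; make_splits still chunks the feature and label arrays. A hedged usage sketch (shapes are invented, and the exact chunking is defined in the elided lines above):

import numpy as np
from utils import make_splits

X = [np.random.rand(6, 4) for _ in range(20)]    # invented per-sentence features
y = [np.zeros(6, dtype=int) for _ in range(20)]  # invented label sequences
X_train, y_train = make_splits(X, y, 4)          # presumably 4 roughly equal chunks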
