Renamed files, cleanup
marenger committed Jan 24, 2017
1 parent ad03abb commit 41ded79
Showing 6 changed files with 22 additions and 67 deletions.
27 changes: 2 additions & 25 deletions input_output.py → file_reading.py
@@ -73,8 +73,6 @@ def read_cuepredicted_data(filename, mode):
sentence = {}
cues = []
mw_cues = []
scopes = {}
events = {}
line_counter = 0
counter = 0
cue_counter = 0
@@ -93,16 +91,8 @@ def read_cuepredicted_data(filename, mode):
sentence[key]['head-pos'] = sentence[head_index][5]
else:
sentence[key]['head-pos'] = sentence[key][5]

if len(scopes) != len(cues):
for i in range(len(cues)):
if not i in scopes:
scopes[i] = []

sentence['cues'] = cues
sentence['mw_cues'] = mw_cues
sentence['scopes'] = scopes
sentence['events'] = events
if len(cues) > 0:
sentence['neg'] = True
else:
@@ -114,8 +104,6 @@ def read_cuepredicted_data(filename, mode):
prev_cue_column = -1
cues = []
mw_cues = []
scopes = {}
events = {}
line_counter += 1
continue

@@ -138,27 +126,16 @@ def read_cuepredicted_data(filename, mode):
else:
cues.append([tokens[i], counter, 's'])
prev_cue_column = i
#scope column
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-1) % 3 == 0:
cue_counter = (i-upper_limit+2)/3
if cue_counter in scopes:
scopes[cue_counter].append([tokens[i], counter])
else:
scopes[cue_counter] = [[tokens[i], counter]]
#event column
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-2) % 3 == 0:
cue_counter = (i-upper_limit+3)/3
events[cue_counter] = tokens[i]

if mode == 'raw':
token_dict['head'] = tokens[5]
token_dict['deprel'] = tokens[6]
else:
token_dict[5] = tokens[4] #record only the pos-tag, not cpos-tag for conll-x data
token_dict[5] = tokens[4] #for conll-x data: record only the pos-tag, not cpos-tag
token_dict['head'] = tokens[6]
token_dict['deprel'] = tokens[7]

sentence[counter] = token_dict
counter += 1
line_counter += 1
return instances
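
A minimal sketch (mine, not code from this commit) of the column arithmetic behind the modulo tests above: in the CD-style format, each negation instance appends a (cue, scope, event) column triple after the fixed per-token columns, and upper_limit / cue_offset mirror the values set in read_labelled_data.py below. After this commit, read_cuepredicted_data keeps only the cue test.

# Sketch of the assumed column layout; values taken from read_labelled_data.py.
upper_limit = 7                          # last fixed per-token column (assumed)
cue_offset = upper_limit - 5

def column_kind(i):
    # Columns up to upper_limit hold word, lemma, pos, head, deprel, etc.
    if i <= upper_limit:
        return 'token'
    if (i - cue_offset) % 3 == 0:        # cue column of some negation
        return 'cue'
    if (i - cue_offset - 1) % 3 == 0:    # scope column
        return 'scope'
    return 'event'                       # remaining case: event column

print([column_kind(i) for i in range(8, 14)])
# -> ['cue', 'scope', 'event', 'cue', 'scope', 'event']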

File renamed without changes.
4 changes: 2 additions & 2 deletions negtool.py
@@ -5,10 +5,10 @@
from os import remove, path
from sklearn.externals import joblib

from input_output import *
from file_reading import *
from feature_extraction import extract_features_scope, extract_features_cue
from utils import *
from evaluation import *
from file_writing import *

def load_cue_learner():
"""
51 changes: 15 additions & 36 deletions dev/data_processing.py → read_labelled_data.py
@@ -1,11 +1,11 @@
import numpy as np

def read_file(filename, conll_filename):
def read_file(filename):
"""
Read input file and make dictionaries for each sentence.
Used for training with the CD dataset.
"""
with open(filename, 'r') as infile1, open(conll_filename) as infile2:
with open(filename, 'r') as infile1:
sentence = {}
cues = []
mw_cues = []
@@ -15,13 +15,14 @@ def read_file(filename, conll_filename):
counter = 0
cue_counter = 0
prev_cue_column = -1
lower_limit = 3
upper_limit = 7
cue_offset = upper_limit - 5
instances = []

for line in infile1:
conll_line = infile2.readline()
token_dict = {}
tokens = line.split()
conll_tokens = conll_line.split()
#check for sentence end
if len(tokens) == 0:
for key in sentence:
@@ -45,8 +46,6 @@ def read_file(filename, conll_filename):
sentence['neg'] = True
else:
sentence['neg'] = False

#yield sentence
instances.append(sentence)
sentence = {}
counter = 0
@@ -59,9 +58,9 @@ def read_file(filename, conll_filename):
continue

for i in range(len(tokens)):
if tokens[i] != "_" and i < 6:
token_dict[i] = tokens[i]
elif tokens[i] != "***" and tokens[i] != "_" and i > 6 and (i-1) % 3 == 0:
if tokens[i] != "_" and i < lower_limit:
token_dict[i+2] = tokens[i]
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset) % 3 == 0:
if i == prev_cue_column:
cues[-1][2] = 'm'
prev_cue_column = i
@@ -74,40 +73,20 @@ def read_file(filename, conll_filename):
else:
cues.append([tokens[i], counter, 's'])
prev_cue_column = i
elif tokens[i] != "***" and tokens[i] != "_" and i > 6 and (i-2) % 3 == 0:
cue_counter = (i-8)/3
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-1) % 3 == 0:
cue_counter = (i-upper_limit+2)/3
if cue_counter in scopes:
scopes[cue_counter].append([tokens[i], counter])
else:
scopes[cue_counter] = [[tokens[i], counter]]
elif tokens[i] != "***" and tokens[i] != "_" and i > 6 and (i-3) % 3 == 0:
cue_counter = (i-9)/3
elif tokens[i] != "***" and tokens[i] != "_" and i > upper_limit and (i-cue_offset-2) % 3 == 0:
cue_counter = (i-upper_limit+3)/3
events[cue_counter] = tokens[i]
token_dict['head'] = conll_tokens[6]
token_dict['deprel'] = conll_tokens[7]
token_dict[5] = tokens[4]
token_dict['head'] = tokens[6]
token_dict['deprel'] = tokens[7]
sentence[counter] = token_dict
counter += 1
line_counter += 1
return instances

if __name__ == '__main__':
cue_counter = 0
scope_counter = 0
event_counter = 0
sentence_counter = 0
negsent_counter = 0
ex_sent = None
for sentence in read_file("../data/gold/cdd.txt", "../data/cdd_parsed.txt"):
cue_counter += len(sentence['cues'])
scope_counter += len(sentence['scopes'])
event_counter += len(sentence['events'])
sentence_counter += 1
if sentence['neg']:
negsent_counter += 1

print "Number of sentences:", sentence_counter
print "Number of negated sentences:", negsent_counter
print "Number of cues:", cue_counter
print "Number of scopes:", scope_counter
print "Number of events:", event_counter

6 changes: 3 additions & 3 deletions dev/main_program.py → train_models.py
@@ -8,7 +8,7 @@
from data_processing import read_file
from feature_extraction import extract_features_scope, extract_features_cue
from utils import make_splits, get_cue_lexicon, print_cue_lexicons, make_complete_labelarray
from evaluation import *
from file_writing import *

def cue_detection(C_value, train_file, train_file_parsed, config='training'):
""" Extract sentence dictionaries, lexicons and features, then train the cue model"""
@@ -36,7 +36,7 @@ def scope_resolution(C_value, state_value, train_file, train_file_parsed, config

def save_cue_learner(train_file, train_file_parsed):
"""
Saves the cue learner object, the cue vectorizer, the cue lexicon
Save the cue learner object, the cue vectorizer, the cue lexicon
and the affixal cue lexicon to files
"""
cue_ssvm, cue_vectorizer, cue_lexicon, affixal_cue_lexicon = cue_detection(0.20, train_file, train_file_parsed)
@@ -46,7 +46,7 @@ def save_cue_learner(train_file, train_file_parsed):
pickle.dump(affixal_cue_lexicon, open("affixal_cue_lexicon.pkl", "wb"))

def save_scope_learner(train_file, train_file_parsed):
"""Saves the scope learner object and the scope vectorizer object to files"""
"""Save the scope learner object and the scope vectorizer object to files"""
scope_ssvm, scope_vectorizer = scope_resolution(0.10, 10, train_file, train_file_parsed)
pickle.dump(scope_ssvm, open("scope_model.pkl", "wb"))
joblib.dump(scope_vectorizer, 'scope_vectorizer.pkl')
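
For reference, a sketch of reading these artifacts back, roughly what negtool.py's load_cue_learner would do; the cue file names are assumptions by analogy with the scope ones shown above:

import pickle
from sklearn.externals import joblib

cue_ssvm = pickle.load(open("cue_model.pkl", "rb"))    # assumed file name
cue_vectorizer = joblib.load("cue_vectorizer.pkl")     # assumed file name
scope_ssvm = pickle.load(open("scope_model.pkl", "rb"))
scope_vectorizer = joblib.load("scope_vectorizer.pkl")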
1 change: 0 additions & 1 deletion utils.py
@@ -176,7 +176,6 @@ def make_splits(X, y, splits):
y_train.append(np.asarray(y[i:(i + offset)]))
i += offset
j += 1
print "Number of training instances:", len(X_train)
return np.asarray(X_train), np.asarray(y_train)

def convert_to_IO(y):
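
The removed line was only a progress print; make_splits still chunks the feature and label arrays. A hedged usage sketch (shapes are invented, and the exact chunking is defined in the elided lines above):

import numpy as np
from utils import make_splits

X = [np.random.rand(6, 4) for _ in range(20)]    # invented per-sentence features
y = [np.zeros(6, dtype=int) for _ in range(20)]  # invented label sequences
X_train, y_train = make_splits(X, y, 4)          # presumably 4 roughly equal chunks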
