Merge pull request #1 from neurotechuoft/original_push
Original commit
IVeselovskyy authored Sep 29, 2018
2 parents b7dc5a2 + 6a54395 commit 6c3cc73
Showing 26 changed files with 9,258,872 additions and 17 deletions.
Empty file added src/__init__.py
Empty file added src/bencmarks/another_file
76 changes: 76 additions & 0 deletions src/bencmarks/benchmark.py
@@ -0,0 +1,76 @@
import time
import codecs
import pandas as pd

from nlp import complete


def benchmark(file: str) -> float:
    """
    Benchmark the average time it takes to predict the next word for every
    entry in a file.
    :param file: a file that lists the words to complete, separated by newlines
    :return: the average time per prediction, in milliseconds
    """
    with open(file) as f:
        # Strip trailing newlines so the raw words are passed to autocomplete
        content = [line.strip() for line in f]

    time_total = [time_wrapper(x) for x in content]

    return sum(time_total) / len(time_total)


def time_wrapper(word: str) -> float:
    """
    A wrapper for the autocomplete function that times each call.
    :param word: a word to complete
    :return: the time it takes to complete the word, in milliseconds
    """
    start = time.time()
    complete.autocomplete(word)
    return (time.time() - start) * 1000.0

def performance_test():
    # Performance tests
    with codecs.open('/Users/ouutsuyuki/PycharmProjects/random/test_data_w2', "r", encoding='utf-8',
                     errors='ignore') as source:
        testdata = pd.read_table(source, names=["freq", "first", "second"])

    # Intrinsic test 1: second-word prediction based on the first word
    c1 = 0
    for i in range(1, len(testdata)):
        if complete.autocomplete(testdata["first"][i] + ' ') == testdata["second"][i]:
            c1 = c1 + 1
    print('Test1: Prediction accuracy = ', c1 / len(testdata))

    '''
    Usage: python complete.py phrase_to_complete
    Test1: Prediction accuracy =  0.05392156862745098
    '''

    # Test 2: predicting the full word given an increasing number of leading letters
    c2 = 0
    n = 0  # number of comparisons
    # for i in range(1, len(testdata)):
    for i in range(10):
        for j in range(len(testdata["first"][i])):
            n += 1
            if complete.autocomplete(testdata["first"][i][:j]) == testdata["first"][i]:
                c2 = c2 + 1

        for k in range(len(testdata["second"][i])):
            n += 1
            if complete.autocomplete(testdata["second"][i][:k]) == testdata["second"][i]:
                c2 = c2 + 1

    print('Test2: Prediction accuracy = ', c2 / n)

    '''
    Test2: Prediction accuracy =  0.2661290322580645 for 10 sets of words
    '''

    return (c1 / len(testdata), c2 / n)


if __name__ == "__main__":
    print(benchmark("test.txt"))
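
For reference, a minimal sketch of how this benchmark might be driven; the file name, its contents, and the import path are illustrative only and assume the repository's src/ layout is on the Python path:

# sketch: build an illustrative input file, then time the predictions
with open("test.txt", "w") as f:
    f.write("hel\nworl\npyth\n")  # one (partial) word per line

from bencmarks.benchmark import benchmark  # hypothetical import path
print(benchmark("test.txt"))  # mean autocomplete latency in milliseconds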
50 changes: 50 additions & 0 deletions src/bencmarks/native_version.py
@@ -0,0 +1,50 @@
import codecs
import pandas as pd
import pytrie
import pickle


def native_load_data(path_to_data):
    """
    Load the longest version of the trie, containing the most n-grams.
    :param path_to_data: path to the n-gram corpus
    :return: the trie, which is also pickled to disk as 'pytrie.pkl'
    """
    with codecs.open(path_to_data, "r", encoding='utf-8', errors='ignore') as fdata:
        grams = pd.read_table(fdata, names=["freq", "first", "second"])

    # Wrap each frequency in a 1-tuple; these tuples become the trie values
    grams['freq'] = grams['freq'].apply(lambda x: (x,))
    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    res = dict(zip(phrases, freqs))
    pytrie1 = pytrie.StringTrie(res)
    with open('pytrie.pkl', 'wb') as output:
        pickle.dump(pytrie1, output, pickle.HIGHEST_PROTOCOL)
    return pytrie1


def native_autocomplete(trie1, word):
    """
    Autocomplete the word/phrase using a native Python implementation of a trie.
    If it's an incomplete word, return the most likely completion.
    If it's a complete word, return the next word that is most likely.
    For now this is slower and less memory-efficient than the C++ version,
    so use that one instead.
    :param trie1: a StringTrie mapping "first second" bigrams to frequency tuples
    :param word: (part of) a word
    :return: completed string
    """
    # Scan every bigram starting with the prefix and keep the most frequent one
    maxi = 0
    compl = ''
    for key, freq in trie1.items(prefix=word):
        if freq[0] > maxi:  # frequencies are stored as 1-tuples
            maxi = freq[0]
            compl = key
    if compl == '':
        return "couldn't find autocomplete for \"{}\"".format(word)
    longer = compl.split(' ')
    # If the input already ends with the bigram's first word, predict the
    # second word; otherwise complete the first word.
    if len(longer) > 1 and longer[-2] in word:
        return longer[-1]
    return longer[0]
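
A minimal usage sketch, assuming a whitespace-separated bigram corpus in the format read above (the corpus path is illustrative):

trie = native_load_data('./w2_.txt')        # builds and pickles the trie
print(native_autocomplete(trie, 'hel'))     # partial word: returns the likeliest completion
print(native_autocomplete(trie, 'hello '))  # complete word plus space: returns the likeliest next word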


73 changes: 73 additions & 0 deletions src/bencmarks/random/randomization.py
@@ -0,0 +1,73 @@
import codecs
import random

# Earlier single-file version, kept for reference:
#with codecs.open('./w2_.txt', "r", encoding='utf-8', errors='ignore') as source:
#    data = [(random.random(), line) for line in source]

#with open('randomized_result', 'w') as target:
#    for _, line in data:
#        target.write(line)
#random.shuffle(data)
#train_data = data[:int((len(data) + 1) * .999)]  # first 99.9% to the training set
#test_data = data[int(len(data) * .999 + 1):]     # remaining 0.1% to the test set


# Shuffle each n-gram corpus (w2_ through w5_) and split it into train/test files
for i in range(2, 6):
    test_path = './w' + str(i) + '_.txt'
    with codecs.open(test_path, "r", encoding='utf-8',
                     errors='ignore') as source:
        data = [(random.random(), line) for line in source]

    random.shuffle(data)  # the random keys paired above are unused after this shuffle
    train_data = data[:int((len(data) + 1) * .999)]  # first 99.9% to the training set
    test_data = data[int(len(data) * .999 + 1):]     # remaining 0.1% to the test set
    # Save each split to its own file
    train_store = 'train_data_w' + str(i)
    with open(train_store, 'w') as target:
        for (_, line) in train_data:
            target.write(line)
    test_store = 'test_data_w' + str(i)
    with open(test_store, 'w') as target:
        for (_, line) in test_data:
            target.write(line)


'''
import random
with codecs.open('./w3_.txt', "r", encoding='utf-8', errors='ignore') as source:
    data = [(random.random(), line) for line in source]
#data.sort()
with open('another_file', 'w') as target:
    for _, line in data:
        target.write(line)

file = open("datafile.txt", "r")
data = list()
for line in file:
    data.append(line.split())  # split on your preferred delimiter
file.close()
random.shuffle(data)
train_data = data[:int((len(data) + 1) * .80)]  # first 80% to the training set
test_data = data[int(len(data) * .80 + 1):]     # remaining 20% to the test set
'''

'''
from nltk import ngrams
sentence = 'this is a foo bar sentences, and i want to ngramize it. I\'ve eaten too much'
n = 2
bigrams = ngrams(sentence.split(), n)
for grams in bigrams:
    print(grams)
'''

'''
with codecs.open('./w3_.txt', "r", encoding='utf-8', errors='ignore') as fdata:
    grams = pd.read_table(fdata, names=["freq", "first", "second"])
grams = grams.sort_values(by='freq', ascending=False)
random.sample(population, k)
'''
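
The shuffle-and-split pattern above could be factored into a small helper; a sketch under the same 99.9/0.1 split assumption (the function name and corpus path are illustrative):

import random

def shuffle_split(lines, train_frac=0.999):
    """Shuffle the lines and split them into (train, test) at train_frac."""
    data = list(lines)
    random.shuffle(data)
    cut = int(len(data) * train_frac)
    return data[:cut], data[cut:]

# illustrative usage with one of the corpora above
with open('./w2_.txt', encoding='utf-8', errors='ignore') as f:
    train, test = shuffle_split(f.readlines())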
