Merge pull request #1 from neurotechuoft/original_push
Original commit
IVeselovskyy authored Sep 29, 2018
2 parents b7dc5a2 + 6a54395 commit 6c3cc73
Showing 26 changed files with 9,258,872 additions and 17 deletions.
Empty file added src/__init__.py
Empty file added src/bencmarks/another_file
76 changes: 76 additions & 0 deletions src/bencmarks/benchmark.py
@@ -0,0 +1,76 @@
import time
import codecs
import pandas as pd

from nlp import complete


def benchmark(file: str) -> float:
    """
    Benchmark the average time it takes to predict the next word for every
    entry in a file.
    :param file: a file that lists the words to complete, separated by newlines
    :return: the average time per prediction, in milliseconds
    """
    with open(file) as f:
        # Strip trailing newlines so the raw words are passed to autocomplete
        content = [line.strip() for line in f]

    time_total = [time_wrapper(x) for x in content]

    return sum(time_total) / len(time_total)


def time_wrapper(word: str) -> float:
    """
    A wrapper for the autocomplete function that times each call.
    :param word: a word to complete
    :return: the time it takes to complete the word, in milliseconds
    """
    start = time.time()
    complete.autocomplete(word)
    return (time.time() - start) * 1000.0

def performance_test():
    # Performance tests
    with codecs.open('/Users/ouutsuyuki/PycharmProjects/random/test_data_w2', "r", encoding='utf-8',
                     errors='ignore') as source:
        testdata = pd.read_table(source, names=["freq", "first", "second"])

    # Intrinsic test 1: second-word prediction based on the first word
    c1 = 0
    for i in range(1, len(testdata)):
        if complete.autocomplete(testdata["first"][i] + ' ') == testdata["second"][i]:
            c1 = c1 + 1
    print('Test1: Prediction accuracy = ', c1 / len(testdata))

    '''
    Usage: python complete.py phrase_to_complete
    Test1: Prediction accuracy =  0.05392156862745098
    '''

    # Test 2: predicting the full word given an increasing number of leading letters
    c2 = 0
    n = 0  # number of comparisons
    # for i in range(1, len(testdata)):
    for i in range(10):
        for j in range(len(testdata["first"][i])):
            n += 1
            if complete.autocomplete(testdata["first"][i][:j]) == testdata["first"][i]:
                c2 = c2 + 1

        for k in range(len(testdata["second"][i])):
            n += 1
            if complete.autocomplete(testdata["second"][i][:k]) == testdata["second"][i]:
                c2 = c2 + 1

    print('Test2: Prediction accuracy = ', c2 / n)

    '''
    Test2: Prediction accuracy =  0.2661290322580645 for 10 sets of words
    '''

    return (c1 / len(testdata), c2 / n)


if __name__ == "__main__":
    print(benchmark("test.txt"))
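
For reference, a minimal sketch of how this benchmark might be driven; the file name, its contents, and the import path are illustrative only and assume the repository's src/ layout is on the Python path:

# sketch: build an illustrative input file, then time the predictions
with open("test.txt", "w") as f:
    f.write("hel\nworl\npyth\n")  # one (partial) word per line

from bencmarks.benchmark import benchmark  # hypothetical import path
print(benchmark("test.txt"))  # mean autocomplete latency in milliseconds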
50 changes: 50 additions & 0 deletions src/bencmarks/native_version.py
@@ -0,0 +1,50 @@
import codecs
import pandas as pd
import pytrie
import pickle


def native_load_data(path_to_data):
    """
    Load the longest version of the trie, containing the most n-grams.
    :param path_to_data: path to the n-gram corpus
    :return: the trie, which is also pickled to disk as 'pytrie.pkl'
    """
    with codecs.open(path_to_data, "r", encoding='utf-8', errors='ignore') as fdata:
        grams = pd.read_table(fdata, names=["freq", "first", "second"])

    # Wrap each frequency in a 1-tuple; these tuples become the trie values
    grams['freq'] = grams['freq'].apply(lambda x: (x,))
    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    res = dict(zip(phrases, freqs))
    pytrie1 = pytrie.StringTrie(res)
    with open('pytrie.pkl', 'wb') as output:
        pickle.dump(pytrie1, output, pickle.HIGHEST_PROTOCOL)
    return pytrie1


def native_autocomplete(trie1, word):
    """
    Autocomplete the word/phrase using a native Python implementation of a trie.
    If it's an incomplete word, return the most likely completion.
    If it's a complete word, return the next word that is most likely.
    For now this is slower and less memory-efficient than the C++ version,
    so use that one instead.
    :param trie1: a StringTrie mapping "first second" bigrams to frequency tuples
    :param word: (part of) a word
    :return: completed string
    """
    # Scan every bigram starting with the prefix and keep the most frequent one
    maxi = 0
    compl = ''
    for key, freq in trie1.items(prefix=word):
        if freq[0] > maxi:  # frequencies are stored as 1-tuples
            maxi = freq[0]
            compl = key
    if compl == '':
        return "couldn't find autocomplete for \"{}\"".format(word)
    longer = compl.split(' ')
    # If the input already ends with the bigram's first word, predict the
    # second word; otherwise complete the first word.
    if len(longer) > 1 and longer[-2] in word:
        return longer[-1]
    return longer[0]
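
A minimal usage sketch, assuming a whitespace-separated bigram corpus in the format read above (the corpus path is illustrative):

trie = native_load_data('./w2_.txt')        # builds and pickles the trie
print(native_autocomplete(trie, 'hel'))     # partial word: returns the likeliest completion
print(native_autocomplete(trie, 'hello '))  # complete word plus space: returns the likeliest next word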


73 changes: 73 additions & 0 deletions src/bencmarks/random/randomization.py
@@ -0,0 +1,73 @@
import codecs
import random

# Earlier single-file version, kept for reference:
#with codecs.open('./w2_.txt', "r", encoding='utf-8', errors='ignore') as source:
#    data = [(random.random(), line) for line in source]

#with open('randomized_result', 'w') as target:
#    for _, line in data:
#        target.write(line)
#random.shuffle(data)
#train_data = data[:int((len(data) + 1) * .999)]  # first 99.9% to the training set
#test_data = data[int(len(data) * .999 + 1):]     # remaining 0.1% to the test set


# Shuffle each n-gram corpus (w2_ through w5_) and split it into train/test files
for i in range(2, 6):
    test_path = './w' + str(i) + '_.txt'
    with codecs.open(test_path, "r", encoding='utf-8',
                     errors='ignore') as source:
        data = [(random.random(), line) for line in source]

    random.shuffle(data)  # the random keys paired above are unused after this shuffle
    train_data = data[:int((len(data) + 1) * .999)]  # first 99.9% to the training set
    test_data = data[int(len(data) * .999 + 1):]     # remaining 0.1% to the test set
    # Save each split to its own file
    train_store = 'train_data_w' + str(i)
    with open(train_store, 'w') as target:
        for (_, line) in train_data:
            target.write(line)
    test_store = 'test_data_w' + str(i)
    with open(test_store, 'w') as target:
        for (_, line) in test_data:
            target.write(line)


'''
import random
with codecs.open('./w3_.txt', "r", encoding='utf-8', errors='ignore') as source:
    data = [(random.random(), line) for line in source]
#data.sort()
with open('another_file', 'w') as target:
    for _, line in data:
        target.write(line)

file = open("datafile.txt", "r")
data = list()
for line in file:
    data.append(line.split())  # split on your preferred delimiter
file.close()
random.shuffle(data)
train_data = data[:int((len(data) + 1) * .80)]  # first 80% to the training set
test_data = data[int(len(data) * .80 + 1):]     # remaining 20% to the test set
'''

'''
from nltk import ngrams
sentence = 'this is a foo bar sentences, and i want to ngramize it. I\'ve eaten too much'
n = 2
bigrams = ngrams(sentence.split(), n)
for grams in bigrams:
    print(grams)
'''

'''
with codecs.open('./w3_.txt', "r", encoding='utf-8', errors='ignore') as fdata:
    grams = pd.read_table(fdata, names=["freq", "first", "second"])
grams = grams.sort_values(by='freq', ascending=False)
random.sample(population, k)
'''
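
The shuffle-and-split pattern above could be factored into a small helper; a sketch under the same 99.9/0.1 split assumption (the function name and corpus path are illustrative):

import random

def shuffle_split(lines, train_frac=0.999):
    """Shuffle the lines and split them into (train, test) at train_frac."""
    data = list(lines)
    random.shuffle(data)
    cut = int(len(data) * train_frac)
    return data[:cut], data[cut:]

# illustrative usage with one of the corpora above
with open('./w2_.txt', encoding='utf-8', errors='ignore') as f:
    train, test = shuffle_split(f.readlines())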
