-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from neurotechuoft/original_push
Original commit
- Loading branch information
Showing
26 changed files
with
9,258,872 additions
and
17 deletions.
There are no files selected for viewing
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import time | ||
import codecs | ||
import pandas as pd | ||
|
||
from nlp import complete | ||
|
||
|
||
def benchmark(file: str) -> float:
    """
    Benchmark the average time it takes to find the next word from a file.

    :param file: A file that lists the words to complete, separated by newline
    :return: the average time to predict the words, in milliseconds;
             0.0 when the file contains no words
    """
    with open(file) as f:
        # strip the trailing newline so the timed call sees the bare word,
        # not "word\n" (readlines keeps line endings)
        words = [line.strip() for line in f]

    if not words:
        # avoid ZeroDivisionError on an empty input file
        return 0.0

    timings = [time_wrapper(word) for word in words]
    return sum(timings) / len(timings)
|
||
|
||
def time_wrapper(word: str) -> float:
    """
    A wrapper for the autocomplete function that times a single call.

    :param word: A word to complete
    :return: time it takes to complete the word, in milliseconds
    """
    # time.perf_counter() is monotonic and high-resolution, the right clock
    # for interval timing; time.time() can jump (NTP adjustments) and has
    # coarse resolution on some platforms
    start = time.perf_counter()
    complete.autocomplete(word)
    return (time.perf_counter() - start) * 1000.0
|
||
def performance_test(data_path='/Users/ouutsuyuki/PycharmProjects/random/test_data_w2'):
    """
    Run intrinsic accuracy tests of the autocomplete model.

    :param data_path: path to a tab-separated table with columns
                      freq / first / second (defaults to the original
                      hard-coded location for backward compatibility)
    :return: tuple (Test 1 accuracy, Test 2 accuracy)
    """
    with codecs.open(data_path, "r", encoding='utf-8',
                     errors='ignore') as source:
        testdata = pd.read_table(source, names=["freq", "first", "second"])

    # Intrinsic Test 1: 2nd word prediction based on first word input
    c1 = 0
    for i in range(1, len(testdata)):
        if complete.autocomplete(testdata["first"][i] + ' ') == testdata["second"][i]:
            c1 += 1
    # The loop starts at index 1, so only len(testdata) - 1 comparisons are
    # made; divide by that count (the original divided by len(testdata)).
    acc1 = c1 / max(len(testdata) - 1, 1)
    print('Test1: Prediction accuracy = ', acc1)

    # Previously observed result, kept for reference:
    # Usage: python complete.py phrase_to_complete
    # Test1: Prediction accuracy = 0.05392156862745098

    # Test 2: prediction of completing a word given an increasing amount of
    # alphabet (prefix) input, over the first 10 rows
    c2 = 0
    n = 0  # number of comparisons actually made
    for i in range(10):
        for j in range(len(testdata["first"][i])):
            n += 1
            if complete.autocomplete(testdata["first"][i][:j]) == testdata["first"][i]:
                c2 += 1

        for k in range(len(testdata["second"][i])):
            n += 1
            if complete.autocomplete(testdata["second"][i][:k]) == testdata["second"][i]:
                c2 += 1

    # guard against division by zero when the first 10 rows are empty
    acc2 = c2 / n if n else 0.0
    print('Test2: Prediction accuracy = ', acc2)

    # Previously observed result, kept for reference:
    # Test2: Prediction accuracy = 0.2661290322580645 for 10 sets of words

    return (acc1, acc2)
|
||
|
||
# Entry point: benchmark autocomplete speed against a sample word list
# and print the average per-word prediction time.
if __name__ == "__main__":
    print(benchmark("test.txt"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import codecs | ||
import pandas as pd | ||
import pytrie | ||
import pickle | ||
|
||
|
||
def native_load_data(path_to_data, pickle_path='pytrie.pkl'):
    """
    Load the longest version of the trie, containing most n-grams.

    :param path_to_data: path to the n-gram corpus (tab-separated table
                         with columns freq / first / second)
    :param pickle_path: where to store the pickled trie on disk
                        (defaults to 'pytrie.pkl', as before)
    :return: the trie, which also gets stored on the drive
    """
    with codecs.open(path_to_data, "r", encoding='utf-8', errors='ignore') as fdata:
        grams = pd.read_table(fdata, names=["freq", "first", "second"])

    # wrap each frequency in a 1-tuple so trie values are comparable tuples
    grams['freq'] = grams['freq'].apply(lambda x: (x,))
    freqs = grams['freq'].values
    phrases = grams['first'] + " " + grams['second']
    trie = pytrie.StringTrie(dict(zip(phrases, freqs)))

    # NOTE(review): pickle is only safe for files you produced yourself;
    # never unpickle data from untrusted sources.
    with open(pickle_path, 'wb') as output:
        pickle.dump(trie, output, pickle.HIGHEST_PROTOCOL)
    return trie
|
||
|
||
def native_autocomplete(trie1, word):
    """
    Autocomplete the word/phrase using native python implementation of trie.

    If it's an incomplete word, then return the most likely completion.
    If it's a complete word (followed by a space), return the next word
    that is most likely. For now it's slower and less memory efficient
    than the C++ version, so use that one instead.

    :param trie1: a trie supporting .items(prefix=...) whose values are
                  mutually comparable frequencies (the loader stores 1-tuples)
    :param word: (part of) a word
    :return: completed string, or an explanatory message when no match exists
    """
    # BUG FIX: the original initialized maxi = 0 (an int) and compared it
    # against tuple frequencies, which raises TypeError in Python 3.
    # Track a None sentinel instead and seed it from the first item.
    best_freq = None
    best_phrase = ''
    for phrase, freq in trie1.items(prefix=word):
        if best_freq is None or freq > best_freq:
            best_freq = freq
            best_phrase = phrase

    if best_phrase == '':
        return "couldn't find autocomplete for \"{}\"".format(word)

    parts = best_phrase.split(' ')
    # If the stored match is a bigram whose first word is already present in
    # the query, the caller wants the *next* word, not the first.
    if len(parts) > 1 and parts[-2] in word:
        return parts[-1]
    return parts[0]
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import codecs | ||
import random | ||
|
||
#with codecs.open('./w2_.txt', "r", encoding='utf-8', errors='ignore') as source: | ||
# data = [ (random.random(), line) for line in source ] | ||
|
||
#with open('randomized_result','w') as target: | ||
# for _, line in data: | ||
# target.write( line ) | ||
#random.shuffle(data) | ||
#train_data = data[:int((len(data) + 1) * .999)] # Remaining 80% to training set | ||
#test_data = data[int(len(data) * .999 + 1):] # Splits 20% data to test set | ||
|
||
|
||
# Shuffle each n-gram corpus (w2 .. w5) and split it into a large training
# set (99.9% of lines) and a small held-out test set (0.1%).
for i in range(2, 6):
    test_path = './w' + str(i) + '_.txt'
    with codecs.open(test_path, "r", encoding='utf-8',
                     errors='ignore') as source:
        data = [(random.random(), line) for line in source]

    random.shuffle(data)
    # Use a single split index so the two sets are disjoint and together
    # cover every line. The original computed two independent indices
    # (int((len+1)*.999) and int(len*.999 + 1)), which silently dropped a
    # line between the train and test slices. (The inline comments also
    # claimed an 80/20 split; the actual ratio is 99.9/0.1.)
    split = int(len(data) * 0.999)
    train_data = data[:split]
    test_data = data[split:]

    # Save each partition to its destination file
    train_store = 'train_data_w' + str(i)
    with open(train_store, 'w') as target:
        for (_, line) in train_data:
            target.write(line)
    test_store = 'test_data_w' + str(i)
    with open(test_store, 'w') as target:
        for (_, line) in test_data:
            target.write(line)
|
||
|
||
''' | ||
import random | ||
with codecs.open('./w3_.txt', "r", encoding='utf-8', errors='ignore') as source: | ||
data = [ (random.random(), line) for line in source ] | ||
#data.sort() | ||
with open('another_file','w') as target: | ||
for _, line in data: | ||
target.write( line ) | ||
file = open("datafile.txt", "r") | ||
data = list() | ||
for line in file: | ||
data.append(line.split( # your preferred delimiter)) | ||
file.close() | ||
random.shuffle(data) | ||
train_data = data[:int((len(data) + 1) * .80)] # Remaining 80% to training set | ||
test_data = data[int(len(data) * .80 + 1):] # Splits 20% data to test set | ||
''' | ||
|
||
''' | ||
from nltk import ngrams | ||
sentence = 'this is a foo bar sentences, and i want to ongranize it. I\'ve eaten too much' | ||
n = 2 | ||
bigrams = ngrams(sentence.split(), n) | ||
for grams in bigrams: | ||
print(grams) | ||
''' | ||
|
||
''' | ||
with codecs.open('./w3_.txt', "r", encoding='utf-8', errors='ignore') as fdata: | ||
grams = pd.read_table(fdata, names=["freq", "first", "second"]) | ||
grams = grams.sort_values(by='freq', ascending=False) | ||
random.sample(population, k) | ||
''' | ||
|
Oops, something went wrong.