From 2b6fd20af5c7470810dae544cbae7912505a5b22 Mon Sep 17 00:00:00 2001 From: Tyrone Warren Date: Tue, 12 Sep 2017 13:51:09 -0700 Subject: [PATCH 1/3] Altered to accept and return maps of [string]interface string is searchable, interface is the return value --- closestmatch.go | 45 ++++++++++++++-------- closestmatch_test.go | 91 +++++++++++++++++++++++++++----------------- test/data.go | 35 ++++++++++++++--- 3 files changed, 115 insertions(+), 56 deletions(-) diff --git a/closestmatch.go b/closestmatch.go index 1ea7a6a..fa21f99 100755 --- a/closestmatch.go +++ b/closestmatch.go @@ -22,23 +22,26 @@ type ClosestMatch struct { type IDInfo struct { Key string NumSubstrings int + Data interface{} } // New returns a new structure for performing closest matches -func New(possible []string, subsetSize []int) *ClosestMatch { +func New(possible map[string]interface{}, subsetSize []int) *ClosestMatch { cm := new(ClosestMatch) cm.SubstringSizes = subsetSize cm.SubstringToID = make(map[string]map[uint32]struct{}) cm.ID = make(map[uint32]IDInfo) - for i, s := range possible { - substrings := cm.splitWord(strings.ToLower(s)) - cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)} + i := 0 + for k, m := range possible { + substrings := cm.splitWord(strings.ToLower(k)) + cm.ID[uint32(i)] = IDInfo{Key: k, NumSubstrings: len(substrings), Data: m} for substring := range substrings { if _, ok := cm.SubstringToID[substring]; !ok { cm.SubstringToID[substring] = make(map[uint32]struct{}) } cm.SubstringToID[substring][uint32(i)] = struct{}{} } + i++ } return cm @@ -77,16 +80,23 @@ func (cm *ClosestMatch) Save(filename string) error { return enc.Encode(cm) } +type workerResult struct { + Value int + Data interface{} +} + func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) { for j := range jobs { - m := make(map[string]int) + m := make(map[string]workerResult) if ids, ok := cm.SubstringToID[j.substring]; ok { weight := 1000 / len(ids) for id := range ids { if _, ok2 := m[cm.ID[id].Key]; !ok2 { - m[cm.ID[id].Key] = 0 + m[cm.ID[id].Key] = workerResult{Value: 0, Data: cm.ID[id].Data} } - m[cm.ID[id].Key] += 1 + 1000/len(cm.ID[id].Key) + weight + item := m[cm.ID[id].Key] + item.Value += 1 + 1000/len(cm.ID[id].Key) + weight + m[cm.ID[id].Key] = item } } results <- result{m: m} @@ -98,10 +108,10 @@ type job struct { } type result struct { - m map[string]int + m map[string]workerResult } -func (cm *ClosestMatch) match(searchWord string) map[string]int { +func (cm *ClosestMatch) match(searchWord string) map[string]workerResult { searchSubstrings := cm.splitWord(searchWord) searchSubstringsLen := len(searchSubstrings) @@ -118,12 +128,14 @@ func (cm *ClosestMatch) match(searchWord string) map[string]int { } close(jobs) - m := make(map[string]int) + m := make(map[string]workerResult) for a := 1; a <= searchSubstringsLen; a++ { r := <-results for key := range r.m { if _, ok := m[key]; ok { - m[key] += r.m[key] + x := m[key] + x.Value += r.m[key].Value + m[key] = x } else { m[key] = r.m[key] } @@ -142,22 +154,22 @@ func (cm *ClosestMatch) Closest(searchWord string) string { } // ClosestN searches for the `searchWord` and returns the n closests matches -func (cm *ClosestMatch) ClosestN(searchWord string, max int) []string { - matches := make([]string, 0, max) +func (cm *ClosestMatch) ClosestN(searchWord string, max int) []interface{} { + matches := make([]interface{}, 0, max) for i, pair := range rankByWordCount(cm.match(searchWord)) { if i >= max { break } - matches = append(matches, pair.Key) + matches = append(matches, pair.Data) } return matches } -func rankByWordCount(wordFrequencies map[string]int) PairList { +func rankByWordCount(wordFrequencies map[string]workerResult) PairList { pl := make(PairList, len(wordFrequencies)) i := 0 for k, v := range wordFrequencies { - pl[i] = Pair{k, v} + pl[i] = Pair{k, v.Value, v.Data} i++ } sort.Sort(sort.Reverse(pl)) @@ -167,6 +179,7 @@ func rankByWordCount(wordFrequencies map[string]int) PairList { type Pair struct { Key string Value int + Data interface{} } type PairList []Pair diff --git a/closestmatch_test.go b/closestmatch_test.go index 1a19789..8f3316d 100755 --- a/closestmatch_test.go +++ b/closestmatch_test.go @@ -6,17 +6,17 @@ import ( "strings" "testing" - "github.com/schollz/closestmatch/test" + "github.com/Yugloocamai/closestmatch/test" ) func BenchmarkNew(b *testing.B) { for i := 0; i < b.N; i++ { - New(test.WordsToTest, []int{3}) + New(test.BooksToTest, []int{3}) } } func BenchmarkSplitOne(b *testing.B) { - cm := New(test.WordsToTest, []int{3}) + cm := New(test.BooksToTest, []int{3}) searchWord := test.SearchWords[0] b.ResetTimer() for i := 0; i < b.N; i++ { @@ -26,8 +26,8 @@ func BenchmarkSplitOne(b *testing.B) { func BenchmarkClosestOne(b *testing.B) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{3}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{3}) searchWord := test.SearchWords[0] b.ResetTimer() for i := 0; i < b.N; i++ { @@ -37,8 +37,8 @@ func BenchmarkClosestOne(b *testing.B) { func BenchmarkClosest3(b *testing.B) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{3}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{3}) searchWord := test.SearchWords[0] b.ResetTimer() for i := 0; i < b.N; i++ { @@ -48,8 +48,8 @@ func BenchmarkClosest3(b *testing.B) { func BenchmarkClosest30(b *testing.B) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{3}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{3}) searchWord := test.SearchWords[0] b.ResetTimer() for i := 0; i < b.N; i++ { @@ -59,8 +59,8 @@ func BenchmarkClosest30(b *testing.B) { func BenchmarkFileLoad(b *testing.B) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{3, 4}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{3, 4}) cm.Save("test/books.list.cm.gz") b.ResetTimer() for i := 0; i < b.N; i++ { @@ -70,8 +70,8 @@ func BenchmarkFileLoad(b *testing.B) { func BenchmarkFileSave(b *testing.B) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{3, 4}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{3, 4}) b.ResetTimer() for i := 0; i < b.N; i++ { cm.Save("test/books.list.cm.gz") @@ -79,7 +79,13 @@ func BenchmarkFileSave(b *testing.B) { } func ExampleMatchingSmall() { - cm := New([]string{"love", "loving", "cat", "kit", "cats"}, []int{4}) + loveCats := make(map[string]interface{}) + loveCats["love"] = map[string]string{"name": "love"} + loveCats["loving"] = map[string]string{"name": "loving"} + loveCats["cat"] = map[string]string{"name": "cat"} + loveCats["kit"] = map[string]string{"name": "kit"} + loveCats["cats"] = map[string]string{"name": "cats"} + cm := New(loveCats, []int{4}) fmt.Println(cm.splitWord("love")) fmt.Println(cm.splitWord("kit")) fmt.Println(cm.Closest("kit")) @@ -91,7 +97,13 @@ func ExampleMatchingSmall() { } func ExampleMatchingSimple() { - cm := New(test.WordsToTest, []int{3}) + + booksLines := strings.Split(strings.ToLower(test.Books), "\n") + wordsToTest := make(map[string]interface{}) + for _, v := range booksLines { + wordsToTest[v] = map[string]string{"words": v} + } + cm := New(wordsToTest, []int{3}) for _, searchWord := range test.SearchWords { fmt.Printf("'%s' matched '%s'\n", searchWord, cm.Closest(searchWord)) } @@ -100,21 +112,26 @@ func ExampleMatchingSimple() { // 'mysterious afur at styles by christie' matched 'the mysterious affair at styles by agatha christie' // 'hard times by charles dickens' matched 'hard times by charles dickens' // 'complete william shakespeare' matched 'the complete works of william shakespeare by william shakespeare' - // 'war by hg wells' matched 'the war of the worlds by h. g. wells' + // 'War by HG Wells' matched 'the war of the worlds by h. g. wells' } func ExampleMatchingN() { - cm := New(test.WordsToTest, []int{4}) - fmt.Println(cm.ClosestN("war h.g. wells", 3)) + cm := New(test.BooksToTest, []int{4}) + results := cm.ClosestN("war h.g. wells", 3) + var slice []string + for _, v := range results { + slice = append(slice, v.(map[string]string)["name"]) + } + fmt.Println(slice) // Output: // [the war of the worlds by h. g. wells the time machine by h. g. wells war and peace by graf leo tolstoy] } func ExampleMatchingBigList() { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{3}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{3}) searchWord := "island of a thod mirrors" fmt.Println(cm.Closest(searchWord)) // Output: @@ -123,12 +140,12 @@ func ExampleMatchingBigList() { func ExampleMatchingCatcher() { bText, _ := ioutil.ReadFile("test/catcher.txt") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{5}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{5}) searchWord := "catcher in the rye by jd salinger" for i, match := range cm.ClosestN(searchWord, 3) { if i == 2 { - fmt.Println(match) + fmt.Println(match.(map[string]string)["name"]) } } // Output: @@ -137,12 +154,12 @@ func ExampleMatchingCatcher() { func ExampleMatchingPotter() { bText, _ := ioutil.ReadFile("test/potter.txt") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{5}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{5}) searchWord := "harry potter and the half blood prince by j.k. rowling" for i, match := range cm.ClosestN(searchWord, 3) { if i == 1 { - fmt.Println(match) + fmt.Println(match.(map[string]string)["name"]) } } // Output: @@ -151,23 +168,27 @@ func ExampleMatchingPotter() { func TestAccuracyBookWords(t *testing.T) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{4, 5}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{4, 5}) accuracy := cm.AccuracyMutatingWords() fmt.Printf("Accuracy with mutating words in book list:\t%2.1f%%\n", accuracy) } func TestAccuracyBookLetters(t *testing.T) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") - cm := New(wordsToTest, []int{5}) + books := test.GetBooks(string(bText)) + cm := New(books, []int{5}) accuracy := cm.AccuracyMutatingLetters() fmt.Printf("Accuracy with mutating letters in book list:\t%2.1f%%\n", accuracy) } func TestAccuracyDictionaryLetters(t *testing.T) { bText, _ := ioutil.ReadFile("test/popular.txt") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") + words := strings.Split(strings.ToLower(string(bText)), "\n") + wordsToTest := make(map[string]interface{}) + for _, v := range words { + wordsToTest[v] = map[string]string{"word": v} + } cm := New(wordsToTest, []int{2, 3, 4}) accuracy := cm.AccuracyMutatingWords() fmt.Printf("Accuracy with mutating letters in dictionary:\t%2.1f%%\n", accuracy) @@ -175,12 +196,12 @@ func TestAccuracyDictionaryLetters(t *testing.T) { func TestSaveLoad(t *testing.T) { bText, _ := ioutil.ReadFile("test/books.list") - wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n") + books := test.GetBooks(string(bText)) type TestStruct struct { cm *ClosestMatch } tst := new(TestStruct) - tst.cm = New(wordsToTest, []int{5}) + tst.cm = New(books, []int{5}) err := tst.cm.Save("test.gob") if err != nil { t.Error(err) @@ -191,8 +212,8 @@ func TestSaveLoad(t *testing.T) { if err != nil { t.Error(err) } - answer2 := tst2.cm.Closest("war of the worlds by hg wells") - answer1 := tst.cm.Closest("war of the worlds by hg wells") + answer2 := tst2.cm.Closest("war of the worlds") + answer1 := tst.cm.Closest("war of the worlds") if answer1 != answer2 { t.Errorf("Differing answers: '%s' '%s'", answer1, answer2) } diff --git a/test/data.go b/test/data.go index ed4ebb0..eef3386 100644 --- a/test/data.go +++ b/test/data.go @@ -4,7 +4,7 @@ import ( "strings" ) -var books = `Pride and Prejudice by Jane Austen +var Books = `Pride and Prejudice by Jane Austen Alice's Adventures in Wonderland by Lewis Carroll The Importance of Being Earnest: A Trivial Comedy for Serious People by Oscar Wilde A Tale of Two Cities by Charles Dickens @@ -109,12 +109,37 @@ Red Riding Hood by Sarah Blakley-Cartwright The Kingdom of This World by Alejo Carpentier Hitty, Her First Hundred Years by Rachel Field` -var WordsToTest []string +var WordsToTest map[string]interface{} +var BooksToTest map[string]interface{} + var SearchWords = []string{"cervantes don quixote", "mysterious afur at styles by christie", "hard times by charles dickens", "complete william shakespeare", "War by HG Wells"} func init() { - WordsToTest = strings.Split(strings.ToLower(books), "\n") - for i := range SearchWords { - SearchWords[i] = strings.ToLower(SearchWords[i]) + + WordsToTest = make(map[string]interface{}) + for _, v := range SearchWords { + WordsToTest[v] = map[string]string{"words": v} + } + // for i := range SearchWords { + // SearchWords[i] = strings.ToLower(SearchWords[i]) + // } + + BooksToTest = GetBooks(Books) + // for i := range SearchWords { + // SearchWords[i] = strings.ToLower(SearchWords[i]) + // } +} + +func GetBooks(text string) map[string]interface{} { + booksLines := strings.Split(strings.ToLower(text), "\n") + books := make(map[string]interface{}) + for _, v := range booksLines { + pair := strings.Split(v, " by ") + author := "unknown" + if len(pair) == 2 { + author = pair[1] + } + books[v] = map[string]string{"author": author, "name": v} } + return books } From 502175c3c48bae3b4381491be0b2bfa1dbfa9916 Mon Sep 17 00:00:00 2001 From: Yugloocamai Date: Thu, 14 Sep 2017 15:14:54 -0700 Subject: [PATCH 2/3] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6d9e132..36404a9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ +#THIS IS A MODIFIED VERSION OF CLOSESTMATCH! DOCUMENTATION WILL BE INACCURATE UNTIL I UPDATE. This fork allows an interface to be attached to each searchable item so we can return anything we want. + # closestmatch :page_with_curl: Version From 090dcfb9e28396c6d6e72b1cfc8e7cbb1ee3be9e Mon Sep 17 00:00:00 2001 From: Yugloocamai Date: Thu, 14 Sep 2017 15:15:23 -0700 Subject: [PATCH 3/3] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 36404a9..3398e4f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -#THIS IS A MODIFIED VERSION OF CLOSESTMATCH! DOCUMENTATION WILL BE INACCURATE UNTIL I UPDATE. This fork allows an interface to be attached to each searchable item so we can return anything we want. +# THIS IS A MODIFIED VERSION OF CLOSESTMATCH! DOCUMENTATION WILL BE INACCURATE UNTIL I UPDATE. This fork allows an interface to be attached to each searchable item so we can return anything we want. # closestmatch :page_with_curl: