From e5bdab39eb061bc311b6d3fd6a6beaad51bf1a0d Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 3 Dec 2024 13:26:09 +0900 Subject: [PATCH 1/2] add slowLookupAllEntries method --- .../com/worksap/nlp/sudachi/Dictionary.java | 24 +++++++++++--- .../nlp/sudachi/JapaneseDictionary.java | 20 ++++++++---- .../worksap/nlp/sudachi/TextNormalizer.java | 17 ++++++++-- .../nlp/sudachi/JapaneseDictionaryTest.kt | 32 +++++++++++++++++++ .../worksap/nlp/sudachi/TextNormalizerTest.kt | 3 +- 5 files changed, 81 insertions(+), 15 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java index 9e9bffed..188c076e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java @@ -61,7 +61,7 @@ public interface Dictionary extends AutoCloseable { * Create a parallel stream of all words in the dictionary as morphemes. * * Corresponds to the lines in the lexicon csv, i.e. it includes entries that - * appear only when refered from other words (e.g. as constitution) during an + * appear only when referred from other words (e.g. as constitution) during an * analysis and excludes entries that automatically added to store a * normalization form of another word. Entries in the stream are not sorted. * @@ -72,17 +72,33 @@ public interface Dictionary extends AutoCloseable { /** * Lookup entries in the dictionary without performing an analysis. * - * Specified surface will be normalized. This will work like performing analysis - * on the given headword and find paths with a single morpheme, but returns all + * Specified surface will be normalized. This works like performing analysis on + * the given headword and find paths with a single morpheme, but returns all * paths instead of the lowest cost one. * * @param surface - * to lookup. Will be normalized beforehand. + * surface to lookup. Will be normalized beforehand. 
* @return a list of morphemes that match the surface. Their begin/end will be * 0/length of their headword. */ public List lookup(CharSequence surface); + /** + * Lookup from all entries in the dictionary. + * + * Specified surface will be normalized. This can find entries that are not + * indexed and appear only when referred from other words (e.g. constitution), + * but is VERY slow. {@link Dictionary#lookup(CharSequence)} should be + * used for most cases. + * + * @param surface + * surface to lookup. Will be normalized beforehand. + * @return a list of morphemes that match the surface. Their begin/end will be + * 0/length of their headword. + * @see Dictionary#lookup(CharSequence) + */ + public List slowLookupAllEntries(CharSequence surface); + /** * Create an out-of-vocabulary morpheme from the pos id and string forms. * diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java index 9b06c986..cc307b33 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java @@ -24,6 +24,8 @@ import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; +import java.util.stream.Collectors; import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -174,12 +176,8 @@ public Stream entries() { @Override public List lookup(CharSequence surface) { - UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar); - for (InputTextPlugin plugin : inputTextPlugins) { - plugin.rewrite(builder); - } - UTF8InputText input = builder.build(); - byte[] bytes = input.getByteText(); + TextNormalizer textNormalizer = textNormalizer(); + byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText(); List morphemes = new ArrayList<>(); WordLookup wordLookup = lexicon.makeLookup(); @@ -200,6 +198,16 @@ public List 
lookup(CharSequence surface) { return morphemes; } + @Override + public List slowLookupAllEntries(CharSequence surface) { + TextNormalizer textNormalizer = textNormalizer(); + byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText(); + + return entries() + .filter(m -> Arrays.equals(bytes, textNormalizer.normalizedInputText(m.surface()).getByteText())) + .collect(Collectors.toList()); + } + @Override public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm, String dictionaryForm) { diff --git a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java index 664309be..fe56a40d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java @@ -96,13 +96,24 @@ private static List setupDefaultInputTextPlugins(Grammar gramma return plugins; } - /** Normalize given text */ - public String normalize(CharSequence text) { + /** + * Build {@link InputText} for the text and apply InputTextPlugins. 
+ * + * @param text + * text to normalize + * @return Normalized text as InputText + */ + /* internal */ InputText normalizedInputText(CharSequence text) { UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar); for (InputTextPlugin plugin : inputTextPlugins) { plugin.rewrite(builder); } - UTF8InputText input = builder.build(); + return builder.build(); + } + + /** Normalize the text */ + public String normalize(CharSequence text) { + InputText input = normalizedInputText(text); return input.getText(); } } diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt index 93327de1..66be188c 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt @@ -201,6 +201,38 @@ abc,1,1,4675,AbC,名詞,普通名詞,一般,*,*,*,エービーシー,,,,,""") assertEquals("abc", found.get(3).surface()) } + @Test + fun slowLookup() { + // nothing + val nothing = dict.slowLookupAllEntries("存在しない語") + assertTrue(nothing.isEmpty()) + + // system + val tokyo = dict.slowLookupAllEntries("東京都") + assertEquals(1, tokyo.size) + assertEquals("トウキョウト", tokyo[0].readingForm()) + + // user + val sudachi = TestDictionary.user1().slowLookupAllEntries("すだち") + assertEquals(1, sudachi.size) + assertEquals("徳島県産", sudachi[0].getUserData()) + + // CAN find entry with -1 conjunction cost + val hidden = dict.slowLookupAllEntries("隠し") + assertEquals(1, hidden.size) + assertEquals("隠し", hidden[0].surface()) + + // will be normalized + val norm = dict.slowLookupAllEntries("特A") + assertEquals(1, norm.size) + assertEquals("特A", norm[0].normalizedForm()) + + // inputTextPlugin + val yomi = dict.slowLookupAllEntries("京都(キョウト)") + assertEquals(1, yomi.size) + assertEquals("京都", yomi[0].normalizedForm()) + } + @Test fun oovMorpheme() { val m1 = dict.oovMorpheme(1, "OOV") diff --git a/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt 
b/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt index e7329413..9c66f5bc 100644 --- a/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Works Applications Co., Ltd. + * Copyright (c) 2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,6 @@ class TextNormalizerTest { // will use default config, which has InputTextPlugins of // [Default, ProlongedSoundMark, IgnoreYomigana] val tn = dic.textNormalizer() - print(dic.inputTextPlugins) assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ")) // default assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark From c99fa522f5f7d9ae2c04cdaa60d4c2816e7e61eb Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 3 Dec 2024 16:54:14 +0900 Subject: [PATCH 2/2] rm "slow" from the method name --- .../java/com/worksap/nlp/sudachi/Dictionary.java | 2 +- .../com/worksap/nlp/sudachi/JapaneseDictionary.java | 2 +- .../worksap/nlp/sudachi/JapaneseDictionaryTest.kt | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java index 188c076e..9b4426a1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java @@ -97,7 +97,7 @@ public interface Dictionary extends AutoCloseable { * 0/length of their headword. * @see Dictionary#lookup(CharSequence) */ - public List slowLookupAllEntries(CharSequence surface); + public List lookupAllEntries(CharSequence surface); /** * Create an out-of-vocabulary morpheme from the pos id and string forms. 
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java index cc307b33..4a5b4a19 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java @@ -199,7 +199,7 @@ public List lookup(CharSequence surface) { } @Override - public List slowLookupAllEntries(CharSequence surface) { + public List lookupAllEntries(CharSequence surface) { TextNormalizer textNormalizer = textNormalizer(); byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText(); diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt index 66be188c..551ada8e 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt @@ -204,31 +204,31 @@ abc,1,1,4675,AbC,名詞,普通名詞,一般,*,*,*,エービーシー,,,,,""") @Test fun slowLookup() { // nothing - val nothing = dict.slowLookupAllEntries("存在しない語") + val nothing = dict.lookupAllEntries("存在しない語") assertTrue(nothing.isEmpty()) // system - val tokyo = dict.slowLookupAllEntries("東京都") + val tokyo = dict.lookupAllEntries("東京都") assertEquals(1, tokyo.size) assertEquals("トウキョウト", tokyo[0].readingForm()) // user - val sudachi = TestDictionary.user1().slowLookupAllEntries("すだち") + val sudachi = TestDictionary.user1().lookupAllEntries("すだち") assertEquals(1, sudachi.size) assertEquals("徳島県産", sudachi[0].getUserData()) // CAN find entry with -1 conjunction cost - val hidden = dict.slowLookupAllEntries("隠し") + val hidden = dict.lookupAllEntries("隠し") assertEquals(1, hidden.size) assertEquals("隠し", hidden[0].surface()) // will be normalized - val norm = dict.slowLookupAllEntries("特A") + val norm = dict.lookupAllEntries("特A") assertEquals(1, norm.size) assertEquals("特A", norm[0].normalizedForm()) // inputTextPlugin - val yomi = 
dict.slowLookupAllEntries("京都(キョウト)") + val yomi = dict.lookupAllEntries("京都(キョウト)") assertEquals(1, yomi.size) assertEquals("京都", yomi[0].normalizedForm()) }