From e5bdab39eb061bc311b6d3fd6a6beaad51bf1a0d Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Tue, 3 Dec 2024 13:26:09 +0900
Subject: [PATCH 1/2] add slowLookupAllEntries method

---
 .../com/worksap/nlp/sudachi/Dictionary.java   | 24 +++++++++++---
 .../nlp/sudachi/JapaneseDictionary.java       | 20 ++++++++----
 .../worksap/nlp/sudachi/TextNormalizer.java   | 17 ++++++++--
 .../nlp/sudachi/JapaneseDictionaryTest.kt     | 32 +++++++++++++++++++
 .../worksap/nlp/sudachi/TextNormalizerTest.kt |  3 +-
 5 files changed, 81 insertions(+), 15 deletions(-)
diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
index 9e9bffed..188c076e 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
@@ -61,7 +61,7 @@ public interface Dictionary extends AutoCloseable {
      * Create a parallel stream of all words in the dictionary as morphemes.
      *
      * Corresponds to the lines in the lexicon csv, i.e. it includes entries that
-     * appear only when refered from other words (e.g. as constitution) during an
+     * appear only when referred from other words (e.g. as constitution) during an
      * analysis and excludes entries that automatically added to store a
      * normalization form of another word. Entries in the stream are not sorted.
      *
@@ -72,17 +72,33 @@ public interface Dictionary extends AutoCloseable {
     /**
      * Lookup entries in the dictionary without performing an analysis.
      * 
-     * Specified surface will be normalized. This will work like performing analysis
-     * on the given headword and find paths with a single morpheme, but returns all
+     * Specified surface will be normalized. This works like performing analysis on
+     * the given headword and finding paths with a single morpheme, but returns all
      * paths instead of the lowest cost one.
      * 
      * @param surface
-     *            to lookup. Will be normalized beforehand.
+     *            surface to lookup. Will be normalized beforehand.
      * @return a list of morphemes that match the surface. Their begin/end will be
      *         0/length of their headword.
      */
     public List<Morpheme> lookup(CharSequence surface);
 
+    /**
+     * Lookup from all entries in the dictionary.
+     * 
+     * Specified surface will be normalized. This can find entries that are not
+     * indexed and appear only when referred from other words (e.g. constitution),
+     * but is VERY slow in exchange. {@link Dictionary#lookup(CharSequence)} should
+     * be used in most cases.
+     * 
+     * @param surface
+     *            surface to lookup. Will be normalized beforehand.
+     * @return a list of morphemes that match the surface. Their begin/end will be
+     *         0/length of their headword.
+     * @see Dictionary#lookup(CharSequence)
+     */
+    public List<Morpheme> slowLookupAllEntries(CharSequence surface);
+
     /**
      * Create an out-of-vocabulary morpheme from the pos id and string forms.
      * 
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
index 9b06c986..cc307b33 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -24,6 +24,8 @@
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.stream.Collectors;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
@@ -174,12 +176,8 @@ public Stream<Morpheme> entries() {
 
     @Override
     public List<Morpheme> lookup(CharSequence surface) {
-        UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
-        for (InputTextPlugin plugin : inputTextPlugins) {
-            plugin.rewrite(builder);
-        }
-        UTF8InputText input = builder.build();
-        byte[] bytes = input.getByteText();
+        TextNormalizer textNormalizer = textNormalizer();
+        byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText();
 
         List<Morpheme> morphemes = new ArrayList<>();
         WordLookup wordLookup = lexicon.makeLookup();
@@ -200,6 +198,16 @@ public List<Morpheme> lookup(CharSequence surface) {
         return morphemes;
     }
 
+    @Override
+    public List<Morpheme> slowLookupAllEntries(CharSequence surface) {
+        TextNormalizer textNormalizer = textNormalizer();
+        byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText();
+
+        return entries()
+                .filter(m -> Arrays.equals(bytes, textNormalizer.normalizedInputText(m.surface()).getByteText()))
+                .collect(Collectors.toList());
+    }
+
     @Override
     public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
             String dictionaryForm) {
diff --git a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
index 664309be..fe56a40d 100644
--- a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
@@ -96,13 +96,24 @@ private static List<InputTextPlugin> setupDefaultInputTextPlugins(Grammar gramma
         return plugins;
     }
 
-    /** Normalize given text */
-    public String normalize(CharSequence text) {
+    /**
+     * Build {@link InputText} for the text and apply InputTextPlugins.
+     * 
+     * @param text
+     *            text to normalize
+     * @return Normalized text as InputText
+     */
+    /* internal */ InputText normalizedInputText(CharSequence text) {
         UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar);
         for (InputTextPlugin plugin : inputTextPlugins) {
             plugin.rewrite(builder);
         }
-        UTF8InputText input = builder.build();
+        return builder.build();
+    }
+
+    /** Normalize the text */
+    public String normalize(CharSequence text) {
+        InputText input = normalizedInputText(text);
         return input.getText();
     }
 }
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
index 93327de1..66be188c 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
@@ -201,6 +201,38 @@ abc,1,1,4675,AbC,名詞,普通名詞,一般,*,*,*,エービーシー,,,,,""")
     assertEquals("abc", found.get(3).surface())
   }
 
+  @Test
+  fun slowLookup() {
+    // nothing
+    val nothing = dict.slowLookupAllEntries("存在しない語")
+    assertTrue(nothing.isEmpty())
+
+    // system
+    val tokyo = dict.slowLookupAllEntries("東京都")
+    assertEquals(1, tokyo.size)
+    assertEquals("トウキョウト", tokyo[0].readingForm())
+
+    // user
+    val sudachi = TestDictionary.user1().slowLookupAllEntries("すだち")
+    assertEquals(1, sudachi.size)
+    assertEquals("徳島県産", sudachi[0].getUserData())
+
+    // CAN find entry with -1 conjunction cost
+    val hidden = dict.slowLookupAllEntries("隠し")
+    assertEquals(1, hidden.size)
+    assertEquals("隠し", hidden[0].surface())
+
+    // will be normalized
+    val norm = dict.slowLookupAllEntries("特A")
+    assertEquals(1, norm.size)
+    assertEquals("特A", norm[0].normalizedForm())
+
+    // inputTextPlugin
+    val yomi = dict.slowLookupAllEntries("京都（キョウト）")
+    assertEquals(1, yomi.size)
+    assertEquals("京都", yomi[0].normalizedForm())
+  }
+
   @Test
   fun oovMorpheme() {
     val m1 = dict.oovMorpheme(1, "OOV")
diff --git a/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt b/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt
index e7329413..9c66f5bc 100644
--- a/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Works Applications Co., Ltd.
+ * Copyright (c) 2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,7 +54,6 @@ class TextNormalizerTest {
     // will use default config, which has InputTextPlugins of
     // [Default, ProlongedSoundMark, IgnoreYomigana]
     val tn = dic.textNormalizer()
-    print(dic.inputTextPlugins)
 
     assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂＢΓД㈱ｶﾞウ゛⼼Ⅲ")) // default
     assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark

From c99fa522f5f7d9ae2c04cdaa60d4c2816e7e61eb Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Tue, 3 Dec 2024 16:54:14 +0900
Subject: [PATCH 2/2] rm "slow" from the method name

---
 .../java/com/worksap/nlp/sudachi/Dictionary.java     |  2 +-
 .../com/worksap/nlp/sudachi/JapaneseDictionary.java  |  2 +-
 .../worksap/nlp/sudachi/JapaneseDictionaryTest.kt    | 12 ++++++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
index 188c076e..9b4426a1 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
@@ -97,7 +97,7 @@ public interface Dictionary extends AutoCloseable {
      *         0/length of their headword.
      * @see Dictionary#lookup(CharSequence)
      */
-    public List<Morpheme> slowLookupAllEntries(CharSequence surface);
+    public List<Morpheme> lookupAllEntries(CharSequence surface);
 
     /**
      * Create an out-of-vocabulary morpheme from the pos id and string forms.
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
index cc307b33..4a5b4a19 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -199,7 +199,7 @@ public List<Morpheme> lookup(CharSequence surface) {
     }
 
     @Override
-    public List<Morpheme> slowLookupAllEntries(CharSequence surface) {
+    public List<Morpheme> lookupAllEntries(CharSequence surface) {
         TextNormalizer textNormalizer = textNormalizer();
         byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText();
 
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
index 66be188c..551ada8e 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
@@ -204,31 +204,31 @@ abc,1,1,4675,AbC,名詞,普通名詞,一般,*,*,*,エービーシー,,,,,""")
   @Test
   fun slowLookup() {
     // nothing
-    val nothing = dict.slowLookupAllEntries("存在しない語")
+    val nothing = dict.lookupAllEntries("存在しない語")
     assertTrue(nothing.isEmpty())
 
     // system
-    val tokyo = dict.slowLookupAllEntries("東京都")
+    val tokyo = dict.lookupAllEntries("東京都")
     assertEquals(1, tokyo.size)
     assertEquals("トウキョウト", tokyo[0].readingForm())
 
     // user
-    val sudachi = TestDictionary.user1().slowLookupAllEntries("すだち")
+    val sudachi = TestDictionary.user1().lookupAllEntries("すだち")
     assertEquals(1, sudachi.size)
     assertEquals("徳島県産", sudachi[0].getUserData())
 
     // CAN find entry with -1 conjunction cost
-    val hidden = dict.slowLookupAllEntries("隠し")
+    val hidden = dict.lookupAllEntries("隠し")
     assertEquals(1, hidden.size)
     assertEquals("隠し", hidden[0].surface())
 
     // will be normalized
-    val norm = dict.slowLookupAllEntries("特A")
+    val norm = dict.lookupAllEntries("特A")
     assertEquals(1, norm.size)
     assertEquals("特A", norm[0].normalizedForm())
 
     // inputTextPlugin
-    val yomi = dict.slowLookupAllEntries("京都（キョウト）")
+    val yomi = dict.lookupAllEntries("京都（キョウト）")
     assertEquals(1, yomi.size)
     assertEquals("京都", yomi[0].normalizedForm())
   }