Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lookup that can find entries with -1 conjugation cost #257

Merged
merged 2 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public interface Dictionary extends AutoCloseable {
* Create a parallel stream of all words in the dictionary as morphemes.
*
* Corresponds to the lines in the lexicon csv, i.e. it includes entries that
* appear only when refered from other words (e.g. as constitution) during an
* appear only when referred from other words (e.g. as constitution) during an
* analysis and excludes entries that are automatically added to store a
* normalization form of another word. Entries in the stream are not sorted.
*
Expand All @@ -72,17 +72,33 @@ public interface Dictionary extends AutoCloseable {
/**
* Lookup entries in the dictionary without performing an analysis.
*
* Specified surface will be normalized. This will work like performing analysis
* on the given headword and find paths with a single morpheme, but returns all
* Specified surface will be normalized. This works like performing analysis on
* the given headword and finding paths with a single morpheme, but returns all
* paths instead of the lowest cost one.
*
* @param surface
* to lookup. Will be normalized beforehand.
* surface to lookup. Will be normalized beforehand.
* @return a list of morphemes that match the surface. Their begin/end will be
* 0/length of their headword.
*/
public List<Morpheme> lookup(CharSequence surface);

/**
* Lookup from all entries in the dictionary.
*
* Specified surface will be normalized. This can find entries that are not
* indexed and appear only when referred from other words (e.g. constitution),
* but is VERY slow in exchange. {@link Dictionary#lookup(CharSequence)} should
* be used for most cases.
*
* @param surface
* surface to lookup. Will be normalized beforehand.
* @return a list of morphemes that match the surface. Their begin/end will be
* 0/length of their headword.
* @see Dictionary#lookup(CharSequence)
*/
public List<Morpheme> lookupAllEntries(CharSequence surface);

/**
* Create an out-of-vocabulary morpheme from the pos id and string forms.
*
Expand Down
20 changes: 14 additions & 6 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -174,12 +176,8 @@ public Stream<Morpheme> entries() {

@Override
public List<Morpheme> lookup(CharSequence surface) {
UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
for (InputTextPlugin plugin : inputTextPlugins) {
plugin.rewrite(builder);
}
UTF8InputText input = builder.build();
byte[] bytes = input.getByteText();
TextNormalizer textNormalizer = textNormalizer();
byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText();

List<Morpheme> morphemes = new ArrayList<>();
WordLookup wordLookup = lexicon.makeLookup();
Expand All @@ -200,6 +198,16 @@ public List<Morpheme> lookup(CharSequence surface) {
return morphemes;
}

@Override
public List<Morpheme> lookupAllEntries(CharSequence surface) {
    // Normalize the query once; every entry is compared against these bytes.
    TextNormalizer normalizer = textNormalizer();
    byte[] target = normalizer.normalizedInputText(surface).getByteText();

    // Scan the whole entries() stream and keep the entries whose normalized
    // surface is byte-for-byte equal to the normalized query.
    return entries().filter(entry -> {
        byte[] candidate = normalizer.normalizedInputText(entry.surface()).getByteText();
        return Arrays.equals(target, candidate);
    }).collect(Collectors.toList());
}

@Override
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
String dictionaryForm) {
Expand Down
17 changes: 14 additions & 3 deletions src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,24 @@ private static List<InputTextPlugin> setupDefaultInputTextPlugins(Grammar gramma
return plugins;
}

/** Normalize given text */
public String normalize(CharSequence text) {
/**
* Build {@link InputText} for the text and apply InputTextPlugins.
*
* @param text
* text to normalize
* @return Normalized text as InputText
*/
/* internal */ InputText normalizedInputText(CharSequence text) {
UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar);
for (InputTextPlugin plugin : inputTextPlugins) {
plugin.rewrite(builder);
}
UTF8InputText input = builder.build();
return builder.build();
}

/**
 * Normalize the text.
 *
 * @param text
 *            text to normalize
 * @return the text of the InputText built by
 *         {@code normalizedInputText(text)}
 */
public String normalize(CharSequence text) {
    return normalizedInputText(text).getText();
}
}
32 changes: 32 additions & 0 deletions src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,38 @@ abc,1,1,4675,AbC,名詞,普通名詞,一般,*,*,*,エービーシー,,,,,""")
assertEquals("abc", found.get(3).surface())
}

@Test
fun slowLookup() {
    // a surface that matches no entry yields an empty list
    val missing = dict.lookupAllEntries("存在しない語")
    assertTrue(missing.isEmpty())

    // entry coming from the system dictionary
    val systemHits = dict.lookupAllEntries("東京都")
    assertEquals(1, systemHits.size)
    assertEquals("トウキョウト", systemHits.get(0).readingForm())

    // entry coming from a user dictionary
    val userHits = TestDictionary.user1().lookupAllEntries("すだち")
    assertEquals(1, userHits.size)
    assertEquals("徳島県産", userHits.get(0).getUserData())

    // an entry with -1 conjunction cost CAN be found by this lookup
    val hiddenHits = dict.lookupAllEntries("隠し")
    assertEquals(1, hiddenHits.size)
    assertEquals("隠し", hiddenHits.get(0).surface())

    // the query surface is normalized before matching
    val normalizedHits = dict.lookupAllEntries("特A")
    assertEquals(1, normalizedHits.size)
    assertEquals("特A", normalizedHits.get(0).normalizedForm())

    // InputTextPlugins are applied to the query
    val yomiganaHits = dict.lookupAllEntries("京都(キョウト)")
    assertEquals(1, yomiganaHits.size)
    assertEquals("京都", yomiganaHits.get(0).normalizedForm())
}

@Test
fun oovMorpheme() {
val m1 = dict.oovMorpheme(1, "OOV")
Expand Down
3 changes: 1 addition & 2 deletions src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022 Works Applications Co., Ltd.
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -54,7 +54,6 @@ class TextNormalizerTest {
// will use default config, which has InputTextPlugins of
// [Default, ProlongedSoundMark, IgnoreYomigana]
val tn = dic.textNormalizer()
print(dic.inputTextPlugins)

assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ")) // default
assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark
Expand Down
Loading