diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java index a300de78..ac32a59a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java @@ -16,12 +16,13 @@ package com.worksap.nlp.sudachi; -import com.worksap.nlp.sudachi.dictionary.POS; - import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.function.Predicate; +import java.util.stream.Stream; + +import com.worksap.nlp.sudachi.dictionary.POS; /** * A lexicon and a grammar for morphological analysis. @@ -56,6 +57,18 @@ public interface Dictionary extends AutoCloseable { @Override public void close() throws IOException; + /** + * Create a parallel stream of all words in the dictionary as morphemes. + * + * Corresponds to the lines in the lexicon csv, i.e. it includes entries that + * appear only when referred to from other words (e.g. as a constituent) during an + * analysis and excludes entries that are automatically added to store a + * normalization form of another word. Entries in the stream are not sorted. + * + * @return a parallel stream of morphemes. + */ + public Stream entries(); + /** * Lookup entries in the dictionary without performing an analysis.
* diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java index 77eeeaa5..f7b43727 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java @@ -25,9 +25,15 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; +import java.util.Iterator; import java.util.List; +import java.util.NoSuchElementException; +import java.util.Spliterator; +import java.util.Spliterators; import java.util.function.Predicate; import java.util.stream.IntStream; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; public class JapaneseDictionary implements Dictionary, DictionaryAccess { @@ -127,6 +133,45 @@ public void close() throws IOException { } } + /** + * Iterator of morphemes in the dictionary. + */ + private class EntryItr implements Iterator { + private final GrammarImpl grammar; + private final LexiconSet lexicon; + private Iterator wordIdItr; + + EntryItr() { + this.grammar = getGrammar(); + this.lexicon = getLexicon(); + this.wordIdItr = this.lexicon.wordIds(); + } + + @Override + public boolean hasNext() { + return wordIdItr.hasNext(); + } + + @Override + public Morpheme next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + return new SingleMorphemeImpl(this.grammar, this.lexicon, wordIdItr.next()); + } + } + + @Override + public Stream entries() { + Iterator iterator = new EntryItr(); + int size = getLexicon().size(); + int characteristics = Spliterator.DISTINCT | Spliterator.IMMUTABLE | Spliterator.NONNULL | Spliterator.SIZED; + boolean parallel = true; + + Spliterator spliterator = Spliterators.spliterator(iterator, size, characteristics); + return StreamSupport.stream(spliterator, parallel); + } + @Override public List lookup(CharSequence surface) { UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar); 
@@ -204,10 +249,12 @@ static String readAll(InputStream input) throws IOException { } } + @Override public GrammarImpl getGrammar() { return grammar; } + @Override public LexiconSet getLexicon() { return lexicon; } diff --git a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java index 37292e2f..e03317b1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java +++ b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -123,9 +123,10 @@ public interface Morpheme { * The IDs change when the dictionaries are updated or the combination of * dictionaries changes. * - * If the morpheme is OOV, it returns an undefined value. + * If the morpheme is OOV, it returns an id consisting of the OOV flag and the POS id. * * @return the word ID + * @see WordId */ public int getWordId(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index a42828f0..56ceea92 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -93,12 +93,13 @@ public enum WordRefMode { grammar.setCharacterCategory(CharacterCategory.loadDefault()); textNormalizer = new TextNormalizer(grammar); - // in order to output dictionary entries in in-dictionary order we need to sort - // them. iterator over them will get them not in the sorted order, but grouped - // by index-form (and sorted in groups). + // In order to output dictionary entries in in-dictionary order we need to sort + // them. Iterating over them yields them not in sorted order, but grouped + // by index-form.
Here we assume DoubleArrayLexicon and use WordIdTable.wordIds + // for the performance. DoubleArrayLexicon targetLex = dic.getLexicon(); Ints allIds = new Ints(targetLex.size()); - Iterator ids = targetLex.wordIds(0); + Iterator ids = targetLex.getWordIdTable().wordIds(); while (ids.hasNext()) { allIds.appendAll(ids.next()); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index b59011bc..844a6b98 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ import java.nio.ByteBuffer; import java.nio.IntBuffer; import java.util.Iterator; +import java.util.NoSuchElementException; import com.worksap.nlp.dartsclone.DoubleArray; import com.worksap.nlp.sudachi.MorphemeList; @@ -103,7 +104,7 @@ public long parameters(int wordId) { private class Itr implements Iterator { private final Iterator iterator; - private Integer[] wordIds; + private int[] wordIds; private int length; private int index; @@ -148,8 +149,39 @@ public int size() { return description.getNumTotalEntries(); } - public Iterator wordIds(int dic) { - return wordIdTable.wordIds(); + public Iterator wordIds() { + return new WordIdItr(); + } + + private class WordIdItr implements Iterator { + private final Iterator iterator; + private Ints ints; + private int index; + + WordIdItr() { + this.iterator = getWordIdTable().wordIds(); + index = 0; + } + + @Override + public boolean hasNext() { + while (ints == null || index >= ints.length()) { + if (!iterator.hasNext()) { + return false; + } + ints = iterator.next(); + index = 
0; + } + return true; + } + + @Override + public Integer next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + return ints.get(index++); + } } /** diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java index a4a74fb7..0b3ebfcd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,15 @@ */ public interface Lexicon { + /** + * Lookup entries that match the text starting from the offset. + * + * @param text + * input byte text. should be normalized + * @param offset + * input offset to start lookup from + * @return iterator of (wordid, length) pair + */ Iterator lookup(byte[] text, int offset); /** @@ -73,8 +82,9 @@ public interface Lexicon { WordInfoList wordInfos(int dic); /** - * Iterates over all word ids in the specified dictionary. Returned word ids are - * not sorted. + * Iterates over all word ids in the dictionary. + * + * Returned word ids are not sorted. */ - Iterator wordIds(int dic); + Iterator wordIds(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java index db83275d..033abc4d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Works Applications Co., Ltd. + * Copyright (c) 2017-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -171,7 +171,38 @@ public WordInfoList wordInfos(int dic) { } @Override - public Iterator wordIds(int dic) { - return lexicons.get(dic).wordIds(dic); + public Iterator wordIds() { + return new WordIdItr(); + } + + private class WordIdItr implements Iterator { + private int dictId; + private Iterator iterator; + + WordIdItr() { + this.dictId = 0; + this.iterator = lexicons.get(dictId).wordIds(); + } + + @Override + public boolean hasNext() { + while (!iterator.hasNext()) { + int nextDictId = dictId + 1; + if (nextDictId >= lexicons.size()) { + return false; + } + dictId = nextDictId; + iterator = lexicons.get(nextDictId).wordIds(); + } + return true; + } + + @Override + public Integer next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + return iterator.next(); + } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java index b114afc3..7c812d4e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,13 @@ import java.util.NoSuchElementException; import java.util.Iterator; -class WordIdTable { +/** + * Table which contains the lists of (internal) word ids that have the same index + * form. + * + * Automatically fills the dict part of each word id using the dicId set.
+ */ +public class WordIdTable { private final ByteBuffer bytes; private int dicIdMask = 0; @@ -31,19 +37,13 @@ class WordIdTable { this.bytes = bytes; } - Integer[] get(int index) { + int[] get(int index) { ByteBuffer dup = bytes.duplicate(); dup.position(index); BufReader reader = new BufReader(dup); int length = reader.readVarint32(); - Integer[] result = new Integer[length]; - int mask = dicIdMask; - int sum = 0; - for (int i = 0; i < length; i++) { - int v = reader.readVarint32(); - result[i] = WordId.applyMask(v + sum, mask); - sum += v; - } + int[] result = new int[length]; + readDeltaCompressed(result, length, this.dicIdMask, reader); return result; } @@ -75,8 +75,8 @@ private static void readDeltaCompressed(int[] result, int count, int mask, BufRe } } - void setDictionaryId(int id) { - dicIdMask = WordId.dicIdMask(id); + void setDictionaryId(int dictId) { + dicIdMask = WordId.dicIdMask(dictId); } /** diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 1ca5a702..17ee7e98 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -19,7 +19,6 @@ import com.worksap.nlp.sudachi.dictionary.Block; import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon; import com.worksap.nlp.sudachi.dictionary.Ints; -import com.worksap.nlp.sudachi.dictionary.Lexicon; import com.worksap.nlp.sudachi.dictionary.WordInfoList; import java.io.IOException; @@ -63,17 +62,19 @@ public class RawLexicon { * used to resolve wordref. * * @param lexicon + * lexicon of a system dictionary. * @return number of entries read. 
*/ - public int preloadFrom(Lexicon lexicon, Progress progress) { + public int preloadFrom(DoubleArrayLexicon lexicon, Progress progress) { this.isUser = true; Ints allIds = new Ints(lexicon.size()); - Iterator ids = lexicon.wordIds(0); + Iterator ids = lexicon.getWordIdTable().wordIds(); while (ids.hasNext()) { allIds.appendAll(ids.next()); } allIds.sort(); + for (int i = 0; i < allIds.length(); i++) { preloadedEntries.add(new CompiledWordEntry(lexicon, allIds.get(i))); progress.progress(i, allIds.length()); diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt index 2dee706d..93327de1 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt @@ -103,6 +103,29 @@ class JapaneseDictionaryTest { assertFailsWith(IllegalStateException::class) { tok.tokenize("a") } } + @Test + fun entries() { + // contains all morphemes, where all of them have different wordId + assertEquals(41, dict.entries().map { m -> m.getWordId() }.distinct().count()) + // includes entry with -1 conjunction cost + assertEquals(1, dict.entries().filter { m -> m.dictionaryForm() == "隠し" }.count()) + // excludes phantom entry + assertEquals(0, dict.entries().filter { m -> m.surface() == "なな" }.count()) + // use grammar + assertEquals(6, dict.entries().filter { m -> m.partOfSpeech().get(1) == "固有名詞" }.count()) + // use lexicon + assertEquals(4, dict.entries().filter { m -> m.readingForm().contains("キョウ") }.count()) + } + + @Test + fun entriesWithUser() { + val udict = TestDictionary.user1() + assertEquals(41 + 4, udict.entries().map { m -> m.getWordId() }.distinct().count()) + assertEquals(6 + 1, udict.entries().filter { m -> m.partOfSpeech().get(1) == "固有名詞" }.count()) + assertEquals(4 + 1, udict.entries().filter { m -> m.readingForm().contains("キョウ") }.count()) + udict.close() + } + @Test fun lookupEntries() { // nothing @@ 
-119,6 +142,10 @@ class JapaneseDictionaryTest { assertEquals(1, sudachi.size) assertEquals("徳島県産", sudachi[0].getUserData()) + // cannot find entry with -1 conjunction cost + val hidden = dict.lookup("隠し") + assertTrue(hidden.isEmpty()) + // will be normalized val norm = dict.lookup("特A") assertEquals(1, norm.size) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index 18c8eab8..0e944562 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -209,8 +209,8 @@ class DictionaryPrinterTest { val lexO = original.getLexicon() val lexR = rebuilt.getLexicon() - val wiIterO = lexO.wordIds(0) - val wiIterR = lexR.wordIds(0) + val wiIterO = lexO.getWordIdTable().wordIds() + val wiIterR = lexR.getWordIdTable().wordIds() while (wiIterO.hasNext()) { assertTrue(wiIterR.hasNext()) @@ -258,8 +258,8 @@ class DictionaryPrinterTest { val lexO = original.getLexicon() val lexR = rebuilt.getLexicon() - val wiIterO = lexO.wordIds(0) - val wiIterR = lexR.wordIds(0) + val wiIterO = lexO.getWordIdTable().wordIds() + val wiIterR = lexR.getWordIdTable().wordIds() while (wiIterO.hasNext()) { assertTrue(wiIterR.hasNext()) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt index 0109a2a1..98318631 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt @@ -33,7 +33,7 @@ class DoubleArrayLexiconTest { lexicon = DoubleArrayLexicon.load(bytes, desc) val wids = Ints(lexicon.size()) - for (ints: Ints in lexicon.wordIds(0)) { + for (ints: Ints in lexicon.getWordIdTable().wordIds()) { wids.appendAll(ints) } wids.sort() @@ -110,6 +110,12 @@ class 
DoubleArrayLexiconTest { assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getHeadword())) assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getHeadword())) + // な。な (phantom normalized form) + wi = lexicon.getWordInfo(getWordId(39)) + assertEquals("な。な", lexicon.string(0, wi.getHeadword())) + assertEquals("ナナ", lexicon.string(0, wi.getReadingForm())) + assertEquals("なな", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getHeadword())) + // 東京都 wi = lexicon.getWordInfo(getWordId(6)) assertEquals("東京都", lexicon.string(0, wi.getHeadword())) diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index 75ebf5c0..c9a85645 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -38,5 +38,5 @@ IndexForm,LeftId,RightId,Cost,Headword,POS1,POS2,POS3,POS4,POS5,POS6,Reading_For 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,,,,,,, 特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,,,,,,, 隠し,-1,-1,0,,名詞,普通名詞,一般,*,*,*,カクシ,,,,,,, -な。な,8,8,2914,,名詞,普通名詞,一般,*,*,*,ナナ,,,"アイウ,名詞,普通名詞,一般,*,*,*,アイウ","アイウ,名詞,普通名詞,一般,*,*,*,アイウ",,, 
+な。な,8,8,2914,,名詞,普通名詞,一般,*,*,*,ナナ,なな,,"アイウ,名詞,普通名詞,一般,*,*,*,アイウ","アイウ,名詞,普通名詞,一般,*,*,*,アイウ",,, 東東京都,6,8,6320,,名詞,固有名詞,地名,一般,*,*,ヒガシヒガシキョウト,,,,,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,名詞,固有名詞,地名,一般,*,*,キョウト",,