From 8abf97e89dc6fd42cc47969d425f55d5489c0949 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Thu, 28 Nov 2024 09:38:52 +0900
Subject: [PATCH 1/5] deprecate old tokenizeSentences and replace it by
 lazyTokenizeSentences

---
 .../nlp/sudachi/JapaneseTokenizer.java        | 27 +-----
 .../com/worksap/nlp/sudachi/Tokenizer.java    | 35 ++++----
 .../sudachi/JapaneseTokenizerStreamingTest.kt | 82 ++++++++-----------
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 70 ++++++++--------
 4 files changed, 86 insertions(+), 128 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 09cc3f76..76e46efe 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -98,32 +98,13 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
     }
 
     @Override
-    public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
-        IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
-        CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
-        SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
-
-        while (wrappedReader.read(buffer) > 0) {
-            buffer.flip();
-            int length = analysis.tokenizeBuffer(buffer);
-            if (length < 0) {
-                buffer.position(analysis.bosPosition());
-                buffer.compact();
-            }
-        }
-        buffer.flip();
-        ArrayList<MorphemeList> sentences = analysis.result;
-
-        if (buffer.hasRemaining()) {
-            sentences.add(tokenizeSentence(mode, buildInputText(buffer)));
-        }
-
-        return sentences;
+    public Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input) {
+        return new SentenceSplittingLazyAnalysis(mode, this, input);
     }
 
     @Override
-    public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
-        return new SentenceSplittingLazyAnalysis(mode, this, readable);
+    public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input) {
+        return tokenizeSentences(mode, input);
     }
 
     @Override
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 89f6adef..e9f128e6 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -79,36 +79,26 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
 
     /**
      * Read an input text from {@code input}, divide it into sentences and tokenize
-     * them. It reads all text in the input and uses a lot of memory when the text
-     * is long.
+     * them. It reads the input lazily.
      *
      * @param mode
      *            a mode of splitting
      * @param input
-     *            a reader of input text
-     * @return a result of tokenizing
-     * @throws IOException
-     *             if reading a stream is failed
-     * @deprecated use {@link #lazyTokenizeSentences(SplitMode, Readable)} instead.
+     *            a readable input text
+     * @return an iterator of tokenized sentences
      */
-    @Deprecated
-    Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
+    Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input);
 
     /**
-     * Reads an input text from {@code input}, divide it into sentences and
-     * tokenizes them with {@link SplitMode}.C. It reads all text in the input and
-     * uses a lot of memory when the text is long.
+     * Read an input text from {@code input}, divide it into sentences and tokenize
+     * them with {@link SplitMode}.C. It reads the input lazily.
      *
      * @param input
-     *            a reader of input text
-     * @return a result of tokenizing
-     * @throws IOException
-     *             if reading a stream is failed
-     * @see #tokenizeSentences(SplitMode,Reader)
-     * @deprecated use {@link #lazyTokenizeSentences(Readable)} instead.
+     *            a readable input text
+     * @return an iterator of tokenized sentences
+     * @see #tokenizeSentences(SplitMode,Readable)
      */
-    @Deprecated
-    default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
+    default Iterator<List<Morpheme>> tokenizeSentences(Readable input) {
         return tokenizeSentences(SplitMode.C, input);
     }
 
@@ -121,7 +111,10 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExceptio
      * @param input
      *            a readable input text
      * @return a result of tokenizing
+     * @deprecated renamed to {@link #tokenizeSentences(SplitMode, Readable)}
+     *
      */
+    @Deprecated
     Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);
 
     /**
@@ -132,7 +125,9 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExceptio
      *            a readable input text
      * @return a result of tokenizing
      * @see #lazyTokenizeSentences(SplitMode,Readable)
+     * @deprecated renamed to {@link #tokenizeSentences(Readable)}
      */
+    @Deprecated
     default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
         return lazyTokenizeSentences(SplitMode.C, input);
     }
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index 85aebbae..c77467bd 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -26,6 +26,35 @@ import kotlin.test.assertFailsWith
 class JapaneseTokenizerStreamingTest {
   private val tokenizer = TestDictionary.user0().tokenizer()
 
+  @Test
+  fun streamingReadable() {
+    val reader = StringReader("あ".repeat(5000))
+    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(5000, totalLength)
+  }
+
+  @Test
+  fun callingNextWithoutTextFails() {
+    val reader = StringReader("東京")
+    val it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
+
+    val morphemes = it.next()
+    assertEquals("東京", morphemes.get(0).surface())
+
+    assertFailsWith<NoSuchElementException>(
+        block = { it.next() },
+    )
+  }
+
+  @Test
+  fun streamingLongTextShouldNotCauseOOM() {
+    val reader = StringReader("あ".repeat(10 * 1024 * 1024))
+    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(10 * 1024 * 1024, totalLength)
+  }
+
   class BadReader(private val data: String, private val window: Int = 512) : Reader() {
     private var position: Int = 0
 
@@ -50,61 +79,14 @@ class JapaneseTokenizerStreamingTest {
     override fun close() {}
   }
 
-  @Test
-  fun streamingTest() {
-    // Testing deprecated method `tokenizeSentences(Reader)`
-    val reader = StringReader("あ".repeat(5000))
-    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(5000, totalLength)
-  }
-
-  @Test
-  fun streamingTestWithBadReader() {
-    // Testing deprecated method `tokenizeSentences(Reader)`
-    val reader = BadReader("あ".repeat(5000))
-    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(5000, totalLength)
-  }
-
-  @Test
-  fun streamingReadable() {
-    val reader = StringReader("あ".repeat(5000))
-    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(5000, totalLength)
-  }
-
-  @Test
-  fun callingNextWithoutTextFails() {
-    val reader = StringReader("東京")
-    val it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
-
-    val morphemes = it.next()
-    assertEquals("東京", morphemes.get(0).surface())
-
-    assertFailsWith<NoSuchElementException>(
-        block = { it.next() },
-    )
-  }
-
   @Test
   fun streamingBlockingReadable() {
     val reader = BadReader("あ".repeat(5000))
-    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
    assertEquals(5000, totalLength)
   }
 
-  @Test
-  fun streamingLongTextShouldNotCauseOOM() {
-    val reader = StringReader("あ".repeat(10 * 1024 * 1024))
-    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(10 * 1024 * 1024, totalLength)
-  }
-
   class FailReader(private val data: String) : Reader() {
     private var position: Int = 0
 
@@ -133,13 +115,13 @@ class JapaneseTokenizerStreamingTest {
   fun failsWhenReaderFails() {
     var reader = FailReader("あ".repeat(500))
     // should not fail on the instantiation
-    var it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
+    var it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
         block = { it.hasNext() },
    )
 
     reader = FailReader("あ".repeat(500))
-    it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
+    it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
         block = { it.next() },
     )
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 50494e85..e84d4c2b 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -185,15 +185,10 @@ public void tokenizeSentencesWithSurrogatePair() {
         assertThat(it.hasNext(), is(false));
     }
 
-    /**
-     * @deprecated testing deprecated method
-     *             {@link #Tokenizer.tokenizeSentences(Reader)}.
-     */
-    @Deprecated
     @Test
     public void tokenizerWithReader() throws IOException {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader).iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
@@ -205,11 +200,6 @@ public void tokenizerWithReader() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
-    /**
-     * @deprecated testing deprecated method
-     *             {@link #Tokenizer.tokenizeSentences(Reader)}.
-     */
-    @Deprecated
     @Test
     public void tokenizerWithLongReader() throws IOException {
         StringBuilder sb = new StringBuilder();
@@ -218,7 +208,7 @@ public void tokenizerWithLongReader() throws IOException {
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader).iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
@@ -228,11 +218,6 @@ public void tokenizerWithLongReader() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
-    /**
-     * @deprecated testing deprecated method
-     *             {@link #Tokenizer.tokenizeSentences(Reader)}.
-     */
-    @Deprecated
     @Test
     public void tokenizerWithReaderAndNormalization() throws IOException {
         StringBuilder sb = new StringBuilder();
@@ -241,7 +226,7 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader).iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
@@ -254,6 +239,28 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
+    @Test
+    public void tokenizeSentencesWithSurrogatePairAtBufferLimit() {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
+            sb.append("。");
+        }
+        sb.append("😀");
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
+
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
+    /**
+     * @deprecated testing deprecated method
+     *             {@link #Tokenizer.lazyTokenizeSentences(Readable)}.
+     */
+    @Deprecated
     @Test
     public void lazyTokenizeSentences() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
@@ -275,6 +282,11 @@ public void lazyTokenizeSentences() {
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link #Tokenizer.lazyTokenizeSentences(Readable)}.
+     */
+    @Deprecated
     @Test
     public void lazyTokenizeSentencesWithLongText() {
         StringBuilder sb = new StringBuilder();
@@ -293,6 +305,11 @@ public void lazyTokenizeSentencesWithLongText() {
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link #Tokenizer.lazyTokenizeSentences(Readable)}.
+     */
+    @Deprecated
     @Test
     public void lazyTokenizeSentencesWithNormalization() {
         StringBuilder sb = new StringBuilder();
@@ -314,23 +331,6 @@ public void lazyTokenizeSentencesWithNormalization() {
         assertThat(it.hasNext(), is(false));
     }
 
-    @Test
-    public void lazyTokenizeSentencesWithSurrogatePair() {
-        StringBuilder sb = new StringBuilder();
-        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
-            sb.append("。");
-        }
-        sb.append("😀");
-        StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
-
-        assertThat(it.hasNext(), is(true));
-        assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
-        assertThat(it.hasNext(), is(true));
-        assertThat(it.next().size(), is(1));
-        assertThat(it.hasNext(), is(false));
-    }
-
     @Test
     public void zeroLengthMorpheme() {
         List<Morpheme> s = tokenizer.tokenize("…");
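[Editor's note on PATCH 1] The streaming entry point is now tokenizeSentences(SplitMode, Readable), and lazyTokenizeSentences survives only as a deprecated alias. A minimal migration sketch under assumed setup — the sudachi.json path, input file, and Config/DictionaryFactory wiring below are illustrative assumptions, not part of this patch:

    // Hypothetical setup; adjust config and dictionary paths to your environment.
    Dictionary dict = new DictionaryFactory().create(Config.fromFile(Paths.get("sudachi.json")));
    Tokenizer tokenizer = dict.tokenizer();
    try (Reader reader = Files.newBufferedReader(Paths.get("input.txt"))) {
        // before: tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
        Iterator<List<Morpheme>> sentences = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader);
        while (sentences.hasNext()) {
            for (Morpheme m : sentences.next()) {
                System.out.println(m.surface());
            }
        }
    }

Unlike the removed Reader-based overload, this no longer throws a checked IOException at the call site; as the streaming tests above suggest, read failures surface from hasNext()/next() instead.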
+ */ + @Deprecated @Test public void lazyTokenizeSentencesWithNormalization() { StringBuilder sb = new StringBuilder(); @@ -314,23 +331,6 @@ public void lazyTokenizeSentencesWithNormalization() { assertThat(it.hasNext(), is(false)); } - @Test - public void lazyTokenizeSentencesWithSurrogatePair() { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) { - sb.append("。"); - } - sb.append("😀"); - StringReader reader = new StringReader(sb.toString()); - Iterator> it = tokenizer.lazyTokenizeSentences(reader); - - assertThat(it.hasNext(), is(true)); - assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1)); - assertThat(it.hasNext(), is(true)); - assertThat(it.next().size(), is(1)); - assertThat(it.hasNext(), is(false)); - } - @Test public void zeroLengthMorpheme() { List s = tokenizer.tokenize("…"); From 4a9822e7f59fb43576f7f5c534bbe82ef72852fd Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 28 Nov 2024 11:05:30 +0900 Subject: [PATCH 2/5] use more specific return type --- .../nlp/sudachi/JapaneseTokenizer.java | 4 +-- .../com/worksap/nlp/sudachi/MorphemeList.java | 6 ++-- .../worksap/nlp/sudachi/MorphemeListItem.java | 2 +- .../SentenceSplittingLazyAnalysis.java | 3 +- .../com/worksap/nlp/sudachi/Tokenizer.java | 8 ++--- .../nlp/sudachi/JapaneseTokenizerTest.java | 30 +++++++++---------- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java index 76e46efe..8a481aad 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java @@ -98,12 +98,12 @@ public Iterable tokenizeSentences(SplitMode mode, String text) { } @Override - public Iterator> tokenizeSentences(SplitMode mode, Readable input) { + public Iterator tokenizeSentences(SplitMode mode, Readable input) { return new SentenceSplittingLazyAnalysis(mode, this, input); } @Override - public Iterator> lazyTokenizeSentences(SplitMode mode, Readable input) { + public Iterator lazyTokenizeSentences(SplitMode mode, Readable input) { return tokenizeSentences(mode, input); } diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java index 21571e9b..a05eb01f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -48,7 +48,7 @@ public class MorphemeList extends AbstractList { } @Override - public Morpheme get(int index) { + public MorphemeListItem get(int index) { return new MorphemeListItem(this, index); } @@ -91,7 +91,7 @@ WordInfo getWordInfo(int index) { return path.get(index).getWordInfo(); } - List split(Tokenizer.SplitMode mode, int index) { + MorphemeList split(Tokenizer.SplitMode mode, int index) { List nodes = new ArrayList<>(); LatticeNodeImpl node = path.get(index); node.appendSplitsTo(nodes, mode); diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java index e58d8fc6..125fd853 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java @@ -77,7 +77,7 @@ public String surface() { } @Override - public List split(Tokenizer.SplitMode mode) { + public MorphemeList split(Tokenizer.SplitMode mode) { return list.split(mode, index); } diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java index 44ebfdce..8e722b42 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java +++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java @@ -29,8 +29,7 @@ /** * Provides lazy sentence split and analysis. */ -/* internal */ class SentenceSplittingLazyAnalysis - implements SentenceDetector.NonBreakCheker, Iterator> { +/* internal */ class SentenceSplittingLazyAnalysis implements SentenceDetector.NonBreakCheker, Iterator { private final SentenceDetector detector = new SentenceDetector(); private final Tokenizer.SplitMode mode; diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java index e9f128e6..06134840 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java @@ -87,7 +87,7 @@ default Iterable tokenizeSentences(String text) { * a readable input text * @return an iterator of tokenized sentences */ - Iterator> tokenizeSentences(SplitMode mode, Readable input); + Iterator tokenizeSentences(SplitMode mode, Readable input); /** * Read an input text from {@code input}, divide it into sentences and tokenize @@ -98,7 +98,7 @@ default Iterable tokenizeSentences(String text) { * @return an iterator of tokenized sentences * @see #tokenizeSentences(SplitMode,Readable) */ - default Iterator> tokenizeSentences(Readable input) { + default Iterator tokenizeSentences(Readable input) { return tokenizeSentences(SplitMode.C, input); } @@ -115,7 +115,7 @@ default Iterator> tokenizeSentences(Readable input) { * */ @Deprecated - Iterator> lazyTokenizeSentences(SplitMode mode, Readable input); + Iterator lazyTokenizeSentences(SplitMode mode, Readable input); /** * Read an input text from {@code input}, divide it into sentences and tokenize @@ -128,7 +128,7 @@ default Iterator> tokenizeSentences(Readable input) { * @deprecated renamed to {@link #tokenizeSentences(Readable)} */ @Deprecated - default Iterator> lazyTokenizeSentences(Readable input) { + default Iterator lazyTokenizeSentences(Readable input) { return lazyTokenizeSentences(SplitMode.C, input); } diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java index e84d4c2b..8581be1c 100644 --- 
a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java @@ -77,7 +77,7 @@ public void tokenizeSmallKatakanaOnly() { @Test public void partOfSpeech() { - List ms = tokenizer.tokenize("京都"); + MorphemeList ms = tokenizer.tokenize("京都"); assertThat(ms.size(), is(1)); Morpheme m = ms.get(0); short pid = m.partOfSpeechId(); @@ -88,7 +88,7 @@ public void partOfSpeech() { @Test public void getWordId() { - List ms = tokenizer.tokenize("京都"); + MorphemeList ms = tokenizer.tokenize("京都"); assertThat(ms.size(), is(1)); int wid = ms.get(0).getWordId(); @@ -103,7 +103,7 @@ public void getWordId() { @Test public void getDictionaryId() { - List ms = tokenizer.tokenize("京都"); + MorphemeList ms = tokenizer.tokenize("京都"); assertThat(ms.size(), is(1)); assertThat(ms.get(0).getDictionaryId(), is(0)); @@ -118,7 +118,7 @@ public void getDictionaryId() { @Test public void getSynonymGroupIds() { - List ms = tokenizer.tokenize("京都"); + MorphemeList ms = tokenizer.tokenize("京都"); assertThat(ms.size(), is(1)); assertThat(ms.get(0).getSynonymGroupIds(), is(new int[] { 1, 5 })); @@ -188,7 +188,7 @@ public void tokenizeSentencesWithSurrogatePair() { @Test public void tokenizerWithReader() throws IOException { StringReader reader = new StringReader("京都。東京.東京都。京都"); - Iterator> it = tokenizer.tokenizeSentences(reader); + Iterator it = tokenizer.tokenizeSentences(reader); assertThat(it.hasNext(), is(true)); assertThat(it.next().size(), is(2)); assertThat(it.hasNext(), is(true)); @@ -208,7 +208,7 @@ public void tokenizerWithLongReader() throws IOException { } sb.append("京都"); StringReader reader = new StringReader(sb.toString()); - Iterator> it = tokenizer.tokenizeSentences(reader); + Iterator it = tokenizer.tokenizeSentences(reader); for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) { assertThat(it.hasNext(), is(true)); assertThat(it.next().size(), is(2)); @@ -226,12 +226,12 @@ public void tokenizerWithReaderAndNormalization() throws IOException { sb.append("京都。"); } StringReader reader = new StringReader(sb.toString()); - Iterator> it = tokenizer.tokenizeSentences(reader); + Iterator it = tokenizer.tokenizeSentences(reader); assertThat(it.hasNext(), is(true)); assertThat(it.next().size(), is(5)); for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) { assertThat(it.hasNext(), is(true)); - List ms = it.next(); + MorphemeList ms = it.next(); assertThat(ms.size(), is(2)); assertThat(ms.get(0).surface(), is("京都")); assertThat(ms.get(1).surface(), is("。")); @@ -247,7 +247,7 @@ public void tokenizeSentencesWithSurrogatePairAtBufferLimit() { } sb.append("😀"); StringReader reader = new StringReader(sb.toString()); - Iterator> it = tokenizer.tokenizeSentences(reader); + Iterator it = tokenizer.tokenizeSentences(reader); assertThat(it.hasNext(), is(true)); assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1)); @@ -264,7 +264,7 @@ public void tokenizeSentencesWithSurrogatePairAtBufferLimit() { @Test public void lazyTokenizeSentences() { StringReader reader = new StringReader("京都。東京.東京都。京都"); - Iterator> it = tokenizer.lazyTokenizeSentences(reader); + Iterator it = tokenizer.lazyTokenizeSentences(reader); assertThat(it.hasNext(), is(true)); assertThat(it.next().size(), is(2)); assertThat(it.hasNext(), is(true)); @@ -295,7 +295,7 @@ public void lazyTokenizeSentencesWithLongText() { } sb.append("京都"); StringReader reader = new StringReader(sb.toString()); - Iterator> it = tokenizer.lazyTokenizeSentences(reader); + 
Iterator it = tokenizer.lazyTokenizeSentences(reader); for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) { assertThat(it.hasNext(), is(true)); assertThat(it.next().size(), is(2)); @@ -318,12 +318,12 @@ public void lazyTokenizeSentencesWithNormalization() { sb.append("京都。"); } StringReader reader = new StringReader(sb.toString()); - Iterator> it = tokenizer.lazyTokenizeSentences(reader); + Iterator it = tokenizer.lazyTokenizeSentences(reader); assertThat(it.hasNext(), is(true)); assertThat(it.next().size(), is(5)); for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) { assertThat(it.hasNext(), is(true)); - List ms = it.next(); + MorphemeList ms = it.next(); assertThat(ms.size(), is(2)); assertThat(ms.get(0).surface(), is("京都")); assertThat(ms.get(1).surface(), is("。")); @@ -333,7 +333,7 @@ public void lazyTokenizeSentencesWithNormalization() { @Test public void zeroLengthMorpheme() { - List s = tokenizer.tokenize("…"); + MorphemeList s = tokenizer.tokenize("…"); assertThat(s.size(), is(3)); assertThat(s.get(0).surface(), is("…")); assertThat(s.get(0).normalizedForm(), is(".")); @@ -355,7 +355,7 @@ public void disableEmptyMorpheme() throws IOException { dict = new DictionaryFactory().create(Config.empty().withFallback(config).allowEmptyMorpheme(false)); tokenizer = (JapaneseTokenizer) dict.tokenizer(); - List s = tokenizer.tokenize("…"); + MorphemeList s = tokenizer.tokenize("…"); assertThat(s.size(), is(3)); assertThat(s.get(0).surface(), is("…")); assertThat(s.get(0).normalizedForm(), is(".")); From 99617fa6084da4db680af6a1388114c8d99bd490 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 28 Nov 2024 12:08:06 +0900 Subject: [PATCH 3/5] remove SentenceSplittingAnalysis --- .../nlp/sudachi/JapaneseTokenizer.java | 20 +--- .../sudachi/SentenceSplittingAnalysis.java | 93 ------------------- 2 files changed, 5 insertions(+), 108 deletions(-) delete mode 100644 src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java index 8a481aad..77d356d6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.PrintStream; import java.io.Reader; +import java.io.StringReader; import java.io.StringWriter; import java.nio.CharBuffer; import java.util.ArrayList; @@ -79,21 +80,10 @@ public Iterable tokenizeSentences(SplitMode mode, String text) { return Collections.emptyList(); } - SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this); - int length = analysis.tokenizeBuffer(text); - ArrayList result = analysis.result; - int bos = analysis.bos; - if (length < 0) { - // treat remaining thing as a single sentence - int eos = analysis.input.getText().length(); - if (bos != eos) { - UTF8InputText slice = analysis.input; - if (bos != 0) { - slice = slice.slice(bos, eos); - } - result.add(tokenizeSentence(mode, slice)); - } - } + StringReader input = new StringReader(text); + SentenceSplittingLazyAnalysis analysis = new SentenceSplittingLazyAnalysis(mode, this, input); + List result = new ArrayList<>(); + analysis.forEachRemaining(result::add); return result; } diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java deleted file mode 100644 index 254e0e51..00000000 --- 
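[Editor's note on PATCH 2] With the narrowed return types, each sentence arrives as a MorphemeList, so list-level helpers are reachable without casting. A sketch reusing the tokenizer and reader assumed in the previous note:

    Iterator<MorphemeList> sentences = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader);
    while (sentences.hasNext()) {
        MorphemeList sentence = sentences.next();
        // Re-split the whole sentence into finer A-mode units without re-analysis.
        MorphemeList aUnits = sentence.split(Tokenizer.SplitMode.A);
        System.out.println(sentence.size() + " -> " + aUnits.size());
    }

Note that PATCH 4 below deprecates MorphemeList.split in favor of Tokenizer.split, so this call form is transitional.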
From 99617fa6084da4db680af6a1388114c8d99bd490 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Thu, 28 Nov 2024 12:08:06 +0900
Subject: [PATCH 3/5] remove SentenceSplittingAnalysis

---
 .../nlp/sudachi/JapaneseTokenizer.java        | 20 +---
 .../sudachi/SentenceSplittingAnalysis.java    | 93 -------------------
 2 files changed, 5 insertions(+), 108 deletions(-)
 delete mode 100644 src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 8a481aad..77d356d6 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -19,6 +19,7 @@
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.Reader;
+import java.io.StringReader;
 import java.io.StringWriter;
 import java.nio.CharBuffer;
 import java.util.ArrayList;
@@ -79,21 +80,10 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
             return Collections.emptyList();
         }
 
-        SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
-        int length = analysis.tokenizeBuffer(text);
-        ArrayList<MorphemeList> result = analysis.result;
-        int bos = analysis.bos;
-        if (length < 0) {
-            // treat remaining thing as a single sentence
-            int eos = analysis.input.getText().length();
-            if (bos != eos) {
-                UTF8InputText slice = analysis.input;
-                if (bos != 0) {
-                    slice = slice.slice(bos, eos);
-                }
-                result.add(tokenizeSentence(mode, slice));
-            }
-        }
+        StringReader input = new StringReader(text);
+        SentenceSplittingLazyAnalysis analysis = new SentenceSplittingLazyAnalysis(mode, this, input);
+        List<MorphemeList> result = new ArrayList<>();
+        analysis.forEachRemaining(result::add);
 
         return result;
     }
diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java
deleted file mode 100644
index 254e0e51..00000000
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2023 Works Applications Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *        http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.worksap.nlp.sudachi;
-
-import com.worksap.nlp.sudachi.dictionary.LexiconSet;
-import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-
-/*internal*/ class SentenceSplittingAnalysis implements SentenceDetector.NonBreakCheker {
-    private final SentenceDetector detector = new SentenceDetector();
-
-    private final Tokenizer.SplitMode mode;
-    private final JapaneseTokenizer tokenizer;
-    final ArrayList<MorphemeList> result = new ArrayList<>();
-
-    SentenceSplittingAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer) {
-        this.mode = mode;
-        this.tokenizer = tokenizer;
-    }
-
-    UTF8InputText input;
-    int bos;
-
-    int tokenizeBuffer(CharSequence buffer) {
-        UTF8InputText input = tokenizer.buildInputText(buffer);
-        String normalized = input.getText();
-        this.input = input;
-
-        int bos = 0;
-        int length;
-
-        this.bos = bos;
-        while ((length = detector.getEos(normalized, this)) > 0) {
-            int eos = bos + length;
-            if (eos < normalized.length()) {
-                eos = input.getNextInOriginal(eos - 1);
-                length = eos - bos;
-            }
-            UTF8InputText sentence = input.slice(bos, eos);
-            result.add(tokenizer.tokenizeSentence(mode, sentence));
-            normalized = normalized.substring(length);
-            bos = eos;
-            this.bos = bos;
-        }
-
-        // buffer is full, need to clean it up
-        if (length < 0 && buffer.length() == -length) {
-            result.add(tokenizer.tokenizeSentence(mode, input));
-            return -length;
-        }
-
-        return length;
-    }
-
-    int bosPosition() {
-        return input.textIndexToOriginalTextIndex(bos);
-    }
-
-    @Override
-    public boolean hasNonBreakWord(int length) {
-        UTF8InputText inp = input;
-        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
-        byte[] bytes = inp.getByteText();
-        LexiconSet lexicon = tokenizer.lexicon;
-        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
-            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
-            while (iterator.hasNext()) {
-                int[] r = iterator.next();
-                int l = r[1];
-                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-}
From a204c42a9bf0fe132f7aab6769cd79ac45200ede Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Thu, 28 Nov 2024 17:42:03 +0900
Subject: [PATCH 4/5] Add Tokenizer.split deprecating MorphemeList.split

---
 .../nlp/sudachi/JapaneseTokenizer.java        | 19 ++++++
 .../worksap/nlp/sudachi/LatticeNodeImpl.java  | 10 +----
 .../com/worksap/nlp/sudachi/MorphemeList.java |  5 ++-
 .../nlp/sudachi/SingleMorphemeImpl.java       | 12 +-----
 .../com/worksap/nlp/sudachi/Tokenizer.java    | 12 ++++++
 .../nlp/sudachi/dictionary/WordInfo.java      | 22 +++++++++-
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 43 ++++++++++++++++---
 7 files changed, 93 insertions(+), 30 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 77d356d6..7fa4e4ad 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -97,6 +97,25 @@ public Iterator<MorphemeList> lazyTokenizeSentences(SplitMode mode, Readable inp
         return tokenizeSentences(mode, input);
     }
 
+    @Override
+    public List<Morpheme> split(List<Morpheme> morphemes, SplitMode mode) {
+        if (morphemes instanceof MorphemeList) {
+            return ((MorphemeList) morphemes).split(mode);
+        }
+
+        List<Morpheme> result = new ArrayList<>();
+        for (Morpheme m : morphemes) {
+            if (m instanceof SingleMorphemeImpl) {
+                ((SingleMorphemeImpl) m).appendSplitsTo(result, mode);
+            } else {
+                for (Morpheme subsplit : m.split(mode)) {
+                    result.add(subsplit);
+                }
+            }
+        }
+        return result;
+    }
+
     @Override
     public void setDumpOutput(PrintStream output) {
         dumpOutput = output;
diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
index 385f214e..907bcfb5 100644
--- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
+++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
@@ -215,15 +215,7 @@ public StringsCache getStrings() {
     }
 
     /* internal */ void appendSplitsTo(List<LatticeNodeImpl> result, Tokenizer.SplitMode mode) {
-        if (mode == Tokenizer.SplitMode.A) {
-            appendSplitsTo(result, getWordInfo().getAunitSplit());
-        } else if (mode == Tokenizer.SplitMode.B) {
-            appendSplitsTo(result, getWordInfo().getBunitSplit());
-        } else if (mode == Tokenizer.SplitMode.C) {
-            appendSplitsTo(result, getWordInfo().getCunitSplit());
-        } else {
-            result.add(this);
-        }
+        appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
     }
 
     private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
index a05eb01f..df3fdc4d 100644
--- a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
+++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
@@ -106,18 +106,19 @@ MorphemeList split(Tokenizer.SplitMode mode, int index) {
      * @param mode
      *            requested split mode
      * @return current list or a new list in the requested split mode.
+     *
+     * @deprecated will be internal only. use {@link Tokenizer#split} instead.
      */
+    @Deprecated
     public MorphemeList split(Tokenizer.SplitMode mode) {
         if (mode.compareTo(this.mode) >= 0) {
             return this;
         }
 
         List<LatticeNodeImpl> nodes = new ArrayList<>();
-
         for (LatticeNodeImpl node : path) {
             node.appendSplitsTo(nodes, mode);
         }
-
         return new MorphemeList(inputText, grammar, lexicon, nodes, allowEmptyMorpheme, mode);
     }
 
diff --git a/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java b/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
index 55cdc87e..a8f2d482 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
@@ -118,16 +118,8 @@ public List<Morpheme> split(Tokenizer.SplitMode mode) {
      *
      * @see LatticeNodeImpl.appendSplitsTo
      */
-    private void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
-        if (mode == Tokenizer.SplitMode.A) {
-            appendSplitsTo(result, getWordInfo().getAunitSplit());
-        } else if (mode == Tokenizer.SplitMode.B) {
-            appendSplitsTo(result, getWordInfo().getBunitSplit());
-        } else if (mode == Tokenizer.SplitMode.C) {
-            appendSplitsTo(result, getWordInfo().getCunitSplit());
-        } else {
-            result.add(this);
-        }
+    /* internal */ void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
+        appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
     }
 
     private void appendSplitsTo(List<Morpheme> result, int[] splitIds) {
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 06134840..20bf64cd 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -132,6 +132,18 @@ default Iterator<MorphemeList> lazyTokenizeSentences(Readable input) {
         return lazyTokenizeSentences(SplitMode.C, input);
     }
 
+    /**
+     * Produce a copy of the given list in a finer split mode. May return the given
+     * list if the mode is coarser than its current one. The given list is not modified.
+     *
+     * @param morphemes
+     *            list of morphemes to split.
+     * @param mode
+     *            requested split mode
+     * @return the given list, or a new list in the requested split mode.
+     */
+    List<Morpheme> split(List<Morpheme> morphemes, SplitMode mode);
+
     /**
     * Prints lattice structure of the analysis into the passed {@link PrintStream}.
     *
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java
index 8da239d6..9ccd2acb 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java
@@ -16,10 +16,11 @@
 
 package com.worksap.nlp.sudachi.dictionary;
 
-import com.worksap.nlp.sudachi.StringUtil;
-
 import java.nio.ByteBuffer;
 
+import com.worksap.nlp.sudachi.StringUtil;
+import com.worksap.nlp.sudachi.Tokenizer;
+
 /**
  * Internal morpheme information. This class does not contain any strings.
  *
@@ -186,6 +187,23 @@ public int[] getCunitSplit() {
         return cUnitSplit;
     }
 
+    /**
+     * Returns the array of word IDs which the morpheme is compounded of in the
+     * given mode.
+     *
+     * @return the word IDs of the given units
+     */
+    public int[] getUnitSplit(Tokenizer.SplitMode mode) {
+        if (mode == Tokenizer.SplitMode.A) {
+            return getAunitSplit();
+        }
+        if (mode == Tokenizer.SplitMode.B) {
+            return getBunitSplit();
+        }
+        assert (mode == Tokenizer.SplitMode.C);
+        return getCunitSplit();
+    }
+
     /**
      * Returns the array of the morphemes which the morpheme is compounded of.
     *
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 8581be1c..d6df9d21 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -27,6 +27,7 @@
 import java.io.StringReader;
 import java.util.Iterator;
 import java.util.List;
+import java.util.ArrayList;
 
 import javax.json.Json;
 import javax.json.JsonArray;
@@ -383,7 +384,7 @@ public void splitC() {
     public void splitAfterTokenizeCtoA() {
         MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemesC.size(), is(1));
-        MorphemeList morphemesA = morphemesC.split(Tokenizer.SplitMode.A);
+        List<Morpheme> morphemesA = tokenizer.split(morphemesC, Tokenizer.SplitMode.A);
         assertThat(morphemesA.size(), is(2));
     }
 
@@ -391,7 +392,7 @@ public void splitAfterTokenizeCtoA() {
     public void splitAfterTokenizeCtoB() {
         MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemesC.size(), is(1));
-        MorphemeList morphemesB = morphemesC.split(Tokenizer.SplitMode.B);
+        List<Morpheme> morphemesB = tokenizer.split(morphemesC, Tokenizer.SplitMode.B);
         assertThat(morphemesB.size(), is(1));
     }
 
@@ -399,7 +400,7 @@ public void splitAfterTokenizeCtoB() {
     public void splitAfterTokenizeCtoC() {
         MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemes1.size(), is(1));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.C);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
     }
 
@@ -407,7 +408,7 @@ public void splitAfterTokenizeCtoC() {
     public void splitAfterTokenizeAtoC() {
         MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.A, "東京都");
         assertThat(morphemes1.size(), is(2));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.C);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
     }
 
@@ -415,7 +416,7 @@ public void splitAfterTokenizeAtoC() {
     public void splitAfterTokenizeBtoC() {
         MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.B, "東京都");
         assertThat(morphemes1.size(), is(1));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.C);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
     }
 
@@ -428,7 +429,7 @@ public void splitWithZeroWidthTokens() {
         assertThat(morphemes1.get(2), morpheme("", 1, 1));
         assertThat(morphemes1.get(3), morpheme("東京都", 1, 4));
         assertThat(morphemes1.get(4), morpheme("…", 4, 5));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.A);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.A);
         assertThat(morphemes2.size(), is(8));
         assertThat(morphemes2.get(3), morpheme("東京", 1, 3));
         assertThat(morphemes2.get(4), morpheme("都", 3, 4));
@@ -439,11 +440,39 @@ public void splitSingleToken() {
         MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "な。な");
         assertThat(morphemes1.size(), is(1));
         assertThat(morphemes1.get(0), morpheme("な。な", 0, 3));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.A);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.A);
         assertThat(morphemes2.get(0), morpheme("な。な", 0, 3));
         assertThat(morphemes2.get(0).normalizedForm(), is("アイウ"));
     }
 
+    @Test
+    public void splitListOfSingleMorphemes() {
+        List<Morpheme> morphemes = new ArrayList<>();
+        morphemes.add(dict.lookup("京都").get(0));
+        morphemes.add(dict.lookup("東京都").get(0));
+
+        List<Morpheme> splits = tokenizer.split(morphemes, Tokenizer.SplitMode.A);
+        assertThat(splits.size(), is(3));
+        assertThat(splits.get(0).normalizedForm(), is("京都"));
+        assertThat(splits.get(1).normalizedForm(), is("東京"));
+        assertThat(splits.get(2).normalizedForm(), is("都"));
+    }
+
+    @Test
+    public void splitMixedMorphemeList() {
+        List<Morpheme> morphemes = new ArrayList<>();
+        for (Morpheme m : tokenizer.tokenize(Tokenizer.SplitMode.C, "な。な")) {
+            morphemes.add(m);
+        }
+        morphemes.add(dict.lookup("東京都").get(0));
+
+        List<Morpheme> splits = tokenizer.split(morphemes, Tokenizer.SplitMode.A);
+        assertThat(splits.size(), is(3));
+        assertThat(splits.get(0).normalizedForm(), is("アイウ"));
+        assertThat(splits.get(2).normalizedForm(), is("都"));
+    }
+
     @Test
     public void dumpInternalStructures() {
         String json = tokenizer.dumpInternalStructures("東京都");
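[Editor's note on PATCH 4] Tokenizer.split accepts any List<Morpheme>, not just a MorphemeList, which is exactly what the new splitListOfSingleMorphemes / splitMixedMorphemeList tests exercise. A short migration sketch (tokenizer as set up earlier):

    // before (now deprecated): MorphemeList fine = coarse.split(Tokenizer.SplitMode.A);
    MorphemeList coarse = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
    List<Morpheme> fine = tokenizer.split(coarse, Tokenizer.SplitMode.A); // 東京 + 都

The same call also works for hand-built lists, e.g. single morphemes obtained from a dictionary lookup, as the tests above show.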
From 1d24e46eb9f503a2c9e6119d32546796266d66e9 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Thu, 28 Nov 2024 17:52:20 +0900
Subject: [PATCH 5/5] return List<Morpheme> instead of MorphemeList

---
 .../nlp/sudachi/JapaneseTokenizer.java        | 13 ++---
 .../SentenceSplittingLazyAnalysis.java        |  5 ++-
 .../com/worksap/nlp/sudachi/Tokenizer.java    | 16 +++---
 .../dictionary/DoubleArrayLexicon.java        |  2 +-
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 54 +++++++++----------
 5 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 7fa4e4ad..3af3b7a5 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -66,8 +66,9 @@ class JapaneseTokenizer implements Tokenizer {
     }
 
     @Override
-    public MorphemeList tokenize(Tokenizer.SplitMode mode, String text) {
+    public List<Morpheme> tokenize(Tokenizer.SplitMode mode, String text) {
         if (text.isEmpty()) {
+            // return a MorphemeList instance for cases where internalCost is required.
             return MorphemeList.EMPTY;
         }
         UTF8InputText input = buildInputText(text);
@@ -75,25 +76,25 @@ public MorphemeList tokenize(Tokenizer.SplitMode mode, String text) {
         return tokenizeSentence(mode, input);
     }
 
     @Override
-    public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
+    public Iterable<List<Morpheme>> tokenizeSentences(SplitMode mode, String text) {
         if (text.isEmpty()) {
             return Collections.emptyList();
         }
 
         StringReader input = new StringReader(text);
         SentenceSplittingLazyAnalysis analysis = new SentenceSplittingLazyAnalysis(mode, this, input);
-        List<MorphemeList> result = new ArrayList<>();
+        List<List<Morpheme>> result = new ArrayList<>();
         analysis.forEachRemaining(result::add);
 
         return result;
     }
 
     @Override
-    public Iterator<MorphemeList> tokenizeSentences(SplitMode mode, Readable input) {
+    public Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input) {
         return new SentenceSplittingLazyAnalysis(mode, this, input);
     }
 
     @Override
-    public Iterator<MorphemeList> lazyTokenizeSentences(SplitMode mode, Readable input) {
+    public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input) {
         return tokenizeSentences(mode, input);
     }
 
@@ -151,7 +152,7 @@ UTF8InputText buildInputText(CharSequence text) {
         return input;
     }
 
-    MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
+    List<Morpheme> tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
         checkIfAlive();
 
         buildLattice(input);
diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 8e722b42..72eb9caf 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -29,7 +29,8 @@
 /**
  * Provides lazy sentence split and analysis.
 */
-/* internal */ class SentenceSplittingLazyAnalysis implements SentenceDetector.NonBreakCheker, Iterator<MorphemeList> {
+/* internal */ class SentenceSplittingLazyAnalysis
+        implements SentenceDetector.NonBreakCheker, Iterator<List<Morpheme>> {
     private final SentenceDetector detector = new SentenceDetector();
 
     private final Tokenizer.SplitMode mode;
@@ -99,7 +100,7 @@ public boolean hasNext() {
     }
 
     @Override
-    public MorphemeList next() {
+    public List<Morpheme> next() {
         int length = detector.getEos(normalized, this);
         if (length > 0) { // sentence found
             int eos = bos + length;
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 20bf64cd..95e2f1aa 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -37,7 +37,7 @@ public interface Tokenizer {
      *            input text
     * @return a result of tokenizing
      */
-    MorphemeList tokenize(SplitMode mode, String text);
+    List<Morpheme> tokenize(SplitMode mode, String text);
 
     /**
     *
@@ -48,7 +48,7 @@ MorphemeList tokenize(SplitMode mode, String text);
     * @return a result of tokenizing
     * @see #tokenize(SplitMode,String)
     */
-    default MorphemeList tokenize(final String text) {
+    default List<Morpheme> tokenize(final String text) {
         return tokenize(SplitMode.C, text);
     }
 
@@ -62,7 +62,7 @@ default MorphemeList tokenize(final String text) {
     *            input text
     * @return a result of tokenizing
      */
-    Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text);
+    Iterable<List<Morpheme>> tokenizeSentences(SplitMode mode, String text);
 
     /**
     * Tokenize sentences. Divide an input text into sentences and tokenize them
@@ -73,7 +73,7 @@ Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text);
     * @return a result of tokenizing
     * @see #tokenizeSentences(SplitMode,String)
     */
-    default Iterable<MorphemeList> tokenizeSentences(String text) {
+    default Iterable<List<Morpheme>> tokenizeSentences(String text) {
         return tokenizeSentences(SplitMode.C, text);
     }
 
@@ -87,7 +87,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      *            a readable input text
     * @return an iterator of tokenized sentences
      */
-    Iterator<MorphemeList> tokenizeSentences(SplitMode mode, Readable input);
+    Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input);
 
     /**
     * Read an input text from {@code input}, divide it into sentences and tokenize
@@ -98,7 +98,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
     * @return an iterator of tokenized sentences
     * @see #tokenizeSentences(SplitMode,Readable)
     */
-    default Iterator<MorphemeList> tokenizeSentences(Readable input) {
+    default Iterator<List<Morpheme>> tokenizeSentences(Readable input) {
         return tokenizeSentences(SplitMode.C, input);
     }
 
@@ -115,7 +115,7 @@ default Iterator<MorphemeList> tokenizeSentences(Readable input) {
     *
     */
     @Deprecated
-    Iterator<MorphemeList> lazyTokenizeSentences(SplitMode mode, Readable input);
+    Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);
 
     /**
     * Read an input text from {@code input}, divide it into sentences and tokenize
@@ -128,7 +128,7 @@ default Iterator<MorphemeList> tokenizeSentences(Readable input) {
     * @deprecated renamed to {@link #tokenizeSentences(Readable)}
     */
     @Deprecated
-    default Iterator<MorphemeList> lazyTokenizeSentences(Readable input) {
+    default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
         return lazyTokenizeSentences(SplitMode.C, input);
     }
 
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
index b59011bc..ac39a693 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
@@ -176,7 +176,7 @@ public void calculateDynamicCosts(Tokenizer tokenizer) {
         }
         int headwordPtr = wordInfos.headwordPtr(wordId);
         String headword = strings.string(headwordPtr);
-        MorphemeList ms = tokenizer.tokenize(headword);
+        MorphemeList ms = (MorphemeList) tokenizer.tokenize(headword);
         int cost = ms.getInternalCost() + USER_DICT_COST_PAR_MORPH * ms.size();
         if (cost > Short.MAX_VALUE) {
             cost = Short.MAX_VALUE;
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index d6df9d21..f34a3abf 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -78,7 +78,7 @@ public void tokenizeSmallKatakanaOnly() {
 
     @Test
     public void partOfSpeech() {
-        MorphemeList ms = tokenizer.tokenize("京都");
+        List<Morpheme> ms = tokenizer.tokenize("京都");
         assertThat(ms.size(), is(1));
         Morpheme m = ms.get(0);
         short pid = m.partOfSpeechId();
@@ -89,7 +89,7 @@ public void partOfSpeech() {
 
     @Test
     public void getWordId() {
-        MorphemeList ms = tokenizer.tokenize("京都");
+        List<Morpheme> ms = tokenizer.tokenize("京都");
         assertThat(ms.size(), is(1));
         int wid = ms.get(0).getWordId();
 
@@ -104,7 +104,7 @@ public void getWordId() {
 
     @Test
     public void getDictionaryId() {
-        MorphemeList ms = tokenizer.tokenize("京都");
+        List<Morpheme> ms = tokenizer.tokenize("京都");
         assertThat(ms.size(), is(1));
         assertThat(ms.get(0).getDictionaryId(), is(0));
 
@@ -119,7 +119,7 @@ public void getSynonymGroupIds() {
-        MorphemeList ms = tokenizer.tokenize("京都");
+        List<Morpheme> ms = tokenizer.tokenize("京都");
         assertThat(ms.size(), is(1));
         assertThat(ms.get(0).getSynonymGroupIds(), is(new int[] { 1, 5 }));
 
@@ -141,7 +141,7 @@ public void tokenizeKanjiAlphabetWord() {
 
     @Test
     public void tokenizeSentences() {
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences("京都。東京.東京都。").iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences("京都。東京.東京都。").iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
@@ -158,7 +158,7 @@ public void tokenizeSentences() {
 
     @Test
     public void tokenizerWithDots() {
-        MorphemeList s = tokenizer.tokenize("京都…");
+        List<Morpheme> s = tokenizer.tokenize("京都…");
         assertThat(s.size(), is(4));
         assertThat(s.get(1).surface(), is("…"));
         assertThat(s.get(1).normalizedForm(), is("."));
@@ -170,7 +170,7 @@ public void tokenizerWithDots() {
 
     @Test
     public void tokenizerWithModifiedChar() {
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences("´´").iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences("´´").iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(4));
         assertThat(it.hasNext(), is(false));
@@ -178,7 +178,7 @@ public void tokenizerWithModifiedChar() {
 
     @Test
     public void tokenizeSentencesWithSurrogatePair() {
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences("。😀").iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences("。😀").iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(true));
@@ -189,7 +189,7 @@ public void tokenizeSentencesWithSurrogatePair() {
     @Test
     public void tokenizerWithReader() throws IOException {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader);
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
@@ -209,7 +209,7 @@ public void tokenizerWithLongReader() throws IOException {
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader);
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
@@ -227,12 +227,12 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader);
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             assertThat(it.hasNext(), is(true));
-            MorphemeList ms = it.next();
+            List<Morpheme> ms = it.next();
             assertThat(ms.size(), is(2));
             assertThat(ms.get(0).surface(), is("京都"));
             assertThat(ms.get(1).surface(), is("。"));
@@ -248,7 +248,7 @@ public void tokenizeSentencesWithSurrogatePairAtBufferLimit() {
         }
         sb.append("😀");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader);
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
 
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
@@ -265,7 +265,7 @@ public void tokenizeSentencesWithSurrogatePairAtBufferLimit() {
     @Test
     public void lazyTokenizeSentences() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
-        Iterator<MorphemeList> it = tokenizer.lazyTokenizeSentences(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
@@ -296,7 +296,7 @@ public void lazyTokenizeSentencesWithLongText() {
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.lazyTokenizeSentences(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
@@ -319,12 +319,12 @@ public void lazyTokenizeSentencesWithNormalization() {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.lazyTokenizeSentences(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             assertThat(it.hasNext(), is(true));
-            MorphemeList ms = it.next();
+            List<Morpheme> ms = it.next();
             assertThat(ms.size(), is(2));
             assertThat(ms.get(0).surface(), is("京都"));
             assertThat(ms.get(1).surface(), is("。"));
@@ -334,7 +334,7 @@ public void lazyTokenizeSentencesWithNormalization() {
 
     @Test
     public void zeroLengthMorpheme() {
-        MorphemeList s = tokenizer.tokenize("…");
+        List<Morpheme> s = tokenizer.tokenize("…");
         assertThat(s.size(), is(3));
         assertThat(s.get(0).surface(), is("…"));
         assertThat(s.get(0).normalizedForm(), is("."));
@@ -356,7 +356,7 @@ public void disableEmptyMorpheme() throws IOException {
         dict = new DictionaryFactory().create(Config.empty().withFallback(config).allowEmptyMorpheme(false));
         tokenizer = (JapaneseTokenizer) dict.tokenizer();
 
-        MorphemeList s = tokenizer.tokenize("…");
+        List<Morpheme> s = tokenizer.tokenize("…");
         assertThat(s.size(), is(3));
         assertThat(s.get(0).surface(), is("…"));
         assertThat(s.get(0).normalizedForm(), is("."));
@@ -374,7 +374,7 @@ public void disableEmptyMorpheme() throws IOException {
 
     @Test
     public void splitC() {
-        MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東東京都");
+        List<Morpheme> morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東東京都");
         assertThat(morphemesC.get(0).surface(), is("東"));
         assertThat(morphemesC.get(1).surface(), is("東"));
         assertThat(morphemesC.get(2).surface(), is("京都"));
@@ -382,7 +382,7 @@ public void splitC() {
 
     @Test
     public void splitAfterTokenizeCtoA() {
-        MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
+        List<Morpheme> morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemesC.size(), is(1));
         List<Morpheme> morphemesA = tokenizer.split(morphemesC, Tokenizer.SplitMode.A);
         assertThat(morphemesA.size(), is(2));
@@ -390,7 +390,7 @@ public void splitAfterTokenizeCtoA() {
 
     @Test
     public void splitAfterTokenizeCtoB() {
-        MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
+        List<Morpheme> morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemesC.size(), is(1));
         List<Morpheme> morphemesB = tokenizer.split(morphemesC, Tokenizer.SplitMode.B);
         assertThat(morphemesB.size(), is(1));
@@ -398,7 +398,7 @@ public void splitAfterTokenizeCtoB() {
 
     @Test
     public void splitAfterTokenizeCtoC() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemes1.size(), is(1));
         List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
@@ -406,7 +406,7 @@ public void splitAfterTokenizeCtoC() {
 
     @Test
     public void splitAfterTokenizeAtoC() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.A, "東京都");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.A, "東京都");
         assertThat(morphemes1.size(), is(2));
         List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
@@ -414,7 +414,7 @@ public void splitAfterTokenizeAtoC() {
 
     @Test
     public void splitAfterTokenizeBtoC() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.B, "東京都");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.B, "東京都");
         assertThat(morphemes1.size(), is(1));
         List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
@@ -422,7 +422,7 @@ public void splitAfterTokenizeBtoC() {
 
     @Test
     public void splitWithZeroWidthTokens() {
-        MorphemeList morphemes1 = tokenizer.tokenize("…東京都…");
+        List<Morpheme> morphemes1 = tokenizer.tokenize("…東京都…");
         assertThat(morphemes1.size(), is(7));
         assertThat(morphemes1.get(0), morpheme("…", 0, 1));
         assertThat(morphemes1.get(1), morpheme("", 1, 1));
@@ -437,7 +437,7 @@ public void splitWithZeroWidthTokens() {
 
     @Test
     public void splitSingleToken() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "な。な");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "な。な");
         assertThat(morphemes1.size(), is(1));
         assertThat(morphemes1.get(0), morpheme("な。な", 0, 3));
         List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.A);
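[Editor's note on PATCH 5] The public signatures now expose List<Morpheme> instead of MorphemeList. Callers that need MorphemeList-specific accessors have to cast, mirroring what this patch does in DoubleArrayLexicon.calculateDynamicCosts (sketch):

    List<Morpheme> ms = tokenizer.tokenize("京都");
    // The implementation still returns a MorphemeList, so the cast holds today,
    // but that is an implementation detail rather than an API guarantee.
    int cost = ((MorphemeList) ms).getInternalCost();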