Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Deprecate tokenizeSentences #254

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 31 additions & 40 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.CharBuffer;
import java.util.ArrayList;
Expand Down Expand Up @@ -65,65 +66,55 @@ class JapaneseTokenizer implements Tokenizer {
}

@Override
public MorphemeList tokenize(Tokenizer.SplitMode mode, String text) {
public List<Morpheme> tokenize(Tokenizer.SplitMode mode, String text) {
if (text.isEmpty()) {
// Return a MorphemeList instance in case the caller requires internalCost.
return MorphemeList.EMPTY;
}
UTF8InputText input = buildInputText(text);
return tokenizeSentence(mode, input);
}

@Override
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
public Iterable<List<Morpheme>> tokenizeSentences(SplitMode mode, String text) {
if (text.isEmpty()) {
return Collections.emptyList();
}

SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
int length = analysis.tokenizeBuffer(text);
ArrayList<MorphemeList> result = analysis.result;
int bos = analysis.bos;
if (length < 0) {
// treat remaining thing as a single sentence
int eos = analysis.input.getText().length();
if (bos != eos) {
UTF8InputText slice = analysis.input;
if (bos != 0) {
slice = slice.slice(bos, eos);
}
result.add(tokenizeSentence(mode, slice));
}
}
StringReader input = new StringReader(text);
SentenceSplittingLazyAnalysis analysis = new SentenceSplittingLazyAnalysis(mode, this, input);
List<List<Morpheme>> result = new ArrayList<>();
analysis.forEachRemaining(result::add);
return result;
}

@Override
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);

while (wrappedReader.read(buffer) > 0) {
buffer.flip();
int length = analysis.tokenizeBuffer(buffer);
if (length < 0) {
buffer.position(analysis.bosPosition());
buffer.compact();
}
}
buffer.flip();
ArrayList<MorphemeList> sentences = analysis.result;

if (buffer.hasRemaining()) {
sentences.add(tokenizeSentence(mode, buildInputText(buffer)));
}
public Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input) {
    // The analysis object itself is handed back as the iterator; this method
    // does not drain it or copy its results into a collection.
    SentenceSplittingLazyAnalysis analysis = new SentenceSplittingLazyAnalysis(mode, this, input);
    return analysis;
}

return sentences;
@Override
public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input) {
    // Pure alias: forwards both arguments unchanged to the Readable overload
    // of tokenizeSentences and returns whatever it returns.
    Iterator<List<Morpheme>> sentences = tokenizeSentences(mode, input);
    return sentences;
}

@Override
public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
return new SentenceSplittingLazyAnalysis(mode, this, readable);
public List<Morpheme> split(List<Morpheme> morphemes, SplitMode mode) {
    // A MorphemeList knows how to re-split itself as a whole, so delegate to it.
    if (morphemes instanceof MorphemeList) {
        MorphemeList list = (MorphemeList) morphemes;
        return list.split(mode);
    }

    // Any other list is re-split element by element into a fresh ArrayList.
    List<Morpheme> pieces = new ArrayList<>();
    for (Morpheme morpheme : morphemes) {
        if (!(morpheme instanceof SingleMorphemeImpl)) {
            // Generic morphemes: append everything their own split(mode) returns.
            pieces.addAll(morpheme.split(mode));
            continue;
        }
        // SingleMorphemeImpl appends its sub-units directly into the output list.
        ((SingleMorphemeImpl) morpheme).appendSplitsTo(pieces, mode);
    }
    return pieces;
}

@Override
Expand Down Expand Up @@ -161,7 +152,7 @@ UTF8InputText buildInputText(CharSequence text) {
return input;
}

MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
List<Morpheme> tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
checkIfAlive();
buildLattice(input);

Expand Down
10 changes: 1 addition & 9 deletions src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -215,15 +215,7 @@ public StringsCache getStrings() {
}

/* internal */ void appendSplitsTo(List<LatticeNodeImpl> result, Tokenizer.SplitMode mode) {
if (mode == Tokenizer.SplitMode.A) {
appendSplitsTo(result, getWordInfo().getAunitSplit());
} else if (mode == Tokenizer.SplitMode.B) {
appendSplitsTo(result, getWordInfo().getBunitSplit());
} else if (mode == Tokenizer.SplitMode.C) {
appendSplitsTo(result, getWordInfo().getCunitSplit());
} else {
result.add(this);
}
appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
}

private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
Expand Down
11 changes: 6 additions & 5 deletions src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,7 +48,7 @@ public class MorphemeList extends AbstractList<Morpheme> {
}

@Override
public Morpheme get(int index) {
public MorphemeListItem get(int index) {
return new MorphemeListItem(this, index);
}

Expand Down Expand Up @@ -91,7 +91,7 @@ WordInfo getWordInfo(int index) {
return path.get(index).getWordInfo();
}

List<Morpheme> split(Tokenizer.SplitMode mode, int index) {
MorphemeList split(Tokenizer.SplitMode mode, int index) {
List<LatticeNodeImpl> nodes = new ArrayList<>();
LatticeNodeImpl node = path.get(index);
node.appendSplitsTo(nodes, mode);
Expand All @@ -106,18 +106,19 @@ List<Morpheme> split(Tokenizer.SplitMode mode, int index) {
* @param mode
* requested split mode
* @return current list or a new list in the requested split mode.
*
* @deprecated This method will become internal-only; use {@link Tokenizer#split} instead.
*/
@Deprecated
public MorphemeList split(Tokenizer.SplitMode mode) {
    // A re-split happens only when the requested mode compares lower than this
    // list's own mode; otherwise the list is returned unchanged.
    boolean alreadySatisfied = mode.compareTo(this.mode) >= 0;
    if (alreadySatisfied) {
        return this;
    }

    // Each node on the current path appends its sub-units for the requested mode.
    List<LatticeNodeImpl> splitNodes = new ArrayList<>();
    path.forEach(node -> node.appendSplitsTo(splitNodes, mode));

    // The new list keeps this list's input, grammar, lexicon and empty-morpheme
    // setting, and records the requested mode.
    return new MorphemeList(inputText, grammar, lexicon, splitNodes, allowEmptyMorpheme, mode);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public String surface() {
}

@Override
public List<Morpheme> split(Tokenizer.SplitMode mode) {
public MorphemeList split(Tokenizer.SplitMode mode) {
return list.split(mode, index);
}

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public boolean hasNext() {
}

@Override
public MorphemeList next() {
public List<Morpheme> next() {
int length = detector.getEos(normalized, this);
if (length > 0) { // sentence found
int eos = bos + length;
Expand Down
12 changes: 2 additions & 10 deletions src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,16 +118,8 @@ public List<Morpheme> split(Tokenizer.SplitMode mode) {
*
* @see LatticeNodeImpl#appendSplitsTo
*/
private void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
if (mode == Tokenizer.SplitMode.A) {
appendSplitsTo(result, getWordInfo().getAunitSplit());
} else if (mode == Tokenizer.SplitMode.B) {
appendSplitsTo(result, getWordInfo().getBunitSplit());
} else if (mode == Tokenizer.SplitMode.C) {
appendSplitsTo(result, getWordInfo().getCunitSplit());
} else {
result.add(this);
}
/* internal */ void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
}

private void appendSplitsTo(List<Morpheme> result, int[] splitIds) {
Expand Down
Loading