diff --git a/pom.xml b/pom.xml index 5a414cc3..02f2eb40 100644 --- a/pom.xml +++ b/pom.xml @@ -3,25 +3,25 @@ com.worksap.nlp analysis-sudachi-elasticsearch7.0 - 1.3.1-SNAPSHOT + 1.3.1 jar analysis-sudachi UTF-8 - 1.8 - 7.0.1 - 8.0.0 - 0.2.0 - https://sonarcloud.io - java - worksapplications - https://github.com/WorksApplications/elasticsearch-sudachi - https://travis-ci.org/WorksApplications/elasticsearch-sudachi - https://github.com/WorksApplications/elasticsearch-sudachi/issues - - ${project.build.directory}/surefire-reports + 1.8 + 7.0.1 + 8.0.0 + 0.3.0 + https://sonarcloud.io + java + worksapplications + https://github.com/WorksApplications/elasticsearch-sudachi + https://travis-ci.org/WorksApplications/elasticsearch-sudachi + https://github.com/WorksApplications/elasticsearch-sudachi/issues + + ${project.build.directory}/surefire-reports @@ -153,4 +153,4 @@ scm:git:git@github.com:WorksApplications/elasticsearch-sudachi.git https://github.com/WorksApplications/elasticsearch-sudachi - + \ No newline at end of file diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java index 290c27c7..e0a06cbf 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java @@ -223,18 +223,20 @@ String readSentences() throws IOException { offset = remainSize; length -= remainSize; } - int n = input.read(buffer, offset, length); - if (n < 0) { - if (remainSize != 0) { - String lastSentence = new String(buffer, 0, remainSize); - baseOffset = nextBaseOffset; - nextBaseOffset += remainSize; - remainSize = 0; - return lastSentence; + + while (length != 0) { + int ret = input.read(buffer, offset, length); + if (ret < 0) { + break; } + offset += ret; + length -= ret; + } + int n = offset; + + if (n == 0) { return null; } - n += offset; int eos = lastIndexOfEos(buffer, n); if (eos == n && Character.isHighSurrogate(buffer[n - 1])) { diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Romanizer.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Romanizer.java index d88a9a5d..e42d936c 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Romanizer.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Romanizer.java @@ -68,13 +68,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO case 'ト': builder.append('t'); break main; - case 'ナ': - case 'ニ': - case 'ヌ': - case 'ネ': - case 'ノ': - builder.append('n'); - break main; case 'ハ': case 'ヒ': case 'フ': @@ -132,6 +125,9 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO case 'ポ': builder.append('p'); break main; + case 'ヴ': + builder.append('v'); + break main; default: builder.append("ltu"); } @@ -337,10 +333,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO builder.append("tsi"); i++; break; - case 'ゥ': - builder.append("tsu"); - i++; - break; case 'ェ': builder.append("tse"); i++; @@ -512,7 +504,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO i++; break; default: - builder.append("ho"); + builder.append("hu"); break; } break; @@ -679,7 +671,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO i++; break; case 'ゥ': - builder.append("qwu"); + builder.append("gwu"); i++; break; case 'ェ': @@ -771,7 +763,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO } break; case 'ヅ': - builder.append("zu"); + builder.append("du"); break; case 'デ': switch(ch2) { diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java index 0728b136..1535227f 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java @@ -21,7 +21,6 @@ import java.io.InputStream; import java.io.StringReader; import java.util.HashMap; -import java.util.Map; import org.junit.Rule; import org.junit.rules.TemporaryFolder; diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilterFactory.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilterFactory.java index badb9963..1b2a31bf 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilterFactory.java +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilterFactory.java @@ -21,7 +21,6 @@ import java.io.InputStream; import java.io.StringReader; import java.util.HashMap; -import java.util.Map; import org.junit.Rule; import org.junit.rules.TemporaryFolder; @@ -52,14 +51,12 @@ public void setUp() throws Exception { public void testBasics() throws IOException { String tags = "動詞,非自立可能\n"; - TokenStream ts = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings); - ((Tokenizer)ts).setReader(new StringReader("東京都に行った。")); - Map args = new HashMap<>(); - args.put("tags", "stoptags.txt"); + Tokenizer tokenizer = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings); + tokenizer.setReader(new StringReader("東京都に行った。")); SudachiPartOfSpeechStopFilterFactory factory - = new SudachiPartOfSpeechStopFilterFactory(args); + = new SudachiPartOfSpeechStopFilterFactory(new HashMap() {{ put("tags", "stoptags.txt"); }}); factory.inform(new StringResourceLoader(tags)); - ts = factory.create(ts); + TokenStream ts = factory.create(tokenizer); assertTokenStreamContents(ts, new String[] {"東京都", "に", "た"}); } diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiReadingFormFilter.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiReadingFormFilter.java new file mode 100644 index 00000000..d8da60c0 --- /dev/null +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiReadingFormFilter.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.lucene.sudachi.ja; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; + +import org.junit.Rule; +import org.junit.rules.TemporaryFolder; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +public class TestSudachiReadingFormFilter extends BaseTokenStreamTestCase { + TokenStream tokenStream; + + @Rule + public TemporaryFolder tempFolderForDictionary = new TemporaryFolder(); + + public void setUp() throws Exception { + super.setUp(); + tempFolderForDictionary.create(); + File tempFileForDictionary = tempFolderForDictionary + .newFolder("sudachiDictionary"); + ResourceUtil.copy(tempFileForDictionary); + + String settings; + try (InputStream is = this.getClass().getResourceAsStream("sudachi.json")) { + settings = ResourceUtil.getSudachiSetting(is); + } + + tokenStream = new SudachiTokenizer(true, SudachiTokenizer.Mode.SEARCH, tempFileForDictionary.getPath(), settings); + } + + public void testReadingForm() throws IOException { + SudachiReadingFormFilterFactory factory = new SudachiReadingFormFilterFactory(Collections.emptyMap()); + ((Tokenizer)tokenStream).setReader(new StringReader("東京都に行った。")); + tokenStream = factory.create(tokenStream); + assertTokenStreamContents(tokenStream, new String[] {"トウキョウト", "トウキョウ", "ト", "ニ", "イッ", "タ"}); + } + + public void testRomanizedReadingForm() throws IOException { + SudachiReadingFormFilterFactory factory = new SudachiReadingFormFilterFactory(new HashMap() {{ put("useRomaji", "true"); }}); + ((Tokenizer)tokenStream).setReader(new StringReader("東京都に行った。")); + tokenStream = factory.create(tokenStream); + assertTokenStreamContents(tokenStream, new String[] {"toukyouto", "toukyou", "to", "ni", "iltu", "ta"}); + } +} diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java index 040b12c4..31a3e8ba 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java @@ -18,11 +18,11 @@ package com.worksap.nlp.lucene.sudachi.ja; import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertThat; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -362,4 +362,46 @@ public void testReadSentencesWithSurrogatePair() throws IOException { } } + private static class ChunkedStringReader extends Reader { + private char[] in; + private int chunkSize; + private int pos; + public ChunkedStringReader(String in, int chunkSize) { + this.in = in.toCharArray(); + this.chunkSize = chunkSize; + this.pos = 0; + } + + @Override + public void close() throws IOException { + this.pos = this.in.length; + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + int length = len < this.chunkSize ? len : this.chunkSize; + if (length > this.in.length - this.pos) { + length = this.in.length - this.pos; + } + if (length == 0) { + return -1; + } + System.arraycopy(this.in, this.pos, cbuf, off, length); + this.pos += length; + return length; + } + } + + @Test + public void testReadSentencesFromChunkedCharFilter() throws IOException { + String inputString = "Elasticsearch"; + Reader charFilter = new ChunkedStringReader(inputString, 5); + tokenizer.setReader(charFilter); + tokenizer.reset(); + String[] answerList = { "Elasticsearch" }; + for (int i = 0; i < answerList.length; i++) { + assertThat(tokenizer.readSentences(), is(answerList[i])); + } + } + } diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/util/TestRomanizer.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/util/TestRomanizer.java new file mode 100644 index 00000000..804a1634 --- /dev/null +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/util/TestRomanizer.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2019 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.lucene.sudachi.ja.util; + +import junit.framework.TestCase; + +public class TestRomanizer extends TestCase { + public void testGetRomanizationWithSmallTsu() { + String input = "ッカッキックッケッコッサッシッスッセッソッタッチッツッテットッハッヒッフッヘッホッマッミッムッメッモッヤッユッヨッワッガッギッグッゲッゴッザッジッズッゼッゾッダッヂッヅッデッドッバッビッブッベッボッパッピップッペッポッヴッナ"; + String expected = "kkakkikkukkekkossassissussessottattittuttettohhahhihhuhhehhommammimmummemmoyyayyuyyowwaggaggigguggeggozzazzizzuzzezzoddaddidduddeddobbabbibbubbebboppappippuppeppovvultuna"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithA() { + String input = "アイウウァウィウェウォ"; + String expected = "aiuwhawhiwhewho"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithKa() { + String input = "カキキャキィキュキェキョククァクィクゥクェクォケコ"; + String expected = "kakikyakyikyukyekyokuqwaqwiqwuqweqwokeko"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithSa() { + String input = "サシシャシィシュシェショススァスィスゥスェスォセソ"; + String expected = "sasisyasyisyusyesyosuswaswiswusweswoseso"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithTa() { + String input = "タチチャチィチュチェチョツツァツィツゥツェツォテテャティテュテェテョトトァトィトゥトェトォ"; + String expected = "tatityatyityutyetyotutsatsitulutsetsotethathithuthethototwatwitwutwetwo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithNa() { + String input = "ナニニャニィニュニェニョヌネノ"; + String expected = "naninyanyinyunyenyonuneno"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithHa() { + String input = "ハヒヒャヒィヒュヒェヒョフファフィフゥフェフォフャフュフョヘホ"; + String expected = "hahihyahyihyuhyehyohufwafwifwufwefwofyafyufyoheho"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithMa() { + String input = "マミミャミィミュミェミョムメモ"; + String expected = "mamimyamyimyumyemyomumemo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithYa() { + String input = "ヤユヨ"; + String expected = "yayuyo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithRa() { + String input = "ラリリャリィリュリェリョルレロ"; + String expected = "rariryaryiryuryeryorurero"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithWa() { + String input = "ワヰヱヲ"; + String expected = "wawiwewo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithN() { + String input = "ンンア"; + String expected = "nnna"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithGa() { + String input = "ガギギャギィギュギェギョググァグィグゥグェグォゲゴ"; + String expected = "gagigyagyigyugyegyogugwagwigwugwegwogego"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithZa() { + String input = "ザジジャジィジュジェジョズゼゾ"; + String expected = "zazizyazyizyuzyezyozuzezo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithDa() { + String input = "ダヂヂャヂィヂュヂェヂョヅデデャディデュデェデョドドァドィドゥドェドォ"; + String expected = "dadidyadyidyudyedyodudedhadhidhudhedhododwadwidwudwedwo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithBa() { + String input = "バビビャビィビュビェビョブベボ"; + String expected = "babibyabyibyubyebyobubebo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithPa() { + String input = "パピピャピィピュピェピョプペポ"; + String expected = "papipyapyipyupyepyopupepo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithVa() { + String input = "ヴァヴィヴヴェヴォヴャヴュヴョ"; + String expected = "vavivuvevovyavyuvyo"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithSmall() { + String input = "ァィゥェォヵヶャュョヮ"; + String expected = "lalilulelolkalkelyalyulyolwa"; + + assertEquals(expected, Romanizer.getRomanization(input)); + } + + public void testGetRomanizationWithSymbols() { + assertEquals("-", Romanizer.getRomanization("・=ー")); + } + + public void testGetRomanizationWithoutJapanese() { + assertEquals("", Romanizer.getRomanization("")); + assertEquals("abc-", Romanizer.getRomanization("abc-")); + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinOovPlugin.java b/src/test/java/com/worksap/nlp/sudachi/JoinOovPlugin.java index 834f68e8..7e32fb55 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinOovPlugin.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinOovPlugin.java @@ -25,7 +25,7 @@ public void setUp(Grammar grammar) { } @Override - public void rewrite(InputText text, List path, + public void rewrite(InputText text, List path, Lattice lattice) { boolean isOOVNode = false; int begin = 0;