Skip to content

Commit

Permalink
Merge branch 'develop' into release/7.0
Browse files Browse the repository at this point in the history
  • Loading branch information
kazuma-t committed Jul 22, 2019
2 parents 22e450f + 9efdac7 commit 8479688
Show file tree
Hide file tree
Showing 9 changed files with 309 additions and 47 deletions.
28 changes: 14 additions & 14 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,25 @@

<groupId>com.worksap.nlp</groupId>
<artifactId>analysis-sudachi-elasticsearch7.0</artifactId>
<version>1.3.1-SNAPSHOT</version>
<version>1.3.1</version>
<packaging>jar</packaging>

<name>analysis-sudachi</name>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<java.version>1.8</java.version>
<elasticsearch.version>7.0.1</elasticsearch.version>
<lucene.version>8.0.0</lucene.version>
<sudachi.version>0.2.0</sudachi.version>
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
<sonar.language>java</sonar.language>
<sonar.organization>worksapplications</sonar.organization>
<sonar.links.homepage>https://github.com/WorksApplications/elasticsearch-sudachi</sonar.links.homepage>
<sonar.links.ci>https://travis-ci.org/WorksApplications/elasticsearch-sudachi</sonar.links.ci>
<sonar.links.issue>https://github.com/WorksApplications/elasticsearch-sudachi/issues</sonar.links.issue>
<sonar.junit.reportsPath/>
<sonar.junit.reportPaths>${project.build.directory}/surefire-reports</sonar.junit.reportPaths>
<java.version>1.8</java.version>
<elasticsearch.version>7.0.1</elasticsearch.version>
<lucene.version>8.0.0</lucene.version>
<sudachi.version>0.3.0</sudachi.version>
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
<sonar.language>java</sonar.language>
<sonar.organization>worksapplications</sonar.organization>
<sonar.links.homepage>https://github.com/WorksApplications/elasticsearch-sudachi</sonar.links.homepage>
<sonar.links.ci>https://travis-ci.org/WorksApplications/elasticsearch-sudachi</sonar.links.ci>
<sonar.links.issue>https://github.com/WorksApplications/elasticsearch-sudachi/issues</sonar.links.issue>
<sonar.junit.reportsPath />
<sonar.junit.reportPaths>${project.build.directory}/surefire-reports</sonar.junit.reportPaths>
</properties>

<build>
Expand Down Expand Up @@ -153,4 +153,4 @@
<developerConnection>scm:git:[email protected]:WorksApplications/elasticsearch-sudachi.git</developerConnection>
<url>https://github.com/WorksApplications/elasticsearch-sudachi</url>
</scm>
</project>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -223,18 +223,20 @@ String readSentences() throws IOException {
offset = remainSize;
length -= remainSize;
}
int n = input.read(buffer, offset, length);
if (n < 0) {
if (remainSize != 0) {
String lastSentence = new String(buffer, 0, remainSize);
baseOffset = nextBaseOffset;
nextBaseOffset += remainSize;
remainSize = 0;
return lastSentence;

while (length != 0) {
int ret = input.read(buffer, offset, length);
if (ret < 0) {
break;
}
offset += ret;
length -= ret;
}
int n = offset;

if (n == 0) {
return null;
}
n += offset;

int eos = lastIndexOfEos(buffer, n);
if (eos == n && Character.isHighSurrogate(buffer[n - 1])) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
case 'ト':
builder.append('t');
break main;
case 'ナ':
case 'ニ':
case 'ヌ':
case 'ネ':
case 'ノ':
builder.append('n');
break main;
case 'ハ':
case 'ヒ':
case 'フ':
Expand Down Expand Up @@ -132,6 +125,9 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
case 'ポ':
builder.append('p');
break main;
case 'ヴ':
builder.append('v');
break main;
default:
builder.append("ltu");
}
Expand Down Expand Up @@ -337,10 +333,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
builder.append("tsi");
i++;
break;
case 'ゥ':
builder.append("tsu");
i++;
break;
case 'ェ':
builder.append("tse");
i++;
Expand Down Expand Up @@ -512,7 +504,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
i++;
break;
default:
builder.append("ho");
builder.append("hu");
break;
}
break;
Expand Down Expand Up @@ -679,7 +671,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
i++;
break;
case 'ゥ':
builder.append("qwu");
builder.append("gwu");
i++;
break;
case 'ェ':
Expand Down Expand Up @@ -771,7 +763,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
}
break;
case 'ヅ':
builder.append("zu");
builder.append("du");
break;
case 'デ':
switch(ch2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.junit.Rule;
import org.junit.rules.TemporaryFolder;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.junit.Rule;
import org.junit.rules.TemporaryFolder;
Expand Down Expand Up @@ -52,14 +51,12 @@ public void setUp() throws Exception {

public void testBasics() throws IOException {
String tags = "動詞,非自立可能\n";
TokenStream ts = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings);
((Tokenizer)ts).setReader(new StringReader("東京都に行った。"));
Map<String, String> args = new HashMap<>();
args.put("tags", "stoptags.txt");
Tokenizer tokenizer = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings);
tokenizer.setReader(new StringReader("東京都に行った。"));
SudachiPartOfSpeechStopFilterFactory factory
= new SudachiPartOfSpeechStopFilterFactory(args);
= new SudachiPartOfSpeechStopFilterFactory(new HashMap<String, String>() {{ put("tags", "stoptags.txt"); }});
factory.inform(new StringResourceLoader(tags));
ts = factory.create(ts);
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts,
new String[] {"東京都", "に", "た"});
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2019 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.lucene.sudachi.ja;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;

import org.junit.Rule;
import org.junit.rules.TemporaryFolder;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tests for {@link SudachiReadingFormFilterFactory}: katakana reading output
 * by default, and romanized output when {@code useRomaji=true}.
 */
public class TestSudachiReadingFormFilter extends BaseTokenStreamTestCase {
    TokenStream tokenStream;

    @Rule
    public TemporaryFolder tempFolderForDictionary = new TemporaryFolder();

    public void setUp() throws Exception {
        super.setUp();
        tempFolderForDictionary.create();
        // Unpack the bundled Sudachi dictionary into a scratch folder.
        File dictionaryDir = tempFolderForDictionary.newFolder("sudachiDictionary");
        ResourceUtil.copy(dictionaryDir);

        String settings;
        try (InputStream is = this.getClass().getResourceAsStream("sudachi.json")) {
            settings = ResourceUtil.getSudachiSetting(is);
        }

        tokenStream = new SudachiTokenizer(true, SudachiTokenizer.Mode.SEARCH,
                dictionaryDir.getPath(), settings);
    }

    public void testReadingForm() throws IOException {
        // With no options the filter emits katakana readings.
        SudachiReadingFormFilterFactory factory =
                new SudachiReadingFormFilterFactory(Collections.emptyMap());
        ((Tokenizer) tokenStream).setReader(new StringReader("東京都に行った。"));
        tokenStream = factory.create(tokenStream);
        String[] expected = {"トウキョウト", "トウキョウ", "ト", "ニ", "イッ", "タ"};
        assertTokenStreamContents(tokenStream, expected);
    }

    public void testRomanizedReadingForm() throws IOException {
        // With useRomaji=true the readings are converted to romaji.
        HashMap<String, String> args = new HashMap<>();
        args.put("useRomaji", "true");
        SudachiReadingFormFilterFactory factory = new SudachiReadingFormFilterFactory(args);
        ((Tokenizer) tokenStream).setReader(new StringReader("東京都に行った。"));
        tokenStream = factory.create(tokenStream);
        String[] expected = {"toukyouto", "toukyou", "to", "ni", "iltu", "ta"};
        assertTokenStreamContents(tokenStream, expected);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
package com.worksap.nlp.lucene.sudachi.ja;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
Expand Down Expand Up @@ -362,4 +362,46 @@ public void testReadSentencesWithSurrogatePair() throws IOException {
}
}

private static class ChunkedStringReader extends Reader {
private char[] in;
private int chunkSize;
private int pos;
public ChunkedStringReader(String in, int chunkSize) {
this.in = in.toCharArray();
this.chunkSize = chunkSize;
this.pos = 0;
}

@Override
public void close() throws IOException {
this.pos = this.in.length;
}

@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int length = len < this.chunkSize ? len : this.chunkSize;
if (length > this.in.length - this.pos) {
length = this.in.length - this.pos;
}
if (length == 0) {
return -1;
}
System.arraycopy(this.in, this.pos, cbuf, off, length);
this.pos += length;
return length;
}
}

@Test
public void testReadSentencesFromChunkedCharFilter() throws IOException {
String inputString = "Elasticsearch";
Reader charFilter = new ChunkedStringReader(inputString, 5);
tokenizer.setReader(charFilter);
tokenizer.reset();
String[] answerList = { "Elasticsearch" };
for (int i = 0; i < answerList.length; i++) {
assertThat(tokenizer.readSentences(), is(answerList[i]));
}
}

}
Loading

0 comments on commit 8479688

Please sign in to comment.