Skip to content

Commit

Permalink
Merge branch 'develop' into release/7.0
Browse files Browse the repository at this point in the history
  • Loading branch information
kazuma-t committed Jul 22, 2019
2 parents 22e450f + 9efdac7 commit 8479688
Show file tree
Hide file tree
Showing 9 changed files with 309 additions and 47 deletions.
28 changes: 14 additions & 14 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,25 @@

<groupId>com.worksap.nlp</groupId>
<artifactId>analysis-sudachi-elasticsearch7.0</artifactId>
<version>1.3.1-SNAPSHOT</version>
<version>1.3.1</version>
<packaging>jar</packaging>

<name>analysis-sudachi</name>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<java.version>1.8</java.version>
<elasticsearch.version>7.0.1</elasticsearch.version>
<lucene.version>8.0.0</lucene.version>
<sudachi.version>0.2.0</sudachi.version>
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
<sonar.language>java</sonar.language>
<sonar.organization>worksapplications</sonar.organization>
<sonar.links.homepage>https://github.com/WorksApplications/elasticsearch-sudachi</sonar.links.homepage>
<sonar.links.ci>https://travis-ci.org/WorksApplications/elasticsearch-sudachi</sonar.links.ci>
<sonar.links.issue>https://github.com/WorksApplications/elasticsearch-sudachi/issues</sonar.links.issue>
<sonar.junit.reportsPath/>
<sonar.junit.reportPaths>${project.build.directory}/surefire-reports</sonar.junit.reportPaths>
<java.version>1.8</java.version>
<elasticsearch.version>7.0.1</elasticsearch.version>
<lucene.version>8.0.0</lucene.version>
<sudachi.version>0.3.0</sudachi.version>
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
<sonar.language>java</sonar.language>
<sonar.organization>worksapplications</sonar.organization>
<sonar.links.homepage>https://github.com/WorksApplications/elasticsearch-sudachi</sonar.links.homepage>
<sonar.links.ci>https://travis-ci.org/WorksApplications/elasticsearch-sudachi</sonar.links.ci>
<sonar.links.issue>https://github.com/WorksApplications/elasticsearch-sudachi/issues</sonar.links.issue>
<sonar.junit.reportsPath />
<sonar.junit.reportPaths>${project.build.directory}/surefire-reports</sonar.junit.reportPaths>
</properties>

<build>
Expand Down Expand Up @@ -153,4 +153,4 @@
<developerConnection>scm:git:[email protected]:WorksApplications/elasticsearch-sudachi.git</developerConnection>
<url>https://github.com/WorksApplications/elasticsearch-sudachi</url>
</scm>
</project>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -223,18 +223,20 @@ String readSentences() throws IOException {
offset = remainSize;
length -= remainSize;
}
int n = input.read(buffer, offset, length);
if (n < 0) {
if (remainSize != 0) {
String lastSentence = new String(buffer, 0, remainSize);
baseOffset = nextBaseOffset;
nextBaseOffset += remainSize;
remainSize = 0;
return lastSentence;

while (length != 0) {
int ret = input.read(buffer, offset, length);
if (ret < 0) {
break;
}
offset += ret;
length -= ret;
}
int n = offset;

if (n == 0) {
return null;
}
n += offset;

int eos = lastIndexOfEos(buffer, n);
if (eos == n && Character.isHighSurrogate(buffer[n - 1])) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
case 'ト':
builder.append('t');
break main;
case 'ナ':
case 'ニ':
case 'ヌ':
case 'ネ':
case 'ノ':
builder.append('n');
break main;
case 'ハ':
case 'ヒ':
case 'フ':
Expand Down Expand Up @@ -132,6 +125,9 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
case 'ポ':
builder.append('p');
break main;
case 'ヴ':
builder.append('v');
break main;
default:
builder.append("ltu");
}
Expand Down Expand Up @@ -337,10 +333,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
builder.append("tsi");
i++;
break;
case 'ゥ':
builder.append("tsu");
i++;
break;
case 'ェ':
builder.append("tse");
i++;
Expand Down Expand Up @@ -512,7 +504,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
i++;
break;
default:
builder.append("ho");
builder.append("hu");
break;
}
break;
Expand Down Expand Up @@ -679,7 +671,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
i++;
break;
case 'ゥ':
builder.append("qwu");
builder.append("gwu");
i++;
break;
case 'ェ':
Expand Down Expand Up @@ -771,7 +763,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
}
break;
case 'ヅ':
builder.append("zu");
builder.append("du");
break;
case 'デ':
switch(ch2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.junit.Rule;
import org.junit.rules.TemporaryFolder;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.junit.Rule;
import org.junit.rules.TemporaryFolder;
Expand Down Expand Up @@ -52,14 +51,12 @@ public void setUp() throws Exception {

public void testBasics() throws IOException {
String tags = "動詞,非自立可能\n";
TokenStream ts = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings);
((Tokenizer)ts).setReader(new StringReader("東京都に行った。"));
Map<String, String> args = new HashMap<>();
args.put("tags", "stoptags.txt");
Tokenizer tokenizer = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings);
tokenizer.setReader(new StringReader("東京都に行った。"));
SudachiPartOfSpeechStopFilterFactory factory
= new SudachiPartOfSpeechStopFilterFactory(args);
= new SudachiPartOfSpeechStopFilterFactory(new HashMap<String, String>() {{ put("tags", "stoptags.txt"); }});
factory.inform(new StringResourceLoader(tags));
ts = factory.create(ts);
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts,
new String[] {"東京都", "に", "た"});
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2019 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.lucene.sudachi.ja;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;

import org.junit.Rule;
import org.junit.rules.TemporaryFolder;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tests for {@link SudachiReadingFormFilterFactory}: katakana reading output
 * by default, and romanized output when {@code useRomaji=true}.
 */
public class TestSudachiReadingFormFilter extends BaseTokenStreamTestCase {
    TokenStream tokenStream;

    @Rule
    public TemporaryFolder tempFolderForDictionary = new TemporaryFolder();

    public void setUp() throws Exception {
        super.setUp();
        tempFolderForDictionary.create();
        // Unpack the bundled Sudachi dictionary into a scratch folder.
        File dictionaryDir = tempFolderForDictionary.newFolder("sudachiDictionary");
        ResourceUtil.copy(dictionaryDir);

        String settings;
        try (InputStream is = this.getClass().getResourceAsStream("sudachi.json")) {
            settings = ResourceUtil.getSudachiSetting(is);
        }

        tokenStream = new SudachiTokenizer(true, SudachiTokenizer.Mode.SEARCH,
                dictionaryDir.getPath(), settings);
    }

    public void testReadingForm() throws IOException {
        // With no options the filter emits katakana readings.
        SudachiReadingFormFilterFactory factory =
                new SudachiReadingFormFilterFactory(Collections.emptyMap());
        ((Tokenizer) tokenStream).setReader(new StringReader("東京都に行った。"));
        tokenStream = factory.create(tokenStream);
        String[] expected = {"トウキョウト", "トウキョウ", "ト", "ニ", "イッ", "タ"};
        assertTokenStreamContents(tokenStream, expected);
    }

    public void testRomanizedReadingForm() throws IOException {
        // With useRomaji=true the readings are converted to romaji.
        HashMap<String, String> args = new HashMap<>();
        args.put("useRomaji", "true");
        SudachiReadingFormFilterFactory factory = new SudachiReadingFormFilterFactory(args);
        ((Tokenizer) tokenStream).setReader(new StringReader("東京都に行った。"));
        tokenStream = factory.create(tokenStream);
        String[] expected = {"toukyouto", "toukyou", "to", "ni", "iltu", "ta"};
        assertTokenStreamContents(tokenStream, expected);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
package com.worksap.nlp.lucene.sudachi.ja;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
Expand Down Expand Up @@ -362,4 +362,46 @@ public void testReadSentencesWithSurrogatePair() throws IOException {
}
}

private static class ChunkedStringReader extends Reader {
private char[] in;
private int chunkSize;
private int pos;
public ChunkedStringReader(String in, int chunkSize) {
this.in = in.toCharArray();
this.chunkSize = chunkSize;
this.pos = 0;
}

@Override
public void close() throws IOException {
this.pos = this.in.length;
}

@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int length = len < this.chunkSize ? len : this.chunkSize;
if (length > this.in.length - this.pos) {
length = this.in.length - this.pos;
}
if (length == 0) {
return -1;
}
System.arraycopy(this.in, this.pos, cbuf, off, length);
this.pos += length;
return length;
}
}

@Test
public void testReadSentencesFromChunkedCharFilter() throws IOException {
String inputString = "Elasticsearch";
Reader charFilter = new ChunkedStringReader(inputString, 5);
tokenizer.setReader(charFilter);
tokenizer.reset();
String[] answerList = { "Elasticsearch" };
for (int i = 0; i < answerList.length; i++) {
assertThat(tokenizer.readSentences(), is(answerList[i]));
}
}

}
Loading

0 comments on commit 8479688

Please sign in to comment.