From bf8896ec47b5ee3f5907c68183df59ffe2a6912b Mon Sep 17 00:00:00 2001 From: liu_to Date: Mon, 1 Jul 2019 16:11:22 +0800 Subject: [PATCH] 1.3.1-SNAPSHOT --- .travis.yml | 44 +- .travis/.gitignore | 1 - .travis/settings.xml.enc | 5 - README.md | 2 +- pom.xml | 394 +++++++----------- .../lucene/sudachi/ja/SudachiTokenizer.java | 3 + .../sudachi/ja/TestSudachiTokenizer.java | 17 + 7 files changed, 195 insertions(+), 271 deletions(-) delete mode 100644 .travis/.gitignore delete mode 100644 .travis/settings.xml.enc diff --git a/.travis.yml b/.travis.yml index b20c585f..e794a022 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,27 +1,31 @@ language: java jdk: oraclejdk8 -# To use 7.5GiB memory, specify 'required' for 'sudo' -# https://docs.travis-ci.com/user/reference/overview/#Virtualization-environments sudo: required dist: trusty -env: - global: - # GITHUB_TOKEN - - secure: "WljmhJOo6S6DoLAUqQ8wXG+6soWQDlJVaamuk5UfT3DWfFjUlD6LkT0pTmvI6jdqbk1k7ogtuLq88VuaaY6OsB/Amz6CbJW/1Z5+EgYKdGY6nN4OSBZ253NGMy0RExP5iZicxCGdtcic7PcouKDpi9WjH0dRqpRkgYEdZGcsL7SlK18dnvmW4I6JGbOOmrOfTWE2nWIVt1Fh53eZiV9bffBHBAQn/di8iEk1hHLrUsDE7UyCw9wchlEW7sDL0rB1KlYvZ/G80y9xDMtGoRQo0EF49s4ACLIyg1uhlLniR0Rx/fHFyjUZLzmmFrlmZdM+AVsmgT/WpctgMW0ixzM3a9Wp9PKEm7KEcpgUpoPQBNYMOuxCe3DKZrHHluey9q7elKew4CamvJCyjbMln0dHJXaavaE/obiSu7YjZfymd9FB+DirjvaiL8ulG8qSzDd4FqvsqNzd18lDmzuGHVDlLYNCox3vlm0kuAM9cckn82f6SakqvMM1ZKSeLhWjD0tbpfd+okBvp2y1LaGGZrUhysEk3MXvxOAMa1pCw7Asj+CI9CAmjKjkykMx7csvjTgmCz9sX1aLqXGtEJnThM8R6OYHA05cr+z2eRsnmMWL/wITUXv8y3qKC42AlaA3M42I0lEVwIDLkL/vAa3nG2EjeqvFJ4kDFO34bOCdGgnBRgk=" - # SONAR_LOGIN - - secure: "Iqsmknto7LfqqyimLpAd9iAdHGuQN1tJ99hQ/2yNm31g6SxPxBiWEgifZvdJadjsY2YS/YMD6irEsvtgWhL1Meah2Cl8DdPGQTNxz5OOZIYPRwnTsLcE1JcNJbSqgmcWYv6bcaLSJgXDHgRW+C/FwkkEgvXvvWzdxw924vQlUh5h4StCjjOR5u5gFYvTDn4jMXC2TXv9wfh7Kby5DXHGviJk7idtwpl0YcmG/pgdX2RQR2UlNGRFVoDiUse5TmKkPIcIeUICz5I16SeOEEWbM1YbvAaSTMgnHC8t6lej5e7F0McW4IMXKQKPGABnDwc8gxssu3GP/3vbdZn3/JD3DOOSej7mEJhVAkhDOjCQCnyOIuVlSa+BnRwPtKyE7W9JMxMszNmFktFrzJzHVyXfddKEFWjlFMjeccVLFur4fkE8itgZ2uGVNzxMUG35qRIBPkuW4gCDGVmASYheal1KuPupZO7EmnhYQsvwaG8ZM2Ym5eEBMqTneLOsVLzyJeEDDE7Kyi4mzsrJw0ho/QfbNeCZFG9llaU5uQie+ws1TX7T19WjbvmuRB4OpYJylXkdUroPEFqTMMooq7Y+uz+sbnk9GEiPcel/toLJWb878MQO+iEbaEDCaOV2O7IvAFZgL+fFJfLOjXBVA/i1iFmflH7PIIKoeEuqyxfrQzEjcB4=" -install: - # Decrypt settings.xml which is necessary to deploy to Sonatype Maven Repositories - - if [ -n "$encrypted_6cebf91f1d36_key" ]; then openssl aes-256-cbc -K $encrypted_6cebf91f1d36_key -iv $encrypted_6cebf91f1d36_iv -in .travis/settings.xml.enc -out .travis/settings.xml -d; fi +install: true +addons: + sonarcloud: + organization: "worksapplications" + token: + secure: "JIKI52CkXqNLVn9p9JNb+gNEwJtq2k3lajCjf1g868xkh19kkDNF6wCP3Ni+1AILQlXBRGUzWqCYB9BlXTjBtT5A7pRppCM2Q6JOpKHS8V1RKex0mGZ4pp5IcZMtXO8YX6yILL9PuRezUhdo0aJiasWaL6SGdzeUkGumZLIM6UGNm6qI1eab2YIDKxVnQ5I5tJrJ2owd/kUmFckXREOlmMU3VowjvSnB1ezUqLHnhk2qO+ZHaHc9pO5njA4u+XzGDUypzmb1sgphqQf2Urh0yIwrB2cby/OdjJi4s4n5PDmjOSd1jLmOu9ZJ4Omr7iJJ2ZCF0d00ownOax1jCsM81uyaQYhi+5AEWSmNy/4+PfHpRD4w2uhNS+r9TFWRGVMwmyzENyoIvGObx0tECEB+o0h5yn6RBb5KzVzZlQYl8wQfA3nv3AIx7IkHkWtr/t//nyedpe1WgI1AR+ruAp9xP7vg5YrHKMzpDz9DBY5Q9ypkfv7Vne99kb8r3Ompzt2+XNysp47fR9LqB07WtjkJ6ia0wVsd8bWgADeMp6p5xa+4uwkl9/IXilPzAWQuI229OsskZXx2W6ANv22Vi+6CgrSlxUAtg5k0k7QzcCIp9fk4l5jmaTXJWRkWbsUO1thjr4ecylK1SJsMy9b2FGJNAM2YjD6lONzirI7NC0fTqws=" script: - - mvn jacoco:prepare-agent verify -B -V + - mvn clean org.jacoco:jacoco-maven-plugin:prepare-agent test -B after_success: - - mvn sonar:sonar -Dsonar.login=$SONAR_LOGIN + - mvn sonar:sonar -B +before_deploy: + - mvn clean package -B deploy: - - provider: script - script: mvn deploy --settings .travis/settings.xml - # Skip cleanup to keep decrepted .travis/settings.xml in workspace - # https://docs.travis-ci.com/user/deployment/ - skip_cleanup: true - on: - branch: develop + provider: releases + api_key: + secure: "Qst5ZeqRX8wckI5+0mFWBgHFFt2imdczSp4jLjoc/ydi/1aYxDx12Y30x1LWyf5gLuLavDgMxIegIfPATBvrOQysgY0qO5isc0vy6m6BSNXHFJZaWYXdSPBQCA/XCzE2L2s+3B6i4DL5X4e2SI+0DoyDjvXzGf5kH2sOBxkhGTu9C2lkyUiOXWXDkWR4xAnb5UK4KShjP3FtarJrmw82AOs3C5gYfGYI56D97WfQMpHq79hesrgSHW7zFA6IqthSTGEw4qCCnAHQyPE0DDbDAZTcAftjeecsJ5g+hkKy3iwgQDLN6iiM3RB1luqf1Us1ajWvCw88jdiYmGIr9wbivr5yxoi46SEkdZwjw1xSpCze3gRoNGxTnN4BOd/fs7t0BQNphqUHVeLPFu1ib4dnEzB1wNrcMNCJHhupVN4j8mqFIu0QWoDkh0MhRpcRw0tJK6EXnjARjgZ//0hjLqcbZeBgOoglrvkGVyO8G7OncC2ZqxVurxa4h+X5/SN/EicrKcg0gjoJrfVTL6OWzGWJGNQDmeUkcTNnocXn13549bwpEfamKAxCccVRIiZPAOsuDVkEc/Pk2Cfc8uFJ/NG0zVFU4aQCXbSvI9/I4ZAgEJ0WdBcKZNW4DF+jqd+vhpzzWBO0plk3B8huLendZ8whpdrOdJXzgVKsK8exv1nc/HQ=" + file_glob: true + file: target/releases/*.zip + overwrite: true + skip_cleanup: true + on: + repo: WorksApplications/elasticsearch-sudachi + tags: true +cache: + directories: + - "$HOME/.m2/repository" + - "$HOME/.sonar/cache" \ No newline at end of file diff --git a/.travis/.gitignore b/.travis/.gitignore deleted file mode 100644 index 92eb2403..00000000 --- a/.travis/.gitignore +++ /dev/null @@ -1 +0,0 @@ -settings.xml diff --git a/.travis/settings.xml.enc b/.travis/settings.xml.enc deleted file mode 100644 index 9d35e06e..00000000 --- a/.travis/settings.xml.enc +++ /dev/null @@ -1,5 +0,0 @@ -üרÔÑ£8¦Þ·k‘œh…¨%JÑößΊ -Ȕ̊̅_þbŠÙ\ y‰ƒ7¸Tc;aò^'ØJƒ¸,ê,„¢|L©§¢Q!û­El -9`s*±äxG¿PcT¤ú‘|¯wuMÔ -Àϣʘ<ÂàŒÔ]u›àQìŒ\^¼XôÀ90êù‹*0¿ŒÒzûËmó¿­¶€j`; -óe3 õRÍŠ7þVKµ•ß˜ Ž@;zE˜¡fÎ1 \ No newline at end of file diff --git a/README.md b/README.md index 85e659ac..275353f7 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ analysis-sudachi is an Elasticsearch plugin for tokenization of Japanese text us 1. Download analysis-sudachi-elasticsearch zip archive file 2. Move current dir to $ES_HOME -3. Execute "bin/elasticsearch-plugin install file:///" +3. Execute "bin/elasticsearch-plugin install file:///plugin-zip-path" 4. Download sudachi dictionary archive from https://github.com/WorksApplications/SudachiDict 5. Extract dic file and place it to config/sudachi_tokenizer/system_core.dic 6. Execute "bin/elasticsearch" diff --git a/pom.xml b/pom.xml index 8a138e84..a5b47440 100644 --- a/pom.xml +++ b/pom.xml @@ -1,250 +1,156 @@ - - 4.0.0 + + 4.0.0 - com.worksap.nlp + com.worksap.nlp analysis-sudachi-elasticsearch6.5 - 1.3.0 - jar + 1.3.1-SNAPSHOT + jar - analysis-sudachi + analysis-sudachi - - UTF-8 - 1.8 - 6.5.4 - 7.5.0 - 0.2.0 - true - true - https://sonarcloud.io - java - worksapplications - https://github.com/WorksApplications/elasticsearch-sudachi - https://travis-ci.org/WorksApplications/elasticsearch-sudachi - https://github.com/WorksApplications/elasticsearch-sudachi/issues - - ${project.build.directory}/surefire-reports - + + UTF-8 + 1.8 + 6.5.4 + 7.5.0 + 0.2.0 + https://sonarcloud.io + java + worksapplications + https://github.com/WorksApplications/elasticsearch-sudachi + https://travis-ci.org/WorksApplications/elasticsearch-sudachi + https://github.com/WorksApplications/elasticsearch-sudachi/issues + + ${project.build.directory}/surefire-reports + - - - sonatype-snapshot - Sonatype Snapshot Repository - https://oss.sonatype.org/content/repositories/snapshots/ - default - - false - - - true - - - - - - pre-merge - - - env.TRAVIS_EVENT_TYPE - pull_request - - - - false - preview - ${env.GITHUB_TOKEN} - ${env.TRAVIS_PULL_REQUEST} - ${env.TRAVIS_REPO_SLUG} - false - - - - post-merge - - - env.TRAVIS_BRANCH - develop - - - - false - false - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - ${java.version} - ${java.version} - - - - maven-assembly-plugin - 2.6 - - false - ${project.build.directory}/releases/ - - ${basedir}/src/main/assemblies/plugin.xml - - - - - create-archive - package - - single - - - - - - maven-surefire-plugin - 2.22.1 - - - **/TestAnalysisSudachi.java - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.0.0-M1 - - ${java.version} - - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-source-plugin - 2.2.1 - - - attach-sources - - jar-no-fork - - - - - - org.sonarsource.scanner.maven - sonar-maven-plugin - 3.3.0.603 - - - org.jacoco - jacoco-maven-plugin - 0.7.9 - - - - - - - com.worksap.nlp - sudachi - ${sudachi.version} - - - - org.elasticsearch.client - transport - ${elasticsearch.version} - - - - org.apache.lucene - lucene-analyzers-kuromoji - ${lucene.version} - - - - org.apache.logging.log4j - log4j-core - 2.11.1 - - - - org.glassfish - javax.json - 1.1 - - - - junit - junit - 4.12 - test - - - org.hamcrest - hamcrest-all - 1.3 - test - - - org.apache.lucene - lucene-test-framework - ${lucene.version} - test - - - org.elasticsearch.test - framework - ${elasticsearch.version} - test - - - https://github.com/WorksApplications/elasticsearch-sudachi - The Japanese analysis plugin for elasticsearch - 2017 - - - Apache License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0 - - - - Works Applications Co., Ltd. - http://www.worksap.com/ - - - - kazuma-t - Kazuma TAKAOKA - takaoka_k@worksap.co.jp - Asia/Tokyo - - - - GitHub Issues - https://github.com/WorksApplications/elasticsearch-sudachi/issues - - - scm:git:git@github.com:WorksApplications/elasticsearch-sudachi.git - scm:git:git@github.com:WorksApplications/elasticsearch-sudachi.git - https://github.com/WorksApplications/elasticsearch-sudachi - - - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + ${java.version} + ${java.version} + + + + maven-assembly-plugin + 2.6 + + false + ${project.build.directory}/releases/ + + ${basedir}/src/main/assemblies/plugin.xml + + + + + create-archive + package + + single + + + + + + maven-surefire-plugin + 2.22.1 + + + **/TestAnalysisSudachi.java + + + + + + + + + com.worksap.nlp + sudachi + ${sudachi.version} + + + + org.elasticsearch.client + transport + ${elasticsearch.version} + + + + org.apache.lucene + lucene-analyzers-kuromoji + ${lucene.version} + + + + org.apache.logging.log4j + log4j-core + 2.11.1 + + + + org.glassfish + javax.json + 1.1 + + + + junit + junit + 4.12 + test + + + org.hamcrest + hamcrest-all + 1.3 + test + + + org.apache.lucene + lucene-test-framework + ${lucene.version} + test + + + org.elasticsearch.test + framework + ${elasticsearch.version} + test + + + https://github.com/WorksApplications/elasticsearch-sudachi + The Japanese analysis plugin for elasticsearch + 2017 + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0 + + + + Works Applications Co., Ltd. + http://www.worksap.com/ + + + + kazuma-t + Kazuma TAKAOKA + takaoka_k@worksap.co.jp + Asia/Tokyo + + + + GitHub Issues + https://github.com/WorksApplications/elasticsearch-sudachi/issues + + + scm:git:git@github.com:WorksApplications/elasticsearch-sudachi.git + scm:git:git@github.com:WorksApplications/elasticsearch-sudachi.git + https://github.com/WorksApplications/elasticsearch-sudachi + diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java index 1a24cb3f..290c27c7 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java @@ -237,6 +237,9 @@ String readSentences() throws IOException { n += offset; int eos = lastIndexOfEos(buffer, n); + if (eos == n && Character.isHighSurrogate(buffer[n - 1])) { + eos -= 1; + } String sentences = new String(buffer, 0, eos); remainSize = n - eos; System.arraycopy(buffer, eos, buffer, 0, remainSize); diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java index b3aa6c8a..040b12c4 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java @@ -345,4 +345,21 @@ public void testReadSentencesWithLongSentence() throws IOException { } } + @Test + public void testReadSentencesWithSurrogatePair() throws IOException { + int BUFFER_SIZE = 512; + String beforeSurrogatePair = ""; + for (int i = 0; i < BUFFER_SIZE - 1; i++) { + beforeSurrogatePair += "a"; + } + String afterSurrogatePair = "bbb"; + String inputString = beforeSurrogatePair + "😜" + afterSurrogatePair; + tokenizer.setReader(new StringReader(inputString)); + tokenizer.reset(); + String[] answerList = { beforeSurrogatePair, "😜" + afterSurrogatePair }; + for (int i = 0; i < answerList.length; i++) { + assertThat(tokenizer.readSentences(), is(answerList[i])); + } + } + }