diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..3ad82fe --- /dev/null +++ b/README.md @@ -0,0 +1,466 @@ +# 玻森数据中文分析器Elasticsearch插件 (Beta版) + +##概述 +Elasticsearch 是一个基于 lucene 的强大搜索服务器,也是企业最受欢迎的搜索引擎之一。对于中文搜索,Elasticsearch 自带的标准分析器(Standard Analyzer)显然不能满足大家的要求。因此[玻森数据](http://bosonnlp.com)开发了一款基于玻森中文分词的 Elasticsearch 的插件(BosonNLP Analysis for Elasticsearch)方便大家准确的使用中文搜索。 + +##安装 + +###依赖 +Elasticsearch 官网安装说明 https://www.elastic.co/guide/en/elasticsearch/guide/1.x/_installing_elasticsearch.html + +###选择插件版本 +其对应的版本号和插件版本号如下: + +| BosonNLP version | ES Version | +| :--------------: | :---------:| +| 1.3.0-beta | 2.2.0 | +| 1.2.2-beta | 2.1.2 | +| 1.2.1-beta | 2.1.1 | +| 1.2.0-beta | 2.1.0 | +| 1.1.0-beta | 2.0.0 | +| 1.0.0-beta | 1.7.x | + +###安装插件 +现在提供以下两种方式安装插件。 + +####方法一 +```markdown +TO DO download from github + +``` +####方法二 + +1. 构建项目包 + + 下载玻森中文分析器项目到本地,并在项目根目录下通过 Maven 构建项目包: + + ``` + mvn clean package + ``` + + 构建后的项目包`elasticsearch-analysis-bosonnlp-{version}-plugin.zip`在`target/releases/`生成。 + +2. 安装插件 + + 通过 Elasticsearch 的 plugin 加载插件,在 Elasticsearch 根目录执行以下命令即可: + + ``` + $ sudo bin/plugin install file:/root/path/to/your/elasticsearch-analysis-bosonnlp-{version}-plugin.zip + ``` + +###设置 + +运行 Elasticsearch 之前需要在 config 文件夹中修改`elasticsearch.yml`来定义使用玻森中文分析器,并填写玻森 API_TOKEN 以及玻森分词 API 的地址,即在该文件结尾处添加: + +```makefile +index: + analysis: + analyzer: + bosonnlp: + type: bosonnlp + API_URL: http://api.bosonnlp.com/tag/analysis + # You MUST give the API_TOKEN value, otherwise it doesn't work + API_TOKEN: *PUT YOUR API TOKEN HERE* + # Please uncomment if you want to specify ANY ONE of the following + # areguments, otherwise the DEFAULT value will be used, i.e., + # space_mode is 0, + # oov_level is 3, + # t2s is 0, + # special_char_conv is 0. 
+ # More details can be found in bosonnlp docs: + # http://docs.bosonnlp.com/tag.html + # + # + # space_mode: put your value here(range from 0-3) + # oov_level: put your value here(range from 0-4) + # t2s: put your value here(range from 0-1) + # special_char_conv: put your value here(range from 0-1) +``` +**需要注意的是** + +1. `必须在 API_URL 填写给定的分词地址以及在API_TOKEN:*PUT YOUR API TOKEN HERE* 中填写给定的玻森数据API_TOKEN`,否则无法使用玻森中文分析器。该 API_TOKEN 是[注册玻森数据账号](http://bosonnlp.com/)所获得。 + +2. 如果配置文件中已经有配置过其他的 analyzer,请直接在 analyzer 下如上添加 bosonnlp analyzer。 + +3. 如果有多个 node 并且都需要 BosonNLP 的分词插件,则每个 node 下的 yml 文件都需要如上安装和设置。 + +4. 另外,玻森中文分词还提供了4个参数(*space_mode*,*oov_level*,*t2s*,*special_char_conv*)可满足不同的分词需求。如果取默认值,则无需任何修改;否则,可取消对应参数的注释并赋值。 + +> 例:需开启繁体转换成简体(*t2s*)功能,则取消*t2s*的注释并赋值。 +```makefile + t2s: 1 +``` + +更多关于玻森中文分词参数的信息,可以在此[了解](http://docs.bosonnlp.com/tag.html)。 + +设置完之后就可以运行 Elasticsearch 了,如果对该设置有新的改动,需要重启 Elasticsearch 才可生效。 + +###测试 + +####分词测试 +运行 Elasticsearch + +显示插件加载成功 +``` +... +[time][INFO ][plugins] [Gaza] loaded [analysis-bosonnlp] +... 
+``` +建立 index +```markdown +curl -XPUT 'localhost:9200/test' +``` +测试分析器是否配置成功 +```markdown +curl -XGET 'localhost:9200/test/_analyze?analyzer=bosonnlp&pretty' -d '这是玻森数据分词的测试' +``` +结果 +```json +{ + "tokens" : [ { + "token" : "这", + "start_offset" : 0, + "end_offset" : 1, + "type" : "word", + "position" : 0 + }, { + "token" : "是", + "start_offset" : 1, + "end_offset" : 2, + "type" : "word", + "position" : 1 + }, { + "token" : "玻森", + "start_offset" : 2, + "end_offset" : 4, + "type" : "word", + "position" : 2 + }, { + "token" : "数据", + "start_offset" : 4, + "end_offset" : 6, + "type" : "word", + "position" : 3 + }, { + "token" : "分词", + "start_offset" : 6, + "end_offset" : 8, + "type" : "word", + "position" : 4 + }, { + "token" : "的", + "start_offset" : 8, + "end_offset" : 9, + "type" : "word", + "position" : 5 + }, { + "token" : "测试", + "start_offset" : 9, + "end_offset" : 11, + "type" : "word", + "position" : 6 + } ] +} + +``` + +####搜索测试 +建立 mapping +```markdown +curl -XPUT 'localhost:9200/test/text/_mapping' -d' +{ + "text": { + "properties": { + "content": { + "type": "string", + "analyzer": "bosonnlp", + "search_analyzer": "bosonnlp" + } + } + } +} +``` +输入数据 +```markdown +curl -XPUT 'localhost:9200/test/text/1' -d' +{"content": "美称中国武器商很神秘 花巨资海外参展却一言不发"} +' +``` +```markdown +curl -XPUT 'localhost:9200/test/text/2' -d' +{"content": "复旦发现江浙沪儿童体内普遍有兽用抗生素"} +' +``` +```markdown +curl -XPUT 'localhost:9200/test/text/3' -d' +{"content": "37年后重启顶层设计 中国未来城市发展料现四大变化"} +' +``` +查询搜索 +```markdown +curl -XPOST 'localhost:9200/test/text/_search?pretty' -d' +{ + "query" : { "term" : { "content" : "中国" }} +} +' +``` +结果 +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 5, + "successful" : 5, + "failed" : 0 + }, + "hits" : { + "total" : 2, + "max_score" : 0.076713204, + "hits" : [ { + "_index" : "test", + "_type" : "text", + "_id" : "1", + "_score" : 0.076713204, + "_source": +{ + "content": "美称中国武器商很神秘 花巨资海外参展却一言不发"} + }, { + "_index" : "test", 
+ "_type" : "text", + "_id" : "3", + "_score" : 0.076713204, + "_source": +{ + "content": "37年后重启顶层设计 中国未来城市发展料现四大变化"} + } ] + } +} +``` +查询搜索 +```markdown +curl -XPOST 'localhost:9200/test/text/_search?pretty' -d' +{ + "query" : { "term" : { "content" : "国武" }} +}' +``` +结果 +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 5, + "successful" : 5, + "failed" : 0 + }, + "hits" : { + "total" : 0, + "max_score" : null, + "hits" : [ ] + } +} + +``` +查询搜索 +```markdown +curl -XPOST 'localhost:9200/test/text/_search?pretty' -d' +{ + "query" : { "term" : { "content" : "国" }} +}' +``` +结果 +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 5, + "successful" : 5, + "failed" : 0 + }, + "hits" : { + "total" : 0, + "max_score" : null, + "hits" : [ ] + } +} + +``` +如果用 ES 默认的分析器(Standard Analyzer)去查询,得到如下结果: + +查询搜索 +```markdown +curl -XPOST 'localhost:9200/test/text/_search?pretty' -d' +{ + "query" : { "term" : { "content" : "国" }} +}' +``` +结果 +```json +{ + "took" : 8, + "timed_out" : false, + "_shards" : { + "total" : 5, + "successful" : 5, + "failed" : 0 + }, + "hits" : { + "total" : 2, + "max_score" : 0.057534903, + "hits" : [ { + "_index" : "test3", + "_type" : "text", + "_id" : "1", + "_score" : 0.057534903, + "_source": +{"content": "美称中国武器商很神秘 花巨资海外参展却一言不发"} + }, { + "_index" : "test3", + "_type" : "text", + "_id" : "3", + "_score" : 0.057534903, + "_source": +{"content": "37年后重启顶层设计 中国未来城市发展料现四大变化"} + + } ] + } +} + +``` +查询搜索 +```markdown +curl -XPOST 'localhost:9200/test3/text/_search?pretty' -d ' +{ + "query": {"term":{"content":"中国"}} +}' +``` +结果 +```json +{ + "took" : 14, + "timed_out" : false, + "_shards" : { + "total" : 5, + "successful" : 5, + "failed" : 0 + }, + "hits" : { + "total" : 0, + "max_score" : null, + "hits" : [ ] + } +} + +``` + +###配置 Token Filter +现有的 BosonNLP 分析器没有内置 token filter,如果有过滤 Token 的需求,可以利用 BosonNLP Tokenizer 和 ES 提供的 token filter 搭建定制分析器。 + +####步骤 +配置定制的 analyzer 有以下三个步骤: + +- 添加 
BosonNLP tokenizer + +在 `elasticsearch.yml` 文件中 analysis 下添加 tokenizer, 并在 tokenizer 中添加 BosonNLP tokenizer 的配置: +```makefile +index: + analysis: + analyzer: + ... + tokenizer: + bosonnlp: + type: bosonnlp + API_URL: http://api.bosonnlp.com/tag/analysis + # You MUST give the API_TOKEN value, otherwise it doesn't work + API_TOKEN: *PUT YOUR API TOKEN HERE* + # Please uncomment if you want to specify ANY ONE of the following + # areguments, otherwise the DEFAULT value will be used, i.e., + # space_mode is 0, + # oov_level is 3, + # t2s is 0, + # special_char_conv is 0. + # More detials can be found in bosonnlp docs: + # http://docs.bosonnlp.com/tag.html + # + # + # space_mode: put your value here(range from 0-3) + # oov_level: put your value here(range from 0-4) + # t2s: put your value here(range from 0-1) + # special_char_conv: put your value here(range from 0-1) +``` + +**同样需要注意的是** + +1. `必须在 API_URL 中填写给定的分词地址以及在 API_TOKEN:*PUT YOUR API TOKEN HERE* 中填写给定的玻森数据API_TOKEN`,否则无法使用玻森 tokenizer。 +2. 如果配置文件中已经有配置过其他的 tokenizer,请直接在 tokenizer 下如上添加 bosonnlp tokenizer。 +3. 如果需要改动参数的默认值,请可取消对应参数的注释并赋值。 + +- 添加 token filter + +在 `elasticsearch.yml` 文件中 analysis 下添加 filter, 并在 filter 中添加所需 filter 的配置(下面例子中,我们以 lowercase filter 为例): +```makefile +index: + analysis: + analyzer: + ... + tokenizer: + ... + filter: + lowercase: + type: lowercase + +``` + +- 添加定制的 anayzer + +在 `elasticsearch.yml` 文件中 analysis 下添加 analyzer, 并在 analyzer 中添加定制的 analyzer 的配置(下面例子中,我们把定制的 analyzer 命名为 filter_bosonnlp): +```markdown +index: + analysis: + analyzer: + ... 
+ filter_bosonnlp: + type: custom + tokenizer: bosonnlp + filter: [lowercase] +``` +如有其他想要添加的 filter,可以在配置完 filter 之后在上述 filter:[] 列表中添加,以逗号隔开。 + +附上完整的定制 analyzer 配置: +```makefile +index: + analysis: + analyzer: + filter_bosonnlp: + type: custom + tokenizer: bosonnlp + filter: [lowercase] + tokenizer: + bosonnlp: + type: bosonnlp + API_URL: http://api.bosonnlp.com/tag/analysis + # You MUST give the API_TOKEN value, otherwise it doesn't work + API_TOKEN: *PUT YOUR API TOKEN HERE* + # Please uncomment if you want to specify ANY ONE of the following + # areguments, otherwise the DEFAULT value will be used, i.e., + # space_mode is 0, + # oov_level is 3, + # t2s is 0, + # special_char_conv is 0. + # More detials can be found in bosonnlp docs: + # http://docs.bosonnlp.com/tag.html + # + # + # space_mode: put your value here(range from 0-3) + # oov_level: put your value here(range from 0-4) + # t2s: put your value here(range from 0-1) + # special_char_conv: put your value here(range from 0-1) + filter: + lowercase: + type: lowercase + +``` +## 注意 +由于 ES 搜索内核 Lucene 索引文件的设计结构所限,每个文档的每个字段必须单独分析, 无法采用 BosonNLP 的批处理调用,从而在 Network IO 上会有较大的时间开销。 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..8b38abc --- /dev/null +++ b/pom.xml @@ -0,0 +1,167 @@ + + + 4.0.0 + org.elasticsearch + elasticsearch-analysis-bosonnlp + jar + 1.3.0-beta + elasticsearch-analysis-bosonnlp + http://maven.apache.org + BosonNLP Analyzer for ElasticSearch + + + 1.7 + 1.7 + 2.2.0 + ${project.basedir}/src/main/assemblies/plugin.xml + elasticsearch-analysis-bosonnlp + org.elasticsearch.plugin.analysis.bosonnlp.AnalysisBosonNLPPlugin + true + false + UTF-8 + + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + + junit + junit + 4.10 + test + + + org.elasticsearch + elasticsearch + ${elasticsearch.version} + compile + + + org.json + json + 20151123 + compile + + + com.mashape.unirest + unirest-java + 1.4.5 + compile + + + 
org.apache.httpcomponents + httpclient + 4.5.1 + + + org.apache.httpcomponents + httpcore + 4.4.4 + + + org.apache.httpcomponents + httpcore-nio + 4.4.4 + + + org.apache.httpcomponents + httpmime + 4.5.1 + + + org.apache.httpcomponents + httpasyncclient + 4.1.1 + + + commons-logging + commons-logging + 1.2 + + + log4j + log4j + 1.2.16 + runtime + + + commons-codec + commons-codec + 1.10 + + + + + + + + org.apache.maven.plugins + maven-eclipse-plugin + 2.9 + + true + true + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + ${jdk.version} + ${jdk.version} + + + + + + org.apache.maven.plugins + maven-source-plugin + 2.1.2 + + + attach-sources + + jar-no-fork + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + ${project.build.directory}/releases/ + + ${basedir}/src/main/assemblies/plugin.xml + + + + fully.qualified.MainClass + + + + + + package + + single + + + + + + + + diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml new file mode 100644 index 0000000..0feb182 --- /dev/null +++ b/src/main/assemblies/plugin.xml @@ -0,0 +1,25 @@ + + + + + zip + + false + + + / + true + false + + org.elasticsearch:elasticsearch + + + + + + + ${project.basedir}/src/main/resources/plugin-descriptor.properties + true + + + diff --git a/src/main/java/org/bosonnlp/analyzer/core/BosonNLPWordSegmenter.java b/src/main/java/org/bosonnlp/analyzer/core/BosonNLPWordSegmenter.java new file mode 100644 index 0000000..0910c79 --- /dev/null +++ b/src/main/java/org/bosonnlp/analyzer/core/BosonNLPWordSegmenter.java @@ -0,0 +1,150 @@ +/** + * BosonNLP word segmenter release 0.8.2 + * 玻森中文分词 版本 0.8.2 + */ +package org.bosonnlp.analyzer.core; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.elasticsearch.common.logging.ESLogger; +import 
org.elasticsearch.common.logging.ESLoggerFactory; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +import com.mashape.unirest.http.HttpResponse; +import com.mashape.unirest.http.JsonNode; +import com.mashape.unirest.http.Unirest; +import com.mashape.unirest.http.exceptions.UnirestException; + +public final class BosonNLPWordSegmenter { + + private String TAG_URL; + private String BOSONNLP_API_TOKEN; + private int spaceMode; + private int oovLevel; + private int t2s; + private int specialCharConv; + + private List words = new ArrayList(); + private Iterator wordsIter = Collections.emptyIterator(); + private Reader input; + + private ESLogger logger = ESLoggerFactory.getLogger("bosonnlp plugin"); + + public BosonNLPWordSegmenter(Reader input, String URL, String BAT, int spaceMode, int oovLevel, int t2s, int specialCharConv) + throws IOException, JSONException, UnirestException { + this.input = input; + this.TAG_URL = URL; + this.BOSONNLP_API_TOKEN = BAT; + this.spaceMode = spaceMode; + this.oovLevel = oovLevel; + this.t2s = t2s; + this.specialCharConv = specialCharConv; + } + + /** + * Get the input string + * + * @param input + * @return + * @throws IOException + */ + public String getStringText(Reader input) throws IOException { + StringBuffer target = new StringBuffer(); + try (BufferedReader br = new BufferedReader(input)) { + String temp; + while ((temp = br.readLine()) != null) { + target.append(temp + "\n"); + } + } catch (IOException e) { + e.printStackTrace(); + } + return target.toString(); + } + + /** + * Call BosonNLP word segmenter API via Java library Unirest. 
+ * + * @param target, the text to be processed + * @throws JSONException + * @throws UnirestException + * @throws IOException + */ + public void segment(String target) throws JSONException, UnirestException, IOException { + // Clean the word token + this.words.clear(); + // Get the new word token of target + String body = new JSONArray(new String[] { target }).toString(); + HttpResponse jsonResponse = Unirest.post(this.TAG_URL) + .queryString("space_mode", this.spaceMode) + .queryString("oov_level", this.oovLevel) + .queryString("t2s", this.t2s) + .queryString("special_char_conv", this.specialCharConv) + .header("Accept", "application/json") + .header("X-Token", this.BOSONNLP_API_TOKEN).body(body).asJson(); + + makeToken(jsonResponse.getBody()); + } + + /** + * Get the token result from BosonNLP word segmenter. + * + * @param jn + */ + private void makeToken(JsonNode jn) { + try { + // Get Json-array as it encoded before + JSONArray jaTemp = jn.getArray(); + if (jaTemp.length() > 0) { + JSONObject jo = jaTemp.getJSONObject(0); + if (jo != null && jo.has("word")) { + JSONArray ja = jo.getJSONArray("word"); + + for (int i = 0; i < ja.length(); i++) { + this.words.add(ja.get(i).toString()); + } + } else { + logger.error("Check the validation of your API TOKEN or internet", + new UnirestException(jo.toString()), jo); + throw new RuntimeException("Check validation of API TOKEN or internet: " + jo.toString()); + } + } else { + logger.info("No string input", jaTemp); + } + + } catch (JSONException e) { + logger.error("JSONException", e, e); + throw new RuntimeException("JSONException"); + } finally { + // Assign to words iterator + this.wordsIter = this.words.iterator(); + } + } + + public void reset(Reader input) throws IOException, JSONException, UnirestException { + // Reset input + setInput(input); + String target = getStringText(input); + // Do segmentation + segment(target); + } + + public Reader getInput() { + return input; + } + + public void setInput(Reader 
input) { + this.input = input; + } + + public Iterator getWordsIter() { + return this.wordsIter; + } +} diff --git a/src/main/java/org/bosonnlp/analyzer/lucene/BosonNLPAnalyzer.java b/src/main/java/org/bosonnlp/analyzer/lucene/BosonNLPAnalyzer.java new file mode 100644 index 0000000..be23a1c --- /dev/null +++ b/src/main/java/org/bosonnlp/analyzer/lucene/BosonNLPAnalyzer.java @@ -0,0 +1,58 @@ +/** + * 玻森数据 中文分词 版本 0.8.2 + * + */ +package org.bosonnlp.analyzer.lucene; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; +import org.json.JSONException; + +import com.mashape.unirest.http.exceptions.UnirestException; + + +/** + * Implementation of Bosonnlp word segmenter + * on Lucene Analyzer interface + */ +public final class BosonNLPAnalyzer extends Analyzer{ + + private int spaceMode = 0; + private int oovLevel = 3; + private int t2s = 0; + private int specialCharConv = 0; + private String BOSONNLP_API_TOKEN; + private String TAG_URL; + + public BosonNLPAnalyzer(String URL, String BAT){ + super(); + this.TAG_URL = URL; + this.BOSONNLP_API_TOKEN = BAT; + } + + public BosonNLPAnalyzer(String URL, String BAT, int spaceMode, int oovLevel, int t2s, int specialCharConv){ + super(); + this.TAG_URL = URL; + this.BOSONNLP_API_TOKEN = BAT; + this.spaceMode = spaceMode; + this.oovLevel = oovLevel; + this.t2s = t2s; + this.specialCharConv = specialCharConv; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName){ + Tokenizer BTokenizer = null; + try { + + BTokenizer = new BosonNLPTokenizer(TAG_URL, BOSONNLP_API_TOKEN, spaceMode, + oovLevel, t2s, specialCharConv); + } catch (IOException | JSONException | UnirestException e) { + e.printStackTrace(); + } + return new TokenStreamComponents(BTokenizer); + } + +} diff --git a/src/main/java/org/bosonnlp/analyzer/lucene/BosonNLPTokenizer.java b/src/main/java/org/bosonnlp/analyzer/lucene/BosonNLPTokenizer.java new file mode 100644 index 
0000000..0acce5a --- /dev/null +++ b/src/main/java/org/bosonnlp/analyzer/lucene/BosonNLPTokenizer.java @@ -0,0 +1,103 @@ +/** + * BosonNLP word segmenter version 0.8.2 + * 玻森中文分词 版本 0.8.2 + */ +package org.bosonnlp.analyzer.lucene; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.bosonnlp.analyzer.core.BosonNLPWordSegmenter; +import org.json.JSONException; + +import com.mashape.unirest.http.exceptions.UnirestException; + +/** + * Implementation of BosonNLP word segmenter on Lucene Tokenizer interface + * + */ +public final class BosonNLPTokenizer extends Tokenizer { + // bosonnlp word segmenter + private BosonNLPWordSegmenter BosonSeg; + private Iterator wordToken; + // Attributes to be added + private final CharTermAttribute charTermAttr; + private final OffsetAttribute offsetAttr; + private final TypeAttribute typeAttr; + private final PositionIncrementAttribute piAttr; + + // others + private int endPosition = -1; + private int extraIncrement = 0; + + /** + * Lucene constructor + * + * @throws UnirestException + * @throws JSONException + * @throws IOException + */ + public BosonNLPTokenizer(String URL, String BAT, int spaceMode, int oovLevel, int t2s, int specialCharConv) + throws IOException, JSONException, UnirestException { + super(); + // Add token offset attribute + offsetAttr = addAttribute(OffsetAttribute.class); + // Add token content attribute + charTermAttr = addAttribute(CharTermAttribute.class); + // Add token type attribute + typeAttr = addAttribute(TypeAttribute.class); + // Add token position attribute + piAttr = addAttribute(PositionIncrementAttribute.class); + // Create a new word segmenter to get 
tokens + BosonSeg = new BosonNLPWordSegmenter(input, URL, BAT, spaceMode, oovLevel, t2s, specialCharConv); + } + + @Override + public boolean incrementToken() throws IOException { + // clear all the attributes + clearAttributes(); + if (wordToken.hasNext()) { + String word = wordToken.next(); + piAttr.setPositionIncrement(extraIncrement + 1); + charTermAttr.append(word); + charTermAttr.setLength(word.length()); + offsetAttr.setOffset(endPosition + 1, endPosition + word.length() + 1); + // The type can be extended later + typeAttr.setType("word"); + endPosition += word.length(); + return true; + } + // No more token + return false; + } + + @Override + public void reset() throws IOException { + try { + super.reset(); + BosonSeg.reset(input); + wordToken = BosonSeg.getWordsIter(); + extraIncrement = 0; + endPosition = -1; + } catch (JSONException | UnirestException e) { + e.printStackTrace(); + } + } + + @Override + public final void end() throws IOException { + super.end(); + if (endPosition < 0) { + endPosition = 0; + } + int finalOffset = correctOffset(endPosition); + offsetAttr.setOffset(finalOffset, finalOffset); + piAttr.setPositionIncrement(piAttr.getPositionIncrement() + extraIncrement); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/BosonNLPAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/BosonNLPAnalysisBinderProcessor.java new file mode 100644 index 0000000..3136c6d --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/BosonNLPAnalysisBinderProcessor.java @@ -0,0 +1,21 @@ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor; + + +public class BosonNLPAnalysisBinderProcessor extends AnalysisBinderProcessor { + + /* + * It simply adds our analyzer provider class to a list of bindings. 
+ */ + @Override + public void processAnalyzers(AnalyzersBindings analyzersBindings) { + analyzersBindings.processAnalyzer(BosonNLPAnalyzerProvider.NAME, BosonNLPAnalyzerProvider.class); + } + + @Override + public void processTokenizers(TokenizersBindings tokenizersBindings) { + tokenizersBindings.processTokenizer(BosonNLPTokenizerFactory.NAME, BosonNLPTokenizerFactory.class); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/BosonNLPAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/BosonNLPAnalyzerProvider.java new file mode 100644 index 0000000..a81a321 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/BosonNLPAnalyzerProvider.java @@ -0,0 +1,44 @@ +package org.elasticsearch.index.analysis; + +import org.bosonnlp.analyzer.lucene.BosonNLPAnalyzer; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettingsService; + +public class BosonNLPAnalyzerProvider extends AbstractIndexAnalyzerProvider { + private final BosonNLPAnalyzer analyzer; + private String BOSONNLP_API_TOKEN; + private String TAG_URL; + private int spaceMode; + private int oovLevel; + private int t2s; + private int specialCharConv; + + /* + * Name to associate with this class. 
It will be used in BinderProcesser + */ + public static final String NAME = "bosonnlp"; + + @Inject + public BosonNLPAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { + + super(index, indexSettingsService.getSettings(), name, settings); + this.TAG_URL = settings.get("API_URL", "").toString(); + this.BOSONNLP_API_TOKEN = settings.get("API_TOKEN", "").toString(); + this.spaceMode = Integer.parseInt(settings.get("space_mode", "0")); + this.oovLevel = Integer.parseInt(settings.get("oov_level", "3")); + this.t2s = Integer.parseInt(settings.get("t2s", "0")); + this.specialCharConv = Integer.parseInt(settings.get("spechial_char_conv", "0")); + + this.analyzer = new BosonNLPAnalyzer(TAG_URL, BOSONNLP_API_TOKEN, spaceMode, oovLevel, t2s, specialCharConv); + + } + + @Override + public BosonNLPAnalyzer get() { + return this.analyzer; + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/BosonNLPTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/BosonNLPTokenizerFactory.java new file mode 100644 index 0000000..525cd4c --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/BosonNLPTokenizerFactory.java @@ -0,0 +1,53 @@ +package org.elasticsearch.index.analysis; + +import java.io.IOException; + +import org.apache.lucene.analysis.Tokenizer; +import org.bosonnlp.analyzer.lucene.BosonNLPTokenizer; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettingsService; +import org.json.JSONException; + +import com.mashape.unirest.http.exceptions.UnirestException; + +public class BosonNLPTokenizerFactory extends AbstractTokenizerFactory { + private final Settings settings; + private String BOSONNLP_API_TOKEN; + private String TAG_URL; + private int 
package org.elasticsearch.index.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.bosonnlp.analyzer.lucene.BosonNLPTokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.json.JSONException;

import com.mashape.unirest.http.exceptions.UnirestException;

/**
 * Factory that builds {@link BosonNLPTokenizer} instances from index settings.
 */
public class BosonNLPTokenizerFactory extends AbstractTokenizerFactory {

    /** Name this factory is registered under (see BosonNLPAnalysisBinderProcessor). */
    public static final String NAME = "bosonnlp";

    private final String tagUrl;
    private final String apiToken;
    private final int spaceMode;
    private final int oovLevel;
    private final int t2s;
    private final int specialCharConv;

    @Inject
    public BosonNLPTokenizerFactory(Index index, IndexSettingsService indexSettingsService,
            @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name, settings);
        // Parse the configuration once here instead of on every create() call;
        // settings.get(...) already returns a String, so no .toString() needed.
        this.tagUrl = settings.get("API_URL", "");
        this.apiToken = settings.get("API_TOKEN", "");
        this.spaceMode = Integer.parseInt(settings.get("space_mode", "0"));
        this.oovLevel = Integer.parseInt(settings.get("oov_level", "3"));
        this.t2s = Integer.parseInt(settings.get("t2s", "0"));
        // Prefer the correctly spelled key; fall back to the legacy misspelled
        // key so existing index configurations keep working.
        this.specialCharConv = Integer.parseInt(
                settings.get("special_char_conv", settings.get("spechial_char_conv", "0")));
    }

    /**
     * Creates a new tokenizer configured from this index's settings.
     *
     * @throws IllegalStateException if the tokenizer cannot be created. The
     *         original swallowed the exception and returned {@code null},
     *         which only deferred the failure to an NPE inside Lucene.
     */
    @Override
    public Tokenizer create() {
        try {
            return new BosonNLPTokenizer(tagUrl, apiToken, spaceMode, oovLevel, t2s, specialCharConv);
        } catch (IOException | JSONException | UnirestException e) {
            throw new IllegalStateException("Failed to create BosonNLP tokenizer", e);
        }
    }

}
package org.elasticsearch.indices.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.bosonnlp.analyzer.lucene.BosonNLPAnalyzer;
import org.bosonnlp.analyzer.lucene.BosonNLPTokenizer;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.json.JSONException;

import com.mashape.unirest.http.exceptions.UnirestException;

/**
 * Registers the indices-level (node-wide) "bosonnlp" analysis components:
 * a pre-built analyzer and a pre-built tokenizer factory.
 */
public class BosonNLPIndicesAnalysis extends AbstractComponent {

    @Inject
    public BosonNLPIndicesAnalysis(final Settings settings, IndicesAnalysisService indicesAnalysisService) {
        super(settings);
        // Read the shared BosonNLP configuration from node settings; final
        // locals so the anonymous TokenizerFactory below can capture them.
        final String tagUrl = settings.get("API_URL", "");
        final String apiToken = settings.get("API_TOKEN", "");
        final int spaceMode = Integer.parseInt(settings.get("space_mode", "0"));
        final int oovLevel = Integer.parseInt(settings.get("oov_level", "3"));
        final int t2s = Integer.parseInt(settings.get("t2s", "0"));
        // Prefer the correctly spelled key; fall back to the legacy misspelled
        // key so existing node configurations keep working.
        final int specialCharConv = Integer.parseInt(
                settings.get("special_char_conv", settings.get("spechial_char_conv", "0")));

        // Register the bosonnlp type analyzer.
        indicesAnalysisService.analyzerProviderFactories().put("bosonnlp",
                new PreBuiltAnalyzerProviderFactory("bosonnlp", AnalyzerScope.GLOBAL,
                        new BosonNLPAnalyzer(tagUrl, apiToken, spaceMode, oovLevel, t2s, specialCharConv)));

        // Register the bosonnlp type tokenizer.
        indicesAnalysisService.tokenizerFactories().put("bosonnlp",
                new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {

                    @Override
                    public String name() {
                        return "bosonnlp";
                    }

                    @Override
                    public Tokenizer create() {
                        try {
                            return new BosonNLPTokenizer(tagUrl, apiToken, spaceMode, oovLevel, t2s,
                                    specialCharConv);
                        } catch (JSONException | IOException | UnirestException e) {
                            // Fail loudly instead of returning null: a null
                            // Tokenizer would NPE later inside Lucene.
                            throw new IllegalStateException("Failed to create BosonNLP tokenizer", e);
                        }
                    }

                }));
    }

}
// ===== file: src/main/java/org/elasticsearch/indices/analysis/BosonNLPIndicesAnalysisModule.java =====
package org.elasticsearch.indices.analysis;

import org.elasticsearch.common.inject.AbstractModule;

/**
 * Guice module that eagerly instantiates {@link BosonNLPIndicesAnalysis} so
 * the node-level "bosonnlp" analysis components are registered at startup.
 */
public class BosonNLPIndicesAnalysisModule extends AbstractModule {

    @Override
    protected void configure() {
        bind(BosonNLPIndicesAnalysis.class).asEagerSingleton();
    }

}

// ===== file: src/main/java/org/elasticsearch/plugin/analysis/bosonnlp/AnalysisBosonNLPPlugin.java =====
package org.elasticsearch.plugin.analysis.bosonnlp;

import java.util.Collection;
import java.util.Collections;

import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.Module;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.BosonNLPAnalysisBinderProcessor;
import org.elasticsearch.indices.analysis.BosonNLPIndicesAnalysisModule;
import org.elasticsearch.plugins.Plugin;

/**
 * Elasticsearch plugin entry point for the BosonNLP analysis components.
 */
public class AnalysisBosonNLPPlugin extends Plugin {

    /** Node settings handed to the plugin by Elasticsearch. */
    private final Settings settings;

    public AnalysisBosonNLPPlugin(Settings settings) {
        this.settings = settings;
    }

    @Override
    public String name() {
        return "analysis-bosonnlp";
    }

    @Override
    public String description() {
        return "BosonNLP analysis plugin for elasticsearch.";
    }

    /** Registers the node-level analysis module (typed, not raw Collection). */
    @Override
    public Collection<Module> nodeModules() {
        return Collections.<Module>singletonList(new BosonNLPIndicesAnalysisModule());
    }

    /** Registers the index-level analyzer/tokenizer bindings. */
    public void onModule(AnalysisModule module) {
        module.addProcessor(new BosonNLPAnalysisBinderProcessor());
    }

}
singletonList(new BosonNLPIndicesAnalysisModule()); + } + + public void onModule(AnalysisModule module) { + module.addProcessor(new BosonNLPAnalysisBinderProcessor()); + } + +} diff --git a/src/main/resources/plugin-descriptor.properties b/src/main/resources/plugin-descriptor.properties new file mode 100644 index 0000000..24d1551 --- /dev/null +++ b/src/main/resources/plugin-descriptor.properties @@ -0,0 +1,82 @@ +# Elasticsearch plugin descriptor file +# This file must exist as 'plugin-descriptor.properties' at +# the root directory of all plugins. +# +# A plugin can be 'site', 'jvm', or both. +# +### example site plugin for "foo": +# +# foo.zip <-- zip file for the plugin, with this structure: +# _site/ <-- the contents that will be served +# plugin-descriptor.properties <-- example contents below: +# +# site=true +# description=My cool plugin +# version=1.0 +# +### example jvm plugin for "foo" +# +# foo.zip <-- zip file for the plugin, with this structure: +# .jar <-- classes, resources, dependencies +# .jar <-- any number of jars +# plugin-descriptor.properties <-- example contents below: +# +# jvm=true +# classname=foo.bar.BazPlugin +# description=My cool plugin +# version=2.0.0-rc1 +# elasticsearch.version=2.0 +# java.version=1.7 +# +### mandatory elements for all plugins: +# +# 'description': simple summary of the plugin +description=${project.description} +# +# 'version': plugin's version +version=${project.version} +# +# 'name': the plugin name +name=${elasticsearch.plugin.name} + +### mandatory elements for site plugins: +# +# 'site': set to true to indicate contents of the _site/ +# directory in the root of the plugin should be served. +# site=${elasticsearch.plugin.site} +# +### mandatory elements for jvm plugins : +# +# 'jvm': true if the 'classname' class should be loaded +# from jar files in the root directory of the plugin. +# Note that only jar files in the root directory are +# added to the classpath for the plugin! 
If you need +# other resources, package them into a resources jar. +jvm=${elasticsearch.plugin.jvm} +# +# 'classname': the name of the class to load, fully-qualified. +classname=${elasticsearch.plugin.classname} +# +# 'java.version' version of java the code is built against +# use the system property java.specification.version +# version string must be a sequence of nonnegative decimal integers +# separated by "."'s and may have leading zeros +java.version=${maven.compiler.target} +# +# 'elasticsearch.version' version of elasticsearch compiled against +# You will have to release a new version of the plugin for each new +# elasticsearch release. This version is checked when the plugin +# is loaded so Elasticsearch will refuse to start in the presence of +# plugins with the incorrect elasticsearch.version. +elasticsearch.version=${elasticsearch.version} +# +### deprecated elements for jvm plugins : +# +# 'isolated': true if the plugin should have its own classloader. +# passing false is deprecated, and only intended to support plugins +# that have hard dependencies against each other. If this is +# not specified, then the plugin is isolated by default. +isolated=${elasticsearch.plugin.isolated} +# + +# plugin=org.elasticsearch.plugin.analysis.bosonnlp.AnalysisBosonNLPPlugin diff --git a/src/test/java/org/elasticsearch/BosonNLPTest.java b/src/test/java/org/elasticsearch/BosonNLPTest.java new file mode 100644 index 0000000..0e21273 --- /dev/null +++ b/src/test/java/org/elasticsearch/BosonNLPTest.java @@ -0,0 +1,38 @@ +package org.elasticsearch; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +/** + * Unit test for Bosonnlp word segmenter.
+ */ +public class BosonNLPTest + extends TestCase +{ + /** + * Create the test case + * + * @param testName name of the test case + */ + public BosonNLPTest( String testName ) + { + super( testName ); + } + + /** + * @return the suite of tests being tested + */ + public static Test suite() + { + return new TestSuite( BosonNLPTest.class ); + } + + /** + * Rigourous Test :-) + */ + public void testApp() + { + assertTrue( true ); + } +}