From 3450c0aec12aae931f88493173b11ed39e4bfd53 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 30 Sep 2021 12:25:07 +0900 Subject: [PATCH] Python binding (#41) * Init with empty module * Add a naive wrapper * Add PyMorpheme * Add SplitMode class * Split files * Add annotations * Add Readme for python binding * Add license --- .gitignore | 5 + Cargo.lock | 223 +++++++++++++++++++++++++++++++++----- Cargo.toml | 1 + python/Cargo.toml | 21 ++++ python/MANIFEST.in | 2 + python/README.md | 37 +++++++ python/build-wheels.sh | 16 +++ python/pyproject.toml | 2 + python/setup.py | 26 +++++ python/src/dictionary.rs | 65 +++++++++++ python/src/lib.rs | 32 ++++++ python/src/morpheme.rs | 228 +++++++++++++++++++++++++++++++++++++++ python/src/tokenizer.rs | 131 ++++++++++++++++++++++ 13 files changed, 763 insertions(+), 26 deletions(-) create mode 100644 python/Cargo.toml create mode 100644 python/MANIFEST.in create mode 100644 python/README.md create mode 100644 python/build-wheels.sh create mode 100644 python/pyproject.toml create mode 100644 python/setup.py create mode 100644 python/src/dictionary.rs create mode 100644 python/src/lib.rs create mode 100644 python/src/morpheme.rs create mode 100644 python/src/tokenizer.rs diff --git a/.gitignore b/.gitignore index 158c85b1..5ee67833 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,8 @@ README*.html .idea/ .vscode/ +# python binding +python/dist +.env +*.egg-info +*.so \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 1bcee6ae..2b211417 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -134,11 +134,43 @@ dependencies = [ "libc", ] +[[package]] +name = "indoc" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47741a8bc60fb26eb8d6e0238bbb26d8575ff623fdc97b1a2c00c050b9684ed8" +dependencies = [ + "indoc-impl", + "proc-macro-hack", +] + +[[package]] +name = "indoc-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce046d161f000fffde5f432a0d034d0341dc152643b2598ed5bfce44c4f3a8f0" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", + "unindent", +] + +[[package]] +name = "instant" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "716d3d89f35ac6a34fd0eed635395f4c3b76fa889338a4632e5231a8684216bd" +dependencies = [ + "cfg-if", +] + [[package]] name = "itoa" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "join_katakana_oov" @@ -162,9 +194,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.99" +version = "0.2.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7f823d141fe0a24df1e23b4af4e3c7ba9e5966ec514ea068c93024aa7deb765" +checksum = "dd8f7255a17a627354f321ef0055d63b898c6fb27eff628af4d1b66b7331edf6" [[package]] name = "libloading" @@ -176,11 +208,20 @@ dependencies = [ "winapi", ] +[[package]] +name = "lock_api" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +dependencies = [ + "scopeguard", +] + [[package]] name = "memchr" -version = "2.4.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" [[package]] name = "memmap2" @@ -214,6 +255,56 @@ dependencies = [ "version_check", ] +[[package]] +name = "once_cell" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "paste" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45ca20c77d80be666aef2b45486da86238fabe33e38306bd3118fe4af33fa880" +dependencies = [ + "paste-impl", + "proc-macro-hack", +] + +[[package]] +name = "paste-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a7db200b97ef370c8e6de0088252f7e0dfff7d047a28528e47456c0fc98b6" +dependencies = [ + "proc-macro-hack", +] + [[package]] name = "ppv-lite86" version = "0.2.10" @@ -244,15 +335,69 @@ dependencies = [ "version_check", ] +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + [[package]] name = "proc-macro2" -version = "1.0.28" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" +checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" dependencies = [ "unicode-xid", ] +[[package]] +name = "pyo3" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35100f9347670a566a67aa623369293703322bb9db77d99d7df7313b575ae0c8" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "parking_lot", + "paste", + "pyo3-build-config", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d12961738cacbd7f91b7c43bc25cfeeaa2698ad07a04b3be0aa88b950865738f" +dependencies = [ + "once_cell", +] + +[[package]] +name = "pyo3-macros" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0bc5215d704824dfddddc03f93cb572e1155c68b6761c37005e1c288808ea8" +dependencies = [ + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71623fc593224afaab918aa3afcaf86ed2f43d34f6afde7f3922608f253240df" +dependencies = [ + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + [[package]] name = "quote" version = "1.0.9" @@ -343,20 +488,26 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + [[package]] name = "serde" -version = "1.0.127" +version = "1.0.130" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f03b9878abf6d14e6779d3f24f07b2cfa90352cfec4acc5aab8f1ac7f146fae8" +checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.127" +version = "1.0.130" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a024926d3432516606328597e0f224a51355a493b49fdd67e9209187cbe55ecc" +checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" dependencies = [ "proc-macro2", "quote", @@ -365,9 +516,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.66" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "336b10da19a12ad094b59d870ebde26a45402e5b470add4b5fd03c5048a32127" +checksum = "0f690853975602e1bfe1ccbf50504d67174e3bcf340f23b5ea9992e0587a52d8" dependencies = [ "itoa", "ryu", @@ -381,6 +532,12 @@ dependencies = [ "sudachi", ] +[[package]] +name = "smallvec" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" + [[package]] name = "strsim" version = "0.8.0" @@ -389,9 +546,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.22" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69b041cdcb67226aca307e6e7be44c8806423d83e018bd662360a93dabce4d71" +checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" dependencies = [ "clap", "lazy_static", @@ -400,9 +557,9 @@ dependencies = [ [[package]] name = "structopt-derive" -version = "0.4.15" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7813934aecf5f51a54775e00068c237de98489463968231a51746bbbc03f9c10" +checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" dependencies = [ "heck", "proc-macro-error", @@ -444,11 +601,19 @@ dependencies = [ "sudachi", ] +[[package]] +name = "sudachi-python" +version = "0.1.0" +dependencies = [ + "pyo3", + "sudachi", +] + [[package]] name = "syn" -version = "1.0.74" +version = "1.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c" +checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0" dependencies = [ "proc-macro2", "quote", @@ -480,18 +645,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.26" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93119e4feac1cbe6c798c34d3a53ea0026b0b1de6a120deef895137c0529bfe2" +checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.26" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "060d69a0afe7796bf42e9e2ff91f5ee691fb15c53d38b4b62a9a53eb23164745" +checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" dependencies = [ "proc-macro2", "quote", @@ -530,9 +695,9 @@ checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" [[package]] name = "unicode-width" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" +checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" [[package]] name = "unicode-xid" @@ -540,6 +705,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +[[package]] +name = "unindent" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f14ee04d9415b52b3aeab06258a3f07093182b88ba0f9b8d203f211a7a7d41c7" + [[package]] name = "vec_map" version = "0.8.2" diff --git a/Cargo.toml b/Cargo.toml index 999b801b..aea353c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "plugin/input_text/*", "plugin/oov/*", "plugin/path_rewrite/*", + "python" ] default-members = [ diff --git a/python/Cargo.toml b/python/Cargo.toml new file mode 100644 index 00000000..5a25e046 --- /dev/null +++ b/python/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "sudachi-python" +version = "0.1.0" +edition = "2018" +description = "Python bindings of sudachi.rs, the Japanese Morphological Analyzer" +homepage = "https://github.com/WorksApplications/sudachi.rs" +repository = "https://github.com/WorksApplications/sudachi.rs" +license = "Apache-2.0" + + +[lib] +name = "sudachi_python" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.14.3", features = ["extension-module"] } + +[dependencies.sudachi] +version = "*" +path = "../sudachi" + diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 00000000..7c68298b --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,2 @@ +include Cargo.toml +recursive-include src * diff --git a/python/README.md b/python/README.md new file mode 100644 index 00000000..cd12d180 --- /dev/null +++ b/python/README.md @@ -0,0 +1,37 @@ + +# sudachi.rs python + +This is the python binding of sudachi.rs. + + +# Caution + +This project is under development and specifications may change drastically. + + +# Setup + +1. Install python module `setuptools` and `setuptools-rust`. +2. Run `python3 setup.py develop`. + - `develop` will create a debug build, while `install` will create a release build. +3. Now you can import the module by `import sudachi.sudachi`. + +ref: [setuptools-rust](https://github.com/PyO3/setuptools-rust) + + +# Example + +```python +import sudachi.sudachi as ss + +dictionary = ss.Dictionary() +tokenizer = dictionary.create() +morphemes = tokenizer.tokenize("国会議事堂前駅") +print(morphemes[0].surface()) # '国会議事堂前駅' +print(morphemes[0].reading_form()) # 'コッカイギジドウマエエキ' +print(morphemes[0].part_of_speech()) # ['名詞', '固有名詞', '一般', '*', '*', '*'] + +tokenizer = dictionary.create(ss.SplitMode.A) +morphemes = tokenizer.tokenize("国会議事堂前駅") +print(list(map(str, morphemes))) # ['国会', '議事', '堂', '前', '駅'] +``` diff --git a/python/build-wheels.sh b/python/build-wheels.sh new file mode 100644 index 00000000..8bbca803 --- /dev/null +++ b/python/build-wheels.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -ex + +curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y +export PATH="$HOME/.cargo/bin:$PATH" + +cd /io + +for PYBIN in /opt/python/cp{35,36,37,38,39}*/bin; do + "${PYBIN}/pip" install -U setuptools wheel setuptools-rust + "${PYBIN}/python" setup.py bdist_wheel +done + +for whl in dist/*.whl; do + auditwheel repair "$whl" -w dist/ +done diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 00000000..31ffe048 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,2 @@ +[build-system] +requires = ["setuptools", "wheel", "setuptools-rust"] diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 00000000..8b7c4b17 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,26 @@ +# Copyright (c) 2019 Works Applications Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from setuptools import setup +from setuptools_rust import Binding, RustExtension + +setup( + name="sudachi", + version="0.1", + rust_extensions=[RustExtension("sudachi.sudachi", binding=Binding.PyO3)], + packages=["sudachi"], + package_dir={"": "py_src"}, + # rust extensions are not zip safe, just like C-extensions. + zip_safe=False, +) diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs new file mode 100644 index 00000000..873c34c6 --- /dev/null +++ b/python/src/dictionary.rs @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::path::PathBuf; +use std::sync::Arc; + +use pyo3::exceptions::PyException; +use pyo3::prelude::*; + +use sudachi::config::Config; +use sudachi::dic::dictionary::JapaneseDictionary; +use sudachi::stateless_tokeniser::StatelessTokenizer; + +use crate::tokenizer::{PySplitMode, PyTokenizer}; + +#[pyclass(module = "sudachi.dictionary", name = "Dictionary")] +#[pyo3(text_signature = "(config_path, resource_dir)")] +pub struct PyDictionary { + dictionary: Arc, +} + +#[pymethods] +impl PyDictionary { + /// Creates a sudachi dictionary + #[new] + #[args(config_path = "None", resource_dir = "None")] + fn new(config_path: Option, resource_dir: Option) -> PyResult { + let config = Config::new(config_path, resource_dir, None).map_err(|e| { + PyException::new_err(format!("Error loading config: {}", e.to_string())) + })?; + + let dictionary = Arc::new(JapaneseDictionary::from_cfg(&config).map_err(|e| { + PyException::new_err(format!( + "Error while constructing dictionary: {}", + e.to_string() + )) + })?); + + Ok(Self { dictionary }) + } + + /// Creates a sudachi tokenizer + #[pyo3(text_signature = "($self, mode)")] + #[args(mode = "None")] + fn create(&self, mode: Option) -> PyTokenizer { + let dictionary = self.dictionary.clone(); + let tokenizer = StatelessTokenizer::new(self.dictionary.clone()); + let mode = mode.unwrap_or(PySplitMode::C).into(); + + PyTokenizer::new(dictionary, tokenizer, mode) + } +} diff --git a/python/src/lib.rs b/python/src/lib.rs new file mode 100644 index 00000000..14b2b689 --- /dev/null +++ b/python/src/lib.rs @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2021 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use pyo3::prelude::*; + +pub mod dictionary; +pub mod morpheme; +pub mod tokenizer; + +/// module root +#[pymodule] +fn sudachi(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs new file mode 100644 index 00000000..d20beff8 --- /dev/null +++ b/python/src/morpheme.rs @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2021 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::sync::Arc; + +use pyo3::exceptions::PyException; +use pyo3::prelude::*; + +use sudachi::dic::dictionary::JapaneseDictionary; +use sudachi::dic::lexicon::word_infos::WordInfo; +use sudachi::dic::lexicon_set::LexiconSet; +use sudachi::prelude::*; + +use crate::tokenizer::PySplitMode; + +#[pyclass(module = "sudachi.morpheme", name = "Morpheme")] +#[derive(Clone)] +pub struct PyMorpheme { + // begin: usize, // Need a reconstruction of Morpheme + // end: usize, // Need a reconstruction of Morpheme + surface: String, + word_info: WordInfo, + is_oov: bool, + // word_id: u32, // Need a reconstruction of Morpheme + dictionary_id: i32, + dict: Arc, +} + +impl PyMorpheme { + pub fn new(m: Morpheme, dict: Arc) -> Self { + Self { + surface: m.surface().clone(), + word_info: m.word_info, + is_oov: m.is_oov, + dictionary_id: m.dictionary_id, + dict, + } + } +} + +#[pyproto] +impl pyo3::basic::PyObjectProtocol for PyMorpheme { + fn __str__(&self) -> PyResult { + Ok(self.surface.clone()) + } +} + +#[pymethods] +impl PyMorpheme { + // /// Returns the begin index of this in the input text + // #[pyo3(text_signature = "($self)")] + // fn begin(&self) -> usize { + // self.begin + // } + + // /// Returns the end index of this in the input text + // #[pyo3(text_signature = "($self)")] + // fn end(&self) -> usize { + // self.end + // } + + /// Returns the surface + #[pyo3(text_signature = "($self)")] + fn surface(&self) -> &str { + &self.surface + } + + /// Returns the part of speech + #[pyo3(text_signature = "($self)")] + fn part_of_speech(&self) -> Vec { + self.dict + .grammar() + .pos_list + .get(self.part_of_speech_id() as usize) + .unwrap() + .clone() + } + + /// Returns the id of the part of speech in the dictionary + #[pyo3(text_signature = "($self)")] + fn part_of_speech_id(&self) -> u16 { + self.word_info.pos_id + } + + /// Returns the dictionary form + #[pyo3(text_signature = "($self)")] + fn dictionary_form(&self) -> &str { + &self.word_info.dictionary_form + } + + /// Returns the normalized form + #[pyo3(text_signature = "($self)")] + fn normalized_form(&self) -> &str { + &self.word_info.normalized_form + } + + /// Returns the reading form + #[pyo3(text_signature = "($self)")] + fn reading_form(&self) -> &str { + &self.word_info.reading_form + } + + /// Returns a list of morphemes splitting itself with given split mode + #[pyo3(text_signature = "($self, mode, /)")] + fn split(&self, mode: PySplitMode) -> PyResult> { + let word_ids = match mode { + PySplitMode::A => &self.word_info.a_unit_split, + PySplitMode::B => &self.word_info.b_unit_split, + PySplitMode::C => return Ok(vec![self.clone()]), + _ => return Err(PyException::new_err(format!("Error invalid SplitMode",))), + }; + + if word_ids.len() < 2 { + return Ok(vec![self.clone()]); + } + + let mut morphemes = Vec::with_capacity(word_ids.len()); + for &wid in word_ids { + let word_info = self.dict.lexicon().get_word_info(wid).map_err(|e| { + PyException::new_err(format!("Error while getting word_info: {}", e.to_string())) + })?; + + morphemes.push(PyMorpheme { + surface: word_info.surface.clone(), + word_info, + is_oov: false, + dictionary_id: LexiconSet::get_dictionary_id(wid) as i32, + dict: self.dict.clone(), + }); + } + + Ok(morphemes) + } + + /// Returns whether if this is out of vocabulary word + #[pyo3(text_signature = "($self)")] + fn is_oov(&self) -> bool { + self.is_oov + } + + // /// Returns word id of this word in the dictionary + // #[pyo3(text_signature = "($self)")] + // fn word_id(&self) -> u32 { + // self.word_id + // } + + /// Returns the dictionary id which this word belongs + #[pyo3(text_signature = "($self)")] + fn dictionary_id(&self) -> i32 { + self.dictionary_id + } + + /// Returns the list of synonym group ids + #[pyo3(text_signature = "($self)")] + fn synonym_group_ids(&self) -> Vec { + self.word_info.synonym_group_ids.clone() + } + + /// Returns the word info + #[pyo3(text_signature = "($self)")] + fn get_word_info(&self) -> PyWordInfo { + self.word_info.clone().into() + } +} + +#[pyclass(module = "sudachi.wordinfo", name = "WordInfo")] +pub struct PyWordInfo { + #[pyo3(get)] + surface: String, + #[pyo3(get)] + head_word_length: u16, + #[pyo3(get)] + pos_id: u16, + #[pyo3(get)] + normalized_form: String, + #[pyo3(get)] + dictionary_form_word_id: i32, + #[pyo3(get)] + dictionary_form: String, + #[pyo3(get)] + reading_form: String, + #[pyo3(get)] + a_unit_split: Vec, + #[pyo3(get)] + b_unit_split: Vec, + #[pyo3(get)] + word_structure: Vec, + #[pyo3(get)] + synonym_group_ids: Vec, +} + +impl From for PyWordInfo { + fn from(word_info: WordInfo) -> Self { + Self { + surface: word_info.surface, + head_word_length: word_info.head_word_length, + pos_id: word_info.pos_id, + normalized_form: word_info.normalized_form, + dictionary_form_word_id: word_info.dictionary_form_word_id, + dictionary_form: word_info.dictionary_form, + reading_form: word_info.reading_form, + a_unit_split: word_info.a_unit_split, + b_unit_split: word_info.b_unit_split, + word_structure: word_info.word_structure, + synonym_group_ids: word_info.synonym_group_ids, + } + } +} + +#[pymethods] +impl PyWordInfo { + fn length(&self) -> u16 { + self.head_word_length + } +} diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs new file mode 100644 index 00000000..1183dcb9 --- /dev/null +++ b/python/src/tokenizer.rs @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2021 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::sync::Arc; + +use pyo3::exceptions::PyException; +use pyo3::prelude::*; + +use sudachi::dic::dictionary::JapaneseDictionary; +use sudachi::prelude::*; +use sudachi::stateless_tokeniser::StatelessTokenizer; + +use crate::morpheme::PyMorpheme; + +/// Unit to split text +/// +/// This implementation is a workaround. Waiting for the pyo3 enum feature. +/// ref: [PyO3 issue #834](https://github.com/PyO3/pyo3/issues/834). +#[pyclass(module = "sudachi.tokenizer", name = "SplitMode")] +#[derive(Clone, PartialEq, Eq)] +pub struct PySplitMode { + mode: u8, +} + +#[pymethods] +impl PySplitMode { + #[classattr] + pub const A: Self = Self { mode: 0 }; + + #[classattr] + pub const B: Self = Self { mode: 1 }; + + #[classattr] + pub const C: Self = Self { mode: 2 }; +} + +impl From for PySplitMode { + fn from(mode: Mode) -> Self { + match mode { + Mode::A => PySplitMode::A, + Mode::B => PySplitMode::B, + Mode::C => PySplitMode::C, + } + } +} + +impl From for Mode { + fn from(mode: PySplitMode) -> Self { + match mode { + PySplitMode::A => Mode::A, + PySplitMode::B => Mode::B, + _ => Mode::C, + } + } +} + +impl std::str::FromStr for PySplitMode { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "A" | "a" => Ok(PySplitMode::A), + "B" | "b" => Ok(PySplitMode::B), + "C" | "c" => Ok(PySplitMode::C), + _ => Err("Mode must be one of \"A\", \"B\", or \"C\" (in lower or upper case)."), + } + } +} + +#[pyclass(module = "sudachi.tokenizer", name = "Tokenizer")] +pub struct PyTokenizer { + dictionary: Arc, + tokenizer: StatelessTokenizer>, + mode: Mode, +} + +impl PyTokenizer { + pub fn new( + dictionary: Arc, + tokenizer: StatelessTokenizer>, + mode: Mode, + ) -> Self { + Self { + dictionary, + tokenizer, + mode, + } + } +} + +#[pymethods] +impl PyTokenizer { + /// Break text into morphemes + #[pyo3(text_signature = "($self, text, /, mode, enable_debug)")] + #[args(text, mode = "None", enable_debug = "None")] + fn tokenize( + &self, + text: &str, + mode: Option, + enable_debug: Option, // want to take logger instead of debug flag + ) -> PyResult> { + let mode: Mode = match mode { + Some(m) => m.into(), + None => self.mode, + }; + + let morphemes = self + .tokenizer + .tokenize(text, mode, enable_debug.unwrap_or(false)) + .map_err(|e| { + PyException::new_err(format!("Error while tokenization: {}", e.to_string())) + })? + .into_iter() + .map(|m| PyMorpheme::new(m, self.dictionary.clone())) + .collect(); + + Ok(morphemes) + } +}