From 113ad4cea2c77ea08ed097a299fa08ebb129da01 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Tue, 4 Jun 2024 11:01:37 +0900 Subject: [PATCH 01/24] update pyo3 to v0.21 --- python/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index e1143743..4c5513d9 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -15,7 +15,7 @@ name = "sudachipy" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.20", features = ["extension-module"] } +pyo3 = { version = "0.21", features = ["extension-module", "gil-refs"] } thread_local = "1.1" # Apache 2.0/MIT scopeguard = "1" # Apache 2.0/MIT From 4d0d1c0d253af545e0a6006b012e2d74ca0fff59 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Tue, 4 Jun 2024 16:07:09 +0900 Subject: [PATCH 02/24] use Bound instead of PyCell --- python/src/dictionary.rs | 24 ++++++++++++------------ python/src/morpheme.rs | 10 +++++----- python/src/pos_matcher.rs | 4 ++-- python/src/pretokenizer.rs | 12 ++++++------ python/src/tokenizer.rs | 8 ++++---- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index bc333c8e..9a6f062d 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -319,7 +319,7 @@ impl PyDictionary { let projector = resolve_projection(passed, &dict.projection); let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector); - let internal_cell = PyCell::new(py, internal)?; + let internal_cell = Bound::new(py, internal)?; let module = py.import("tokenizers.pre_tokenizers")?; module .getattr("PreTokenizer")? 
@@ -340,18 +340,18 @@ impl PyDictionary { /// :type surface: str /// :type out: sudachipy.MorphemeList #[pyo3(text_signature = "($self, surface, out = None) -> sudachipy.MorphemeList")] - fn lookup<'p>( - &'p self, - py: Python<'p>, - surface: &'p str, - out: Option<&'p PyCell<PyMorphemeListWrapper>>, - ) -> PyResult<&'p PyCell<PyMorphemeListWrapper>> { + fn lookup<'py>( + &'py self, + py: Python<'py>, + surface: &'py str, + out: Option<Bound<'py, PyMorphemeListWrapper>>, + ) -> PyResult<Bound<'py, PyMorphemeListWrapper>> { let l = match out { Some(l) => l, - None => PyCell::new( - py, - PyMorphemeListWrapper::new(self.dictionary.clone().unwrap()), - )?, + None => { + let list = PyMorphemeListWrapper::new(self.dictionary.clone().unwrap()); + Bound::new(py, list)? + } }; // this needs to be a variable diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index ad3929dd..1c8cf553 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -124,7 +124,7 @@ impl PyMorphemeListWrapper { self.size(py) } - fn __getitem__(slf: &PyCell<PyMorphemeListWrapper>, mut idx: isize) -> PyResult<PyMorpheme> { + fn __getitem__(slf: Bound<PyMorphemeListWrapper>, mut idx: isize) -> PyResult<PyMorpheme> { let list = slf.borrow(); let py = slf.py(); let len = list.size(py) as isize; @@ -362,9 +362,9 @@ impl PyMorpheme { &'py self, py: Python<'py>, mode: &PyAny, - out: Option<&'py PyCell<PyMorphemeListWrapper>>, + out: Option<Bound<'py, PyMorphemeListWrapper>>, add_single: Option<bool>, - ) -> PyResult<&'py PyCell<PyMorphemeListWrapper>> { + ) -> PyResult<Bound<'py, PyMorphemeListWrapper>> { let list = self.list(py); let mode = extract_mode(py, mode)?; @@ -372,7 +372,7 @@ impl PyMorpheme { let out_cell = match out { None => { let list = list.empty_clone(py); - PyCell::new(py, list)? + Bound::new(py, list)? } Some(r) => r, }; diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 7c6a884d..062d0d0c 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -210,7 +210,7 @@ impl PyPosIter { #[pymethods] impl PyPosIter { - fn __iter__(slf: &PyCell<Self>) -> &PyCell<Self> { + fn __iter__(slf: Bound<Self>) -> Bound<Self> { slf } diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index 755f040b..303f7645 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -157,11 +157,11 @@ impl PyPretokenizer { } /// Entry function for tokenization - pub fn pre_tokenize<'p>( - self_: &'p PyCell<Self>, - py: Python<'p>, - data: &'p PyAny, - ) -> PyResult<&'p PyAny> { + pub fn pre_tokenize<'py>( + self_: Bound<'py, Self>, + py: Python<'py>, + data: &'py PyAny, + ) -> PyResult<&'py PyAny> { data.call_method1("split", PyTuple::new(py, [self_])) } } diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 558d02cb..5f364380 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -138,8 +138,8 @@ impl PyTokenizer { text: &'py str, mode: Option<&PyAny>, logger: Option<PyObject>, - out: Option<&'py PyCell<PyMorphemeListWrapper>>, - ) -> PyResult<&'py PyCell<PyMorphemeListWrapper>> { + out: Option<Bound<'py, PyMorphemeListWrapper>>, + ) -> PyResult<Bound<PyMorphemeListWrapper>> { // restore default mode on scope exit let mode = match mode { None => None, @@ -164,7 +164,7 @@ impl PyTokenizer { let morphemes = MorphemeList::empty(dict); let wrapper = PyMorphemeListWrapper::from_components(morphemes, self.projection.clone()); - PyCell::new(py, wrapper)? + Bound::new(py, wrapper)? 
} Some(list) => list, }; From 2787346223d129f59adb1fc690789373acecd163 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Thu, 6 Jun 2024 10:34:58 +0900 Subject: [PATCH 03/24] deactivate gil-ref feature and fix related deprecation warnings --- python/Cargo.toml | 2 +- python/src/build.rs | 100 ++++++++++++++++++++++--------------- python/src/dictionary.rs | 83 +++++++++++++++--------------- python/src/lib.rs | 4 +- python/src/morpheme.rs | 36 ++++++------- python/src/pos_matcher.rs | 23 +++++---- python/src/pretokenizer.rs | 53 ++++++++++---------- python/src/projection.rs | 35 ++++++------- python/src/tokenizer.rs | 2 +- 9 files changed, 183 insertions(+), 155 deletions(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index 4c5513d9..6e564c2e 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -15,7 +15,7 @@ name = "sudachipy" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.21", features = ["extension-module", "gil-refs"] } +pyo3 = { version = "0.21", features = ["extension-module"] } thread_local = "1.1" # Apache 2.0/MIT scopeguard = "1" # Apache 2.0/MIT diff --git a/python/src/build.rs b/python/src/build.rs index a6005b26..40e52c34 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,17 +26,17 @@ use sudachi::config::Config; use sudachi::dic::build::{DataSource, DictBuilder}; use sudachi::dic::dictionary::JapaneseDictionary; -pub fn register_functions(m: &PyModule) -> PyResult<()> { +pub fn register_functions(m: &Bound<PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(build_system_dic, m)?)?; m.add_function(wrap_pyfunction!(build_user_dic, m)?)?; Ok(()) } -fn to_stats<T: DictionaryAccess>(py: Python, builder: DictBuilder<T>) -> PyResult<&PyList> { - let stats = PyList::empty(py); +fn to_stats<T: DictionaryAccess>(py: Python, builder: DictBuilder<T>) -> PyResult<Bound<PyList>> { + let stats = PyList::empty_bound(py); for p in builder.report() { - let t = PyTuple::new( + let t = PyTuple::new_bound( py, [ p.part().into_py(py), @@ -60,23 +60,26 @@ fn create_file(p: &Path) -> std::io::Result<File> { #[pyfunction] #[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")] -fn build_system_dic<'p>( - py: Python<'p>, - matrix: &'p PyAny, - lex: &'p PyList, - output: &'p PyAny, +fn build_system_dic<'py>( + py: Python<'py>, + matrix: &Bound<'py, PyAny>, + lex: &Bound<'py, PyList>, + output: &Bound<'py, PyAny>, description: Option<&str>, -) -> PyResult<&'p PyList> { +) -> PyResult<Bound<'py, PyList>> { let mut builder = DictBuilder::new_system(); description.map(|d| builder.set_description(d)); - let matrix_src = as_data_source(py, matrix)?; + let matrix_path = resolve_as_pypathstr(py, matrix)?; + let matrix_src = as_data_source(matrix_path.as_ref(), matrix)?; errors::wrap_ctx(builder.read_conn(matrix_src), matrix)?; for f in lex.iter() { - let lex_src = as_data_source(py, &f)?; + let lex_path = resolve_as_pypathstr(py, &f)?; + let lex_src = as_data_source(lex_path.as_ref(), &f)?; errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?; } - let out_file = match as_data_source(py, output)? { + let out_path = resolve_as_pypathstr(py, output)?; + let out_file = match as_data_source(out_path.as_ref(), output)? 
{ DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?, DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")), }; @@ -89,14 +92,15 @@ fn build_system_dic<'p>( #[pyfunction] #[pyo3(text_signature = "(system, lex, output, description=None) -> list")] -fn build_user_dic<'p>( - py: Python<'p>, - system: &'p PyAny, - lex: &'p PyList, - output: &'p PyAny, +fn build_user_dic<'py>( + py: Python<'py>, + system: &Bound<'py, PyAny>, + lex: &Bound<'py, PyList>, + output: &Bound<'py, PyAny>, description: Option<&str>, -) -> PyResult<&'p PyList> { - let system_dic = match as_data_source(py, system)? { +) -> PyResult<Bound<'py, PyList>> { + let system_path = resolve_as_pypathstr(py, system)?; + let system_dic = match as_data_source(system_path.as_ref(), system)? { DataSource::File(f) => { let resource_path = get_default_resource_dir(py)?; let cfg = Config::minimal_at(resource_path).with_system_dic(f); @@ -113,10 +117,12 @@ fn build_user_dic<'p>( description.map(|d| builder.set_description(d)); for f in lex.iter() { - let lex_src = as_data_source(py, &f)?; + let lex_path = resolve_as_pypathstr(py, &f)?; + let lex_src = as_data_source(lex_path.as_ref(), &f)?; errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?; } - let out_file = match as_data_source(py, output)? { + let out_path = resolve_as_pypathstr(py, output)?; + let out_file = match as_data_source(out_path.as_ref(), output)? { DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?, DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")), }; @@ -127,25 +133,39 @@ fn build_user_dic<'p>( to_stats(py, builder) } -fn as_data_source<'p>(py: Python<'p>, data: &'p PyAny) -> PyResult<DataSource<'p>> { - let path = py - .import("pathlib")? - .getattr("Path")? 
- .downcast::<PyType>()?; +fn resolve_as_pypathstr<'py>( + py: Python<'py>, + data: &Bound<'py, PyAny>, +) -> PyResult<Option<Bound<'py, PyString>>> { + let binding = py.import_bound("pathlib")?.getattr("Path")?; + let path = binding.downcast::<PyType>()?; if data.is_instance(path)? { - let pypath = data.call_method0("resolve")?.str()?; - Ok(DataSource::File(Path::new(pypath.to_str()?))) + Ok(Some(data.call_method0("resolve")?.str()?)) } else if data.is_instance_of::<PyString>() { - let pypath = data.str()?; - Ok(DataSource::File(Path::new(pypath.to_str()?))) - } else if data.is_instance_of::<PyBytes>() { - let data = data.downcast::<PyBytes>()?; - Ok(DataSource::Data(data.as_bytes())) + Ok(Some(data.str()?)) } else { - Err(pyo3::exceptions::PyValueError::new_err(format!( - "data source should can be only Path, bytes or str, was {}: {}", - data, - data.get_type() - ))) + Ok(None) + } +} + +fn as_data_source<'py>( + resolved_path: Option<&'py Bound<'py, PyString>>, + original_obj: &'py Bound<'py, PyAny>, +) -> PyResult<DataSource<'py>> { + match resolved_path { + Some(pystr) => Ok(DataSource::File(Path::new(pystr.to_str()?))), + None => { + if original_obj.is_instance_of::<PyBytes>() { + Ok(DataSource::Data( + original_obj.downcast::<PyBytes>()?.as_bytes(), + )) + } else { + Err(pyo3::exceptions::PyValueError::new_err(format!( + "data source should can be only Path, bytes or str, was {}: {}", + original_obj, + original_obj.get_type() + ))) + } + } } } diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 9a6f062d..251267ab 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -103,11 +103,11 @@ impl PyDictionary { #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))] fn new( py: Python, - config_path: Option<&PyAny>, + config_path: Option<&Bound<PyAny>>, resource_dir: Option<PathBuf>, dict: Option<&str>, dict_type: Option<&str>, - config: Option<&PyAny>, + config: 
Option<&Bound<PyAny>>, ) -> PyResult<Self> { if config.is_some() && config_path.is_some() { return Err(SudachiErr::new_err("Both config and config_path options were specified at the same time, use one of them")); @@ -131,10 +131,10 @@ impl PyDictionary { }; if dict_type.is_some() { - let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; - PyErr::warn( + let cat = PyModule::import_bound(py, "builtins")?.getattr("DeprecationWarning")?; + PyErr::warn_bound( py, - cat, + &cat, "Parameter dict_type of Dictionary() is deprecated, use dict instead", 1, )?; @@ -189,7 +189,7 @@ impl PyDictionary { .pos_list .iter() .map(|pos| { - let tuple: Py<PyTuple> = PyTuple::new(py, pos).into_py(py); + let tuple: Py<PyTuple> = PyTuple::new_bound(py, pos).into_py(py); tuple }) .collect(); @@ -226,9 +226,9 @@ impl PyDictionary { fn create<'py>( &'py self, py: Python<'py>, - mode: Option<&'py PyAny>, - fields: Option<&'py PySet>, - projection: Option<&'py PyString>, + mode: Option<&Bound<'py, PyAny>>, + fields: Option<&Bound<'py, PySet>>, + projection: Option<&Bound<'py, PyString>>, ) -> PyResult<PyTokenizer> { let mode = match mode { Some(m) => extract_mode(py, m)?, @@ -263,7 +263,11 @@ impl PyDictionary { /// /// :param target: can be either a callable or list of POS partial tuples #[pyo3(text_signature = "($self, target)")] - fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> { + fn pos_matcher<'py>( + &'py self, + py: Python<'py>, + target: &Bound<'py, PyAny>, + ) -> PyResult<PyPosMatcher> { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) } @@ -286,21 +290,21 @@ impl PyDictionary { text_signature = "($self, mode, fields, handler) -> tokenizers.PreTokenizer", signature = (mode = None, fields = None, handler = None, *, projection = None) )] - fn pre_tokenizer<'p>( - &'p self, - py: Python<'p>, - mode: Option<&PyAny>, - fields: Option<&PySet>, + fn pre_tokenizer<'py>( + &'py self, + py: Python<'py>, + mode: 
Option<&Bound<'py, PyAny>>, + fields: Option<&Bound<'py, PySet>>, handler: Option<Py<PyAny>>, - projection: Option<&PyString>, - ) -> PyResult<&'p PyAny> { + projection: Option<&Bound<'py, PyString>>, + ) -> PyResult<Bound<'py, PyAny>> { let mode = match mode { Some(m) => extract_mode(py, m)?, None => Mode::C, }; let subset = parse_field_subset(fields)?; if let Some(h) = handler.as_ref() { - if !h.as_ref(py).is_callable() { + if !h.bind(py).is_callable() { return Err(SudachiErr::new_err("handler must be callable")); } } @@ -320,11 +324,11 @@ impl PyDictionary { let projector = resolve_projection(passed, &dict.projection); let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector); let internal_cell = Bound::new(py, internal)?; - let module = py.import("tokenizers.pre_tokenizers")?; + let module = py.import_bound("tokenizers.pre_tokenizers")?; module .getattr("PreTokenizer")? .getattr("custom")? - .call1(PyTuple::new(py, [internal_cell])) + .call1(PyTuple::new_bound(py, [internal_cell])) } /// Look up morphemes in the binary dictionary without performing the analysis. 
@@ -374,9 +378,9 @@ impl PyDictionary { /// Get POS Tuple by its id #[pyo3(text_signature = "($self, pos_id: int)")] - fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { + fn pos_of<'py>(&'py self, py: Python<'py>, pos_id: usize) -> Option<&Bound<'py, PyTuple>> { let dic = self.dictionary.as_ref().unwrap(); - dic.pos.get(pos_id).map(|x| x.as_ref(py)) + dic.pos.get(pos_id).map(|x| x.bind(py)) } fn __repr__(&self) -> PyResult<String> { @@ -411,10 +415,9 @@ fn config_repr(cfg: &Config) -> Result<String, std::fmt::Error> { Ok(result) } -pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &'py PyAny) -> PyResult<Mode> { +pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &Bound<'py, PyAny>) -> PyResult<Mode> { if mode.is_instance_of::<PyString>() { - let mode = mode.str()?.to_str()?; - Mode::from_str(mode).map_err(|e| SudachiErr::new_err(e).into()) + Mode::from_str(mode.str()?.to_str()?).map_err(|e| SudachiErr::new_err(e).into()) } else if mode.is_instance_of::<PySplitMode>() { let mode = mode.extract::<PySplitMode>()?; Ok(Mode::from(mode)) @@ -427,9 +430,10 @@ fn read_config_from_fs(path: Option<&Path>) -> PyResult<ConfigBuilder> { wrap(ConfigBuilder::from_opt_file(path)) } -fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> { +fn read_config(config_opt: &Bound<PyAny>) -> PyResult<ConfigBuilder> { if config_opt.is_instance_of::<PyString>() { - let config_str = config_opt.str()?.to_str()?.trim(); + let config_pystr = config_opt.str()?; + let config_str = config_pystr.to_str()?.trim(); // looks like json if config_str.starts_with("{") && config_str.ends_with("}") { let result = ConfigBuilder::from_bytes(config_str.as_bytes()); @@ -445,10 +449,10 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> { ))); } let py = config_opt.py(); - let cfg_type = py.import("sudachipy.config")?.getattr("Config")?; - if config_opt.is_instance(cfg_type)? 
{ + let cfg_type = py.import_bound("sudachipy.config")?.getattr("Config")?; + if config_opt.is_instance(&cfg_type)? { let cfg_as_str = config_opt.call_method0("as_jsons")?; - return read_config(cfg_as_str); + return read_config(&cfg_as_str); } Err(SudachiErr::new_err(( format!("passed config was not a string, json object or sudachipy.config.Config object"), @@ -457,24 +461,22 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> { } pub(crate) fn read_default_config(py: Python) -> PyResult<ConfigBuilder> { - let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?; + let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?; let path = path.downcast::<PyString>()?.to_str()?; let path = PathBuf::from(path); wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path) } pub(crate) fn get_default_resource_dir(py: Python) -> PyResult<PathBuf> { - let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?; + let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?; let path = path.downcast::<PyString>()?.to_str()?; Ok(PathBuf::from(path)) } fn find_dict_path(py: Python, dict_type: &str) -> PyResult<PathBuf> { - let pyfunc = PyModule::import(py, "sudachipy")?.getattr("_find_dict_path")?; - let path = pyfunc - .call1((dict_type,))? - .downcast::<PyString>()? 
- .to_str()?; + let pyfunc = PyModule::import_bound(py, "sudachipy")?.getattr("_find_dict_path")?; + let path = pyfunc.call1((dict_type,))?; + let path = path.downcast::<PyString>()?.to_str()?; Ok(PathBuf::from(path)) } @@ -491,15 +493,14 @@ fn locate_system_dict(py: Python, path: &Path) -> PyResult<PathBuf> { } } -fn parse_field_subset(data: Option<&PySet>) -> PyResult<InfoSubset> { +fn parse_field_subset(data: Option<&Bound<PySet>>) -> PyResult<InfoSubset> { if data.is_none() { return Ok(InfoSubset::all()); } let mut subset = InfoSubset::empty(); - for el in data.unwrap().iter() { - let s = el.str()?.to_str()?; - subset |= match s { + for elem in data.unwrap().iter() { + subset |= match elem.str()?.to_str()? { "surface" => InfoSubset::SURFACE, "pos" | "pos_id" => InfoSubset::POS_ID, "normalized_form" => InfoSubset::NORMALIZED_FORM, diff --git a/python/src/lib.rs b/python/src/lib.rs index 68a9c91d..f2c13703 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,7 +28,7 @@ mod word_info; /// module root #[pymodule] -fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> { +fn sudachipy(_py: Python, m: &Bound<PyModule>) -> PyResult<()> { m.add_class::<dictionary::PyDictionary>()?; m.add_class::<tokenizer::PySplitMode>()?; m.add_class::<tokenizer::PyTokenizer>()?; diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 1c8cf553..69418d32 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -91,11 +91,11 @@ impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary #[classmethod] #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")] - fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> { - let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; - PyErr::warn( + fn empty(_cls: &Bound<PyType>, py: Python, dict: &PyDictionary) -> PyResult<Self> { + let cat = PyModule::import_bound(py, "builtins")?.getattr("DeprecationWarning")?; + PyErr::warn_bound( py, - cat, + &cat, "Use Tokenizer.tokenize(\"\") if you need an empty MorphemeList.", 1, )?; @@ -150,7 +150,7 @@ impl PyMorphemeListWrapper { }) } - fn __str__<'py>(&'py self, py: Python<'py>) -> &PyString { + fn __str__<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { // do a simple tokenization __str__ let list = self.internal(py); let mut result = String::with_capacity(list.surface().len() * 2); @@ -161,10 +161,10 @@ impl PyMorphemeListWrapper { result.push_str(" "); } } - PyString::new(py, result.as_str()) + PyString::new_bound(py, result.as_str()) } - fn __repr__(slf: Py<PyMorphemeListWrapper>, py: Python) -> PyResult<&PyString> { + fn __repr__(slf: Py<PyMorphemeListWrapper>, py: Python) -> PyResult<Bound<PyString>> { let self_ref = slf.borrow(py); let list = self_ref.internal(py); let mut result = String::with_capacity(list.surface().len() * 10); @@ -182,7 +182,7 @@ impl PyMorphemeListWrapper { result.push_str(",\n"); } result.push_str("]>"); 
- Ok(PyString::new(py, result.as_str())) + Ok(PyString::new_bound(py, result.as_str())) } fn __iter__(slf: Py<Self>) -> PyMorphemeIter { @@ -292,19 +292,19 @@ impl PyMorpheme { /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured #[pyo3(text_signature = "($self) -> str")] - fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { + fn surface<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { let list = self.list(py); let morph = self.morph(py); match list.projection() { - None => PyString::new(py, morph.surface().deref()), + None => PyString::new_bound(py, morph.surface().deref()), Some(proj) => proj.project(morph.deref(), py), } } /// Returns the substring of input text corresponding to the morpheme regardless the configured projection #[pyo3(text_signature = "($self) -> str")] - fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { - PyString::new(py, self.morph(py).surface().deref()) + fn raw_surface<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { + PyString::new_bound(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. 
@@ -361,7 +361,7 @@ impl PyMorpheme { fn split<'py>( &'py self, py: Python<'py>, - mode: &PyAny, + mode: &Bound<'py, PyAny>, out: Option<Bound<'py, PyMorphemeListWrapper>>, add_single: Option<bool>, ) -> PyResult<Bound<'py, PyMorphemeListWrapper>> { @@ -424,17 +424,17 @@ impl PyMorpheme { /// Returns the list of synonym group ids #[pyo3(text_signature = "($self) -> List[int]")] - fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { + fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> Bound<PyList> { let mref = self.morph(py); let ids = mref.get_word_info().synonym_group_ids(); - PyList::new(py, ids) + PyList::new_bound(py, ids) } /// Returns the word info #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")] fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> { - let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; - PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?; + let cat = PyModule::import_bound(py, "builtins")?.getattr("DeprecationWarning")?; + PyErr::warn_bound(py, &cat, "Users should not touch the raw WordInfo.", 1)?; Ok(self.morph(py).get_word_info().clone().into()) } @@ -445,7 +445,7 @@ impl PyMorpheme { m.end_c() - m.begin_c() } - pub fn __str__<'py>(&'py self, py: Python<'py>) -> &'py PyString { + pub fn __str__<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { self.surface(py) } diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 062d0d0c..f0753f4b 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -36,20 +36,20 @@ impl PyPosMatcher { pub(crate) fn create<'py>( py: Python<'py>, dic: &'py Arc<PyDicData>, - data: &'py PyAny, + data: &Bound<'py, PyAny>, ) -> PyResult<PyPosMatcher> { if data.is_callable() { Self::create_from_fn(dic, data, py) } else { let iter = data.iter()?; - Self::create_from_iter(dic, iter) + Self::create_from_iter(dic, &iter) } } - fn create_from_fn(dic: &Arc<PyDicData>, func: &PyAny, py: Python) -> 
PyResult<Self> { + fn create_from_fn(dic: &Arc<PyDicData>, func: &Bound<PyAny>, py: Python) -> PyResult<Self> { let mut data = Vec::new(); for (pos_id, pos) in dic.pos.iter().enumerate() { - let args = PyTuple::new(py, &[pos]); + let args = PyTuple::new_bound(py, &[pos]); if func.call1(args)?.downcast::<PyBool>()?.is_true() { data.push(pos_id as u16); } @@ -60,10 +60,11 @@ impl PyPosMatcher { }) } - fn create_from_iter(dic: &Arc<PyDicData>, data: &PyIterator) -> PyResult<Self> { + fn create_from_iter(dic: &Arc<PyDicData>, data: &Bound<PyIterator>) -> PyResult<Self> { let mut result = Vec::new(); for item in data { - let item = item?.downcast::<PyTuple>()?; + let item = item?; + let item = item.downcast::<PyTuple>()?; Self::match_pos_elements(&mut result, dic.as_ref(), item)?; } Ok(Self { @@ -72,7 +73,11 @@ impl PyPosMatcher { }) } - fn match_pos_elements(data: &mut Vec<u16>, dic: &PyDicData, elem: &PyTuple) -> PyResult<()> { + fn match_pos_elements( + data: &mut Vec<u16>, + dic: &PyDicData, + elem: &Bound<PyTuple>, + ) -> PyResult<()> { let start_len = data.len(); let elen = elem.len(); @@ -214,7 +219,7 @@ impl PyPosIter { slf } - fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&'py PyTuple> { + fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&Bound<'py, PyTuple>> { let idx = self.index; self.index += 1; if idx >= self.data.len() { @@ -222,6 +227,6 @@ impl PyPosIter { } let pos_id = self.data[idx]; let pos = &self.dic.pos[pos_id as usize]; - Some(pos.as_ref(py)) + Some(pos.bind(py)) } } diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index 303f7645..cd15b1b3 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -126,13 +126,14 @@ impl PyPretokenizer { /// /// Implementation uses Sudachi to perform the analysis, then uses slice method /// of the passed parameter to create output data - pub fn __call__<'p>( - &'p self, - py: Python<'p>, - index: &'p PyAny, - string: &'p PyAny, - ) -> PyResult<&'p PyAny> 
{ - let input_data = string.str()?.to_str()?; + pub fn __call__<'py>( + &'py self, + py: Python<'py>, + index: &Bound<'py, PyAny>, + string: &Bound<'py, PyAny>, + ) -> PyResult<Bound<'py, PyAny>> { + let pystr = string.str()?; + let input_data = pystr.to_str()?; // tokenization itself should work without GIL, we have thread-local tokenizers here py.allow_threads(|| self.tokenizer_cell().borrow_mut().tokenize(input_data))?; // then prepare results with GIL @@ -144,14 +145,14 @@ impl PyPretokenizer { let py_ref = morphs.borrow(py); let morphs = py_ref.internal(py); match self.projection.as_deref() { - None => make_result_for_surface(py, morphs, string), - Some(p) => make_result_for_projection(py, morphs, p), + None => make_result_for_surface(py, morphs, string).map(|bl| bl.into_any()), + Some(p) => make_result_for_projection(py, morphs, p).map(|bl| bl.into_any()), } } Some(h) => { - let mrp: &PyAny = morphs.as_ref(py); - let args = PyTuple::new(py, &[index, string, mrp]); - h.as_ref(py).call1(args) + let mrp: &Bound<PyAny> = morphs.bind(py); + let args = PyTuple::new_bound(py, &[index, string, mrp]); + h.bind(py).call1(args) } } } @@ -160,22 +161,22 @@ impl PyPretokenizer { pub fn pre_tokenize<'py>( self_: Bound<'py, Self>, py: Python<'py>, - data: &'py PyAny, - ) -> PyResult<&'py PyAny> { - data.call_method1("split", PyTuple::new(py, [self_])) + data: &Bound<'py, PyAny>, + ) -> PyResult<Bound<'py, PyAny>> { + data.call_method1("split", PyTuple::new_bound(py, [self_])) } } fn make_result_for_surface<'py>( py: Python<'py>, morphs: &PyMorphemeList, - string: &'py PyAny, -) -> PyResult<&'py PyAny> { - let result = PyList::empty(py); + string: &Bound<'py, PyAny>, +) -> PyResult<Bound<'py, PyList>> { + let result = PyList::empty_bound(py); for idx in 0..morphs.len() { let node = morphs.get(idx); - let slice = PySlice::new(py, node.begin_c() as isize, node.end_c() as isize, 1); - let args = PyTuple::new(py, [slice]); + let slice = PySlice::new_bound(py, node.begin_c() as 
isize, node.end_c() as isize, 1); + let args = PyTuple::new_bound(py, [slice]); let substring = string.call_method1(intern!(py, "slice"), args)?; result.append(substring)?; } @@ -186,20 +187,20 @@ fn make_result_for_projection<'py>( py: Python<'py>, morphs: &PyMorphemeList, proj: &dyn MorphemeProjection, -) -> PyResult<&'py PyAny> { - let result = PyList::empty(py); +) -> PyResult<Bound<'py, PyList>> { + let result = PyList::empty_bound(py); let nstring = { static NORMALIZED_STRING: GILOnceCell<Py<PyType>> = pyo3::sync::GILOnceCell::new(); NORMALIZED_STRING.get_or_try_init(py, || -> PyResult<Py<PyType>> { - let ns = py.import("tokenizers")?.getattr("NormalizedString")?; - let tpe = ns.downcast::<PyType>(); - tpe.map(|x| x.into_py(py)).map_err(|e| e.into()) + let ns = py.import_bound("tokenizers")?.getattr("NormalizedString")?; + let tpe = ns.downcast::<PyType>()?; + Ok(tpe.clone().unbind()) })? }; for idx in 0..morphs.len() { let node = morphs.get(idx); let value = proj.project(&node, py); - let args = PyTuple::new(py, [value]); + let args = PyTuple::new_bound(py, [value]); let substring = nstring.call1(py, args)?; result.append(substring)?; } diff --git a/python/src/projection.rs b/python/src/projection.rs index 8bea35be..8c7dd142 100644 --- a/python/src/projection.rs +++ b/python/src/projection.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Works Applications Co., Ltd. + * Copyright (c) 2023-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ use crate::dictionary::PyDicData; use crate::morpheme::PyProjector; +use pyo3::prelude::*; use pyo3::types::PyString; use pyo3::{PyResult, Python}; use std::convert::TryFrom; @@ -27,14 +28,14 @@ use sudachi::pos::PosMatcher; use sudachi::prelude::Morpheme; pub(crate) trait MorphemeProjection { - fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString; + fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString>; } struct Surface {} impl MorphemeProjection for Surface { - fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString { - PyString::new(py, m.surface().deref()) + fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> { + PyString::new_bound(py, m.surface().deref()) } } @@ -43,8 +44,8 @@ struct Mapped<F: for<'a> Fn(&'a Morpheme<'a, Arc<PyDicData>>) -> &'a str> { } impl<F: for<'a> Fn(&'a Morpheme<'a, Arc<PyDicData>>) -> &'a str> MorphemeProjection for Mapped<F> { - fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString { - PyString::new(py, (self.func)(m)) + fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> { + PyString::new_bound(py, (self.func)(m)) } } @@ -60,11 +61,11 @@ impl DictionaryAndSurface { } impl MorphemeProjection for DictionaryAndSurface { - fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString { + fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> { if self.matcher.matches_id(m.part_of_speech_id()) { - PyString::new(py, m.surface().deref()) + PyString::new_bound(py, m.surface().deref()) } else { - PyString::new(py, m.dictionary_form()) + PyString::new_bound(py, m.dictionary_form()) } } } @@ -81,11 +82,11 @@ impl NormalizedAndSurface { } impl MorphemeProjection for NormalizedAndSurface { - fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: 
Python<'py>) -> &'py PyString { + fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> { if self.matcher.matches_id(m.part_of_speech_id()) { - PyString::new(py, m.surface().deref()) + PyString::new_bound(py, m.surface().deref()) } else { - PyString::new(py, m.normalized_form()) + PyString::new_bound(py, m.normalized_form()) } } } @@ -102,11 +103,11 @@ impl NormalizedNouns { } impl MorphemeProjection for NormalizedNouns { - fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString { + fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> { if self.matcher.matches_id(m.part_of_speech_id()) { - PyString::new(py, m.normalized_form()) + PyString::new_bound(py, m.normalized_form()) } else { - PyString::new(py, m.surface().deref()) + PyString::new_bound(py, m.surface().deref()) } } } @@ -164,7 +165,7 @@ pub(crate) fn resolve_projection(base: PyProjector, fallback: &PyProjector) -> P } pub(crate) fn parse_projection<D: DictionaryAccess>( - value: &PyString, + value: &Bound<PyString>, dict: &D, ) -> PyResult<(PyProjector, SurfaceProjection)> { value.to_str().and_then(|s| parse_projection_raw(s, dict)) @@ -189,7 +190,7 @@ pub(crate) fn parse_projection_raw<D: DictionaryAccess>( } pub(crate) fn parse_projection_opt<D: DictionaryAccess>( - value: Option<&PyString>, + value: Option<&Bound<PyString>>, dict: &D, ) -> PyResult<(PyProjector, SurfaceProjection)> { match value { diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 5f364380..cc8142e7 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -136,7 +136,7 @@ impl PyTokenizer { &'py mut self, py: Python<'py>, text: &'py str, - mode: Option<&PyAny>, + mode: Option<&Bound<'py, PyAny>>, logger: Option<PyObject>, out: Option<Bound<'py, PyMorphemeListWrapper>>, ) -> PyResult<Bound<PyMorphemeListWrapper>> { From 73c8cd94e533a932a8ed94a08f523d915bdeabf5 Mon Sep 17 00:00:00 2001 From: 
mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 3 Jun 2024 14:47:35 +0900 Subject: [PATCH 04/24] update dependencies --- Cargo.lock | 338 ++++++++++++++++------------------------- python/Cargo.toml | 2 +- sudachi-cli/Cargo.toml | 2 +- sudachi/Cargo.toml | 10 +- 4 files changed, 136 insertions(+), 216 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9ad71bc..73ca27fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,47 +19,48 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.2" +version = "3.0.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" dependencies = [ "anstyle", "windows-sys", @@ -92,12 +93,6 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -106,9 +101,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "bumpalo" -version = "3.15.4" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "cast" @@ -200,9 +195,9 @@ checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" [[package]] name = "criterion" @@ -261,9 +256,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.19" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "crunchy" @@ -301,9 +296,9 @@ dependencies = [ [[package]] name = "either" -version = "1.10.0" +version = "1.12.0" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "equivalent" @@ -313,9 +308,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys", @@ -334,15 +329,15 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "half" -version = "2.4.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if", "crunchy", @@ -350,9 +345,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "heck" @@ -374,13 +369,13 @@ checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] name = "honggfuzz" -version = "0.5.55" +version = "0.5.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"848e9c511092e0daa0a35a63e8e6e475a3e8f870741448b9f6028d69b142f18e" +checksum = "7c76b6234c13c9ea73946d1379d33186151148e0da231506b964b44f3d023505" dependencies = [ "arbitrary", "lazy_static", - "memmap2 0.5.10", + "memmap2", "rustc_version", ] @@ -411,6 +406,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itertools" version = "0.10.5" @@ -422,9 +423,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] @@ -466,9 +467,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "libloading" @@ -477,14 +478,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.4", + "windows-targets", ] [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" @@ -503,18 +504,9 @@ checksum = 
"90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "memmap2" -version = "0.5.10" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" -dependencies = [ - "libc", -] +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "memmap2" @@ -552,9 +544,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] @@ -573,9 +565,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -583,22 +575,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.5", + "windows-targets", ] [[package]] name = "plotters" -version = "0.3.5" +version = "0.3.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" dependencies = [ "num-traits", "plotters-backend", @@ -609,15 +601,15 @@ dependencies = [ [[package]] name = "plotters-backend" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" +checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" [[package]] name = "plotters-svg" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" dependencies = [ "plotters-backend", ] @@ -630,18 +622,18 @@ checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" dependencies = [ "unicode-ident", ] [[package]] name = "pyo3" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233" +checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" dependencies = [ "cfg-if", "indoc", @@ -657,9 +649,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7" +checksum = 
"7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" dependencies = [ "once_cell", "target-lexicon", @@ -667,9 +659,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa" +checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" dependencies = [ "libc", "pyo3-build-config", @@ -677,9 +669,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158" +checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -689,9 +681,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185" +checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" dependencies = [ "heck 0.4.1", "proc-macro2", @@ -702,9 +694,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -731,11 +723,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.4.1" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" dependencies = [ - "bitflags 1.3.2", 
+ "bitflags", ] [[package]] @@ -778,11 +770,11 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -791,9 +783,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "same-file" @@ -812,24 +804,24 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.22" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.197" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.197" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -838,9 +830,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.117" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" dependencies = [ "itoa", "ryu", @@ -871,18 +863,18 @@ name = "sudachi" version = "0.6.9-a1" dependencies = [ "aho-corasick", - "bitflags 2.5.0", + "bitflags", "claim", "csv", "default_input_text", "fancy-regex", "indexmap", - "itertools 0.12.1", + "itertools 0.13.0", "join_katakana_oov", "join_numeric", "lazy_static", "libloading", - "memmap2 0.9.4", + "memmap2", "nom", "regex", "serde", @@ -900,7 +892,7 @@ version = "0.6.9-a1" dependencies = [ "cfg-if", "clap", - "memmap2 0.9.4", + "memmap2", "sudachi", ] @@ -926,9 +918,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.55" +version = "2.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0" +checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" dependencies = [ "proc-macro2", "quote", @@ -955,18 +947,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", @@ -1109,159 +1101,87 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" dependencies = [ - "winapi", + "windows-sys", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 
0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" [[package]] -name = "windows_i686_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" - -[[package]] -name = "windows_i686_msvc" -version 
= "0.48.5" +name = "windows_i686_gnullvm" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "yada" diff --git a/python/Cargo.toml b/python/Cargo.toml index 6e564c2e..53cd97e5 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -16,8 +16,8 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.21", features = ["extension-module"] } -thread_local = "1.1" # Apache 2.0/MIT scopeguard = "1" # Apache 2.0/MIT +thread_local = "1.1" # Apache 2.0/MIT [dependencies.sudachi] path = "../sudachi" diff --git a/sudachi-cli/Cargo.toml b/sudachi-cli/Cargo.toml index c5070424..14aeebb5 100644 --- a/sudachi-cli/Cargo.toml +++ b/sudachi-cli/Cargo.toml @@ -14,8 +14,8 @@ license.workspace = true sudachi = { path = "../sudachi" } cfg-if = "1.0.0" # MIT/Apache 2.0 -memmap2 = "0.9" # MIT/Apache 2.0 clap = { version = "4.5", features = ["derive"] } # MIT/Apache 2.0 +memmap2 = "0.9" # MIT/Apache 2.0 [[bin]] name = "sudachi" diff --git a/sudachi/Cargo.toml b/sudachi/Cargo.toml index 76b4cfe4..76e5f72c 100644 --- a/sudachi/Cargo.toml +++ b/sudachi/Cargo.toml @@ -12,15 +12,15 @@ license.workspace = true [dependencies] # this should be sorted aho-corasick = "1" # MIT/Apache 2.0 -bitflags = "2.0" # MIT/Apache 2.0 -csv = "1.1" # Unilicense/MIT +bitflags = "2.5" # MIT/Apache 2.0 +csv = "1.3" # Unilicense/MIT fancy-regex = "0.13" # MIT -indexmap = "2.0" # MIT/Apache 2.0 -itertools = "0.12" # MIT/Apachie 2.0 +indexmap = "2.2" # MIT/Apache 2.0 +itertools = "0.13" # MIT/Apachie 2.0 lazy_static = "1.4" # MIT/Apache 2.0 libloading = "0.8" # ISC (MIT-compatible) -nom = "7" # MIT memmap2 = "0.9" # MIT/Apache 2.0 +nom = "7" # MIT regex = "1" # MIT/Apache 2.0 serde = { version = "1.0", features = ["derive"] } # MIT/Apache 2.0 serde_json = "1.0" # MIT/Apache 2.0 From 4345772882bd4a87511ac136e8906b38c77581c1 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Fri, 7 Jun 2024 09:09:16 +0900 
Subject: [PATCH 05/24] use pyo3::intern macro inside pretokenizer --- python/src/dictionary.rs | 10 +++++++--- python/src/pretokenizer.rs | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 251267ab..e9cbf1ed 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -461,20 +461,24 @@ fn read_config(config_opt: &Bound<PyAny>) -> PyResult<ConfigBuilder> { } pub(crate) fn read_default_config(py: Python) -> PyResult<ConfigBuilder> { - let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?; + let path = py + .import_bound("sudachipy")? + .getattr("_DEFAULT_SETTINGFILE")?; let path = path.downcast::<PyString>()?.to_str()?; let path = PathBuf::from(path); wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path) } pub(crate) fn get_default_resource_dir(py: Python) -> PyResult<PathBuf> { - let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?; + let path = py + .import_bound("sudachipy")? 
+ .getattr("_DEFAULT_RESOURCEDIR")?; let path = path.downcast::<PyString>()?.to_str()?; Ok(PathBuf::from(path)) } fn find_dict_path(py: Python, dict_type: &str) -> PyResult<PathBuf> { - let pyfunc = PyModule::import_bound(py, "sudachipy")?.getattr("_find_dict_path")?; + let pyfunc = py.import_bound("sudachipy")?.getattr("_find_dict_path")?; let path = pyfunc.call1((dict_type,))?; let path = path.downcast::<PyString>()?.to_str()?; Ok(PathBuf::from(path)) diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index cd15b1b3..20e5cf65 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -163,7 +163,7 @@ impl PyPretokenizer { py: Python<'py>, data: &Bound<'py, PyAny>, ) -> PyResult<Bound<'py, PyAny>> { - data.call_method1("split", PyTuple::new_bound(py, [self_])) + data.call_method1(intern!(py, "split"), PyTuple::new_bound(py, [self_])) } } @@ -190,7 +190,7 @@ fn make_result_for_projection<'py>( ) -> PyResult<Bound<'py, PyList>> { let result = PyList::empty_bound(py); let nstring = { - static NORMALIZED_STRING: GILOnceCell<Py<PyType>> = pyo3::sync::GILOnceCell::new(); + static NORMALIZED_STRING: GILOnceCell<Py<PyType>> = GILOnceCell::new(); NORMALIZED_STRING.get_or_try_init(py, || -> PyResult<Py<PyType>> { let ns = py.import_bound("tokenizers")?.getattr("NormalizedString")?; let tpe = ns.downcast::<PyType>()?; From 8baaa7abc53c49d7b475256436935f2e30fe3a4c Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Fri, 5 Jul 2024 17:24:57 +0900 Subject: [PATCH 06/24] add missing docstrings --- python/py_src/sudachipy/errors.py | 6 ++++-- python/src/build.rs | 4 +++- python/src/lib.rs | 6 ++++-- python/src/morpheme.rs | 6 ++++-- python/src/pos_matcher.rs | 6 +++++- python/src/pretokenizer.rs | 7 ++++--- python/src/tokenizer.rs | 7 ++++--- 7 files changed, 28 insertions(+), 14 deletions(-) diff --git a/python/py_src/sudachipy/errors.py b/python/py_src/sudachipy/errors.py index e75e21cd..c11a8205 100644 --- 
a/python/py_src/sudachipy/errors.py +++ b/python/py_src/sudachipy/errors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Works Applications Co., Ltd. +# Copyright (c) 2023-2024 Works Applications Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,4 +13,6 @@ # limitations under the License. class SudachiError(Exception): - pass \ No newline at end of file + """Base class for all Sudachipy exceptions. + """ + pass diff --git a/python/src/build.rs b/python/src/build.rs index a6005b26..59eb50c9 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ fn create_file(p: &Path) -> std::io::Result<File> { OpenOptions::new().create_new(true).write(true).open(p) } +/// Build system dictionary from matrix and lexicons. #[pyfunction] #[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")] fn build_system_dic<'p>( @@ -87,6 +88,7 @@ fn build_system_dic<'p>( to_stats(py, builder) } +/// Build user dictionary from lexicons based on the given system dictionary. #[pyfunction] #[pyo3(text_signature = "(system, lex, output, description=None) -> list")] fn build_user_dic<'p>( diff --git a/python/src/lib.rs b/python/src/lib.rs index 68a9c91d..4887a737 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,9 @@ mod projection; mod tokenizer; mod word_info; -/// module root +/// SudachiPy raw module root. 
+/// +/// Users should not use this directly. #[pymodule] fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::<dictionary::PyDictionary>()?; diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index ad3929dd..47e020ee 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,6 +86,7 @@ impl PyMorphemeListWrapper { } } } + #[pymethods] impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary @@ -197,7 +198,7 @@ impl PyMorphemeListWrapper { } } -/// A morpheme (basic semantic unit of language). +/// An iterator over the MorphemeList. #[pyclass(module = "sudachipy.morphemelist", name = "MorphemeIter")] pub struct PyMorphemeIter { list: Py<PyMorphemeListWrapper>, @@ -241,6 +242,7 @@ impl<'py> Deref for MorphemeRef<'py> { } } +/// A morpheme (basic semantic unit of language). #[pyclass(module = "sudachipy.morpheme", name = "Morpheme", frozen)] pub struct PyMorpheme { list: Py<PyMorphemeListWrapper>, diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 7c6a884d..a849edf5 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,9 @@ use sudachi::pos::PosMatcher; use crate::dictionary::PyDicData; use crate::morpheme::PyMorpheme; +/// A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. +/// +/// Create using Dictionary.pos_matcher method. 
#[pyclass(name = "PosMatcher", module = "sudachipy")] pub struct PyPosMatcher { matcher: PosMatcher, @@ -189,6 +192,7 @@ impl PyPosMatcher { } } +/// An iterator over POS tuples in the PosPatcher #[pyclass(name = "PosMatcherIterator", module = "sudachipy")] pub struct PyPosIter { data: Vec<u16>, diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index 755f040b..385c6dcb 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -76,9 +76,10 @@ impl PerThreadPreTokenizer { } } -/// Binding for the Tokenizer, which handles threading for tokenization +/// Binding for the Tokenizer, which handles threading for tokenization. /// -/// We use ThreadLocal for storing actual tokenizers +/// Create using Dictionary.pre_tokenizer method. +/// We use ThreadLocal for storing actual tokenizers. #[pyclass(module = "sudachipy.pretokenizer", name = "SudachiPreTokenizer")] pub struct PyPretokenizer { dict: Arc<PyDicData>, diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 558d02cb..a53ce166 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,7 +36,6 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// B == middle mode /// /// C == long mode -// #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -68,6 +67,7 @@ impl From<Mode> for PySplitMode { #[pymethods] impl PySplitMode { + /// Parse SplitMode from a character. #[new] fn new(mode: Option<&str>) -> PyResult<PySplitMode> { let mode = match mode { @@ -82,7 +82,7 @@ impl PySplitMode { } } -/// Sudachi Tokenizer, Python version +/// Sudachi Tokenizer #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")] pub(crate) struct PyTokenizer { tokenizer: StatefulTokenizer<Arc<PyDicData>>, @@ -182,6 +182,7 @@ impl PyTokenizer { Ok(out_list) } + /// SplitMode of the tokenizer. #[getter] fn mode(&self) -> PySplitMode { self.tokenizer.mode().into() From 8b597e341c3b9c2b9340c08d6507bfa75e041ab8 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Fri, 5 Jul 2024 17:52:19 +0900 Subject: [PATCH 07/24] copy docstring from new to class --- python/src/dictionary.rs | 22 ++++++++++++++++------ python/src/tokenizer.rs | 11 ++++++++--- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index bc333c8e..1bada310 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,17 @@ impl PyDicData { } } -/// A sudachi dictionary +/// A sudachi dictionary. +/// +/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. +/// If both config.systemDict and dict_type are given, dict_type is used. +/// If dict is an absolute path to a file, it is used as a dictionary. 
+/// +/// :param config_path: path to the configuration JSON file. +/// :param resource_dir: path to the resource directory folder. +/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. +/// Also, can be an _absolute_ path to a compiled dictionary file. +/// :param dict_type: deprecated alias to dict. #[pyclass(module = "sudachipy.dictionary", name = "Dictionary")] #[derive(Clone)] pub struct PyDictionary { @@ -92,13 +102,13 @@ impl PyDictionary { /// /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. /// If both config.systemDict and dict_type are given, dict_type is used. - /// If dict is an absolute path to a file, it is used as a dictionary + /// If dict is an absolute path to a file, it is used as a dictionary. /// - /// :param config_path: path to the configuration JSON file - /// :param resource_dir: path to the resource directory folder + /// :param config_path: path to the configuration JSON file. + /// :param resource_dir: path to the resource directory folder. /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. - /// :param dict_type: deprecated alias to dict + /// :param dict_type: deprecated alias to dict. #[new] #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))] fn new( diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index a53ce166..fe3b66d3 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -29,13 +29,13 @@ use crate::dictionary::{extract_mode, PyDicData}; use crate::errors::SudachiError as SudachiPyErr; use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; -/// Unit to split text +/// Unit to split text. 
/// /// A == short mode -/// /// B == middle mode -/// /// C == long mode +/// +/// :param mode: str to parse. One of [A,B,C] in captital or lower case. #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -68,7 +68,10 @@ impl From<Mode> for PySplitMode { #[pymethods] impl PySplitMode { /// Parse SplitMode from a character. + /// + /// :param mode: str to parse. One of [A,B,C] in captital or lower case. #[new] + #[pyo3(signature=(mode=None, *))] fn new(mode: Option<&str>) -> PyResult<PySplitMode> { let mode = match mode { Some(m) => m, @@ -83,6 +86,8 @@ impl PySplitMode { } /// Sudachi Tokenizer +/// +/// Create using Dictionary.create method. #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")] pub(crate) struct PyTokenizer { tokenizer: StatefulTokenizer<Arc<PyDicData>>, From c1d37c7f0aab64bd64144664537fa46512aac6c5 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 8 Jul 2024 11:04:08 +0900 Subject: [PATCH 08/24] update text_signature --- python/src/build.rs | 4 ++-- python/src/dictionary.rs | 21 ++++++++++++--------- python/src/morpheme.rs | 36 ++++++++++++++++++------------------ python/src/tokenizer.rs | 9 ++++++--- 4 files changed, 38 insertions(+), 32 deletions(-) diff --git a/python/src/build.rs b/python/src/build.rs index 59eb50c9..350f2fb3 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -60,7 +60,7 @@ fn create_file(p: &Path) -> std::io::Result<File> { /// Build system dictionary from matrix and lexicons. #[pyfunction] -#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")] +#[pyo3(text_signature="(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")] fn build_system_dic<'p>( py: Python<'p>, matrix: &'p PyAny, @@ -90,7 +90,7 @@ fn build_system_dic<'p>( /// Build user dictionary from lexicons based on the given system dictionary. 
#[pyfunction] -#[pyo3(text_signature = "(system, lex, output, description=None) -> list")] +#[pyo3(text_signature="(system, lex, output, description=None) -> list[tuple[str, int, float]]")] fn build_user_dic<'p>( py: Python<'p>, system: &'p PyAny, diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 1bada310..e208492f 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -110,7 +110,10 @@ impl PyDictionary { /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. #[new] - #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))] + #[pyo3( + text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary", + signature=(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) + )] fn new( py: Python, config_path: Option<&PyAny>, @@ -230,8 +233,8 @@ impl PyDictionary { /// :param fields: load only a subset of fields. /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html #[pyo3( - text_signature = "($self, mode = 'C') -> sudachipy.Tokenizer", - signature = (mode = None, fields = None, *, projection = None) + text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer", + signature=(mode=None, fields=None, *, projection=None) )] fn create<'py>( &'py self, @@ -272,7 +275,7 @@ impl PyDictionary { /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form. 
/// /// :param target: can be either a callable or list of POS partial tuples - #[pyo3(text_signature = "($self, target)")] + #[pyo3(text_signature="(self, /, target) -> PosMatcher")] fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) } @@ -293,8 +296,8 @@ impl PyDictionary { /// :type mode: sudachipy.SplitMode /// :type fields: Set[str] #[pyo3( - text_signature = "($self, mode, fields, handler) -> tokenizers.PreTokenizer", - signature = (mode = None, fields = None, handler = None, *, projection = None) + text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer", + signature=(mode=None, fields=None, handler=None, *, projection=None) )] fn pre_tokenizer<'p>( &'p self, @@ -349,7 +352,7 @@ impl PyDictionary { /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. /// :type surface: str /// :type out: sudachipy.MorphemeList - #[pyo3(text_signature = "($self, surface, out = None) -> sudachipy.MorphemeList")] + #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")] fn lookup<'p>( &'p self, py: Python<'p>, @@ -377,13 +380,13 @@ impl PyDictionary { } /// Close this dictionary - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature="(self, /) -> ()")] fn close(&mut self) { self.dictionary = None; } /// Get POS Tuple by its id - #[pyo3(text_signature = "($self, pos_id: int)")] + #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")] fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { let dic = self.dictionary.as_ref().unwrap(); dic.pos.get(pos_id).map(|x| x.as_ref(py)) diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 47e020ee..f1aa204d 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -91,7 +91,7 @@ impl PyMorphemeListWrapper { impl PyMorphemeListWrapper { 
/// Returns an empty morpheme list with dictionary #[classmethod] - #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")] + #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn( @@ -110,13 +110,13 @@ impl PyMorphemeListWrapper { } /// Returns the total cost of the path - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn get_internal_cost(&self, py: Python) -> i32 { self.internal(py).get_internal_cost() } /// Returns the number of morpheme in this list. - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn size(&self, py: Python) -> usize { self.internal(py).len() } @@ -279,21 +279,21 @@ impl PyMorpheme { #[pymethods] impl PyMorpheme { /// Returns the begin index of this in the input text - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn begin(&self, py: Python) -> usize { // call codepoint version self.morph(py).begin_c() } /// Returns the end index of this in the input text - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn end(&self, py: Python) -> usize { // call codepoint version self.morph(py).end_c() } /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { let list = self.list(py); let morph = self.morph(py); @@ -304,14 +304,14 @@ impl PyMorpheme { } /// Returns the substring of input text corresponding to the morpheme regardless the configured projection - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn raw_surface<'py>(&'py self, py: Python<'py>) -> 
&'py PyString { PyString::new(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. /// Tuple elements are four POS levels, conjugation type and conjugation form. - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")] fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py<PyTuple> { let pos_id = self.part_of_speech_id(py); self.list(py) @@ -322,25 +322,25 @@ impl PyMorpheme { } /// Returns the id of the part of speech in the dictionary - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] pub fn part_of_speech_id(&self, py: Python) -> u16 { self.morph(py).part_of_speech_id() } /// Returns the dictionary form - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().dictionary_form().into_py(py) } /// Returns the normalized form - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().normalized_form().into_py(py) } /// Returns the reading form - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().reading_form().into_py(py) } @@ -358,7 +358,7 @@ impl PyMorpheme { /// :type out: Optional[sudachipy.MorphemeList] /// :type add_single: bool #[pyo3( - text_signature = "($self, mode, out = None, add_single = False) -> sudachipy.MorphemeList" + text_signature="(self, /, mode, out=None, add_single=False) -> MorphemeList" )] fn split<'py>( &'py self, @@ -402,19 +402,19 @@ impl PyMorpheme { } /// Returns whether if this is out of vocabulary word - #[pyo3(text_signature = "($self) -> bool")] + #[pyo3(text_signature="(self, /) -> bool")] fn is_oov(&self, py: 
Python) -> bool { self.morph(py).is_oov() } /// Returns word id of this word in the dictionary - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn word_id(&self, py: Python) -> u32 { self.morph(py).word_id().as_raw() } /// Returns the dictionary id which this word belongs - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn dictionary_id(&self, py: Python) -> i32 { let word_id = self.morph(py).word_id(); if word_id.is_oov() { @@ -425,7 +425,7 @@ impl PyMorpheme { } /// Returns the list of synonym group ids - #[pyo3(text_signature = "($self) -> List[int]")] + #[pyo3(text_signature="(self, /) -> List[int]")] fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { let mref = self.morph(py); let ids = mref.get_word_info().synonym_group_ids(); @@ -433,7 +433,7 @@ impl PyMorpheme { } /// Returns the word info - #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")] + #[pyo3(text_signature="(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?; diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index fe3b66d3..16f2482a 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -71,7 +71,10 @@ impl PySplitMode { /// /// :param mode: str to parse. One of [A,B,C] in captital or lower case. 
#[new] - #[pyo3(signature=(mode=None, *))] + #[pyo3( + text_signature="(mode=None) -> SplitMode", + signature=(mode=None) + )] fn new(mode: Option<&str>) -> PyResult<PySplitMode> { let mode = match mode { Some(m) => m, @@ -133,8 +136,8 @@ impl PyTokenizer { /// :type mode: sudachipy.SplitMode /// :type out: sudachipy.MorphemeList #[pyo3( - text_signature = "($self, text: str, mode = None, logger = None, out = None) -> sudachipy.MorphemeList", - signature = (text, mode = None, logger = None, out = None) + text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList", + signature=(text, mode=None, logger=None, out=None) )] #[allow(unused_variables)] fn tokenize<'py>( From dfc87edf656348474fef8b6aa46e8548e4895c5b Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 8 Jul 2024 11:05:04 +0900 Subject: [PATCH 09/24] add import of PosMatcher --- python/py_src/sudachipy/__init__.py | 1 + python/src/lib.rs | 1 + python/src/pos_matcher.rs | 6 ++++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/py_src/sudachipy/__init__.py b/python/py_src/sudachipy/__init__.py index bdf67f40..fb551538 100644 --- a/python/py_src/sudachipy/__init__.py +++ b/python/py_src/sudachipy/__init__.py @@ -5,6 +5,7 @@ MorphemeList, Morpheme, WordInfo, + PosMatcher, ) from .config import Config from . 
import errors diff --git a/python/src/lib.rs b/python/src/lib.rs index 4887a737..56a950c2 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -37,6 +37,7 @@ fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::<morpheme::PyMorphemeListWrapper>()?; m.add_class::<morpheme::PyMorpheme>()?; m.add_class::<word_info::PyWordInfo>()?; + m.add_class::<pos_matcher::PyPosMatcher>()?; build::register_functions(m)?; Ok(()) } diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index a849edf5..586c7d90 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -29,7 +29,9 @@ use crate::morpheme::PyMorpheme; /// A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. /// /// Create using Dictionary.pos_matcher method. -#[pyclass(name = "PosMatcher", module = "sudachipy")] +/// +/// Use `__call__(m: Morpheme) -> bool` to check if given morpheme matches the PosMatcher. +#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcher")] pub struct PyPosMatcher { matcher: PosMatcher, dic: Arc<PyDicData>, @@ -193,7 +195,7 @@ impl PyPosMatcher { } /// An iterator over POS tuples in the PosPatcher -#[pyclass(name = "PosMatcherIterator", module = "sudachipy")] +#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcherIterator")] pub struct PyPosIter { data: Vec<u16>, dic: Arc<PyDicData>, From 8c35516a1f20fee8608401b1aea694063458c061 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 8 Jul 2024 11:56:02 +0900 Subject: [PATCH 10/24] sync pyi and rs --- python/py_src/sudachipy/sudachipy.pyi | 104 ++++++++++++++++---------- python/src/dictionary.rs | 54 ++++++------- python/src/morpheme.rs | 48 +++++++----- python/src/pos_matcher.rs | 10 ++- python/src/tokenizer.rs | 10 +-- 5 files changed, 136 insertions(+), 90 deletions(-) diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi index 16c416f6..705b62af 100644 --- 
a/python/py_src/sudachipy/sudachipy.pyi +++ b/python/py_src/sudachipy/sudachipy.pyi @@ -1,6 +1,20 @@ +# Copyright (c) 2024 Works Applications Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set from .config import Config +# Part Of Speech POS = Tuple[str, str, str, str, str, str] # POS element PE = Optional[str] @@ -14,6 +28,8 @@ PartialPOS = Union[ Tuple[()], ] +# Fields that can be specified for partial dictionary loading. +# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form", "word_structure", "split_a", "split_b", "synonym_group_id"]]] @@ -23,9 +39,7 @@ class SplitMode: Unit to split text. A == short mode - B == middle mode - C == long mode """ @@ -36,8 +50,9 @@ class SplitMode: @classmethod def __init__(cls, mode: str = "C") -> None: """ - Creates a split mode from a string value - :param mode: string representation of the split mode + Creates a split mode from a string value. + + :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. """ ... @@ -54,14 +69,15 @@ class Dictionary: Creates a sudachi dictionary. If both config.systemDict and dict are not given, `sudachidict_core` is used. - If both config.systemDict and dict are given, dict_type is used. 
+ If both config.systemDict and dict are given, dict is used. + If dict is an absolute path to a file, it is used as a dictionary. - :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object - :param config: alias to config_path, only one of them can be specified at the same time - :param resource_dir: path to the resource directory folder + :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. + :param config: alias to config_path, only one of them can be specified at the same time. + :param resource_dir: path to the resource directory folder. :param dict: type of pre-packaged system dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict. Also, can be an _absolute_ path to a compiled dictionary file. - :param dict_type: deprecated alias to dict + :param dict_type: deprecated alias to dict. """ ... @@ -77,11 +93,11 @@ class Dictionary: *, projection: str = None) -> Tokenizer: """ - Creates a Sudachi Tokenizer. + Creates a sudachi tokenizer. :param mode: sets the analysis mode for this Tokenizer :param fields: load only a subset of fields. - See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html + See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. :param projection: Projection override for created Tokenizer. See Config.projection for values. """ ... @@ -91,21 +107,21 @@ class Dictionary: Creates a new POS matcher. If target is a function, then it must return whether a POS should match or not. - If target a list, it should contain partially specified POS. - By partially specified it means that it is possible to omit POS fields or - use None as a sentinel value that matches any POS. + If target is a list, it should contain partially specified POS. 
+ By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS. For example, ('名詞',) will match any noun and (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form. - :param target: can be either a function or a list of POS tuples. + :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. """ ... def pre_tokenizer(self, mode: Union[SplitMode, Literal["A", "B", "C"]] = "C", fields: FieldSet = None, - handler: Optional[Callable[[int, object, MorphemeList], list]] = None, + handler: Optional[Callable[[ + int, object, MorphemeList], list]] = None, *, projection: str = None) -> object: """ @@ -113,10 +129,10 @@ class Dictionary: Requires package `tokenizers` to be installed. :param mode: Use this split mode (C by default) - :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html - :param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py - First two parameters are the index (int) and HuggingFace NormalizedString. - The handler must return a List[NormalizedString]. By default, just segment the tokens. + :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations. + It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. + See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. :param projection: Projection override for created Tokenizer. See Config.projection for values. """ ... 
@@ -126,7 +142,7 @@ class Dictionary: Returns POS with the given id. :param pos_id: POS id - :return: POS tuple with the given id. + :return: POS tuple with the given id or None for non existing id. """ ... @@ -197,7 +213,8 @@ class Morpheme: def part_of_speech(self) -> POS: """ - Returns the part of speech. + Returns the part of speech as a six-element tuple. + Tuple elements are four POS levels, conjugation type and conjugation form. """ ... @@ -217,8 +234,8 @@ class Morpheme: """ Returns sub-morphemes in the provided split mode. - :param mode: mode of new split - :param out: write results to this MorhpemeList instead of creating new one + :param mode: mode of new split. + :param out: write results to this MorhpemeList instead of creating new one. See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for more information on output parameters. Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter. @@ -230,6 +247,7 @@ class Morpheme: def surface(self) -> str: """ Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. + See `Config.projection`. """ ... @@ -237,6 +255,7 @@ class Morpheme: def raw_surface(self) -> str: """ Returns the substring of input text corresponding to the morpheme regardless the configured projection. + See `Config.projection`. """ ... @@ -255,7 +274,7 @@ class Morpheme: def __len__(self) -> int: """ - Returns morpheme length in codepoints + Returns morpheme length in codepoints. """ @@ -293,6 +312,11 @@ class MorphemeList: class Tokenizer: + """ + A sudachi tokenizer + + Create using Dictionary.create method. + """ SplitMode: ClassVar[SplitMode] = ... @classmethod def __init__(cls) -> None: ... @@ -303,13 +327,12 @@ class Tokenizer: """ Break text into morphemes. - SudachiPy 0.5.* had logger parameter, it is accepted, but ignored. - - :param text: text to analyze + :param text: text to analyze. :param mode: analysis mode. 
This parameter is deprecated. Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes. If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead. + :param logger: Arg for v0.5.* compatibility. Ignored. :param out: tokenization results will be written into this MorphemeList, a new one will be created instead. See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. """ @@ -342,41 +365,44 @@ class WordInfo: class PosMatcher: + """ + A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. + + Create using Dictionary.pos_matcher method. + """ + def __iter__(self) -> Iterator[POS]: ... def __len__(self) -> int: ... def __call__(self, m: Morpheme) -> bool: """ - Checks whether a morpheme has matching POS - :param m: morpheme - :return: if morpheme has matching POS + Checks whether a morpheme has matching POS. + + :param m: morpheme. + :return: if morpheme has matching POS. """ ... def __or__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if any of two matchers would match it - :return: PosMatcher + Returns a POS matcher which matches a POS if any of two matchers would match it. """ ... def __and__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if both matchers would match it at the same time - :return: PosMatcher + Returns a POS matcher which matches a POS if both matchers would match it at the same time. """ ... def __sub__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS - :return: PosMatcher + Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS. """ ... 
def __invert__(self) -> PosMatcher: """ - Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher - :return: PosMatcher + Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher. """ ... diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index e208492f..5f1e8f65 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -80,11 +80,12 @@ impl PyDicData { /// A sudachi dictionary. /// -/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. -/// If both config.systemDict and dict_type are given, dict_type is used. +/// If both config.systemDict and dict are not given, `sudachidict_core` is used. +/// If both config.systemDict and dict are given, dict is used. /// If dict is an absolute path to a file, it is used as a dictionary. /// -/// :param config_path: path to the configuration JSON file. +/// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. +/// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. @@ -100,11 +101,12 @@ pub struct PyDictionary { impl PyDictionary { /// Creates a sudachi dictionary. /// - /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. - /// If both config.systemDict and dict_type are given, dict_type is used. + /// If both config.systemDict and dict are not given, `sudachidict_core` is used. + /// If both config.systemDict and dict are given, dict is used. /// If dict is an absolute path to a file, it is used as a dictionary. /// - /// :param config_path: path to the configuration JSON file. 
+ /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. + /// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. @@ -229,11 +231,12 @@ impl PyDictionary { /// Creates a sudachi tokenizer. /// - /// :param mode: tokenizer's default split mode (C by default). + /// :param mode: sets the analysis mode for this Tokenizer /// :param fields: load only a subset of fields. - /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html + /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + /// :param projection: Projection override for created Tokenizer. See Config.projection for values. #[pyo3( - text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer", + text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer", signature=(mode=None, fields=None, *, projection=None) )] fn create<'py>( @@ -267,14 +270,13 @@ impl PyDictionary { /// Creates a POS matcher object /// /// If target is a function, then it must return whether a POS should match or not. - /// If target a list, it should contain partially specified POS. - /// By partially specified it means that it is possible to omit POS fields or - /// use None as a sentinel value that matches any POS. + /// If target is a list, it should contain partially specified POS. + /// By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS. /// /// For example, ('名詞',) will match any noun and /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form. 
/// - /// :param target: can be either a callable or list of POS partial tuples + /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. #[pyo3(text_signature="(self, /, target) -> PosMatcher")] fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) @@ -285,15 +287,13 @@ impl PyDictionary { /// /// :param mode: Use this split mode (C by default) /// :param fields: ask Sudachi to load only a subset of fields. - /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html - /// :param handler: a custom callable to transform MorphemeList into list of tokens. - /// It should be should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. - /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py - /// If nothing was passed, simply use surface as token representations. - /// :param projection: projection mode for a created PreTokenizer. - /// See :class:`sudachipy.config.Config` object documentation for supported projections. + /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + /// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations. + /// It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. + /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. + /// :param projection: Projection override for created Tokenizer. See Config.projection for values. 
/// - /// :type mode: sudachipy.SplitMode + /// :type mode: SplitMode /// :type fields: Set[str] #[pyo3( text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer", @@ -350,8 +350,9 @@ impl PyDictionary { /// :param surface: find all morphemes with the given surface /// :param out: if passed, reuse the given morpheme list instead of creating a new one. /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. + /// /// :type surface: str - /// :type out: sudachipy.MorphemeList + /// :type out: MorphemeList #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")] fn lookup<'p>( &'p self, @@ -379,14 +380,17 @@ impl PyDictionary { Ok(l) } - /// Close this dictionary + /// Close this dictionary. #[pyo3(text_signature="(self, /) -> ()")] fn close(&mut self) { self.dictionary = None; } - /// Get POS Tuple by its id - #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")] + /// Returns POS with the given id. + /// + /// :param pos_id: POS id + /// :return: POS tuple with the given id or None for non existing id. + #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")] fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { let dic = self.dictionary.as_ref().unwrap(); dic.pos.get(pos_id).map(|x| x.as_ref(py)) diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index f1aa204d..0a18f6c4 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -31,7 +31,10 @@ use crate::word_info::PyWordInfo; pub(crate) type PyMorphemeList = MorphemeList<Arc<PyDicData>>; pub(crate) type PyProjector = Option<Arc<dyn MorphemeProjection + Send + Sync>>; -/// A list of morphemes +/// A list of morphemes. +/// +/// An object can not be instantiated manually. +/// Use Tokenizer.tokenize("") to create an empty morpheme list. 
#[pyclass(module = "sudachipy.morphemelist", name = "MorphemeList")] pub struct PyMorphemeListWrapper { /// use `internal()` function instead @@ -89,7 +92,7 @@ impl PyMorphemeListWrapper { #[pymethods] impl PyMorphemeListWrapper { - /// Returns an empty morpheme list with dictionary + /// Returns an empty morpheme list with dictionary. #[classmethod] #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> { @@ -109,7 +112,7 @@ impl PyMorphemeListWrapper { }) } - /// Returns the total cost of the path + /// Returns the total cost of the path. #[pyo3(text_signature="(self, /) -> int")] fn get_internal_cost(&self, py: Python) -> i32 { self.internal(py).get_internal_cost() @@ -278,21 +281,23 @@ impl PyMorpheme { #[pymethods] impl PyMorpheme { - /// Returns the begin index of this in the input text + /// Returns the begin index of this in the input text. #[pyo3(text_signature="(self, /) -> int")] fn begin(&self, py: Python) -> usize { // call codepoint version self.morph(py).begin_c() } - /// Returns the end index of this in the input text + /// Returns the end index of this in the input text. #[pyo3(text_signature="(self, /) -> int")] fn end(&self, py: Python) -> usize { // call codepoint version self.morph(py).end_c() } - /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured + /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. + /// + /// See `Config.projection`. #[pyo3(text_signature="(self, /) -> str")] fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { let list = self.list(py); @@ -303,14 +308,16 @@ impl PyMorpheme { } } - /// Returns the substring of input text corresponding to the morpheme regardless the configured projection + /// Returns the substring of input text corresponding to the morpheme regardless the configured projection. 
+ /// + /// See `Config.projection`. #[pyo3(text_signature="(self, /) -> str")] fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { PyString::new(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. - /// Tuple elements are four POS levels, conjugation type and conjugation form. + /// Tuple elements are four POS levels, conjugation type and conjugation form. #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")] fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py<PyTuple> { let pos_id = self.part_of_speech_id(py); @@ -321,25 +328,25 @@ impl PyMorpheme { .clone_ref(py) } - /// Returns the id of the part of speech in the dictionary + /// Returns the id of the part of speech in the dictionary. #[pyo3(text_signature="(self, /) -> int")] pub fn part_of_speech_id(&self, py: Python) -> u16 { self.morph(py).part_of_speech_id() } - /// Returns the dictionary form + /// Returns the dictionary form. #[pyo3(text_signature="(self, /) -> str")] fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().dictionary_form().into_py(py) } - /// Returns the normalized form + /// Returns the normalized form. #[pyo3(text_signature="(self, /) -> str")] fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().normalized_form().into_py(py) } - /// Returns the reading form + /// Returns the reading form. #[pyo3(text_signature="(self, /) -> str")] fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().reading_form().into_py(py) @@ -347,13 +354,14 @@ impl PyMorpheme { /// Returns sub-morphemes in the provided split mode. /// - /// :param mode: mode of new split - /// :param out: write results to this MorhpemeList instead of creating new one + /// :param mode: mode of new split. + /// :param out: write results to this MorhpemeList instead of creating new one. 
/// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for /// more information on output parameters. /// Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter. /// :param add_single: return lists with the current morpheme if the split hasn't produced any elements. /// When False is passed, empty lists are returned instead. + /// /// :type mode: sudachipy.SplitMode /// :type out: Optional[sudachipy.MorphemeList] /// :type add_single: bool @@ -401,19 +409,19 @@ impl PyMorpheme { Ok(out_cell) } - /// Returns whether if this is out of vocabulary word + /// Returns whether if this is out of vocabulary word. #[pyo3(text_signature="(self, /) -> bool")] fn is_oov(&self, py: Python) -> bool { self.morph(py).is_oov() } - /// Returns word id of this word in the dictionary + /// Returns word id of this word in the dictionary. #[pyo3(text_signature="(self, /) -> int")] fn word_id(&self, py: Python) -> u32 { self.morph(py).word_id().as_raw() } - /// Returns the dictionary id which this word belongs + /// Returns the dictionary id which this word belongs. #[pyo3(text_signature="(self, /) -> int")] fn dictionary_id(&self, py: Python) -> i32 { let word_id = self.morph(py).word_id(); @@ -424,7 +432,7 @@ impl PyMorpheme { } } - /// Returns the list of synonym group ids + /// Returns the list of synonym group ids. #[pyo3(text_signature="(self, /) -> List[int]")] fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { let mref = self.morph(py); @@ -432,7 +440,7 @@ impl PyMorpheme { PyList::new(py, ids) } - /// Returns the word info + /// Returns the word info. 
#[pyo3(text_signature="(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; @@ -441,7 +449,7 @@ impl PyMorpheme { Ok(self.morph(py).get_word_info().clone().into()) } - /// Returns morpheme length in codepoints + /// Returns morpheme length in codepoints. pub fn __len__(&self, py: Python) -> usize { let m = self.morph(py); m.end_c() - m.begin_c() diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 586c7d90..16d1fa56 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -30,7 +30,7 @@ use crate::morpheme::PyMorpheme; /// /// Create using Dictionary.pos_matcher method. /// -/// Use `__call__(m: Morpheme) -> bool` to check if given morpheme matches the PosMatcher. +/// Use `__call__(m: Morpheme) -> bool` to check whether a morpheme has matching POS. #[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcher")] pub struct PyPosMatcher { matcher: PosMatcher, @@ -123,6 +123,10 @@ impl PyPosMatcher { #[pymethods] impl PyPosMatcher { + /// Checks whether a morpheme has matching POS. + /// + /// :param m: morpheme. + /// :return: if morpheme has matching POS. pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool { let pos_id = m.part_of_speech_id(py); self.matcher.matches_id(pos_id) @@ -140,6 +144,7 @@ impl PyPosMatcher { self.matcher.num_entries() } + /// Returns a POS matcher which matches a POS if any of two matchers would match it. pub fn __or__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -153,6 +158,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches a POS if both matchers would match it at the same time. pub fn __and__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -166,6 +172,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS. 
pub fn __sub__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -179,6 +186,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher. pub fn __invert__(&self) -> Self { let max_id = self.dic.pos.len(); // map -> filter chain is needed to handle exactly u16::MAX POS entries diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 16f2482a..8c7c1c84 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -35,7 +35,7 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// B == middle mode /// C == long mode /// -/// :param mode: str to parse. One of [A,B,C] in captital or lower case. +/// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -88,7 +88,7 @@ impl PySplitMode { } } -/// Sudachi Tokenizer +/// A sudachi tokenizer /// /// Create using Dictionary.create method. #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")] @@ -123,15 +123,15 @@ impl PyTokenizer { /// Break text into morphemes. /// - /// SudachiPy 0.5.* had logger parameter, it is accepted, but ignored. - /// - /// :param text: text to analyze + /// :param text: text to analyze. /// :param mode: analysis mode. /// This parameter is deprecated. /// Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes. /// If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead. + /// :param logger: Arg for v0.5.* compatibility. Ignored. /// :param out: tokenization results will be written into this MorphemeList, a new one will be created instead. /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. 
+ /// :type text: str
 /// :type mode: sudachipy.SplitMode
 /// :type out: sudachipy.MorphemeList

From 706a573311551542cc726486daabfb10bf2c5966 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 13:53:56 +0900
Subject: [PATCH 11/24] add type fields for rs

---
 python/src/build.rs       | 26 ++++++++++++++++++++++--
 python/src/dictionary.rs  | 36 ++++++++++++++++++++++++++-------
 python/src/morpheme.rs    | 42 +++++++++++++++++++--------------------
 python/src/pos_matcher.rs |  4 +++-
 python/src/tokenizer.rs   | 14 +++++++++----
 5 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/python/src/build.rs b/python/src/build.rs
index 350f2fb3..2b2ce94f 100644
--- a/python/src/build.rs
+++ b/python/src/build.rs
@@ -59,8 +59,19 @@ fn create_file(p: &Path) -> std::io::Result<File> {
 }
 
 /// Build system dictionary from matrix and lexicons.
+///
+/// :param matrix: Path to the matrix file.
+/// :param lex: List of paths to lexicon files.
+/// :param output: Path to output built dictionary.
+/// :param description: A description text to embed in the dictionary.
+/// :return: A build report, list of (part, size, time).
+///
+/// :type matrix: pathlib.Path | str | bytes
+/// :type lex: list[pathlib.Path | str | bytes]
+/// :type output: pathlib.Path | str
+/// :type description: str
 #[pyfunction]
-#[pyo3(text_signature="(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
+#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
 fn build_system_dic<'p>(
     py: Python<'p>,
     matrix: &'p PyAny,
@@ -89,8 +100,19 @@ fn build_system_dic<'p>(
 }
 
 /// Build user dictionary from lexicons based on the given system dictionary.
+///
+/// :param system: Path to the system dictionary.
+/// :param lex: List of paths to lexicon files.
+/// :param output: Path to output built dictionary.
+/// :param description: A description text to embed in the dictionary.
+/// :return: A build report, list of (part, size, time). +/// +/// :type system: pathlib.Path | str +/// :type lex: list[pathlib.Path | str | bytes] +/// :type output: pathlib.Path | str +/// :type description: str #[pyfunction] -#[pyo3(text_signature="(system, lex, output, description=None) -> list[tuple[str, int, float]]")] +#[pyo3(text_signature = "(system, lex, output, description=None) -> list[tuple[str, int, float]]")] fn build_user_dic<'p>( py: Python<'p>, system: &'p PyAny, diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 5f1e8f65..2b5c849b 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -90,6 +90,12 @@ impl PyDicData { /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. +/// +/// :type config_path: Config | pathlib.Path | str | None +/// :type config: Config | pathlib.Path | str | None +/// :type resource_dir: pathlib.Path | str | None +/// :type dict: pathlib.Path | str | None +/// :type dict_type: pathlib.Path | str | None #[pyclass(module = "sudachipy.dictionary", name = "Dictionary")] #[derive(Clone)] pub struct PyDictionary { @@ -111,6 +117,12 @@ impl PyDictionary { /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. 
+ /// + /// :type config_path: Config | pathlib.Path | str | None + /// :type config: Config | pathlib.Path | str | None + /// :type resource_dir: pathlib.Path | str | None + /// :type dict: pathlib.Path | str | None + /// :type dict_type: pathlib.Path | str | None #[new] #[pyo3( text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary", @@ -235,6 +247,10 @@ impl PyDictionary { /// :param fields: load only a subset of fields. /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. /// :param projection: Projection override for created Tokenizer. See Config.projection for values. + /// + /// :type mode: SplitMode | str | None + /// :type fields: set[str] | None + /// :type projection: str | None #[pyo3( text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer", signature=(mode=None, fields=None, *, projection=None) @@ -277,7 +293,9 @@ impl PyDictionary { /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form. /// /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. - #[pyo3(text_signature="(self, /, target) -> PosMatcher")] + /// + /// :type target: Iterable[PartialPOS] | Callable[[POS], bool] + #[pyo3(text_signature = "(self, /, target) -> PosMatcher")] fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) } @@ -293,8 +311,10 @@ impl PyDictionary { /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. /// :param projection: Projection override for created Tokenizer. See Config.projection for values. 
/// - /// :type mode: SplitMode - /// :type fields: Set[str] + /// :type mode: SplitMode | str | None + /// :type fields: set[str] | None + /// :type handler: Callable[[int, NormalizedString, MorphemeList], list[NormalizedString]] | None + /// :type projection: str | None #[pyo3( text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer", signature=(mode=None, fields=None, handler=None, *, projection=None) @@ -352,8 +372,8 @@ impl PyDictionary { /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. /// /// :type surface: str - /// :type out: MorphemeList - #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")] + /// :type out: MorphemeList | None + #[pyo3(text_signature = "(self, /, surface, out=None) -> MorphemeList")] fn lookup<'p>( &'p self, py: Python<'p>, @@ -381,7 +401,7 @@ impl PyDictionary { } /// Close this dictionary. - #[pyo3(text_signature="(self, /) -> ()")] + #[pyo3(text_signature = "(self, /) -> ()")] fn close(&mut self) { self.dictionary = None; } @@ -390,7 +410,9 @@ impl PyDictionary { /// /// :param pos_id: POS id /// :return: POS tuple with the given id or None for non existing id. - #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")] + /// + /// :type pos_id: int + #[pyo3(text_signature = "(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")] fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { let dic = self.dictionary.as_ref().unwrap(); dic.pos.get(pos_id).map(|x| x.as_ref(py)) diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 0a18f6c4..522d8ecd 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -94,7 +94,7 @@ impl PyMorphemeListWrapper { impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary. 
#[classmethod] - #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")] + #[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn( @@ -113,13 +113,13 @@ impl PyMorphemeListWrapper { } /// Returns the total cost of the path. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn get_internal_cost(&self, py: Python) -> i32 { self.internal(py).get_internal_cost() } /// Returns the number of morpheme in this list. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn size(&self, py: Python) -> usize { self.internal(py).len() } @@ -282,14 +282,14 @@ impl PyMorpheme { #[pymethods] impl PyMorpheme { /// Returns the begin index of this in the input text. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn begin(&self, py: Python) -> usize { // call codepoint version self.morph(py).begin_c() } /// Returns the end index of this in the input text. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn end(&self, py: Python) -> usize { // call codepoint version self.morph(py).end_c() @@ -298,7 +298,7 @@ impl PyMorpheme { /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. /// /// See `Config.projection`. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { let list = self.list(py); let morph = self.morph(py); @@ -311,14 +311,14 @@ impl PyMorpheme { /// Returns the substring of input text corresponding to the morpheme regardless the configured projection. /// /// See `Config.projection`. 
- #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { PyString::new(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. /// Tuple elements are four POS levels, conjugation type and conjugation form. - #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")] + #[pyo3(text_signature = "(self, /) -> tuple[str, str, str, str, str, str]")] fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py<PyTuple> { let pos_id = self.part_of_speech_id(py); self.list(py) @@ -329,25 +329,25 @@ impl PyMorpheme { } /// Returns the id of the part of speech in the dictionary. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] pub fn part_of_speech_id(&self, py: Python) -> u16 { self.morph(py).part_of_speech_id() } /// Returns the dictionary form. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().dictionary_form().into_py(py) } /// Returns the normalized form. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().normalized_form().into_py(py) } /// Returns the reading form. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().reading_form().into_py(py) } @@ -362,12 +362,10 @@ impl PyMorpheme { /// :param add_single: return lists with the current morpheme if the split hasn't produced any elements. /// When False is passed, empty lists are returned instead. 
/// - /// :type mode: sudachipy.SplitMode - /// :type out: Optional[sudachipy.MorphemeList] + /// :type mode: SplitMode | None + /// :type out: MorphemeList | None /// :type add_single: bool - #[pyo3( - text_signature="(self, /, mode, out=None, add_single=False) -> MorphemeList" - )] + #[pyo3(text_signature = "(self, /, mode, out=None, add_single=False) -> MorphemeList")] fn split<'py>( &'py self, py: Python<'py>, @@ -410,19 +408,19 @@ impl PyMorpheme { } /// Returns whether if this is out of vocabulary word. - #[pyo3(text_signature="(self, /) -> bool")] + #[pyo3(text_signature = "(self, /) -> bool")] fn is_oov(&self, py: Python) -> bool { self.morph(py).is_oov() } /// Returns word id of this word in the dictionary. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn word_id(&self, py: Python) -> u32 { self.morph(py).word_id().as_raw() } /// Returns the dictionary id which this word belongs. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn dictionary_id(&self, py: Python) -> i32 { let word_id = self.morph(py).word_id(); if word_id.is_oov() { @@ -433,7 +431,7 @@ impl PyMorpheme { } /// Returns the list of synonym group ids. - #[pyo3(text_signature="(self, /) -> List[int]")] + #[pyo3(text_signature = "(self, /) -> List[int]")] fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { let mref = self.morph(py); let ids = mref.get_word_info().synonym_group_ids(); @@ -441,7 +439,7 @@ impl PyMorpheme { } /// Returns the word info. 
- #[pyo3(text_signature="(self, /) -> WordInfo")] + #[pyo3(text_signature = "(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?; diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 16d1fa56..bb9749f2 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -125,8 +125,10 @@ impl PyPosMatcher { impl PyPosMatcher { /// Checks whether a morpheme has matching POS. /// - /// :param m: morpheme. + /// :param m: a morpheme to check. /// :return: if morpheme has matching POS. + /// + /// :type m: Morpheme pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool { let pos_id = m.part_of_speech_id(py); self.matcher.matches_id(pos_id) diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 8c7c1c84..c14f7076 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -36,6 +36,9 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// C == long mode /// /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. +/// If None, returns SplitMode.C. +/// +/// :type mode: str | None #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -67,9 +70,12 @@ impl From<Mode> for PySplitMode { #[pymethods] impl PySplitMode { - /// Parse SplitMode from a character. + /// Creates a split mode from a string value. + /// + /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. + /// If None, returns SplitMode.C. /// - /// :param mode: str to parse. One of [A,B,C] in captital or lower case. 
+ /// :type mode: str | None #[new] #[pyo3( text_signature="(mode=None) -> SplitMode", @@ -133,8 +139,8 @@ impl PyTokenizer { /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. /// /// :type text: str - /// :type mode: sudachipy.SplitMode - /// :type out: sudachipy.MorphemeList + /// :type mode: SplitMode | str | None + /// :type out: MorphemeList #[pyo3( text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList", signature=(text, mode=None, logger=None, out=None) From 5d8620ee643096027a687275b26838cb70874a68 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 8 Jul 2024 14:16:45 +0900 Subject: [PATCH 12/24] improve pyi --- python/py_src/sudachipy/sudachipy.pyi | 47 ++++++++++++++++++--------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi index 705b62af..0b1c4fc2 100644 --- a/python/py_src/sudachipy/sudachipy.pyi +++ b/python/py_src/sudachipy/sudachipy.pyi @@ -28,12 +28,20 @@ PartialPOS = Union[ Tuple[()], ] -# Fields that can be specified for partial dictionary loading. -# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. +""" +Fields that can be specified for partial dictionary loading. +See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. +""" FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form", "word_structure", "split_a", "split_b", "synonym_group_id"]]] +""" +Strings that can be parsed as SplitMode +""" +SplitModeStr = Literal["A", "a", "B", "b", "C", "c"] + + class SplitMode: """ Unit to split text. @@ -48,11 +56,12 @@ class SplitMode: C: ClassVar[SplitMode] = ... @classmethod - def __init__(cls, mode: str = "C") -> None: + def __init__(cls, mode: Optional[SplitModeStr] = "C") -> None: """ Creates a split mode from a string value. 
:param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. + If None, returns SplitMode.C. """ ... @@ -88,10 +97,10 @@ class Dictionary: ... def create(self, - mode: Union[SplitMode, Literal["A", "B", "C"]] = SplitMode.C, - fields: FieldSet = None, + mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C, + fields: Optional[FieldSet] = None, *, - projection: str = None) -> Tokenizer: + projection: Optional[str] = None) -> Tokenizer: """ Creates a sudachi tokenizer. @@ -118,12 +127,12 @@ class Dictionary: ... def pre_tokenizer(self, - mode: Union[SplitMode, Literal["A", "B", "C"]] = "C", - fields: FieldSet = None, + mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C, + fields: Optional[FieldSet] = None, handler: Optional[Callable[[ int, object, MorphemeList], list]] = None, *, - projection: str = None) -> object: + projection: Optional[str] = None) -> object: """ Creates HuggingFace Tokenizers-compatible PreTokenizer. Requires package `tokenizers` to be installed. @@ -230,7 +239,10 @@ class Morpheme: """ ... - def split(self, mode: Union[SplitMode, Literal["A", "B", "C"]], out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList: + def split(self, + mode: Union[SplitMode, SplitModeStr], + out: Optional[MorphemeList] = None, + add_single: bool = True) -> MorphemeList: """ Returns sub-morphemes in the provided split mode. @@ -288,7 +300,7 @@ class MorphemeList: def __init__(self) -> None: ... @classmethod - def empty(cls, dict) -> MorphemeList: + def empty(cls, dict: Dictionary) -> MorphemeList: """ Returns an empty morpheme list with dictionary. """ @@ -306,7 +318,7 @@ class MorphemeList: """ ... - def __getitem__(self, index) -> Morpheme: ... + def __getitem__(self, index: int) -> Morpheme: ... def __iter__(self) -> Iterator[Morpheme]: ... def __len__(self) -> int: ... @@ -318,11 +330,13 @@ class Tokenizer: Create using Dictionary.create method. """ SplitMode: ClassVar[SplitMode] = ... 
+ @classmethod def __init__(cls) -> None: ... - def tokenize(self, text: str, - mode: Union[SplitMode, Literal["A", "B", "C"]] = ..., + def tokenize(self, + text: str, + mode: Union[SplitMode, SplitModeStr, None] = None, out: Optional[MorphemeList] = None) -> MorphemeList: """ Break text into morphemes. @@ -359,6 +373,7 @@ class WordInfo: surface: ClassVar[str] = ... synonym_group_ids: ClassVar[List[int]] = ... word_structure: ClassVar[List[int]] = ... + @classmethod def __init__(self) -> None: ... def length(self) -> int: ... @@ -374,11 +389,11 @@ class PosMatcher: def __iter__(self) -> Iterator[POS]: ... def __len__(self) -> int: ... - def __call__(self, m: Morpheme) -> bool: + def __call__(self, /, m: Morpheme) -> bool: """ Checks whether a morpheme has matching POS. - :param m: morpheme. + :param m: a morpheme to check. :return: if morpheme has matching POS. """ ... From 3fd3e52e268b6906fd9eda2e6ad8c84abc44979e Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 8 Jul 2024 11:04:18 +0900 Subject: [PATCH 13/24] use get_all for wordinfo --- python/src/word_info.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/python/src/word_info.rs b/python/src/word_info.rs index 4f74d0f1..eb51a28d 100644 --- a/python/src/word_info.rs +++ b/python/src/word_info.rs @@ -18,29 +18,18 @@ use pyo3::prelude::*; use sudachi::dic::lexicon::word_infos::{WordInfo, WordInfoData}; -#[pyclass(module = "sudachipy.wordinfo", name = "WordInfo")] +#[pyclass(module = "sudachipy.wordinfo", name = "WordInfo", get_all)] pub struct PyWordInfo { - #[pyo3(get)] surface: String, - #[pyo3(get)] head_word_length: u16, - #[pyo3(get)] pos_id: u16, - #[pyo3(get)] normalized_form: String, - #[pyo3(get)] dictionary_form_word_id: i32, - #[pyo3(get)] dictionary_form: String, - #[pyo3(get)] reading_form: String, - #[pyo3(get)] a_unit_split: Vec<u32>, - #[pyo3(get)] b_unit_split: Vec<u32>, - #[pyo3(get)] word_structure: Vec<u32>, - 
#[pyo3(get)] synonym_group_ids: Vec<u32>, } From d1c31655292adc80e1b9a1051bb4b90752500e6f Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 8 Jul 2024 14:51:00 +0900 Subject: [PATCH 14/24] add deprecated directive and fix --- python/py_src/sudachipy/sudachipy.pyi | 9 +++++++++ python/src/build.rs | 4 ++-- python/src/dictionary.rs | 4 ++-- python/src/morpheme.rs | 6 ++++++ python/src/tokenizer.rs | 2 ++ 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi index 0b1c4fc2..ca39a95c 100644 --- a/python/py_src/sudachipy/sudachipy.pyi +++ b/python/py_src/sudachipy/sudachipy.pyi @@ -47,7 +47,9 @@ class SplitMode: Unit to split text. A == short mode + B == middle mode + C == long mode """ @@ -205,6 +207,9 @@ class Morpheme: def get_word_info(self) -> WordInfo: """ Returns the word info. + + ..deprecated:: v0.6.0 + Users should not touch the raw WordInfo. """ ... @@ -293,6 +298,7 @@ class Morpheme: class MorphemeList: """ A list of morphemes. + An object can not be instantiated manually. Use Tokenizer.tokenize("") to create an empty morpheme list. """ @@ -303,6 +309,9 @@ class MorphemeList: def empty(cls, dict: Dictionary) -> MorphemeList: """ Returns an empty morpheme list with dictionary. + + .. deprecated:: + Use Tokenizer.tokenize("") if you need. """ ... diff --git a/python/src/build.rs b/python/src/build.rs index 2b2ce94f..b37ed807 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -65,7 +65,7 @@ fn create_file(p: &Path) -> std::io::Result<File> { /// :param output: Path to output built dictionray. /// :param description: A description text to embed in the dictionary. /// :return: A build report, list of (part, size, time). 
-/// +/// /// :type matrix: pathlib.Path | str | bytes /// :type lex: list[pathlib.Path | str | bytes] /// :type output: pathlib.Path | str @@ -106,7 +106,7 @@ fn build_system_dic<'p>( /// :param output: Path to output built dictionray. /// :param description: A description text to embed in the dictionary. /// :return: A build report, list of (part, size, time). -/// +/// /// :type system: pathlib.Path | str /// :type lex: list[pathlib.Path | str | bytes] /// :type output: pathlib.Path | str diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 2b5c849b..22241f95 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -87,7 +87,7 @@ impl PyDicData { /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. /// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. -/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. +/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. /// @@ -114,7 +114,7 @@ impl PyDictionary { /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. /// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. - /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict. + /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict. 
/// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. /// diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 522d8ecd..b9367e10 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -93,6 +93,9 @@ impl PyMorphemeListWrapper { #[pymethods] impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary. + /// + /// .. deprecated:: 0.6.0 + /// Use Tokenizer.tokenize("") if you need. #[classmethod] #[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> { @@ -439,6 +442,9 @@ impl PyMorpheme { } /// Returns the word info. + /// + /// ..deprecated:: v0.6.0 + /// Users should not touch the raw WordInfo. #[pyo3(text_signature = "(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index c14f7076..d96763de 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -32,7 +32,9 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// Unit to split text. /// /// A == short mode +/// /// B == middle mode +/// /// C == long mode /// /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. From 4a3da5bacd868112165ac5f3c5c49d5f82eba48f Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Wed, 25 Sep 2024 09:36:20 +0900 Subject: [PATCH 15/24] update Dictionary arg name --- python/README.md | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/python/README.md b/python/README.md index 4d95d7fb..b1ad3e5e 100644 --- a/python/README.md +++ b/python/README.md @@ -66,7 +66,7 @@ $ pip install sudachipy ### Step 2. Get a Dictionary -You can get dictionary as a Python package. 
It make take a while to download the dictionary file (around 70MB for the `core` edition). +You can get dictionary as a Python package. It may take a while to download the dictionary file (around 70MB for the `core` edition). ```bash $ pip install sudachidict_core @@ -209,7 +209,7 @@ There are three editions of Sudachi Dictionary, namely, `small`, `core`, and `fu SudachiPy uses `sudachidict_core` by default. -Dictionaries are installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`. +Dictionaries can be installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`. * [SudachiDict-small · PyPI](https://pypi.org/project/SudachiDict-small/) * [SudachiDict-core · PyPI](https://pypi.org/project/SudachiDict-core/) @@ -234,19 +234,19 @@ $ echo "外国人参政権" | sudachipy -s full ### Dictionary option: Python package -You can specify the dictionary with the `Dicionary()` argument; `config_path` or `dict_type`. +You can specify the dictionary with the `Dicionary()` argument; `config` or `dict`. ```python -class Dictionary(config_path=None, resource_dir=None, dict_type=None) +class Dictionary(config=None, resource_dir=None, dict=None) ``` -1. `config_path` - * You can specify the file path to the setting file with `config_path` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail). +1. `config` + * You can specify the file path to the setting file with `config` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail). * If the dictionary file is specified in the setting file as `systemDict`, SudachiPy will use the dictionary. -2. `dict_type` - * You can also specify the dictionary type with `dict_type`. - * The available arguments are `small`, `core`, or `full`. - * If different dictionaries are specified with `config_path` and `dict_type`, **a dictionary defined `dict_type` overrides** those defined in the config path. +2. 
`dict` + * You can also specify the dictionary type with `dict`. + * The available arguments are `small`, `core`, `full`, or a path to the dictionary file. + * If different dictionaries are specified with `config` and `dict`, **a dictionary defined `dict` overrides** those defined in the config. ```python from sudachipy import Dictionary @@ -255,16 +255,16 @@ from sudachipy import Dictionary tokenizer_obj = Dictionary().create() # The dictionary given by the `systemDict` key in the config file (/path/to/sudachi.json) will be used -tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json").create() +tokenizer_obj = Dictionary(config="/path/to/sudachi.json").create() -# The dictionary specified by `dict_type` will be set. -tokenizer_obj = Dictionary(dict_type="core").create() # sudachidict_core (same as default) -tokenizer_obj = Dictionary(dict_type="small").create() # sudachidict_small -tokenizer_obj = Dictionary(dict_type="full").create() # sudachidict_full +# The dictionary specified by `dict` will be used. +tokenizer_obj = Dictionary(dict="core").create() # sudachidict_core (same as default) +tokenizer_obj = Dictionary(dict="small").create() # sudachidict_small +tokenizer_obj = Dictionary(dict="full").create() # sudachidict_full -# The dictionary specified by `dict_type` overrides those defined in the config path. +# The dictionary specified by `dict` overrides those defined in the config. # In the following code, `sudachidict_full` will be used regardless of a dictionary defined in the config file. -tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json", dict_type="full").create() +tokenizer_obj = Dictionary(config="/path/to/sudachi.json", dict="full").create() ``` @@ -303,10 +303,8 @@ Then specify your `sudachi.json` with the `-r` option. $ sudachipy -r path/to/sudachi.json ``` - You can build a user dictionary with the subcommand `ubuild`. - ```bash $ sudachipy ubuild -h usage: sudachipy ubuild [-h] [-o file] [-d string] -s file file [file ...] 
From c943da8d452b28ff291f4671d1cab50b0078d3e9 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 10 Jun 2024 09:41:05 +0900 Subject: [PATCH 16/24] use crate::errors to send err --- python/src/build.rs | 4 +-- python/src/dictionary.rs | 67 +++++++++++++++++--------------------- python/src/errors.rs | 8 ++++- python/src/morpheme.rs | 31 +++++++----------- python/src/pos_matcher.rs | 4 +-- python/src/pretokenizer.rs | 4 +-- python/src/projection.rs | 18 ++++------ python/src/tokenizer.rs | 30 ++++++++--------- 8 files changed, 74 insertions(+), 92 deletions(-) diff --git a/python/src/build.rs b/python/src/build.rs index a6005b26..6b3bd0ca 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -142,8 +142,8 @@ fn as_data_source<'p>(py: Python<'p>, data: &'p PyAny) -> PyResult<DataSource<'p let data = data.downcast::<PyBytes>()?; Ok(DataSource::Data(data.as_bytes())) } else { - Err(pyo3::exceptions::PyValueError::new_err(format!( - "data source should can be only Path, bytes or str, was {}: {}", + errors::wrap(Err(format!( + "data source should be Path, bytes or str, was {}: {}", data, data.get_type() ))) diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index bc333c8e..802e23c2 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -24,7 +24,6 @@ use std::str::FromStr; use std::sync::Arc; use sudachi::analysis::Mode; -use crate::errors::{wrap, wrap_ctx, SudachiError as SudachiErr}; use sudachi::analysis::stateless_tokenizer::DictionaryAccess; use sudachi::config::{Config, ConfigBuilder, SurfaceProjection}; use sudachi::dic::dictionary::JapaneseDictionary; @@ -35,6 +34,7 @@ use sudachi::plugin::input_text::InputTextPlugin; use sudachi::plugin::oov::OovProviderPlugin; use sudachi::plugin::path_rewrite::PathRewritePlugin; +use crate::errors; use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; use crate::pos_matcher::PyPosMatcher; use crate::pretokenizer::PyPretokenizer; @@ -110,7 +110,7 
@@ impl PyDictionary { config: Option<&PyAny>, ) -> PyResult<Self> { if config.is_some() && config_path.is_some() { - return Err(SudachiErr::new_err("Both config and config_path options were specified at the same time, use one of them")); + return errors::wrap(Err("Both config and config_path options were specified at the same time, use one of them")); } let default_config = read_default_config(py)?; @@ -131,13 +131,10 @@ impl PyDictionary { }; if dict_type.is_some() { - let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; - PyErr::warn( + errors::warn_deprecation( py, - cat, "Parameter dict_type of Dictionary() is deprecated, use dict instead", - 1, - )?; + )? } let config_builder = match resource_dir { @@ -177,12 +174,10 @@ impl PyDictionary { } } - let jdic = JapaneseDictionary::from_cfg(&config).map_err(|e| { - SudachiErr::new_err(format!( - "Error while constructing dictionary: {}", - e.to_string() - )) - })?; + let jdic = errors::wrap_ctx( + JapaneseDictionary::from_cfg(&config), + "Error while constructing dictionary", + )?; let pos_data = jdic .grammar() @@ -238,7 +233,7 @@ impl PyDictionary { let mut required_fields = self.config.projection.required_subset(); let dict = self.dictionary.as_ref().unwrap().clone(); let projobj = if let Some(s) = projection { - let proj = wrap(SurfaceProjection::try_from(s.to_str()?))?; + let proj = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?; required_fields = proj.required_subset(); Some(morpheme_projection(proj, &dict)) } else { @@ -301,7 +296,7 @@ impl PyDictionary { let subset = parse_field_subset(fields)?; if let Some(h) = handler.as_ref() { if !h.as_ref(py).is_callable() { - return Err(SudachiErr::new_err("handler must be callable")); + return errors::wrap(Err("handler must be callable")); } } @@ -357,12 +352,12 @@ impl PyDictionary { // this needs to be a variable let mut borrow = l.try_borrow_mut(); let out_list = match borrow { - Err(_) => return Err(SudachiErr::new_err("out was 
used twice at the same time")), Ok(ref mut ms) => ms.internal_mut(py), + Err(_) => return errors::wrap(Err("out was used twice at the same time")), }; out_list.clear(); - wrap_ctx(out_list.lookup(surface, InfoSubset::all()), surface)?; + errors::wrap_ctx(out_list.lookup(surface, InfoSubset::all()), surface)?; Ok(l) } @@ -380,7 +375,7 @@ impl PyDictionary { } fn __repr__(&self) -> PyResult<String> { - wrap(config_repr(&self.config)) + errors::wrap(config_repr(&self.config)) } } @@ -413,18 +408,21 @@ fn config_repr(cfg: &Config) -> Result<String, std::fmt::Error> { pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &'py PyAny) -> PyResult<Mode> { if mode.is_instance_of::<PyString>() { - let mode = mode.str()?.to_str()?; - Mode::from_str(mode).map_err(|e| SudachiErr::new_err(e).into()) + errors::wrap(Mode::from_str(mode.str()?.to_str()?)) } else if mode.is_instance_of::<PySplitMode>() { let mode = mode.extract::<PySplitMode>()?; Ok(Mode::from(mode)) } else { - Err(SudachiErr::new_err(("unknown mode", mode.into_py(py)))) + errors::wrap(Err(format!( + "mode should be sudachipy.SplitMode or str, was {}: {}", + mode, + mode.get_type() + ))) } } fn read_config_from_fs(path: Option<&Path>) -> PyResult<ConfigBuilder> { - wrap(ConfigBuilder::from_opt_file(path)) + errors::wrap(ConfigBuilder::from_opt_file(path)) } fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> { @@ -433,13 +431,13 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> { // looks like json if config_str.starts_with("{") && config_str.ends_with("}") { let result = ConfigBuilder::from_bytes(config_str.as_bytes()); - return wrap(result); + return errors::wrap(result); } let p = Path::new(config_str); if p.exists() && p.is_file() { return read_config_from_fs(Some(p)); } - return Err(SudachiErr::new_err(format!( + return errors::wrap(Err(format!( "config file [{}] do not exist or is not a file", p.display() ))); @@ -450,9 +448,10 @@ fn read_config(config_opt: &PyAny) -> 
PyResult<ConfigBuilder> { let cfg_as_str = config_opt.call_method0("as_jsons")?; return read_config(cfg_as_str); } - Err(SudachiErr::new_err(( - format!("passed config was not a string, json object or sudachipy.config.Config object"), - config_opt.into_py(py), + errors::wrap(Err(format!( + "config should be sudachipy.Config or str which represents a file path or json obj, was {}: {}", + config_opt, + config_opt.get_type() ))) } @@ -460,7 +459,7 @@ pub(crate) fn read_default_config(py: Python) -> PyResult<ConfigBuilder> { let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?; let path = path.downcast::<PyString>()?.to_str()?; let path = PathBuf::from(path); - wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path) + errors::wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path) } pub(crate) fn get_default_resource_dir(py: Python) -> PyResult<PathBuf> { @@ -484,10 +483,7 @@ fn locate_system_dict(py: Python, path: &Path) -> PyResult<PathBuf> { } match path.to_str() { Some(name @ ("small" | "core" | "full")) => find_dict_path(py, name), - _ => Err(SudachiErr::new_err(format!( - "invalid dictionary path {:?}", - path - ))), + _ => errors::wrap(Err(format!("invalid dictionary path {:?}", path))), } } @@ -509,12 +505,7 @@ fn parse_field_subset(data: Option<&PySet>) -> PyResult<InfoSubset> { "split_a" => InfoSubset::SPLIT_A, "split_b" => InfoSubset::SPLIT_B, "synonym_group_id" => InfoSubset::SYNONYM_GROUP_ID, - x => { - return Err(SudachiErr::new_err(format!( - "Invalid WordInfo field name {}", - x - ))) - } + x => return errors::wrap(Err(format!("Invalid WordInfo field name {}", x))), }; } Ok(subset) diff --git a/python/src/errors.rs b/python/src/errors.rs index 04827fd4..da72601a 100644 --- a/python/src/errors.rs +++ b/python/src/errors.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +use pyo3::exceptions::PyDeprecationWarning; +use pyo3::prelude::*; use pyo3::{import_exception, PyResult}; use std::fmt::{Debug, Display}; @@ -33,3 +35,7 @@ pub fn wrap_ctx<T, E: Display, C: Debug + ?Sized>(v: Result<T, E>, ctx: &C) -> P Err(e) => Err(SudachiError::new_err(format!("{:?}: {}", ctx, e))), } } + +pub fn warn_deprecation(py: Python<'_>, msg: &str) -> PyResult<()> { + PyErr::warn(py, &py.get_type::<PyDeprecationWarning>(), msg, 1) +} diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index ad3929dd..fd097336 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -18,13 +18,14 @@ use std::fmt::Write; use std::ops::Deref; use std::sync::Arc; -use pyo3::exceptions::{PyException, PyIndexError}; +use pyo3::exceptions::PyIndexError; use pyo3::prelude::*; use pyo3::types::{PyList, PyString, PyTuple, PyType}; use sudachi::prelude::{Morpheme, MorphemeList}; use crate::dictionary::{extract_mode, PyDicData, PyDictionary}; +use crate::errors; use crate::projection::MorphemeProjection; use crate::word_info::PyWordInfo; @@ -92,12 +93,9 @@ impl PyMorphemeListWrapper { #[classmethod] #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> { - let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; - PyErr::warn( + errors::warn_deprecation( py, - cat, "Use Tokenizer.tokenize(\"\") if you need an empty MorphemeList.", - 1, )?; let cloned = dict.dictionary.as_ref().unwrap().clone(); @@ -176,9 +174,7 @@ impl PyMorphemeListWrapper { list: slf.clone_ref(py), index: i, }; - pymorph - .write_repr(py, &mut result) - .map_err(|_| PyException::new_err("format failed"))?; + errors::wrap_ctx(pymorph.write_repr(py, &mut result), "format failed")?; 
result.push_str(",\n"); } result.push_str("]>"); @@ -380,16 +376,14 @@ impl PyMorpheme { let mut borrow = out_cell.try_borrow_mut(); let out_ref = match borrow { Ok(ref mut v) => v.internal_mut(py), - Err(_) => return Err(PyException::new_err("out was used twice")), + Err(_) => return errors::wrap(Err("out was used twice at the same time")), }; out_ref.clear(); - let splitted = list - .internal(py) - .split_into(mode, self.index, out_ref) - .map_err(|e| { - PyException::new_err(format!("Error while splitting morpheme: {}", e.to_string())) - })?; + let splitted = errors::wrap_ctx( + list.internal(py).split_into(mode, self.index, out_ref), + "Error while splitting morpheme", + )?; if add_single.unwrap_or(true) && !splitted { list.internal(py) @@ -433,9 +427,7 @@ impl PyMorpheme { /// Returns the word info #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")] fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> { - let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; - PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?; - + errors::warn_deprecation(py, "Users should not touch the raw WordInfo.")?; Ok(self.morph(py).get_word_info().clone().into()) } @@ -451,8 +443,7 @@ impl PyMorpheme { pub fn __repr__<'py>(&'py self, py: Python<'py>) -> PyResult<String> { let mut result = String::new(); - self.write_repr(py, &mut result) - .map_err(|_| PyException::new_err("failed to format repr"))?; + errors::wrap_ctx(self.write_repr(py, &mut result), "failed to format repr")?; Ok(result) } } diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 7c6a884d..f0a53b64 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -16,7 +16,6 @@ use std::sync::Arc; -use pyo3::exceptions::PyException; use pyo3::prelude::*; use pyo3::types::{PyBool, PyIterator, PyTuple}; @@ -24,6 +23,7 @@ use sudachi::analysis::stateless_tokenizer::DictionaryAccess; use sudachi::pos::PosMatcher; use 
crate::dictionary::PyDicData; +use crate::errors; use crate::morpheme::PyMorpheme; #[pyclass(name = "PosMatcher", module = "sudachipy")] @@ -106,7 +106,7 @@ impl PyPosMatcher { } if start_len == data.len() { - Err(PyException::new_err(format!( + errors::wrap(Err(format!( "POS {:?} did not match any elements", elem.repr()? ))) diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index 755f040b..49cf1a29 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -15,7 +15,7 @@ */ use crate::dictionary::PyDicData; -use crate::errors::wrap; +use crate::errors; use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector}; use pyo3::intern; use pyo3::prelude::*; @@ -49,7 +49,7 @@ impl PerThreadPreTokenizer { pub fn tokenize(&mut self, data: &str) -> PyResult<()> { self.tokenizer.reset().push_str(data); - wrap(self.tokenizer.do_tokenize())?; + errors::wrap(self.tokenizer.do_tokenize())?; Ok(()) } diff --git a/python/src/projection.rs b/python/src/projection.rs index 8bea35be..7739c7bc 100644 --- a/python/src/projection.rs +++ b/python/src/projection.rs @@ -15,6 +15,7 @@ */ use crate::dictionary::PyDicData; +use crate::errors; use crate::morpheme::PyProjector; use pyo3::types::PyString; use pyo3::{PyResult, Python}; @@ -174,18 +175,13 @@ pub(crate) fn parse_projection_raw<D: DictionaryAccess>( value: &str, dict: &D, ) -> PyResult<(PyProjector, SurfaceProjection)> { - match SurfaceProjection::try_from(value) { - Ok(v) => { - if v == SurfaceProjection::Surface { - Ok((None, SurfaceProjection::Surface)) - } else { - Ok((Some(morpheme_projection(v, dict)), v)) - } + errors::wrap_ctx(SurfaceProjection::try_from(value).map(|v| { + if v == SurfaceProjection::Surface { + (None, SurfaceProjection::Surface) + } else { + (Some(morpheme_projection(v, dict)), v) } - Err(e) => Err(crate::errors::SudachiError::new_err(format!( - "invalid surface projection: {e:?}" - ))), - } + }), "invalid surface projection") } pub(crate) fn 
parse_projection_opt<D: DictionaryAccess>( diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 558d02cb..18ec0a63 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -26,7 +26,7 @@ use sudachi::dic::subset::InfoSubset; use sudachi::prelude::*; use crate::dictionary::{extract_mode, PyDicData}; -use crate::errors::SudachiError as SudachiPyErr; +use crate::errors; use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// Unit to split text @@ -74,11 +74,7 @@ impl PySplitMode { Some(m) => m, None => return Ok(PySplitMode::C), }; - - match Mode::from_str(mode) { - Ok(m) => Ok(m.into()), - Err(e) => Err(SudachiPyErr::new_err(e.to_string())), - } + errors::wrap(Mode::from_str(mode).map(|m| m.into())) } } @@ -151,12 +147,13 @@ impl PyTokenizer { }); // analysis can be done without GIL - let err = py.allow_threads(|| { - tokenizer.reset().push_str(text); - tokenizer.do_tokenize() - }); - - err.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?; + errors::wrap_ctx( + py.allow_threads(|| { + tokenizer.reset().push_str(text); + tokenizer.do_tokenize() + }), + "Error during tokenization", + )?; let out_list = match out { None => { @@ -172,12 +169,13 @@ impl PyTokenizer { let mut borrow = out_list.try_borrow_mut(); let morphemes = match borrow { Ok(ref mut ms) => ms.internal_mut(py), - Err(e) => return Err(SudachiPyErr::new_err("out was used twice at the same time")), + Err(_) => return errors::wrap(Err("out was used twice at the same time")), }; - morphemes - .collect_results(tokenizer.deref_mut()) - .map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?; + errors::wrap_ctx( + morphemes.collect_results(tokenizer.deref_mut()), + "Error during tokenization", + )?; Ok(out_list) } From a4a47e21c6b27ffd39cfa2dbebc4d51f85b1c0e3 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Fri, 25 Oct 2024 16:20:38 +0900 Subject: [PATCH 17/24] cargo fmt 
--- python/src/projection.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/python/src/projection.rs b/python/src/projection.rs index 7739c7bc..9140e747 100644 --- a/python/src/projection.rs +++ b/python/src/projection.rs @@ -175,13 +175,16 @@ pub(crate) fn parse_projection_raw<D: DictionaryAccess>( value: &str, dict: &D, ) -> PyResult<(PyProjector, SurfaceProjection)> { - errors::wrap_ctx(SurfaceProjection::try_from(value).map(|v| { - if v == SurfaceProjection::Surface { - (None, SurfaceProjection::Surface) - } else { - (Some(morpheme_projection(v, dict)), v) - } - }), "invalid surface projection") + errors::wrap_ctx( + SurfaceProjection::try_from(value).map(|v| { + if v == SurfaceProjection::Surface { + (None, SurfaceProjection::Surface) + } else { + (Some(morpheme_projection(v, dict)), v) + } + }), + "invalid surface projection", + ) } pub(crate) fn parse_projection_opt<D: DictionaryAccess>( From a33bc6b30061ca28c874a3c9d7f47e670054a92b Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Fri, 25 Oct 2024 17:43:23 +0900 Subject: [PATCH 18/24] add new line for the matrix size --- sudachi-cli/src/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index eea11ecf..eb2e716c 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -210,7 +210,7 @@ fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) { fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) { let conn = grammar.conn_matrix(); - write!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); + write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { From 848e637fc271735e3ede206a84d03b776722b708 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 28 Oct 2024 16:03:29 +0900 Subject: [PATCH 19/24] dump pos_id --- 
sudachi-cli/src/build.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index eb2e716c..62248809 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -196,7 +196,8 @@ fn dump_part(dict: PathBuf, part: String, output: PathBuf) { } fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) { - for p in grammar.pos_list.iter() { + for (id, p) in grammar.pos_list.iter().enumerate() { + write!(w, "{},", id).unwrap(); for (i, e) in p.iter().enumerate() { w.write_all(e.as_bytes()).unwrap(); if (i + 1) == p.len() { From ec9a0f4dff64d7a6792504ef7e37ebcf57bdc978 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Mon, 28 Oct 2024 17:16:55 +0900 Subject: [PATCH 20/24] dump winfo in lexicon format --- sudachi-cli/src/build.rs | 97 +++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 17 deletions(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index 62248809..ec7f4620 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -27,6 +27,7 @@ use sudachi::dic::build::report::DictPartReport; use sudachi::dic::build::DictBuilder; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::dic::grammar::Grammar; +use sudachi::dic::lexicon::word_infos::WordInfo; use sudachi::dic::lexicon_set::LexiconSet; use sudachi::dic::word_id::WordId; use sudachi::dic::DictionaryLoader; @@ -79,6 +80,7 @@ pub(crate) enum BuildCli { dict: PathBuf, part: String, output: PathBuf, + // todo: dump user dict }, } @@ -189,7 +191,7 @@ fn dump_part(dict: PathBuf, part: String, output: PathBuf) { match part.as_str() { "pos" => dump_pos(dict.grammar(), &mut writer), "matrix" => dump_matrix(dict.grammar(), &mut writer), - "winfo" => dump_word_info(dict.lexicon(), &mut writer).unwrap(), + "winfo" => dump_word_info(&dict, &mut writer).unwrap(), _ => unimplemented!(), } writer.flush().unwrap(); @@ -221,23 +223,28 @@ fn dump_matrix<W: 
Write>(grammar: &Grammar, w: &mut W) { } } -fn dump_word_info<W: Write>(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { +fn dump_word_info<W: Write>(dict: &dyn DictionaryAccess, w: &mut W) -> SudachiResult<()> { + let grammar = dict.grammar(); + let lex = dict.lexicon(); let size = lex.size(); for i in 0..size { let wid = WordId::checked(0, i)?; let (left, right, cost) = lex.get_word_param(wid); let winfo = lex.get_word_info(wid)?; + write!(w, "{},", unicode_escape(winfo.surface()))?; write!(w, "{},{},{},", left, right, cost)?; - write!(w, "{},", winfo.surface())?; - write!(w, "{},", winfo.head_word_length())?; - write!(w, "{},", winfo.normalized_form())?; - write!(w, "{},", winfo.dictionary_form_word_id())?; - write!(w, "{},", winfo.reading_form())?; - dump_wids(w, winfo.a_unit_split())?; + write!(w, "{},", unicode_escape(winfo.surface()))?; // writing + write!(w, "{},", pos_string(grammar, winfo.pos_id()))?; + write!(w, "{},", unicode_escape(winfo.reading_form()))?; + write!(w, "{},", unicode_escape(winfo.normalized_form()))?; + let dict_form = dictionary_form_string(grammar, lex, winfo.dictionary_form_word_id()); + write!(w, "{},", dict_form)?; + write!(w, "{},", split_mode(&winfo))?; + dump_wids(w, grammar, lex, winfo.a_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.b_unit_split())?; + dump_wids(w, grammar, lex, winfo.b_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.word_structure())?; + dump_wids(w, grammar, lex, winfo.word_structure())?; w.write_all(b",")?; dump_gids(w, winfo.synonym_group_ids())?; w.write_all(b"\n")?; @@ -245,23 +252,79 @@ fn dump_word_info<W: Write>(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { Ok(()) } -fn dump_wids<W: Write>(w: &mut W, data: &[WordId]) -> SudachiResult<()> { +fn unicode_escape(raw: &str) -> String { + // replace '"' and ',' + let escaped = raw + .to_string() + .replace("\"", "\\u0022") + .replace(",", "\\u002c"); + escaped +} + +fn split_mode(winfo: &WordInfo) -> &str { + // todo: check + let 
asplits = winfo.a_unit_split(); + if asplits.len() == 0 { + return "A"; + } + let bsplits = winfo.b_unit_split(); + if bsplits.len() == 0 { + return "B"; + } + return "C"; +} + +fn pos_string(grammar: &Grammar, posid: u16) -> String { + let pos_parts = grammar.pos_components(posid); + pos_parts.join(",") +} + +fn dictionary_form_string(grammar: &Grammar, lex: &LexiconSet, wid: i32) -> String { + if wid < 0 { + return "*".to_string(); + } + let wid_with_dic = WordId::checked(0, wid as u32).expect("invalid wordid"); + format!("\"{}\"", wordref_string(grammar, lex, &wid_with_dic)) +} + +fn wordref_string(grammar: &Grammar, lex: &LexiconSet, wid: &WordId) -> String { + let winfo = lex.get_word_info(*wid).expect("failed to get wordinfo"); + format!( + "{},{},{}", + unicode_escape(winfo.surface()), + pos_string(grammar, winfo.pos_id()), + unicode_escape(winfo.reading_form()), + ) +} + +fn dump_wids<W: Write>( + w: &mut W, + grammar: &Grammar, + lex: &LexiconSet, + data: &[WordId], +) -> SudachiResult<()> { + if data.len() == 0 { + write!(w, "*")?; + return Ok(()); + } + w.write_all(b"\"")?; for (i, e) in data.iter().enumerate() { - let prefix = match e.dic() { - 0 => "", - _ => "U", - }; - write!(w, "{}{}", prefix, e.word())?; + write!(w, "{}", wordref_string(grammar, lex, e))?; if i + 1 != data.len() { w.write_all(b"/")?; } } + w.write_all(b"\"")?; Ok(()) } fn dump_gids<W: Write>(w: &mut W, data: &[u32]) -> SudachiResult<()> { + if data.len() == 0 { + write!(w, "*")?; + return Ok(()); + } for (i, e) in data.iter().enumerate() { - write!(w, "{}", e)?; + write!(w, "{:06}", e)?; if i + 1 != data.len() { w.write_all(b"/")?; } From d11cafc56f675345156c6cb5d98854bed7bc4e7b Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Tue, 29 Oct 2024 16:46:54 +0900 Subject: [PATCH 21/24] dump user dict --- sudachi-cli/src/build.rs | 110 ++++++++++++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 24 deletions(-) diff --git 
a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index ec7f4620..f1d1dc27 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -27,6 +27,7 @@ use sudachi::dic::build::report::DictPartReport; use sudachi::dic::build::DictBuilder; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::dic::grammar::Grammar; +use sudachi::dic::header::HeaderVersion; use sudachi::dic::lexicon::word_infos::WordInfo; use sudachi::dic::lexicon_set::LexiconSet; use sudachi::dic::word_id::WordId; @@ -77,10 +78,17 @@ pub(crate) enum BuildCli { #[command(name = "dump")] Dump { - dict: PathBuf, + /// target dictionary to dump + dictionary: PathBuf, + /// dump target (matrix, pos, winfo) part: String, + /// output file output: PathBuf, - // todo: dump user dict + + /// reference system dictionary. + /// required to dump winfo of an user dictionary + #[arg(short = 's', long = "system")] + system: Option<PathBuf>, }, } @@ -103,7 +111,12 @@ pub fn build_main(subcommand: BuildCli) { match subcommand { BuildCli::System { common, matrix } => build_system(common, matrix), BuildCli::User { common, dictionary } => build_user(common, dictionary), - BuildCli::Dump { dict, part, output } => dump_part(dict, part, output), + BuildCli::Dump { + dictionary, + part, + output, + system, + } => dump_part(dictionary, system, part, output), } } @@ -178,26 +191,30 @@ fn output_file(p: &Path) -> File { .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e)) } -fn dump_part(dict: PathBuf, part: String, output: PathBuf) { - let file = File::open(&dict).expect("open failed"); - let data = unsafe { Mmap::map(&file) }.expect("mmap failed"); +fn dump_part(dict: PathBuf, system: Option<PathBuf>, part: String, output: PathBuf) { + let file = File::open(&dict).expect("open dict failed"); + let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed"); let loader = unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary"); - let dict = 
loader.to_loaded().expect("should contain grammar"); let outf = output_file(&output); let mut writer = BufWriter::new(outf); match part.as_str() { - "pos" => dump_pos(dict.grammar(), &mut writer), - "matrix" => dump_matrix(dict.grammar(), &mut writer), - "winfo" => dump_word_info(&dict, &mut writer).unwrap(), + "pos" => dump_pos(loader, &mut writer), + "matrix" => dump_matrix(loader, &mut writer), + "winfo" => dump_word_info(loader, system, &mut writer).unwrap(), _ => unimplemented!(), } writer.flush().unwrap(); } -fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) { +fn dump_pos<W: Write>(dict: DictionaryLoader, w: &mut W) { + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); + for (id, p) in grammar.pos_list.iter().enumerate() { write!(w, "{},", id).unwrap(); for (i, e) in p.iter().enumerate() { @@ -211,10 +228,18 @@ fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) { } } -fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) { +fn dump_matrix<W: Write>(dict: DictionaryLoader, w: &mut W) { + if let HeaderVersion::UserDict(_) = dict.header.version { + panic!("user dictionary does not have connection matrix.") + } + + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); let conn = grammar.conn_matrix(); - write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); + write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { let cost = conn.cost(left as _, right as _); @@ -223,28 +248,66 @@ fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) { } } -fn dump_word_info<W: Write>(dict: &dyn DictionaryAccess, w: &mut W) -> SudachiResult<()> { - let grammar = dict.grammar(); - let lex = dict.lexicon(); - let size = lex.size(); +fn dump_word_info<W: Write>( + dict: DictionaryLoader, + system: Option<PathBuf>, + w: &mut W, +) -> SudachiResult<()> { + let is_user = 
match dict.header.version { + HeaderVersion::UserDict(_) => true, + HeaderVersion::SystemDict(_) => false, + }; + let did = if is_user { 1 } else { 0 }; + let size = dict.lexicon.size(); + + let data = system.map(|system_path| { + let file = File::open(&system_path).expect("open system failed"); + unsafe { Mmap::map(&file) }.expect("mmap system failed") + }); + let system = data.as_ref().map(|data| { + let loader = DictionaryLoader::read_system_dictionary(data) + .expect("failed to load system dictionary"); + loader + .to_loaded() + .expect("failed to load system dictionary") + }); + + let (base, user) = if is_user { + ( + system.expect("system dictionary is required to dump user dictionary lexicon"), + Some(dict), + ) + } else { + (dict.to_loaded().expect("failed to load dictionary"), None) + }; + + let mut lex = base.lexicon_set; + let mut grammar = base.grammar; + if let Some(udic) = user { + lex.append(udic.lexicon, grammar.pos_list.len())?; + if let Some(g) = udic.grammar { + grammar.merge(g) + } + } + for i in 0..size { - let wid = WordId::checked(0, i)?; + let wid = WordId::checked(did, i)?; let (left, right, cost) = lex.get_word_param(wid); let winfo = lex.get_word_info(wid)?; write!(w, "{},", unicode_escape(winfo.surface()))?; write!(w, "{},{},{},", left, right, cost)?; write!(w, "{},", unicode_escape(winfo.surface()))?; // writing - write!(w, "{},", pos_string(grammar, winfo.pos_id()))?; + write!(w, "{},", pos_string(&grammar, winfo.pos_id()))?; write!(w, "{},", unicode_escape(winfo.reading_form()))?; write!(w, "{},", unicode_escape(winfo.normalized_form()))?; - let dict_form = dictionary_form_string(grammar, lex, winfo.dictionary_form_word_id()); + let dict_form = dictionary_form_string(&grammar, &lex, winfo.dictionary_form_word_id()); write!(w, "{},", dict_form)?; write!(w, "{},", split_mode(&winfo))?; - dump_wids(w, grammar, lex, winfo.a_unit_split())?; + dump_wids(w, &grammar, &lex, winfo.a_unit_split())?; w.write_all(b",")?; - dump_wids(w, grammar, 
lex, winfo.b_unit_split())?; + dump_wids(w, &grammar, &lex, winfo.b_unit_split())?; w.write_all(b",")?; - dump_wids(w, grammar, lex, winfo.word_structure())?; + dump_wids(w, &grammar, &lex, winfo.word_structure())?; w.write_all(b",")?; dump_gids(w, winfo.synonym_group_ids())?; w.write_all(b"\n")?; @@ -262,7 +325,6 @@ fn unicode_escape(raw: &str) -> String { } fn split_mode(winfo: &WordInfo) -> &str { - // todo: check let asplits = winfo.a_unit_split(); if asplits.len() == 0 { return "A"; From ecddc0beb899fb4cfcb9e2d36c45d6372f4ad90d Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Tue, 29 Oct 2024 16:51:53 +0900 Subject: [PATCH 22/24] fix clippy warnings --- sudachi-cli/src/build.rs | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index f1d1dc27..dbb03444 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -187,12 +187,12 @@ fn output_file(p: &Path) -> File { OpenOptions::new() .write(true) .create_new(true) - .open(&p) + .open(p) .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e)) } fn dump_part(dict: PathBuf, system: Option<PathBuf>, part: String, output: PathBuf) { - let file = File::open(&dict).expect("open dict failed"); + let file = File::open(dict).expect("open dict failed"); let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed"); let loader = unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary"); @@ -239,11 +239,11 @@ fn dump_matrix<W: Write>(dict: DictionaryLoader, w: &mut W) { let grammar = dict.grammar(); let conn = grammar.conn_matrix(); - write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); + writeln!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { let cost = conn.cost(left as _, right as _); - write!(w, "{} {} {}\n", left, right, 
cost).unwrap(); + writeln!(w, "{} {} {}", left, right, cost).unwrap(); } } } @@ -261,7 +261,7 @@ fn dump_word_info<W: Write>( let size = dict.lexicon.size(); let data = system.map(|system_path| { - let file = File::open(&system_path).expect("open system failed"); + let file = File::open(system_path).expect("open system failed"); unsafe { Mmap::map(&file) }.expect("mmap system failed") }); let system = data.as_ref().map(|data| { @@ -317,23 +317,21 @@ fn dump_word_info<W: Write>( fn unicode_escape(raw: &str) -> String { // replace '"' and ',' - let escaped = raw - .to_string() - .replace("\"", "\\u0022") - .replace(",", "\\u002c"); - escaped + raw.to_string() + .replace('"', "\\u0022") + .replace(',', "\\u002c") } fn split_mode(winfo: &WordInfo) -> &str { let asplits = winfo.a_unit_split(); - if asplits.len() == 0 { + if asplits.is_empty() { return "A"; } let bsplits = winfo.b_unit_split(); - if bsplits.len() == 0 { + if bsplits.is_empty() { return "B"; } - return "C"; + "C" } fn pos_string(grammar: &Grammar, posid: u16) -> String { @@ -365,7 +363,7 @@ fn dump_wids<W: Write>( lex: &LexiconSet, data: &[WordId], ) -> SudachiResult<()> { - if data.len() == 0 { + if data.is_empty() { write!(w, "*")?; return Ok(()); } @@ -381,7 +379,7 @@ fn dump_wids<W: Write>( } fn dump_gids<W: Write>(w: &mut W, data: &[u32]) -> SudachiResult<()> { - if data.len() == 0 { + if data.is_empty() { write!(w, "*")?; return Ok(()); } From 75cda40da26e0917c55a2f3a9e78421b9b6f9399 Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Thu, 7 Nov 2024 17:24:02 +0900 Subject: [PATCH 23/24] add note to the help of pycli -d option and warn on its use --- python/py_src/sudachipy/command_line.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/python/py_src/sudachipy/command_line.py b/python/py_src/sudachipy/command_line.py index 07f59c19..e5cd87d1 100644 --- a/python/py_src/sudachipy/command_line.py +++ 
b/python/py_src/sudachipy/command_line.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 Works Applications Co., Ltd. +# Copyright (c) 2019-2024 Works Applications Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,6 +24,13 @@ from . import sudachipy +logging.basicConfig( + style="{", + format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', + datefmt="%m-%d-%Y %H:%M:%S", +) + + def _set_default_subparser(self, name, args=None): """ copy and modify code from https://bitbucket.org/ruamel/std.argparse @@ -97,14 +104,13 @@ def _command_tokenize(args, print_usage): if args.fpath_out: output = open(args.fpath_out, "w", encoding="utf-8") - stdout_logger = logging.getLogger(__name__) - handler = logging.StreamHandler(sys.stdout) - handler.setLevel(logging.DEBUG) - stdout_logger.addHandler(handler) - stdout_logger.setLevel(logging.DEBUG) - stdout_logger.propagate = False + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) print_all = args.a + debug = args.d + if debug: + logger.warning("-d option is not implemented in python.") try: dict_ = Dictionary(config_path=args.fpath_setting, @@ -217,7 +223,7 @@ def main(): parser_tk.add_argument("-a", action="store_true", help="print all of the fields") parser_tk.add_argument("-d", action="store_true", - help="print the debug information") + help="print the debug information (not implemented yet)") parser_tk.add_argument("-v", "--version", action="store_true", dest="version", help="print sudachipy version") parser_tk.add_argument("in_files", metavar="file", From 1cad6c95f463e83c6a774e145ff8c55b93ad580c Mon Sep 17 00:00:00 2001 From: mh-northlander <mh.northlander+github@gmail.com> Date: Thu, 7 Nov 2024 17:24:21 +0900 Subject: [PATCH 24/24] rename pos_list and fmt --- python/py_src/sudachipy/command_line.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git 
a/python/py_src/sudachipy/command_line.py b/python/py_src/sudachipy/command_line.py index e5cd87d1..e7574bf1 100644 --- a/python/py_src/sudachipy/command_line.py +++ b/python/py_src/sudachipy/command_line.py @@ -58,7 +58,7 @@ def _set_default_subparser(self, name, args=None): argparse.ArgumentParser.set_default_subparser = _set_default_subparser -def run(tokenizer, input_, output, print_all, morphs, is_stdout): +def run(tokenizer, input_, output, print_all, pos_list, is_stdout): # get an empty MorphemeList for memory reuse mlist = tokenizer.tokenize("") for line in input_: @@ -67,7 +67,7 @@ def run(tokenizer, input_, output, print_all, morphs, is_stdout): for m in tokenizer.tokenize(line, out=mlist): list_info = [ m.surface(), - morphs[m.part_of_speech_id()], + pos_list[m.part_of_speech_id()], m.normalized_form()] if print_all: list_info += [ @@ -116,14 +116,15 @@ def _command_tokenize(args, print_usage): dict_ = Dictionary(config_path=args.fpath_setting, dict_type=args.system_dict_type) # empty matcher - get all POS tags - all_morphs = dict_.pos_matcher([()]) + all_pos_matcher = dict_.pos_matcher([()]) # precompute output POS strings - morphs = [",".join(ms) for ms in all_morphs] + pos_list = [",".join(ms) for ms in all_pos_matcher] tokenizer_obj = dict_.create(mode=args.mode) input_ = fileinput.input( args.in_files, openhook=fileinput.hook_encoded("utf-8")) - run(tokenizer_obj, input_, output, print_all, morphs, is_stdout=args.fpath_out is None) + run(tokenizer_obj, input_, output, print_all, + pos_list, is_stdout=args.fpath_out is None) finally: if args.fpath_out: output.close() @@ -145,7 +146,8 @@ def _command_build(args, print_usage): out_file = Path(args.out_file) if out_file.exists(): - print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr) + print("File", out_file, + "already exists, refusing to overwrite it", file=sys.stderr) return description = args.description or "" @@ -167,7 +169,8 @@ def _command_build(args, print_usage): 
def _command_user_build(args, print_usage): system = Path(args.system_dic) if not system.exists(): - print("System dictionary file", system, "does not exist", file=sys.stderr) + print("System dictionary file", system, + "does not exist", file=sys.stderr) return print_usage() in_files = [] @@ -180,7 +183,8 @@ def _command_user_build(args, print_usage): out_file = Path(args.out_file) if out_file.exists(): - print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr) + print("File", out_file, + "already exists, refusing to overwrite it", file=sys.stderr) return description = args.description or ""