From 113ad4cea2c77ea08ed097a299fa08ebb129da01 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Tue, 4 Jun 2024 11:01:37 +0900
Subject: [PATCH 01/24] update pyo3 to v0.21

---
 python/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/Cargo.toml b/python/Cargo.toml
index e1143743..4c5513d9 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -15,7 +15,7 @@ name = "sudachipy"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "0.20", features = ["extension-module"] }
+pyo3 = { version = "0.21", features = ["extension-module", "gil-refs"] }
 thread_local = "1.1" # Apache 2.0/MIT
 scopeguard = "1" # Apache 2.0/MIT
 

From 4d0d1c0d253af545e0a6006b012e2d74ca0fff59 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Tue, 4 Jun 2024 16:07:09 +0900
Subject: [PATCH 02/24] use Bound instead of PyCell

---
 python/src/dictionary.rs   | 24 ++++++++++++------------
 python/src/morpheme.rs     | 10 +++++-----
 python/src/pos_matcher.rs  |  4 ++--
 python/src/pretokenizer.rs | 12 ++++++------
 python/src/tokenizer.rs    |  8 ++++----
 5 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index bc333c8e..9a6f062d 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021-2023 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -319,7 +319,7 @@ impl PyDictionary {
 
         let projector = resolve_projection(passed, &dict.projection);
         let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector);
-        let internal_cell = PyCell::new(py, internal)?;
+        let internal_cell = Bound::new(py, internal)?;
         let module = py.import("tokenizers.pre_tokenizers")?;
         module
             .getattr("PreTokenizer")?
@@ -340,18 +340,18 @@ impl PyDictionary {
     /// :type surface: str
     /// :type out: sudachipy.MorphemeList
     #[pyo3(text_signature = "($self, surface, out = None) -> sudachipy.MorphemeList")]
-    fn lookup<'p>(
-        &'p self,
-        py: Python<'p>,
-        surface: &'p str,
-        out: Option<&'p PyCell<PyMorphemeListWrapper>>,
-    ) -> PyResult<&'p PyCell<PyMorphemeListWrapper>> {
+    fn lookup<'py>(
+        &'py self,
+        py: Python<'py>,
+        surface: &'py str,
+        out: Option<Bound<'py, PyMorphemeListWrapper>>,
+    ) -> PyResult<Bound<'py, PyMorphemeListWrapper>> {
         let l = match out {
             Some(l) => l,
-            None => PyCell::new(
-                py,
-                PyMorphemeListWrapper::new(self.dictionary.clone().unwrap()),
-            )?,
+            None => {
+                let list = PyMorphemeListWrapper::new(self.dictionary.clone().unwrap());
+                Bound::new(py, list)?
+            }
         };
 
         // this needs to be a variable
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index ad3929dd..1c8cf553 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -124,7 +124,7 @@ impl PyMorphemeListWrapper {
         self.size(py)
     }
 
-    fn __getitem__(slf: &PyCell<PyMorphemeListWrapper>, mut idx: isize) -> PyResult<PyMorpheme> {
+    fn __getitem__(slf: Bound<PyMorphemeListWrapper>, mut idx: isize) -> PyResult<PyMorpheme> {
         let list = slf.borrow();
         let py = slf.py();
         let len = list.size(py) as isize;
@@ -362,9 +362,9 @@ impl PyMorpheme {
         &'py self,
         py: Python<'py>,
         mode: &PyAny,
-        out: Option<&'py PyCell<PyMorphemeListWrapper>>,
+        out: Option<Bound<'py, PyMorphemeListWrapper>>,
         add_single: Option<bool>,
-    ) -> PyResult<&'py PyCell<PyMorphemeListWrapper>> {
+    ) -> PyResult<Bound<'py, PyMorphemeListWrapper>> {
         let list = self.list(py);
 
         let mode = extract_mode(py, mode)?;
@@ -372,7 +372,7 @@ impl PyMorpheme {
         let out_cell = match out {
             None => {
                 let list = list.empty_clone(py);
-                PyCell::new(py, list)?
+                Bound::new(py, list)?
             }
             Some(r) => r,
         };
diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs
index 7c6a884d..062d0d0c 100644
--- a/python/src/pos_matcher.rs
+++ b/python/src/pos_matcher.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -210,7 +210,7 @@ impl PyPosIter {
 
 #[pymethods]
 impl PyPosIter {
-    fn __iter__(slf: &PyCell<Self>) -> &PyCell<Self> {
+    fn __iter__(slf: Bound<Self>) -> Bound<Self> {
         slf
     }
 
diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs
index 755f040b..303f7645 100644
--- a/python/src/pretokenizer.rs
+++ b/python/src/pretokenizer.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -157,11 +157,11 @@ impl PyPretokenizer {
     }
 
     /// Entry function for tokenization
-    pub fn pre_tokenize<'p>(
-        self_: &'p PyCell<Self>,
-        py: Python<'p>,
-        data: &'p PyAny,
-    ) -> PyResult<&'p PyAny> {
+    pub fn pre_tokenize<'py>(
+        self_: Bound<'py, Self>,
+        py: Python<'py>,
+        data: &'py PyAny,
+    ) -> PyResult<&'py PyAny> {
         data.call_method1("split", PyTuple::new(py, [self_]))
     }
 }
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 558d02cb..5f364380 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -138,8 +138,8 @@ impl PyTokenizer {
         text: &'py str,
         mode: Option<&PyAny>,
         logger: Option<PyObject>,
-        out: Option<&'py PyCell<PyMorphemeListWrapper>>,
-    ) -> PyResult<&'py PyCell<PyMorphemeListWrapper>> {
+        out: Option<Bound<'py, PyMorphemeListWrapper>>,
+    ) -> PyResult<Bound<PyMorphemeListWrapper>> {
         // restore default mode on scope exit
         let mode = match mode {
             None => None,
@@ -164,7 +164,7 @@ impl PyTokenizer {
                 let morphemes = MorphemeList::empty(dict);
                 let wrapper =
                     PyMorphemeListWrapper::from_components(morphemes, self.projection.clone());
-                PyCell::new(py, wrapper)?
+                Bound::new(py, wrapper)?
             }
             Some(list) => list,
         };

From 2787346223d129f59adb1fc690789373acecd163 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Thu, 6 Jun 2024 10:34:58 +0900
Subject: [PATCH 03/24] deactivate gil-refs feature and fix related deprecation
 warnings

---
 python/Cargo.toml          |   2 +-
 python/src/build.rs        | 100 ++++++++++++++++++++++---------------
 python/src/dictionary.rs   |  83 +++++++++++++++---------------
 python/src/lib.rs          |   4 +-
 python/src/morpheme.rs     |  36 ++++++-------
 python/src/pos_matcher.rs  |  23 +++++----
 python/src/pretokenizer.rs |  53 ++++++++++----------
 python/src/projection.rs   |  35 ++++++-------
 python/src/tokenizer.rs    |   2 +-
 9 files changed, 183 insertions(+), 155 deletions(-)

diff --git a/python/Cargo.toml b/python/Cargo.toml
index 4c5513d9..6e564c2e 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -15,7 +15,7 @@ name = "sudachipy"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "0.21", features = ["extension-module", "gil-refs"] }
+pyo3 = { version = "0.21", features = ["extension-module"] }
 thread_local = "1.1" # Apache 2.0/MIT
 scopeguard = "1" # Apache 2.0/MIT
 
diff --git a/python/src/build.rs b/python/src/build.rs
index a6005b26..40e52c34 100644
--- a/python/src/build.rs
+++ b/python/src/build.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -26,17 +26,17 @@ use sudachi::config::Config;
 use sudachi::dic::build::{DataSource, DictBuilder};
 use sudachi::dic::dictionary::JapaneseDictionary;
 
-pub fn register_functions(m: &PyModule) -> PyResult<()> {
+pub fn register_functions(m: &Bound<PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(build_system_dic, m)?)?;
     m.add_function(wrap_pyfunction!(build_user_dic, m)?)?;
     Ok(())
 }
 
-fn to_stats<T: DictionaryAccess>(py: Python, builder: DictBuilder<T>) -> PyResult<&PyList> {
-    let stats = PyList::empty(py);
+fn to_stats<T: DictionaryAccess>(py: Python, builder: DictBuilder<T>) -> PyResult<Bound<PyList>> {
+    let stats = PyList::empty_bound(py);
 
     for p in builder.report() {
-        let t = PyTuple::new(
+        let t = PyTuple::new_bound(
             py,
             [
                 p.part().into_py(py),
@@ -60,23 +60,26 @@ fn create_file(p: &Path) -> std::io::Result<File> {
 
 #[pyfunction]
 #[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")]
-fn build_system_dic<'p>(
-    py: Python<'p>,
-    matrix: &'p PyAny,
-    lex: &'p PyList,
-    output: &'p PyAny,
+fn build_system_dic<'py>(
+    py: Python<'py>,
+    matrix: &Bound<'py, PyAny>,
+    lex: &Bound<'py, PyList>,
+    output: &Bound<'py, PyAny>,
     description: Option<&str>,
-) -> PyResult<&'p PyList> {
+) -> PyResult<Bound<'py, PyList>> {
     let mut builder = DictBuilder::new_system();
     description.map(|d| builder.set_description(d));
 
-    let matrix_src = as_data_source(py, matrix)?;
+    let matrix_path = resolve_as_pypathstr(py, matrix)?;
+    let matrix_src = as_data_source(matrix_path.as_ref(), matrix)?;
     errors::wrap_ctx(builder.read_conn(matrix_src), matrix)?;
     for f in lex.iter() {
-        let lex_src = as_data_source(py, &f)?;
+        let lex_path = resolve_as_pypathstr(py, &f)?;
+        let lex_src = as_data_source(lex_path.as_ref(), &f)?;
         errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?;
     }
-    let out_file = match as_data_source(py, output)? {
+    let out_path = resolve_as_pypathstr(py, output)?;
+    let out_file = match as_data_source(out_path.as_ref(), output)? {
         DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?,
         DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")),
     };
@@ -89,14 +92,15 @@ fn build_system_dic<'p>(
 
 #[pyfunction]
 #[pyo3(text_signature = "(system, lex, output, description=None) -> list")]
-fn build_user_dic<'p>(
-    py: Python<'p>,
-    system: &'p PyAny,
-    lex: &'p PyList,
-    output: &'p PyAny,
+fn build_user_dic<'py>(
+    py: Python<'py>,
+    system: &Bound<'py, PyAny>,
+    lex: &Bound<'py, PyList>,
+    output: &Bound<'py, PyAny>,
     description: Option<&str>,
-) -> PyResult<&'p PyList> {
-    let system_dic = match as_data_source(py, system)? {
+) -> PyResult<Bound<'py, PyList>> {
+    let system_path = resolve_as_pypathstr(py, system)?;
+    let system_dic = match as_data_source(system_path.as_ref(), system)? {
         DataSource::File(f) => {
             let resource_path = get_default_resource_dir(py)?;
             let cfg = Config::minimal_at(resource_path).with_system_dic(f);
@@ -113,10 +117,12 @@ fn build_user_dic<'p>(
     description.map(|d| builder.set_description(d));
 
     for f in lex.iter() {
-        let lex_src = as_data_source(py, &f)?;
+        let lex_path = resolve_as_pypathstr(py, &f)?;
+        let lex_src = as_data_source(lex_path.as_ref(), &f)?;
         errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?;
     }
-    let out_file = match as_data_source(py, output)? {
+    let out_path = resolve_as_pypathstr(py, output)?;
+    let out_file = match as_data_source(out_path.as_ref(), output)? {
         DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?,
         DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")),
     };
@@ -127,25 +133,39 @@ fn build_user_dic<'p>(
     to_stats(py, builder)
 }
 
-fn as_data_source<'p>(py: Python<'p>, data: &'p PyAny) -> PyResult<DataSource<'p>> {
-    let path = py
-        .import("pathlib")?
-        .getattr("Path")?
-        .downcast::<PyType>()?;
+fn resolve_as_pypathstr<'py>(
+    py: Python<'py>,
+    data: &Bound<'py, PyAny>,
+) -> PyResult<Option<Bound<'py, PyString>>> {
+    let binding = py.import_bound("pathlib")?.getattr("Path")?;
+    let path = binding.downcast::<PyType>()?;
     if data.is_instance(path)? {
-        let pypath = data.call_method0("resolve")?.str()?;
-        Ok(DataSource::File(Path::new(pypath.to_str()?)))
+        Ok(Some(data.call_method0("resolve")?.str()?))
     } else if data.is_instance_of::<PyString>() {
-        let pypath = data.str()?;
-        Ok(DataSource::File(Path::new(pypath.to_str()?)))
-    } else if data.is_instance_of::<PyBytes>() {
-        let data = data.downcast::<PyBytes>()?;
-        Ok(DataSource::Data(data.as_bytes()))
+        Ok(Some(data.str()?))
     } else {
-        Err(pyo3::exceptions::PyValueError::new_err(format!(
-            "data source should can be only Path, bytes or str, was {}: {}",
-            data,
-            data.get_type()
-        )))
+        Ok(None)
+    }
+}
+
+fn as_data_source<'py>(
+    resolved_path: Option<&'py Bound<'py, PyString>>,
+    original_obj: &'py Bound<'py, PyAny>,
+) -> PyResult<DataSource<'py>> {
+    match resolved_path {
+        Some(pystr) => Ok(DataSource::File(Path::new(pystr.to_str()?))),
+        None => {
+            if original_obj.is_instance_of::<PyBytes>() {
+                Ok(DataSource::Data(
+                    original_obj.downcast::<PyBytes>()?.as_bytes(),
+                ))
+            } else {
+                Err(pyo3::exceptions::PyValueError::new_err(format!(
+                    "data source should can be only Path, bytes or str, was {}: {}",
+                    original_obj,
+                    original_obj.get_type()
+                )))
+            }
+        }
     }
 }
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index 9a6f062d..251267ab 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -103,11 +103,11 @@ impl PyDictionary {
     #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))]
     fn new(
         py: Python,
-        config_path: Option<&PyAny>,
+        config_path: Option<&Bound<PyAny>>,
         resource_dir: Option<PathBuf>,
         dict: Option<&str>,
         dict_type: Option<&str>,
-        config: Option<&PyAny>,
+        config: Option<&Bound<PyAny>>,
     ) -> PyResult<Self> {
         if config.is_some() && config_path.is_some() {
             return Err(SudachiErr::new_err("Both config and config_path options were specified at the same time, use one of them"));
@@ -131,10 +131,10 @@ impl PyDictionary {
         };
 
         if dict_type.is_some() {
-            let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
-            PyErr::warn(
+            let cat = PyModule::import_bound(py, "builtins")?.getattr("DeprecationWarning")?;
+            PyErr::warn_bound(
                 py,
-                cat,
+                &cat,
                 "Parameter dict_type of Dictionary() is deprecated, use dict instead",
                 1,
             )?;
@@ -189,7 +189,7 @@ impl PyDictionary {
             .pos_list
             .iter()
             .map(|pos| {
-                let tuple: Py<PyTuple> = PyTuple::new(py, pos).into_py(py);
+                let tuple: Py<PyTuple> = PyTuple::new_bound(py, pos).into_py(py);
                 tuple
             })
             .collect();
@@ -226,9 +226,9 @@ impl PyDictionary {
     fn create<'py>(
         &'py self,
         py: Python<'py>,
-        mode: Option<&'py PyAny>,
-        fields: Option<&'py PySet>,
-        projection: Option<&'py PyString>,
+        mode: Option<&Bound<'py, PyAny>>,
+        fields: Option<&Bound<'py, PySet>>,
+        projection: Option<&Bound<'py, PyString>>,
     ) -> PyResult<PyTokenizer> {
         let mode = match mode {
             Some(m) => extract_mode(py, m)?,
@@ -263,7 +263,11 @@ impl PyDictionary {
     ///
     /// :param target: can be either a callable or list of POS partial tuples
     #[pyo3(text_signature = "($self, target)")]
-    fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> {
+    fn pos_matcher<'py>(
+        &'py self,
+        py: Python<'py>,
+        target: &Bound<'py, PyAny>,
+    ) -> PyResult<PyPosMatcher> {
         PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target)
     }
 
@@ -286,21 +290,21 @@ impl PyDictionary {
         text_signature = "($self, mode, fields, handler) -> tokenizers.PreTokenizer",
         signature = (mode = None, fields = None, handler = None, *, projection = None)
     )]
-    fn pre_tokenizer<'p>(
-        &'p self,
-        py: Python<'p>,
-        mode: Option<&PyAny>,
-        fields: Option<&PySet>,
+    fn pre_tokenizer<'py>(
+        &'py self,
+        py: Python<'py>,
+        mode: Option<&Bound<'py, PyAny>>,
+        fields: Option<&Bound<'py, PySet>>,
         handler: Option<Py<PyAny>>,
-        projection: Option<&PyString>,
-    ) -> PyResult<&'p PyAny> {
+        projection: Option<&Bound<'py, PyString>>,
+    ) -> PyResult<Bound<'py, PyAny>> {
         let mode = match mode {
             Some(m) => extract_mode(py, m)?,
             None => Mode::C,
         };
         let subset = parse_field_subset(fields)?;
         if let Some(h) = handler.as_ref() {
-            if !h.as_ref(py).is_callable() {
+            if !h.bind(py).is_callable() {
                 return Err(SudachiErr::new_err("handler must be callable"));
             }
         }
@@ -320,11 +324,11 @@ impl PyDictionary {
         let projector = resolve_projection(passed, &dict.projection);
         let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector);
         let internal_cell = Bound::new(py, internal)?;
-        let module = py.import("tokenizers.pre_tokenizers")?;
+        let module = py.import_bound("tokenizers.pre_tokenizers")?;
         module
             .getattr("PreTokenizer")?
             .getattr("custom")?
-            .call1(PyTuple::new(py, [internal_cell]))
+            .call1(PyTuple::new_bound(py, [internal_cell]))
     }
 
     /// Look up morphemes in the binary dictionary without performing the analysis.
@@ -374,9 +378,9 @@ impl PyDictionary {
 
     /// Get POS Tuple by its id
     #[pyo3(text_signature = "($self, pos_id: int)")]
-    fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> {
+    fn pos_of<'py>(&'py self, py: Python<'py>, pos_id: usize) -> Option<&Bound<'py, PyTuple>> {
         let dic = self.dictionary.as_ref().unwrap();
-        dic.pos.get(pos_id).map(|x| x.as_ref(py))
+        dic.pos.get(pos_id).map(|x| x.bind(py))
     }
 
     fn __repr__(&self) -> PyResult<String> {
@@ -411,10 +415,9 @@ fn config_repr(cfg: &Config) -> Result<String, std::fmt::Error> {
     Ok(result)
 }
 
-pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &'py PyAny) -> PyResult<Mode> {
+pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &Bound<'py, PyAny>) -> PyResult<Mode> {
     if mode.is_instance_of::<PyString>() {
-        let mode = mode.str()?.to_str()?;
-        Mode::from_str(mode).map_err(|e| SudachiErr::new_err(e).into())
+        Mode::from_str(mode.str()?.to_str()?).map_err(|e| SudachiErr::new_err(e).into())
     } else if mode.is_instance_of::<PySplitMode>() {
         let mode = mode.extract::<PySplitMode>()?;
         Ok(Mode::from(mode))
@@ -427,9 +430,10 @@ fn read_config_from_fs(path: Option<&Path>) -> PyResult<ConfigBuilder> {
     wrap(ConfigBuilder::from_opt_file(path))
 }
 
-fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
+fn read_config(config_opt: &Bound<PyAny>) -> PyResult<ConfigBuilder> {
     if config_opt.is_instance_of::<PyString>() {
-        let config_str = config_opt.str()?.to_str()?.trim();
+        let config_pystr = config_opt.str()?;
+        let config_str = config_pystr.to_str()?.trim();
         // looks like json
         if config_str.starts_with("{") && config_str.ends_with("}") {
             let result = ConfigBuilder::from_bytes(config_str.as_bytes());
@@ -445,10 +449,10 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
         )));
     }
     let py = config_opt.py();
-    let cfg_type = py.import("sudachipy.config")?.getattr("Config")?;
-    if config_opt.is_instance(cfg_type)? {
+    let cfg_type = py.import_bound("sudachipy.config")?.getattr("Config")?;
+    if config_opt.is_instance(&cfg_type)? {
         let cfg_as_str = config_opt.call_method0("as_jsons")?;
-        return read_config(cfg_as_str);
+        return read_config(&cfg_as_str);
     }
     Err(SudachiErr::new_err((
         format!("passed config was not a string, json object or sudachipy.config.Config object"),
@@ -457,24 +461,22 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
 }
 
 pub(crate) fn read_default_config(py: Python) -> PyResult<ConfigBuilder> {
-    let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?;
+    let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?;
     let path = path.downcast::<PyString>()?.to_str()?;
     let path = PathBuf::from(path);
     wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path)
 }
 
 pub(crate) fn get_default_resource_dir(py: Python) -> PyResult<PathBuf> {
-    let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?;
+    let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?;
     let path = path.downcast::<PyString>()?.to_str()?;
     Ok(PathBuf::from(path))
 }
 
 fn find_dict_path(py: Python, dict_type: &str) -> PyResult<PathBuf> {
-    let pyfunc = PyModule::import(py, "sudachipy")?.getattr("_find_dict_path")?;
-    let path = pyfunc
-        .call1((dict_type,))?
-        .downcast::<PyString>()?
-        .to_str()?;
+    let pyfunc = PyModule::import_bound(py, "sudachipy")?.getattr("_find_dict_path")?;
+    let path = pyfunc.call1((dict_type,))?;
+    let path = path.downcast::<PyString>()?.to_str()?;
     Ok(PathBuf::from(path))
 }
 
@@ -491,15 +493,14 @@ fn locate_system_dict(py: Python, path: &Path) -> PyResult<PathBuf> {
     }
 }
 
-fn parse_field_subset(data: Option<&PySet>) -> PyResult<InfoSubset> {
+fn parse_field_subset(data: Option<&Bound<PySet>>) -> PyResult<InfoSubset> {
     if data.is_none() {
         return Ok(InfoSubset::all());
     }
 
     let mut subset = InfoSubset::empty();
-    for el in data.unwrap().iter() {
-        let s = el.str()?.to_str()?;
-        subset |= match s {
+    for elem in data.unwrap().iter() {
+        subset |= match elem.str()?.to_str()? {
             "surface" => InfoSubset::SURFACE,
             "pos" | "pos_id" => InfoSubset::POS_ID,
             "normalized_form" => InfoSubset::NORMALIZED_FORM,
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 68a9c91d..f2c13703 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ mod word_info;
 
 /// module root
 #[pymodule]
-fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> {
+fn sudachipy(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
     m.add_class::<dictionary::PyDictionary>()?;
     m.add_class::<tokenizer::PySplitMode>()?;
     m.add_class::<tokenizer::PyTokenizer>()?;
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index 1c8cf553..69418d32 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -91,11 +91,11 @@ impl PyMorphemeListWrapper {
     /// Returns an empty morpheme list with dictionary
     #[classmethod]
     #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")]
-    fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> {
-        let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
-        PyErr::warn(
+    fn empty(_cls: &Bound<PyType>, py: Python, dict: &PyDictionary) -> PyResult<Self> {
+        let cat = PyModule::import_bound(py, "builtins")?.getattr("DeprecationWarning")?;
+        PyErr::warn_bound(
             py,
-            cat,
+            &cat,
             "Use Tokenizer.tokenize(\"\") if you need an empty MorphemeList.",
             1,
         )?;
@@ -150,7 +150,7 @@ impl PyMorphemeListWrapper {
         })
     }
 
-    fn __str__<'py>(&'py self, py: Python<'py>) -> &PyString {
+    fn __str__<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> {
         // do a simple tokenization __str__
         let list = self.internal(py);
         let mut result = String::with_capacity(list.surface().len() * 2);
@@ -161,10 +161,10 @@ impl PyMorphemeListWrapper {
                 result.push_str(" ");
             }
         }
-        PyString::new(py, result.as_str())
+        PyString::new_bound(py, result.as_str())
     }
 
-    fn __repr__(slf: Py<PyMorphemeListWrapper>, py: Python) -> PyResult<&PyString> {
+    fn __repr__(slf: Py<PyMorphemeListWrapper>, py: Python) -> PyResult<Bound<PyString>> {
         let self_ref = slf.borrow(py);
         let list = self_ref.internal(py);
         let mut result = String::with_capacity(list.surface().len() * 10);
@@ -182,7 +182,7 @@ impl PyMorphemeListWrapper {
             result.push_str(",\n");
         }
         result.push_str("]>");
-        Ok(PyString::new(py, result.as_str()))
+        Ok(PyString::new_bound(py, result.as_str()))
     }
 
     fn __iter__(slf: Py<Self>) -> PyMorphemeIter {
@@ -292,19 +292,19 @@ impl PyMorpheme {
 
     /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured
     #[pyo3(text_signature = "($self) -> str")]
-    fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
+    fn surface<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> {
         let list = self.list(py);
         let morph = self.morph(py);
         match list.projection() {
-            None => PyString::new(py, morph.surface().deref()),
+            None => PyString::new_bound(py, morph.surface().deref()),
             Some(proj) => proj.project(morph.deref(), py),
         }
     }
 
     /// Returns the substring of input text corresponding to the morpheme regardless the configured projection
     #[pyo3(text_signature = "($self) -> str")]
-    fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
-        PyString::new(py, self.morph(py).surface().deref())
+    fn raw_surface<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> {
+        PyString::new_bound(py, self.morph(py).surface().deref())
     }
 
     /// Returns the part of speech as a six-element tuple.
@@ -361,7 +361,7 @@ impl PyMorpheme {
     fn split<'py>(
         &'py self,
         py: Python<'py>,
-        mode: &PyAny,
+        mode: &Bound<'py, PyAny>,
         out: Option<Bound<'py, PyMorphemeListWrapper>>,
         add_single: Option<bool>,
     ) -> PyResult<Bound<'py, PyMorphemeListWrapper>> {
@@ -424,17 +424,17 @@ impl PyMorpheme {
 
     /// Returns the list of synonym group ids
     #[pyo3(text_signature = "($self) -> List[int]")]
-    fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList {
+    fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> Bound<PyList> {
         let mref = self.morph(py);
         let ids = mref.get_word_info().synonym_group_ids();
-        PyList::new(py, ids)
+        PyList::new_bound(py, ids)
     }
 
     /// Returns the word info
     #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")]
     fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> {
-        let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
-        PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?;
+        let cat = PyModule::import_bound(py, "builtins")?.getattr("DeprecationWarning")?;
+        PyErr::warn_bound(py, &cat, "Users should not touch the raw WordInfo.", 1)?;
 
         Ok(self.morph(py).get_word_info().clone().into())
     }
@@ -445,7 +445,7 @@ impl PyMorpheme {
         m.end_c() - m.begin_c()
     }
 
-    pub fn __str__<'py>(&'py self, py: Python<'py>) -> &'py PyString {
+    pub fn __str__<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> {
         self.surface(py)
     }
 
diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs
index 062d0d0c..f0753f4b 100644
--- a/python/src/pos_matcher.rs
+++ b/python/src/pos_matcher.rs
@@ -36,20 +36,20 @@ impl PyPosMatcher {
     pub(crate) fn create<'py>(
         py: Python<'py>,
         dic: &'py Arc<PyDicData>,
-        data: &'py PyAny,
+        data: &Bound<'py, PyAny>,
     ) -> PyResult<PyPosMatcher> {
         if data.is_callable() {
             Self::create_from_fn(dic, data, py)
         } else {
             let iter = data.iter()?;
-            Self::create_from_iter(dic, iter)
+            Self::create_from_iter(dic, &iter)
         }
     }
 
-    fn create_from_fn(dic: &Arc<PyDicData>, func: &PyAny, py: Python) -> PyResult<Self> {
+    fn create_from_fn(dic: &Arc<PyDicData>, func: &Bound<PyAny>, py: Python) -> PyResult<Self> {
         let mut data = Vec::new();
         for (pos_id, pos) in dic.pos.iter().enumerate() {
-            let args = PyTuple::new(py, &[pos]);
+            let args = PyTuple::new_bound(py, &[pos]);
             if func.call1(args)?.downcast::<PyBool>()?.is_true() {
                 data.push(pos_id as u16);
             }
@@ -60,10 +60,11 @@ impl PyPosMatcher {
         })
     }
 
-    fn create_from_iter(dic: &Arc<PyDicData>, data: &PyIterator) -> PyResult<Self> {
+    fn create_from_iter(dic: &Arc<PyDicData>, data: &Bound<PyIterator>) -> PyResult<Self> {
         let mut result = Vec::new();
         for item in data {
-            let item = item?.downcast::<PyTuple>()?;
+            let item = item?;
+            let item = item.downcast::<PyTuple>()?;
             Self::match_pos_elements(&mut result, dic.as_ref(), item)?;
         }
         Ok(Self {
@@ -72,7 +73,11 @@ impl PyPosMatcher {
         })
     }
 
-    fn match_pos_elements(data: &mut Vec<u16>, dic: &PyDicData, elem: &PyTuple) -> PyResult<()> {
+    fn match_pos_elements(
+        data: &mut Vec<u16>,
+        dic: &PyDicData,
+        elem: &Bound<PyTuple>,
+    ) -> PyResult<()> {
         let start_len = data.len();
 
         let elen = elem.len();
@@ -214,7 +219,7 @@ impl PyPosIter {
         slf
     }
 
-    fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&'py PyTuple> {
+    fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&Bound<'py, PyTuple>> {
         let idx = self.index;
         self.index += 1;
         if idx >= self.data.len() {
@@ -222,6 +227,6 @@ impl PyPosIter {
         }
         let pos_id = self.data[idx];
         let pos = &self.dic.pos[pos_id as usize];
-        Some(pos.as_ref(py))
+        Some(pos.bind(py))
     }
 }
diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs
index 303f7645..cd15b1b3 100644
--- a/python/src/pretokenizer.rs
+++ b/python/src/pretokenizer.rs
@@ -126,13 +126,14 @@ impl PyPretokenizer {
     ///
     /// Implementation uses Sudachi to perform the analysis, then uses slice method
     /// of the passed parameter to create output data
-    pub fn __call__<'p>(
-        &'p self,
-        py: Python<'p>,
-        index: &'p PyAny,
-        string: &'p PyAny,
-    ) -> PyResult<&'p PyAny> {
-        let input_data = string.str()?.to_str()?;
+    pub fn __call__<'py>(
+        &'py self,
+        py: Python<'py>,
+        index: &Bound<'py, PyAny>,
+        string: &Bound<'py, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let pystr = string.str()?;
+        let input_data = pystr.to_str()?;
         // tokenization itself should work without GIL, we have thread-local tokenizers here
         py.allow_threads(|| self.tokenizer_cell().borrow_mut().tokenize(input_data))?;
         // then prepare results with GIL
@@ -144,14 +145,14 @@ impl PyPretokenizer {
                 let py_ref = morphs.borrow(py);
                 let morphs = py_ref.internal(py);
                 match self.projection.as_deref() {
-                    None => make_result_for_surface(py, morphs, string),
-                    Some(p) => make_result_for_projection(py, morphs, p),
+                    None => make_result_for_surface(py, morphs, string).map(|bl| bl.into_any()),
+                    Some(p) => make_result_for_projection(py, morphs, p).map(|bl| bl.into_any()),
                 }
             }
             Some(h) => {
-                let mrp: &PyAny = morphs.as_ref(py);
-                let args = PyTuple::new(py, &[index, string, mrp]);
-                h.as_ref(py).call1(args)
+                let mrp: &Bound<PyAny> = morphs.bind(py);
+                let args = PyTuple::new_bound(py, &[index, string, mrp]);
+                h.bind(py).call1(args)
             }
         }
     }
@@ -160,22 +161,22 @@ impl PyPretokenizer {
     pub fn pre_tokenize<'py>(
         self_: Bound<'py, Self>,
         py: Python<'py>,
-        data: &'py PyAny,
-    ) -> PyResult<&'py PyAny> {
-        data.call_method1("split", PyTuple::new(py, [self_]))
+        data: &Bound<'py, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        data.call_method1("split", PyTuple::new_bound(py, [self_]))
     }
 }
 
 fn make_result_for_surface<'py>(
     py: Python<'py>,
     morphs: &PyMorphemeList,
-    string: &'py PyAny,
-) -> PyResult<&'py PyAny> {
-    let result = PyList::empty(py);
+    string: &Bound<'py, PyAny>,
+) -> PyResult<Bound<'py, PyList>> {
+    let result = PyList::empty_bound(py);
     for idx in 0..morphs.len() {
         let node = morphs.get(idx);
-        let slice = PySlice::new(py, node.begin_c() as isize, node.end_c() as isize, 1);
-        let args = PyTuple::new(py, [slice]);
+        let slice = PySlice::new_bound(py, node.begin_c() as isize, node.end_c() as isize, 1);
+        let args = PyTuple::new_bound(py, [slice]);
         let substring = string.call_method1(intern!(py, "slice"), args)?;
         result.append(substring)?;
     }
@@ -186,20 +187,20 @@ fn make_result_for_projection<'py>(
     py: Python<'py>,
     morphs: &PyMorphemeList,
     proj: &dyn MorphemeProjection,
-) -> PyResult<&'py PyAny> {
-    let result = PyList::empty(py);
+) -> PyResult<Bound<'py, PyList>> {
+    let result = PyList::empty_bound(py);
     let nstring = {
         static NORMALIZED_STRING: GILOnceCell<Py<PyType>> = pyo3::sync::GILOnceCell::new();
         NORMALIZED_STRING.get_or_try_init(py, || -> PyResult<Py<PyType>> {
-            let ns = py.import("tokenizers")?.getattr("NormalizedString")?;
-            let tpe = ns.downcast::<PyType>();
-            tpe.map(|x| x.into_py(py)).map_err(|e| e.into())
+            let ns = py.import_bound("tokenizers")?.getattr("NormalizedString")?;
+            let tpe = ns.downcast::<PyType>()?;
+            Ok(tpe.clone().unbind())
         })?
     };
     for idx in 0..morphs.len() {
         let node = morphs.get(idx);
         let value = proj.project(&node, py);
-        let args = PyTuple::new(py, [value]);
+        let args = PyTuple::new_bound(py, [value]);
         let substring = nstring.call1(py, args)?;
         result.append(substring)?;
     }
diff --git a/python/src/projection.rs b/python/src/projection.rs
index 8bea35be..8c7dd142 100644
--- a/python/src/projection.rs
+++ b/python/src/projection.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2023 Works Applications Co., Ltd.
+ *  Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 use crate::dictionary::PyDicData;
 use crate::morpheme::PyProjector;
+use pyo3::prelude::*;
 use pyo3::types::PyString;
 use pyo3::{PyResult, Python};
 use std::convert::TryFrom;
@@ -27,14 +28,14 @@ use sudachi::pos::PosMatcher;
 use sudachi::prelude::Morpheme;
 
 pub(crate) trait MorphemeProjection {
-    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString;
+    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString>;
 }
 
 struct Surface {}
 
 impl MorphemeProjection for Surface {
-    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString {
-        PyString::new(py, m.surface().deref())
+    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> {
+        PyString::new_bound(py, m.surface().deref())
     }
 }
 
@@ -43,8 +44,8 @@ struct Mapped<F: for<'a> Fn(&'a Morpheme<'a, Arc<PyDicData>>) -> &'a str> {
 }
 
 impl<F: for<'a> Fn(&'a Morpheme<'a, Arc<PyDicData>>) -> &'a str> MorphemeProjection for Mapped<F> {
-    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString {
-        PyString::new(py, (self.func)(m))
+    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> {
+        PyString::new_bound(py, (self.func)(m))
     }
 }
 
@@ -60,11 +61,11 @@ impl DictionaryAndSurface {
 }
 
 impl MorphemeProjection for DictionaryAndSurface {
-    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString {
+    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> {
         if self.matcher.matches_id(m.part_of_speech_id()) {
-            PyString::new(py, m.surface().deref())
+            PyString::new_bound(py, m.surface().deref())
         } else {
-            PyString::new(py, m.dictionary_form())
+            PyString::new_bound(py, m.dictionary_form())
         }
     }
 }
@@ -81,11 +82,11 @@ impl NormalizedAndSurface {
 }
 
 impl MorphemeProjection for NormalizedAndSurface {
-    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString {
+    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> {
         if self.matcher.matches_id(m.part_of_speech_id()) {
-            PyString::new(py, m.surface().deref())
+            PyString::new_bound(py, m.surface().deref())
         } else {
-            PyString::new(py, m.normalized_form())
+            PyString::new_bound(py, m.normalized_form())
         }
     }
 }
@@ -102,11 +103,11 @@ impl NormalizedNouns {
 }
 
 impl MorphemeProjection for NormalizedNouns {
-    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString {
+    fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString> {
         if self.matcher.matches_id(m.part_of_speech_id()) {
-            PyString::new(py, m.normalized_form())
+            PyString::new_bound(py, m.normalized_form())
         } else {
-            PyString::new(py, m.surface().deref())
+            PyString::new_bound(py, m.surface().deref())
         }
     }
 }
@@ -164,7 +165,7 @@ pub(crate) fn resolve_projection(base: PyProjector, fallback: &PyProjector) -> P
 }
 
 pub(crate) fn parse_projection<D: DictionaryAccess>(
-    value: &PyString,
+    value: &Bound<PyString>,
     dict: &D,
 ) -> PyResult<(PyProjector, SurfaceProjection)> {
     value.to_str().and_then(|s| parse_projection_raw(s, dict))
@@ -189,7 +190,7 @@ pub(crate) fn parse_projection_raw<D: DictionaryAccess>(
 }
 
 pub(crate) fn parse_projection_opt<D: DictionaryAccess>(
-    value: Option<&PyString>,
+    value: Option<&Bound<PyString>>,
     dict: &D,
 ) -> PyResult<(PyProjector, SurfaceProjection)> {
     match value {
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 5f364380..cc8142e7 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -136,7 +136,7 @@ impl PyTokenizer {
         &'py mut self,
         py: Python<'py>,
         text: &'py str,
-        mode: Option<&PyAny>,
+        mode: Option<&Bound<'py, PyAny>>,
         logger: Option<PyObject>,
         out: Option<Bound<'py, PyMorphemeListWrapper>>,
     ) -> PyResult<Bound<PyMorphemeListWrapper>> {

From 73c8cd94e533a932a8ed94a08f523d915bdeabf5 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 3 Jun 2024 14:47:35 +0900
Subject: [PATCH 04/24] update dependencies

---
 Cargo.lock             | 338 ++++++++++++++++-------------------------
 python/Cargo.toml      |   2 +-
 sudachi-cli/Cargo.toml |   2 +-
 sudachi/Cargo.toml     |  10 +-
 4 files changed, 136 insertions(+), 216 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e9ad71bc..73ca27fa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -19,47 +19,48 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
 
 [[package]]
 name = "anstream"
-version = "0.6.13"
+version = "0.6.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb"
+checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
 dependencies = [
  "anstyle",
  "anstyle-parse",
  "anstyle-query",
  "anstyle-wincon",
  "colorchoice",
+ "is_terminal_polyfill",
  "utf8parse",
 ]
 
 [[package]]
 name = "anstyle"
-version = "1.0.6"
+version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc"
+checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
 
 [[package]]
 name = "anstyle-parse"
-version = "0.2.3"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"
+checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
 dependencies = [
  "utf8parse",
 ]
 
 [[package]]
 name = "anstyle-query"
-version = "1.0.2"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648"
+checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
 dependencies = [
  "windows-sys",
 ]
 
 [[package]]
 name = "anstyle-wincon"
-version = "3.0.2"
+version = "3.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7"
+checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
 dependencies = [
  "anstyle",
  "windows-sys",
@@ -92,12 +93,6 @@ version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
 
-[[package]]
-name = "bitflags"
-version = "1.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
-
 [[package]]
 name = "bitflags"
 version = "2.5.0"
@@ -106,9 +101,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
 
 [[package]]
 name = "bumpalo"
-version = "3.15.4"
+version = "3.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
 
 [[package]]
 name = "cast"
@@ -200,9 +195,9 @@ checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce"
 
 [[package]]
 name = "colorchoice"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
+checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
 
 [[package]]
 name = "criterion"
@@ -261,9 +256,9 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-utils"
-version = "0.8.19"
+version = "0.8.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
+checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
 
 [[package]]
 name = "crunchy"
@@ -301,9 +296,9 @@ dependencies = [
 
 [[package]]
 name = "either"
-version = "1.10.0"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
+checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
 
 [[package]]
 name = "equivalent"
@@ -313,9 +308,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 
 [[package]]
 name = "errno"
-version = "0.3.8"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
 dependencies = [
  "libc",
  "windows-sys",
@@ -334,15 +329,15 @@ dependencies = [
 
 [[package]]
 name = "fastrand"
-version = "2.0.2"
+version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984"
+checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
 
 [[package]]
 name = "half"
-version = "2.4.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
 dependencies = [
  "cfg-if",
  "crunchy",
@@ -350,9 +345,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.14.3"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 
 [[package]]
 name = "heck"
@@ -374,13 +369,13 @@ checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
 
 [[package]]
 name = "honggfuzz"
-version = "0.5.55"
+version = "0.5.56"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "848e9c511092e0daa0a35a63e8e6e475a3e8f870741448b9f6028d69b142f18e"
+checksum = "7c76b6234c13c9ea73946d1379d33186151148e0da231506b964b44f3d023505"
 dependencies = [
  "arbitrary",
  "lazy_static",
- "memmap2 0.5.10",
+ "memmap2",
  "rustc_version",
 ]
 
@@ -411,6 +406,12 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -422,9 +423,9 @@ dependencies = [
 
 [[package]]
 name = "itertools"
-version = "0.12.1"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
 dependencies = [
  "either",
 ]
@@ -466,9 +467,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
 [[package]]
 name = "libc"
-version = "0.2.153"
+version = "0.2.155"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
+checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
 
 [[package]]
 name = "libloading"
@@ -477,14 +478,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.4",
+ "windows-targets",
 ]
 
 [[package]]
 name = "linux-raw-sys"
-version = "0.4.13"
+version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
 
 [[package]]
 name = "lock_api"
@@ -503,18 +504,9 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
 
 [[package]]
 name = "memchr"
-version = "2.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149"
-
-[[package]]
-name = "memmap2"
-version = "0.5.10"
+version = "2.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
-dependencies = [
- "libc",
-]
+checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
 
 [[package]]
 name = "memmap2"
@@ -552,9 +544,9 @@ dependencies = [
 
 [[package]]
 name = "num-traits"
-version = "0.2.18"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
  "autocfg",
 ]
@@ -573,9 +565,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
 
 [[package]]
 name = "parking_lot"
-version = "0.12.1"
+version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
+checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
 dependencies = [
  "lock_api",
  "parking_lot_core",
@@ -583,22 +575,22 @@ dependencies = [
 
 [[package]]
 name = "parking_lot_core"
-version = "0.9.9"
+version = "0.9.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
+checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
 dependencies = [
  "cfg-if",
  "libc",
  "redox_syscall",
  "smallvec",
- "windows-targets 0.48.5",
+ "windows-targets",
 ]
 
 [[package]]
 name = "plotters"
-version = "0.3.5"
+version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45"
+checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3"
 dependencies = [
  "num-traits",
  "plotters-backend",
@@ -609,15 +601,15 @@ dependencies = [
 
 [[package]]
 name = "plotters-backend"
-version = "0.3.5"
+version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609"
+checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7"
 
 [[package]]
 name = "plotters-svg"
-version = "0.3.5"
+version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab"
+checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705"
 dependencies = [
  "plotters-backend",
 ]
@@ -630,18 +622,18 @@ checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.79"
+version = "1.0.85"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
+checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "pyo3"
-version = "0.20.3"
+version = "0.21.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233"
+checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8"
 dependencies = [
  "cfg-if",
  "indoc",
@@ -657,9 +649,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-build-config"
-version = "0.20.3"
+version = "0.21.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7"
+checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50"
 dependencies = [
  "once_cell",
  "target-lexicon",
@@ -667,9 +659,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-ffi"
-version = "0.20.3"
+version = "0.21.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa"
+checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403"
 dependencies = [
  "libc",
  "pyo3-build-config",
@@ -677,9 +669,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-macros"
-version = "0.20.3"
+version = "0.21.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158"
+checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c"
 dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
@@ -689,9 +681,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-macros-backend"
-version = "0.20.3"
+version = "0.21.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185"
+checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c"
 dependencies = [
  "heck 0.4.1",
  "proc-macro2",
@@ -702,9 +694,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.35"
+version = "1.0.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
 dependencies = [
  "proc-macro2",
 ]
@@ -731,11 +723,11 @@ dependencies = [
 
 [[package]]
 name = "redox_syscall"
-version = "0.4.1"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
+checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags",
 ]
 
 [[package]]
@@ -778,11 +770,11 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.32"
+version = "0.38.34"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89"
+checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
 dependencies = [
- "bitflags 2.5.0",
+ "bitflags",
  "errno",
  "libc",
  "linux-raw-sys",
@@ -791,9 +783,9 @@ dependencies = [
 
 [[package]]
 name = "ryu"
-version = "1.0.17"
+version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 
 [[package]]
 name = "same-file"
@@ -812,24 +804,24 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
 [[package]]
 name = "semver"
-version = "1.0.22"
+version = "1.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca"
+checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
 
 [[package]]
 name = "serde"
-version = "1.0.197"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2"
+checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.197"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
+checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -838,9 +830,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.115"
+version = "1.0.117"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd"
+checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3"
 dependencies = [
  "itoa",
  "ryu",
@@ -871,18 +863,18 @@ name = "sudachi"
 version = "0.6.9-a1"
 dependencies = [
  "aho-corasick",
- "bitflags 2.5.0",
+ "bitflags",
  "claim",
  "csv",
  "default_input_text",
  "fancy-regex",
  "indexmap",
- "itertools 0.12.1",
+ "itertools 0.13.0",
  "join_katakana_oov",
  "join_numeric",
  "lazy_static",
  "libloading",
- "memmap2 0.9.4",
+ "memmap2",
  "nom",
  "regex",
  "serde",
@@ -900,7 +892,7 @@ version = "0.6.9-a1"
 dependencies = [
  "cfg-if",
  "clap",
- "memmap2 0.9.4",
+ "memmap2",
  "sudachi",
 ]
 
@@ -926,9 +918,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.55"
+version = "2.0.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0"
+checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -955,18 +947,18 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.58"
+version = "1.0.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297"
+checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.58"
+version = "1.0.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7"
+checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1109,159 +1101,87 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
 [[package]]
 name = "winapi-util"
-version = "0.1.6"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
+checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b"
 dependencies = [
- "winapi",
+ "windows-sys",
 ]
 
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-
 [[package]]
 name = "windows-sys"
 version = "0.52.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
 dependencies = [
- "windows-targets 0.52.4",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
-dependencies = [
- "windows_aarch64_gnullvm 0.48.5",
- "windows_aarch64_msvc 0.48.5",
- "windows_i686_gnu 0.48.5",
- "windows_i686_msvc 0.48.5",
- "windows_x86_64_gnu 0.48.5",
- "windows_x86_64_gnullvm 0.48.5",
- "windows_x86_64_msvc 0.48.5",
+ "windows-targets",
 ]
 
 [[package]]
 name = "windows-targets"
-version = "0.52.4"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
+checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
 dependencies = [
- "windows_aarch64_gnullvm 0.52.4",
- "windows_aarch64_msvc 0.52.4",
- "windows_i686_gnu 0.52.4",
- "windows_i686_msvc 0.52.4",
- "windows_x86_64_gnu 0.52.4",
- "windows_x86_64_gnullvm 0.52.4",
- "windows_x86_64_msvc 0.52.4",
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
 ]
 
 [[package]]
 name = "windows_aarch64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.52.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.48.5"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
+checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
 
 [[package]]
 name = "windows_aarch64_msvc"
-version = "0.52.4"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
+checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
 
 [[package]]
 name = "windows_i686_gnu"
-version = "0.48.5"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
+checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
 
 [[package]]
-name = "windows_i686_gnu"
-version = "0.52.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.48.5"
+name = "windows_i686_gnullvm"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
+checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
 
 [[package]]
 name = "windows_i686_msvc"
-version = "0.52.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.48.5"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
+checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
 
 [[package]]
 name = "windows_x86_64_gnu"
-version = "0.52.4"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
+checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.52.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.48.5"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
 
 [[package]]
 name = "windows_x86_64_msvc"
-version = "0.52.4"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
+checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
 
 [[package]]
 name = "yada"
diff --git a/python/Cargo.toml b/python/Cargo.toml
index 6e564c2e..53cd97e5 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -16,8 +16,8 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.21", features = ["extension-module"] }
-thread_local = "1.1" # Apache 2.0/MIT
 scopeguard = "1" # Apache 2.0/MIT
+thread_local = "1.1" # Apache 2.0/MIT
 
 [dependencies.sudachi]
 path = "../sudachi"
diff --git a/sudachi-cli/Cargo.toml b/sudachi-cli/Cargo.toml
index c5070424..14aeebb5 100644
--- a/sudachi-cli/Cargo.toml
+++ b/sudachi-cli/Cargo.toml
@@ -14,8 +14,8 @@ license.workspace = true
 sudachi = { path = "../sudachi" }
 
 cfg-if = "1.0.0" # MIT/Apache 2.0
-memmap2 = "0.9" # MIT/Apache 2.0
 clap = { version = "4.5", features = ["derive"] } # MIT/Apache 2.0
+memmap2 = "0.9" # MIT/Apache 2.0
 
 [[bin]]
 name = "sudachi"
diff --git a/sudachi/Cargo.toml b/sudachi/Cargo.toml
index 76b4cfe4..76e5f72c 100644
--- a/sudachi/Cargo.toml
+++ b/sudachi/Cargo.toml
@@ -12,15 +12,15 @@ license.workspace = true
 
 [dependencies] # this should be sorted
 aho-corasick = "1" # MIT/Apache 2.0
-bitflags = "2.0" # MIT/Apache 2.0
-csv = "1.1" # Unilicense/MIT
+bitflags = "2.5" # MIT/Apache 2.0
+csv = "1.3" # Unilicense/MIT
 fancy-regex = "0.13" # MIT
-indexmap = "2.0" # MIT/Apache 2.0
-itertools = "0.12" # MIT/Apachie 2.0
+indexmap = "2.2" # MIT/Apache 2.0
+itertools = "0.13" # MIT/Apachie 2.0
 lazy_static = "1.4" # MIT/Apache 2.0
 libloading = "0.8" # ISC (MIT-compatible)
-nom = "7" # MIT
 memmap2 = "0.9" # MIT/Apache 2.0
+nom = "7" # MIT
 regex = "1" # MIT/Apache 2.0
 serde = { version = "1.0", features = ["derive"] } # MIT/Apache 2.0
 serde_json = "1.0" # MIT/Apache 2.0

From 4345772882bd4a87511ac136e8906b38c77581c1 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Fri, 7 Jun 2024 09:09:16 +0900
Subject: [PATCH 05/24] use pyo3::intern macro inside pretokenizer

---
 python/src/dictionary.rs   | 10 +++++++---
 python/src/pretokenizer.rs |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index 251267ab..e9cbf1ed 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -461,20 +461,24 @@ fn read_config(config_opt: &Bound<PyAny>) -> PyResult<ConfigBuilder> {
 }
 
 pub(crate) fn read_default_config(py: Python) -> PyResult<ConfigBuilder> {
-    let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?;
+    let path = py
+        .import_bound("sudachipy")?
+        .getattr("_DEFAULT_SETTINGFILE")?;
     let path = path.downcast::<PyString>()?.to_str()?;
     let path = PathBuf::from(path);
     wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path)
 }
 
 pub(crate) fn get_default_resource_dir(py: Python) -> PyResult<PathBuf> {
-    let path = PyModule::import_bound(py, "sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?;
+    let path = py
+        .import_bound("sudachipy")?
+        .getattr("_DEFAULT_RESOURCEDIR")?;
     let path = path.downcast::<PyString>()?.to_str()?;
     Ok(PathBuf::from(path))
 }
 
 fn find_dict_path(py: Python, dict_type: &str) -> PyResult<PathBuf> {
-    let pyfunc = PyModule::import_bound(py, "sudachipy")?.getattr("_find_dict_path")?;
+    let pyfunc = py.import_bound("sudachipy")?.getattr("_find_dict_path")?;
     let path = pyfunc.call1((dict_type,))?;
     let path = path.downcast::<PyString>()?.to_str()?;
     Ok(PathBuf::from(path))
diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs
index cd15b1b3..20e5cf65 100644
--- a/python/src/pretokenizer.rs
+++ b/python/src/pretokenizer.rs
@@ -163,7 +163,7 @@ impl PyPretokenizer {
         py: Python<'py>,
         data: &Bound<'py, PyAny>,
     ) -> PyResult<Bound<'py, PyAny>> {
-        data.call_method1("split", PyTuple::new_bound(py, [self_]))
+        data.call_method1(intern!(py, "split"), PyTuple::new_bound(py, [self_]))
     }
 }
 
@@ -190,7 +190,7 @@ fn make_result_for_projection<'py>(
 ) -> PyResult<Bound<'py, PyList>> {
     let result = PyList::empty_bound(py);
     let nstring = {
-        static NORMALIZED_STRING: GILOnceCell<Py<PyType>> = pyo3::sync::GILOnceCell::new();
+        static NORMALIZED_STRING: GILOnceCell<Py<PyType>> = GILOnceCell::new();
         NORMALIZED_STRING.get_or_try_init(py, || -> PyResult<Py<PyType>> {
             let ns = py.import_bound("tokenizers")?.getattr("NormalizedString")?;
             let tpe = ns.downcast::<PyType>()?;

From 8baaa7abc53c49d7b475256436935f2e30fe3a4c Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Fri, 5 Jul 2024 17:24:57 +0900
Subject: [PATCH 06/24] add missing docstrings

---
 python/py_src/sudachipy/errors.py | 6 ++++--
 python/src/build.rs               | 4 +++-
 python/src/lib.rs                 | 6 ++++--
 python/src/morpheme.rs            | 6 ++++--
 python/src/pos_matcher.rs         | 6 +++++-
 python/src/pretokenizer.rs        | 7 ++++---
 python/src/tokenizer.rs           | 7 ++++---
 7 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/python/py_src/sudachipy/errors.py b/python/py_src/sudachipy/errors.py
index e75e21cd..c11a8205 100644
--- a/python/py_src/sudachipy/errors.py
+++ b/python/py_src/sudachipy/errors.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2023 Works Applications Co., Ltd.
+#   Copyright (c) 2023-2024 Works Applications Co., Ltd.
 #
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use this file except in compliance with the License.
@@ -13,4 +13,6 @@
 #   limitations under the License.
 
 class SudachiError(Exception):
-    pass
\ No newline at end of file
+    """Base class for all Sudachipy exceptions.
+    """
+    pass
diff --git a/python/src/build.rs b/python/src/build.rs
index a6005b26..59eb50c9 100644
--- a/python/src/build.rs
+++ b/python/src/build.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -58,6 +58,7 @@ fn create_file(p: &Path) -> std::io::Result<File> {
     OpenOptions::new().create_new(true).write(true).open(p)
 }
 
+/// Build system dictionary from matrix and lexicons.
 #[pyfunction]
 #[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")]
 fn build_system_dic<'p>(
@@ -87,6 +88,7 @@ fn build_system_dic<'p>(
     to_stats(py, builder)
 }
 
+/// Build user dictionary from lexicons based on the given system dictionary.
 #[pyfunction]
 #[pyo3(text_signature = "(system, lex, output, description=None) -> list")]
 fn build_user_dic<'p>(
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 68a9c91d..4887a737 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -26,7 +26,9 @@ mod projection;
 mod tokenizer;
 mod word_info;
 
-/// module root
+/// SudachiPy raw module root.
+///
+/// Users should not use this directly.
 #[pymodule]
 fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<dictionary::PyDictionary>()?;
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index ad3929dd..47e020ee 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -86,6 +86,7 @@ impl PyMorphemeListWrapper {
         }
     }
 }
+
 #[pymethods]
 impl PyMorphemeListWrapper {
     /// Returns an empty morpheme list with dictionary
@@ -197,7 +198,7 @@ impl PyMorphemeListWrapper {
     }
 }
 
-/// A morpheme (basic semantic unit of language).
+/// An iterator over the MorphemeList.
 #[pyclass(module = "sudachipy.morphemelist", name = "MorphemeIter")]
 pub struct PyMorphemeIter {
     list: Py<PyMorphemeListWrapper>,
@@ -241,6 +242,7 @@ impl<'py> Deref for MorphemeRef<'py> {
     }
 }
 
+/// A morpheme (basic semantic unit of language).
 #[pyclass(module = "sudachipy.morpheme", name = "Morpheme", frozen)]
 pub struct PyMorpheme {
     list: Py<PyMorphemeListWrapper>,
diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs
index 7c6a884d..a849edf5 100644
--- a/python/src/pos_matcher.rs
+++ b/python/src/pos_matcher.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -26,6 +26,9 @@ use sudachi::pos::PosMatcher;
 use crate::dictionary::PyDicData;
 use crate::morpheme::PyMorpheme;
 
+/// A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech.
+///
+/// Create using Dictionary.pos_matcher method.
 #[pyclass(name = "PosMatcher", module = "sudachipy")]
 pub struct PyPosMatcher {
     matcher: PosMatcher,
@@ -189,6 +192,7 @@ impl PyPosMatcher {
     }
 }
 
+/// An iterator over POS tuples in the PosPatcher
 #[pyclass(name = "PosMatcherIterator", module = "sudachipy")]
 pub struct PyPosIter {
     data: Vec<u16>,
diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs
index 755f040b..385c6dcb 100644
--- a/python/src/pretokenizer.rs
+++ b/python/src/pretokenizer.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -76,9 +76,10 @@ impl PerThreadPreTokenizer {
     }
 }
 
-/// Binding for the Tokenizer, which handles threading for tokenization
+/// Binding for the Tokenizer, which handles threading for tokenization.
 ///
-/// We use ThreadLocal for storing actual tokenizers
+/// Create using Dictionary.pre_tokenizer method.
+/// We use ThreadLocal for storing actual tokenizers.
 #[pyclass(module = "sudachipy.pretokenizer", name = "SudachiPreTokenizer")]
 pub struct PyPretokenizer {
     dict: Arc<PyDicData>,
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 558d02cb..a53ce166 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -36,7 +36,6 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 /// B == middle mode
 ///
 /// C == long mode
-//
 #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)]
 #[derive(Clone, PartialEq, Eq, Copy, Debug)]
 #[repr(u8)]
@@ -68,6 +67,7 @@ impl From<Mode> for PySplitMode {
 
 #[pymethods]
 impl PySplitMode {
+    /// Parse SplitMode from a character.
     #[new]
     fn new(mode: Option<&str>) -> PyResult<PySplitMode> {
         let mode = match mode {
@@ -82,7 +82,7 @@ impl PySplitMode {
     }
 }
 
-/// Sudachi Tokenizer, Python version
+/// Sudachi Tokenizer
 #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")]
 pub(crate) struct PyTokenizer {
     tokenizer: StatefulTokenizer<Arc<PyDicData>>,
@@ -182,6 +182,7 @@ impl PyTokenizer {
         Ok(out_list)
     }
 
+    /// SplitMode of the tokenizer.
     #[getter]
     fn mode(&self) -> PySplitMode {
         self.tokenizer.mode().into()

From 8b597e341c3b9c2b9340c08d6507bfa75e041ab8 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Fri, 5 Jul 2024 17:52:19 +0900
Subject: [PATCH 07/24] copy docstring from new to class

---
 python/src/dictionary.rs | 22 ++++++++++++++++------
 python/src/tokenizer.rs  | 11 ++++++++---
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index bc333c8e..1bada310 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021-2023 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -78,7 +78,17 @@ impl PyDicData {
     }
 }
 
-/// A sudachi dictionary
+/// A sudachi dictionary.
+///
+/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
+/// If both config.systemDict and dict_type are given, dict_type is used.
+/// If dict is an absolute path to a file, it is used as a dictionary.
+///
+/// :param config_path: path to the configuration JSON file.
+/// :param resource_dir: path to the resource directory folder.
+/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
+///     Also, can be an _absolute_ path to a compiled dictionary file.
+/// :param dict_type: deprecated alias to dict.
 #[pyclass(module = "sudachipy.dictionary", name = "Dictionary")]
 #[derive(Clone)]
 pub struct PyDictionary {
@@ -92,13 +102,13 @@ impl PyDictionary {
     ///
     /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
     /// If both config.systemDict and dict_type are given, dict_type is used.
-    /// If dict is an absolute path to a file, it is used as a dictionary
+    /// If dict is an absolute path to a file, it is used as a dictionary.
     ///
-    /// :param config_path: path to the configuration JSON file
-    /// :param resource_dir: path to the resource directory folder
+    /// :param config_path: path to the configuration JSON file.
+    /// :param resource_dir: path to the resource directory folder.
     /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
     ///     Also, can be an _absolute_ path to a compiled dictionary file.
-    /// :param dict_type: deprecated alias to dict
+    /// :param dict_type: deprecated alias to dict.
     #[new]
     #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))]
     fn new(
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index a53ce166..fe3b66d3 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -29,13 +29,13 @@ use crate::dictionary::{extract_mode, PyDicData};
 use crate::errors::SudachiError as SudachiPyErr;
 use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 
-/// Unit to split text
+/// Unit to split text.
 ///
 /// A == short mode
-///
 /// B == middle mode
-///
 /// C == long mode
+///
+/// :param mode: str to parse. One of [A,B,C] in captital or lower case.
 #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)]
 #[derive(Clone, PartialEq, Eq, Copy, Debug)]
 #[repr(u8)]
@@ -68,7 +68,10 @@ impl From<Mode> for PySplitMode {
 #[pymethods]
 impl PySplitMode {
     /// Parse SplitMode from a character.
+    ///
+    /// :param mode: str to parse. One of [A,B,C] in captital or lower case.
     #[new]
+    #[pyo3(signature=(mode=None, *))]
     fn new(mode: Option<&str>) -> PyResult<PySplitMode> {
         let mode = match mode {
             Some(m) => m,
@@ -83,6 +86,8 @@ impl PySplitMode {
 }
 
 /// Sudachi Tokenizer
+///
+/// Create using Dictionary.create method.
 #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")]
 pub(crate) struct PyTokenizer {
     tokenizer: StatefulTokenizer<Arc<PyDicData>>,

From c1d37c7f0aab64bd64144664537fa46512aac6c5 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 11:04:08 +0900
Subject: [PATCH 08/24] update text_signature

---
 python/src/build.rs      |  4 ++--
 python/src/dictionary.rs | 21 ++++++++++++---------
 python/src/morpheme.rs   | 36 ++++++++++++++++++------------------
 python/src/tokenizer.rs  |  9 ++++++---
 4 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/python/src/build.rs b/python/src/build.rs
index 59eb50c9..350f2fb3 100644
--- a/python/src/build.rs
+++ b/python/src/build.rs
@@ -60,7 +60,7 @@ fn create_file(p: &Path) -> std::io::Result<File> {
 
 /// Build system dictionary from matrix and lexicons.
 #[pyfunction]
-#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")]
+#[pyo3(text_signature="(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
 fn build_system_dic<'p>(
     py: Python<'p>,
     matrix: &'p PyAny,
@@ -90,7 +90,7 @@ fn build_system_dic<'p>(
 
 /// Build user dictionary from lexicons based on the given system dictionary.
 #[pyfunction]
-#[pyo3(text_signature = "(system, lex, output, description=None) -> list")]
+#[pyo3(text_signature="(system, lex, output, description=None) -> list[tuple[str, int, float]]")]
 fn build_user_dic<'p>(
     py: Python<'p>,
     system: &'p PyAny,
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index 1bada310..e208492f 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -110,7 +110,10 @@ impl PyDictionary {
     ///     Also, can be an _absolute_ path to a compiled dictionary file.
     /// :param dict_type: deprecated alias to dict.
     #[new]
-    #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))]
+    #[pyo3(
+        text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary",
+        signature=(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None)
+    )]
     fn new(
         py: Python,
         config_path: Option<&PyAny>,
@@ -230,8 +233,8 @@ impl PyDictionary {
     /// :param fields: load only a subset of fields.
     ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
     #[pyo3(
-        text_signature = "($self, mode = 'C') -> sudachipy.Tokenizer",
-        signature = (mode = None, fields = None, *, projection = None)
+        text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer",
+        signature=(mode=None, fields=None, *, projection=None)
     )]
     fn create<'py>(
         &'py self,
@@ -272,7 +275,7 @@ impl PyDictionary {
     /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form.
     ///
     /// :param target: can be either a callable or list of POS partial tuples
-    #[pyo3(text_signature = "($self, target)")]
+    #[pyo3(text_signature="(self, /, target) -> PosMatcher")]
     fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> {
         PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target)
     }
@@ -293,8 +296,8 @@ impl PyDictionary {
     /// :type mode: sudachipy.SplitMode
     /// :type fields: Set[str]
     #[pyo3(
-        text_signature = "($self, mode, fields, handler) -> tokenizers.PreTokenizer",
-        signature = (mode = None, fields = None, handler = None, *, projection = None)
+        text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer",
+        signature=(mode=None, fields=None, handler=None, *, projection=None)
     )]
     fn pre_tokenizer<'p>(
         &'p self,
@@ -349,7 +352,7 @@ impl PyDictionary {
     ///     See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
     /// :type surface: str
     /// :type out: sudachipy.MorphemeList
-    #[pyo3(text_signature = "($self, surface, out = None) -> sudachipy.MorphemeList")]
+    #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")]
     fn lookup<'p>(
         &'p self,
         py: Python<'p>,
@@ -377,13 +380,13 @@ impl PyDictionary {
     }
 
     /// Close this dictionary
-    #[pyo3(text_signature = "($self)")]
+    #[pyo3(text_signature="(self, /) -> ()")]
     fn close(&mut self) {
         self.dictionary = None;
     }
 
     /// Get POS Tuple by its id
-    #[pyo3(text_signature = "($self, pos_id: int)")]
+    #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")]
     fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> {
         let dic = self.dictionary.as_ref().unwrap();
         dic.pos.get(pos_id).map(|x| x.as_ref(py))
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index 47e020ee..f1aa204d 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -91,7 +91,7 @@ impl PyMorphemeListWrapper {
 impl PyMorphemeListWrapper {
     /// Returns an empty morpheme list with dictionary
     #[classmethod]
-    #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")]
+    #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")]
     fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> {
         let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
         PyErr::warn(
@@ -110,13 +110,13 @@ impl PyMorphemeListWrapper {
     }
 
     /// Returns the total cost of the path
-    #[pyo3(text_signature = "($self) -> int")]
+    #[pyo3(text_signature="(self, /) -> int")]
     fn get_internal_cost(&self, py: Python) -> i32 {
         self.internal(py).get_internal_cost()
     }
 
     /// Returns the number of morpheme in this list.
-    #[pyo3(text_signature = "($self) -> int")]
+    #[pyo3(text_signature="(self, /) -> int")]
     fn size(&self, py: Python) -> usize {
         self.internal(py).len()
     }
@@ -279,21 +279,21 @@ impl PyMorpheme {
 #[pymethods]
 impl PyMorpheme {
     /// Returns the begin index of this in the input text
-    #[pyo3(text_signature = "($self) -> int")]
+    #[pyo3(text_signature="(self, /) -> int")]
     fn begin(&self, py: Python) -> usize {
         // call codepoint version
         self.morph(py).begin_c()
     }
 
     /// Returns the end index of this in the input text
-    #[pyo3(text_signature = "($self) -> int")]
+    #[pyo3(text_signature="(self, /) -> int")]
     fn end(&self, py: Python) -> usize {
         // call codepoint version
         self.morph(py).end_c()
     }
 
     /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured
-    #[pyo3(text_signature = "($self) -> str")]
+    #[pyo3(text_signature="(self, /) -> str")]
     fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
         let list = self.list(py);
         let morph = self.morph(py);
@@ -304,14 +304,14 @@ impl PyMorpheme {
     }
 
     /// Returns the substring of input text corresponding to the morpheme regardless the configured projection
-    #[pyo3(text_signature = "($self) -> str")]
+    #[pyo3(text_signature="(self, /) -> str")]
     fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
         PyString::new(py, self.morph(py).surface().deref())
     }
 
     /// Returns the part of speech as a six-element tuple.
     /// Tuple elements are four POS levels, conjugation type and conjugation form.    
-    #[pyo3(text_signature = "($self)")]
+    #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")]
     fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py<PyTuple> {
         let pos_id = self.part_of_speech_id(py);
         self.list(py)
@@ -322,25 +322,25 @@ impl PyMorpheme {
     }
 
     /// Returns the id of the part of speech in the dictionary
-    #[pyo3(text_signature = "($self) -> int")]
+    #[pyo3(text_signature="(self, /) -> int")]
     pub fn part_of_speech_id(&self, py: Python) -> u16 {
         self.morph(py).part_of_speech_id()
     }
 
     /// Returns the dictionary form
-    #[pyo3(text_signature = "($self) -> str")]
+    #[pyo3(text_signature="(self, /) -> str")]
     fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().dictionary_form().into_py(py)
     }
 
     /// Returns the normalized form
-    #[pyo3(text_signature = "($self) -> str")]
+    #[pyo3(text_signature="(self, /) -> str")]
     fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().normalized_form().into_py(py)
     }
 
     /// Returns the reading form
-    #[pyo3(text_signature = "($self) -> str")]
+    #[pyo3(text_signature="(self, /) -> str")]
     fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().reading_form().into_py(py)
     }
@@ -358,7 +358,7 @@ impl PyMorpheme {
     /// :type out: Optional[sudachipy.MorphemeList]
     /// :type add_single: bool
     #[pyo3(
-        text_signature = "($self, mode, out = None, add_single = False) -> sudachipy.MorphemeList"
+        text_signature="(self, /, mode, out=None, add_single=False) -> MorphemeList"
     )]
     fn split<'py>(
         &'py self,
@@ -402,19 +402,19 @@ impl PyMorpheme {
     }
 
     /// Returns whether if this is out of vocabulary word
-    #[pyo3(text_signature = "($self) -> bool")]
+    #[pyo3(text_signature="(self, /) -> bool")]
     fn is_oov(&self, py: Python) -> bool {
         self.morph(py).is_oov()
     }
 
     /// Returns word id of this word in the dictionary
-    #[pyo3(text_signature = "($self) -> int")]
+    #[pyo3(text_signature="(self, /) -> int")]
     fn word_id(&self, py: Python) -> u32 {
         self.morph(py).word_id().as_raw()
     }
 
     /// Returns the dictionary id which this word belongs
-    #[pyo3(text_signature = "($self) -> int")]
+    #[pyo3(text_signature="(self, /) -> int")]
     fn dictionary_id(&self, py: Python) -> i32 {
         let word_id = self.morph(py).word_id();
         if word_id.is_oov() {
@@ -425,7 +425,7 @@ impl PyMorpheme {
     }
 
     /// Returns the list of synonym group ids
-    #[pyo3(text_signature = "($self) -> List[int]")]
+    #[pyo3(text_signature="(self, /) -> List[int]")]
     fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList {
         let mref = self.morph(py);
         let ids = mref.get_word_info().synonym_group_ids();
@@ -433,7 +433,7 @@ impl PyMorpheme {
     }
 
     /// Returns the word info
-    #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")]
+    #[pyo3(text_signature="(self, /) -> WordInfo")]
     fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> {
         let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
         PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?;
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index fe3b66d3..16f2482a 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -71,7 +71,10 @@ impl PySplitMode {
     ///
     /// :param mode: str to parse. One of [A,B,C] in captital or lower case.
     #[new]
-    #[pyo3(signature=(mode=None, *))]
+    #[pyo3(
+        text_signature="(mode=None) -> SplitMode",
+        signature=(mode=None)
+    )]
     fn new(mode: Option<&str>) -> PyResult<PySplitMode> {
         let mode = match mode {
             Some(m) => m,
@@ -133,8 +136,8 @@ impl PyTokenizer {
     /// :type mode: sudachipy.SplitMode
     /// :type out: sudachipy.MorphemeList
     #[pyo3(
-        text_signature = "($self, text: str, mode = None, logger = None, out = None) -> sudachipy.MorphemeList",
-        signature = (text, mode = None, logger = None, out = None)
+        text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList",
+        signature=(text, mode=None, logger=None, out=None)
     )]
     #[allow(unused_variables)]
     fn tokenize<'py>(

From dfc87edf656348474fef8b6aa46e8548e4895c5b Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 11:05:04 +0900
Subject: [PATCH 09/24] add import of PosMatcher

---
 python/py_src/sudachipy/__init__.py | 1 +
 python/src/lib.rs                   | 1 +
 python/src/pos_matcher.rs           | 6 ++++--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/py_src/sudachipy/__init__.py b/python/py_src/sudachipy/__init__.py
index bdf67f40..fb551538 100644
--- a/python/py_src/sudachipy/__init__.py
+++ b/python/py_src/sudachipy/__init__.py
@@ -5,6 +5,7 @@
     MorphemeList,
     Morpheme,
     WordInfo,
+    PosMatcher,
 )
 from .config import Config
 from . import errors
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 4887a737..56a950c2 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -37,6 +37,7 @@ fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<morpheme::PyMorphemeListWrapper>()?;
     m.add_class::<morpheme::PyMorpheme>()?;
     m.add_class::<word_info::PyWordInfo>()?;
+    m.add_class::<pos_matcher::PyPosMatcher>()?;
     build::register_functions(m)?;
     Ok(())
 }
diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs
index a849edf5..586c7d90 100644
--- a/python/src/pos_matcher.rs
+++ b/python/src/pos_matcher.rs
@@ -29,7 +29,9 @@ use crate::morpheme::PyMorpheme;
 /// A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech.
 ///
 /// Create using Dictionary.pos_matcher method.
-#[pyclass(name = "PosMatcher", module = "sudachipy")]
+///
+/// Use `__call__(m: Morpheme) -> bool` to check if given morpheme matches the PosMatcher.
+#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcher")]
 pub struct PyPosMatcher {
     matcher: PosMatcher,
     dic: Arc<PyDicData>,
@@ -193,7 +195,7 @@ impl PyPosMatcher {
 }
 
 /// An iterator over POS tuples in the PosPatcher
-#[pyclass(name = "PosMatcherIterator", module = "sudachipy")]
+#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcherIterator")]
 pub struct PyPosIter {
     data: Vec<u16>,
     dic: Arc<PyDicData>,

From 8c35516a1f20fee8608401b1aea694063458c061 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 11:56:02 +0900
Subject: [PATCH 10/24] sync pyi and rs

---
 python/py_src/sudachipy/sudachipy.pyi | 104 ++++++++++++++++----------
 python/src/dictionary.rs              |  54 ++++++-------
 python/src/morpheme.rs                |  48 +++++++-----
 python/src/pos_matcher.rs             |  10 ++-
 python/src/tokenizer.rs               |  10 +--
 5 files changed, 136 insertions(+), 90 deletions(-)

diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi
index 16c416f6..705b62af 100644
--- a/python/py_src/sudachipy/sudachipy.pyi
+++ b/python/py_src/sudachipy/sudachipy.pyi
@@ -1,6 +1,20 @@
+#   Copyright (c) 2024 Works Applications Co., Ltd.
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
 from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set
 from .config import Config
 
+# Part Of Speech
 POS = Tuple[str, str, str, str, str, str]
 # POS element
 PE = Optional[str]
@@ -14,6 +28,8 @@ PartialPOS = Union[
     Tuple[()],
 ]
 
+# Fields that can be specified for partial dictionary loading.
+# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
 FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form",
                                 "word_structure", "split_a", "split_b", "synonym_group_id"]]]
 
@@ -23,9 +39,7 @@ class SplitMode:
     Unit to split text.
 
     A == short mode
-
     B == middle mode
-
     C == long mode
     """
 
@@ -36,8 +50,9 @@ class SplitMode:
     @classmethod
     def __init__(cls, mode: str = "C") -> None:
         """
-        Creates a split mode from a string value
-        :param mode: string representation of the split mode
+        Creates a split mode from a string value.
+
+        :param mode: string representation of the split mode. One of [A,B,C] in capital or lower case.
         """
         ...
 
@@ -54,14 +69,15 @@ class Dictionary:
         Creates a sudachi dictionary.
 
         If both config.systemDict and dict are not given, `sudachidict_core` is used.
-        If both config.systemDict and dict are given, dict_type is used.
+        If both config.systemDict and dict are given, dict is used.
+        If dict is an absolute path to a file, it is used as a dictionary.
 
-        :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object
-        :param config: alias to config_path, only one of them can be specified at the same time
-        :param resource_dir: path to the resource directory folder
+        :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
+        :param config: alias to config_path, only one of them can be specified at the same time.
+        :param resource_dir: path to the resource directory folder.
         :param dict: type of pre-packaged system dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
             Also, can be an _absolute_ path to a compiled dictionary file.
-        :param dict_type: deprecated alias to dict
+        :param dict_type: deprecated alias to dict.
         """
         ...
 
@@ -77,11 +93,11 @@ class Dictionary:
                *,
                projection: str = None) -> Tokenizer:
         """
-        Creates a Sudachi Tokenizer.
+        Creates a sudachi tokenizer.
 
         :param mode: sets the analysis mode for this Tokenizer
         :param fields: load only a subset of fields.
-            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
+            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
         :param projection: Projection override for created Tokenizer. See Config.projection for values.
         """
         ...
@@ -91,21 +107,21 @@ class Dictionary:
         Creates a new POS matcher.
 
         If target is a function, then it must return whether a POS should match or not.
-        If target a list, it should contain partially specified POS.
-        By partially specified it means that it is possible to omit POS fields or
-        use None as a sentinel value that matches any POS.
+        If target is a list, it should contain partially specified POS.
+        By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS.
 
         For example, ('名詞',) will match any noun and
         (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form.
 
-        :param target: can be either a function or a list of POS tuples.
+        :param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
         """
         ...
 
     def pre_tokenizer(self,
                       mode: Union[SplitMode, Literal["A", "B", "C"]] = "C",
                       fields: FieldSet = None,
-                      handler: Optional[Callable[[int, object, MorphemeList], list]] = None,
+                      handler: Optional[Callable[[
+                          int, object, MorphemeList], list]] = None,
                       *,
                       projection: str = None) -> object:
         """
@@ -113,10 +129,10 @@ class Dictionary:
         Requires package `tokenizers` to be installed.
 
         :param mode: Use this split mode (C by default)
-        :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
-        :param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
-            First two parameters are the index (int) and HuggingFace NormalizedString.
-            The handler must return a List[NormalizedString]. By default, just segment the tokens.
+        :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+        :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
+            It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
+            See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
         :param projection: Projection override for created Tokenizer. See Config.projection for values.
         """
         ...
@@ -126,7 +142,7 @@ class Dictionary:
         Returns POS with the given id.
 
         :param pos_id: POS id
-        :return: POS tuple with the given id.
+        :return: POS tuple with the given id or None for non-existing id.
         """
         ...
 
@@ -197,7 +213,8 @@ class Morpheme:
 
     def part_of_speech(self) -> POS:
         """
-        Returns the part of speech.
+        Returns the part of speech as a six-element tuple.
+        Tuple elements are four POS levels, conjugation type and conjugation form.
         """
         ...
 
@@ -217,8 +234,8 @@ class Morpheme:
         """
         Returns sub-morphemes in the provided split mode.
 
-        :param mode: mode of new split
-        :param out: write results to this MorhpemeList instead of creating new one
+        :param mode: mode of new split.
+        :param out: write results to this MorphemeList instead of creating new one.
             See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
             more information on output parameters.
             Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
@@ -230,6 +247,7 @@ class Morpheme:
     def surface(self) -> str:
         """
         Returns the substring of input text corresponding to the morpheme, or a projection if one is configured.
+
         See `Config.projection`.
         """
         ...
@@ -237,6 +255,7 @@ class Morpheme:
     def raw_surface(self) -> str:
         """
         Returns the substring of input text corresponding to the morpheme regardless the configured projection.
+
         See `Config.projection`.
         """
         ...
@@ -255,7 +274,7 @@ class Morpheme:
 
     def __len__(self) -> int:
         """
-        Returns morpheme length in codepoints
+        Returns morpheme length in codepoints.
         """
 
 
@@ -293,6 +312,11 @@ class MorphemeList:
 
 
 class Tokenizer:
+    """
+    A sudachi tokenizer
+
+    Create using Dictionary.create method.
+    """
     SplitMode: ClassVar[SplitMode] = ...
     @classmethod
     def __init__(cls) -> None: ...
@@ -303,13 +327,12 @@ class Tokenizer:
         """
         Break text into morphemes.
 
-        SudachiPy 0.5.* had logger parameter, it is accepted, but ignored.
-
-        :param text: text to analyze
+        :param text: text to analyze.
         :param mode: analysis mode.
             This parameter is deprecated.
             Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes.
             If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead.
+        :param logger: Arg for v0.5.* compatibility. Ignored.
         :param out: tokenization results will be written into this MorphemeList, a new one will be created instead.
             See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
         """
@@ -342,41 +365,44 @@ class WordInfo:
 
 
 class PosMatcher:
+    """
+    A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech.
+
+    Create using Dictionary.pos_matcher method.
+    """
+
     def __iter__(self) -> Iterator[POS]: ...
     def __len__(self) -> int: ...
 
     def __call__(self, m: Morpheme) -> bool:
         """
-        Checks whether a morpheme has matching POS
-        :param m: morpheme
-        :return: if morpheme has matching POS
+        Checks whether a morpheme has matching POS.
+
+        :param m: morpheme.
+        :return: if morpheme has matching POS.
         """
         ...
 
     def __or__(self, other: PosMatcher) -> PosMatcher:
         """
-        Returns a POS matcher which matches a POS if any of two matchers would match it
-        :return: PosMatcher
+        Returns a POS matcher which matches a POS if any of two matchers would match it.
         """
         ...
 
     def __and__(self, other: PosMatcher) -> PosMatcher:
         """
-        Returns a POS matcher which matches a POS if both matchers would match it at the same time
-        :return: PosMatcher
+        Returns a POS matcher which matches a POS if both matchers would match it at the same time.
         """
         ...
 
     def __sub__(self, other: PosMatcher) -> PosMatcher:
         """
-        Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS
-        :return: PosMatcher
+        Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS.
         """
         ...
 
     def __invert__(self) -> PosMatcher:
         """
-        Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher
-        :return: PosMatcher
+        Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher.
         """
         ...
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index e208492f..5f1e8f65 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -80,11 +80,12 @@ impl PyDicData {
 
 /// A sudachi dictionary.
 ///
-/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
-/// If both config.systemDict and dict_type are given, dict_type is used.
+/// If both config.systemDict and dict are not given, `sudachidict_core` is used.
+/// If both config.systemDict and dict are given, dict is used.
 /// If dict is an absolute path to a file, it is used as a dictionary.
 ///
-/// :param config_path: path to the configuration JSON file.
+/// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
+/// :param config: alias to config_path, only one of them can be specified at the same time.
 /// :param resource_dir: path to the resource directory folder.
 /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
 ///     Also, can be an _absolute_ path to a compiled dictionary file.
@@ -100,11 +101,12 @@ pub struct PyDictionary {
 impl PyDictionary {
     /// Creates a sudachi dictionary.
     ///
-    /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
-    /// If both config.systemDict and dict_type are given, dict_type is used.
+    /// If both config.systemDict and dict are not given, `sudachidict_core` is used.
+    /// If both config.systemDict and dict are given, dict is used.
     /// If dict is an absolute path to a file, it is used as a dictionary.
     ///
-    /// :param config_path: path to the configuration JSON file.
+    /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
+    /// :param config: alias to config_path, only one of them can be specified at the same time.
     /// :param resource_dir: path to the resource directory folder.
     /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
     ///     Also, can be an _absolute_ path to a compiled dictionary file.
@@ -229,11 +231,12 @@ impl PyDictionary {
 
     /// Creates a sudachi tokenizer.
     ///
-    /// :param mode: tokenizer's default split mode (C by default).
+    /// :param mode: sets the analysis mode for this Tokenizer
     /// :param fields: load only a subset of fields.
-    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
+    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+    /// :param projection: Projection override for created Tokenizer. See Config.projection for values.
     #[pyo3(
-        text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer",
+        text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer",
         signature=(mode=None, fields=None, *, projection=None)
     )]
     fn create<'py>(
@@ -267,14 +270,13 @@ impl PyDictionary {
     /// Creates a POS matcher object
     ///
     /// If target is a function, then it must return whether a POS should match or not.
-    /// If target a list, it should contain partially specified POS.
-    /// By partially specified it means that it is possible to omit POS fields or
-    /// use None as a sentinel value that matches any POS.
+    /// If target is a list, it should contain partially specified POS.
+    /// By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS.
     ///
     /// For example, ('名詞',) will match any noun and
     /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form.
     ///
-    /// :param target: can be either a callable or list of POS partial tuples
+    /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
     #[pyo3(text_signature="(self, /, target) -> PosMatcher")]
     fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> {
         PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target)
@@ -285,15 +287,13 @@ impl PyDictionary {
     ///
     /// :param mode: Use this split mode (C by default)
     /// :param fields: ask Sudachi to load only a subset of fields.
-    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
-    /// :param handler: a custom callable to transform MorphemeList into list of tokens.
-    ///     It should be should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
-    ///     See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
-    ///     If nothing was passed, simply use surface as token representations.
-    /// :param projection: projection mode for a created PreTokenizer.
-    ///     See :class:`sudachipy.config.Config` object documentation for supported projections.
+    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+    /// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
+    ///     It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
+    ///     See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
+    /// :param projection: Projection override for created Tokenizer. See Config.projection for values.
     ///
-    /// :type mode: sudachipy.SplitMode
+    /// :type mode: SplitMode
     /// :type fields: Set[str]
     #[pyo3(
         text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer",
@@ -350,8 +350,9 @@ impl PyDictionary {
     /// :param surface: find all morphemes with the given surface
     /// :param out: if passed, reuse the given morpheme list instead of creating a new one.
     ///     See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
+    ///
     /// :type surface: str
-    /// :type out: sudachipy.MorphemeList
+    /// :type out: MorphemeList
     #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")]
     fn lookup<'p>(
         &'p self,
@@ -379,14 +380,17 @@ impl PyDictionary {
         Ok(l)
     }
 
-    /// Close this dictionary
+    /// Close this dictionary.
     #[pyo3(text_signature="(self, /) -> ()")]
     fn close(&mut self) {
         self.dictionary = None;
     }
 
-    /// Get POS Tuple by its id
-    #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")]
+    /// Returns POS with the given id.
+    ///
+    /// :param pos_id: POS id
+    /// :return: POS tuple with the given id or None for non-existing id.
+    #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
     fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> {
         let dic = self.dictionary.as_ref().unwrap();
         dic.pos.get(pos_id).map(|x| x.as_ref(py))
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index f1aa204d..0a18f6c4 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -31,7 +31,10 @@ use crate::word_info::PyWordInfo;
 pub(crate) type PyMorphemeList = MorphemeList<Arc<PyDicData>>;
 pub(crate) type PyProjector = Option<Arc<dyn MorphemeProjection + Send + Sync>>;
 
-/// A list of morphemes
+/// A list of morphemes.
+///
+/// An object cannot be instantiated manually.
+/// Use Tokenizer.tokenize("") to create an empty morpheme list.
 #[pyclass(module = "sudachipy.morphemelist", name = "MorphemeList")]
 pub struct PyMorphemeListWrapper {
     /// use `internal()` function instead
@@ -89,7 +92,7 @@ impl PyMorphemeListWrapper {
 
 #[pymethods]
 impl PyMorphemeListWrapper {
-    /// Returns an empty morpheme list with dictionary
+    /// Returns an empty morpheme list with dictionary.
     #[classmethod]
     #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")]
     fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> {
@@ -109,7 +112,7 @@ impl PyMorphemeListWrapper {
         })
     }
 
-    /// Returns the total cost of the path
+    /// Returns the total cost of the path.
     #[pyo3(text_signature="(self, /) -> int")]
     fn get_internal_cost(&self, py: Python) -> i32 {
         self.internal(py).get_internal_cost()
@@ -278,21 +281,23 @@ impl PyMorpheme {
 
 #[pymethods]
 impl PyMorpheme {
-    /// Returns the begin index of this in the input text
+    /// Returns the begin index of this in the input text.
     #[pyo3(text_signature="(self, /) -> int")]
     fn begin(&self, py: Python) -> usize {
         // call codepoint version
         self.morph(py).begin_c()
     }
 
-    /// Returns the end index of this in the input text
+    /// Returns the end index of this in the input text.
     #[pyo3(text_signature="(self, /) -> int")]
     fn end(&self, py: Python) -> usize {
         // call codepoint version
         self.morph(py).end_c()
     }
 
-    /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured
+    /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured.
+    ///
+    /// See `Config.projection`.
     #[pyo3(text_signature="(self, /) -> str")]
     fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
         let list = self.list(py);
@@ -303,14 +308,16 @@ impl PyMorpheme {
         }
     }
 
-    /// Returns the substring of input text corresponding to the morpheme regardless the configured projection
+    /// Returns the substring of input text corresponding to the morpheme regardless the configured projection.
+    ///
+    /// See `Config.projection`.
     #[pyo3(text_signature="(self, /) -> str")]
     fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
         PyString::new(py, self.morph(py).surface().deref())
     }
 
     /// Returns the part of speech as a six-element tuple.
-    /// Tuple elements are four POS levels, conjugation type and conjugation form.    
+    /// Tuple elements are four POS levels, conjugation type and conjugation form.
     #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")]
     fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py<PyTuple> {
         let pos_id = self.part_of_speech_id(py);
@@ -321,25 +328,25 @@ impl PyMorpheme {
             .clone_ref(py)
     }
 
-    /// Returns the id of the part of speech in the dictionary
+    /// Returns the id of the part of speech in the dictionary.
     #[pyo3(text_signature="(self, /) -> int")]
     pub fn part_of_speech_id(&self, py: Python) -> u16 {
         self.morph(py).part_of_speech_id()
     }
 
-    /// Returns the dictionary form
+    /// Returns the dictionary form.
     #[pyo3(text_signature="(self, /) -> str")]
     fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().dictionary_form().into_py(py)
     }
 
-    /// Returns the normalized form
+    /// Returns the normalized form.
     #[pyo3(text_signature="(self, /) -> str")]
     fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().normalized_form().into_py(py)
     }
 
-    /// Returns the reading form
+    /// Returns the reading form.
     #[pyo3(text_signature="(self, /) -> str")]
     fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().reading_form().into_py(py)
@@ -347,13 +354,14 @@ impl PyMorpheme {
 
     /// Returns sub-morphemes in the provided split mode.
     ///
-    /// :param mode: mode of new split
-    /// :param out: write results to this MorhpemeList instead of creating new one
+    /// :param mode: mode of new split.
+    /// :param out: write results to this MorphemeList instead of creating new one.
     ///     See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
     ///     more information on output parameters.
     ///     Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
     /// :param add_single: return lists with the current morpheme if the split hasn't produced any elements.
     ///     When False is passed, empty lists are returned instead.
+    ///
     /// :type mode: sudachipy.SplitMode    
     /// :type out: Optional[sudachipy.MorphemeList]
     /// :type add_single: bool
@@ -401,19 +409,19 @@ impl PyMorpheme {
         Ok(out_cell)
     }
 
-    /// Returns whether if this is out of vocabulary word
+    /// Returns whether if this is out of vocabulary word.
     #[pyo3(text_signature="(self, /) -> bool")]
     fn is_oov(&self, py: Python) -> bool {
         self.morph(py).is_oov()
     }
 
-    /// Returns word id of this word in the dictionary
+    /// Returns word id of this word in the dictionary.
     #[pyo3(text_signature="(self, /) -> int")]
     fn word_id(&self, py: Python) -> u32 {
         self.morph(py).word_id().as_raw()
     }
 
-    /// Returns the dictionary id which this word belongs
+    /// Returns the dictionary id which this word belongs.
     #[pyo3(text_signature="(self, /) -> int")]
     fn dictionary_id(&self, py: Python) -> i32 {
         let word_id = self.morph(py).word_id();
@@ -424,7 +432,7 @@ impl PyMorpheme {
         }
     }
 
-    /// Returns the list of synonym group ids
+    /// Returns the list of synonym group ids.
     #[pyo3(text_signature="(self, /) -> List[int]")]
     fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList {
         let mref = self.morph(py);
@@ -432,7 +440,7 @@ impl PyMorpheme {
         PyList::new(py, ids)
     }
 
-    /// Returns the word info
+    /// Returns the word info.
     #[pyo3(text_signature="(self, /) -> WordInfo")]
     fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> {
         let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
@@ -441,7 +449,7 @@ impl PyMorpheme {
         Ok(self.morph(py).get_word_info().clone().into())
     }
 
-    /// Returns morpheme length in codepoints    
+    /// Returns morpheme length in codepoints.
     pub fn __len__(&self, py: Python) -> usize {
         let m = self.morph(py);
         m.end_c() - m.begin_c()
diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs
index 586c7d90..16d1fa56 100644
--- a/python/src/pos_matcher.rs
+++ b/python/src/pos_matcher.rs
@@ -30,7 +30,7 @@ use crate::morpheme::PyMorpheme;
 ///
 /// Create using Dictionary.pos_matcher method.
 ///
-/// Use `__call__(m: Morpheme) -> bool` to check if given morpheme matches the PosMatcher.
+/// Use `__call__(m: Morpheme) -> bool` to check whether a morpheme has matching POS.
 #[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcher")]
 pub struct PyPosMatcher {
     matcher: PosMatcher,
@@ -123,6 +123,10 @@ impl PyPosMatcher {
 
 #[pymethods]
 impl PyPosMatcher {
+    /// Checks whether a morpheme has matching POS.
+    ///
+    /// :param m: morpheme.
+    /// :return: if morpheme has matching POS.
     pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool {
         let pos_id = m.part_of_speech_id(py);
         self.matcher.matches_id(pos_id)
@@ -140,6 +144,7 @@ impl PyPosMatcher {
         self.matcher.num_entries()
     }
 
+    /// Returns a POS matcher which matches a POS if any of two matchers would match it.
     pub fn __or__(&self, other: &Self) -> Self {
         assert_eq!(
             Arc::as_ptr(&self.dic),
@@ -153,6 +158,7 @@ impl PyPosMatcher {
         }
     }
 
+    /// Returns a POS matcher which matches a POS if both matchers would match it at the same time.
     pub fn __and__(&self, other: &Self) -> Self {
         assert_eq!(
             Arc::as_ptr(&self.dic),
@@ -166,6 +172,7 @@ impl PyPosMatcher {
         }
     }
 
+    /// Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS.
     pub fn __sub__(&self, other: &Self) -> Self {
         assert_eq!(
             Arc::as_ptr(&self.dic),
@@ -179,6 +186,7 @@ impl PyPosMatcher {
         }
     }
 
+    /// Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher.
     pub fn __invert__(&self) -> Self {
         let max_id = self.dic.pos.len();
         // map -> filter chain is needed to handle exactly u16::MAX POS entries
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 16f2482a..8c7c1c84 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -35,7 +35,7 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 /// B == middle mode
 /// C == long mode
 ///
-/// :param mode: str to parse. One of [A,B,C] in captital or lower case.
+/// :param mode: string representation of the split mode. One of [A,B,C] in capital or lower case.
 #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)]
 #[derive(Clone, PartialEq, Eq, Copy, Debug)]
 #[repr(u8)]
@@ -88,7 +88,7 @@ impl PySplitMode {
     }
 }
 
-/// Sudachi Tokenizer
+/// A sudachi tokenizer
 ///
 /// Create using Dictionary.create method.
 #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")]
@@ -123,15 +123,15 @@ impl PyTokenizer {
 
     /// Break text into morphemes.
     ///
-    /// SudachiPy 0.5.* had logger parameter, it is accepted, but ignored.
-    ///
-    /// :param text: text to analyze
+    /// :param text: text to analyze.
     /// :param mode: analysis mode.
     ///    This parameter is deprecated.
     ///    Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes.
     ///    If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead.
+    /// :param logger: Arg for v0.5.* compatibility. Ignored.
     /// :param out: tokenization results will be written into this MorphemeList, a new one will be created instead.
     ///    See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
+    ///
     /// :type text: str
     /// :type mode: sudachipy.SplitMode
     /// :type out: sudachipy.MorphemeList

From 706a573311551542cc726486daabfb10bf2c5966 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 13:53:56 +0900
Subject: [PATCH 11/24] add type fields for rs

---
 python/src/build.rs       | 26 ++++++++++++++++++++++--
 python/src/dictionary.rs  | 36 ++++++++++++++++++++++++++-------
 python/src/morpheme.rs    | 42 +++++++++++++++++++--------------------
 python/src/pos_matcher.rs |  4 +++-
 python/src/tokenizer.rs   | 14 +++++++++----
 5 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/python/src/build.rs b/python/src/build.rs
index 350f2fb3..2b2ce94f 100644
--- a/python/src/build.rs
+++ b/python/src/build.rs
@@ -59,8 +59,19 @@ fn create_file(p: &Path) -> std::io::Result<File> {
 }
 
 /// Build system dictionary from matrix and lexicons.
+///
+/// :param matrix: Path to the matrix file.
+/// :param lex: List of paths to lexicon files.
+/// :param output: Path to output built dictionary.
+/// :param description: A description text to embed in the dictionary.
+/// :return: A build report, list of (part, size, time).
+/// 
+/// :type matrix: pathlib.Path | str | bytes
+/// :type lex: list[pathlib.Path | str | bytes]
+/// :type output: pathlib.Path | str
+/// :type description: str
 #[pyfunction]
-#[pyo3(text_signature="(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
+#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
 fn build_system_dic<'p>(
     py: Python<'p>,
     matrix: &'p PyAny,
@@ -89,8 +100,19 @@ fn build_system_dic<'p>(
 }
 
 /// Build user dictionary from lexicons based on the given system dictionary.
+///
+/// :param system: Path to the system dictionary.
+/// :param lex: List of paths to lexicon files.
+/// :param output: Path to output built dictionary.
+/// :param description: A description text to embed in the dictionary.
+/// :return: A build report, list of (part, size, time).
+/// 
+/// :type system: pathlib.Path | str
+/// :type lex: list[pathlib.Path | str | bytes]
+/// :type output: pathlib.Path | str
+/// :type description: str
 #[pyfunction]
-#[pyo3(text_signature="(system, lex, output, description=None) -> list[tuple[str, int, float]]")]
+#[pyo3(text_signature = "(system, lex, output, description=None) -> list[tuple[str, int, float]]")]
 fn build_user_dic<'p>(
     py: Python<'p>,
     system: &'p PyAny,
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index 5f1e8f65..2b5c849b 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -90,6 +90,12 @@ impl PyDicData {
 /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
 ///     Also, can be an _absolute_ path to a compiled dictionary file.
 /// :param dict_type: deprecated alias to dict.
+///
+/// :type config_path: Config | pathlib.Path | str | None
+/// :type config: Config | pathlib.Path | str | None
+/// :type resource_dir: pathlib.Path | str | None
+/// :type dict: pathlib.Path | str | None
+/// :type dict_type: pathlib.Path | str | None
 #[pyclass(module = "sudachipy.dictionary", name = "Dictionary")]
 #[derive(Clone)]
 pub struct PyDictionary {
@@ -111,6 +117,12 @@ impl PyDictionary {
     /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
     ///     Also, can be an _absolute_ path to a compiled dictionary file.
     /// :param dict_type: deprecated alias to dict.
+    ///
+    /// :type config_path: Config | pathlib.Path | str | None
+    /// :type config: Config | pathlib.Path | str | None
+    /// :type resource_dir: pathlib.Path | str | None
+    /// :type dict: pathlib.Path | str | None
+    /// :type dict_type: pathlib.Path | str | None
     #[new]
     #[pyo3(
         text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary",
@@ -235,6 +247,10 @@ impl PyDictionary {
     /// :param fields: load only a subset of fields.
     ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
     /// :param projection: Projection override for created Tokenizer. See Config.projection for values.
+    ///
+    /// :type mode: SplitMode | str | None
+    /// :type fields: set[str] | None
+    /// :type projection: str | None
     #[pyo3(
         text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer",
         signature=(mode=None, fields=None, *, projection=None)
@@ -277,7 +293,9 @@ impl PyDictionary {
     /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form.
     ///
     /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
-    #[pyo3(text_signature="(self, /, target) -> PosMatcher")]
+    ///
+    /// :type target: Iterable[PartialPOS] | Callable[[POS], bool]
+    #[pyo3(text_signature = "(self, /, target) -> PosMatcher")]
     fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> {
         PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target)
     }
@@ -293,8 +311,10 @@ impl PyDictionary {
     ///     See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
     /// :param projection: Projection override for created Tokenizer. See Config.projection for values.
     ///
-    /// :type mode: SplitMode
-    /// :type fields: Set[str]
+    /// :type mode: SplitMode | str | None
+    /// :type fields: set[str] | None
+    /// :type handler: Callable[[int, NormalizedString, MorphemeList], list[NormalizedString]] | None
+    /// :type projection: str | None
     #[pyo3(
         text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer",
         signature=(mode=None, fields=None, handler=None, *, projection=None)
@@ -352,8 +372,8 @@ impl PyDictionary {
     ///     See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
     ///
     /// :type surface: str
-    /// :type out: MorphemeList
-    #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")]
+    /// :type out: MorphemeList | None
+    #[pyo3(text_signature = "(self, /, surface, out=None) -> MorphemeList")]
     fn lookup<'p>(
         &'p self,
         py: Python<'p>,
@@ -381,7 +401,7 @@ impl PyDictionary {
     }
 
     /// Close this dictionary.
-    #[pyo3(text_signature="(self, /) -> ()")]
+    #[pyo3(text_signature = "(self, /) -> ()")]
     fn close(&mut self) {
         self.dictionary = None;
     }
@@ -390,7 +410,9 @@ impl PyDictionary {
     ///
     /// :param pos_id: POS id
     /// :return: POS tuple with the given id or None for non existing id.
-    #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
+    ///
+    /// :type pos_id: int
+    #[pyo3(text_signature = "(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
     fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> {
         let dic = self.dictionary.as_ref().unwrap();
         dic.pos.get(pos_id).map(|x| x.as_ref(py))
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index 0a18f6c4..522d8ecd 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -94,7 +94,7 @@ impl PyMorphemeListWrapper {
 impl PyMorphemeListWrapper {
     /// Returns an empty morpheme list with dictionary.
     #[classmethod]
-    #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")]
+    #[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")]
     fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> {
         let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
         PyErr::warn(
@@ -113,13 +113,13 @@ impl PyMorphemeListWrapper {
     }
 
     /// Returns the total cost of the path.
-    #[pyo3(text_signature="(self, /) -> int")]
+    #[pyo3(text_signature = "(self, /) -> int")]
     fn get_internal_cost(&self, py: Python) -> i32 {
         self.internal(py).get_internal_cost()
     }
 
     /// Returns the number of morpheme in this list.
-    #[pyo3(text_signature="(self, /) -> int")]
+    #[pyo3(text_signature = "(self, /) -> int")]
     fn size(&self, py: Python) -> usize {
         self.internal(py).len()
     }
@@ -282,14 +282,14 @@ impl PyMorpheme {
 #[pymethods]
 impl PyMorpheme {
     /// Returns the begin index of this in the input text.
-    #[pyo3(text_signature="(self, /) -> int")]
+    #[pyo3(text_signature = "(self, /) -> int")]
     fn begin(&self, py: Python) -> usize {
         // call codepoint version
         self.morph(py).begin_c()
     }
 
     /// Returns the end index of this in the input text.
-    #[pyo3(text_signature="(self, /) -> int")]
+    #[pyo3(text_signature = "(self, /) -> int")]
     fn end(&self, py: Python) -> usize {
         // call codepoint version
         self.morph(py).end_c()
@@ -298,7 +298,7 @@ impl PyMorpheme {
     /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured.
     ///
     /// See `Config.projection`.
-    #[pyo3(text_signature="(self, /) -> str")]
+    #[pyo3(text_signature = "(self, /) -> str")]
     fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
         let list = self.list(py);
         let morph = self.morph(py);
@@ -311,14 +311,14 @@ impl PyMorpheme {
     /// Returns the substring of input text corresponding to the morpheme regardless the configured projection.
     ///
     /// See `Config.projection`.
-    #[pyo3(text_signature="(self, /) -> str")]
+    #[pyo3(text_signature = "(self, /) -> str")]
     fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
         PyString::new(py, self.morph(py).surface().deref())
     }
 
     /// Returns the part of speech as a six-element tuple.
     /// Tuple elements are four POS levels, conjugation type and conjugation form.
-    #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")]
+    #[pyo3(text_signature = "(self, /) -> tuple[str, str, str, str, str, str]")]
     fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py<PyTuple> {
         let pos_id = self.part_of_speech_id(py);
         self.list(py)
@@ -329,25 +329,25 @@ impl PyMorpheme {
     }
 
     /// Returns the id of the part of speech in the dictionary.
-    #[pyo3(text_signature="(self, /) -> int")]
+    #[pyo3(text_signature = "(self, /) -> int")]
     pub fn part_of_speech_id(&self, py: Python) -> u16 {
         self.morph(py).part_of_speech_id()
     }
 
     /// Returns the dictionary form.
-    #[pyo3(text_signature="(self, /) -> str")]
+    #[pyo3(text_signature = "(self, /) -> str")]
     fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().dictionary_form().into_py(py)
     }
 
     /// Returns the normalized form.
-    #[pyo3(text_signature="(self, /) -> str")]
+    #[pyo3(text_signature = "(self, /) -> str")]
     fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().normalized_form().into_py(py)
     }
 
     /// Returns the reading form.
-    #[pyo3(text_signature="(self, /) -> str")]
+    #[pyo3(text_signature = "(self, /) -> str")]
     fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject {
         self.morph(py).get_word_info().reading_form().into_py(py)
     }
@@ -362,12 +362,10 @@ impl PyMorpheme {
     /// :param add_single: return lists with the current morpheme if the split hasn't produced any elements.
     ///     When False is passed, empty lists are returned instead.
     ///
-    /// :type mode: sudachipy.SplitMode    
-    /// :type out: Optional[sudachipy.MorphemeList]
+    /// :type mode: SplitMode | None
+    /// :type out: MorphemeList | None
     /// :type add_single: bool
-    #[pyo3(
-        text_signature="(self, /, mode, out=None, add_single=False) -> MorphemeList"
-    )]
+    #[pyo3(text_signature = "(self, /, mode, out=None, add_single=False) -> MorphemeList")]
     fn split<'py>(
         &'py self,
         py: Python<'py>,
@@ -410,19 +408,19 @@ impl PyMorpheme {
     }
 
     /// Returns whether if this is out of vocabulary word.
-    #[pyo3(text_signature="(self, /) -> bool")]
+    #[pyo3(text_signature = "(self, /) -> bool")]
     fn is_oov(&self, py: Python) -> bool {
         self.morph(py).is_oov()
     }
 
     /// Returns word id of this word in the dictionary.
-    #[pyo3(text_signature="(self, /) -> int")]
+    #[pyo3(text_signature = "(self, /) -> int")]
     fn word_id(&self, py: Python) -> u32 {
         self.morph(py).word_id().as_raw()
     }
 
     /// Returns the dictionary id which this word belongs.
-    #[pyo3(text_signature="(self, /) -> int")]
+    #[pyo3(text_signature = "(self, /) -> int")]
     fn dictionary_id(&self, py: Python) -> i32 {
         let word_id = self.morph(py).word_id();
         if word_id.is_oov() {
@@ -433,7 +431,7 @@ impl PyMorpheme {
     }
 
     /// Returns the list of synonym group ids.
-    #[pyo3(text_signature="(self, /) -> List[int]")]
+    #[pyo3(text_signature = "(self, /) -> List[int]")]
     fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList {
         let mref = self.morph(py);
         let ids = mref.get_word_info().synonym_group_ids();
@@ -441,7 +439,7 @@ impl PyMorpheme {
     }
 
     /// Returns the word info.
-    #[pyo3(text_signature="(self, /) -> WordInfo")]
+    #[pyo3(text_signature = "(self, /) -> WordInfo")]
     fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> {
         let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
         PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?;
diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs
index 16d1fa56..bb9749f2 100644
--- a/python/src/pos_matcher.rs
+++ b/python/src/pos_matcher.rs
@@ -125,8 +125,10 @@ impl PyPosMatcher {
 impl PyPosMatcher {
     /// Checks whether a morpheme has matching POS.
     ///
-    /// :param m: morpheme.
+    /// :param m: a morpheme to check.
     /// :return: if morpheme has matching POS.
+    ///
+    /// :type m: Morpheme
     pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool {
         let pos_id = m.part_of_speech_id(py);
         self.matcher.matches_id(pos_id)
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 8c7c1c84..c14f7076 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -36,6 +36,9 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 /// C == long mode
 ///
 /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case.
+///     If None, returns SplitMode.C.
+///
+/// :type mode: str | None
 #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)]
 #[derive(Clone, PartialEq, Eq, Copy, Debug)]
 #[repr(u8)]
@@ -67,9 +70,12 @@ impl From<Mode> for PySplitMode {
 
 #[pymethods]
 impl PySplitMode {
-    /// Parse SplitMode from a character.
+    /// Creates a split mode from a string value.
+    ///
+    /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case.
+    ///     If None, returns SplitMode.C.
     ///
-    /// :param mode: str to parse. One of [A,B,C] in captital or lower case.
+    /// :type mode: str | None
     #[new]
     #[pyo3(
         text_signature="(mode=None) -> SplitMode",
@@ -133,8 +139,8 @@ impl PyTokenizer {
     ///    See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
     ///
     /// :type text: str
-    /// :type mode: sudachipy.SplitMode
-    /// :type out: sudachipy.MorphemeList
+    /// :type mode: SplitMode | str | None
+    /// :type out: MorphemeList
     #[pyo3(
         text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList",
         signature=(text, mode=None, logger=None, out=None)

From 5d8620ee643096027a687275b26838cb70874a68 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 14:16:45 +0900
Subject: [PATCH 12/24] improve pyi

---
 python/py_src/sudachipy/sudachipy.pyi | 47 ++++++++++++++++++---------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi
index 705b62af..0b1c4fc2 100644
--- a/python/py_src/sudachipy/sudachipy.pyi
+++ b/python/py_src/sudachipy/sudachipy.pyi
@@ -28,12 +28,20 @@ PartialPOS = Union[
     Tuple[()],
 ]
 
-# Fields that can be specified for partial dictionary loading.
-# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+"""
+Fields that can be specified for partial dictionary loading.
+See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+"""
 FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form",
                                 "word_structure", "split_a", "split_b", "synonym_group_id"]]]
 
 
+"""
+Strings that can be parsed as SplitMode
+"""
+SplitModeStr = Literal["A", "a", "B", "b", "C", "c"]
+
+
 class SplitMode:
     """
     Unit to split text.
@@ -48,11 +56,12 @@ class SplitMode:
     C: ClassVar[SplitMode] = ...
 
     @classmethod
-    def __init__(cls, mode: str = "C") -> None:
+    def __init__(cls, mode: Optional[SplitModeStr] = "C") -> None:
         """
         Creates a split mode from a string value.
 
         :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case.
+            If None, returns SplitMode.C.
         """
         ...
 
@@ -88,10 +97,10 @@ class Dictionary:
         ...
 
     def create(self,
-               mode: Union[SplitMode, Literal["A", "B", "C"]] = SplitMode.C,
-               fields: FieldSet = None,
+               mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C,
+               fields: Optional[FieldSet] = None,
                *,
-               projection: str = None) -> Tokenizer:
+               projection: Optional[str] = None) -> Tokenizer:
         """
         Creates a sudachi tokenizer.
 
@@ -118,12 +127,12 @@ class Dictionary:
         ...
 
     def pre_tokenizer(self,
-                      mode: Union[SplitMode, Literal["A", "B", "C"]] = "C",
-                      fields: FieldSet = None,
+                      mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C,
+                      fields: Optional[FieldSet] = None,
                       handler: Optional[Callable[[
                           int, object, MorphemeList], list]] = None,
                       *,
-                      projection: str = None) -> object:
+                      projection: Optional[str] = None) -> object:
         """
         Creates HuggingFace Tokenizers-compatible PreTokenizer.
         Requires package `tokenizers` to be installed.
@@ -230,7 +239,10 @@ class Morpheme:
         """
         ...
 
-    def split(self, mode: Union[SplitMode, Literal["A", "B", "C"]], out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList:
+    def split(self,
+              mode: Union[SplitMode, SplitModeStr],
+              out: Optional[MorphemeList] = None,
+              add_single: bool = True) -> MorphemeList:
         """
         Returns sub-morphemes in the provided split mode.
 
@@ -288,7 +300,7 @@ class MorphemeList:
     def __init__(self) -> None: ...
 
     @classmethod
-    def empty(cls, dict) -> MorphemeList:
+    def empty(cls, dict: Dictionary) -> MorphemeList:
         """
         Returns an empty morpheme list with dictionary.
         """
@@ -306,7 +318,7 @@ class MorphemeList:
         """
         ...
 
-    def __getitem__(self, index) -> Morpheme: ...
+    def __getitem__(self, index: int) -> Morpheme: ...
     def __iter__(self) -> Iterator[Morpheme]: ...
     def __len__(self) -> int: ...
 
@@ -318,11 +330,13 @@ class Tokenizer:
     Create using Dictionary.create method.
     """
     SplitMode: ClassVar[SplitMode] = ...
+
     @classmethod
     def __init__(cls) -> None: ...
 
-    def tokenize(self, text: str,
-                 mode: Union[SplitMode, Literal["A", "B", "C"]] = ...,
+    def tokenize(self,
+                 text: str,
+                 mode: Union[SplitMode, SplitModeStr, None] = None,
                  out: Optional[MorphemeList] = None) -> MorphemeList:
         """
         Break text into morphemes.
@@ -359,6 +373,7 @@ class WordInfo:
     surface: ClassVar[str] = ...
     synonym_group_ids: ClassVar[List[int]] = ...
     word_structure: ClassVar[List[int]] = ...
+
     @classmethod
     def __init__(self) -> None: ...
     def length(self) -> int: ...
@@ -374,11 +389,11 @@ class PosMatcher:
     def __iter__(self) -> Iterator[POS]: ...
     def __len__(self) -> int: ...
 
-    def __call__(self, m: Morpheme) -> bool:
+    def __call__(self, /, m: Morpheme) -> bool:
         """
         Checks whether a morpheme has matching POS.
 
-        :param m: morpheme.
+        :param m: a morpheme to check.
         :return: if morpheme has matching POS.
         """
         ...

From 3fd3e52e268b6906fd9eda2e6ad8c84abc44979e Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 11:04:18 +0900
Subject: [PATCH 13/24] use get_all for wordinfo

---
 python/src/word_info.rs | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/python/src/word_info.rs b/python/src/word_info.rs
index 4f74d0f1..eb51a28d 100644
--- a/python/src/word_info.rs
+++ b/python/src/word_info.rs
@@ -18,29 +18,18 @@ use pyo3::prelude::*;
 
 use sudachi::dic::lexicon::word_infos::{WordInfo, WordInfoData};
 
-#[pyclass(module = "sudachipy.wordinfo", name = "WordInfo")]
+#[pyclass(module = "sudachipy.wordinfo", name = "WordInfo", get_all)]
 pub struct PyWordInfo {
-    #[pyo3(get)]
     surface: String,
-    #[pyo3(get)]
     head_word_length: u16,
-    #[pyo3(get)]
     pos_id: u16,
-    #[pyo3(get)]
     normalized_form: String,
-    #[pyo3(get)]
     dictionary_form_word_id: i32,
-    #[pyo3(get)]
     dictionary_form: String,
-    #[pyo3(get)]
     reading_form: String,
-    #[pyo3(get)]
     a_unit_split: Vec<u32>,
-    #[pyo3(get)]
     b_unit_split: Vec<u32>,
-    #[pyo3(get)]
     word_structure: Vec<u32>,
-    #[pyo3(get)]
     synonym_group_ids: Vec<u32>,
 }
 

From d1c31655292adc80e1b9a1051bb4b90752500e6f Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 8 Jul 2024 14:51:00 +0900
Subject: [PATCH 14/24] add deprecated directive and fix

---
 python/py_src/sudachipy/sudachipy.pyi | 9 +++++++++
 python/src/build.rs                   | 4 ++--
 python/src/dictionary.rs              | 4 ++--
 python/src/morpheme.rs                | 6 ++++++
 python/src/tokenizer.rs               | 2 ++
 5 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi
index 0b1c4fc2..ca39a95c 100644
--- a/python/py_src/sudachipy/sudachipy.pyi
+++ b/python/py_src/sudachipy/sudachipy.pyi
@@ -47,7 +47,9 @@ class SplitMode:
     Unit to split text.
 
     A == short mode
+
     B == middle mode
+
     C == long mode
     """
 
@@ -205,6 +207,9 @@ class Morpheme:
     def get_word_info(self) -> WordInfo:
         """
         Returns the word info.
+
+        .. deprecated:: 0.6.0
+           Users should not touch the raw WordInfo.
         """
         ...
 
@@ -293,6 +298,7 @@ class Morpheme:
 class MorphemeList:
     """
     A list of morphemes.
+
     An object can not be instantiated manually.
     Use Tokenizer.tokenize("") to create an empty morpheme list.
     """
@@ -303,6 +309,9 @@ class MorphemeList:
     def empty(cls, dict: Dictionary) -> MorphemeList:
         """
         Returns an empty morpheme list with dictionary.
+
+        .. deprecated::
+            Use Tokenizer.tokenize("") if you need.
         """
         ...
 
diff --git a/python/src/build.rs b/python/src/build.rs
index 2b2ce94f..b37ed807 100644
--- a/python/src/build.rs
+++ b/python/src/build.rs
@@ -65,7 +65,7 @@ fn create_file(p: &Path) -> std::io::Result<File> {
 /// :param output: Path to output built dictionray.
 /// :param description: A description text to embed in the dictionary.
 /// :return: A build report, list of (part, size, time).
-/// 
+///
 /// :type matrix: pathlib.Path | str | bytes
 /// :type lex: list[pathlib.Path | str | bytes]
 /// :type output: pathlib.Path | str
@@ -106,7 +106,7 @@ fn build_system_dic<'p>(
 /// :param output: Path to output built dictionray.
 /// :param description: A description text to embed in the dictionary.
 /// :return: A build report, list of (part, size, time).
-/// 
+///
 /// :type system: pathlib.Path | str
 /// :type lex: list[pathlib.Path | str | bytes]
 /// :type output: pathlib.Path | str
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index 2b5c849b..22241f95 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -87,7 +87,7 @@ impl PyDicData {
 /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
 /// :param config: alias to config_path, only one of them can be specified at the same time.
 /// :param resource_dir: path to the resource directory folder.
-/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
+/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
 ///     Also, can be an _absolute_ path to a compiled dictionary file.
 /// :param dict_type: deprecated alias to dict.
 ///
@@ -114,7 +114,7 @@ impl PyDictionary {
     /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
     /// :param config: alias to config_path, only one of them can be specified at the same time.
     /// :param resource_dir: path to the resource directory folder.
-    /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
+    /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
     ///     Also, can be an _absolute_ path to a compiled dictionary file.
     /// :param dict_type: deprecated alias to dict.
     ///
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index 522d8ecd..b9367e10 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -93,6 +93,9 @@ impl PyMorphemeListWrapper {
 #[pymethods]
 impl PyMorphemeListWrapper {
     /// Returns an empty morpheme list with dictionary.
+    ///
+    /// .. deprecated:: 0.6.0
+    ///     Use Tokenizer.tokenize("") if you need.
     #[classmethod]
     #[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")]
     fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> {
@@ -439,6 +442,9 @@ impl PyMorpheme {
     }
 
     /// Returns the word info.
+    ///
+    /// .. deprecated:: 0.6.0
+    ///    Users should not touch the raw WordInfo.
     #[pyo3(text_signature = "(self, /) -> WordInfo")]
     fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> {
         let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index c14f7076..d96763de 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -32,7 +32,9 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 /// Unit to split text.
 ///
 /// A == short mode
+///
 /// B == middle mode
+///
 /// C == long mode
 ///
 /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case.

From 4a3da5bacd868112165ac5f3c5c49d5f82eba48f Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Wed, 25 Sep 2024 09:36:20 +0900
Subject: [PATCH 15/24] update Dictionary arg name

---
 python/README.md | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/python/README.md b/python/README.md
index 4d95d7fb..b1ad3e5e 100644
--- a/python/README.md
+++ b/python/README.md
@@ -66,7 +66,7 @@ $ pip install sudachipy
 
 ### Step 2. Get a Dictionary
 
-You can get dictionary as a Python package. It make take a while to download the dictionary file (around 70MB for the `core` edition).
+You can get dictionary as a Python package. It may take a while to download the dictionary file (around 70MB for the `core` edition).
 
 ```bash
 $ pip install sudachidict_core
@@ -209,7 +209,7 @@ There are three editions of Sudachi Dictionary, namely, `small`, `core`, and `fu
 
 SudachiPy uses `sudachidict_core` by default.
 
-Dictionaries are installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`.
+Dictionaries can be installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`.
 
 * [SudachiDict-small · PyPI](https://pypi.org/project/SudachiDict-small/)
 * [SudachiDict-core · PyPI](https://pypi.org/project/SudachiDict-core/)
@@ -234,19 +234,19 @@ $ echo "外国人参政権" | sudachipy -s full
 
 ### Dictionary option: Python package
 
-You can specify the dictionary with the `Dicionary()` argument; `config_path` or `dict_type`.
+You can specify the dictionary with the `Dictionary()` argument; `config` or `dict`.
 
 ```python
-class Dictionary(config_path=None, resource_dir=None, dict_type=None)
+class Dictionary(config=None, resource_dir=None, dict=None)
 ```
 
-1. `config_path`
-    * You can specify the file path to the setting file with `config_path` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail).
+1. `config`
+    * You can specify the file path to the setting file with `config` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail).
     * If the dictionary file is specified in the setting file as `systemDict`, SudachiPy will use the dictionary.
-2. `dict_type`
-    * You can also specify the dictionary type with `dict_type`.
-    * The available arguments are `small`, `core`, or `full`.
-    * If different dictionaries are specified with `config_path` and `dict_type`, **a dictionary defined `dict_type` overrides** those defined in the config path.
+2. `dict`
+    * You can also specify the dictionary type with `dict`.
+    * The available arguments are `small`, `core`, `full`, or a path to the dictionary file.
+    * If different dictionaries are specified with `config` and `dict`, **a dictionary defined `dict` overrides** those defined in the config.
 
 ```python
 from sudachipy import Dictionary
@@ -255,16 +255,16 @@ from sudachipy import Dictionary
 tokenizer_obj = Dictionary().create()
 
 # The dictionary given by the `systemDict` key in the config file (/path/to/sudachi.json) will be used
-tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json").create()
+tokenizer_obj = Dictionary(config="/path/to/sudachi.json").create()
 
-# The dictionary specified by `dict_type` will be set.
-tokenizer_obj = Dictionary(dict_type="core").create()  # sudachidict_core (same as default)
-tokenizer_obj = Dictionary(dict_type="small").create()  # sudachidict_small
-tokenizer_obj = Dictionary(dict_type="full").create()  # sudachidict_full
+# The dictionary specified by `dict` will be used.
+tokenizer_obj = Dictionary(dict="core").create()  # sudachidict_core (same as default)
+tokenizer_obj = Dictionary(dict="small").create()  # sudachidict_small
+tokenizer_obj = Dictionary(dict="full").create()  # sudachidict_full
 
-# The dictionary specified by `dict_type` overrides those defined in the config path.
+# The dictionary specified by `dict` overrides those defined in the config.
 # In the following code, `sudachidict_full` will be used regardless of a dictionary defined in the config file.
-tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json", dict_type="full").create()
+tokenizer_obj = Dictionary(config="/path/to/sudachi.json", dict="full").create()
 ```
 
 
@@ -303,10 +303,8 @@ Then specify your `sudachi.json` with the `-r` option.
 $ sudachipy -r path/to/sudachi.json
 ```
 
-
 You can build a user dictionary with the subcommand `ubuild`.
 
-
 ```bash
 $ sudachipy ubuild -h
 usage: sudachipy ubuild [-h] [-o file] [-d string] -s file file [file ...]

From c943da8d452b28ff291f4671d1cab50b0078d3e9 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 10 Jun 2024 09:41:05 +0900
Subject: [PATCH 16/24] use crate::errors to send err

---
 python/src/build.rs        |  4 +--
 python/src/dictionary.rs   | 67 +++++++++++++++++---------------------
 python/src/errors.rs       |  8 ++++-
 python/src/morpheme.rs     | 31 +++++++-----------
 python/src/pos_matcher.rs  |  4 +--
 python/src/pretokenizer.rs |  4 +--
 python/src/projection.rs   | 18 ++++------
 python/src/tokenizer.rs    | 30 ++++++++---------
 8 files changed, 74 insertions(+), 92 deletions(-)

diff --git a/python/src/build.rs b/python/src/build.rs
index a6005b26..6b3bd0ca 100644
--- a/python/src/build.rs
+++ b/python/src/build.rs
@@ -142,8 +142,8 @@ fn as_data_source<'p>(py: Python<'p>, data: &'p PyAny) -> PyResult<DataSource<'p
         let data = data.downcast::<PyBytes>()?;
         Ok(DataSource::Data(data.as_bytes()))
     } else {
-        Err(pyo3::exceptions::PyValueError::new_err(format!(
-            "data source should can be only Path, bytes or str, was {}: {}",
+        errors::wrap(Err(format!(
+            "data source should be Path, bytes or str, was {}: {}",
             data,
             data.get_type()
         )))
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index bc333c8e..802e23c2 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -24,7 +24,6 @@ use std::str::FromStr;
 use std::sync::Arc;
 use sudachi::analysis::Mode;
 
-use crate::errors::{wrap, wrap_ctx, SudachiError as SudachiErr};
 use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
 use sudachi::config::{Config, ConfigBuilder, SurfaceProjection};
 use sudachi::dic::dictionary::JapaneseDictionary;
@@ -35,6 +34,7 @@ use sudachi::plugin::input_text::InputTextPlugin;
 use sudachi::plugin::oov::OovProviderPlugin;
 use sudachi::plugin::path_rewrite::PathRewritePlugin;
 
+use crate::errors;
 use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 use crate::pos_matcher::PyPosMatcher;
 use crate::pretokenizer::PyPretokenizer;
@@ -110,7 +110,7 @@ impl PyDictionary {
         config: Option<&PyAny>,
     ) -> PyResult<Self> {
         if config.is_some() && config_path.is_some() {
-            return Err(SudachiErr::new_err("Both config and config_path options were specified at the same time, use one of them"));
+            return errors::wrap(Err("Both config and config_path options were specified at the same time, use one of them"));
         }
 
         let default_config = read_default_config(py)?;
@@ -131,13 +131,10 @@ impl PyDictionary {
         };
 
         if dict_type.is_some() {
-            let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
-            PyErr::warn(
+            errors::warn_deprecation(
                 py,
-                cat,
                 "Parameter dict_type of Dictionary() is deprecated, use dict instead",
-                1,
-            )?;
+            )?
         }
 
         let config_builder = match resource_dir {
@@ -177,12 +174,10 @@ impl PyDictionary {
             }
         }
 
-        let jdic = JapaneseDictionary::from_cfg(&config).map_err(|e| {
-            SudachiErr::new_err(format!(
-                "Error while constructing dictionary: {}",
-                e.to_string()
-            ))
-        })?;
+        let jdic = errors::wrap_ctx(
+            JapaneseDictionary::from_cfg(&config),
+            "Error while constructing dictionary",
+        )?;
 
         let pos_data = jdic
             .grammar()
@@ -238,7 +233,7 @@ impl PyDictionary {
         let mut required_fields = self.config.projection.required_subset();
         let dict = self.dictionary.as_ref().unwrap().clone();
         let projobj = if let Some(s) = projection {
-            let proj = wrap(SurfaceProjection::try_from(s.to_str()?))?;
+            let proj = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?;
             required_fields = proj.required_subset();
             Some(morpheme_projection(proj, &dict))
         } else {
@@ -301,7 +296,7 @@ impl PyDictionary {
         let subset = parse_field_subset(fields)?;
         if let Some(h) = handler.as_ref() {
             if !h.as_ref(py).is_callable() {
-                return Err(SudachiErr::new_err("handler must be callable"));
+                return errors::wrap(Err("handler must be callable"));
             }
         }
 
@@ -357,12 +352,12 @@ impl PyDictionary {
         // this needs to be a variable
         let mut borrow = l.try_borrow_mut();
         let out_list = match borrow {
-            Err(_) => return Err(SudachiErr::new_err("out was used twice at the same time")),
             Ok(ref mut ms) => ms.internal_mut(py),
+            Err(_) => return errors::wrap(Err("out was used twice at the same time")),
         };
 
         out_list.clear();
-        wrap_ctx(out_list.lookup(surface, InfoSubset::all()), surface)?;
+        errors::wrap_ctx(out_list.lookup(surface, InfoSubset::all()), surface)?;
         Ok(l)
     }
 
@@ -380,7 +375,7 @@ impl PyDictionary {
     }
 
     fn __repr__(&self) -> PyResult<String> {
-        wrap(config_repr(&self.config))
+        errors::wrap(config_repr(&self.config))
     }
 }
 
@@ -413,18 +408,21 @@ fn config_repr(cfg: &Config) -> Result<String, std::fmt::Error> {
 
 pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &'py PyAny) -> PyResult<Mode> {
     if mode.is_instance_of::<PyString>() {
-        let mode = mode.str()?.to_str()?;
-        Mode::from_str(mode).map_err(|e| SudachiErr::new_err(e).into())
+        errors::wrap(Mode::from_str(mode.str()?.to_str()?))
     } else if mode.is_instance_of::<PySplitMode>() {
         let mode = mode.extract::<PySplitMode>()?;
         Ok(Mode::from(mode))
     } else {
-        Err(SudachiErr::new_err(("unknown mode", mode.into_py(py))))
+        errors::wrap(Err(format!(
+            "mode should be sudachipy.SplitMode or str, was {}: {}",
+            mode,
+            mode.get_type()
+        )))
     }
 }
 
 fn read_config_from_fs(path: Option<&Path>) -> PyResult<ConfigBuilder> {
-    wrap(ConfigBuilder::from_opt_file(path))
+    errors::wrap(ConfigBuilder::from_opt_file(path))
 }
 
 fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
@@ -433,13 +431,13 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
         // looks like json
         if config_str.starts_with("{") && config_str.ends_with("}") {
             let result = ConfigBuilder::from_bytes(config_str.as_bytes());
-            return wrap(result);
+            return errors::wrap(result);
         }
         let p = Path::new(config_str);
         if p.exists() && p.is_file() {
             return read_config_from_fs(Some(p));
         }
-        return Err(SudachiErr::new_err(format!(
+        return errors::wrap(Err(format!(
             "config file [{}] do not exist or is not a file",
             p.display()
         )));
@@ -450,9 +448,10 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
         let cfg_as_str = config_opt.call_method0("as_jsons")?;
         return read_config(cfg_as_str);
     }
-    Err(SudachiErr::new_err((
-        format!("passed config was not a string, json object or sudachipy.config.Config object"),
-        config_opt.into_py(py),
+    errors::wrap(Err(format!(
+        "config should be sudachipy.Config or str which represents a file path or a json object, was {}: {}",
+        config_opt,
+        config_opt.get_type()
     )))
 }
 
@@ -460,7 +459,7 @@ pub(crate) fn read_default_config(py: Python) -> PyResult<ConfigBuilder> {
     let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?;
     let path = path.downcast::<PyString>()?.to_str()?;
     let path = PathBuf::from(path);
-    wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path)
+    errors::wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path)
 }
 
 pub(crate) fn get_default_resource_dir(py: Python) -> PyResult<PathBuf> {
@@ -484,10 +483,7 @@ fn locate_system_dict(py: Python, path: &Path) -> PyResult<PathBuf> {
     }
     match path.to_str() {
         Some(name @ ("small" | "core" | "full")) => find_dict_path(py, name),
-        _ => Err(SudachiErr::new_err(format!(
-            "invalid dictionary path {:?}",
-            path
-        ))),
+        _ => errors::wrap(Err(format!("invalid dictionary path {:?}", path))),
     }
 }
 
@@ -509,12 +505,7 @@ fn parse_field_subset(data: Option<&PySet>) -> PyResult<InfoSubset> {
             "split_a" => InfoSubset::SPLIT_A,
             "split_b" => InfoSubset::SPLIT_B,
             "synonym_group_id" => InfoSubset::SYNONYM_GROUP_ID,
-            x => {
-                return Err(SudachiErr::new_err(format!(
-                    "Invalid WordInfo field name {}",
-                    x
-                )))
-            }
+            x => return errors::wrap(Err(format!("Invalid WordInfo field name {}", x))),
         };
     }
     Ok(subset)
diff --git a/python/src/errors.rs b/python/src/errors.rs
index 04827fd4..da72601a 100644
--- a/python/src/errors.rs
+++ b/python/src/errors.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+use pyo3::exceptions::PyDeprecationWarning;
+use pyo3::prelude::*;
 use pyo3::{import_exception, PyResult};
 use std::fmt::{Debug, Display};
 
@@ -33,3 +35,7 @@ pub fn wrap_ctx<T, E: Display, C: Debug + ?Sized>(v: Result<T, E>, ctx: &C) -> P
         Err(e) => Err(SudachiError::new_err(format!("{:?}: {}", ctx, e))),
     }
 }
+
+pub fn warn_deprecation(py: Python<'_>, msg: &str) -> PyResult<()> {
+    PyErr::warn(py, &py.get_type::<PyDeprecationWarning>(), msg, 1)
+}
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index ad3929dd..fd097336 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -18,13 +18,14 @@ use std::fmt::Write;
 use std::ops::Deref;
 use std::sync::Arc;
 
-use pyo3::exceptions::{PyException, PyIndexError};
+use pyo3::exceptions::PyIndexError;
 use pyo3::prelude::*;
 use pyo3::types::{PyList, PyString, PyTuple, PyType};
 
 use sudachi::prelude::{Morpheme, MorphemeList};
 
 use crate::dictionary::{extract_mode, PyDicData, PyDictionary};
+use crate::errors;
 use crate::projection::MorphemeProjection;
 use crate::word_info::PyWordInfo;
 
@@ -92,12 +93,9 @@ impl PyMorphemeListWrapper {
     #[classmethod]
     #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")]
     fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> {
-        let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
-        PyErr::warn(
+        errors::warn_deprecation(
             py,
-            cat,
             "Use Tokenizer.tokenize(\"\") if you need an empty MorphemeList.",
-            1,
         )?;
 
         let cloned = dict.dictionary.as_ref().unwrap().clone();
@@ -176,9 +174,7 @@ impl PyMorphemeListWrapper {
                 list: slf.clone_ref(py),
                 index: i,
             };
-            pymorph
-                .write_repr(py, &mut result)
-                .map_err(|_| PyException::new_err("format failed"))?;
+            errors::wrap_ctx(pymorph.write_repr(py, &mut result), "format failed")?;
             result.push_str(",\n");
         }
         result.push_str("]>");
@@ -380,16 +376,14 @@ impl PyMorpheme {
         let mut borrow = out_cell.try_borrow_mut();
         let out_ref = match borrow {
             Ok(ref mut v) => v.internal_mut(py),
-            Err(_) => return Err(PyException::new_err("out was used twice")),
+            Err(_) => return errors::wrap(Err("out was used twice at the same time")),
         };
 
         out_ref.clear();
-        let splitted = list
-            .internal(py)
-            .split_into(mode, self.index, out_ref)
-            .map_err(|e| {
-                PyException::new_err(format!("Error while splitting morpheme: {}", e.to_string()))
-            })?;
+        let splitted = errors::wrap_ctx(
+            list.internal(py).split_into(mode, self.index, out_ref),
+            "Error while splitting morpheme",
+        )?;
 
         if add_single.unwrap_or(true) && !splitted {
             list.internal(py)
@@ -433,9 +427,7 @@ impl PyMorpheme {
     /// Returns the word info
     #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")]
     fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> {
-        let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
-        PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?;
-
+        errors::warn_deprecation(py, "Users should not touch the raw WordInfo.")?;
         Ok(self.morph(py).get_word_info().clone().into())
     }
 
@@ -451,8 +443,7 @@ impl PyMorpheme {
 
     pub fn __repr__<'py>(&'py self, py: Python<'py>) -> PyResult<String> {
         let mut result = String::new();
-        self.write_repr(py, &mut result)
-            .map_err(|_| PyException::new_err("failed to format repr"))?;
+        errors::wrap_ctx(self.write_repr(py, &mut result), "failed to format repr")?;
         Ok(result)
     }
 }
diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs
index 7c6a884d..f0a53b64 100644
--- a/python/src/pos_matcher.rs
+++ b/python/src/pos_matcher.rs
@@ -16,7 +16,6 @@
 
 use std::sync::Arc;
 
-use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use pyo3::types::{PyBool, PyIterator, PyTuple};
 
@@ -24,6 +23,7 @@ use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
 use sudachi::pos::PosMatcher;
 
 use crate::dictionary::PyDicData;
+use crate::errors;
 use crate::morpheme::PyMorpheme;
 
 #[pyclass(name = "PosMatcher", module = "sudachipy")]
@@ -106,7 +106,7 @@ impl PyPosMatcher {
         }
 
         if start_len == data.len() {
-            Err(PyException::new_err(format!(
+            errors::wrap(Err(format!(
                 "POS {:?} did not match any elements",
                 elem.repr()?
             )))
diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs
index 755f040b..49cf1a29 100644
--- a/python/src/pretokenizer.rs
+++ b/python/src/pretokenizer.rs
@@ -15,7 +15,7 @@
  */
 
 use crate::dictionary::PyDicData;
-use crate::errors::wrap;
+use crate::errors;
 use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector};
 use pyo3::intern;
 use pyo3::prelude::*;
@@ -49,7 +49,7 @@ impl PerThreadPreTokenizer {
 
     pub fn tokenize(&mut self, data: &str) -> PyResult<()> {
         self.tokenizer.reset().push_str(data);
-        wrap(self.tokenizer.do_tokenize())?;
+        errors::wrap(self.tokenizer.do_tokenize())?;
         Ok(())
     }
 
diff --git a/python/src/projection.rs b/python/src/projection.rs
index 8bea35be..7739c7bc 100644
--- a/python/src/projection.rs
+++ b/python/src/projection.rs
@@ -15,6 +15,7 @@
  */
 
 use crate::dictionary::PyDicData;
+use crate::errors;
 use crate::morpheme::PyProjector;
 use pyo3::types::PyString;
 use pyo3::{PyResult, Python};
@@ -174,18 +175,13 @@ pub(crate) fn parse_projection_raw<D: DictionaryAccess>(
     value: &str,
     dict: &D,
 ) -> PyResult<(PyProjector, SurfaceProjection)> {
-    match SurfaceProjection::try_from(value) {
-        Ok(v) => {
-            if v == SurfaceProjection::Surface {
-                Ok((None, SurfaceProjection::Surface))
-            } else {
-                Ok((Some(morpheme_projection(v, dict)), v))
-            }
+    errors::wrap_ctx(SurfaceProjection::try_from(value).map(|v| {
+        if v == SurfaceProjection::Surface {
+            (None, SurfaceProjection::Surface)
+        } else {
+            (Some(morpheme_projection(v, dict)), v)
         }
-        Err(e) => Err(crate::errors::SudachiError::new_err(format!(
-            "invalid surface projection: {e:?}"
-        ))),
-    }
+    }), "invalid surface projection")
 }
 
 pub(crate) fn parse_projection_opt<D: DictionaryAccess>(
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 558d02cb..18ec0a63 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -26,7 +26,7 @@ use sudachi::dic::subset::InfoSubset;
 use sudachi::prelude::*;
 
 use crate::dictionary::{extract_mode, PyDicData};
-use crate::errors::SudachiError as SudachiPyErr;
+use crate::errors;
 use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 
 /// Unit to split text
@@ -74,11 +74,7 @@ impl PySplitMode {
             Some(m) => m,
             None => return Ok(PySplitMode::C),
         };
-
-        match Mode::from_str(mode) {
-            Ok(m) => Ok(m.into()),
-            Err(e) => Err(SudachiPyErr::new_err(e.to_string())),
-        }
+        errors::wrap(Mode::from_str(mode).map(|m| m.into()))
     }
 }
 
@@ -151,12 +147,13 @@ impl PyTokenizer {
         });
 
         // analysis can be done without GIL
-        let err = py.allow_threads(|| {
-            tokenizer.reset().push_str(text);
-            tokenizer.do_tokenize()
-        });
-
-        err.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?;
+        errors::wrap_ctx(
+            py.allow_threads(|| {
+                tokenizer.reset().push_str(text);
+                tokenizer.do_tokenize()
+            }),
+            "Error during tokenization",
+        )?;
 
         let out_list = match out {
             None => {
@@ -172,12 +169,13 @@ impl PyTokenizer {
         let mut borrow = out_list.try_borrow_mut();
         let morphemes = match borrow {
             Ok(ref mut ms) => ms.internal_mut(py),
-            Err(e) => return Err(SudachiPyErr::new_err("out was used twice at the same time")),
+            Err(_) => return errors::wrap(Err("out was used twice at the same time")),
         };
 
-        morphemes
-            .collect_results(tokenizer.deref_mut())
-            .map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?;
+        errors::wrap_ctx(
+            morphemes.collect_results(tokenizer.deref_mut()),
+            "Error during tokenization",
+        )?;
 
         Ok(out_list)
     }

From a4a47e21c6b27ffd39cfa2dbebc4d51f85b1c0e3 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Fri, 25 Oct 2024 16:20:38 +0900
Subject: [PATCH 17/24] cargo fmt

---
 python/src/projection.rs | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/src/projection.rs b/python/src/projection.rs
index 7739c7bc..9140e747 100644
--- a/python/src/projection.rs
+++ b/python/src/projection.rs
@@ -175,13 +175,16 @@ pub(crate) fn parse_projection_raw<D: DictionaryAccess>(
     value: &str,
     dict: &D,
 ) -> PyResult<(PyProjector, SurfaceProjection)> {
-    errors::wrap_ctx(SurfaceProjection::try_from(value).map(|v| {
-        if v == SurfaceProjection::Surface {
-            (None, SurfaceProjection::Surface)
-        } else {
-            (Some(morpheme_projection(v, dict)), v)
-        }
-    }), "invalid surface projection")
+    errors::wrap_ctx(
+        SurfaceProjection::try_from(value).map(|v| {
+            if v == SurfaceProjection::Surface {
+                (None, SurfaceProjection::Surface)
+            } else {
+                (Some(morpheme_projection(v, dict)), v)
+            }
+        }),
+        "invalid surface projection",
+    )
 }
 
 pub(crate) fn parse_projection_opt<D: DictionaryAccess>(

From a33bc6b30061ca28c874a3c9d7f47e670054a92b Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Fri, 25 Oct 2024 17:43:23 +0900
Subject: [PATCH 18/24] add new line for the matrix size

---
 sudachi-cli/src/build.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs
index eea11ecf..eb2e716c 100644
--- a/sudachi-cli/src/build.rs
+++ b/sudachi-cli/src/build.rs
@@ -210,7 +210,7 @@ fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) {
 
 fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) {
     let conn = grammar.conn_matrix();
-    write!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap();
+    write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap();
 
     for left in 0..conn.num_left() {
         for right in 0..conn.num_right() {

From 848e637fc271735e3ede206a84d03b776722b708 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 28 Oct 2024 16:03:29 +0900
Subject: [PATCH 19/24] dump pos_id

---
 sudachi-cli/src/build.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs
index eb2e716c..62248809 100644
--- a/sudachi-cli/src/build.rs
+++ b/sudachi-cli/src/build.rs
@@ -196,7 +196,8 @@ fn dump_part(dict: PathBuf, part: String, output: PathBuf) {
 }
 
 fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) {
-    for p in grammar.pos_list.iter() {
+    for (id, p) in grammar.pos_list.iter().enumerate() {
+        write!(w, "{},", id).unwrap();
         for (i, e) in p.iter().enumerate() {
             w.write_all(e.as_bytes()).unwrap();
             if (i + 1) == p.len() {

From ec9a0f4dff64d7a6792504ef7e37ebcf57bdc978 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 28 Oct 2024 17:16:55 +0900
Subject: [PATCH 20/24] dump winfo in lexicon format

---
 sudachi-cli/src/build.rs | 97 +++++++++++++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 17 deletions(-)

diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs
index 62248809..ec7f4620 100644
--- a/sudachi-cli/src/build.rs
+++ b/sudachi-cli/src/build.rs
@@ -27,6 +27,7 @@ use sudachi::dic::build::report::DictPartReport;
 use sudachi::dic::build::DictBuilder;
 use sudachi::dic::dictionary::JapaneseDictionary;
 use sudachi::dic::grammar::Grammar;
+use sudachi::dic::lexicon::word_infos::WordInfo;
 use sudachi::dic::lexicon_set::LexiconSet;
 use sudachi::dic::word_id::WordId;
 use sudachi::dic::DictionaryLoader;
@@ -79,6 +80,7 @@ pub(crate) enum BuildCli {
         dict: PathBuf,
         part: String,
         output: PathBuf,
+        // todo: dump user dict
     },
 }
 
@@ -189,7 +191,7 @@ fn dump_part(dict: PathBuf, part: String, output: PathBuf) {
     match part.as_str() {
         "pos" => dump_pos(dict.grammar(), &mut writer),
         "matrix" => dump_matrix(dict.grammar(), &mut writer),
-        "winfo" => dump_word_info(dict.lexicon(), &mut writer).unwrap(),
+        "winfo" => dump_word_info(&dict, &mut writer).unwrap(),
         _ => unimplemented!(),
     }
     writer.flush().unwrap();
@@ -221,23 +223,28 @@ fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) {
     }
 }
 
-fn dump_word_info<W: Write>(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> {
+fn dump_word_info<W: Write>(dict: &dyn DictionaryAccess, w: &mut W) -> SudachiResult<()> {
+    let grammar = dict.grammar();
+    let lex = dict.lexicon();
     let size = lex.size();
     for i in 0..size {
         let wid = WordId::checked(0, i)?;
         let (left, right, cost) = lex.get_word_param(wid);
         let winfo = lex.get_word_info(wid)?;
+        write!(w, "{},", unicode_escape(winfo.surface()))?;
         write!(w, "{},{},{},", left, right, cost)?;
-        write!(w, "{},", winfo.surface())?;
-        write!(w, "{},", winfo.head_word_length())?;
-        write!(w, "{},", winfo.normalized_form())?;
-        write!(w, "{},", winfo.dictionary_form_word_id())?;
-        write!(w, "{},", winfo.reading_form())?;
-        dump_wids(w, winfo.a_unit_split())?;
+        write!(w, "{},", unicode_escape(winfo.surface()))?; // writing
+        write!(w, "{},", pos_string(grammar, winfo.pos_id()))?;
+        write!(w, "{},", unicode_escape(winfo.reading_form()))?;
+        write!(w, "{},", unicode_escape(winfo.normalized_form()))?;
+        let dict_form = dictionary_form_string(grammar, lex, winfo.dictionary_form_word_id());
+        write!(w, "{},", dict_form)?;
+        write!(w, "{},", split_mode(&winfo))?;
+        dump_wids(w, grammar, lex, winfo.a_unit_split())?;
         w.write_all(b",")?;
-        dump_wids(w, winfo.b_unit_split())?;
+        dump_wids(w, grammar, lex, winfo.b_unit_split())?;
         w.write_all(b",")?;
-        dump_wids(w, winfo.word_structure())?;
+        dump_wids(w, grammar, lex, winfo.word_structure())?;
         w.write_all(b",")?;
         dump_gids(w, winfo.synonym_group_ids())?;
         w.write_all(b"\n")?;
@@ -245,23 +252,79 @@ fn dump_word_info<W: Write>(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> {
     Ok(())
 }
 
-fn dump_wids<W: Write>(w: &mut W, data: &[WordId]) -> SudachiResult<()> {
+fn unicode_escape(raw: &str) -> String {
+    // replace '"' and ','
+    let escaped = raw
+        .to_string()
+        .replace("\"", "\\u0022")
+        .replace(",", "\\u002c");
+    escaped
+}
+
+fn split_mode(winfo: &WordInfo) -> &str {
+    // todo: check
+    let asplits = winfo.a_unit_split();
+    if asplits.len() == 0 {
+        return "A";
+    }
+    let bsplits = winfo.b_unit_split();
+    if bsplits.len() == 0 {
+        return "B";
+    }
+    return "C";
+}
+
+fn pos_string(grammar: &Grammar, posid: u16) -> String {
+    let pos_parts = grammar.pos_components(posid);
+    pos_parts.join(",")
+}
+
+fn dictionary_form_string(grammar: &Grammar, lex: &LexiconSet, wid: i32) -> String {
+    if wid < 0 {
+        return "*".to_string();
+    }
+    let wid_with_dic = WordId::checked(0, wid as u32).expect("invalid wordid");
+    format!("\"{}\"", wordref_string(grammar, lex, &wid_with_dic))
+}
+
+fn wordref_string(grammar: &Grammar, lex: &LexiconSet, wid: &WordId) -> String {
+    let winfo = lex.get_word_info(*wid).expect("failed to get wordinfo");
+    format!(
+        "{},{},{}",
+        unicode_escape(winfo.surface()),
+        pos_string(grammar, winfo.pos_id()),
+        unicode_escape(winfo.reading_form()),
+    )
+}
+
+fn dump_wids<W: Write>(
+    w: &mut W,
+    grammar: &Grammar,
+    lex: &LexiconSet,
+    data: &[WordId],
+) -> SudachiResult<()> {
+    if data.len() == 0 {
+        write!(w, "*")?;
+        return Ok(());
+    }
+    w.write_all(b"\"")?;
     for (i, e) in data.iter().enumerate() {
-        let prefix = match e.dic() {
-            0 => "",
-            _ => "U",
-        };
-        write!(w, "{}{}", prefix, e.word())?;
+        write!(w, "{}", wordref_string(grammar, lex, e))?;
         if i + 1 != data.len() {
             w.write_all(b"/")?;
         }
     }
+    w.write_all(b"\"")?;
     Ok(())
 }
 
 fn dump_gids<W: Write>(w: &mut W, data: &[u32]) -> SudachiResult<()> {
+    if data.len() == 0 {
+        write!(w, "*")?;
+        return Ok(());
+    }
     for (i, e) in data.iter().enumerate() {
-        write!(w, "{}", e)?;
+        write!(w, "{:06}", e)?;
         if i + 1 != data.len() {
             w.write_all(b"/")?;
         }

From d11cafc56f675345156c6cb5d98854bed7bc4e7b Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Tue, 29 Oct 2024 16:46:54 +0900
Subject: [PATCH 21/24] dump user dict

---
 sudachi-cli/src/build.rs | 110 ++++++++++++++++++++++++++++++---------
 1 file changed, 86 insertions(+), 24 deletions(-)

diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs
index ec7f4620..f1d1dc27 100644
--- a/sudachi-cli/src/build.rs
+++ b/sudachi-cli/src/build.rs
@@ -27,6 +27,7 @@ use sudachi::dic::build::report::DictPartReport;
 use sudachi::dic::build::DictBuilder;
 use sudachi::dic::dictionary::JapaneseDictionary;
 use sudachi::dic::grammar::Grammar;
+use sudachi::dic::header::HeaderVersion;
 use sudachi::dic::lexicon::word_infos::WordInfo;
 use sudachi::dic::lexicon_set::LexiconSet;
 use sudachi::dic::word_id::WordId;
@@ -77,10 +78,17 @@ pub(crate) enum BuildCli {
 
     #[command(name = "dump")]
     Dump {
-        dict: PathBuf,
+        /// target dictionary to dump
+        dictionary: PathBuf,
+        /// dump target (matrix, pos, winfo)
         part: String,
+        /// output file
         output: PathBuf,
-        // todo: dump user dict
+
+        /// reference system dictionary.
+        /// required to dump winfo of a user dictionary
+        #[arg(short = 's', long = "system")]
+        system: Option<PathBuf>,
     },
 }
 
@@ -103,7 +111,12 @@ pub fn build_main(subcommand: BuildCli) {
     match subcommand {
         BuildCli::System { common, matrix } => build_system(common, matrix),
         BuildCli::User { common, dictionary } => build_user(common, dictionary),
-        BuildCli::Dump { dict, part, output } => dump_part(dict, part, output),
+        BuildCli::Dump {
+            dictionary,
+            part,
+            output,
+            system,
+        } => dump_part(dictionary, system, part, output),
     }
 }
 
@@ -178,26 +191,30 @@ fn output_file(p: &Path) -> File {
         .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e))
 }
 
-fn dump_part(dict: PathBuf, part: String, output: PathBuf) {
-    let file = File::open(&dict).expect("open failed");
-    let data = unsafe { Mmap::map(&file) }.expect("mmap failed");
+fn dump_part(dict: PathBuf, system: Option<PathBuf>, part: String, output: PathBuf) {
+    let file = File::open(&dict).expect("open dict failed");
+    let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed");
     let loader =
         unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary");
-    let dict = loader.to_loaded().expect("should contain grammar");
 
     let outf = output_file(&output);
     let mut writer = BufWriter::new(outf);
 
     match part.as_str() {
-        "pos" => dump_pos(dict.grammar(), &mut writer),
-        "matrix" => dump_matrix(dict.grammar(), &mut writer),
-        "winfo" => dump_word_info(&dict, &mut writer).unwrap(),
+        "pos" => dump_pos(loader, &mut writer),
+        "matrix" => dump_matrix(loader, &mut writer),
+        "winfo" => dump_word_info(loader, system, &mut writer).unwrap(),
         _ => unimplemented!(),
     }
     writer.flush().unwrap();
 }
 
-fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) {
+fn dump_pos<W: Write>(dict: DictionaryLoader, w: &mut W) {
+    let dict = dict
+        .to_loaded()
+        .expect("target dict should contain grammar");
+    let grammar = dict.grammar();
+
     for (id, p) in grammar.pos_list.iter().enumerate() {
         write!(w, "{},", id).unwrap();
         for (i, e) in p.iter().enumerate() {
@@ -211,10 +228,18 @@ fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) {
     }
 }
 
-fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) {
+fn dump_matrix<W: Write>(dict: DictionaryLoader, w: &mut W) {
+    if let HeaderVersion::UserDict(_) = dict.header.version {
+        panic!("user dictionary does not have a connection matrix.")
+    }
+
+    let dict = dict
+        .to_loaded()
+        .expect("target dict should contain grammar");
+    let grammar = dict.grammar();
     let conn = grammar.conn_matrix();
-    write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap();
 
+    write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap();
     for left in 0..conn.num_left() {
         for right in 0..conn.num_right() {
             let cost = conn.cost(left as _, right as _);
@@ -223,28 +248,66 @@ fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) {
     }
 }
 
-fn dump_word_info<W: Write>(dict: &dyn DictionaryAccess, w: &mut W) -> SudachiResult<()> {
-    let grammar = dict.grammar();
-    let lex = dict.lexicon();
-    let size = lex.size();
+fn dump_word_info<W: Write>(
+    dict: DictionaryLoader,
+    system: Option<PathBuf>,
+    w: &mut W,
+) -> SudachiResult<()> {
+    let is_user = match dict.header.version {
+        HeaderVersion::UserDict(_) => true,
+        HeaderVersion::SystemDict(_) => false,
+    };
+    let did = if is_user { 1 } else { 0 };
+    let size = dict.lexicon.size();
+
+    let data = system.map(|system_path| {
+        let file = File::open(&system_path).expect("open system failed");
+        unsafe { Mmap::map(&file) }.expect("mmap system failed")
+    });
+    let system = data.as_ref().map(|data| {
+        let loader = DictionaryLoader::read_system_dictionary(data)
+            .expect("failed to load system dictionary");
+        loader
+            .to_loaded()
+            .expect("failed to load system dictionary")
+    });
+
+    let (base, user) = if is_user {
+        (
+            system.expect("system dictionary is required to dump user dictionary lexicon"),
+            Some(dict),
+        )
+    } else {
+        (dict.to_loaded().expect("failed to load dictionary"), None)
+    };
+
+    let mut lex = base.lexicon_set;
+    let mut grammar = base.grammar;
+    if let Some(udic) = user {
+        lex.append(udic.lexicon, grammar.pos_list.len())?;
+        if let Some(g) = udic.grammar {
+            grammar.merge(g)
+        }
+    }
+
     for i in 0..size {
-        let wid = WordId::checked(0, i)?;
+        let wid = WordId::checked(did, i)?;
         let (left, right, cost) = lex.get_word_param(wid);
         let winfo = lex.get_word_info(wid)?;
         write!(w, "{},", unicode_escape(winfo.surface()))?;
         write!(w, "{},{},{},", left, right, cost)?;
         write!(w, "{},", unicode_escape(winfo.surface()))?; // writing
-        write!(w, "{},", pos_string(grammar, winfo.pos_id()))?;
+        write!(w, "{},", pos_string(&grammar, winfo.pos_id()))?;
         write!(w, "{},", unicode_escape(winfo.reading_form()))?;
         write!(w, "{},", unicode_escape(winfo.normalized_form()))?;
-        let dict_form = dictionary_form_string(grammar, lex, winfo.dictionary_form_word_id());
+        let dict_form = dictionary_form_string(&grammar, &lex, winfo.dictionary_form_word_id());
         write!(w, "{},", dict_form)?;
         write!(w, "{},", split_mode(&winfo))?;
-        dump_wids(w, grammar, lex, winfo.a_unit_split())?;
+        dump_wids(w, &grammar, &lex, winfo.a_unit_split())?;
         w.write_all(b",")?;
-        dump_wids(w, grammar, lex, winfo.b_unit_split())?;
+        dump_wids(w, &grammar, &lex, winfo.b_unit_split())?;
         w.write_all(b",")?;
-        dump_wids(w, grammar, lex, winfo.word_structure())?;
+        dump_wids(w, &grammar, &lex, winfo.word_structure())?;
         w.write_all(b",")?;
         dump_gids(w, winfo.synonym_group_ids())?;
         w.write_all(b"\n")?;
@@ -262,7 +325,6 @@ fn unicode_escape(raw: &str) -> String {
 }
 
 fn split_mode(winfo: &WordInfo) -> &str {
-    // todo: check
     let asplits = winfo.a_unit_split();
     if asplits.len() == 0 {
         return "A";

From ecddc0beb899fb4cfcb9e2d36c45d6372f4ad90d Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Tue, 29 Oct 2024 16:51:53 +0900
Subject: [PATCH 22/24] fix clippy warnings

---
 sudachi-cli/src/build.rs | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs
index f1d1dc27..dbb03444 100644
--- a/sudachi-cli/src/build.rs
+++ b/sudachi-cli/src/build.rs
@@ -187,12 +187,12 @@ fn output_file(p: &Path) -> File {
     OpenOptions::new()
         .write(true)
         .create_new(true)
-        .open(&p)
+        .open(p)
         .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e))
 }
 
 fn dump_part(dict: PathBuf, system: Option<PathBuf>, part: String, output: PathBuf) {
-    let file = File::open(&dict).expect("open dict failed");
+    let file = File::open(dict).expect("open dict failed");
     let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed");
     let loader =
         unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary");
@@ -239,11 +239,11 @@ fn dump_matrix<W: Write>(dict: DictionaryLoader, w: &mut W) {
     let grammar = dict.grammar();
     let conn = grammar.conn_matrix();
 
-    write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap();
+    writeln!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap();
     for left in 0..conn.num_left() {
         for right in 0..conn.num_right() {
             let cost = conn.cost(left as _, right as _);
-            write!(w, "{} {} {}\n", left, right, cost).unwrap();
+            writeln!(w, "{} {} {}", left, right, cost).unwrap();
         }
     }
 }
@@ -261,7 +261,7 @@ fn dump_word_info<W: Write>(
     let size = dict.lexicon.size();
 
     let data = system.map(|system_path| {
-        let file = File::open(&system_path).expect("open system failed");
+        let file = File::open(system_path).expect("open system failed");
         unsafe { Mmap::map(&file) }.expect("mmap system failed")
     });
     let system = data.as_ref().map(|data| {
@@ -317,23 +317,21 @@ fn dump_word_info<W: Write>(
 
 fn unicode_escape(raw: &str) -> String {
     // replace '"' and ','
-    let escaped = raw
-        .to_string()
-        .replace("\"", "\\u0022")
-        .replace(",", "\\u002c");
-    escaped
+    raw.to_string()
+        .replace('"', "\\u0022")
+        .replace(',', "\\u002c")
 }
 
 fn split_mode(winfo: &WordInfo) -> &str {
     let asplits = winfo.a_unit_split();
-    if asplits.len() == 0 {
+    if asplits.is_empty() {
         return "A";
     }
     let bsplits = winfo.b_unit_split();
-    if bsplits.len() == 0 {
+    if bsplits.is_empty() {
         return "B";
     }
-    return "C";
+    "C"
 }
 
 fn pos_string(grammar: &Grammar, posid: u16) -> String {
@@ -365,7 +363,7 @@ fn dump_wids<W: Write>(
     lex: &LexiconSet,
     data: &[WordId],
 ) -> SudachiResult<()> {
-    if data.len() == 0 {
+    if data.is_empty() {
         write!(w, "*")?;
         return Ok(());
     }
@@ -381,7 +379,7 @@ fn dump_wids<W: Write>(
 }
 
 fn dump_gids<W: Write>(w: &mut W, data: &[u32]) -> SudachiResult<()> {
-    if data.len() == 0 {
+    if data.is_empty() {
         write!(w, "*")?;
         return Ok(());
     }

From 75cda40da26e0917c55a2f3a9e78421b9b6f9399 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Thu, 7 Nov 2024 17:24:02 +0900
Subject: [PATCH 23/24] add note to the help of pycli -d option and warn on its
 use

---
 python/py_src/sudachipy/command_line.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/py_src/sudachipy/command_line.py b/python/py_src/sudachipy/command_line.py
index 07f59c19..e5cd87d1 100644
--- a/python/py_src/sudachipy/command_line.py
+++ b/python/py_src/sudachipy/command_line.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 Works Applications Co., Ltd.
+# Copyright (c) 2019-2024 Works Applications Co., Ltd.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,6 +24,13 @@
 from . import sudachipy
 
 
+logging.basicConfig(
+    style="{",
+    format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}',
+    datefmt="%m-%d-%Y %H:%M:%S",
+)
+
+
 def _set_default_subparser(self, name, args=None):
     """
     copy and modify code from https://bitbucket.org/ruamel/std.argparse
@@ -97,14 +104,13 @@ def _command_tokenize(args, print_usage):
     if args.fpath_out:
         output = open(args.fpath_out, "w", encoding="utf-8")
 
-    stdout_logger = logging.getLogger(__name__)
-    handler = logging.StreamHandler(sys.stdout)
-    handler.setLevel(logging.DEBUG)
-    stdout_logger.addHandler(handler)
-    stdout_logger.setLevel(logging.DEBUG)
-    stdout_logger.propagate = False
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
 
     print_all = args.a
+    debug = args.d
+    if debug:
+        logger.warning("-d option is not implemented in python.")
 
     try:
         dict_ = Dictionary(config_path=args.fpath_setting,
@@ -217,7 +223,7 @@ def main():
     parser_tk.add_argument("-a", action="store_true",
                            help="print all of the fields")
     parser_tk.add_argument("-d", action="store_true",
-                           help="print the debug information")
+                           help="print the debug information (not implemented yet)")
     parser_tk.add_argument("-v", "--version", action="store_true",
                            dest="version", help="print sudachipy version")
     parser_tk.add_argument("in_files", metavar="file",

From 1cad6c95f463e83c6a774e145ff8c55b93ad580c Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Thu, 7 Nov 2024 17:24:21 +0900
Subject: [PATCH 24/24] rename pos_list and fmt

---
 python/py_src/sudachipy/command_line.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/python/py_src/sudachipy/command_line.py b/python/py_src/sudachipy/command_line.py
index e5cd87d1..e7574bf1 100644
--- a/python/py_src/sudachipy/command_line.py
+++ b/python/py_src/sudachipy/command_line.py
@@ -58,7 +58,7 @@ def _set_default_subparser(self, name, args=None):
 argparse.ArgumentParser.set_default_subparser = _set_default_subparser
 
 
-def run(tokenizer, input_, output, print_all, morphs, is_stdout):
+def run(tokenizer, input_, output, print_all, pos_list, is_stdout):
     # get an empty MorphemeList for memory reuse
     mlist = tokenizer.tokenize("")
     for line in input_:
@@ -67,7 +67,7 @@ def run(tokenizer, input_, output, print_all, morphs, is_stdout):
         for m in tokenizer.tokenize(line, out=mlist):
             list_info = [
                 m.surface(),
-                morphs[m.part_of_speech_id()],
+                pos_list[m.part_of_speech_id()],
                 m.normalized_form()]
             if print_all:
                 list_info += [
@@ -116,14 +116,15 @@ def _command_tokenize(args, print_usage):
         dict_ = Dictionary(config_path=args.fpath_setting,
                            dict_type=args.system_dict_type)
         # empty matcher - get all POS tags
-        all_morphs = dict_.pos_matcher([()])
+        all_pos_matcher = dict_.pos_matcher([()])
         # precompute output POS strings
-        morphs = [",".join(ms) for ms in all_morphs]
+        pos_list = [",".join(ms) for ms in all_pos_matcher]
 
         tokenizer_obj = dict_.create(mode=args.mode)
         input_ = fileinput.input(
             args.in_files, openhook=fileinput.hook_encoded("utf-8"))
-        run(tokenizer_obj, input_, output, print_all, morphs, is_stdout=args.fpath_out is None)
+        run(tokenizer_obj, input_, output, print_all,
+            pos_list, is_stdout=args.fpath_out is None)
     finally:
         if args.fpath_out:
             output.close()
@@ -145,7 +146,8 @@ def _command_build(args, print_usage):
 
     out_file = Path(args.out_file)
     if out_file.exists():
-        print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
+        print("File", out_file,
+              "already exists, refusing to overwrite it", file=sys.stderr)
         return
 
     description = args.description or ""
@@ -167,7 +169,8 @@ def _command_build(args, print_usage):
 def _command_user_build(args, print_usage):
     system = Path(args.system_dic)
     if not system.exists():
-        print("System dictionary file", system, "does not exist", file=sys.stderr)
+        print("System dictionary file", system,
+              "does not exist", file=sys.stderr)
         return print_usage()
 
     in_files = []
@@ -180,7 +183,8 @@ def _command_user_build(args, print_usage):
 
     out_file = Path(args.out_file)
     if out_file.exists():
-        print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
+        print("File", out_file,
+              "already exists, refusing to overwrite it", file=sys.stderr)
         return
 
     description = args.description or ""