sync pyi and rs

WorksApplications · Jul 8, 2024 · 8c35516 · 8c35516
1 parent dfc87ed
commit 8c35516
Show file tree

Hide file tree

Showing 5 changed files with 136 additions and 90 deletions.
diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi
@@ -1,6 +1,20 @@
+#   Copyright (c) 2024 Works Applications Co., Ltd.
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
 from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set
 from .config import Config
 
+# Part Of Speech
 POS = Tuple[str, str, str, str, str, str]
 # POS element
 PE = Optional[str]
@@ -14,6 +28,8 @@ PartialPOS = Union[
     Tuple[()],
 ]
 
+# Fields that can be specified for partial dictionary loading.
+# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
 FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form",
                                 "word_structure", "split_a", "split_b", "synonym_group_id"]]]
 
@@ -23,9 +39,7 @@ class SplitMode:
     Unit to split text.
 
     A == short mode
-
     B == middle mode
-
     C == long mode
     """
 
@@ -36,8 +50,9 @@ class SplitMode:
     @classmethod
     def __init__(cls, mode: str = "C") -> None:
         """
-        Creates a split mode from a string value
-        :param mode: string representation of the split mode
+        Creates a split mode from a string value.
+
+        :param mode: string representation of the split mode. One of [A,B,C] in capital or lower case.
         """
         ...
 
@@ -54,14 +69,15 @@ class Dictionary:
         Creates a sudachi dictionary.
 
         If both config.systemDict and dict are not given, `sudachidict_core` is used.
-        If both config.systemDict and dict are given, dict_type is used.
+        If both config.systemDict and dict are given, dict is used.
+        If dict is an absolute path to a file, it is used as a dictionary.
 
-        :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object
-        :param config: alias to config_path, only one of them can be specified at the same time
-        :param resource_dir: path to the resource directory folder
+        :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
+        :param config: alias to config_path, only one of them can be specified at the same time.
+        :param resource_dir: path to the resource directory folder.
         :param dict: type of pre-packaged system dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
             Also, can be an _absolute_ path to a compiled dictionary file.
-        :param dict_type: deprecated alias to dict
+        :param dict_type: deprecated alias to dict.
         """
         ...
 
@@ -77,11 +93,11 @@ class Dictionary:
                *,
                projection: str = None) -> Tokenizer:
         """
-        Creates a Sudachi Tokenizer.
+        Creates a sudachi tokenizer.
 
         :param mode: sets the analysis mode for this Tokenizer
         :param fields: load only a subset of fields.
-            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
+            See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
         :param projection: Projection override for created Tokenizer. See Config.projection for values.
         """
         ...
@@ -91,32 +107,32 @@ class Dictionary:
         Creates a new POS matcher.
 
         If target is a function, then it must return whether a POS should match or not.
-        If target a list, it should contain partially specified POS.
-        By partially specified it means that it is possible to omit POS fields or
-        use None as a sentinel value that matches any POS.
+        If target is a list, it should contain partially specified POS.
+        By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS.
 
         For example, ('名詞',) will match any noun and
         (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form.
 
-        :param target: can be either a function or a list of POS tuples.
+        :param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
         """
         ...
 
     def pre_tokenizer(self,
                       mode: Union[SplitMode, Literal["A", "B", "C"]] = "C",
                       fields: FieldSet = None,
-                      handler: Optional[Callable[[int, object, MorphemeList], list]] = None,
+                      handler: Optional[Callable[[
+                          int, object, MorphemeList], list]] = None,
                       *,
                       projection: str = None) -> object:
         """
         Creates HuggingFace Tokenizers-compatible PreTokenizer.
         Requires package `tokenizers` to be installed.
 
         :param mode: Use this split mode (C by default)
-        :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
-        :param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
-            First two parameters are the index (int) and HuggingFace NormalizedString.
-            The handler must return a List[NormalizedString]. By default, just segment the tokens.
+        :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+        :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
+            It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
+            See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
         :param projection: Projection override for created Tokenizer. See Config.projection for values.
         """
         ...
@@ -126,7 +142,7 @@ class Dictionary:
         Returns POS with the given id.
 
         :param pos_id: POS id
-        :return: POS tuple with the given id.
+        :return: POS tuple with the given id or None for non existing id.
         """
         ...
 
@@ -197,7 +213,8 @@ class Morpheme:
 
     def part_of_speech(self) -> POS:
         """
-        Returns the part of speech.
+        Returns the part of speech as a six-element tuple.
+        Tuple elements are four POS levels, conjugation type and conjugation form.
         """
         ...
 
@@ -217,8 +234,8 @@ class Morpheme:
         """
         Returns sub-morphemes in the provided split mode.
 
-        :param mode: mode of new split
-        :param out: write results to this MorhpemeList instead of creating new one
+        :param mode: mode of new split.
+        :param out: write results to this MorphemeList instead of creating new one.
             See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
             more information on output parameters.
             Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
@@ -230,13 +247,15 @@ class Morpheme:
     def surface(self) -> str:
         """
         Returns the substring of input text corresponding to the morpheme, or a projection if one is configured.
+
         See `Config.projection`.
         """
         ...
 
     def raw_surface(self) -> str:
         """
         Returns the substring of input text corresponding to the morpheme regardless of the configured projection.
+
         See `Config.projection`.
         """
         ...
@@ -255,7 +274,7 @@ class Morpheme:
 
     def __len__(self) -> int:
         """
-        Returns morpheme length in codepoints
+        Returns morpheme length in codepoints.
         """
 
 
@@ -293,6 +312,11 @@ class MorphemeList:
 
 
 class Tokenizer:
+    """
+    A sudachi tokenizer.
+
+    Create using Dictionary.create method.
+    """
     SplitMode: ClassVar[SplitMode] = ...
     @classmethod
     def __init__(cls) -> None: ...
@@ -303,13 +327,12 @@ class Tokenizer:
         """
         Break text into morphemes.
 
-        SudachiPy 0.5.* had logger parameter, it is accepted, but ignored.
-
-        :param text: text to analyze
+        :param text: text to analyze.
         :param mode: analysis mode.
             This parameter is deprecated.
             Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes.
             If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead.
+        :param logger: Arg for v0.5.* compatibility. Ignored.
         :param out: tokenization results will be written into this MorphemeList; if it is not given, a new one will be created instead.
             See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
         """
@@ -342,41 +365,44 @@ class WordInfo:
 
 
 class PosMatcher:
+    """
+    A part-of-speech matcher which checks if a morpheme belongs to a set of parts of speech.
+
+    Create using Dictionary.pos_matcher method.
+    """
+
     def __iter__(self) -> Iterator[POS]: ...
     def __len__(self) -> int: ...
 
     def __call__(self, m: Morpheme) -> bool:
         """
-        Checks whether a morpheme has matching POS
-        :param m: morpheme
-        :return: if morpheme has matching POS
+        Checks whether a morpheme has matching POS.
+
+        :param m: morpheme.
+        :return: if morpheme has matching POS.
         """
         ...
 
     def __or__(self, other: PosMatcher) -> PosMatcher:
         """
-        Returns a POS matcher which matches a POS if any of two matchers would match it
-        :return: PosMatcher
+        Returns a POS matcher which matches a POS if any of two matchers would match it.
         """
         ...
 
     def __and__(self, other: PosMatcher) -> PosMatcher:
         """
-        Returns a POS matcher which matches a POS if both matchers would match it at the same time
-        :return: PosMatcher
+        Returns a POS matcher which matches a POS if both matchers would match it at the same time.
         """
         ...
 
     def __sub__(self, other: PosMatcher) -> PosMatcher:
         """
-        Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS
-        :return: PosMatcher
+        Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS.
         """
         ...
 
     def __invert__(self) -> PosMatcher:
         """
-        Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher
-        :return: PosMatcher
+        Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher.
         """
         ...
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
@@ -80,11 +80,12 @@ impl PyDicData {
 
 /// A sudachi dictionary.
 ///
-/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
-/// If both config.systemDict and dict_type are given, dict_type is used.
+/// If both config.systemDict and dict are not given, `sudachidict_core` is used.
+/// If both config.systemDict and dict are given, dict is used.
 /// If dict is an absolute path to a file, it is used as a dictionary.
 ///
-/// :param config_path: path to the configuration JSON file.
+/// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
+/// :param config: alias to config_path, only one of them can be specified at the same time.
 /// :param resource_dir: path to the resource directory folder.
 /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
 ///     Also, can be an _absolute_ path to a compiled dictionary file.
@@ -100,11 +101,12 @@ pub struct PyDictionary {
 impl PyDictionary {
     /// Creates a sudachi dictionary.
     ///
-    /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
-    /// If both config.systemDict and dict_type are given, dict_type is used.
+    /// If both config.systemDict and dict are not given, `sudachidict_core` is used.
+    /// If both config.systemDict and dict are given, dict is used.
     /// If dict is an absolute path to a file, it is used as a dictionary.
     ///
-    /// :param config_path: path to the configuration JSON file.
+    /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
+    /// :param config: alias to config_path, only one of them can be specified at the same time.
     /// :param resource_dir: path to the resource directory folder.
     /// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
     ///     Also, can be an _absolute_ path to a compiled dictionary file.
@@ -229,11 +231,12 @@ impl PyDictionary {
 
     /// Creates a sudachi tokenizer.
     ///
-    /// :param mode: tokenizer's default split mode (C by default).
+    /// :param mode: sets the analysis mode for this Tokenizer
     /// :param fields: load only a subset of fields.
-    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
+    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+    /// :param projection: Projection override for created Tokenizer. See Config.projection for values.
     #[pyo3(
-        text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer",
+        text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer",
         signature=(mode=None, fields=None, *, projection=None)
     )]
     fn create<'py>(
@@ -267,14 +270,13 @@ impl PyDictionary {
     /// Creates a POS matcher object
     ///
     /// If target is a function, then it must return whether a POS should match or not.
-    /// If target a list, it should contain partially specified POS.
-    /// By partially specified it means that it is possible to omit POS fields or
-    /// use None as a sentinel value that matches any POS.
+    /// If target is a list, it should contain partially specified POS.
+    /// By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS.
     ///
     /// For example, ('名詞',) will match any noun and
     /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form.
     ///
-    /// :param target: can be either a callable or list of POS partial tuples
+    /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
     #[pyo3(text_signature="(self, /, target) -> PosMatcher")]
     fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> {
         PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target)
@@ -285,15 +287,13 @@ impl PyDictionary {
     ///
     /// :param mode: Use this split mode (C by default)
     /// :param fields: ask Sudachi to load only a subset of fields.
-    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
-    /// :param handler: a custom callable to transform MorphemeList into list of tokens.
-    ///     It should be should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
-    ///     See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
-    ///     If nothing was passed, simply use surface as token representations.
-    /// :param projection: projection mode for a created PreTokenizer.
-    ///     See :class:`sudachipy.config.Config` object documentation for supported projections.
+    ///     See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
+    /// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
+    ///     It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
+    ///     See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
+    /// :param projection: Projection override for created Tokenizer. See Config.projection for values.
     ///
-    /// :type mode: sudachipy.SplitMode
+    /// :type mode: SplitMode
     /// :type fields: Set[str]
     #[pyo3(
         text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer",
@@ -350,8 +350,9 @@ impl PyDictionary {
     /// :param surface: find all morphemes with the given surface
     /// :param out: if passed, reuse the given morpheme list instead of creating a new one.
     ///     See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
+    ///
     /// :type surface: str
-    /// :type out: sudachipy.MorphemeList
+    /// :type out: MorphemeList
     #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")]
     fn lookup<'p>(
         &'p self,
@@ -379,14 +380,17 @@ impl PyDictionary {
         Ok(l)
     }
 
-    /// Close this dictionary
+    /// Close this dictionary.
     #[pyo3(text_signature="(self, /) -> ()")]
     fn close(&mut self) {
         self.dictionary = None;
     }
 
-    /// Get POS Tuple by its id
-    #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")]
+    /// Returns POS with the given id.
+    ///
+    /// :param pos_id: POS id
+    /// :return: POS tuple with the given id or None for non existing id.
+    #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
     fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> {
         let dic = self.dictionary.as_ref().unwrap();
         dic.pos.get(pos_id).map(|x| x.as_ref(py))