Skip to content

Commit

Permalink
sync pyi and rs
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Jul 8, 2024
1 parent dfc87ed commit 8c35516
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 90 deletions.
104 changes: 65 additions & 39 deletions python/py_src/sudachipy/sudachipy.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
# Copyright (c) 2024 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set
from .config import Config

# Part Of Speech
POS = Tuple[str, str, str, str, str, str]
# POS element
PE = Optional[str]
Expand All @@ -14,6 +28,8 @@ PartialPOS = Union[
Tuple[()],
]

# Fields that can be specified for partial dictionary loading.
# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form",
"word_structure", "split_a", "split_b", "synonym_group_id"]]]

Expand All @@ -23,9 +39,7 @@ class SplitMode:
Unit to split text.
A == short mode
B == middle mode
C == long mode
"""

Expand All @@ -36,8 +50,9 @@ class SplitMode:
@classmethod
def __init__(cls, mode: str = "C") -> None:
"""
Creates a split mode from a string value
:param mode: string representation of the split mode
Creates a split mode from a string value.
:param mode: string representation of the split mode. One of [A,B,C] in capital or lower case.
"""
...

Expand All @@ -54,14 +69,15 @@ class Dictionary:
Creates a sudachi dictionary.
If both config.systemDict and dict are not given, `sudachidict_core` is used.
If both config.systemDict and dict are given, dict_type is used.
If both config.systemDict and dict are given, dict is used.
If dict is an absolute path to a file, it is used as a dictionary.
:param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object
:param config: alias to config_path, only one of them can be specified at the same time
:param resource_dir: path to the resource directory folder
:param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
:param config: alias to config_path, only one of them can be specified at the same time.
:param resource_dir: path to the resource directory folder.
:param dict: type of pre-packaged system dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
Also, can be an _absolute_ path to a compiled dictionary file.
:param dict_type: deprecated alias to dict
:param dict_type: deprecated alias to dict.
"""
...

Expand All @@ -77,11 +93,11 @@ class Dictionary:
*,
projection: str = None) -> Tokenizer:
"""
Creates a Sudachi Tokenizer.
Creates a sudachi tokenizer.
:param mode: sets the analysis mode for this Tokenizer
:param fields: load only a subset of fields.
See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
:param projection: Projection override for created Tokenizer. See Config.projection for values.
"""
...
Expand All @@ -91,32 +107,32 @@ class Dictionary:
Creates a new POS matcher.
If target is a function, then it must return whether a POS should match or not.
If target a list, it should contain partially specified POS.
By partially specified it means that it is possible to omit POS fields or
use None as a sentinel value that matches any POS.
If target is a list, it should contain partially specified POS.
By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS.
For example, ('名詞',) will match any noun and
(None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form.
:param target: can be either a function or a list of POS tuples.
:param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
"""
...

def pre_tokenizer(self,
mode: Union[SplitMode, Literal["A", "B", "C"]] = "C",
fields: FieldSet = None,
handler: Optional[Callable[[int, object, MorphemeList], list]] = None,
handler: Optional[Callable[[
int, object, MorphemeList], list]] = None,
*,
projection: str = None) -> object:
"""
Creates HuggingFace Tokenizers-compatible PreTokenizer.
Requires package `tokenizers` to be installed.
:param mode: Use this split mode (C by default)
:param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
:param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
First two parameters are the index (int) and HuggingFace NormalizedString.
The handler must return a List[NormalizedString]. By default, just segment the tokens.
:param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
:param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
:param projection: Projection override for created Tokenizer. See Config.projection for values.
"""
...
Expand All @@ -126,7 +142,7 @@ class Dictionary:
Returns POS with the given id.
:param pos_id: POS id
:return: POS tuple with the given id.
:return: POS tuple with the given id, or None if the id does not exist.
"""
...

Expand Down Expand Up @@ -197,7 +213,8 @@ class Morpheme:

def part_of_speech(self) -> POS:
"""
Returns the part of speech.
Returns the part of speech as a six-element tuple.
Tuple elements are four POS levels, conjugation type and conjugation form.
"""
...

Expand All @@ -217,8 +234,8 @@ class Morpheme:
"""
Returns sub-morphemes in the provided split mode.
:param mode: mode of new split
:param out: write results to this MorphemeList instead of creating new one
:param mode: mode of new split.
:param out: write results to this MorphemeList instead of creating new one.
See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
more information on output parameters.
Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
Expand All @@ -230,13 +247,15 @@ class Morpheme:
def surface(self) -> str:
"""
Returns the substring of input text corresponding to the morpheme, or a projection if one is configured.
See `Config.projection`.
"""
...

def raw_surface(self) -> str:
"""
Returns the substring of input text corresponding to the morpheme regardless of the configured projection.
See `Config.projection`.
"""
...
Expand All @@ -255,7 +274,7 @@ class Morpheme:

def __len__(self) -> int:
"""
Returns morpheme length in codepoints
Returns morpheme length in codepoints.
"""


Expand Down Expand Up @@ -293,6 +312,11 @@ class MorphemeList:


class Tokenizer:
"""
A sudachi tokenizer.
Create using Dictionary.create method.
"""
SplitMode: ClassVar[SplitMode] = ...
@classmethod
def __init__(cls) -> None: ...
Expand All @@ -303,13 +327,12 @@ class Tokenizer:
"""
Break text into morphemes.
SudachiPy 0.5.* had logger parameter, it is accepted, but ignored.
:param text: text to analyze
:param text: text to analyze.
:param mode: analysis mode.
This parameter is deprecated.
Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes.
If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead.
:param logger: Arg for v0.5.* compatibility. Ignored.
:param out: if given, tokenization results will be written into this MorphemeList; otherwise a new one will be created.
See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
"""
Expand Down Expand Up @@ -342,41 +365,44 @@ class WordInfo:


class PosMatcher:
"""
A part-of-speech matcher which checks if a morpheme belongs to a set of parts of speech.
Create using Dictionary.pos_matcher method.
"""

def __iter__(self) -> Iterator[POS]: ...
def __len__(self) -> int: ...

def __call__(self, m: Morpheme) -> bool:
"""
Checks whether a morpheme has matching POS
:param m: morpheme
:return: if morpheme has matching POS
Checks whether a morpheme has matching POS.
:param m: morpheme.
:return: if morpheme has matching POS.
"""
...

def __or__(self, other: PosMatcher) -> PosMatcher:
"""
Returns a POS matcher which matches a POS if any of two matchers would match it
:return: PosMatcher
Returns a POS matcher which matches a POS if any of two matchers would match it.
"""
...

def __and__(self, other: PosMatcher) -> PosMatcher:
"""
Returns a POS matcher which matches a POS if both matchers would match it at the same time
:return: PosMatcher
Returns a POS matcher which matches a POS if both matchers would match it at the same time.
"""
...

def __sub__(self, other: PosMatcher) -> PosMatcher:
"""
Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS
:return: PosMatcher
Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS.
"""
...

def __invert__(self) -> PosMatcher:
"""
Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher
:return: PosMatcher
Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher.
"""
...
54 changes: 29 additions & 25 deletions python/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,12 @@ impl PyDicData {

/// A sudachi dictionary.
///
/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
/// If both config.systemDict and dict_type are given, dict_type is used.
/// If both config.systemDict and dict are not given, `sudachidict_core` is used.
/// If both config.systemDict and dict are given, dict is used.
/// If dict is an absolute path to a file, it is used as a dictionary.
///
/// :param config_path: path to the configuration JSON file.
/// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
/// :param config: alias to config_path, only one of them can be specified at the same time.
/// :param resource_dir: path to the resource directory folder.
/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
/// Also, can be an _absolute_ path to a compiled dictionary file.
Expand All @@ -100,11 +101,12 @@ pub struct PyDictionary {
impl PyDictionary {
/// Creates a sudachi dictionary.
///
/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used.
/// If both config.systemDict and dict_type are given, dict_type is used.
/// If both config.systemDict and dict are not given, `sudachidict_core` is used.
/// If both config.systemDict and dict are given, dict is used.
/// If dict is an absolute path to a file, it is used as a dictionary.
///
/// :param config_path: path to the configuration JSON file.
/// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object.
/// :param config: alias to config_path, only one of them can be specified at the same time.
/// :param resource_dir: path to the resource directory folder.
/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict> packages on PyPI: https://pypi.org/search/?q=sudachidict.
/// Also, can be an _absolute_ path to a compiled dictionary file.
Expand Down Expand Up @@ -229,11 +231,12 @@ impl PyDictionary {

/// Creates a sudachi tokenizer.
///
/// :param mode: tokenizer's default split mode (C by default).
/// :param mode: sets the analysis mode for this Tokenizer
/// :param fields: load only a subset of fields.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
/// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
/// :param projection: Projection override for created Tokenizer. See Config.projection for values.
#[pyo3(
text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer",
text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer",
signature=(mode=None, fields=None, *, projection=None)
)]
fn create<'py>(
Expand Down Expand Up @@ -267,14 +270,13 @@ impl PyDictionary {
/// Creates a POS matcher object
///
/// If target is a function, then it must return whether a POS should match or not.
/// If target a list, it should contain partially specified POS.
/// By partially specified it means that it is possible to omit POS fields or
/// use None as a sentinel value that matches any POS.
/// If target is a list, it should contain partially specified POS.
/// By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS.
///
/// For example, ('名詞',) will match any noun and
/// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form.
///
/// :param target: can be either a callable or list of POS partial tuples
/// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
#[pyo3(text_signature="(self, /, target) -> PosMatcher")]
fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> {
PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target)
Expand All @@ -285,15 +287,13 @@ impl PyDictionary {
///
/// :param mode: Use this split mode (C by default)
/// :param fields: ask Sudachi to load only a subset of fields.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
/// :param handler: a custom callable to transform MorphemeList into list of tokens.
/// It should be should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
/// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py
/// If nothing was passed, simply use surface as token representations.
/// :param projection: projection mode for a created PreTokenizer.
/// See :class:`sudachipy.config.Config` object documentation for supported projections.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
/// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
/// It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
/// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
/// :param projection: Projection override for created Tokenizer. See Config.projection for values.
///
/// :type mode: sudachipy.SplitMode
/// :type mode: SplitMode
/// :type fields: Set[str]
#[pyo3(
text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer",
Expand Down Expand Up @@ -350,8 +350,9 @@ impl PyDictionary {
/// :param surface: find all morphemes with the given surface
/// :param out: if passed, reuse the given morpheme list instead of creating a new one.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
///
/// :type surface: str
/// :type out: sudachipy.MorphemeList
/// :type out: MorphemeList
#[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")]
fn lookup<'p>(
&'p self,
Expand Down Expand Up @@ -379,14 +380,17 @@ impl PyDictionary {
Ok(l)
}

/// Close this dictionary
/// Close this dictionary.
///
/// Clears the internal dictionary handle by setting it to `None`.
#[pyo3(text_signature="(self, /) -> ()")]
fn close(&mut self) {
// Assigning `None` drops whatever value the field previously held.
self.dictionary = None;
}

/// Get POS Tuple by its id
#[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")]
/// Returns POS with the given id.
///
/// :param pos_id: POS id
/// :return: POS tuple with the given id, or None if the id does not exist.
#[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> {
let dic = self.dictionary.as_ref().unwrap();
dic.pos.get(pos_id).map(|x| x.as_ref(py))
Expand Down
Loading

0 comments on commit 8c35516

Please sign in to comment.