Skip to content

Commit

Permalink
add type fields for rs
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Jul 8, 2024
1 parent 8c35516 commit 706a573
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 36 deletions.
26 changes: 24 additions & 2 deletions python/src/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,19 @@ fn create_file(p: &Path) -> std::io::Result<File> {
}

/// Build system dictionary from matrix and lexicons.
///
/// :param matrix: Path to the matrix file.
/// :param lex: List of paths to lexicon files.
/// :param output: Path to output built dictionary.
/// :param description: A description text to embed in the dictionary.
/// :return: A build report, list of (part, size, time).
///
/// :type matrix: pathlib.Path | str | bytes
/// :type lex: list[pathlib.Path | str | bytes]
/// :type output: pathlib.Path | str
/// :type description: str
#[pyfunction]
#[pyo3(text_signature="(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
fn build_system_dic<'p>(
py: Python<'p>,
matrix: &'p PyAny,
Expand Down Expand Up @@ -89,8 +100,19 @@ fn build_system_dic<'p>(
}

/// Build user dictionary from lexicons based on the given system dictionary.
///
/// :param system: Path to the system dictionary.
/// :param lex: List of paths to lexicon files.
/// :param output: Path to output built dictionary.
/// :param description: A description text to embed in the dictionary.
/// :return: A build report, list of (part, size, time).
///
/// :type system: pathlib.Path | str
/// :type lex: list[pathlib.Path | str | bytes]
/// :type output: pathlib.Path | str
/// :type description: str
#[pyfunction]
#[pyo3(text_signature="(system, lex, output, description=None) -> list[tuple[str, int, float]]")]
#[pyo3(text_signature = "(system, lex, output, description=None) -> list[tuple[str, int, float]]")]
fn build_user_dic<'p>(
py: Python<'p>,
system: &'p PyAny,
Expand Down
36 changes: 29 additions & 7 deletions python/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ impl PyDicData {
/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
/// Also, can be an _absolute_ path to a compiled dictionary file.
/// :param dict_type: deprecated alias to dict.
///
/// :type config_path: Config | pathlib.Path | str | None
/// :type config: Config | pathlib.Path | str | None
/// :type resource_dir: pathlib.Path | str | None
/// :type dict: pathlib.Path | str | None
/// :type dict_type: pathlib.Path | str | None
#[pyclass(module = "sudachipy.dictionary", name = "Dictionary")]
#[derive(Clone)]
pub struct PyDictionary {
Expand All @@ -111,6 +117,12 @@ impl PyDictionary {
/// :param dict: type of pre-packaged dictionary, referring to sudachidict_<dict_type> packages on PyPI: https://pypi.org/search/?q=sudachidict.
/// Also, can be an _absolute_ path to a compiled dictionary file.
/// :param dict_type: deprecated alias to dict.
///
/// :type config_path: Config | pathlib.Path | str | None
/// :type config: Config | pathlib.Path | str | None
/// :type resource_dir: pathlib.Path | str | None
/// :type dict: pathlib.Path | str | None
/// :type dict_type: pathlib.Path | str | None
#[new]
#[pyo3(
text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary",
Expand Down Expand Up @@ -235,6 +247,10 @@ impl PyDictionary {
/// :param fields: load only a subset of fields.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
/// :param projection: Projection override for created Tokenizer. See Config.projection for values.
///
/// :type mode: SplitMode | str | None
/// :type fields: set[str] | None
/// :type projection: str | None
#[pyo3(
text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer",
signature=(mode=None, fields=None, *, projection=None)
Expand Down Expand Up @@ -277,7 +293,9 @@ impl PyDictionary {
/// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form.
///
/// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool.
#[pyo3(text_signature="(self, /, target) -> PosMatcher")]
///
/// :type target: Iterable[PartialPOS] | Callable[[POS], bool]
#[pyo3(text_signature = "(self, /, target) -> PosMatcher")]
fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult<PyPosMatcher> {
// Delegates construction to PyPosMatcher::create, passing the held dictionary and the caller's target.
// NOTE(review): the unwrap panics when `self.dictionary` is None, which is the state `close` leaves
// behind — confirm whether a Python exception should be raised instead.
PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target)
}
Expand All @@ -293,8 +311,10 @@ impl PyDictionary {
/// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
/// :param projection: Projection override for created Tokenizer. See Config.projection for values.
///
/// :type mode: SplitMode
/// :type fields: Set[str]
/// :type mode: SplitMode | str | None
/// :type fields: set[str] | None
/// :type handler: Callable[[int, NormalizedString, MorphemeList], list[NormalizedString]] | None
/// :type projection: str | None
#[pyo3(
text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer",
signature=(mode=None, fields=None, handler=None, *, projection=None)
Expand Down Expand Up @@ -352,8 +372,8 @@ impl PyDictionary {
/// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
///
/// :type surface: str
/// :type out: MorphemeList
#[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")]
/// :type out: MorphemeList | None
#[pyo3(text_signature = "(self, /, surface, out=None) -> MorphemeList")]
fn lookup<'p>(
&'p self,
py: Python<'p>,
Expand Down Expand Up @@ -381,7 +401,7 @@ impl PyDictionary {
}

/// Close this dictionary.
#[pyo3(text_signature="(self, /) -> ()")]
#[pyo3(text_signature = "(self, /) -> ()")]
fn close(&mut self) {
// Resets the held dictionary to None. Methods that unwrap `self.dictionary`
// (e.g. `pos_matcher`, `pos_of`) will panic if called afterwards.
self.dictionary = None;
}
Expand All @@ -390,7 +410,9 @@ impl PyDictionary {
///
/// :param pos_id: POS id
/// :return: POS tuple with the given id or None for non existing id.
#[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
///
/// :type pos_id: int
#[pyo3(text_signature = "(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> {
let dic = self.dictionary.as_ref().unwrap();
dic.pos.get(pos_id).map(|x| x.as_ref(py))
Expand Down
42 changes: 20 additions & 22 deletions python/src/morpheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ impl PyMorphemeListWrapper {
impl PyMorphemeListWrapper {
/// Returns an empty morpheme list with dictionary.
#[classmethod]
#[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")]
#[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")]
fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult<Self> {
let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
PyErr::warn(
Expand All @@ -113,13 +113,13 @@ impl PyMorphemeListWrapper {
}

/// Returns the total cost of the path.
#[pyo3(text_signature="(self, /) -> int")]
#[pyo3(text_signature = "(self, /) -> int")]
fn get_internal_cost(&self, py: Python) -> i32 {
// Forwards to get_internal_cost() of the value obtained via self.internal(py).
self.internal(py).get_internal_cost()
}

/// Returns the number of morphemes in this list.
#[pyo3(text_signature="(self, /) -> int")]
#[pyo3(text_signature = "(self, /) -> int")]
fn size(&self, py: Python) -> usize {
// The count is the len() of the value obtained via self.internal(py).
self.internal(py).len()
}
Expand Down Expand Up @@ -282,14 +282,14 @@ impl PyMorpheme {
#[pymethods]
impl PyMorpheme {
/// Returns the begin index of this morpheme in the input text.
#[pyo3(text_signature="(self, /) -> int")]
#[pyo3(text_signature = "(self, /) -> int")]
fn begin(&self, py: Python) -> usize {
// Use the codepoint-based accessor (begin_c).
// NOTE(review): presumably so the offset indexes a Python str by character rather than by byte — confirm.
self.morph(py).begin_c()
}

/// Returns the end index of this in the input text.
#[pyo3(text_signature="(self, /) -> int")]
#[pyo3(text_signature = "(self, /) -> int")]
fn end(&self, py: Python) -> usize {
// call codepoint version
self.morph(py).end_c()
Expand All @@ -298,7 +298,7 @@ impl PyMorpheme {
/// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured.
///
/// See `Config.projection`.
#[pyo3(text_signature="(self, /) -> str")]
#[pyo3(text_signature = "(self, /) -> str")]
fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
let list = self.list(py);
let morph = self.morph(py);
Expand All @@ -311,14 +311,14 @@ impl PyMorpheme {
/// Returns the substring of input text corresponding to the morpheme regardless of the configured projection.
///
/// See `Config.projection`.
#[pyo3(text_signature="(self, /) -> str")]
#[pyo3(text_signature = "(self, /) -> str")]
fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString {
// Builds a new Python string from the morpheme's surface().
PyString::new(py, self.morph(py).surface().deref())
}

/// Returns the part of speech as a six-element tuple.
/// Tuple elements are four POS levels, conjugation type and conjugation form.
#[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")]
#[pyo3(text_signature = "(self, /) -> tuple[str, str, str, str, str, str]")]
fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py<PyTuple> {
let pos_id = self.part_of_speech_id(py);
self.list(py)
Expand All @@ -329,25 +329,25 @@ impl PyMorpheme {
}

/// Returns the id of the part of speech in the dictionary.
#[pyo3(text_signature="(self, /) -> int")]
#[pyo3(text_signature = "(self, /) -> int")]
pub fn part_of_speech_id(&self, py: Python) -> u16 {
// Forwards to the underlying morpheme; declared `pub` and also called from
// PyPosMatcher::__call__ and part_of_speech.
self.morph(py).part_of_speech_id()
}

/// Returns the dictionary form.
#[pyo3(text_signature="(self, /) -> str")]
#[pyo3(text_signature = "(self, /) -> str")]
fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject {
// Reads dictionary_form() from the morpheme's word info and converts it into a Python object.
self.morph(py).get_word_info().dictionary_form().into_py(py)
}

/// Returns the normalized form.
#[pyo3(text_signature="(self, /) -> str")]
#[pyo3(text_signature = "(self, /) -> str")]
fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject {
// Reads normalized_form() from the morpheme's word info and converts it into a Python object.
self.morph(py).get_word_info().normalized_form().into_py(py)
}

/// Returns the reading form.
#[pyo3(text_signature="(self, /) -> str")]
#[pyo3(text_signature = "(self, /) -> str")]
fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject {
// Reads reading_form() from the morpheme's word info and converts it into a Python object.
self.morph(py).get_word_info().reading_form().into_py(py)
}
Expand All @@ -362,12 +362,10 @@ impl PyMorpheme {
/// :param add_single: return lists with the current morpheme if the split hasn't produced any elements.
/// When False is passed, empty lists are returned instead.
///
/// :type mode: sudachipy.SplitMode
/// :type out: Optional[sudachipy.MorphemeList]
/// :type mode: SplitMode | None
/// :type out: MorphemeList | None
/// :type add_single: bool
#[pyo3(
text_signature="(self, /, mode, out=None, add_single=False) -> MorphemeList"
)]
#[pyo3(text_signature = "(self, /, mode, out=None, add_single=False) -> MorphemeList")]
fn split<'py>(
&'py self,
py: Python<'py>,
Expand Down Expand Up @@ -410,19 +408,19 @@ impl PyMorpheme {
}

/// Returns whether this is an out-of-vocabulary word.
#[pyo3(text_signature="(self, /) -> bool")]
#[pyo3(text_signature = "(self, /) -> bool")]
fn is_oov(&self, py: Python) -> bool {
// Forwards to is_oov() of the underlying morpheme.
self.morph(py).is_oov()
}

/// Returns word id of this word in the dictionary.
#[pyo3(text_signature="(self, /) -> int")]
#[pyo3(text_signature = "(self, /) -> int")]
fn word_id(&self, py: Python) -> u32 {
// Exposes the word id as its raw integer value (as_raw).
self.morph(py).word_id().as_raw()
}

/// Returns the id of the dictionary to which this word belongs.
#[pyo3(text_signature="(self, /) -> int")]
#[pyo3(text_signature = "(self, /) -> int")]
fn dictionary_id(&self, py: Python) -> i32 {
let word_id = self.morph(py).word_id();
if word_id.is_oov() {
Expand All @@ -433,15 +431,15 @@ impl PyMorpheme {
}

/// Returns the list of synonym group ids.
#[pyo3(text_signature="(self, /) -> List[int]")]
#[pyo3(text_signature = "(self, /) -> List[int]")]
fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList {
// Keep the morpheme reference in a local so the word info borrowed from it stays valid below.
let mref = self.morph(py);
let ids = mref.get_word_info().synonym_group_ids();
// Builds a new Python list from the ids on every call.
PyList::new(py, ids)
}

/// Returns the word info.
#[pyo3(text_signature="(self, /) -> WordInfo")]
#[pyo3(text_signature = "(self, /) -> WordInfo")]
fn get_word_info(&self, py: Python) -> PyResult<PyWordInfo> {
let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?;
PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?;
Expand Down
4 changes: 3 additions & 1 deletion python/src/pos_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,10 @@ impl PyPosMatcher {
impl PyPosMatcher {
/// Checks whether a morpheme has matching POS.
///
/// :param m: morpheme.
/// :param m: a morpheme to check.
/// :return: if morpheme has matching POS.
///
/// :type m: Morpheme
pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool {
let pos_id = m.part_of_speech_id(py);
self.matcher.matches_id(pos_id)
Expand Down
14 changes: 10 additions & 4 deletions python/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
/// C == long mode
///
/// :param mode: string representation of the split mode. One of [A,B,C] in capital or lower case.
/// If None, returns SplitMode.C.
///
/// :type mode: str | None
#[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)]
#[derive(Clone, PartialEq, Eq, Copy, Debug)]
#[repr(u8)]
Expand Down Expand Up @@ -67,9 +70,12 @@ impl From<Mode> for PySplitMode {

#[pymethods]
impl PySplitMode {
/// Parse SplitMode from a character.
/// Creates a split mode from a string value.
///
/// :param mode: string representation of the split mode. One of [A,B,C] in capital or lower case.
/// If None, returns SplitMode.C.
///
/// :param mode: str to parse. One of [A,B,C] in capital or lower case.
/// :type mode: str | None
#[new]
#[pyo3(
text_signature="(mode=None) -> SplitMode",
Expand Down Expand Up @@ -133,8 +139,8 @@ impl PyTokenizer {
/// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
///
/// :type text: str
/// :type mode: sudachipy.SplitMode
/// :type out: sudachipy.MorphemeList
/// :type mode: SplitMode | str | None
/// :type out: MorphemeList
#[pyo3(
text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList",
signature=(text, mode=None, logger=None, out=None)
Expand Down

0 comments on commit 706a573

Please sign in to comment.