Skip to content
This repository has been archived by the owner on Mar 9, 2023. It is now read-only.

Commit

Permalink
Cython based optimization (#123)
Browse files Browse the repository at this point in the history
* Remove unnecessary deep copy

* Add lru_cache on get_word_info

* Add lru_cache to get_word_info

This seems to be a small speedup.

* Basic Cythonization

Unlike the other branch, the tests pass on this one. Benchmark time went
down by a third compared to the previous commit.

I'm not sure the _c functions are necessary here - I think that's what
cpdef functions are for, but I had difficulty getting them working. Will
need to give that another look.

* Use cpdef functions

Didn't have any issues this time, and it's cleaner with no clear
performance difference.

* Move build_lattice to Cython, intern some slow parts

This should cut execution time by roughly 25% compared to the last
commit.

* Don't use deepcopy

This is not an appropriate use of deepcopy and it's slow.

* Add cython to setup_requires

* Fix setup.py

* Make INHIBITED_CONNECTION literal

Minor speed boost.

* Bring the matrix into the lattice building

This provides a notable speedup.

* Various cythonizations

Improvements are relatively minor compared to the previous commit, but there
are a few seconds of speedup.

* Inline function for small speed boost

* Change import order, make lru cache size explicit

Maybe this will make Travis happy?

* Add a build command

* Use INT_MAX

* Remove comment

Missed this before, this is fine.
  • Loading branch information
polm authored Jun 10, 2020
1 parent 4d50586 commit 620f7c3
Show file tree
Hide file tree
Showing 14 changed files with 148 additions and 84 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ python:
- '3.7'
install:
- pip install flake8 flake8-import-order flake8-builtins && pip install -r requirements.txt
- python setup.py build_ext --inplace
before_script:
- cp .travis/system.dic.test tests/resources/system.dic && cp .travis/user.dic.test tests/resources/user.dic
script:
Expand Down
11 changes: 10 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,17 @@

from setuptools import setup, find_packages

from distutils.extension import Extension

extensions = [
Extension('sudachipy.latticenode', ['sudachipy/latticenode.pyx']),
Extension('sudachipy.lattice', ['sudachipy/lattice.pyx']),
Extension('sudachipy.tokenizer', ['sudachipy/tokenizer.pyx']),
]

setup(name="SudachiPy",
use_scm_version=True,
setup_requires=['setuptools_scm'],
setup_requires=['setuptools_scm', 'cython'],
description="Python version of Sudachi, the Japanese Morphological Analyzer",
long_description=open('README.md', encoding='utf-8').read(),
long_description_content_type="text/markdown",
Expand All @@ -33,4 +41,5 @@
"sortedcontainers~=2.1.0",
'dartsclone~=0.9.0',
],
ext_modules=extensions,
)
2 changes: 2 additions & 0 deletions sudachipy/dictionarylib/lexiconset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import lru_cache
from typing import List

from .lexicon import Lexicon
Expand Down Expand Up @@ -57,6 +58,7 @@ def get_cost(self, word_id: int) -> int:
return self.lexicons[self.get_dictionary_id(word_id)]\
.get_cost(self.get_word_id1(word_id))

@lru_cache(1024)
def get_word_info(self, word_id: int) -> 'WordInfo': # noqa: F821
dic_id = self.get_dictionary_id(word_id)
winfo = self.lexicons[dic_id].get_word_info(self.get_word_id1(word_id))
Expand Down
2 changes: 2 additions & 0 deletions sudachipy/dictionarylib/wordinfolist.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import struct
from functools import lru_cache

from .wordinfo import WordInfo

Expand All @@ -23,6 +24,7 @@ def __init__(self, bytes_, offset, word_size):
self.offset = offset
self._word_size = word_size

@lru_cache(2048)
def get_word_info(self, word_id):
orig_pos = self.bytes.tell()
index = self.word_id_to_offset(word_id)
Expand Down
20 changes: 20 additions & 0 deletions sudachipy/lattice.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Cython declaration file (.pxd) for the Lattice class implemented in
# lattice.pyx. Declaring the attributes and method signatures here lets
# other Cython modules (tokenizer.pyx cimports Lattice) call into the
# class at C level, bypassing Python attribute lookup.
#
# NOTE(review): indentation was lost in this diff capture; in the real
# file the declarations below `cdef class Lattice:` are indented.
from .latticenode cimport LatticeNode

# Pull INT_MAX straight from the C standard library; lattice.pyx uses it
# as the initial "infinite" total cost in connect_node.
cdef extern from "limits.h":
cdef int INT_MAX

cdef class Lattice:

cdef int size  # current logical length of the lattice (bytes of input)
cdef int capacity  # allocated length; resize() expands when size > capacity
cdef LatticeNode eos_node  # end-of-sentence node connected at the end of building

cdef list end_lists  # end_lists[i]: nodes whose span ends at byte offset i
cdef object grammar  # dictionarylib Grammar instance (kept as a Python object)
cdef object eos_params  # (left_id, right_id, cost) triple for the EOS node
cdef const short[:,:] connect_costs  # typed memoryview over the grammar's connection-cost matrix

# cpdef: callable from both C and Python; cdef: C-only fast path.
cpdef void resize(self, int size)
cpdef void insert(self, int begin, int end, LatticeNode node)
cdef void connect_node(self, LatticeNode r_node)
cdef void connect_eos_node(self)
33 changes: 19 additions & 14 deletions sudachipy/lattice.py → sudachipy/lattice.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,26 @@
from typing import List, Optional

from .dictionarylib.grammar import Grammar
from .latticenode import LatticeNode
from .latticenode cimport LatticeNode

cdef class Lattice:

class Lattice:
def __init__(self, grammar: Grammar):
self.size = 0
self.capacity = 0

size = 0
capacity = 0
eos_node = None

def __init__(self, grammar: Grammar):
self.end_lists = []
self.grammar = grammar
self.eos_params = grammar.get_eos_parameter()
bos_node = LatticeNode()
cdef LatticeNode bos_node = LatticeNode()
bos_params = grammar.get_bos_parameter()
bos_node.set_parameter(bos_params[0], bos_params[1], bos_params[2])
bos_node.is_connected_to_bos = True
self.end_lists.append([bos_node])
self.connect_costs = self.grammar._matrix_view

def resize(self, size: int) -> None:
cpdef void resize(self, int size):
if size > self.capacity:
self.expand(size)
self.size = size
Expand Down Expand Up @@ -69,7 +69,7 @@ def get_minumum_node(self, begin: int, end: int) -> Optional[LatticeNode]:
min_arg = node
return min_arg

def insert(self, begin: int, end: int, node: LatticeNode) -> None:
cpdef void insert(self, int begin, int end, LatticeNode node):
self.end_lists[end].append(node)
node.begin = begin
node.end = end
Expand All @@ -85,15 +85,20 @@ def create_node() -> LatticeNode:
def has_previous_node(self, index: int) -> bool:
return bool(self.end_lists[index])

def connect_node(self, r_node: LatticeNode) -> None:
cdef void connect_node(self, LatticeNode r_node):
begin = r_node.begin
r_node.total_cost = float('inf')
r_node.total_cost = INT_MAX

cdef LatticeNode l_node
cdef int connect_cost
for l_node in self.end_lists[begin]:
if not l_node.is_connected_to_bos:
continue
# right_id and left_id look reversed, but it works ...
connect_cost = self.grammar.get_connect_cost(l_node.right_id, r_node.left_id)
if connect_cost == Grammar.INHIBITED_CONNECTION:
connect_cost = self.connect_costs[l_node.right_id, r_node.left_id]

# 0x7fff == Grammar.INHIBITED_CONNECTION:
if connect_cost == 0x7fff:
continue
cost = l_node.total_cost + connect_cost
if cost < r_node.total_cost:
Expand All @@ -103,7 +108,7 @@ def connect_node(self, r_node: LatticeNode) -> None:
r_node.is_connected_to_bos = r_node.best_previous_node is not None
r_node.total_cost += r_node.cost

def connect_eos_node(self) -> None:
cdef void connect_eos_node(self):
self.connect_node(self.eos_node)

def get_best_path(self) -> List[LatticeNode]:
Expand Down
17 changes: 17 additions & 0 deletions sudachipy/latticenode.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Cython declaration file (.pxd) for LatticeNode (implemented in
# latticenode.pyx). All per-node fields become C struct members, which is
# the main win of this Cythonization: lattice construction touches these
# fields in tight loops.
#
# NOTE(review): indentation was lost in this diff capture; in the real
# file the declarations below `cdef class LatticeNode:` are indented.
cdef class LatticeNode:

cdef int begin  # start offset of the span covered by this node
cdef int end  # end offset of the span covered by this node
cdef int total_cost  # accumulated best-path cost up to and including this node
cdef int word_id  # packed id: dictionary id in the high bits, word index in the low bits
cdef bint _is_oov  # True when this node was produced by an OOV provider
cdef LatticeNode best_previous_node  # back-pointer for best-path recovery
cdef bint is_connected_to_bos  # True once a valid path from BOS reaches this node
cdef object extra_word_info  # WordInfo override (Python object; set externally)
cdef object undefined_word_info  # WordInfo used when the node is not in the lexicon
cdef bint is_defined  # False for the default-constructed (undefined) node
cdef object lexicon  # lexicon used to resolve word_id into WordInfo
cdef int left_id  # left connection id (row into the connection-cost matrix)
cdef int right_id  # right connection id (column into the connection-cost matrix)
cdef int cost  # word cost of this node alone

37 changes: 20 additions & 17 deletions sudachipy/latticenode.py → sudachipy/latticenode.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# cython: profile=True

# Copyright (c) 2019 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -15,27 +17,22 @@
from .dictionarylib.wordinfo import WordInfo

__NULL_SURFACE = '(null)'
UNK = WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
__NULL_SURFACE, __NULL_SURFACE, [], [], [])

UNK =\
WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
__NULL_SURFACE, __NULL_SURFACE, [], [], [])

class LatticeNode:

begin = 0
end = 0
total_cost = 0
word_id = 0
_is_oov = False
best_previous_node = None
is_connected_to_bos = None
extra_word_info = None
lexicon = None
left_id = None
right_id = None
cost = None
cdef class LatticeNode:

def __init__(self, lexicon=None, left_id=None, right_id=None, cost=None, word_id=None):

self.begin = 0
self.end = 0
self.word_id = 0
self._is_oov = False
self.best_previous_node = None
self.is_connected_to_bos = False
self.extra_word_info = None

self.is_defined = True
if lexicon is left_id is right_id is cost is word_id is None:
self.is_defined = False
Expand All @@ -54,9 +51,15 @@ def set_parameter(self, left_id: int, right_id: int, cost: int) -> None:
def get_begin(self) -> int:
return self.begin

def set_begin(self, begin) -> None:
self.begin = begin

def get_end(self) -> int:
return self.end

def set_end(self, end) -> None:
self.end = end

def set_range(self, begin: int, end: int) -> None:
self.begin = begin
self.end = end
Expand Down
6 changes: 3 additions & 3 deletions sudachipy/plugin/oov/mecab_oov_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def __init__(self):

class OOV:
def __init__(self):
self.left_id = None
self.right_id = None
self.cost = None
self.left_id = -1
self.right_id = -1
self.cost = -1
self.pos_id = None

def __init__(self, json_obj=None):
Expand Down
4 changes: 2 additions & 2 deletions sudachipy/plugin/oov/oov_provider_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def provide_oov(self, input_text: UTF8InputText, offset: int, has_other_words: b
def get_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]:
nodes = self.provide_oov(input_text, offset, has_other_words)
for node in nodes:
node.begin = offset
node.end = offset + node.get_word_info().length()
node.set_begin(offset)
node.set_end(offset + node.get_word_info().length())
return nodes

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion sudachipy/plugin/path_rewrite/join_katakana_oov_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,4 @@ def can_oov_bow_node(self, text, node):

@staticmethod
def is_shorter(length: int, text: UTF8InputText, node: LatticeNode):
return text.code_point_count(node.begin, node.end) < length
return text.code_point_count(node.get_begin(), node.get_end()) < length
84 changes: 48 additions & 36 deletions sudachipy/tokenizer.py → sudachipy/tokenizer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,58 @@
from .dictionarylib.categorytype import CategoryType
from .dictionarylib.grammar import Grammar
from .dictionarylib.lexicon import Lexicon
from .lattice import Lattice
from .latticenode import LatticeNode
from .lattice cimport Lattice
from .latticenode cimport LatticeNode
from .morphemelist import MorphemeList
from .plugin.input_text import InputTextPlugin
from .plugin.path_rewrite import PathRewritePlugin
from .utf8inputtext import UTF8InputText
from .utf8inputtextbuilder import UTF8InputTextBuilder


# Build the word lattice for one input sentence. This is the hot path of
# tokenization, moved out of Tokenizer._build_lattice into a module-level
# cdef function so the loop variables can be typed C ints.
#
# NOTE(review): indentation was lost in this diff capture; the bodies
# below are indented in the real file.
cdef void build_lattice_c(object tokenizer, object input_):
bytes_ = input_.get_byte_text()

# Typed reference to the lattice so insert()/resize() dispatch at C level.
cdef Lattice lattice = tokenizer._lattice
lattice.resize(len(bytes_))

cdef unsigned int i, word_id, end, idx
cdef int left_id, right_id, cost
cdef object lexicon = tokenizer._lexicon
# NOTE(review): oov_provider_plugins is bound here but the loop below
# still reads tokenizer._oov_provider_plugins directly — unused local.
cdef list oov_provider_plugins = tokenizer._oov_provider_plugins

# For every byte offset that can start a word and is reachable from BOS,
# add one lattice node per dictionary hit starting there.
for i in range(len(bytes_)):
if not input_.can_bow(i) or not lattice.has_previous_node(i):
continue
iterator = lexicon.lookup(bytes_, i)
has_words = False
for word_id, end in iterator:
# Reject matches whose end falls mid-character / mid-word.
if (end < len(bytes_)) and (not input_.can_bow(end)):
continue
has_words = True

# Inline what used to be three lexicon calls (get_left_id /
# get_right_id / get_cost): the top 4 bits of word_id select the
# dictionary, the low 28 bits index into its parameter array,
# where each entry is 3 shorts (left_id, right_id, cost).
lex = lexicon.lexicons[word_id >> 28]
idx = (0x0FFFFFFF & word_id) * 3 # 3 is ELEMENT_SIZE_AS_SHORT
left_id, right_id, cost = lex.word_params._array_view[idx:idx+3]
n = LatticeNode(lexicon, left_id, right_id, cost, word_id)

lattice.insert(i, end, n)

# OOV
# Ask the OOV plugins for candidate nodes unless this character
# category forbids beginning an OOV word here.
if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
for oov_plugin in tokenizer._oov_provider_plugins:
for node in oov_plugin.get_oov(input_, i, has_words):
has_words = True
lattice.insert(node.get_begin(), node.get_end(), node)
# Fall back to the default OOV provider so every position is covered.
if not has_words and tokenizer.default_oov_provider:
for node in tokenizer.default_oov_provider.get_oov(input_, i, has_words):
has_words = True
lattice.insert(node.get_begin(), node.get_end(), node)

if not has_words:
raise RuntimeError("there is no morpheme at " + str(i))
# Equivalent to lattice.connect_eos_node(): finish the lattice by
# connecting the EOS node to the best predecessor.
lattice.connect_node(lattice.eos_node)

class Tokenizer:
""" tokenizer of morphological analysis
Expand Down Expand Up @@ -124,38 +167,7 @@ def tokenize(self, text: str, mode=None, logger=None) -> MorphemeList:
return ml

def _build_lattice(self, input_: UTF8InputText):
bytes_ = input_.get_byte_text()
self._lattice.resize(len(bytes_))
for i in range(len(bytes_)):
if not input_.can_bow(i) or not self._lattice.has_previous_node(i):
continue
iterator = self._lexicon.lookup(bytes_, i)
has_words = False
for word_id, end in iterator:
if (end < len(bytes_)) and (not input_.can_bow(end)):
continue
has_words = True
n = LatticeNode(self._lexicon,
self._lexicon.get_left_id(word_id),
self._lexicon.get_right_id(word_id),
self._lexicon.get_cost(word_id),
word_id)
self._lattice.insert(i, end, n)

# OOV
if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
for oov_plugin in self._oov_provider_plugins:
for node in oov_plugin.get_oov(input_, i, has_words):
has_words = True
self._lattice.insert(node.get_begin(), node.get_end(), node)
if not has_words and self.default_oov_provider:
for node in self.default_oov_provider.get_oov(input_, i, has_words):
has_words = True
self._lattice.insert(node.get_begin(), node.get_end(), node)

if not has_words:
raise RuntimeError("there is no morpheme at " + str(i))
self._lattice.connect_eos_node()
build_lattice_c(self, input_)

def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeNode]:
if mode == self.SplitMode.C:
Expand All @@ -172,9 +184,9 @@ def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeN
offset = node.get_begin()
for wid in wids:
n = LatticeNode(self._lexicon, 0, 0, 0, wid)
n.begin = offset
n.set_begin(offset)
offset += n.get_word_info().head_word_length
n.end = offset
n.set_end(offset)
new_path.append(n)
return new_path

Expand Down
Loading

0 comments on commit 620f7c3

Please sign in to comment.