Skip to content
This repository has been archived by the owner on Mar 9, 2023. It is now read-only.

Commit

Permalink
Cython based optimization (#123)
Browse files Browse the repository at this point in the history
* Remove unnecessary deep copy

* Add lru_cache on get_word_info

* Add lru_cache to get_word_info

This seems to be a small speedup.

* Basic Cythonization

Unlike the other branch, the tests pass on this one. Benchmark time went
down by a third compared to the previous commit.

I'm not sure the _c functions are necessary here - I think that's what
cpdef functions are for, but I had difficulty getting them working. Will
need to give that another look.

* Use cpdef functions

Didn't have any issues this time, and it's cleaner with no clear
performance difference.

* Move build_lattice to Cython, intern some slow parts

This should cut execution time by roughly 25% compared to the last
commit.

* Don't use deepcopy

This is not an appropriate use of deepcopy and it's slow.

* Add cython to setup_requires

* Fix setup.py

* Make INHIBITED_CONNECTION literal

Minor speed boost.

* Bring the matrix into the lattice building

This provides a notable speedup.

* Various cythonizations

Improvements are relatively minor compared to the previous commit, but there
are a few seconds of speedup.

* Inline function for small speed boost

* Change import order, make lru cache size explicit

Maybe this will make Travis happy?

* Add a build command

* Use INT_MAX

* Remove comment

Missed this before, this is fine.
  • Loading branch information
polm authored Jun 10, 2020
1 parent 4d50586 commit 620f7c3
Show file tree
Hide file tree
Showing 14 changed files with 148 additions and 84 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ python:
- '3.7'
install:
- pip install flake8 flake8-import-order flake8-builtins && pip install -r requirements.txt
- python setup.py build_ext --inplace
before_script:
- cp .travis/system.dic.test tests/resources/system.dic && cp .travis/user.dic.test tests/resources/user.dic
script:
Expand Down
11 changes: 10 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,17 @@

from setuptools import setup, find_packages

from distutils.extension import Extension

extensions = [
Extension('sudachipy.latticenode', ['sudachipy/latticenode.pyx']),
Extension('sudachipy.lattice', ['sudachipy/lattice.pyx']),
Extension('sudachipy.tokenizer', ['sudachipy/tokenizer.pyx']),
]

setup(name="SudachiPy",
use_scm_version=True,
setup_requires=['setuptools_scm'],
setup_requires=['setuptools_scm', 'cython'],
description="Python version of Sudachi, the Japanese Morphological Analyzer",
long_description=open('README.md', encoding='utf-8').read(),
long_description_content_type="text/markdown",
Expand All @@ -33,4 +41,5 @@
"sortedcontainers~=2.1.0",
'dartsclone~=0.9.0',
],
ext_modules=extensions,
)
2 changes: 2 additions & 0 deletions sudachipy/dictionarylib/lexiconset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import lru_cache
from typing import List

from .lexicon import Lexicon
Expand Down Expand Up @@ -57,6 +58,7 @@ def get_cost(self, word_id: int) -> int:
return self.lexicons[self.get_dictionary_id(word_id)]\
.get_cost(self.get_word_id1(word_id))

@lru_cache(1024)
def get_word_info(self, word_id: int) -> 'WordInfo': # noqa: F821
dic_id = self.get_dictionary_id(word_id)
winfo = self.lexicons[dic_id].get_word_info(self.get_word_id1(word_id))
Expand Down
2 changes: 2 additions & 0 deletions sudachipy/dictionarylib/wordinfolist.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import struct
from functools import lru_cache

from .wordinfo import WordInfo

Expand All @@ -23,6 +24,7 @@ def __init__(self, bytes_, offset, word_size):
self.offset = offset
self._word_size = word_size

@lru_cache(2048)
def get_word_info(self, word_id):
orig_pos = self.bytes.tell()
index = self.word_id_to_offset(word_id)
Expand Down
20 changes: 20 additions & 0 deletions sudachipy/lattice.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Cython declaration file (.pxd) for the Lattice class implemented in
# lattice.pyx. Declaring the attributes and method signatures here lets
# other Cython modules (tokenizer.pyx cimports Lattice) call into the
# class at C level, bypassing Python attribute lookup.
#
# NOTE(review): indentation was lost in this diff capture; in the real
# file the declarations below `cdef class Lattice:` are indented.
from .latticenode cimport LatticeNode

# Pull INT_MAX straight from the C standard library; lattice.pyx uses it
# as the initial "infinite" total cost in connect_node.
cdef extern from "limits.h":
cdef int INT_MAX

cdef class Lattice:

cdef int size  # current logical length of the lattice (bytes of input)
cdef int capacity  # allocated length; resize() expands when size > capacity
cdef LatticeNode eos_node  # end-of-sentence node connected at the end of building

cdef list end_lists  # end_lists[i]: nodes whose span ends at byte offset i
cdef object grammar  # dictionarylib Grammar instance (kept as a Python object)
cdef object eos_params  # (left_id, right_id, cost) triple for the EOS node
cdef const short[:,:] connect_costs  # typed memoryview over the grammar's connection-cost matrix

# cpdef: callable from both C and Python; cdef: C-only fast path.
cpdef void resize(self, int size)
cpdef void insert(self, int begin, int end, LatticeNode node)
cdef void connect_node(self, LatticeNode r_node)
cdef void connect_eos_node(self)
33 changes: 19 additions & 14 deletions sudachipy/lattice.py → sudachipy/lattice.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,26 @@
from typing import List, Optional

from .dictionarylib.grammar import Grammar
from .latticenode import LatticeNode
from .latticenode cimport LatticeNode

cdef class Lattice:

class Lattice:
def __init__(self, grammar: Grammar):
self.size = 0
self.capacity = 0

size = 0
capacity = 0
eos_node = None

def __init__(self, grammar: Grammar):
self.end_lists = []
self.grammar = grammar
self.eos_params = grammar.get_eos_parameter()
bos_node = LatticeNode()
cdef LatticeNode bos_node = LatticeNode()
bos_params = grammar.get_bos_parameter()
bos_node.set_parameter(bos_params[0], bos_params[1], bos_params[2])
bos_node.is_connected_to_bos = True
self.end_lists.append([bos_node])
self.connect_costs = self.grammar._matrix_view

def resize(self, size: int) -> None:
cpdef void resize(self, int size):
if size > self.capacity:
self.expand(size)
self.size = size
Expand Down Expand Up @@ -69,7 +69,7 @@ def get_minumum_node(self, begin: int, end: int) -> Optional[LatticeNode]:
min_arg = node
return min_arg

def insert(self, begin: int, end: int, node: LatticeNode) -> None:
cpdef void insert(self, int begin, int end, LatticeNode node):
self.end_lists[end].append(node)
node.begin = begin
node.end = end
Expand All @@ -85,15 +85,20 @@ def create_node() -> LatticeNode:
def has_previous_node(self, index: int) -> bool:
return bool(self.end_lists[index])

def connect_node(self, r_node: LatticeNode) -> None:
cdef void connect_node(self, LatticeNode r_node):
begin = r_node.begin
r_node.total_cost = float('inf')
r_node.total_cost = INT_MAX

cdef LatticeNode l_node
cdef int connect_cost
for l_node in self.end_lists[begin]:
if not l_node.is_connected_to_bos:
continue
# right_id and left_id look reversed, but it works ...
connect_cost = self.grammar.get_connect_cost(l_node.right_id, r_node.left_id)
if connect_cost == Grammar.INHIBITED_CONNECTION:
connect_cost = self.connect_costs[l_node.right_id, r_node.left_id]

# 0x7fff == Grammar.INHIBITED_CONNECTION:
if connect_cost == 0x7fff:
continue
cost = l_node.total_cost + connect_cost
if cost < r_node.total_cost:
Expand All @@ -103,7 +108,7 @@ def connect_node(self, r_node: LatticeNode) -> None:
r_node.is_connected_to_bos = r_node.best_previous_node is not None
r_node.total_cost += r_node.cost

def connect_eos_node(self) -> None:
cdef void connect_eos_node(self):
self.connect_node(self.eos_node)

def get_best_path(self) -> List[LatticeNode]:
Expand Down
17 changes: 17 additions & 0 deletions sudachipy/latticenode.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Cython declaration file (.pxd) for LatticeNode (implemented in
# latticenode.pyx). All per-node fields become C struct members, which is
# the main win of this Cythonization: lattice construction touches these
# fields in tight loops.
#
# NOTE(review): indentation was lost in this diff capture; in the real
# file the declarations below `cdef class LatticeNode:` are indented.
cdef class LatticeNode:

cdef int begin  # start offset of the span covered by this node
cdef int end  # end offset of the span covered by this node
cdef int total_cost  # accumulated best-path cost up to and including this node
cdef int word_id  # packed id: dictionary id in the high bits, word index in the low bits
cdef bint _is_oov  # True when this node was produced by an OOV provider
cdef LatticeNode best_previous_node  # back-pointer for best-path recovery
cdef bint is_connected_to_bos  # True once a valid path from BOS reaches this node
cdef object extra_word_info  # WordInfo override (Python object; set externally)
cdef object undefined_word_info  # WordInfo used when the node is not in the lexicon
cdef bint is_defined  # False for the default-constructed (undefined) node
cdef object lexicon  # lexicon used to resolve word_id into WordInfo
cdef int left_id  # left connection id (row into the connection-cost matrix)
cdef int right_id  # right connection id (column into the connection-cost matrix)
cdef int cost  # word cost of this node alone

37 changes: 20 additions & 17 deletions sudachipy/latticenode.py → sudachipy/latticenode.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# cython: profile=True

# Copyright (c) 2019 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -15,27 +17,22 @@
from .dictionarylib.wordinfo import WordInfo

__NULL_SURFACE = '(null)'
UNK = WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
__NULL_SURFACE, __NULL_SURFACE, [], [], [])

UNK =\
WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
__NULL_SURFACE, __NULL_SURFACE, [], [], [])

class LatticeNode:

begin = 0
end = 0
total_cost = 0
word_id = 0
_is_oov = False
best_previous_node = None
is_connected_to_bos = None
extra_word_info = None
lexicon = None
left_id = None
right_id = None
cost = None
cdef class LatticeNode:

def __init__(self, lexicon=None, left_id=None, right_id=None, cost=None, word_id=None):

self.begin = 0
self.end = 0
self.word_id = 0
self._is_oov = False
self.best_previous_node = None
self.is_connected_to_bos = False
self.extra_word_info = None

self.is_defined = True
if lexicon is left_id is right_id is cost is word_id is None:
self.is_defined = False
Expand All @@ -54,9 +51,15 @@ def set_parameter(self, left_id: int, right_id: int, cost: int) -> None:
def get_begin(self) -> int:
return self.begin

def set_begin(self, begin) -> None:
self.begin = begin

def get_end(self) -> int:
return self.end

def set_end(self, end) -> None:
self.end = end

def set_range(self, begin: int, end: int) -> None:
self.begin = begin
self.end = end
Expand Down
6 changes: 3 additions & 3 deletions sudachipy/plugin/oov/mecab_oov_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def __init__(self):

class OOV:
def __init__(self):
self.left_id = None
self.right_id = None
self.cost = None
self.left_id = -1
self.right_id = -1
self.cost = -1
self.pos_id = None

def __init__(self, json_obj=None):
Expand Down
4 changes: 2 additions & 2 deletions sudachipy/plugin/oov/oov_provider_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def provide_oov(self, input_text: UTF8InputText, offset: int, has_other_words: b
def get_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]:
nodes = self.provide_oov(input_text, offset, has_other_words)
for node in nodes:
node.begin = offset
node.end = offset + node.get_word_info().length()
node.set_begin(offset)
node.set_end(offset + node.get_word_info().length())
return nodes

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion sudachipy/plugin/path_rewrite/join_katakana_oov_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,4 @@ def can_oov_bow_node(self, text, node):

@staticmethod
def is_shorter(length: int, text: UTF8InputText, node: LatticeNode):
return text.code_point_count(node.begin, node.end) < length
return text.code_point_count(node.get_begin(), node.get_end()) < length
84 changes: 48 additions & 36 deletions sudachipy/tokenizer.py → sudachipy/tokenizer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,58 @@
from .dictionarylib.categorytype import CategoryType
from .dictionarylib.grammar import Grammar
from .dictionarylib.lexicon import Lexicon
from .lattice import Lattice
from .latticenode import LatticeNode
from .lattice cimport Lattice
from .latticenode cimport LatticeNode
from .morphemelist import MorphemeList
from .plugin.input_text import InputTextPlugin
from .plugin.path_rewrite import PathRewritePlugin
from .utf8inputtext import UTF8InputText
from .utf8inputtextbuilder import UTF8InputTextBuilder


# Build the word lattice for one input sentence. This is the hot path of
# tokenization, moved out of Tokenizer._build_lattice into a module-level
# cdef function so the loop variables can be typed C ints.
#
# NOTE(review): indentation was lost in this diff capture; the bodies
# below are indented in the real file.
cdef void build_lattice_c(object tokenizer, object input_):
bytes_ = input_.get_byte_text()

# Typed reference to the lattice so insert()/resize() dispatch at C level.
cdef Lattice lattice = tokenizer._lattice
lattice.resize(len(bytes_))

cdef unsigned int i, word_id, end, idx
cdef int left_id, right_id, cost
cdef object lexicon = tokenizer._lexicon
# NOTE(review): oov_provider_plugins is bound here but the loop below
# still reads tokenizer._oov_provider_plugins directly — unused local.
cdef list oov_provider_plugins = tokenizer._oov_provider_plugins

# For every byte offset that can start a word and is reachable from BOS,
# add one lattice node per dictionary hit starting there.
for i in range(len(bytes_)):
if not input_.can_bow(i) or not lattice.has_previous_node(i):
continue
iterator = lexicon.lookup(bytes_, i)
has_words = False
for word_id, end in iterator:
# Reject matches whose end falls mid-character / mid-word.
if (end < len(bytes_)) and (not input_.can_bow(end)):
continue
has_words = True

# Inline what used to be three lexicon calls (get_left_id /
# get_right_id / get_cost): the top 4 bits of word_id select the
# dictionary, the low 28 bits index into its parameter array,
# where each entry is 3 shorts (left_id, right_id, cost).
lex = lexicon.lexicons[word_id >> 28]
idx = (0x0FFFFFFF & word_id) * 3 # 3 is ELEMENT_SIZE_AS_SHORT
left_id, right_id, cost = lex.word_params._array_view[idx:idx+3]
n = LatticeNode(lexicon, left_id, right_id, cost, word_id)

lattice.insert(i, end, n)

# OOV
# Ask the OOV plugins for candidate nodes unless this character
# category forbids beginning an OOV word here.
if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
for oov_plugin in tokenizer._oov_provider_plugins:
for node in oov_plugin.get_oov(input_, i, has_words):
has_words = True
lattice.insert(node.get_begin(), node.get_end(), node)
# Fall back to the default OOV provider so every position is covered.
if not has_words and tokenizer.default_oov_provider:
for node in tokenizer.default_oov_provider.get_oov(input_, i, has_words):
has_words = True
lattice.insert(node.get_begin(), node.get_end(), node)

if not has_words:
raise RuntimeError("there is no morpheme at " + str(i))
# Equivalent to lattice.connect_eos_node(): finish the lattice by
# connecting the EOS node to the best predecessor.
lattice.connect_node(lattice.eos_node)

class Tokenizer:
""" tokenizer of morphological analysis
Expand Down Expand Up @@ -124,38 +167,7 @@ def tokenize(self, text: str, mode=None, logger=None) -> MorphemeList:
return ml

def _build_lattice(self, input_: UTF8InputText):
bytes_ = input_.get_byte_text()
self._lattice.resize(len(bytes_))
for i in range(len(bytes_)):
if not input_.can_bow(i) or not self._lattice.has_previous_node(i):
continue
iterator = self._lexicon.lookup(bytes_, i)
has_words = False
for word_id, end in iterator:
if (end < len(bytes_)) and (not input_.can_bow(end)):
continue
has_words = True
n = LatticeNode(self._lexicon,
self._lexicon.get_left_id(word_id),
self._lexicon.get_right_id(word_id),
self._lexicon.get_cost(word_id),
word_id)
self._lattice.insert(i, end, n)

# OOV
if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
for oov_plugin in self._oov_provider_plugins:
for node in oov_plugin.get_oov(input_, i, has_words):
has_words = True
self._lattice.insert(node.get_begin(), node.get_end(), node)
if not has_words and self.default_oov_provider:
for node in self.default_oov_provider.get_oov(input_, i, has_words):
has_words = True
self._lattice.insert(node.get_begin(), node.get_end(), node)

if not has_words:
raise RuntimeError("there is no morpheme at " + str(i))
self._lattice.connect_eos_node()
build_lattice_c(self, input_)

def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeNode]:
if mode == self.SplitMode.C:
Expand All @@ -172,9 +184,9 @@ def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeN
offset = node.get_begin()
for wid in wids:
n = LatticeNode(self._lexicon, 0, 0, 0, wid)
n.begin = offset
n.set_begin(offset)
offset += n.get_word_info().head_word_length
n.end = offset
n.set_end(offset)
new_path.append(n)
return new_path

Expand Down
Loading

0 comments on commit 620f7c3

Please sign in to comment.