Merge pull request #35 from WorksApplications/fix-bug

fix bugs & add test in .travis.yml
WorksApplications · Jun 19, 2019 · b9a5fcf · b9a5fcf
2 parents 28a5fdc + 28a294a
commit b9a5fcf
Show file tree

Hide file tree

Showing 19 changed files with 148 additions and 108 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -6,5 +6,8 @@ python:
  - '3.7'
 install:
  - pip install flake8
+before_script:
+ - mv .travis/system.dic.test tests/resources/system.dic
 script:
  - ./scripts/format.sh
+ - python -m unittest discover tests
diff --git a/.travis/system.dic.test b/.travis/system.dic.test
diff --git a/README.md b/README.md
@@ -103,18 +103,15 @@ tokenizer_obj.tokenize(mode, "シュミレーション")[0].normalized_form()
 
 ### Code format
 
-You have to run `./scripts/format.sh` and check if your code is in rule before PR.
-This code formatting script will be integrated to CI system later. `flake8` is required.
+You can run `./scripts/format.sh` to check whether your code follows the formatting rules. `flake8` is required.
 
 ### Test
 
-You have to run `./script/test.sh` and check if not your change cause regression.
-This test script will be integrated to CI system later. Current test assumes `sudachi-dictionary-20190531-core.dic`
-is in `resources` directory as `system.dic`. We will change it to special dictionary for test like Sudachi (Java) in all tests.  
-Some of the tests use `system.dic` built by Sudachi.  This is an example command to get `system.dic` for test
-```bash
-git clone https://github.com/WorksApplications/Sudachi.git
-cd Sudahi
-mvn test
-cp target/test-classes/system.dic ${SudachiPy}/tests/resources/
-```
+You can run `./scripts/test.sh` to check that your change does not cause a regression.
+The current tests assume that

+- `sudachi-dictionary-20190531-core.dic` is in the `resources` directory as `system.dic`, and

+- the `system.dic` for tests is in the `tests/resources` directory.

+The `system.dic` for tests is provided as `.travis/system.dic.test`. Copy it to `tests/resources/system.dic` before running the tests.
diff --git a/scripts/test.sh b/scripts/test.sh
@@ -2,10 +2,12 @@
 
 # print error message only when it fails
 # python unittest print message in stderr even if it succeed
+# system.dic must be prepared in both resources and tests/resources before running;
+# see README for details
 
 cd $(dirname $0)
-RES=`cd ..; python -m unittest discover tests 2>&1`
+RES=`cd ..; python -m unittest discover tests -p '*test*.py' 2>&1`
 RES_TAIL=`echo "$RES" | tail -1`
 if [[ $RES_TAIL != "OK" ]]; then
-    echo "$RES"
+    >&2 echo "$RES"
 fi
diff --git a/sudachipy/dictionarylib/charactercategory.py b/sudachipy/dictionarylib/charactercategory.py
@@ -48,7 +48,7 @@ def read_character_definition(self, char_def=None):
             cols = re.split(r"\s+", line)
             if len(cols) < 2:
                 f.close()
-                raise AttributeError("invalid format at line ", i)
+                raise AttributeError("invalid format at line {}".format(i))
             if not re.match("0x", cols[0]):
                 continue
             range_ = self.Range()
@@ -58,14 +58,14 @@ def read_character_definition(self, char_def=None):
                 range_.high = int(r[1], 16)
             if range_.low > range_.high:
                 f.close()
-                raise AttributeError("invalid range at line ", i)
+                raise AttributeError("invalid range at line {}".format(i))
             for j in range(1, len(cols)):
                 if re.match("#", cols[j]) or cols[j] is '':
                     break
                 type_ = categorytype.CategoryType.get(cols[j])
                 if type_ is None:
                     f.close()
-                    raise AttributeError(cols[j], " is invalid type at line ", i)
+                    raise AttributeError("{} is invalid type at line {}".format(cols[j], i))
                 range_.categories.add(type_)
             self.range_list.append(range_)
         default_range = self.Range()

diff --git a/sudachipy/dictionarylib/dictionaryheader.py b/sudachipy/dictionarylib/dictionaryheader.py
@@ -2,10 +2,16 @@
 
 
 class DictionaryHeader:
-    description_size = 256
-    storage_size = 8 + 8 + description_size
+    __description_size = 256
+    storage_size = 8 + 8 + __description_size
 
     def __init__(self, bytes_, offset):
         self.version, self.create_time = struct.unpack_from("<2Q", bytes_, offset)
         offset += 16
-        self.description = bytes_[offset:offset+self.description_size].decode("utf-8")
+
+        len_ = 0
+        while len_ < self.__description_size:
+            if bytes_[offset + len_] == 0:
+                break
+            len_ += 1
+        self.description = bytes_[offset:offset + len_].decode("utf-8")
diff --git a/sudachipy/dictionarylib/doublearraylexicon.py b/sudachipy/dictionarylib/doublearraylexicon.py
@@ -13,11 +13,11 @@ class DoubleArrayLexicon(lexicon.Lexicon):
     def __init__(self, bytes_, offset):
         self.trie = dartsclone.doublearray.DoubleArray()
         bytes_.seek(offset)
-        self.size = int.from_bytes(bytes_.read(4), 'little')
+        size = int.from_bytes(bytes_.read(4), 'little')
         offset += 4
         bytes_.seek(offset)
-        array = struct.unpack_from("<{}I".format(self.size), bytes_, offset)
-        self.trie.set_array(array, self.size)
+        array = struct.unpack_from("<{}I".format(size), bytes_, offset)
+        self.trie.set_array(array, size)
         offset += self.trie.total_size()
 
         self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
@@ -49,3 +49,6 @@ def get_cost(self, word_id):
 
     def get_word_info(self, word_id):
         return self.word_infos.get_word_info(word_id)
+
+    def size(self):
+        return self.word_params.size
diff --git a/sudachipy/dictionarylib/wordinfolist.py b/sudachipy/dictionarylib/wordinfolist.py
@@ -9,49 +9,50 @@ def __init__(self, bytes_, offset, word_size):
         self.offset = offset
 
     def get_word_info(self, word_id):
+        orig_pos = self.bytes.tell()
         index = self.word_id_to_offset(word_id)
-
-        surface = self.buffer_to_string(index)
-        index += 1 + 2 * len(surface)
-        head_word_length = self.bytes[index]
-        index += 1
-        pos_id = int.from_bytes(self.bytes[index:index+2], 'little')
-        index += 2
-        normalized_form = self.buffer_to_string(index)
-        index += 1 + 2 * len(normalized_form)
+        self.bytes.seek(index)
+        surface = self.buffer_to_string()
+        head_word_length = self.buffer_to_string_length()
+        pos_id = int.from_bytes(self.bytes.read(2), 'little')
+        normalized_form = self.buffer_to_string()
         if not normalized_form:
             normalized_form = surface
-        dictionary_form_word_id = int.from_bytes(self.bytes[index:index+4], "little", signed=True)
-        index += 4
-        reading_form = self.buffer_to_string(index)
-        index += 1 + 2 * len(reading_form)
-        a_unit_split = self.buffer_to_int_array(index)
-        index += 1 + 4 * len(a_unit_split)
-        b_unit_split = self.buffer_to_int_array(index)
-        index += 1 + 4 * len(b_unit_split)
-        word_structure = self.buffer_to_int_array(index)
+        dictionary_form_word_id = int.from_bytes(self.bytes.read(4), 'little', signed=True)
+        reading_form = self.buffer_to_string()
+        a_unit_split = self.buffer_to_int_array()
+        b_unit_split = self.buffer_to_int_array()
+        word_structure = self.buffer_to_int_array()
 
         dictionary_form = surface
         if dictionary_form_word_id >= 0 and dictionary_form_word_id != word_id:
             wi = self.get_word_info(dictionary_form_word_id)
             dictionary_form = wi.surface
 
+        self.bytes.seek(orig_pos)
+
         return wordinfo.WordInfo(surface, head_word_length, pos_id, normalized_form,
                                  dictionary_form_word_id, dictionary_form, reading_form,
                                  a_unit_split, b_unit_split, word_structure)
 
     def word_id_to_offset(self, word_id):
         i = self.offset + 4 * word_id
-        return int.from_bytes(self.bytes[i:i+4], "little", signed=False)
-
-    def buffer_to_string(self, offset):
-        length = self.bytes[offset]
-        offset += 1
-        end = offset + 2 * length
-        return self.bytes[offset:end].decode("utf-16-le")
-
-    def buffer_to_int_array(self, offset):
-        length = self.bytes[offset]
-        offset += 1
-        array = struct.unpack_from("<{}I".format(length), self.bytes, offset)
+        return int.from_bytes(self.bytes[i:i+4], 'little', signed=False)
+
+    def buffer_to_string_length(self):
+        length = self.bytes.read_byte()
+        if length < 128:
+            return length
+        low = self.bytes.read_byte()
+        return ((length & 0x7F) << 8) | low
+
+    def buffer_to_string(self):
+        length = self.buffer_to_string_length()
+        return self.bytes.read(2 * length).decode('utf-16-le')
+
+    def buffer_to_int_array(self):
+        length = self.bytes.read_byte()
+        array = []
+        for _ in range(length):
+            array.append(int.from_bytes(self.bytes.read(4), 'little', signed=True))
         return array
diff --git a/sudachipy/plugin/input_text/default_input_text_plugin.py b/sudachipy/plugin/input_text/default_input_text_plugin.py
@@ -64,13 +64,15 @@ def read_rewrite_lists(self, rewrite_def):
         with open(rewrite_def, "r", encoding="utf-8") as f:
             for i, line in enumerate(f):
                 line = line.strip()
-                if not line or line.startswith("#"):
+                if (not line) or line.startswith("#"):
                     continue
                 cols = line.split()
 
                 # ignored normalize list
                 if len(cols) == 1:
                     key = cols[0]
+                    if len(key) != 1:
+                        raise RuntimeError("{} is not character at line {}".format(key, i))
                     self.ignore_normalize_set.add(key)
                 # replace char list
                 elif len(cols) == 2:

diff --git a/sudachipy/utf8inputtextbuilder.py b/sudachipy/utf8inputtextbuilder.py
@@ -52,7 +52,7 @@ def build(self):
         j = 0
         for i in range(len(self.modified_text)):
             # 注: サロゲートペア文字は考慮していない
-            for k in range(self.utf8_byte_length(ord(self.modified_text[i]))):
+            for _ in range(self.utf8_byte_length(ord(self.modified_text[i]))):
                 byte_indexes[j] = i
                 offsets[j] = self.text_offsets[i]
                 j += 1

diff --git a/tests/dictionarylib/test_charactercategory.py b/tests/dictionarylib/test_charactercategory.py
@@ -10,6 +10,10 @@ class TestCharacterCategory(unittest.TestCase):
 
     def setUp(self):
         self.test_dir = tempfile.mkdtemp()
+        self.test_resources_dir = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            os.pardir,
+            'resources')
         pass
 
     def tearDown(self):
@@ -24,7 +28,7 @@ def test_range_containing_length(self):
 
     def test_get_category_types(self):
         cat = charactercategory.CharacterCategory()
-        cat.read_character_definition('tests/resources/char.def')
+        cat.read_character_definition(os.path.join(self.test_resources_dir, 'char.def'))
         self.assertEqual({categorytype.CategoryType.KANJI}, cat.get_category_types(ord('熙')))
         self.assertNotEqual({categorytype.CategoryType.DEFAULT}, cat.get_category_types(ord('熙')))
 
@@ -45,33 +49,27 @@ def test_read_character_definition_with_invalid_format(self):
         with open(f, 'w') as wf:
             wf.write("0x0030..0x0039\n")
         cat = charactercategory.CharacterCategory()
-        try:
+        with self.assertRaises(AttributeError) as cm:
             cat.read_character_definition(f)
-            self.fail('no exception detected')
-        except AttributeError:
-            pass
+        self.assertEqual('invalid format at line 0', cm.exception.args[0])
 
     def test_read_character_definition_with_invalid_range(self):
         f = os.path.join(self.test_dir, 'test_file.txt')
         with open(f, 'w') as wf:
             wf.write("0x0030..0x0029 NUMERIC\n")
         cat = charactercategory.CharacterCategory()
-        try:
+        with self.assertRaises(AttributeError) as cm:
             cat.read_character_definition(f)
-            self.fail('no exception detected')
-        except AttributeError:
-            pass
+        self.assertEqual('invalid range at line 0', cm.exception.args[0])
 
     def test_read_character_definition_with_invalid_type(self):
         f = os.path.join(self.test_dir, 'test_file.txt')
         with open(f, 'w') as wf:
             wf.write("0x0030..0x0039 FOO\n")
         cat = charactercategory.CharacterCategory()
-        try:
+        with self.assertRaises(AttributeError) as cm:
             cat.read_character_definition(f)
-            self.fail('no exception detected')
-        except AttributeError:
-            pass
+        self.assertEqual('FOO is invalid type at line 0', cm.exception.args[0])
 
 
 if __name__ == '__main__':

diff --git a/tests/dictionarylib/test_dictionaryheader.py b/tests/dictionarylib/test_dictionaryheader.py
@@ -1,3 +1,4 @@
+import os
 import mmap
 import unittest
 
@@ -9,7 +10,11 @@ class TestDictionaryHeader(unittest.TestCase):
 
     def setUp(self):
         # Copied from sudachipy.dictionay.Dictionary.read_system_dictionary
-        filename = 'tests/resources/system.dic'
+        test_resources_dir = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            os.pardir,
+            'resources')
+        filename = os.path.join(test_resources_dir, 'system.dic')
         with open(filename, 'r+b') as system_dic:
             bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
         offset = 0

diff --git a/tests/dictionarylib/test_doublearraylexicon.py b/tests/dictionarylib/test_doublearraylexicon.py
@@ -1,28 +1,29 @@
+import os
 import mmap
 import unittest
 
 from sudachipy.dictionarylib.dictionaryheader import DictionaryHeader
 from sudachipy.dictionarylib.dictionaryversion import DictionaryVersion
 from sudachipy.dictionarylib.doublearraylexicon import DoubleArrayLexicon
-from sudachipy.dictionarylib.grammar import Grammar
 
 
 class TestDoubleArrayLexicon(unittest.TestCase):
 
+    __GRAMMAR_SIZE = 470
+
     def setUp(self):
         # Copied from sudachipy.dictionay.Dictionary.read_system_dictionary
-        filename = 'tests/resources/system.dic'
+        test_resources_dir = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            os.pardir,
+            'resources')
+        filename = os.path.join(test_resources_dir, 'system.dic')
         with open(filename, 'r+b') as system_dic:
             bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
-        offset = 0
-        self.header = DictionaryHeader(bytes_, offset)
-        if self.header.version != DictionaryVersion.SYSTEM_DICT_VERSION:
-            raise Exception("invalid system dictionary")
-        offset += self.header.storage_size
-
-        self.grammar = Grammar(bytes_, offset)
-        offset += self.grammar.get_storage_size()
-        self.lexicon = DoubleArrayLexicon(bytes_, offset)
+        header = DictionaryHeader(bytes_, 0)
+        if header.version != DictionaryVersion.SYSTEM_DICT_VERSION:
+            raise Exception('invalid system dictionary')
+        self.lexicon = DoubleArrayLexicon(bytes_, header.storage_size + 470)
 
     def test_lookup(self):
         res = self.lexicon.lookup('東京都'.encode('utf-8'), 0)
@@ -65,9 +66,9 @@ def test_wordinfo(self):
         self.assertEqual(-1, wi.dictionary_form_word_id)
         self.assertEqual('た', wi.dictionary_form)
         self.assertEqual('タ', wi.reading_form)
-        self.assertEqual([0], wi.a_unit_split)
-        self.assertEqual([0], wi.b_unit_split)
-        self.assertEqual([0], wi.word_structure)
+        self.assertEqual([], wi.a_unit_split)
+        self.assertEqual([], wi.b_unit_split)
+        self.assertEqual([], wi.word_structure)
 
         # 行っ
         wi = self.lexicon.get_word_info(8)
@@ -79,9 +80,9 @@ def test_wordinfo(self):
         # 東京都
         wi = self.lexicon.get_word_info(6)
         self.assertEqual('東京都', wi.surface)
-        self.assertEqual((5, 9), wi.a_unit_split)
-        self.assertEqual([0], wi.b_unit_split)
-        self.assertEqual((5, 9), wi.word_structure)
+        self.assertEqual([5, 9], wi.a_unit_split)
+        self.assertEqual([], wi.b_unit_split)
+        self.assertEqual([5, 9], wi.word_structure)
 
     def test_wordinfo_with_longword(self):
         # 0123456789 * 30
@@ -94,7 +95,7 @@ def test_wordinfo_with_longword(self):
         self.assertEqual(570, len(wi.reading_form))
 
     def test_size(self):
-        self.assertEqual(37, self.lexicon.size)
+        self.assertEqual(37, self.lexicon.size())
 
 
 if __name__ == '__main__':

diff --git a/tests/test_dictionary.py → tests/ignore_test_dictionary.py b/tests/test_dictionary.py → tests/ignore_test_dictionary.py
@@ -1,3 +1,6 @@
+# This test file is not run on Travis CI: its `ignore_` prefix keeps it out of unittest's default `test*.py` discovery.
+# We probably need to parse user.dic to test this code.
+
 import json
 import unittest
 

diff --git a/tests/test_tokenizer.py → tests/ignore_test_tokenizer.py b/tests/test_tokenizer.py → tests/ignore_test_tokenizer.py
@@ -1,3 +1,6 @@
+# This test file is not run on Travis CI: its `ignore_` prefix keeps it out of unittest's default `test*.py` discovery.
+# We probably need to parse user.dic to test this code.
+
 import json
 import unittest