Skip to content
This repository has been archived by the owner on Mar 9, 2023. It is now read-only.

Commit

Permalink
Merge pull request #35 from WorksApplications/fix-bug
Browse files Browse the repository at this point in the history
fix bugs & add test in .travis.yml
  • Loading branch information
izziiyt authored Jun 19, 2019
2 parents 28a5fdc + 28a294a commit b9a5fcf
Show file tree
Hide file tree
Showing 19 changed files with 148 additions and 108 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@ python:
- '3.7'
install:
- pip install flake8
before_script:
- mv .travis/system.dic.test tests/resources/system.dic
script:
- ./scripts/format.sh
- python -m unittest discover tests
Binary file added .travis/system.dic.test
Binary file not shown.
21 changes: 9 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,15 @@ tokenizer_obj.tokenize(mode, "シュミレーション")[0].normalized_form()

### Code format

You have to run `./scripts/format.sh` and check if your code is in rule before PR.
This code formatting script will be integrated to CI system later. `flake8` is required.
You can run `./scripts/format.sh` to check that your code follows the formatting rules. `flake8` is required.

### Test

You have to run `./scripts/test.sh` and check that your change does not cause a regression.
This test script will be integrated to CI system later. Current test assumes `sudachi-dictionary-20190531-core.dic`
is in `resources` directory as `system.dic`. We will change it to special dictionary for test like Sudachi (Java) in all tests.
Some of the tests use `system.dic` built by Sudachi. This is an example command to get `system.dic` for test
```bash
git clone https://github.com/WorksApplications/Sudachi.git
cd Sudachi
mvn test
cp target/test-classes/system.dic ${SudachiPy}/tests/resources/
```
You can run `./scripts/test.sh` to check that your change does not cause a regression.
The current tests assume that

- `sudachi-dictionary-20190531-core.dic` is in the `resources` directory as `system.dic`.

- the `system.dic` for tests is in the `tests/resources` directory.

The `system.dic` for tests is provided as `.travis/system.dic.test`. Copy it to `tests/resources/system.dic` before running the tests.
6 changes: 4 additions & 2 deletions scripts/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

# print error message only when it fails
# python unittest prints its messages to stderr even if it succeeds
# You need to prepare system.dic in resources and tests/resources
# see README

cd $(dirname $0)
RES=`cd ..; python -m unittest discover tests 2>&1`
RES=`cd ..; python -m unittest discover tests -p '*test*.py' 2>&1`
RES_TAIL=`echo "$RES" | tail -1`
if [[ $RES_TAIL != "OK" ]]; then
echo "$RES"
>&2 echo "$RES"
fi
6 changes: 3 additions & 3 deletions sudachipy/dictionarylib/charactercategory.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def read_character_definition(self, char_def=None):
cols = re.split(r"\s+", line)
if len(cols) < 2:
f.close()
raise AttributeError("invalid format at line ", i)
raise AttributeError("invalid format at line {}".format(i))
if not re.match("0x", cols[0]):
continue
range_ = self.Range()
Expand All @@ -58,14 +58,14 @@ def read_character_definition(self, char_def=None):
range_.high = int(r[1], 16)
if range_.low > range_.high:
f.close()
raise AttributeError("invalid range at line ", i)
raise AttributeError("invalid range at line {}".format(i))
for j in range(1, len(cols)):
    # Stop at a trailing comment or at an empty column (e.g. one left by
    # re.split on trailing whitespace).  The empty-string check must compare
    # by value: `is ''` tests object identity, which only works by accident
    # of string interning and is a SyntaxWarning on Python 3.8+.
    if re.match("#", cols[j]) or cols[j] == '':
        break
    type_ = categorytype.CategoryType.get(cols[j])
    if type_ is None:
        # Close the definition file before reporting the unknown category.
        f.close()
        raise AttributeError("{} is invalid type at line {}".format(cols[j], i))
    range_.categories.add(type_)
self.range_list.append(range_)
default_range = self.Range()
Expand Down
12 changes: 9 additions & 3 deletions sudachipy/dictionarylib/dictionaryheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@


class DictionaryHeader:
    """Fixed-size header read from a dictionary buffer.

    Layout (little-endian): an unsigned 64-bit version, an unsigned 64-bit
    creation time, then a 256-byte description field holding UTF-8 text that
    ends at the first NUL byte, if there is one.
    """

    __description_size = 256
    # version (8 bytes) + create_time (8 bytes) + description field
    storage_size = 8 + 8 + __description_size

    def __init__(self, bytes_, offset):
        """Parse the header that starts at ``offset`` in ``bytes_``.

        Args:
            bytes_: sliceable buffer (bytes, bytearray, mmap, ...).
            offset: position of the first header byte inside ``bytes_``.

        Raises:
            struct.error: if fewer than 16 bytes are available at ``offset``.
            UnicodeDecodeError: if the description is not valid UTF-8.
        """
        self.version, self.create_time = struct.unpack_from("<2Q", bytes_, offset)
        offset += 16

        # Copy at most one field's worth of bytes and cut it at the first NUL.
        # Slicing clamps to the end of the buffer, so a truncated field yields
        # the text that is present instead of raising IndexError.
        field = bytes(bytes_[offset:offset + self.__description_size])
        self.description = field.partition(b"\x00")[0].decode("utf-8")
9 changes: 6 additions & 3 deletions sudachipy/dictionarylib/doublearraylexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ class DoubleArrayLexicon(lexicon.Lexicon):
def __init__(self, bytes_, offset):
self.trie = dartsclone.doublearray.DoubleArray()
bytes_.seek(offset)
self.size = int.from_bytes(bytes_.read(4), 'little')
size = int.from_bytes(bytes_.read(4), 'little')
offset += 4
bytes_.seek(offset)
array = struct.unpack_from("<{}I".format(self.size), bytes_, offset)
self.trie.set_array(array, self.size)
array = struct.unpack_from("<{}I".format(size), bytes_, offset)
self.trie.set_array(array, size)
offset += self.trie.total_size()

self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
Expand Down Expand Up @@ -49,3 +49,6 @@ def get_cost(self, word_id):

def get_word_info(self, word_id):
    """Return whatever ``self.word_infos.get_word_info`` yields for ``word_id``."""
    word_infos = self.word_infos
    return word_infos.get_word_info(word_id)

def size(self):
    """Return the ``size`` attribute of ``self.word_params``."""
    params = self.word_params
    return params.size
61 changes: 31 additions & 30 deletions sudachipy/dictionarylib/wordinfolist.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,49 +9,50 @@ def __init__(self, bytes_, offset, word_size):
self.offset = offset

def get_word_info(self, word_id):
orig_pos = self.bytes.tell()
index = self.word_id_to_offset(word_id)

surface = self.buffer_to_string(index)
index += 1 + 2 * len(surface)
head_word_length = self.bytes[index]
index += 1
pos_id = int.from_bytes(self.bytes[index:index+2], 'little')
index += 2
normalized_form = self.buffer_to_string(index)
index += 1 + 2 * len(normalized_form)
self.bytes.seek(index)
surface = self.buffer_to_string()
head_word_length = self.buffer_to_string_length()
pos_id = int.from_bytes(self.bytes.read(2), 'little')
normalized_form = self.buffer_to_string()
if not normalized_form:
normalized_form = surface
dictionary_form_word_id = int.from_bytes(self.bytes[index:index+4], "little", signed=True)
index += 4
reading_form = self.buffer_to_string(index)
index += 1 + 2 * len(reading_form)
a_unit_split = self.buffer_to_int_array(index)
index += 1 + 4 * len(a_unit_split)
b_unit_split = self.buffer_to_int_array(index)
index += 1 + 4 * len(b_unit_split)
word_structure = self.buffer_to_int_array(index)
dictionary_form_word_id = int.from_bytes(self.bytes.read(4), 'little', signed=True)
reading_form = self.buffer_to_string()
a_unit_split = self.buffer_to_int_array()
b_unit_split = self.buffer_to_int_array()
word_structure = self.buffer_to_int_array()

dictionary_form = surface
if dictionary_form_word_id >= 0 and dictionary_form_word_id != word_id:
wi = self.get_word_info(dictionary_form_word_id)
dictionary_form = wi.surface

self.bytes.seek(orig_pos)

return wordinfo.WordInfo(surface, head_word_length, pos_id, normalized_form,
dictionary_form_word_id, dictionary_form, reading_form,
a_unit_split, b_unit_split, word_structure)

def word_id_to_offset(self, word_id):
i = self.offset + 4 * word_id
return int.from_bytes(self.bytes[i:i+4], "little", signed=False)

def buffer_to_string(self, offset):
length = self.bytes[offset]
offset += 1
end = offset + 2 * length
return self.bytes[offset:end].decode("utf-16-le")

def buffer_to_int_array(self, offset):
length = self.bytes[offset]
offset += 1
array = struct.unpack_from("<{}I".format(length), self.bytes, offset)
return int.from_bytes(self.bytes[i:i+4], 'little', signed=False)

def buffer_to_string_length(self):
    """Read a one- or two-byte length prefix from ``self.bytes``.

    A first byte below 128 is the length itself.  Otherwise its low seven
    bits are the high part and the next byte is the low part of the length.
    The read position of ``self.bytes`` advances past the consumed bytes.
    """
    first = self.bytes.read_byte()
    if first >= 128:
        second = self.bytes.read_byte()
        return ((first & 0x7F) << 8) | second
    return first

def buffer_to_string(self):
    """Read a length-prefixed UTF-16LE string from ``self.bytes``.

    The prefix (see ``buffer_to_string_length``) counts 2-byte code units,
    so twice that many bytes are consumed and decoded.
    """
    unit_count = self.buffer_to_string_length()
    raw = self.bytes.read(2 * unit_count)
    return raw.decode('utf-16-le')

def buffer_to_int_array(self):
    """Read a one-byte count followed by that many int32 values.

    Each value is a signed little-endian 32-bit integer read from the current
    position of ``self.bytes``.  Returns the values as a list, which is empty
    when the count is zero.
    """
    count = self.bytes.read_byte()
    read = self.bytes.read
    return [int.from_bytes(read(4), 'little', signed=True) for _ in range(count)]
4 changes: 3 additions & 1 deletion sudachipy/plugin/input_text/default_input_text_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,15 @@ def read_rewrite_lists(self, rewrite_def):
with open(rewrite_def, "r", encoding="utf-8") as f:
for i, line in enumerate(f):
line = line.strip()
if not line or line.startswith("#"):
if (not line) or line.startswith("#"):
continue
cols = line.split()

# ignored normalize list
if len(cols) == 1:
key = cols[0]
if len(key) != 1:
raise RuntimeError("{} is not character at line {}".format(key, i))
self.ignore_normalize_set.add(key)
# replace char list
elif len(cols) == 2:
Expand Down
2 changes: 1 addition & 1 deletion sudachipy/utf8inputtextbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def build(self):
j = 0
for i in range(len(self.modified_text)):
# NOTE: surrogate-pair characters are not taken into account
for k in range(self.utf8_byte_length(ord(self.modified_text[i]))):
for _ in range(self.utf8_byte_length(ord(self.modified_text[i]))):
byte_indexes[j] = i
offsets[j] = self.text_offsets[i]
j += 1
Expand Down
24 changes: 11 additions & 13 deletions tests/dictionarylib/test_charactercategory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ class TestCharacterCategory(unittest.TestCase):

def setUp(self):
    """Create a scratch directory and locate the shared test resources."""
    here = os.path.dirname(os.path.abspath(__file__))
    # <this file's directory>/../resources
    self.test_resources_dir = os.path.join(here, os.pardir, 'resources')
    self.test_dir = tempfile.mkdtemp()

def tearDown(self):
Expand All @@ -24,7 +28,7 @@ def test_range_containing_length(self):

def test_get_category_types(self):
cat = charactercategory.CharacterCategory()
cat.read_character_definition('tests/resources/char.def')
cat.read_character_definition(os.path.join(self.test_resources_dir, 'char.def'))
self.assertEqual({categorytype.CategoryType.KANJI}, cat.get_category_types(ord('熙')))
self.assertNotEqual({categorytype.CategoryType.DEFAULT}, cat.get_category_types(ord('熙')))

Expand All @@ -45,33 +49,27 @@ def test_read_character_definition_with_invalid_format(self):
with open(f, 'w') as wf:
wf.write("0x0030..0x0039\n")
cat = charactercategory.CharacterCategory()
try:
with self.assertRaises(AttributeError) as cm:
cat.read_character_definition(f)
self.fail('no exception detected')
except AttributeError:
pass
self.assertEqual('invalid format at line 0', cm.exception.args[0])

def test_read_character_definition_with_invalid_range(self):
f = os.path.join(self.test_dir, 'test_file.txt')
with open(f, 'w') as wf:
wf.write("0x0030..0x0029 NUMERIC\n")
cat = charactercategory.CharacterCategory()
try:
with self.assertRaises(AttributeError) as cm:
cat.read_character_definition(f)
self.fail('no exception detected')
except AttributeError:
pass
self.assertEqual('invalid range at line 0', cm.exception.args[0])

def test_read_character_definition_with_invalid_type(self):
f = os.path.join(self.test_dir, 'test_file.txt')
with open(f, 'w') as wf:
wf.write("0x0030..0x0039 FOO\n")
cat = charactercategory.CharacterCategory()
try:
with self.assertRaises(AttributeError) as cm:
cat.read_character_definition(f)
self.fail('no exception detected')
except AttributeError:
pass
self.assertEqual('FOO is invalid type at line 0', cm.exception.args[0])


if __name__ == '__main__':
Expand Down
7 changes: 6 additions & 1 deletion tests/dictionarylib/test_dictionaryheader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import mmap
import unittest

Expand All @@ -9,7 +10,11 @@ class TestDictionaryHeader(unittest.TestCase):

def setUp(self):
# Copied from sudachipy.dictionary.Dictionary.read_system_dictionary
filename = 'tests/resources/system.dic'
test_resources_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
os.pardir,
'resources')
filename = os.path.join(test_resources_dir, 'system.dic')
with open(filename, 'r+b') as system_dic:
bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
offset = 0
Expand Down
37 changes: 19 additions & 18 deletions tests/dictionarylib/test_doublearraylexicon.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,29 @@
import os
import mmap
import unittest

from sudachipy.dictionarylib.dictionaryheader import DictionaryHeader
from sudachipy.dictionarylib.dictionaryversion import DictionaryVersion
from sudachipy.dictionarylib.doublearraylexicon import DoubleArrayLexicon
from sudachipy.dictionarylib.grammar import Grammar


class TestDoubleArrayLexicon(unittest.TestCase):

__GRAMMAR_SIZE = 470

def setUp(self):
# Copied from sudachipy.dictionary.Dictionary.read_system_dictionary
filename = 'tests/resources/system.dic'
test_resources_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
os.pardir,
'resources')
filename = os.path.join(test_resources_dir, 'system.dic')
with open(filename, 'r+b') as system_dic:
bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
offset = 0
self.header = DictionaryHeader(bytes_, offset)
if self.header.version != DictionaryVersion.SYSTEM_DICT_VERSION:
raise Exception("invalid system dictionary")
offset += self.header.storage_size

self.grammar = Grammar(bytes_, offset)
offset += self.grammar.get_storage_size()
self.lexicon = DoubleArrayLexicon(bytes_, offset)
header = DictionaryHeader(bytes_, 0)
if header.version != DictionaryVersion.SYSTEM_DICT_VERSION:
raise Exception('invalid system dictionary')
self.lexicon = DoubleArrayLexicon(bytes_, header.storage_size + 470)

def test_lookup(self):
res = self.lexicon.lookup('東京都'.encode('utf-8'), 0)
Expand Down Expand Up @@ -65,9 +66,9 @@ def test_wordinfo(self):
self.assertEqual(-1, wi.dictionary_form_word_id)
self.assertEqual('た', wi.dictionary_form)
self.assertEqual('タ', wi.reading_form)
self.assertEqual([0], wi.a_unit_split)
self.assertEqual([0], wi.b_unit_split)
self.assertEqual([0], wi.word_structure)
self.assertEqual([], wi.a_unit_split)
self.assertEqual([], wi.b_unit_split)
self.assertEqual([], wi.word_structure)

# 行っ
wi = self.lexicon.get_word_info(8)
Expand All @@ -79,9 +80,9 @@ def test_wordinfo(self):
# 東京都
wi = self.lexicon.get_word_info(6)
self.assertEqual('東京都', wi.surface)
self.assertEqual((5, 9), wi.a_unit_split)
self.assertEqual([0], wi.b_unit_split)
self.assertEqual((5, 9), wi.word_structure)
self.assertEqual([5, 9], wi.a_unit_split)
self.assertEqual([], wi.b_unit_split)
self.assertEqual([5, 9], wi.word_structure)

def test_wordinfo_with_longword(self):
# 0123456789 * 30
Expand All @@ -94,7 +95,7 @@ def test_wordinfo_with_longword(self):
self.assertEqual(570, len(wi.reading_form))

def test_size(self):
self.assertEqual(37, self.lexicon.size)
self.assertEqual(37, self.lexicon.size())


if __name__ == '__main__':
Expand Down
3 changes: 3 additions & 0 deletions tests/test_dictionary.py → tests/ignore_test_dictionary.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# This test file is ignored if it runs on .travis
# We probably need to parse user.dic to test this code.

import json
import unittest

Expand Down
3 changes: 3 additions & 0 deletions tests/test_tokenizer.py → tests/ignore_test_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# This test file is ignored if it runs on .travis
# We probably need to parse user.dic to test this code.

import json
import unittest

Expand Down
Loading

0 comments on commit b9a5fcf

Please sign in to comment.