Skip to content

Commit

Permalink
Do not normalize full-width punctuation with ftfy in CJK
Browse files Browse the repository at this point in the history
  • Loading branch information
ZJaume committed Feb 5, 2025
1 parent 7367178 commit e1f6331
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "bifixer"
version = "0.8.11"
version = "0.8.12"
license = {file = "LICENSE"}
authors = [
{ name="Prompsit Language Engineering", email="[email protected]" },
Expand Down
5 changes: 4 additions & 1 deletion src/bifixer/restorative_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,10 @@ def fix(text, lang, chars_rep, chars_pattern):
global global_chars_lang
global_chars_lang = chars_rep

ftfy_fixed_text = ftfy.fix_text_segment(text, uncurl_quotes=False, fix_latin_ligatures=False)
ftfy_fixed_text = ftfy.fix_text_segment(text,
uncurl_quotes=False,
fix_latin_ligatures=False,
fix_character_width=lang.lower() not in cjk_langs)

replaced_text = chars_pattern.sub(replace_chars, ftfy_fixed_text)

Expand Down
12 changes: 12 additions & 0 deletions src/tests/test_bifixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,15 @@ class TestCharReplacements:
chars_el, charsRe_el = restorative_cleaning.getCharsReplacements("el")
chars_cs, charsRe_cs = restorative_cleaning.getCharsReplacements("cs")
chars_sl, charsRe_sl = restorative_cleaning.getCharsReplacements("sl")
chars_zh, charsRe_zh = restorative_cleaning.getCharsReplacements("zh")

punct_en, punctRe_en = restorative_cleaning.getNormalizedPunctReplacements("en")
punct_ru, punctRe_ru = restorative_cleaning.getNormalizedPunctReplacements("ru")
punct_es, punctRe_es = restorative_cleaning.getNormalizedPunctReplacements("es")
punct_fr, punctRe_fr = restorative_cleaning.getNormalizedPunctReplacements("fr")
punct_el, punctRe_el = restorative_cleaning.getNormalizedPunctReplacements("el")
punct_cs, punctRe_cs = restorative_cleaning.getNormalizedPunctReplacements("cs")
punct_zh, punctRe_zh = restorative_cleaning.getNormalizedPunctReplacements("zh")

def test_mojibake(self):
correct = "¿La cigüeña bebía café?"
Expand All @@ -78,6 +80,16 @@ def test_mojibake(self):
assert fixed_1 == correct
assert fixed_2 == correct

def test_cjk_punct(self):
    """Check that full-width punctuation in Chinese text is left untouched.

    Each sample ends in a full-width mark (U+FF1F question mark, U+FF01
    exclamation mark). Running it through ``fix`` and then ``normalize``
    with the "zh" replacement tables must return the input unchanged.
    """
    samples = (
        "直接煎出来就是一面带着焦焦脆片的\uff1f",
        "直接煎出来就是一面带着焦焦脆片的\uff01",
    )

    # Run the whole pipeline on every sample before asserting anything,
    # so both samples are always processed in order.
    outputs = []
    for sample in samples:
        cleaned = restorative_cleaning.fix(
            sample, "zh", self.chars_zh, self.charsRe_zh
        )
        outputs.append(
            restorative_cleaning.normalize(
                cleaned, "zh", self.punct_zh, self.punctRe_zh
            )
        )

    for sample, output in zip(samples, outputs):
        assert output == sample

def test_encoding(self):
correct_1 = "Brošure"
text_1 = "BroĹĄure"
Expand Down

0 comments on commit e1f6331

Please sign in to comment.