Do not normalize full-width punctuation with ftfy for CJK languages

bitextor · Feb 5, 2025 · e1f6331
1 parent 7367178
commit e1f6331
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 2 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "bifixer"
-version = "0.8.11"
+version = "0.8.12"
 license = {file = "LICENSE"}
 authors = [
     { name="Prompsit Language Engineering", email="[email protected]" },

diff --git a/src/bifixer/restorative_cleaning.py b/src/bifixer/restorative_cleaning.py
@@ -726,7 +726,10 @@ def fix(text, lang, chars_rep, chars_pattern):
     global global_chars_lang
     global_chars_lang = chars_rep
 
-    ftfy_fixed_text = ftfy.fix_text_segment(text, uncurl_quotes=False, fix_latin_ligatures=False)
+    ftfy_fixed_text = ftfy.fix_text_segment(text,
+            uncurl_quotes=False,
+            fix_latin_ligatures=False,
+            fix_character_width=lang.lower() not in cjk_langs)
 
     replaced_text = chars_pattern.sub(replace_chars, ftfy_fixed_text)
 

diff --git a/src/tests/test_bifixer.py b/src/tests/test_bifixer.py
@@ -61,13 +61,15 @@ class TestCharReplacements:
     chars_el, charsRe_el = restorative_cleaning.getCharsReplacements("el")
     chars_cs, charsRe_cs = restorative_cleaning.getCharsReplacements("cs")
     chars_sl, charsRe_sl = restorative_cleaning.getCharsReplacements("sl")
+    chars_zh, charsRe_zh = restorative_cleaning.getCharsReplacements("zh")
 
     punct_en, punctRe_en = restorative_cleaning.getNormalizedPunctReplacements("en")
     punct_ru, punctRe_ru = restorative_cleaning.getNormalizedPunctReplacements("ru")
     punct_es, punctRe_es = restorative_cleaning.getNormalizedPunctReplacements("es")
     punct_fr, punctRe_fr = restorative_cleaning.getNormalizedPunctReplacements("fr")
     punct_el, punctRe_el = restorative_cleaning.getNormalizedPunctReplacements("el")
     punct_cs, punctRe_cs = restorative_cleaning.getNormalizedPunctReplacements("cs")
+    punct_zh, punctRe_zh = restorative_cleaning.getNormalizedPunctReplacements("zh")
 
     def test_mojibake(self):
         correct = "¿La cigüeña bebía café?"
@@ -78,6 +80,16 @@ def test_mojibake(self):
         assert fixed_1 == correct
         assert fixed_2 == correct
 
+    def test_cjk_punct(self):
+        text_1 = "直接煎出来就是一面带着焦焦脆片的\uff1f"
+        text_2 = "直接煎出来就是一面带着焦焦脆片的\uff01"
+        fixed_1 = restorative_cleaning.fix(text_1, "zh", self.chars_zh, self.charsRe_zh)
+        fixed_1 = restorative_cleaning.normalize(fixed_1, "zh", self.punct_zh, self.punctRe_zh)
+        fixed_2 = restorative_cleaning.fix(text_2, "zh", self.chars_zh, self.charsRe_zh)
+        fixed_2 = restorative_cleaning.normalize(fixed_2, "zh", self.punct_zh, self.punctRe_zh)
+        assert fixed_1 == text_1
+        assert fixed_2 == text_2
+
     def test_encoding(self):
         correct_1 = "Brošure"
         text_1 = "BroĹĄure"