Skip to content

Commit

Permalink
Force lower case on R2S for case insensitive scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
scossu committed Sep 22, 2024
1 parent f5b1e37 commit 9f3ba34
Show file tree
Hide file tree
Showing 20 changed files with 1,315 additions and 6 deletions.
1,266 changes: 1,266 additions & 0 deletions ext/oriya.html

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion scriptshifter/tables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from yaml import Loader

from scriptshifter import DB_PATH
from scriptshifter.exceptions import BREAK, ConfigError
from scriptshifter.exceptions import BREAK, ApiError, ConfigError


__doc__ = """
Expand Down Expand Up @@ -209,6 +209,9 @@ def populate_table(conn, tid, tname):
if "roman_to_script" in data:
flags |= FEAT_R2S

if not data.get("general", {}).get("case_sensitive", True):
flags |= FEAT_CASEI

conn.execute(
"UPDATE tbl_language SET features = ? WHERE id = ?",
(flags, tid))
Expand Down Expand Up @@ -555,6 +558,9 @@ def get_lang_general(conn, lang):
FROM tbl_language WHERE name = ?""", (lang,))
lang_data = lang_q.fetchone()

if not lang_data:
raise ApiError(f"No language data found for {lang}", 404)

return {
"id": lang_data[0],
"data": {
Expand Down
4 changes: 2 additions & 2 deletions scriptshifter/tables/data/_chinese_base.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# This file is derived from and kept in sync with Princeton's OCLC Connexion
# Pinyin converter (https://github.com/pulibrary/oclcpinyin/).

general: # Section names and other keywords are all snake_cased.
general: # Section names and other keywords are all snake_cased.
name: Chinese base (from Princeton)
parents:
- _ignore_base

script_to_roman:
map: # Mapping section.
map: # Mapping section.
"\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u862D\u5171\u548C\u570B": "Bajisitan Yisilan Gongheguo "
"\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u5170\u5171\u548C\u56FD": "Bajisitan Yisilan Gongheguo "
"\u5DF4\u97F3\u90ED\u695E\u8499\u53E4\u81EA\u6CBB\u5DDE": "Bayinguoleng Menggu Zizhizhou "
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/arabic.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Arabic S2R using the 3rd-party ArabicTransliterator library:
# https://github.com/MTG/ArabicTransliterator

---
general:
name: Arabic
description: Arabic S2R using a 3rd party library.
case_sensitive: false

script_to_roman:
hooks:
Expand Down
3 changes: 3 additions & 0 deletions scriptshifter/tables/data/chinese.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
#
# All other Chinese mappings are kept in _chinese_base.yml. This mapping only
# adds an overlay for parsing numerals and Scriptshifter-specific features.

---
general:
name: Chinese
parents:
- _chinese_base
case_sensitive: false

options:
- id: marc_field
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/gujarati.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Gujarati
case_sensitive: false

script_to_roman:
hooks:
Expand Down
3 changes: 2 additions & 1 deletion scriptshifter/tables/data/hebrew.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
---
general:
name: Hebrew
description: Hebrew S2R.
case_sensitive: false

options:
- id: genre
Expand All @@ -19,4 +21,3 @@ script_to_roman:
post_config:
-
- hebrew.dicta_api.s2r_post_config

2 changes: 2 additions & 0 deletions scriptshifter/tables/data/kannada.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Kannada
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/malayalam.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Malayalam
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/marathi_devanagari.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Marathi (Devanagari)
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/nepali_devanagari.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Nepali (Devanagari)
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/oriya.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Oriya
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/pali.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Pali
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/sanskrit_devanagari.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Sanskrit (Devanagari)
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/sinhalese.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Sinhalese
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/telugu.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Telugu
case_sensitive: false

script_to_roman:
hooks:
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/thai.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Thai
case_sensitive: false

options:
- id: ThaiTranscription
Expand Down
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/yiddish.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
---
general:
name: Yiddish
case_sensitive: false

options:
- id: loshn_koydesh
Expand Down
9 changes: 8 additions & 1 deletion scriptshifter/trans.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from scriptshifter.exceptions import BREAK, CONT
from scriptshifter.tables import (
BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
get_lang_ignore, get_lang_map, get_lang_normalize)

Expand Down Expand Up @@ -111,6 +111,10 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
f"Roman-to-script not yet supported for {lang}."
)

# Normalize case before post_config and rule-based normalization.
if not ctx.general["case_sensitive"]:
ctx._src = ctx.src.lower()

# This hook may take over the whole transliteration process or delegate
# it to some external process, and return the output string directly.
if _run_hook("post_config", ctx) == BREAK:
Expand Down Expand Up @@ -309,6 +313,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):


def _normalize_src(ctx, norm_rules):
"""
Normalize source text according to rules.
"""
for nk, nv in norm_rules.items():
ctx._src = ctx.src.replace(nk, nv)
logger.debug(f"Normalized source: {ctx.src}")
Expand Down
2 changes: 1 addition & 1 deletion tests/data/script_samples/unclassified.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ armenian,Մեդիա իրավունք : (ուսումնական ձեռնարկ) ,
armenian,Ա Բ Գ Դ Ե Զ Է Ը Թ Ժ Ի Լ Խ Ծ Կ Հ Ձ Ղ Ճ Մ Յ Ն Շ Ո Չ Պ Ջ Ռ Ս Վ Տ Ր Ւ Փ Ք Օ Ֆ ՙ ՚ ՛ ՜ ՝ ՞ ՟ ա բ գ դ ե զ է ը թ ժ ի լ խ ծ կ ձ ղ ճ մ յ ն շ ո չ պ ջ ռ ս վ տ ր ց ւ փ ք օ ֆ և ։ ֊ .,A B G D E Y Z Ē Ě Tʻ Zh I L Kh Ts K H Dz Gg Ch M Y N Sh O Chʻ P J Ṛ S V T R Tsʻ W U Pʻ Kʻ Ew Ev Ō Fa b g d e y z ē ě tʻ zh i l kh ts k h dz gh ch m y n sh o chʻ p j ṛ s v t r tsʻ w u pʻ kʻ ew ev ō f,,
georgian,ადგილობრივი თვითმმართველობის კოდექსი : საქართველოს ორგანული კანონი; 2018 წლის 7 სექტებრის მდგომარეობით.,Adgilobrivi tʻvitʻmmartʻvelobis kodekʻsi : Sakʻartʻvelos organuli kanoni; 2018 clis 7 sekʻtembris mdgomareobitʻ.,,
hindi,परमहंस की पीड़ा : महान क्रांतिकारी रामप्रसाद बिस्मिल के जीवन पर आधारित उपन्यास,Paramahaṃsa kī pīṛā : mahāna krāntikārī Rāmaprasāda Bismila ke jīvana para ādhārita upanyāsa,,
mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶᠡ ᠶᠢᠨ ᠥᠯᠠᠨ ᠺᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠸᠷᠸᠭᠰᠠᠨ ᠰᠸᠷᠪᠸᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠸᠳᠸᠯᠸᠯ,Dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶᠡ ᠶᠢᠨ ᠣᠯᠠᠨ ᠬᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠤᠷᠤᠭᠰᠠᠨ ᠰᠤᠷᠪᠤᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠤᠳᠤᠯᠤᠯ,dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
,আগবাৰীত ফুলিলে সোনে মোৰ চম্পা,Āgabārīta phulile soṇe mora campā,,
,Milli dövlətçilik hərəkatının yüksəlişi və Xalq Cümhuriyyəti dövründə Azərbaycançılıq ideyası,Milli dövlätçilik häräkatının yüksälişi vä Xalq Cümhuriyyäti dövründä azärbaycançılıq ideyası,,
,مجنون مجنون دوشون منى شعر توپلوسو ,Macnūn macnūn düşün manī : şiʻr toplūsū,,
Expand Down

0 comments on commit 9f3ba34

Please sign in to comment.