diff --git a/legacy/data/DivehiThaanaRomanization.cfg b/legacy/data/DivehiThaanaRomanization.cfg new file mode 100644 index 0000000..cbaa8e1 --- /dev/null +++ b/legacy/data/DivehiThaanaRomanization.cfg @@ -0,0 +1,573 @@ +# Original table by Randall K. Barry, 28 February 2021 +# Updated by Randall K. Barry, 11 May 2023 to add script code + +[General] +Name=Divehi (Thaana script) +ScriptCode=Thaa + +# Script used by the Divehi language of the Maldives + +# UNICODE THAANA CHARACTER RANGE: U+0780-U+07BF +# ISO 15924 4-alpha script code: Thaa + +Truncation=% + +[RomanToScript] +FieldsIncluded=100 110 111 130 240 245 246 250 260 264 440 490 600 610 611 630 651 700 710 711 730 740 800 830 +SubfieldsAlwaysExcluded=uvxy0123456789 +OtherSubfieldsExcludedByTag=100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i + +# RDA boilerplate phrases not transliterated: +Place of publication not identified=Place of publication not identified +publisher not identified=publisher not identified +date of publication not identified=date of publication not identified +At head of title=At head of title +Colophon=Colophon +and others=and others +and one other=and one other +and two others=and two others +and three others=and three others +and four others=and four others +and five others=and five others +and six others=and six others +and seven others=and seven others +and eight others=and eight others +and nine others=and nine others +and ten others=and ten others +and eleven others=and eleven others +and twelve others=and twelve others +and thirteen others=and thirteen others +and fourteen others=and fourteen others +and fifteen others=and fifteen others +and sixteen others=and sixteen others +and seventeen others=and seventeen others +and eighteen others=and eighteen others +and nineteen others=and nineteen others +et al.=et al. +i.e.=i.e. +S.l.=S.l. +s.n.=s.n. 
+XLIX=XLIX +XLIV=XLIV +XLIII=XLIII +XLII=XLII +XLI=XLI +XLVIII=XLVIII +XLVII=XLVII +XLVI=XLVI +XLV=XLV +XL=XL +LXXXIX=LXXXIX +LXXXIV=LXXXIV +LXXXIII=LXXXIII +LXXXII=LXXXII +LXXXI=LXXXI +LXXXVIII=LXXXVIII +LXXXVII=LXXXVII +LXXXVI=LXXXVI +LXXXV=LXXXV +LXXX=LXXX +LXXIX=LXXIX +LXXIV=LXXIV +LXXIII=LXXIII +LXXII=LXXII +LXXI=LXXI +LXXVIII=LXXVIII +LXXVII=LXXVII +LXXVI=LXXVI +LXXV=LXXV +LXX=LXX +LXIX=LXIX +LXIV=LXIV +LXIII=LXIII +LXII=LXII +LXI=LXI +LXVIII=LXVIII +LXVII=LXVII +LXVI=LXVI +LXV=LXV +LX=LX +LIX=LIX +LIV=LIV +LIII=LIII +LII=LII +LI=LI +LVIII=LVIII +LVII=LVII +LVI=LVI +LV=LV +XXXIX=XXXIX +XXXIV=XXXIV +XXXIII=XXXIII +XXXII=XXXII +XXXI=XXXI +XXXVIII=XXXVIII +XXXVII=XXXVII +XXXVI=XXXVI +XXXV=XXXV +XXX=XXX +XXIX=XXIX +XXIV=XXIV +XXIII=XXIII +XXII=XXII +XXI=XXI +XXVIII=XXVIII +XXVII=XXVII +XXVI=XXVI +XXV=XXV +XX=XX +XIX=XIX +XIV=XIV +XIII=XIII +XII=XII +XI=XI +XVIII=XVIII +XVII=XVII +XVI=XVI +XV=XV +IV=IV +VIII=VIII +VII=VII +VI=VI +IX=IX +III=III +II=II + +# COMMON COMBINING CHARACTERS (always follow a base letter) + +# combining acute U+0301 +# combining tilde U+0303 +# combining macron U+0304 +# combining dot above U+0307 +# combining dot below U+0323 +# combining diaeresis below U+0324 +# combining low line U+0332 +# combining breve below U+032E +# ayn (spacing) U+02BB +# apostrophe (spacing) U+0027 + +# REGULAR LATIN ALPHABETIC CHARACTERS TO BE CONVERTED + +# THAANA LETTER ALIF APPEARING MEDIALLY WITH ANY VOWEL IS ROMANIZED WITH APOSTROPHE +# ORIGINAL VOWEL ASSOCIATED WITH THE ALIF CANNOT BE REGENERATED DURING CONVERSION +U+0027=U+0787 + +# THAANA LETTER SHAVIYANI WITH SUKUN (BREVE BELOW 032E) +HU+032E=U+0781U+07B0 +hh=U+0787U+07B0U+0780 +hU+032E=U+0781U+07B0 + +# THAANA LETTER ALIFU FINAL WITH SUKUN (LOW LINE 0332) +%HU+0332=U+0787U+07B0 +%hU+0332=U+0787U+07B0 +H=U+0780 +h=U+0780 +SU+0301=U+0781 +sU+0301sU+0301=U+0787U+07B0U+0781 +sU+0301=U+0781 + +# THAANA LETTER MEDIAL NOONU WITHOUT SUKUN (DOT ABOVE 0307) +MU+0307=U+0782 +mU+0307=U+0782 +mm=U+0787U+07B0U+0789 +N=U+0782 
+nn=U+0787U+07B0U+0782 +n=U+0782 +R=U+0783 +rr=U+0787U+07B0U+0783 +r=U+0783 +B=U+0784 +bb=U+0787U+07B0U+0784 +b=U+0784 + +# THAANA LETTER "L/l" WITH DOT BELOW (0323) +LU+0323=U+0785 +lU+0323=U+0785 +K=U+0786 +kk=U+0787U+07B0U+0786 +k=U+0786 + +# THAANA LETTER ALIF--OFTEN OMITTED IN ROMANIZATION (SEE VOWEL CONVERSION BELOW) +U+0027=U+0787 + +V=U+0788 +vv=U+0787U+07B0U+0788 +v=U+0788 +M=U+0789 +m=U+0789 +F=U+078A +ff=U+0787U+07B0U+078A +f=U+078A +# THAANA LETTER "D/d" WITH DOT BELOW (0323) +DU+0323=U+0791 +dU+0323=U+0791 +D=U+078B +dd=U+0787U+07B0U+078B +d=U+078B +# THAANA LETTER "T/t" WITH DOT BELOW (0323) +TU+0323=U+0793 +tU+0323=U+0793 +TU+0324T=U+078CU+07B0U+078C +TU+0324t=U+078CU+07B0U+078C +tU+0324t=U+078CU+07B0U+078C +tU+0324T=U+078CU+07B0U+078C +T=U+078C +tt=U+0787U+07B0U+078C +t=U+078C +L=U+078D +ll=U+0787U+07B0U+078D +l=U+078D +G=U+078E +gg=U+0787U+07B0U+078E +g=U+078E +# THAANA LETTER "N/n' WITH TILDE (0303) +NU+0303=U+078F +nU+0303=U+078F +S=U+0790 +ss=U+0787U+07B0U+0790 +s=U+0790 +Z=U+0792 +zz=U+0787U+07B0U+0792 +z=U+0792 +Y=U+0794 +yy=U+0787U+07B0U+0794 +y=U+0794 +P=U+0795 +p=U+0795 +pp=U+0787U+07B0U+0795 +J=U+0796 +jj=U+0787U+07B0U+0796 +j=U+0796 +C=U+0797 +cc=U+0787U+07B0U+0797 +c=U+0797 + +# THAANA EXTENSION FOR ARABIC LOAN WORDS AND NAMES + +# THAANA EXTENSION FOR ARABIC LETTER TTAA +TH=U+0798 +Th=U+0798 +thth=U+0787U+07B0U+0798 +th=U+0798 + +# THAANA EXTENSION FOR ARABIC LETTER HHAA +HU+0323=U+0799 +hU+0323=U+0799 + +# THAANA EXTENSION FOR ARABIC LETTER KHAA +KH=U+079A +Kh=U+079A +khkh=U+0787U+07B0U+079A +kh=U+079A + +# THAANA EXTENSION FOR ARABIC LETTER THAALU +DH=U+079B +Dh=U+079B +dhdh=U+0787U+07B0U+079B +dh=U+079B + +# THAANA EXTENSION FOR ARABIC LETTER ZAA (NEWER LETTER) +ZU+0332=U+079C +zU+0332=U+079C + +# THAANA EXTENSION FOR ARABIC LETTER SHEENU +SH=U+079D +Sh=U+079D +shsh=U+0787U+07B0U+079D +sh=U+079D + +# THAANA EXTENSION FOR ARABIC LETTER SAADHU +SU+0323=U+079E +sU+0323=U+079E + +# THAANA EXTENSION FOR ARABIC LETTER TO +TU+0324=U+07A0 
+tU+0324=U+07A0 + +# THAANA EXTENSION FOR ARABIC LETTER ZO +DU+0332=U+07A1 +dU+0332=U+07A1 + +# THAANA EXTENSION FOR ARABIC LETTER AINU +U+02BB=U+07A2 + +# THAANA EXTENSION FOR ARABIC LETTER GHAINU +GH=U+07A3 +Gh=U+07A3 +ghgh=U+0787U+07B0U+07A3 +gh=U+07A3 + +# THAANA EXTENSION FOR ARABIC LETTER QAAFU +Q=U+07A4 +qq=U+0787U+07B0U+07A4 +q=U+07A4 + +# THAANA EXTENSION FOR ARABIC LETTER WAAVU (NEWER LETTER) +W=U+07A5 +ww=U+0787U+07B0U+07A5 +w=U+07A5 + +# INITIAL (AND UPPERCASE) VOWELS THAT CONVERT +# TO ALIF FOLLOWED BY VOWEL (ALIF OMITTED IN ROMANIZATION) +AU+0304%=U+0787U+07A7 +A%=U+0787U+07A6 +U+0020aU+0304=U+0020U+0787U+07A7 +U+0020a=U+0020U+0787U+07A6 +EU+0304%=U+0787U+07AD +E%=U+0787U+07AC +U+0020eU+0304=U+0020U+0787U+07AD +U+0020e=U+0020U+0787U+07AC +IU+0304%=U+0787U+07A9 +I%=U+0787U+07A8 +U+0020iU+0304=U+0020U+0787U+07A9 +U+0020i=U+0020U+0787U+07A8 +OU+0304%=U+0787U+07AF +O%=U+0787U+07AE +U+0020oU+0304=U+0020U+0787U+07AF +U+0020o=U+0020U+0787U+07AE +UU+0304%=U+0787U+07AB +U%=U+0787U+07AA +U+0020uU+0304=U+0020U+0787U+07AB +U+0020u=U+0020U+0787U+07AA + +# THAANA MEDIAL OR FINAL VOWELS OVER ANY CONSONANT +# (THIS ASSUMES NO UPPERCASE VOWELS REMAIN) + +aU+0304=U+07A7 +a=U+07A6 +eU+0304=U+07AD +e=U+07AC +iU+0304=U+07A9 +i=U+07A8 +oU+0304=U+07AF +o=U+07AE +uU+0304=U+07AB +u=U+07AA + +# THAANA SUKUN (SILENCE) MARK; ONLY GENERATED IN OTHER COMBINATIONS +# =U+07B0 + +[ScriptToRoman] +FieldsIncluded=100 110 111 130 240 245 246 250 260 264 440 490 600 610 611 630 651 700 710 711 730 740 800 830 +SubfieldsAlwaysExcluded=uvxy0123456789 +OtherSubfieldsExcludedByTag=100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i + +# RDA boilerplate phrases not transliterated: +Place of publication not identified=Place of publication not identified +publisher not identified=publisher not identified +date of publication not identified=date of publication not identified +At head of title=At head of title + +# THAANA LETTER NOONU WITHOUT SUKUN FOLLOWED BY A 
CONSONANT +# IS ROMANIZED AS "m"+ DOT ABOVE (0307) THEN THE CONSONANT +# OTHERWISE LETTER NOONU MAPS TO "n" +U+0782U+0780=mU+0307U+0780 +U+0782U+0781=mU+0307U+0781 +U+0782U+0783=mU+0307U+0783 +U+0782U+0784=mU+0307U+0784 +U+0782U+0785=mU+0307U+0785 +U+0782U+0786=mU+0307U+0786 +U+0782U+0788=mU+0307U+0788 +U+0782U+0789=mU+0307U+0789 +U+0782U+078A=mU+0307U+078A +U+0782U+0791=mU+0307U+0791 +U+0782U+078B=mU+0307U+078B +U+0782U+078C=mU+0307U+078C +U+0782U+078D=mU+0307U+078D +U+0782U+078E=mU+0307U+078E +U+0782U+078F=mU+0307U+078F +U+0782U+0790=mU+0307U+0790 +U+0782U+0792=mU+0307U+0792 +U+0782U+0794=mU+0307U+0794 +U+0782U+0795=mU+0307U+0795 +U+0782U+0796=mU+0307U+0796 +U+0782U+0797=mU+0307U+0797 +U+0782U+0798=mU+0307U+0798 +U+0782U+0799=mU+0307U+0799 +U+0782U+079A=mU+0307U+079A +U+0782U+079B=mU+0307U+079B +U+0782U+079C=mU+0307U+079C +U+0782U+079D=mU+0307U+079D +U+0782U+079E=mU+0307U+079E +U+0782U+07A0=mU+0307U+07A0 +U+0782U+07A1=mU+0307U+07A1 +U+0782U+07A2=mU+0307U+07A2 +U+0782U+07A3=mU+0307U+07A3 +U+0782U+07A4=mU+0307U+07A4 +U+0782U+07A5=mU+0307U+07A5 + +# THAANA FINAL ALIFU WITH SUKUN (SILENCE) MARK +# IS ROMANIZED WITH "h"+LOW LINE (0332) +U+0787U+07B0U+0020=hU+0332U+0020 + +# THAANA SHAVIYANI WITH SUKUN (SILENCE) MARK +# IS ROMANIZED WITH "h"+BREVE BELOW +U+0781U+07B0=hU+032E + +U+0787U+07B0U+0780=hh +U+0780=h + +# THAANA ALIF WITH SUKUN AND SHAVIYANI +U+0787U+07B0U+0781=sU+0301sU+0301 + +U+0787U+07B0=hU+0332 +U+0781=sU+0301 + +U+0787U+07B0U+0782=nn +U+0782=n + +U+0787U+07B0U+0783=rr +U+0783=r + +U+0787U+07B0U+0784=bb +U+0784=b + +U+0787U+07B0U+0785=lU+0323lU+0323 +U+0785=lU+0323 + +U+0787U+07B0U+0786=kk +U+0786=k + +U+0787U+07B0U+0788=vv +U+0788=v + +U+0787U+07B0U+0789=mm +U+0789=m + +U+0787U+07B0U+078A=ff +U+078A=f + +# THAANA LETTER "D/d" WITH DOT BELOW (0323) +U+0787U+07B0U+0791=dU+0323dU+0323 +U+0791=dU+0323 + +U+0787U+07B0U+078B=dd +U+078B=d + +# THAANA LETTER "T/t" WITH DOT BELOW (0323) +U+078CU+07B0U+078C=tU+0324t +U+0787U+07B0U+078C=tt +U+0793=tU+0323 +U+078C=t + 
+U+0787U+07B0U+078D=ll +U+078D=l + +U+0787U+07B0U+078E=gg +U+078E=g + +# THAANA LETTER "N/n" WITH TILDE (0303) +U+0787U+07B0U+078F=nU+0303nU+0303 +U+078F=nU+0303 + +U+0787U+07B0U+0790=ss +U+0790=s + +U+0787U+07B0U+0792=zz +U+0792=z + +U+0787U+07B0U+0794=yy +U+0794=y + +U+0787U+07B0U+0795=pp +U+0795=p + +U+0787U+07B0U+0796=jj +U+0796=j + +U+0787U+07B0U+0797=cc +U+0797=c + +# THAANA EXTENSION FOR ARABIC LETTER TTAA +U+0787U+07B0U+0798=thth +U+0798=th + +# THAANA EXTENSION FOR ARABIC LETTER HHAA +U+0787U+07B0U+0799=hU+0323hU+0323 +U+0799=hU+0323 + +# THAANA EXTENSION FOR ARABIC LETTER KHAA +U+0787U+07B0U+079A=khkh +U+079A=kh + +# THAANA EXTENSION FOR ARABIC LETTER THAALU +U+0787U+07B0U+079B=dhdh +U+079B=dh + +# THAANA EXTENSION FOR ARABIC LETTER ZAA (NEWER LETTER) +U+0787U+07B0U+079C=zU+0332zU+0332 +U+079C=zU+0332 + +# THAANA EXTENSION FOR ARABIC LETTER SHEENU +U+0787U+07B0U+079D=shsh +U+079D=sh + +# THAANA EXTENSION FOR ARABIC LETTER SAADHU +U+0787U+07B0U+079E=sU+0323sU+0323 +U+079E=sU+0323 + +# THAANA EXTENSION FOR ARABIC LETTER TO +U+0787U+07B0U+07A0=tU+0324tU+0324 +U+07A0=tU+0324 + +# THAANA EXTENSION FOR ARABIC LETTER ZO +U+0787U+07B0U+07A1=dU+0332dU+0332 +U+07A1=dU+0332 + +# THAANA EXTENSION FOR ARABIC LETTER AINU +U+0787U+07B0U+07A2=U+02BBU+02BB +U+07A2=U+02BB + +# THAANA EXTENSION FOR ARABIC LETTER GHAINU +U+0787U+07B0U+07A3=ghgh +U+07A3=gh + +# THAANA EXTENSION FOR ARABIC LETTER QAAFU +U+0787U+07B0U+07A4=qq +U+07A4=q + +# THAANA EXTENSION FOR ARABIC LETTER WAAVU (NEWER LETTER) +U+0787U+07B0U+07A5=ww +U+07A5=w + +# INITIAL VOWELS FOLLOWING ALIF (ALIF OMITTED IN ROMANIZATION) +U+0020U+0787U+07A7=U+0020aU+0304 +U+0020U+0787U+07A6=U+0020a +U+0020U+0787U+07AD=U+0020eU+0304 +U+0020U+0787U+07AC=U+0020e +U+0020U+0787U+07A9=U+0020iU+0304 +U+0020U+0787U+07A8=U+0020i +U+0020U+0787U+07AF=U+0020oU+0304 +U+0020U+0787U+07AE=U+0020o +U+0020U+0787U+07AB=U+0020uU+0304 +U+0020U+0787U+07AA=U+0020u + +# THAANA ALIF APPEARING MEDIALLY WITH ANY VOWEL +# IS ROMANIZED WITH APOSTROPHE 
FOLLOWED BY THE SAME VOWEL + +U+0787U+07A7=U+0027aU+0304 +U+0787U+07A6=U+0027a +U+0787U+07AD=U+0027eU+0304 +U+0787U+07AC=U+0027e +U+0787U+07A9=U+0027iU+0304 +U+0787U+07A8=U+0027i +U+0787U+07AF=U+0027oU+0304 +U+0787U+07AE=U+0027o +U+0787U+07AB=U+0027uU+0304 +U+0787U+07AA=U+0027u + +# THAANA MEDIAL OR FINAL VOWELS OVER CONSONANTS EXCEPT ALIF +# THIS PRODUCES NO UPPERCASE VOWELS +U+07A7=aU+0304 +U+07A6=a +U+07AD=eU+0304 +U+07AC=e +U+07A9=iU+0304 +U+07A8=i +U+07AF=oU+0304 +U+07AE=o +U+07AB=uU+0304 +U+07AA=u + +# THAANA LETTER ALIF--ANY REMAINING AFTER CONVERSION MAP TO APOSTROPHE +U+0787=U+0027 diff --git a/legacy/data/KurdishRomanization.cfg b/legacy/data/KurdishRomanization.cfg new file mode 100644 index 0000000..76e0ba1 --- /dev/null +++ b/legacy/data/KurdishRomanization.cfg @@ -0,0 +1,125 @@ +# version 0.9.1 +# Original table by William Kopycki +# Last updated 08 July 2009 + +[General] +Name=Kurdish +ScriptCode=(3 +Truncation=% + +[RomanToScript] +FieldsIncluded=100 110 111 130 240 245 246 250 260 440 490 600 610 611 630 651 700 710 711 730 740 800 830 +SubfieldsAlwaysExcluded=uvxy0123456789 +OtherSubfieldsExcludedByTag=650/a 260/c 246/i +IncludeFormattingCharactersLcPattern=True + +# "Authorized" names: + +# Punctuation marks: +# %=U+066A ; cannot transliterate the truncation character +*=U+066D +,=U+060C +;=U+061B +?=U+061F + +# Numbers (these should be Arabic-Indic digits from 0660-0669. We will use 06F0-06F9 for Persian and Urdu--WK) +0=U+0660 +1=U+0661 +2=U+0662 +3=U+0663 +4=U+0664 +5=U+0665 +6=U+0666 +7=U+0667 +8=U+0668 +9=U+0669 + +# Vowels and vowel/consonant combinations +U+02BBE=U+0639U+0647U+200C +U+02BBe=U+0639U+0647U+200C +A=U+0626U+0627 +a=U+0627 +EU+0302=U+0626U+0647U+200C +eU+0302=U+06CE +E=U+0626U+0647U+200C +e=U+0647U+200C +IU+0302=U+0626U+064A + +# here is the "alif maksura" which otherwise serves as the "Persian yah U+06CC which is not valid in MARC-8 character set. 
+ +# THIS NEEDS TO BE ADJUSTED FOR "i[circumflex]y" and probably "e[circumflex]y combinations to = U+0649 + +iU+0302U+0020=U+0649 +iU+0302=U+064A +I= +i= +O=U+06C6 +o=U+06C6 +uU+0302=U+0648U+0648 +U=U+0626U+0648 +u=U+0648 + +# Consonants: +B=U+0628 +b=U+0628 +CU+0327=U+0686 +cU+0327=U+0686 +C=U+062C +c=U+062C +DU+0323=U+0636 +dU+0323=U+0636 +D=U+062F +d=U+062F +F=U+0641 +f=U+0641 +G=U+06AF +g=U+06AF +HU+0308=U+062D +hU+0308=U+062D +H=U+0647 +h=U+0647 +J=U+0698 +j=U+0698 +K=U+06A9 +k=U+06A9 +#L and l with stroke +U+0141=U+06B5 +U+0142=U+06B5 +L=U+0644 +l=U+0644 +M=U+0645 +m=U+0645 +N=U+0646 +n=U+0646 +P=U+067E +p=U+067E +Q=U+0642 +q=U+0642 +RU+0304=U+0695 +rU+0304=U+0695 +R=U+0631 +r=U+0631 +SU+0323=U+0635 +sU+0323=U+0635 +SU+0327=U+0634 +sU+0327=U+0634 +S=U+0633 +s=U+0633 +TU+0323=U+0637 +tU+0323=U+0637 +T=U+062A +t=U+062A +V=U+06A8 +v=U+06A8 +W=U+0648 +w=U+0648 +XU+0308=U+063A +xU+0308=U+063A +X=U+062E +x=U+062E +Y=U+064A +y=U+064A +Z=U+0632 +z=U+0632 + +[ScriptToRoman] diff --git a/legacy/data/PersianRomanization.cfg b/legacy/data/PersianRomanization.cfg index 5ddecbe..f647b64 100644 --- a/legacy/data/PersianRomanization.cfg +++ b/legacy/data/PersianRomanization.cfg @@ -5,13 +5,13 @@ [General] Name=Persian +ScriptCode=(3 Truncation=% [RomanToScript] FieldsIncluded=100 110 111 130 240 245 246 250 260 264 440 490 600 610 611 630 651 700 710 711 730 740 800 830 SubfieldsAlwaysExcluded=uvxy0123456789 OtherSubfieldsExcludedByTag=100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i -Subfield6Code=(3 IncludeFormattingCharactersLcPattern=True # RDA boilerplate phrases not transliterated: diff --git a/legacy/data/PushtoRomanization.cfg b/legacy/data/PushtoRomanization.cfg index ae425f1..bc17bf4 100644 --- a/legacy/data/PushtoRomanization.cfg +++ b/legacy/data/PushtoRomanization.cfg @@ -5,13 +5,13 @@ [General] Name=Pushto +ScriptCode=(3 Truncation=% [RomanToScript] FieldsIncluded=100 110 111 130 245 246 250 260 264 440 490 505 600 610 611 630 
651 700 710 711 730 740 800 830 SubfieldsAlwaysExcluded=uvxy0123456789 OtherSubfieldsExcludedByTag=100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i -Subfield6Code=(3 IncludeFormattingCharactersLcPattern=True # RDA boilerplate phrases not transliterated: diff --git a/legacy/data/UrduRomanization.cfg b/legacy/data/UrduRomanization.cfg index 58c088a..a1e3d38 100644 --- a/legacy/data/UrduRomanization.cfg +++ b/legacy/data/UrduRomanization.cfg @@ -5,13 +5,13 @@ [General] Name=Urdu +ScriptCode=(3 Truncation=% [RomanToScript] FieldsIncluded=100 110 111 130 240 245 246 250 260 264 440 490 505 600 610 611 630 651 700 710 711 730 740 800 830 SubfieldsAlwaysExcluded=uvxy0123456789 OtherSubfieldsExcludedByTag=100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i -Subfield6Code=(3 IncludeFormattingCharactersLcPattern=True # RDA boilerplate phrases not transliterated: diff --git a/scriptshifter/tables/data/divehi_thaana.yml b/scriptshifter/tables/data/divehi_thaana.yml new file mode 100644 index 0000000..24c513f --- /dev/null +++ b/scriptshifter/tables/data/divehi_thaana.yml @@ -0,0 +1,435 @@ +general: + name: Divehi (Thaana) + +roman_to_script: + map: + # COMMON COMBINING CHARACTERS (always follow a base letter) + + # combining acute \u0301 + # combining tilde \u0303 + # combining macron \u0304 + # combining dot above \u0307 + # combining dot below \u0323 + # combining diaeresis below \u0324 + # combining low line \u0332 + # combining breve below \u032E + # ayn (spacing) \u02BB + # apostrophe (spacing) \u0027 + + # REGULAR LATIN ALPHABETIC CHARACTERS TO BE CONVERTED + + # THAANA LETTER ALIF APPEARING MEDIALLY WITH ANY VOWEL IS ROMANIZED WITH APOSTROPHE + # ORIGINAL VOWEL ASSOCIATED WITH THE ALIF CANNOT BE REGENERATED DURING CONVERSION + "\u0027": "\u0787" + + # THAANA LETTER SHAVIYANI WITH SUKUN(BREVE 032E) + "H\u032E": "\u0781\u07B0" + "hh": "\u0787\u07B0\u0780" + "h\u032E": "\u0781\u07B0" + + # THAANA LETTER 
ALIFU FINAL WITH SUKUN (LOW LINE 0332) + "%H\u0332": "\u0787\u07B0" + "%h\u0332": "\u0787\u07B0" + "H": "\u0780" + "h": "\u0780" + "S\u0301": "\u0781" + "ss": "\u0787\u07B0\u0781" + "s\u0301": "\u0781" + + # THAANA LETTER MEDIAL NOONU WITHOUT SUKUN (DOT ABOVE 0307) + "M\u0307": "\u0782" + "m\u0307": "\u0782" + "mm": "\u0787\u07B0\u0789" + "N": "\u0782" + "nn": "\u0787\u07B0\u0782" + "n": "\u0782" + "R": "\u0783" + "rr": "\u0787\u07B0\u0783" + "r": "\u0783" + "B": "\u0784" + "bb": "\u0787\u07B0\u0784" + "b": "\u0784" + + # THAANA LETTER "L/l" WITH DOT BELOW (0323) + "L\u0323": "\u0785" + "l\u0323": "\u0785" + "K": "\u0786" + "kk": "\u0787\u07B0\u0786" + "k": "\u0786" + + # THAANA LETTER ALIF--OFTEN OMITTED IN ROMANIZATION (SEE VOWEL CONVERSION BELOW) + "\u0027": "\u0787" + + "V": "\u0788" + "vv": "\u0787\u07B0\u0788" + "v": "\u0788" + "M": "\u0789" + "m": "\u0789" + "F": "\u078A" + "ff": "\u0787\u07B0\u078A" + "f": "\u078A" + # THAANA LETTER "D/d" WITH DOT BELOW (0323) + "D\u0323": "\u0791" + "d\u0323": "\u0791" + "D": "\u078B" + "dd": "\u0787\u07B0\u078B" + "d": "\u078B" + # THAANA LETTER "T/t" WITH DOT BELOW (0323) + "T\u0323": "\u0793" + "t\u0323": "\u0793" + "T\u0324T": "\u078C\u07B0\u078C" + "T\u0324t": "\u078C\u07B0\u078C" + "t\u0324t": "\u078C\u07B0\u078C" + "t\u0324T": "\u078C\u07B0\u078C" + "T": "\u078C" + "tt": "\u0787\u07B0\u078C" + "t": "\u078C" + "L": "\u078D" + "ll": "\u0787\u07B0\u078D" + "l": "\u078D" + "G": "\u078E" + "gg": "\u0787\u07B0\u078E" + "g": "\u078E" + # THAANA LETTER "N/n' WITH TILDE (0303) + "N\u0303": "\u078F" + "n\u0303": "\u078F" + "S": "\u0790" + "ss": "\u0787\u07B0\u0790" + "s": "\u0790" + "Z": "\u0792" + "zz": "\u0787\u07B0\u0792" + "z": "\u0792" + "Y": "\u0794" + "yy": "\u0787\u07B0\u0794" + "y": "\u0794" + "P": "\u0795" + "p": "\u0795" + "pp": "\u0787\u07B0\u0795" + "J": "\u0796" + "jj": "\u0787\u07B0\u0796" + "j": "\u0796" + "C": "\u0797" + "cc": "\u0787\u07B0\u0797" + "c": "\u0797" + + # THAANA EXTENSION FOR ARABIC LOAN WORDS 
AND NAMES + + # THAANA EXTENSION FOR ARABIC LETTER TTAA + "TH": "\u0798" + "Th": "\u0798" + "thth": "\u0787\u07B0\u0798" + "th": "\u0798" + + # THAANA EXTENSION FOR ARABIC LETTER HHAA + "H\u0323": "\u0799" + "h\u0323": "\u0799" + + # THAANA EXTENSION FOR ARABIC LETTER KHAA + "KH": "\u079A" + "Kh": "\u079A" + "khkh": "\u0787\u07B0\u079A" + "kh": "\u079A" + + # THAANA EXTENSION FOR ARABIC LETTER THAALU + "DH": "\u079B" + "Dh": "\u079B" + "dhdh": "\u0787\u07B0\u079B" + "dh": "\u079B" + + # THAANA EXTENSION FOR ARABIC LETTER ZAA (NEWER LETTER) + "Z\u0332": "\u079C" + "z\u0332": "\u079C" + + # THAANA EXTENSION FOR ARABIC LETTER SHEENU + "SH": "\u079D" + "Sh": "\u079D" + "shsh": "\u0787\u07B0\u079D" + "sh": "\u079D" + + # THAANA EXTENSION FOR ARABIC LETTER SAADHU + "S\u0323": "\u079E" + "s\u0323": "\u079E" + + # THAANA EXTENSION FOR ARABIC LETTER TO + "T\u0324": "\u07A0" + "t\u0324": "\u07A0" + + # THAANA EXTENSION FOR ARABIC LETTER ZO + "D\u0332": "\u07A1" + "d\u0332": "\u07A1" + + # THAANA EXTENSION FOR ARABIC LETTER AINU + "\u02BB": "\u07A2" + + # THAANA EXTENSION FOR ARABIC LETTER GHAINU + "GH": "\u07A3" + "Gh": "\u07A3" + "ghgh": "\u0787\u07B0\u07A3" + "gh": "\u07A3" + + # THAANA EXTENSION FOR ARABIC LETTER QAAFU + "Q": "\u07A4" + "qq": "\u0787\u07B0\u07A4" + "q": "\u07A4" + + # THAANA EXTENSION FOR ARABIC LETTER WAAVU (NEWER LETTER) + "W": "\u07A5" + "ww": "\u0787\u07B0\u07A5" + "w": "\u07A5" + + # INITIAL (AND UPPERCASE) VOWELS THAT CONVERT + # TO ALIF FOLLOWED BY VOWEL (ALIF OMITTED IN ROMANIZATION) + "A\u0304%": "\u0787\u07A7" + "A%": "\u0787\u07A6" + "\u0020a\u0304": "\u0020\u0787\u07A7" + "\u0020a": "\u0020\u0787\u07A6" + "E\u0304%": "\u0787\u07AD" + "E%": "\u0787\u07AC" + "\u0020e\u0304": "\u0020\u0787\u07AD" + "\u0020e": "\u0020\u0787\u07AC" + "I\u0304%": "\u0787\u07A9" + "I%": "\u0787\u07A8" + "\u0020i\u0304": "\u0020\u0787\u07A9" + "\u0020i": "\u0020\u0787\u07A8" + "O\u0304%": "\u0787\u07AF" + "O%": "\u0787\u07AE" + "\u0020o\u0304": "\u0020\u0787\u07AF" + 
"\u0020o": "\u0020\u0787\u07AE" + "U\u0304%": "\u0787\u07AB" + "U%": "\u0787\u07AA" + "\u0020u\u0304": "\u0020\u0787\u07AB" + "\u0020u": "\u0020\u0787\u07AA" + + # THAANA MEDIAL OR FINAL VOWELS OVER ANY CONSONANT + # (THIS ASSUMES NO UPPERCASE VOWELS REMAIN) + + "a\u0304": "\u07A7" + "a": "\u07A6" + "e\u0304": "\u07AD" + "e": "\u07AC" + "i\u0304": "\u07A9" + "i": "\u07A8" + "o\u0304": "\u07AF" + "o": "\u07AE" + "u\u0304": "\u07AB" + "u": "\u07AA" + + # THAANA SUKUN (SILENCE) MARK; ONLY GENERATED IN OTHER COMBINATIONS + # "": "\u07B0" + +script_to_roman: + map: + + # THAANA LETTER NOONU WITHOUT SUKUN FOLLOWED BY A CONSONANT + # IS ROMANIZED AS "m"+ DOT ABOVE (0307) THEN THE CONSONANT + # OTHERWISE LETTER NOONU MAPS TO "n" + "\u0782\u0780": "m\u0307\u0780" + "\u0782\u0781": "m\u0307\u0781" + "\u0782\u0783": "m\u0307\u0783" + "\u0782\u0784": "m\u0307\u0784" + "\u0782\u0785": "m\u0307\u0785" + "\u0782\u0786": "m\u0307\u0786" + "\u0782\u0788": "m\u0307\u0788" + "\u0782\u0789": "m\u0307\u0789" + "\u0782\u078A": "m\u0307\u078A" + "\u0782\u0791": "m\u0307\u0791" + "\u0782\u078B": "m\u0307\u078B" + "\u0782\u078C": "m\u0307\u078C" + "\u0782\u078D": "m\u0307\u078D" + "\u0782\u078E": "m\u0307\u078E" + "\u0782\u078F": "m\u0307\u078F" + "\u0782\u0790": "m\u0307\u0790" + "\u0782\u0792": "m\u0307\u0792" + "\u0782\u0794": "m\u0307\u0794" + "\u0782\u0795": "m\u0307\u0795" + "\u0782\u0796": "m\u0307\u0796" + "\u0782\u0797": "m\u0307\u0797" + "\u0782\u0798": "m\u0307\u0798" + "\u0782\u0799": "m\u0307\u0799" + "\u0782\u079A": "m\u0307\u079A" + "\u0782\u079B": "m\u0307\u079B" + "\u0782\u079C": "m\u0307\u079C" + "\u0782\u079D": "m\u0307\u079D" + "\u0782\u079E": "m\u0307\u079E" + "\u0782\u07A0": "m\u0307\u07A0" + "\u0782\u07A1": "m\u0307\u07A1" + "\u0782\u07A2": "m\u0307\u07A2" + "\u0782\u07A3": "m\u0307\u07A3" + "\u0782\u07A4": "m\u0307\u07A4" + "\u0782\u07A5": "m\u0307\u07A5" + + # THAANA FINAL ALIFU WITH SUKUN (SILENCE) MARK + # IS ROMANIZED WITH "h"+LOW LINE (0332) + 
"\u0787\u07B0\u0020": "h\u0332\u0020" + + # THAANA SHAVIYANI WITH SUKUN (SILENCE) MARK + # IS ROMANIZED WITH "h"+BREVE BELOW + "\u0781\u07B0": "h\u032E" + + "\u0787\u07B0\u0780": "hh" + "\u0780": "h" + + # THAANA ALIF WITH SUKUN AND SHAVIYANI + "\u0787\u07B0\u0781": "s\u0301s\u0301" + + "\u0787\u07B0": "h\u0332" + "\u0781": "s\u0301" + + "\u0787\u07B0\u0782": "nn" + "\u0782": "n" + + "\u0787\u07B0\u0783": "rr" + "\u0783": "r" + + "\u0787\u07B0\u0784": "bb" + "\u0784": "b" + + "\u0787\u07B0\u0785": "l\u0323l\u0323" + "\u0785": "l\u0323" + + "\u0787\u07B0\u0786": "kk" + "\u0786": "k" + + "\u0787\u07B0\u0788": "vv" + "\u0788": "v" + + "\u0787\u07B0\u0789": "mm" + "\u0789": "m" + + "\u0787\u07B0\u078A": "ff" + "\u078A": "f" + + # THAANA LETTER "D/d" WITH DOT BELOW (0323) + "\u0787\u07B0\u0791": "d\u0323d\u0323" + "\u0791": "d\u0323" + + "\u0787\u07B0\u078B": "dd" + "\u078B": "d" + + # THAANA LETTER "T/t" WITH DOT BELOW (0323) + "\u078C\u07B0\u078C": "t\u0324t" + "\u0787\u07B0\u078C": "tt" + "\u0793": "t\u0323" + "\u078C": "t" + + "\u0787\u07B0\u078D": "ll" + "\u078D": "l" + + "\u0787\u07B0\u078E": "gg" + "\u078E": "g" + + # THAANA LETTER "N/n" WITH TILDE (0303) + "\u0787\u07B0\u078F": "n\u0303n\u0303" + "\u078F": "n\u0303" + + "\u0787\u07B0\u0790": "ss" + "\u0790": "s" + + "\u0787\u07B0\u0792": "zz" + "\u0792": "z" + + "\u0787\u07B0\u0794": "yy" + "\u0794": "y" + + "\u0787\u07B0\u0795": "pp" + "\u0795": "p" + + "\u0787\u07B0\u0796": "jj" + "\u0796": "j" + + "\u0787\u07B0\u0797": "cc" + "\u0797": "c" + + # THAANA EXTENSION FOR ARABIC LETTER TTAA + "\u0787\u07B0\u0798": "thth" + "\u0798": "th" + + # THAANA EXTENSION FOR ARABIC LETTER HHAA + "\u0787\u07B0\u0799": "h\u0323h\u0323" + "\u0799": "h\u0323" + + # THAANA EXTENSION FOR ARABIC LETTER KHAA + "\u0787\u07B0\u079A": "khkh" + "\u079A": "kh" + + # THAANA EXTENSION FOR ARABIC LETTER THAALU + "\u0787\u07B0\u079B": "dhdh" + "\u079B": "dh" + + # THAANA EXTENSION FOR ARABIC LETTER ZAA (NEWER LETTER) + "\u0787\u07B0\u079C": 
"z\u0332z\u0332" + "\u079C": "z\u0332" + + # THAANA EXTENSION FOR ARABIC LETTER SHEENU + "\u0787\u07B0\u079D": "shsh" + "\u079D": "sh" + + # THAANA EXTENSION FOR ARABIC LETTER SAADHU + "\u0787\u07B0\u079E": "s\u0323s\u0323" + "\u079E": "s\u0323" + + # THAANA EXTENSION FOR ARABIC LETTER TO + "\u0787\u07B0\u07A0": "t\u0324t\u0324" + "\u07A0": "t\u0324" + + # THAANA EXTENSION FOR ARABIC LETTER ZO + "\u0787\u07B0\u07A1": "d\u0332d\u0332" + "\u07A1": "d\u0332" + + # THAANA EXTENSION FOR ARABIC LETTER AINU + "\u0787\u07B0\u07A2": "\u02BB\u02BB" + "\u07A2": "\u02BB" + + # THAANA EXTENSION FOR ARABIC LETTER GHAINU + "\u0787\u07B0\u07A3": "ghgh" + "\u07A3": "gh" + + # THAANA EXTENSION FOR ARABIC LETTER QAAFU + "\u0787\u07B0\u07A4": "qq" + "\u07A4": "q" + + # THAANA EXTENSION FOR ARABIC LETTER WAAVU (NEWER LETTER) + "\u0787\u07B0\u07A5": "ww" + "\u07A5": "w" + + # INITIAL VOWELS FOLLOWING ALIF (ALIF OMITTED IN ROMANIZATION) + "\u0020\u0787\u07A7": "\u0020a\u0304" + "\u0020\u0787\u07A6": "\u0020a" + "\u0020\u0787\u07AD": "\u0020e\u0304" + "\u0020\u0787\u07AC": "\u0020e" + "\u0020\u0787\u07A9": "\u0020i\u0304" + "\u0020\u0787\u07A8": "\u0020i" + "\u0020\u0787\u07AF": "\u0020o\u0304" + "\u0020\u0787\u07AE": "\u0020o" + "\u0020\u0787\u07AB": "\u0020u\u0304" + "\u0020\u0787\u07AA": "\u0020u" + + # THAANA ALIF APPEARING MEDIALLY WITH ANY VOWEL + # IS ROMANIZED WITH APOSTROPHE FOLLOWED BY THE SAME VOWEL + + "\u0787\u07A7": "\u0027a\u0304" + "\u0787\u07A6": "\u0027a" + "\u0787\u07AD": "\u0027e\u0304" + "\u0787\u07AC": "\u0027e" + "\u0787\u07A9": "\u0027i\u0304" + "\u0787\u07A8": "\u0027i" + "\u0787\u07AF": "\u0027o\u0304" + "\u0787\u07AE": "\u0027o" + "\u0787\u07AB": "\u0027u\u0304" + "\u0787\u07AA": "\u0027u" + + # THAANA MEDIAL OR FINAL VOWELS OVER CONSONANTS EXCEPT ALIF + # THIS PRODUCES NO UPPERCASE VOWELS + "\u07A7": "a\u0304" + "\u07A6": "a" + "\u07AD": "e\u0304" + "\u07AC": "e" + "\u07A9": "i\u0304" + "\u07A8": "i" + "\u07AF": "o\u0304" + "\u07AE": "o" + "\u07AB": 
"u\u0304" + "\u07AA": "u" + + # THAANA LETTER ALIF--ANY REMAINING AFTER CONVERSION MAP TO APOSTROPHE + "\u0787": "\u0027" diff --git a/scriptshifter/tables/data/dogri_devanagari.yml b/scriptshifter/tables/data/dogri_devanagari.yml new file mode 100644 index 0000000..318dd4f --- /dev/null +++ b/scriptshifter/tables/data/dogri_devanagari.yml @@ -0,0 +1,16 @@ +general: + name: Dogri (Devanagari) + +script_to_roman: + hooks: + post_config: + - + - aksharamukha.romanizer.s2r_post_config + - src_script: "Devanagari" + +roman_to_script: + hooks: + post_config: + - + - aksharamukha.romanizer.r2s_post_config + - dest_script: "Devanagari" diff --git a/scriptshifter/tables/data/index.yml b/scriptshifter/tables/data/index.yml index 99aa976..59ee728 100644 --- a/scriptshifter/tables/data/index.yml +++ b/scriptshifter/tables/data/index.yml @@ -44,6 +44,10 @@ chuvash_cyrillic: name: Chuvash (Cyrillic) devanagari: name: Devanagari +divehi_thaana: + name: Divehi (Thaana) +dogri_devanagari: + name: Dogri (Devanagari) dungan_cyrillic: name: Dungan (Cyrillic) ethiopic: @@ -96,13 +100,15 @@ korean_names: description: Korean S2R for strings ONLY containing personal names formatted as last + first name. Separate multiple names with a comma or a center-dot (U+00B7). 
koryak_cyrillic: name: Koryak (Cyrillic) +kurdish: + name: Kurdish kyrgyz_cyrillic: name: Kyrgyz (Cyrillic) lithuanian_cyrillic: name: Lithuanian (Cyrillic) macedonian: name: Macedonian -marathi: +marathi_devanagari: name: Marathi (Devanagari) mansi_cyrillic: name: Mansi (Cyrillic) @@ -118,6 +124,10 @@ mordvin_cyrillic: name: Mordvin (Cyrillic) nenets_cyrillic: name: Nenets (Cyrillic) +newari_devanagari: + name: Newari (Devanagari) +nepali_devanagari: + name: Nepali (Devanagari) oriya: name: Oriya ossetic_cyrillic: @@ -126,17 +136,23 @@ pali: name: Pali panjabi: name: Panjabi -prakrit: +persian: + name: Persian +prakrit_devanagari: name: Prakrit (Devanagari) pulaar: name: Pulaar (Adlam) +pushto: + name: Pushto +rajasthani_devanagari: + name: Rajasthani (Devanagari) gurmukhi: name: Punjabi (Gurmukhi) romani_cyrillic: name: Romani (Cyrillic) russian: name: Russian -sanskrit: +sanskrit_devanagari: name: Sanskrit (Devanagari) serbian: name: Serbian @@ -174,6 +190,8 @@ uighur_cyrillic: name: Uighur (Cyrillic) ukrainian: name: Ukrainian +urdu: + name: Urdu uzbek_cyrillic: name: Uzbek (Cyrillic) yakut_cyrillic: diff --git a/scriptshifter/tables/data/kurdish.yml b/scriptshifter/tables/data/kurdish.yml new file mode 100644 index 0000000..1dfbea0 --- /dev/null +++ b/scriptshifter/tables/data/kurdish.yml @@ -0,0 +1,113 @@ +general: + name: Kurdish + +roman_to_script: + map: + # "Authorized" names: + + # Punctuation marks: + # "%": "\u066A" ; disabled: cannot transliterate the truncation character + "*": "\u066D" + ",": "\u060C" + ";": "\u061B" + "?": "\u061F" + + # Numbers (these should be Arabic-Indic digits from 0660-0669. 
We will use 06F0-06F9 for Persian and Urdu--WK) + "0": "\u0660" + "1": "\u0661" + "2": "\u0662" + "3": "\u0663" + "4": "\u0664" + "5": "\u0665" + "6": "\u0666" + "7": "\u0667" + "8": "\u0668" + "9": "\u0669" + + # Vowels and vowel/consonant combinations + "\u02BBE": "\u0639\u0647\u200C" + "\u02BBe": "\u0639\u0647\u200C" + "A": "\u0626\u0627" + "a": "\u0627" + "E\u0302": "\u0626\u0647\u200C" + "e\u0302": "\u06CE" + "E": "\u0626\u0647\u200C" + "e": "\u0647\u200C" + "I\u0302": "\u0626\u064A" + + # here is the "alif maksura" which otherwise serves as the "Persian yah" \u06CC which is not valid in MARC-8 character set. + + # THIS NEEDS TO BE ADJUSTED FOR "i[circumflex]y" and probably "e[circumflex]y" combinations to = \u0649 + + "i\u0302\u0020": "\u0649" + "i\u0302": "\u064A" + "I": "" + "i": "" + "O": "\u06C6" + "o": "\u06C6" + "u\u0302": "\u0648\u0648" + "U": "\u0626\u0648" + "u": "\u0648" + + # Consonants: + "B": "\u0628" + "b": "\u0628" + "C\u0327": "\u0686" + "c\u0327": "\u0686" + "C": "\u062C" + "c": "\u062C" + "D\u0323": "\u0636" + "d\u0323": "\u0636" + "D": "\u062F" + "d": "\u062F" + "F": "\u0641" + "f": "\u0641" + "G": "\u06AF" + "g": "\u06AF" + "H\u0308": "\u062D" + "h\u0308": "\u062D" + "H": "\u0647" + "h": "\u0647" + "J": "\u0698" + "j": "\u0698" + "K": "\u06A9" + "k": "\u06A9" + #L and l with stroke + "\u0141": "\u06B5" + "\u0142": "\u06B5" + "L": "\u0644" + "l": "\u0644" + "M": "\u0645" + "m": "\u0645" + "N": "\u0646" + "n": "\u0646" + "P": "\u067E" + "p": "\u067E" + "Q": "\u0642" + "q": "\u0642" + "R\u0304": "\u0695" + "r\u0304": "\u0695" + "R": "\u0631" + "r": "\u0631" + "S\u0323": "\u0635" + "s\u0323": "\u0635" + "S\u0327": "\u0634" + "s\u0327": "\u0634" + "S": "\u0633" + "s": "\u0633" + "T\u0323": "\u0637" + "t\u0323": "\u0637" + "T": "\u062A" + "t": "\u062A" + "V": "\u06A8" + "v": "\u06A8" + "W": "\u0648" + "w": "\u0648" + "X\u0308": "\u063A" + "x\u0308": "\u063A" + "X": "\u062E" + "x": "\u062E" + "Y": "\u064A" + "y": "\u064A" + "Z": "\u0632" + "z": 
"\u0632" diff --git a/scriptshifter/tables/data/marathi.yml b/scriptshifter/tables/data/marathi_devanagari.yml similarity index 100% rename from scriptshifter/tables/data/marathi.yml rename to scriptshifter/tables/data/marathi_devanagari.yml diff --git a/scriptshifter/tables/data/nepali_devanagari.yml b/scriptshifter/tables/data/nepali_devanagari.yml new file mode 100644 index 0000000..2aa20a2 --- /dev/null +++ b/scriptshifter/tables/data/nepali_devanagari.yml @@ -0,0 +1,16 @@ +general: + name: Nepali (Devanagari) + +script_to_roman: + hooks: + post_config: + - + - aksharamukha.romanizer.s2r_post_config + - src_script: "Devanagari" + +roman_to_script: + hooks: + post_config: + - + - aksharamukha.romanizer.r2s_post_config + - dest_script: "Devanagari" diff --git a/scriptshifter/tables/data/newari_devanagari.yml b/scriptshifter/tables/data/newari_devanagari.yml new file mode 100644 index 0000000..3c0d7fe --- /dev/null +++ b/scriptshifter/tables/data/newari_devanagari.yml @@ -0,0 +1,16 @@ +general: + name: Newari (Devanagari) + +script_to_roman: + hooks: + post_config: + - + - aksharamukha.romanizer.s2r_post_config + - src_script: "Devanagari" + +roman_to_script: + hooks: + post_config: + - + - aksharamukha.romanizer.r2s_post_config + - dest_script: "Devanagari" diff --git a/scriptshifter/tables/data/persian.yml b/scriptshifter/tables/data/persian.yml new file mode 100644 index 0000000..edc54dc --- /dev/null +++ b/scriptshifter/tables/data/persian.yml @@ -0,0 +1,309 @@ +general: + name: Persian + +roman_to_script: + map: + # Punctuation marks: + # "%": "\u066A" ; disabled: cannot transliterate the truncation character + "*": "\u066D" + ",": "\u060C" + ";": "\u061B" + "?": "\u061F" + + # Exceptions for specific words + # Allah + "Alla\u0304h": "\u0627\u0644\u0644\u0647" + + # Qur'an + "Qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646" + + # lillah + "lilla\u0304h": "\u0644\u0644\u0647" + + # billah + "billa\u0304h": "\u0628\u0644\u0644\u0647" + + # Rahman + "Rah\u0323ma\u0304n": 
"\u0631\u062D\u0645\u0646" + + # ta'lif + + # Ibn when it appears in the middle of a name sequence + "ibn": "\u0628\u0646" + + # Parsing "sh[dot below] as in "Ishaq [name]" + "%sh\u0323%": "\u0633\u062D" + + # Edition statements with Latin number + "Cha\u0304p-i 1": "\u0686\u0627\u067E 1" + "Cha\u0304p-i 2": "\u0686\u0627\u067E 2" + "Cha\u0304p-i 3": "\u0686\u0627\u067E 3" + "Cha\u0304p-i 4": "\u0686\u0627\u067E 4" + "Cha\u0304p-i 5": "\u0686\u0627\u067E 5" + "Cha\u0304p-i 6": "\u0686\u0627\u067E 6" + "Cha\u0304p-i 7": "\u0686\u0627\u067E 7" + "Cha\u0304p-i 8": "\u0686\u0627\u067E 8" + "Cha\u0304p-i 9": "\u0686\u0627\u067E 9" + + # Numbers (Extended Arabic \u06F0-06F9 for Persian) + # currently *not* valid MARC21 characters + # "0": "\u06F0" + # "1": "\u06F1" + # "2": "\u06F2" + # "3": "\u06F3" + # "4": "\u06F4" + # "5": "\u06F5" + # "6": "\u06F6" + # "7": "\u06F7" + # "8": "\u06F8" + # "9": "\u06F9" + + # Use Basic Arabic \u0660-0669, instead: + "0": "\u0660" + "1": "\u0661" + "2": "\u0662" + "3": "\u0663" + "4": "\u0664" + "5": "\u0665" + "6": "\u0666" + "7": "\u0667" + "8": "\u0668" + "9": "\u0669" + + # prime (\u02B9) = ZWNJ (\u200C) + "\u02B9a\u0304": "\u200C\u0622" + "\u02B9a": "\u200C\u0627" + "%\u02B9i\u0304": "\u200C\u0627\u0649" + "\u02B9i\u0304": "\u200C\u0627\u064A" + "i\u0304\u02B9a\u0304": "\u0649\u200C\u0622" + "i\u0304\u02B9a": "\u0649\u200C\u0627" + "i\u0304\u02B9u\u0304": "\u0649\u200C\u0622" + "i\u0304\u02B9u": "\u0649\u200C\u0627" + "i\u0304\u02B9": "\u0649\u200C" + "\u02B9": "\u200C" + + # Vowel and vowel/consonant combinations + # and hyphenated suffixes: + # izafah here + "%h-\u02BEi": "\u06C0" + "%h-\u02BCi": "\u06C0 " + "%-\u02BEi": "\u06C0" + "%-\u02BCi": "\u06C0" + + "%h-yi": "\u0647\u200C\u0649" + "%-yi": "\u0649" + "%yi": "\u0649" + "%\u02BEi\u0304": "\u0649" + "%\u02BCi\u0304": "\u0649" + "%i\u0304-i": "\u0649" + "%i\u0304": "\u0649" + "%ayy-i": "\u0649" + "%ay": "\u0649" + "%al-i": "\u0644" + "%-i": "" + + # Hyphenated prefixes: + "wa-": "\u0648" 
+ "bi-": "\u0628" + "al-": "\u0627\u0644" + "lil-": "\u0644\u0644" + "li-": "\u0644" + "ka-": "\u0643" + + # ayn combo + "%\u02BBa\u0304%": "\u0639\u0627" + # alif combos + # [final position] + "%a\u0304\u02BE": "\u0627\u0621" + "%a\u0304\u02BC": "\u0627\u0621" + "%a\u0304\u02BEi\u0304": "\u0627\u0626\u0649" + "%a\u0304\u02BCi\u0304": "\u0627\u0626\u0649" + # [initial position] + "A\u0304\u02BEi\u0304%": "\u0622\u0626\u064A" + "A\u0304\u02BCi\u0304%": "\u0622\u0626\u064A" + "a\u0304\u02BEi\u0304%": "\u0622\u0626\u064A" + "a\u0304\u02BCi\u0304%": "\u0622\u0626\u064A" + # [medial position] + "a\u0304\u02BEi\u0304": "\u0627\u0626\u064A" + "a\u0304\u02BCi\u0304": "\u0627\u0626\u064A" + "a\u0304\u02BEi": "\u0627\u0626" + "a\u0304\u02BCi": "\u0627\u0626" + "a\u02BEi\u0304": "\u0626\u064A" + "a\u02BCi\u0304": "\u0626\u064A" + "u\u0304\u02BEi\u0304": "\u0648\u0626\u0649" + "u\u0304\u02BCi\u0304": "\u0648\u0626\u0649" + + # A + "\u02BBA\u0304": "\u0639\u0627" + "\u02BBa\u0304": "\u0639\u0627" + "\u02BBA%": "\u0639" + "\u02BBa": "\u0639" + "A\u02BB%": "\u0627\u0639" + "a\u02BB%": "\u0627\u0639" + "A\u02BB": "\u0623\u0639" + "a\u02BB": "\u0639" + "a\u02BE": "\u0623" + "a\u02BC": "\u0623" + "\u02BEa": "\u0623" + "\u02BCa": "\u0623" + "a\u0304\u02BE": "\u0621" + "a\u0304\u02BC": "\u0621" + "A\u0304%": "\u0622" + "a\u0304%": "\u0622" + # previously an alif: + "A\u0304": "\u0622" + "a\u0304": "\u0627" + "A\u0301": "\u0649" + "a\u0301": "\u0649" + "ayy": "\u064A" + "A%": "\u0627" + "a%": "\u0627" + "A": "\u0627" + "a": "" + + # I + "\u02BBI\u0304": "\u0639\u064A" + "\u02BBi\u0304": "\u0639\u064A" + "I\u02BB%": "\u0627\u0639" + "i\u02BB%": "\u0627\u0639" + "i\u02BB": "\u0639" + "\u02BBI%": "\u0639" + "I\u02BE%": "\u0627\u0626" + "I\u02BC%": "\u0627\u0626" + "i\u02BE": "\u0626" + "i\u02BC": "\u0626" + "\u02BEi\u0304": "\u0626\u0649" + "\u02BCi\u0304": "\u0626\u0649" + "\u02BEi": "\u0626" + "\u02BCi": "\u0626" + "I\u0304%": "\u0627\u064A" + "i\u0304%": "\u0627\u064A" + "i\u0304y": 
"\u064A" + "I\u0304": "\u0627\u0649" + "i\u0304": "\u064A" + "I%": "\u0627" + "i%": "\u0627" + "I": "\u0627" + "i": "" + + # U + "\u02BEu\u0304": "\u0626\u0648" + "\u02BCu\u0304": "\u0626\u0648" + "U\u02BE%": "\u0627\u0624" + "U\u02BC%": "\u0627\u0624" + "u\u02BE": "\u0624" + "u\u02BC": "\u0624" + "U\u0304w%": "\u0627\u0628" + "u\u0304w": "\u0628" + "U\u0304": "\u0627\u0648" + "u\u0304": "\u0648" + "U%": "\u0627" + "u%": "\u0627" + "U": "\u0627" + "u": "" + + # Consonants: + "B": "\u0628" + "bb": "\u0628" + "b": "\u0628" + "P": "\u067E" + "pp": "\u067E" + "p": "\u067E" + "T\u0323": "\u0637" + "t\u0323t\u0323": "\u0637" + "t\u0323": "\u0637" + "T": "\u062A" + "tt": "\u062A" + "t": "\u062A" + "Sh": "\u0634" + "shsh": "\u0634" + "sh": "\u0634" + "S\u0323": "\u0635" + "s\u0323s\u0323": "\u0635" + "s\u0323": "\u0635" + "S\u0332": "\u062B" + "ss\u0332": "\u062B" + "s\u0332": "\u062B" + "S": "\u0633" + "ss": "\u0633" + "s": "\u0633" + "J": "\u062C" + "jj": "\u062C" + "j": "\u062C" + "Ch": "\u0686" + "chch": "\u0686" + "ch": "\u0686" + "H\u0323": "\u062D" + "h\u0323h\u0323": "\u062D" + "h\u0323": "\u062D" + "H": "\u0647" + "hh": "\u0647" + "h": "\u0647" + "Kh": "\u062E" + "khkh": "\u062E" + "kh": "\u062E" + "K": "\u06A9" + "kk": "\u06A9" + "k": "\u06A9" + # Arabic d with dot below: + "D\u0323": "\u0636" + "d\u0323d\u0323": "\u0636" + "d\u0323": "\u0636" + "D": "\u062F" + "dd": "\u062F" + "d": "\u062F" + "Zh": "\u0698" + "zhzh": "\u0698" + "zh": "\u0698" + "Z\u0323": "\u0638" + "z\u0323z\u0323": "\u0638" + "z\u0323": "\u0638" + "Z\u0324": "\u0636" + "z\u0324z\u0324": "\u0636" + "z\u0324": "\u0636" + "Z\u0332": "\u0630" + "zz\u0332": "\u0630" + "z\u0332": "\u0630" + "Z": "\u0632" + "zz": "\u0632" + "z": "\u0632" + "R": "\u0631" + "rr": "\u0631" + "r": "\u0631" + "Gh": "\u063A" + "ghgh": "\u063A" + "gh": "\u063A" + "G": "\u06AF" + "g": "\u06AF" + "F": "\u0641" + "ff": "\u0641" + "f": "\u0641" + "Q": "\u0642" + "qq": "\u0642" + "q": "\u0642" + "L": "\u0644" + "ll": "\u0644" + 
"l": "\u0644" + "M": "\u0645" + "mm": "\u0645" + "m": "\u0645" + "N": "\u0646" + "nn": "\u0646" + "n": "\u0646" + "V": "\u0648" + "vv": "\u0648" + "v": "\u0648" + "W": "\u0648" + "ww": "\u0648" + "w": "\u0648" + "Y": "\u064A" + "y": "\u064A" + + # non-Arabic consonants: + + # ain (alone) + "\u02BB": "\u0639" + + # hamza (alone in final position) + "%\u02BE": "\u0621" + "%\u02BC": "\u0621" diff --git a/scriptshifter/tables/data/prakrit.yml b/scriptshifter/tables/data/prakrit_devanagari.yml similarity index 100% rename from scriptshifter/tables/data/prakrit.yml rename to scriptshifter/tables/data/prakrit_devanagari.yml diff --git a/scriptshifter/tables/data/pushto.yml b/scriptshifter/tables/data/pushto.yml new file mode 100644 index 0000000..1dce5ea --- /dev/null +++ b/scriptshifter/tables/data/pushto.yml @@ -0,0 +1,390 @@ +general: + name: Pushto + +roman_to_script: + map: + # Punctuation marks: + # %": "\u066A # cannot transliterate the truncation character" + "*": "\u066D" + ",": "\u060C" + ";": "\u061B" + "?": "\u061F" + + # Exceptions for specific words + # Allah + "Alla\u0304h": "\u0627\u0644\u0644\u0647" + + # Qur'an + "Qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646" + + # lillah + "lilla\u0304h": "\u0644\u0644\u0647" + + # billah + "billa\u0304h": "\u0628\u0644\u0644\u0647" + + # Rahman + "Rah\u0323ma\u0304n": "\u0631\u062D\u0645\u0646" + + # ta'lif + + # Ibn when it appears in the middle of a name sequence + "ibn": "\u0628\u0646" + + # Parsing "sh[dot below] as in "Ishaq [name]" + "%sh\u0323%": "\u0633\u062D" + + # Edition statements with Latin number + "Cha\u0304p-i 1": "\u0686\u0627\u067E 1" + "Cha\u0304p-i 2": "\u0686\u0627\u067E 2" + "Cha\u0304p-i 3": "\u0686\u0627\u067E 3" + "Cha\u0304p-i 4": "\u0686\u0627\u067E 4" + "Cha\u0304p-i 5": "\u0686\u0627\u067E 5" + "Cha\u0304p-i 6": "\u0686\u0627\u067E 6" + "Cha\u0304p-i 7": "\u0686\u0627\u067E 7" + "Cha\u0304p-i 8": "\u0686\u0627\u067E 8" + "Cha\u0304p-i 9": "\u0686\u0627\u067E 9" + + # Numbers (Extended 
Arabic \u06F0-06F9 for Persian) + # currently *not* valid MARC21 characters + "# 0": "\u06F0" + "# 1": "\u06F1" + "# 2": "\u06F2" + "# 3": "\u06F3" + "# 4": "\u06F4" + "# 5": "\u06F5" + "# 6": "\u06F6" + "# 7": "\u06F7" + "# 8": "\u06F8" + "# 9": "\u06F9" + + # Use Basic Arabic \u0660-0669, instead: + "0": "\u0660" + "1": "\u0661" + "2": "\u0662" + "3": "\u0663" + "4": "\u0664" + "5": "\u0665" + "6": "\u0666" + "7": "\u0667" + "8": "\u0668" + "9": "\u0669" + + "# prime ": " ZWNJ" + "\u02B9a\u0304": "\u200C\u0622" + "\u02B9a": "\u200C\u0627" + "%\u02B9i\u0304": "\u200C\u0627\u0649" + "\u02B9i\u0304": "\u200C\u0627\u064A" + "i\u0304\u02B9": "\u0649\u200C" + "a\u0323y\u02B9": "\u06D3\u200C" + "\u02B9": "\u200C" + + # Vowel and vowel/consonant combinations + # and hyphenated suffixes: + # izafah here + "%h-\u02BEi": "\u06C0" + "%h-\u02BCi": "\u06C0 " + "%-\u02BEi": "\u06C0" + "%-\u02BCi": "\u06C0" + + "%h-yi": "\u0647\u200C\u0649" + "%-yi": "\u0649" + "%yi": "\u0649" + "%\u02BEi\u0304": "\u0649" + "%\u02BCi\u0304": "\u0649" + "%i\u0304-i": "\u0649" + "%i\u0304": "\u0649" + + "%a\u0323h": "\u06C0" + "%ayy-i": "\u0649" + "%a\u0304y": "\u0627\u0649" + "%a\u0301": "\u0649\u0670" + "%al-i": "\u0644" + "%-i": "" + + # Hyphenated prefixes: + "wa-": "\u0648" + "bi-": "\u0628" + "al-": "\u0627\u0644" + "lil-": "\u0644\u0644" + "li-": "\u0644" + "ka-": "\u0643" + + # Diphthongs here + "Ayy%": "\u0627\u064A" + "ayy%": "\u0627\u064A" + "%a\u0323y": "\u06D3" + "%ay": "\u0649" + "\u02BBAw": "\u0639\u0648" + "\u02BBaw": "\u0639\u0648" + "Aw": "\u0627\u0648" + "aw": "\u0648" + + # ayn combo + "%\u02BBa\u0304%": "\u0639\u0627" + + # alif combos + # [final position] + "%a\u0304\u02BE": "\u0627\u0621" + "%a\u0304\u02BC": "\u0627\u0621" + "%a\u0304\u02BEi\u0304": "\u0627\u0626\u0649" + "%a\u0304\u02BCi\u0304": "\u0627\u0626\u0649" + # [initial position] + "A\u0304\u02BEi\u0304%": "\u0622\u0626\u064A" + "A\u0304\u02BCi\u0304%": "\u0622\u0626\u064A" + "a\u0304\u02BEi\u0304%": 
"\u0622\u0626\u064A" + "a\u0304\u02BCi\u0304%": "\u0622\u0626\u064A" + + # [medial position] + "a\u0304\u02BEi\u0304": "\u0627\u0626\u064A" + "a\u0304\u02BCi\u0304": "\u0627\u0626\u064A" + "a\u0304\u02BEi": "\u0627\u0626" + "a\u0304\u02BCi": "\u0627\u0626" + "a\u02BEi\u0304": "\u0626\u064A" + "a\u02BCi\u0304": "\u0626\u064A" + + #a [macron] hamza followed by e + "a\u0304\u02BEe": "\u0627\u0626\u064A" + "a\u0304\u02BCe": "\u0627\u0626\u064A" + + # s[dot below]h + "S\u0323h": "\u069A" + "s\u0323hs\u0323h": "\u069A" + "s\u0323h": "\u069A" + + # A + "\u02BBA\u0304": "\u0639\u0627" + "\u02BBa\u0304": "\u0639\u0627" + "\u02BBA%": "\u0639" + "\u02BBa": "\u0639" + "A\u02BB": "\u0623\u0639" + "a\u02BB": "\u0639" + "a\u02BE": "\u0623" + "a\u02BC": "\u0623" + "\u02BEa": "\u0623" + "\u02BCa": "\u0623" + "a\u0304\u02BE": "\u0621" + "a\u0304\u02BC": "\u0621" + "A\u0304%": "\u0622" + "a\u0304%": "\u0622" + "A\u0304": "\u0627" + "a\u0304": "\u0627" + "A\u0301": "\u0649" + "a\u0301": "\u0649" + "ayy": "\u064A" + + # heh hamzah at end + "a\u0323h": "\u06C0" + + "A%": "\u0627" + "a%": "\u0627" + "A": "" + "a": "" + + # E + "%e": "\u06D0" + "E%": "\u0627\u064A" + "e%": "\u0627\u064A" + "e": "\u06D0" + + # I + "I\u02BB%": "\u0627\u0639" + "i\u02BB%": "\u0627\u0639" + "i\u02BB": "\u0639" + "\u02BBI%": "\u0639" + "I\u02BE%": "\u0627\u0626" + "I\u02BC%": "\u0627\u0626" + "i\u02BE": "\u0626" + "i\u02BC": "\u0626" + "\u02BEi": "\u0626" + "\u02BCi": "\u0626" + "I\u0304%": "\u0627\u064A" + "i\u0304%": "\u0627\u064A" + "i\u0304y": "\u064A" + "i\u0304": "\u064A" + "I%": "\u0627" + "i%": "\u0627" + "I": "" + "i": "" + + # O + "o%": "\u0627\u0648" + "O%": "\u0627\u0648" + "o": "\u0648" + + # U + # (u [macron] hamza combos) + "u\u0304\u02BE": "\u0624" + "u\u0304\u02BC": "\u0624" + + "\u02BEu\u0304": "\u0626\u0648" + "\u02BCu\u0304": "\u0626\u0648" + "U\u02BE%": "\u0627\u0624" + "U\u02BC%": "\u0627\u0624" + "u\u02BE": "\u0624" + "u\u02BC": "\u0624" + "U\u0304w%": "\u0627\u0628" + "u\u0304w": 
"\u0628" + "U\u0304": "\u0627\u0648" + "u\u0304": "\u0648" + "U%": "\u0627" + "u%": "\u0627" + "U": "" + "u": "" + + # Consonants: + "B": "\u0628" + "bb": "\u0628" + "b": "\u0628" + "P": "\u067E" + "pp": "\u067E" + "p": "\u067E" + + "T\u0324": "\u0637" + "t\u0324t\u0324": "\u0637" + "t\u0324": "\u0637" + + "T\u0323": "\u067C" + "t\u0323t\u0323": "\u067C" + "t\u0323": "\u067C" + + "T": "\u062A" + "tt": "\u062A" + "t": "\u062A" + + # s[dot]h used to be here + + "Sh": "\u0634" + "shsh": "\u0634" + "sh": "\u0634" + + "S\u0323": "\u0635" + "s\u0323s\u0323": "\u0635" + "s\u0323": "\u0635" + + "S\u0332": "\u062B" + "s\u0332s\u0332": "\u062B" + "s\u0332": "\u062B" + + "S\u0307": "\u0685" + "s\u0307s\u0307": "\u0685" + "s\u0307": "\u0685" + + "S": "\u0633" + "ss": "\u0633" + "s": "\u0633" + "J": "\u062C" + "jj": "\u062C" + "j": "\u062C" + "Ch": "\u0686" + "chch": "\u0686" + "ch": "\u0686" + "H\u0323": "\u062D" + "h\u0323h\u0323": "\u062D" + "h\u0323": "\u062D" + "H": "\u0647" + "hh": "\u0647" + "h": "\u0647" + "Kh": "\u062E" + "khkh": "\u062E" + "kh": "\u062E" + + "K": "\u06A9" + "kk": "\u06A9" + "k": "\u06A9" + + # particle da + "Da\u0020": "\u062F\u0020" + "da\u0020": "\u062F\u0020" + + "D\u0323": "\u0689" + "d\u0323d\u0323": "\u0689" + "d\u0323": "\u0689" + + "D": "\u062F" + "dd": "\u062F" + "d": "\u062F" + + "Z\u0323h": "\u0696" + "z\u0323hz\u0323h": "\u0696" + "z\u0323h": "\u0696" + + "Zh": "\u0698" + "zhzh": "\u0698" + "zh": "\u0698" + + "Z\u0324": "\u0638" + "z\u0324z\u0324": "\u0638" + "z\u0324": "\u0638" + + "Z\u0323": "\u0636" + "z\u0323z\u0323": "\u0636" + "z\u0323": "\u0636" + + "Z\u0332": "\u0630" + "z\u0332z\u0332": "\u0630" + "z\u0332": "\u0630" + + "Z\u0307": "\u0681" + "z\u0307z\u0307": "\u0681" + "z\u0307": "\u0681" + + "Z": "\u0632" + "zz": "\u0632" + "z": "\u0632" + + "R\u0323": "\u0693" + "r\u0323r\u0323": "\u0693" + "r\u0323": "\u0693" + + "R": "\u0631" + "rr": "\u0631" + "r": "\u0631" + + "Gh": "\u063A" + "ghgh": "\u063A" + "gh": "\u063A" + + "G": 
"\u06AB" + "gg": "\u06AB" + "g": "\u06AB" + + "F": "\u0641" + "ff": "\u0641" + "f": "\u0641" + + "Q": "\u0642" + "qq": "\u0642" + "q": "\u0642" + + "L": "\u0644" + "ll": "\u0644" + "l": "\u0644" + + "M": "\u0645" + "mm": "\u0645" + "m": "\u0645" + + "N\u0323": "\u06BC" + "n\u0323n\u0323": "\u06BC" + "n\u0323": "\u06BC" + + "N": "\u0646" + "nn": "\u0646" + "n": "\u0646" + + "W": "\u0648" + "ww": "\u0648" + "w": "\u0648" + + "Y": "\u064A" + "y": "\u064A" + + # non-Arabic consonants: + + # ain (alone) + "\u02BB": "\u0639" + + # hamza (alone in final position) + "%\u02BE": "\u0621" + "%\u02BC": "\u0621" diff --git a/scriptshifter/tables/data/rajasthani_devanagari.yml b/scriptshifter/tables/data/rajasthani_devanagari.yml new file mode 100644 index 0000000..061130e --- /dev/null +++ b/scriptshifter/tables/data/rajasthani_devanagari.yml @@ -0,0 +1,16 @@ +general: + name: Rajasthani (Devanagari) + +script_to_roman: + hooks: + post_config: + - + - aksharamukha.romanizer.s2r_post_config + - src_script: "Devanagari" + +roman_to_script: + hooks: + post_config: + - + - aksharamukha.romanizer.r2s_post_config + - dest_script: "Devanagari" diff --git a/scriptshifter/tables/data/sanskrit.yml b/scriptshifter/tables/data/sanskrit_devanagari.yml similarity index 100% rename from scriptshifter/tables/data/sanskrit.yml rename to scriptshifter/tables/data/sanskrit_devanagari.yml diff --git a/scriptshifter/tables/data/urdu.yml b/scriptshifter/tables/data/urdu.yml new file mode 100644 index 0000000..ba50220 --- /dev/null +++ b/scriptshifter/tables/data/urdu.yml @@ -0,0 +1,463 @@ +general: + name: Urdi + +roman_to_script: + map: + # Punctuation marks: + # "%": "\u066A"; cannot transliterate the truncation character + "*": "\u066D" + ",": "\u060C" + ";": "\u061B" + "?": "\u061F" + + # Exceptions for specific words + # Allah + "Alla\u0304h": "\u0627\u0644\u0644\u0647" + "alla\u0304h": "\u0627\u0644\u0644\u0647" + + # Qur'an + "Qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646" + 
"qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646" + + # aur (with spaces) + " aur ": " \u0627\u0648\u0631 " + "Aur ": "\u0627\u0648\u0631 " + + #### + # Abdurrahman + "\u02BBAbdurrah\u0323ma\u0301n": "\u0639\u0628\u062F\u0627\u0644\u0631\u062D\u0645\u0670\u0646" + "\u02BBAbdurrah\u0323ma\u0304n": "\u0639\u0628\u062F\u0627\u0644\u0631\u062D\u0645\u0670\u0646" + + # Abd names + "\u02BBAbdul\u02BB": "\u0639\u0628\u062F\u0627\u0644\u0639" + "\u02BBAbdula": "\u0639\u0628\u062F\u0627\u0644" + "\u02BBAbdulb": "\u0639\u0628\u062F\u0627\u0644\u0628" + "\u02BBAbdulf": "\u0639\u0628\u062F\u0627\u0644\u0641" + "\u02BBAbdulg\u0332h\u0332": "\u0639\u0628\u062F\u0627\u0644\u063A" + "\u02BBAbdulh\u0323": "\u0639\u0628\u062F\u0627\u0644\u062D" + "\u02BBAbdulh": "\u0639\u0628\u062F\u0627\u0644\u0647 " + "\u02BBAbdulj": "\u0639\u0628\u062F\u0627\u0644\u062C" + "\u02BBAbdulk\u0332h\u0332": "\u0639\u0628\u062F\u0627\u0644\u062E" + "\u02BBAbdulk": "\u0639\u0628\u062F\u0627\u0644\u0643 " + "\u02BBAbdulm": "\u0639\u0628\u062F\u0627\u0644\u0645" + "\u02BBAbdulq": "\u0639\u0628\u062F\u0627\u0644\u0642" + "\u02BBAbdulv": "\u0639\u0628\u062F\u0627\u0644\u0648" + "\u02BBAbdunn": "\u0639\u0628\u062F\u0627\u0644\u0646" + "\u02BBAbdurr": "\u0639\u0628\u062F\u0627\u0644\u0631" + "\u02BBAbdus\u0323s\u0323": "\u0639\u0628\u062F\u0627\u0644\u0325" + "\u02BBAbduss": "\u0639\u0628\u062F\u0627\u0644\u0633" + "\u02BBAbdushsh": "\u0639\u0628\u062F\u0627\u0644\u0634" + "\u02BBAbdutt": "\u0639\u0628\u062F\u0627\u0644\u062A" + "\u02BBAbduz\u0323z\u0323": "\u0639\u0628\u062F\u0627\u0644\u0636" + "\u02BBAbduz\u0324z\u0324": "\u0639\u0628\u062F\u0627\u0644\u0638" + + # Abu names + "Abu\u0304 ": "\u0627\u0628\u0648\u0020" + "Abu\u0304": "\u0627\u0628\u0648\u200C\u0627\u0644" + + #### + + #lillah + "lilla\u0304h": "\u0644\u0644\u0647" + + #billah + "billa\u0304h": "\u0628\u0644\u0644\u0647" + + # Rahman + "Rah\u0323ma\u0304n": "\u0631\u062D\u0645\u0646" + + # Nuzhat + "Nuzhat": "\u0646\u0632\u0647\u062A" + + # 
Uddin names + "%i\u0304uddi\u0304n": "\u0649\u200C\u0627\u0644\u062F\u0651\u064A\u0646" + "%uddi\u0304n": "\u200C\u0627\u0644\u062F\u0651\u064A\u0646" + + # ta'lif + + # Ibn when it appears in the middle of a name sequence + "ibn": "\u0628\u0646" + + # Abbreviated name elements + "# Ae": "\u0627\u06D2" + + # Parsing "sh[dot below] as in "Ishaq [name]" + "%sh\u0323%": "\u0633\u062D" + + # Numbers (\u06F0-06F9 for Persian/Urdu) + # currently *not* valid MARC21 characters + "# 0": "\u06F0" + "# 1": "\u06F1" + "# 2": "\u06F2" + "# 3": "\u06F3" + "# 4": "\u06F4" + "# 5": "\u06F5" + "# 6": "\u06F6" + "# 7": "\u06F7" + "# 8": "\u06F8" + "# 9": "\u06F9" + + # Postpositions + + # Aspirates [06BE] vs. heh [062D] combinations + "bh\u0323": "\u0628\u062D" + "Bh": "\u0628\u06BE" + "bh": "\u0628\u06BE" + + "ph\u0323": "\u067E\u062D" + "Ph": "\u067E\u06BE" + "ph": "\u067E\u06BE" + + "th\u0323": "\u062A\u062D" + "Th": "\u062A\u06BE" + "th": "\u062A\u06BE" + + "t\u0323h\u0323": "\u0679\u062D" + "T\u0323h": "\u0679\u06BE" + "t\u0323h": "\u0679\u06BE" + + "jh\u0323": "\u062C\u062D" + "Jh": "\u062C\u06BE" + "jh": "\u062C\u06BE" + + "ch\u0323": "\u0686\u062D" + "Ch": "\u0686\u06BE" + "ch": "\u0686\u06BE" + + "dh\u0323": "\u062F\u062D" + "Dh": "\u062F\u06BE" + "dh": "\u062F\u06BE" + + "d\u0323h\u0323": "\u0688\u062D" + "D\u0323h": "\u0688\u06BE" + "d\u0323h": "\u0688\u06BE" + + "r\u0323h\u0323": "\u0691\u062D" + "R\u0323h": "\u0691\u06BE" + "r\u0323h": "\u0691\u06BE" + + "kh\u0323": "\u06A9\u062D" + "Kh": "\u06A9\u06BE" + "kh": "\u06A9\u06BE" + + "gh\u0323": "\u06AF\u062D" + "Gh": "\u06AF\u06BE" + "gh": "\u06AF\u06BE" + + # prime = ZWNJ" + "\u02B9A\u0304": "\u200C\u0622" + "\u02B9a\u0304": "\u200C\u0622" + "a\u0304\u02BC\u02B9": "\u0627\u0621\u200C" + "i\u0304\u02B9": "\u0649\u200C" + "\u02B9": "\u200C" + + # Izafah here + "%a\u0304-yi": "\u0627\u0626\u06D2" + "%u\u0304-yi": "\u0648\u0626\u06D2" + "%o-yi": "\u0648\u0626\u06D2" + "%e-yi": "\u06D2" + "%i\u0304-yi": "\u0649" + "%h-yi": 
"\u06C0" + "%-yi": "\u06C0" + "%al-i": "\u0644" + "%ul-i": "\u0644" + "%-i": "" + + # Hyphenated prefixes: + "bi-": "\u0628" + "al-a\u0304%": "\u0627\u0644\u0627" + "ul-a\u0304%": "\u0627\u0644\u0627" + "al-": "\u0627\u0644" + "ul-": "\u0627\u0644" + "lil-i": "\u0644\u0644" + "lil-": "\u0644\u0644" + + # al-/ul- plus sun letters + "ar-r": "\u0627\u0644\u0631" + "ur-r": "\u0627\u0644\u0631" + "ar-R": "\u0627\u0644\u0631" + "ur-R": "\u0627\u0644\u0631" + "az\u0332-z\u0332": "\u0627\u0644\u0630" + "uz\u0332-z\u0332": "\u0627\u0644\u0630" + "az\u0332-Z\u0332": "\u0627\u0644\u0630" + "uz\u0332-Z\u0332": "\u0627\u0644\u0630" + "ad-d": "\u0627\u0644\u0627" + "ud-d": "\u0627\u0644\u0627" + "ad-D": "\u0627\u0644\u0627" + "ud-D": "\u0627\u0644\u0627" + "as\u0332-s\u0332": "\u0627\u0644\u062B" + "us\u0332-s\u0332": "\u0627\u0644\u062B" + "as\u0332-S\u0332": "\u0627\u0644\u062B" + "us\u0332-S\u0332": "\u0627\u0644\u062B" + "at-t": "\u0627\u0644\u062A" + "ut-t": "\u0627\u0644\u062A" + "at-T": "\u0627\u0644\u062A" + "ut-T": "\u0627\u0644\u062A" + "an-n": "\u0627\u0644\u0646" + "un-n": "\u0627\u0644\u0646" + "an-N": "\u0627\u0644\u0646" + "un-N": "\u0627\u0644\u0646" + "al-l": "\u0627\u0644\u0644" + "ul-l": "\u0627\u0644\u0644" + "al-L": "\u0627\u0644\u0644" + "ul-L": "\u0627\u0644\u0644" + "az\u0324-z\u0324": "\u0627\u0644\u0638" + "uz\u0324-z\u0324": "\u0627\u0644\u0638" + "az\u0324-Z\u0324": "\u0627\u0644\u0638" + "uz\u0324-Z\u0324": "\u0627\u0644\u0638" + "at\u0324-t\u0324": "\u0627\u0644\u0637" + "ut\u0324-t\u0324": "\u0627\u0644\u0637" + "at\u0324-T\u0324": "\u0627\u0644\u0637" + "ut\u0324-T\u0324": "\u0627\u0644\u0637" + "az\u0323-z\u0323": "\u0627\u0644\u0636" + "uz\u0323-z\u0323": "\u0627\u0644\u0636" + "az\u0323-Z\u0323": "\u0627\u0644\u0636" + "uz\u0323-Z\u0323": "\u0627\u0644\u0636" + "as\u0323-s\u0323": "\u0627\u0644\u0635" + "us\u0323-s\u0323": "\u0627\u0644\u0635" + "as\u0323-S\u0323": "\u0627\u0644\u0635" + "us\u0323-S\u0323": "\u0627\u0644\u0635" + "ash-sh": 
"\u0627\u0644\u0634" + "ush-sh": "\u0627\u0644\u0634" + "ash-Sh": "\u0627\u0644\u0634" + "ush-Sh": "\u0627\u0644\u0634" + "as-s": "\u0627\u0644\u0633" + "us-s": "\u0627\u0644\u0633" + "as-S": "\u0627\u0644\u0633" + "us-S": "\u0627\u0644\u0633" + "az-z": "\u0627\u0644\u0632" + "uz-z": "\u0627\u0644\u0632" + "az-Z": "\u0627\u0644\u0632" + "uz-Z": "\u0627\u0644\u0632" + + # Diphthongs here + "Ae": "\u0627\u06D2" + "%ai": "\u06D2" + "Ai": "\u0627\u064A" + "ai%": "\u0627\u064A" + "ai": "\u064A" + "\u02BBAu": "\u0639\u0648" + "\u02BBau": "\u0639\u0648" + "Au": "\u0627\u0648" + "au": "\u0648" + + # ayn-alif combo + "%\u02BBa\u0304\u02BE": "\u0639\u0627\u0621" + "%\u02BBa\u0304\u02BC": "\u0639\u0627\u0621" + "%\u02BBa\u0304%": "\u0639\u0627" + + # hamza and vowel combo + # [in final position] + "%u\u0304\u02BEi\u0304": "\u0648\u0626\u0649" + "%u\u0304\u02BCi\u0304": "\u0648\u0626\u0649" + "%\u02BEi\u0304": "\u0626\u0649" + "%\u02BCi\u0304": "\u0626\u0649" + "%\u02BEe": "\u0626\u06D2" + "%\u02BCe": "\u0626\u06D2" + + "%\u02BEu\u0304": "\u0624" + "%\u02BCu\u0304": "\u0624" + "%\u02BEo": "\u0624" + "%\u02BCo": "\u0624" + + # [in medial position] + "a\u02BEa": "\u0623" + "a\u02BCa": "\u0623" + "a\u0304\u02BEa": "\u0627\u0621" + "a\u0304\u02BCa": "\u0627\u0621" + "a\u02BEa\u0304": "\u0622" + "a\u02BCa\u0304": "\u0622" + "o\u02BEi\u0304": "\u0648\u0626\u064A" + "o\u02BCi\u0304": "\u0648\u0626\u064A" + "o\u02BEi": "\u0648\u0626" + "o\u02BCi": "\u0648\u0626" + "\u02BEi\u0304": "\u0626\u064A" + "\u02BCi\u0304": "\u0626\u064A" + "\u02BEi": "\u0626" + "\u02BCi": "\u0626" + "\u02BEe": "\u0626\u064A" + "\u02BCe": "\u0626\u064A" + + "\u02BEu\u0304": "\u0624" + "\u02BCu\u0304": "\u0624" + "u\u0304\u02BE": "\u0624" + "u\u0304\u02BC": "\u0624" + "\u02BEo": "\u0624" + "\u02BCo": "\u0624" + "o\u02BE": "\u0624" + "o\u02BC": "\u0624" + "au\u02BE": "\u0624" + "au\u02BC": "\u0624" + + "\u02BEa": "\u0626" + "\u02BCa": "\u0626" + + "%i\u0304": "\u0649" + "%a\u0301": "\u0649\u0670" + + # A + 
"\u02BBA\u0304": "\u0639\u0627" + "\u02BBa\u0304": "\u0639\u0627" + "\u02BBA%": "\u0639" + "\u02BBa": "\u0639" + "A\u02BB": "\u0627\u0639" + "a\u02BB%": "\u0627\u0639" + "a\u02BB": "\u0639" + "A\u0304%": "\u0622" + "a\u0304%": "\u0622" + "a\u0304": "\u0627" + "a\u0301": "\u0649" + "ayy": "\u064A\u0651" + "A%": "\u0627" + "a%": "\u0627" + "A": "" + "a": "" + + # E + "%e": "\u06D2" + "E%": "\u0627\u064A" + "e%": "\u0627\u064A" + "e": "\u064A" + + # I + "\u02BBI\u0304": "\u0639\u064A" + "\u02BBi\u0304": "\u0639\u064A" + "I\u02BB": "\u0627\u0639" + "i\u02BB": "\u0639" + "\u02BBI": "\u0639" + "I\u0304%": "\u0627\u064A" + "i\u0304%": "\u0627\u064A" + "i\u0304y": "\u064A" + "i\u0304": "\u064A" + "iyy": "\u064A\u0651" + "I%": "\u0627" + "i%": "\u0627" + "I": "\u0627" + "i": "" + + # O + "O%": "\u0627\u0648" + "o": "\u0648" + + # U + "\u02BBu\u0304": "\u0639\u0648" + "\u02BBU": "\u0639" + "\u02BBu": "\u0639" + "U\u0304%": "\u0627\u0648" + "u\u0304%": "\u0627\u0648" + "u\u0304": "\u0648" + "U%": "\u0627" + "u%": "\u0627" + "U": "" + "u": "" + + # Consonants: + "B": "\u0628" + "bb": "\u0628\u0651" + "b": "\u0628" + "P": "\u067E" + "pp": "\u067E\u0651" + "p": "\u067E" + "T\u0323": "\u0679" + "t\u0323t\u0323": "\u0679\u0651" + "t\u0323": "\u0679" + "T\u0324": "\u0637" + "t\u0324t\u0324": "\u0637\u0651" + "t\u0324": "\u0637" + "T": "\u062A" + "tt": "\u062A\u0651" + "t": "\u062A" + "Sh": "\u0634" + "shsh": "\u0634\u0651" + "sh": "\u0634" + "S\u0323": "\u0635" + "s\u0323s\u0323": "\u0635\u0651" + "s\u0323": "\u0635" + "S\u0332": "\u062B" + "s\u0332s\u0332": "\u062B\u0651" + "s\u0332": "\u062B" + "S": "\u0633" + "ss": "\u0633\u0651" + "s": "\u0633" + "J": "\u062C" + "jj": "\u062C\u0651" + "j": "\u062C" + "C": "\u0686" + "cc": "\u0686\u0651" + "c": "\u0686" + "H\u0323": "\u062D" + "h\u0323h\u0323": "\u062D\u0651" + "h\u0323": "\u062D" + "H": "\u0647" + "hh": "\u0647\u0651" + "h": "\u0647" + "K\u0332h\u0332": "\u062E" + "k\u0332h\u0332k\u0332h\u0332": "\u062E\u0651" + 
"k\u0332h\u0332": "\u062E" + "K": "\u06A9" + "kk": "\u06A9\u0651" + "k": "\u06A9" + "D\u0323": "\u0688" + "d\u0323d\u0323": "\u0688\u0651" + "d\u0323": "\u0688" + "D": "\u062F" + "dd": "\u062F\u0651" + "d": "\u062F" + "Z\u0324": "\u0638" + "z\u0324z\u0324": "\u0638\u0651" + "z\u0324": "\u0638" + "Z\u0323": "\u0636" + "z\u0323z\u0323": "\u0636\u0651" + "z\u0323": "\u0636" + "Z\u0332": "\u0630" + "z\u0332z\u0332": "\u0630\u0651" + "z\u0332": "\u0630" + "zz": "\u0632\u0651" + "Zh": "\u0698" + "zhzh": "\u0698\u0651" + "zh": "\u0698" + "Z": "\u0632" + "z": "\u0632" + "R\u0323": "\u0691" + "r\u0323r\u0323": "\u0691\u0651" + "r\u0323": "\u0691" + "R": "\u0631" + "rr": "\u0631\u0651" + "r": "\u0631" + "G\u0332h\u0332": "\u063A" + "g\u0332h\u0332g\u0332h\u0332": "\u063A\u0651" + "g\u0332h\u0332": "\u063A" + "G": "\u06AF" + "gg": "\u06AF\u0651" + "g": "\u06AF" + "F": "\u0641" + "ff": "\u0641\u0651" + "f": "\u0641" + "Q": "\u0642" + "qq": "\u0642\u0651" + "q": "\u0642" + "L": "\u0644" + "ll": "\u0644\u0651" + "l": "\u0644" + "M": "\u0645" + "mm": "\u0645\u0651" + "m": "\u0645" + "N\u0332": "\u06BA" + "n\u0332n\u0332": "\u06BA\u0651" + "n\u0332": "\u06BA" + "N": "\u0646" + "nn": "\u0646\u0651" + "n": "\u0646" + "V": "\u0648" + "vv": "\u0648\u0651" + "v": "\u0648" + "Y": "\u064A" + "yy": "\u064A\u0651" + "y": "\u064A" + + # ain (alone) + "\u02BB": "\u0639" + + # hamza (alone in final position) + "%\u02BE": "\u0621" + "%\u02BC": "\u0621" diff --git a/tests/data/script_samples/tibetan.csv b/tests/data/script_samples/tibetan.csv new file mode 100644 index 0000000..d976a0e --- /dev/null +++ b/tests/data/script_samples/tibetan.csv @@ -0,0 +1,5 @@ +tibetan,བྱང་ཕྱོགས་བསྟན་འགྲོའི་སྐྱབས་མགོན་ཐམས་ཅད་མཁྱེན་པ་ཁལ་ཁ་ཨེར་ཏེ་ནེ་ཁུ་ཐག་ཐུ་བློ་བཟང་བསྟན་འཛིན་རྒྱལ་མཚན་གྱིའི་གསུང་འབུམ།,Byang phyogs bstan ‘gro’i skyabs mgon Thams-cad-mkhyen-pa Khal-kha Er-te-ne Khu-thag-thu Blo-bzang-bstan-‘dzin-rgyal-mtshan gyi’i gsung ʼbum,{"capitalize": "first"}, 
+tibetan,རྗེ་བཙུན་དམ་པ་སྐུ་ཕྲེང་བརྒྱད་པའི་གསུང་འབུམ,Rje-btsun-dam-pa sku phreng brgyad paʼi gsung ʼbum,{"capitalize": "first"}, +tibetan,རྗེ་བཙུན་ཐམས་ཅད་མཁྱེན་པ་དགེ་འདུན་རྒྱ་མཚོའི་གསུང་འབུམ་བཞུགས་སོ་,Rje-btsun Thams-cad-mkhyen-pa Dge-ʼdun-rgya-mtshoʼi gsung ʼbum bzhugs so,{"capitalize": "first"}, +tibetan,སྒྲུབ་ཐབས་འདོད་འཇོའི་བུམ་བཟང་གི་བརྒྱུད་པའི་རིམ་པ་ཕྱོགས་གཅིག་ཏུ་བསྡེབས་པ་བཞུགས་སོ།,Sgrub thabs ʼdod ʼjoʼi bum bzang gi brgyud paʼi rim pa phyogs gcig tu bsdebs pa bzhugs so,{"capitalize": "first"}, +tibetan,བཀའ་གདམས་ཀྱི་སྐྱེས་བུ་དམ་པ་རྣམས་ཀྱི་གསུང་བགྲོས་ཐོར་བུ་རྣམས་བཞུགས་སོ།,Bkaʼ gdams kyi skyes bu dam pa rnams kyi gsung bgros thor bu rnams bzhugs so,{"capitalize": "first"},