diff --git a/cyrtranslit/__init__.py b/cyrtranslit/__init__.py
index 387d0a1..7ccc732 100644
--- a/cyrtranslit/__init__.py
+++ b/cyrtranslit/__init__.py
@@ -135,11 +135,11 @@ def to_cyrillic(string_to_transliterate, lang_code='sr'):
                     (c in u'Šš' and c_plus_1 in u'č') # šč
                )) or \
                (lang_code == "mn" and (
-                    (c in u'Kk' and c_plus_1 == u'h') or # Х х
-                    (c in u'Ss' and c_plus_1 == u'h') or # Ш ш
-                    (c in u'Tt' and c_plus_1 == u's') or # Ц ц
-                    (c in u'Cc' and c_plus_1 == u'h') or # Ч ч
-                    (c in u'Yy' and c_plus_1 in u'eoua') # Е Ё Ю Я
+                    (c in u'Kk' and c_plus_1 in u'Hh') or # Х х
+                    (c in u'Ss' and c_plus_1 in u'Hh') or # Ш ш
+                    (c in u'Tt' and c_plus_1 in u'Ss') or # Ц ц
+                    (c in u'Cc' and c_plus_1 in u'Hh') or # Ч ч
+                    (c in u'Yy' and c_plus_1 in u'EeOoUuAa') # Е Ё Ю Я
                )):
                 index += 1
                 c += c_plus_1
diff --git a/cyrtranslit/mapping.py b/cyrtranslit/mapping.py
index b814f8b..41d993d 100644
--- a/cyrtranslit/mapping.py
+++ b/cyrtranslit/mapping.py
@@ -303,28 +303,44 @@
     u"Ф", u"F", u"ф", u"f",
     u"К", u"K", u"к", u"k",
     u"Х", u"Kh", u"х", u"kh", # lat 1
+    u"Х", u"KH", u"х", u"kH", # lat 1
     u"Г", u"G", u"г", u"g",
     u"С", u"S", u"с", u"s",
     u"Ш", u"Sh", u"ш", u"sh", # sh # lat2
+    u"Ш", u"SH", u"ш", u"sH", # sh # lat2
     u"Т", u"T", u"т", u"t",
     u"Д", u"D", u"д", u"d",
     u"Ц", u"Ts", u"ц", u"ts", # lat3
+    u"Ц", u"TS", u"ц", u"tS", # lat3
     u"Ч", u"Ch", u"ч", u"ch", # lat4
+    u"Ч", u"CH", u"ч", u"cH", # lat4
     u"З", u"Z", u"з", u"z",
     u"Ж", u"J", u"ж", u"j",
     u"Й", u"I", u"й", u"i", # i * 2
     u"Р", u"R", u"р", u"r",
     u"Б", u"B", u"б", u"b",
     u"Е", u"Ye", u"е", u"ye", # lat 5
+    u"Е", u"YE", u"е", u"yE", # lat 5
     u"Ё", u"Yo", u"ё", u"yo", # lat 6
-    u"Щ", u"Sh", u"щ", u"sh", # sh x 2 # lat 7
+    u"Ё", u"YO", u"ё", u"yO", # lat 6
+    #u"Щ", u"Sh", u"щ", u"sh", # sh x 2 # lat 7 # FIXME: How to handle the two possible cyrillic versions of Sh? Ш or Щ?
+    #u"Щ", u"SH", u"щ", u"sH", # sh x 2 # lat 7 # FIXME: How to handle the two possible cyrillic versions of Sh? Ш or Щ?
     u"Ъ", u"I", u"ъ", u"i", # i * 3
     u"Ы", u"Y", u"ы", u"y",
     u"Ь", u"I", u"ь", u"i", # i * 4
     u"Ю", u"Yu", u"ю", u"yu", # lat 8
+    u"Ю", u"YU", u"ю", u"yU", # lat 8
     u"Я", u"Ya", u"я", u"ya", # lat 9
+    u"Я", u"YA", u"я", u"yA", # lat 9
 ]
-MN_CYR_TO_LAT_DICT = dict([(c, l) for c, l in zip(MN_CYR_LAT_LIST[::2], MN_CYR_LAT_LIST[1::2])])
+# Skip the digraph aliases whose second Latin letter is upper-case (e.g. "KH", "kH") so Cyrillic -> Latin keeps the "Kh"/"kh" forms.
+MN_CYR_TO_LAT_DICT = {
+    c: l for c, l in zip(MN_CYR_LAT_LIST[::2], MN_CYR_LAT_LIST[1::2])
+    if not (len(l) == 2 and l[1].isupper())
+}
+MN_CYR_TO_LAT_DICT[u'Щ'] = u'Sh'  # Щ/щ are commented out of the list above, so their Cyrillic -> Latin mapping is added by hand.
+MN_CYR_TO_LAT_DICT[u'щ'] = u'sh'
+
 MN_LAT_TO_CYR_DICT = dict([(l, c) for c, l in zip(MN_CYR_LAT_LIST[-2::-2], MN_CYR_LAT_LIST[-1::-2])])
 
 # Bundle up all the dictionaries in a lookup dictionary
diff --git a/tests.py b/tests.py
index f992ab1..9353746 100644
--- a/tests.py
+++ b/tests.py
@@ -264,6 +264,26 @@ def test_alphabet_transliteration_latin_to_cyrillic(self):
 
         self.assertEqual(transliterated_mongolian_alphabet, mongolian_alphabet_cyrillic)
 
+    def test_mixed_casing_transliteration_latin_to_cyrillic(self):
+        ''' Transliteration from latin with mixed casing, e.g. Sh SH sh sH.
+        '''
+        input_latin = 'KhKHkhkHShSHshsHTsTStstSChCHchcHYeYEyeyEYoYOyoyOYuYUyuyUYaYAyayA'
+        expected_output_cyrillic = 'ХХххШШшшЦЦццЧЧччЕЕееЁЁёёЮЮююЯЯяя'
+
+        actual_output_cyrillic = cyrtranslit.to_cyrillic(input_latin, 'mn')
+
+        self.assertEqual(actual_output_cyrillic, expected_output_cyrillic)
+
+    def test_transliteration_cyrillic_to_sh(self):
+        ''' Transliteration from Ш/Щ and ш/щ should be Sh and sh.
+        '''
+        input_cyrillic = 'ШшЩщ'
+        expected_output_latin = 'ShshShsh'
+
+        actual_output_latin = cyrtranslit.to_latin(input_cyrillic, 'mn')
+
+        self.assertEqual(actual_output_latin, expected_output_latin)
+
 
 if __name__ == '__main__':
     # Run all tests.