Skip to content

Commit

Permalink
Fix issue mpcabd#86: missed ligature due to non-overlapping regex matches of the ligature pattern in the string when the previous overlapping ligature candidate is skipped because of its form mismatch
Browse files Browse the repository at this point in the history
  • Loading branch information
jurajmichalak committed Oct 13, 2022
1 parent 76aae4a commit 86bc129
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions arabic_reshaper/arabic_reshaper.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,17 @@ def reshape(self, text):
if delete_tatweel:
text = text.replace(TATWEEL, '')

for match in re.finditer(self._ligatures_re, text):
regex_start = 0
matchIt = re.finditer(self._ligatures_re, text)
match = next(matchIt, None)
while match:
group_index = next((
i for i, group in enumerate(match.groups()) if group
), -1)
forms = self._get_ligature_forms_from_re_group_index(
group_index
)
a, b = match.span()
a, b = tuple(i+regex_start for i in match.span())
a_form = output[a][FORM]
b_form = output[b - 1][FORM]
ligature_form = None
Expand All @@ -218,9 +221,13 @@ def reshape(self, text):
else:
ligature_form = MEDIAL
if not forms[ligature_form]:
regex_start = a+1
matchIt = re.finditer(self._ligatures_re, text[regex_start:])
match = next(matchIt, None)
continue
output[a] = (forms[ligature_form], NOT_SUPPORTED)
output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a)
match = next(matchIt, None)

result = []
if not delete_harakat and -1 in positions_harakat:
Expand Down

0 comments on commit 86bc129

Please sign in to comment.