# pyphen.py -- forked from Prasengupta/weasyprint_for_awslambda
# This file is part of Pyphen
#
# Copyright 2008 - Wilbert Berendsen <[email protected]>
# Copyright 2012-2013 - Guillaume Ayoub <[email protected]>
#
# This library is free software. It is released under the
# GPL 2.0+/LGPL 2.1+/MPL 1.1 tri-license. See COPYING.GPL, COPYING.LGPL and
# COPYING.MPL for more details.
#
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
"""
Pyphen
======
Pure Python module to hyphenate text, inspired by Ruby's Text::Hyphen.
"""
from __future__ import unicode_literals
import os
import sys
import re
# Python 3 has no ``unichr`` builtin; ``chr`` does the same job there, so
# alias it and let the rest of the module keep calling ``unichr``.
if sys.version_info[0] >= 3:
    unichr = chr
# public names of this module (what ``import *`` exports)
__all__ = ('Pyphen', 'LANGUAGES', 'language_fallback')
# HyphDict objects that were already loaded, keyed by dictionary filename
hdcache = dict()

# ``parse_hex(repl, text)`` substitutes every ``^^hh`` escape (two lowercase
# hexadecimal digits) found in ``text``
_hex_escape = re.compile(r'\^{2}([0-9a-f]{2})')
parse_hex = _hex_escape.sub

# ``parse(text)`` cuts ``text`` into ``(digit, non_digit)`` pairs; either
# member of a pair may be the empty string
_digit_char_pair = re.compile(r'(\d?)(\D?)')
parse = _digit_char_pair.findall
# included dictionaries are available:
# - at <sys.prefix>/share/pyphen/dictionaries when Pyphen is installed
# - at <project_root>/dictionaries when Pyphen is not installed
# - at <pkg_resources>/share/pyphen/dictionaries when Pyphen is in an egg
# Directories searched for ``.dic`` files, in order. Bind the name before
# anything that can fail so that it always exists afterwards.
dictionaries_roots = ()
try:
    import pkg_resources
    dictionaries_roots += (os.path.join(
        pkg_resources.resource_filename('pyphen', ''),
        'share', 'pyphen', 'dictionaries'),)
except Exception:
    # Best effort only: pkg_resources may be missing (ImportError) or unable
    # to locate/extract the package (e.g. NotImplementedError for an
    # unregistered loader). The two filesystem locations below are still
    # tried, so a failure here must not prevent the module from importing.
    pass
dictionaries_roots += (
    os.path.join(sys.prefix, 'share', 'pyphen', 'dictionaries'),
    os.path.join(os.path.dirname(__file__), 'dictionaries'))
def _find_dictionaries(roots):
    """Map language codes to the ``.dic`` files found directly under *roots*.

    Roots that are not existing directories are skipped. The language code is
    the file name without its first five and its last four characters (the
    ``hyph_`` prefix and ``.dic`` suffix of a ``hyph_<lang>.dic`` file). When
    the same code shows up under several roots, the last root wins.

    """
    found = {}
    for root in roots:
        if not os.path.isdir(root):
            continue
        for name in os.listdir(root):
            if name.endswith('.dic'):
                found[name[5:-4]] = os.path.join(root, name)
    return found


# language code -> full path of the matching dictionary file
LANGUAGES = _find_dictionaries(dictionaries_roots)
def language_fallback(language):
    """Return the closest language code that has a dictionary.

    The lookup follows the truncation inheritance described in
    http://www.unicode.org/reports/tr35/#Locale_Inheritance: hyphens are
    turned into underscores, then trailing subtags are dropped one at a time
    until what remains is a key of ``LANGUAGES``.

    This function needs aliases including scripts for languages with multiple
    regions available.

    :param language: language code such as ``'en_US'`` or ``'en-US'``
    :returns: the matching key of ``LANGUAGES``, or ``None`` when not even the
        first subtag is known

    """
    subtags = language.replace('-', '_').split('_')
    # try the full code first, then ever shorter prefixes of it
    for size in range(len(subtags), 0, -1):
        candidate = '_'.join(subtags[:size])
        if candidate in LANGUAGES:
            return candidate
    return None
class AlternativeParser(object):
    """Parser of the nonstandard hyphenation alternative of a pattern.

    An instance is called once per value of the pattern, in order. Even values
    come back as plain ``int``; odd values come back as a ``DataInt`` carrying
    ``(change, index, cut)``, where ``index`` has been decremented once for
    every call made so far.

    """

    def __init__(self, pattern, alternative):
        """Split ``alternative`` (``'change,index,cut'``) into its fields.

        :param pattern: pattern text the alternative belongs to
        :param alternative: comma-separated ``change``, ``index`` and ``cut``

        """
        fields = alternative.split(',')
        self.change = fields[0]
        self.index = int(fields[1])
        self.cut = int(fields[2])
        # a pattern starting with '.' shifts the index by one
        self.index += 1 if pattern.startswith('.') else 0

    def __call__(self, value):
        """Convert one pattern value, attaching the alternative if it is odd.

        :param value: the value, as a string of digits or an int

        """
        self.index -= 1
        number = int(value)
        if number % 2 == 0:
            return number
        return DataInt(number, (self.change, self.index, self.cut))
class DataInt(int):
    """``int`` that carries extra information in a ``data`` attribute."""

    def __new__(cls, value, data=None, reference=None):
        """Create a new ``DataInt``.

        :param value: the integer value
        :param data: object to store in the ``data`` attribute
        :param reference: when this is a ``DataInt``, its ``data`` is used
            instead of the ``data`` argument; any other object is ignored

        """
        obj = int.__new__(cls, value)
        # Only the type of ``reference`` matters: a ``DataInt`` equal to 0 is
        # falsy but is still a valid source of data.
        if isinstance(reference, DataInt):
            obj.data = reference.data
        else:
            obj.data = data
        return obj
class HyphDict(object):
    """Hyphenation patterns read from one ``hyph_*.dic`` file.

    Instances expose three attributes:

    patterns
      dict mapping the characters of a pattern (digits removed) to a tuple
      ``(start, values)``: ``values`` are the pattern's numbers with leading
      and trailing zeros chopped off, ``start`` is the number of leading
      zeros that were chopped.
    cache
      dict remembering the result of ``positions`` for each lowercased word.
    maxlen
      length of the longest key of ``patterns``, ``0`` if there is none.

    """

    def __init__(self, filename):
        """Read a ``hyph_*.dic`` and parse its patterns.

        The first line of the file names its character set; every following
        line is one pattern. Empty lines and lines starting with ``%`` or
        ``#`` are ignored.

        :param filename: filename of hyph_*.dic to read

        """
        self.patterns = {}

        with open(filename, 'rb') as stream:
            # see "man 4 hunspell", iscii-devanagari is not supported by python
            charset = stream.readline().strip().decode('ascii')
            if charset.lower() == 'microsoft-cp1251':
                charset = 'cp1251'

            for pattern in stream:
                pattern = pattern.decode(charset).strip()
                if not pattern or pattern.startswith(('%', '#')):
                    continue

                # replace ^^hh with the real character
                pattern = parse_hex(
                    lambda match: unichr(int(match.group(1), 16)), pattern)

                # read nonstandard hyphen alternatives
                if '/' in pattern:
                    pattern, alternative = pattern.split('/', 1)
                    factory = AlternativeParser(pattern, alternative)
                else:
                    factory = int

                # the factory is called once per value, in pattern order
                tags, values = zip(*[
                    (string, factory(i or '0'))
                    for i, string in parse(pattern)])

                # if only zeros, skip this pattern
                if max(values) == 0:
                    continue

                # chop zeros from beginning and end, and store start offset
                start, end = 0, len(values)
                while not values[start]:
                    start += 1
                while not values[end - 1]:
                    end -= 1

                self.patterns[''.join(tags)] = start, values[start:end]

        self.cache = {}
        # A file without any usable pattern must not crash here: with a
        # maximum length of 0 no lookup is attempted and no word is ever
        # hyphenated.
        if self.patterns:
            self.maxlen = max(len(key) for key in self.patterns)
        else:
            self.maxlen = 0

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        E.g. for the dutch word 'lettergrepen' this method returns ``[3, 6,
        9]``.

        Each position is a ``DataInt`` with a data attribute.

        If the data attribute is not ``None``, it contains a tuple with
        information about nonstandard hyphenation at that point: ``(change,
        index, cut)``.

        change
          a string like ``'ff=f'``, that describes how hyphenation should
          take place.

        index
          where to substitute the change, counting from the current point

        cut
          how many characters to remove while substituting the nonstandard
          hyphenation

        The word is lowercased before the lookup, and the list that is
        returned is the one stored in ``cache``.

        """
        word = word.lower()
        points = self.cache.get(word)
        if points is None:
            # the dots mark the beginning and the end of the word
            pointed_word = '.%s.' % word
            references = [0] * (len(pointed_word) + 1)

            # look up every substring of at most ``maxlen`` characters
            for i in range(len(pointed_word) - 1):
                for j in range(
                        i + 1, min(i + self.maxlen, len(pointed_word)) + 1):
                    pattern = self.patterns.get(pointed_word[i:j])
                    if pattern:
                        offset, values = pattern
                        slice_ = slice(i + offset, i + offset + len(values))
                        # keep the highest value seen at each position
                        references[slice_] = map(
                            max, values, references[slice_])

            # odd values mark the points where hyphenation is allowed
            points = [
                DataInt(i - 1, reference=reference)
                for i, reference in enumerate(references) if reference % 2]
            self.cache[word] = points
        return points
class Pyphen(object):
    """Hyphenator offering several ways to split or mark up a word."""

    def __init__(self, filename=None, lang=None, left=2, right=2, cache=True):
        """Create a hyphenation instance for the given lang or filename.

        :param filename: filename of hyph_*.dic to read
        :param lang: lang of the included dict to use if no filename is given
        :param left: minimum number of characters of the first syllable
        :param right: minimum number of characters of the last syllable
        :param cache: if ``True``, use cached copy of the hyphenation patterns

        """
        path = filename or LANGUAGES[language_fallback(lang)]
        self.left = left
        self.right = right
        if cache and path in hdcache:
            patterns = hdcache[path]
        else:
            # first use of this file, or a reload was requested: parse it and
            # (re)place the shared entry
            patterns = hdcache[path] = HyphDict(path)
        self.hd = patterns

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        See also ``HyphDict.positions``. Points closer than ``left``
        characters to the start of the word, or closer than ``right``
        characters to its end, are left out.

        """
        last_allowed = len(word) - self.right
        kept = []
        for point in self.hd.positions(word):
            if self.left <= point <= last_allowed:
                kept.append(point)
        return kept

    def iterate(self, word):
        """Iterate over all hyphenation possibilities, the longest first.

        :param word: unicode string of the word to hyphenate

        Each item is a ``(first_part, last_part)`` tuple, without any hyphen.

        """
        for point in reversed(self.positions(word)):
            if not point.data:
                yield word[:point], word[point:]
                continue
            # nonstandard hyphenation: the text around the point changes
            change, offset, cut = point.data
            begin = offset + point
            if word.isupper():
                change = change.upper()
            before, after = change.split('=')
            yield word[:begin] + before, after + word[begin + cut:]

    def wrap(self, word, width, hyphen='-'):
        """Get the longest possible first part and the last part of a word.

        :param word: unicode string of the word to hyphenate
        :param width: maximum length of the first part
        :param hyphen: unicode string used as hyphen character

        The first part has the hyphen already attached.

        Returns ``None`` if there is no hyphenation point before ``width``, or
        if the word could not be hyphenated.

        """
        room = width - len(hyphen)
        for head, tail in self.iterate(word):
            if len(head) <= room:
                return head + hyphen, tail
        return None

    def inserted(self, word, hyphen='-'):
        """Get the word as a string with all the possible hyphens inserted.

        :param word: unicode string of the word to hyphenate
        :param hyphen: unicode string used as hyphen character

        E.g. for the dutch word ``'lettergrepen'``, this method returns the
        unicode string ``'let-ter-gre-pen'``. The hyphen string to use can be
        given as the second parameter, that defaults to ``'-'``.

        """
        letters = list(word)
        # go from right to left so that earlier positions stay valid
        for point in reversed(self.positions(word)):
            if not point.data:
                letters.insert(point, hyphen)
                continue
            # nonstandard hyphenation: replace ``cut`` characters
            change, offset, cut = point.data
            begin = offset + point
            if word.isupper():
                change = change.upper()
            letters[begin:begin + cut] = change.replace('=', hyphen)
        return ''.join(letters)

    __call__ = iterate