-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchar_classes.py
109 lines (96 loc) · 3.82 KB
/
char_classes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Group large choices of character literals into categories"""
from pygramm import grammar
from enum import Enum, auto
from typing import List
import logging
logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# How big does a choice need to be to make it worthwhile to split?
CHARCLASS_TRIGGER_THRESHOLD = 5 # Must be at least 3
class CharCategory(Enum):
"""The values associated with these categories are
functions that classify the strings. They are ordered
from more specific to less specific.
"""
digit = [str.isdigit]
lower = [str.islower]
upper = [str.isupper]
space = [str.isspace]
ascii_alnum = [lambda s: s.isascii() and s.isalnum()]
control = [lambda s: s.isascii() and not s.isprintable()]
ascii_other = [str.isascii]
unicode_letter = [str.isalpha]
unicode_other = [lambda s: True]
# Implementation note: functions as Enum values
# do not work in Python 3. This is an undocumented
# bug: Python turns function values into methods instead
# of Enum entries. That is why I need to make each
# category a list.
def categorize(s: str) -> CharCategory:
assert isinstance(s, str)
for cat in CharCategory:
log.debug(f"Checking {s} against {cat}")
if cat.value[0](s):
return cat
assert False, "No category matched!"
class CharClasses(grammar.TransformBase):
"""Where we have a single Choice node with many character Literals,
break the character literals into categories such as alpha, numeric, national,
control, etc.
"""
def __init__(self, g: grammar.Grammar):
self.gram = g
def divide(self, items: List[grammar._Literal]) -> grammar.RHSItem:
parent = self.gram.choice() # New parent node for choosing a group
items.sort(key=lambda i: i.text)
# We assume categories are mostly in contiguous
# ranges of character codes, and we proceed to
# break up the categories by runs of same
# category.
subgroup = self.gram.choice()
parent.append(subgroup)
category = categorize(items[0].text)
for item in items:
item_cat = categorize(item.text)
if item_cat != category:
subgroup = self.gram.choice()
parent.append(subgroup)
category = item_cat
subgroup.append(item)
if len(parent.items) == 1:
# They were all in the same category!
return subgroup
return parent
def is_eligible(self, item: grammar.RHSItem) -> bool:
"""A node is eligible for breaking into character classes if it
is a Choice node with at least CHARCLASS_TRIGGER_THRESHOLD alternatives,
each of which is a literal consisting of a single character.
"""
if not isinstance(item, grammar._Choice):
return False
if len(item.items) < CHARCLASS_TRIGGER_THRESHOLD:
return False
for choice in item.items:
if not isinstance(choice, grammar._Literal):
return False
# class _Literal objects have a 'text' field; the ones we
# care about are single unicode characters
if len(choice.text) > 1:
return False
return True
def apply(self, item: grammar.RHSItem) -> grammar.RHSItem:
"""Postorder visit"""
# Only _Choice nodes are affected
if not self.is_eligible(item):
return item
divided = self.divide(item.items)
return divided
def teardown(self, g: grammar.Grammar):
"""We have introduced new nodes, so we need
to recalculate min tokens.
FIXME: Increasingly min tokens looks like it shouldn't
be part of initial grammar creation.
"""
g._calc_min_tokens()
g._calc_pot_tokens()