-
Notifications
You must be signed in to change notification settings - Fork 98
/
Copy pathtweeregex.py
38 lines (30 loc) · 1.94 KB
/
tweeregex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Regular-expression pattern strings for Twee-style passage markup.
#
# Every constant below is a pattern *string* (nothing is compiled here).
# Several later constants are assembled from earlier ones, so the capture
# group numbering of each pattern is part of this module's interface and
# must not be changed casually.

# Zero-width lookahead: succeeds only when the rest of the text, up to the
# end, consists of complete single-/double-quoted strings and backslash
# escapes followed by a quote-free tail.
UNQUOTED_REGEX = r"""(?=(?:[^"'\\]*(?:\\.|'(?:[^'\\]*\\.)*[^'\\]*'|"(?:[^"\\]*\\.)*[^"\\]*"))*[^'"]*$)"""

# [[a]], [[a|b]] or [[a|b][c]].
# Groups: 1 = text before the first '|', 2 = text after it (optional),
# 3 = the optional trailing "[...]" section, brackets included.
LINK_REGEX = r"\[\[([^\|]*?)(?:\|(.*?))?\](\[.*?\])?\]"

# <<name params>>.
# Groups: 1 = macro name (no whitespace or '>'), 2 = everything up to the
# closing '>>'. Quoted strings and backslash escapes inside group 2 are
# consumed whole, so a '>>' inside quotes does not end the macro.
MACRO_REGEX = r"""<<([^>\s]+)(?:\s*)((?:\\.|'(?:[^'\\]*\\.)*[^'\\]*'|"(?:[^"\\]*\\.)*[^"\\]*"|[^'"\\>]|>(?!>))*)>>"""

# [img[a]], [img[a|b]], [<img[...]], [>img[...]], [img[a][c]].
# Groups: 1 = optional '<', 2 = optional '>', 3 = text before '|' (optional),
# 4 = main bracket content, 5 = content of the optional second bracket,
# 6 = the final ']'.
IMAGE_REGEX = r"\[([<]?)(>?)img\[(?:([^\|\]]+)\|)?([^\[\]\|]+)\](?:\[([^\]]*)\]?)?(\])"

# <html>...</html>, non-greedy and spanning newlines; group 1 = the contents.
HTML_BLOCK_REGEX = r"<html>((?:.|\n)*?)</html>"

# A single opening, closing or self-closing HTML tag with optional attributes.
# Group 1 = tag name.
HTML_REGEX = r"<(?:\/?([\w\-]+)(?:(\s+[\w\-]+(?:\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?)>"

# The inline-style delimiter.
INLINE_STYLE_REGEX = "@@"

# One or more style declarations following the delimiter, in any of three
# forms: "name(value):", "name:value;" or ".classname;".
INLINE_STYLE_PROP_REGEX = r"((?:([^\(@]+)\(([^\)]+)(?:\):))|(?:([^:@]+):([^;@]+);)|(?:(\.[^\.;@]+);))+"

# {{{ ... }}} - either the block form with the braces on lines of their own
# (the '^'/'$' anchors only act per line if the caller compiles with
# re.MULTILINE), or the inline form, whose contents are captured in group 1.
MONO_REGEX = r"^\{\{\{\n(?:(?:^[^\n]*\n)+?)(?:^\}\}\}$\n?)|\{\{\{((?:.|\n)*?)\}\}\}"

# /% ... %/ comment; group 1 = the comment text.
COMMENT_REGEX = r"/%((?:.|\n)*?)%/"

# Alternation of all the markup patterns above plus the two-character
# formatting markers. Each alternative is wrapped in its own capturing group.
COMBINED_REGEX = '(' + ')|('.join([
    LINK_REGEX,
    MACRO_REGEX,
    IMAGE_REGEX,
    HTML_BLOCK_REGEX,
    HTML_REGEX,
    INLINE_STYLE_REGEX,
    MONO_REGEX,
    COMMENT_REGEX,
    r"''|\/\/|__|\^\^|~~|==",
]) + ')'

# macro param regex - string or number or boolean or variable
# (Mustn't match all-digit names)
MACRO_PARAMS_VAR_REGEX = r'(\$[\w_\.]*[a-zA-Z_\.]+[\w_\.]*)'

# This isn't included because it's too general - but it's used by the broken link lexer
# (Not including whitespace between name and () because of false positives)
MACRO_PARAMS_FUNC_REGEX = r'([\w\d_\.]+\((.*?)\))'

# Groups: 1 = quoted string or [[...]] link, 2 = number or NaN,
# 3 = true/false/null/undefined, 4 = $variable.
MACRO_PARAMS_REGEX = (
    r'(?:("(?:[^\\"]|\\.)*"|\'(?:[^\\\']|\\.)*\'|(?:\[\[(?:[^\]]*)\]\]))'
    + r'|\b(\-?\d+\.?(?:[eE][+\-]?\d+)?|NaN)\b'
    + r'|(true|false|null|undefined)'
    + r'|' + MACRO_PARAMS_VAR_REGEX
    + r')'
)

# This includes BMP even though you can't normally import it
IMAGE_FILENAME_REGEX = r"[^\"']+\.(?:jpe?g|a?png|gif|bmp|webp|svg)"

# An image file name, optionally wrapped in quotes and surrounding whitespace;
# the single capture group holds the bare file name.
# The trailing fragment is a raw string: in a normal string literal "\s" is
# an invalid escape sequence (a warning today, an error in future Pythons).
# The resulting pattern text is unchanged.
EXTERNAL_IMAGE_URL = r"\s*['\"]?(" + IMAGE_FILENAME_REGEX + r""")['"]?\s*"""

# IMAGE_REGEX with its main content group (group 4) narrowed to image file names.
EXTERNAL_IMAGE_REGEX = IMAGE_REGEX.replace(r"([^\[\]\|]+)", EXTERNAL_IMAGE_URL)

# src=<image> as found in HTML attributes; group 1 = the file name.
HTML_IMAGE_REGEX = r"src\s*=" + EXTERNAL_IMAGE_URL

# url(<image>) as found in CSS; group 1 = the file name.
CSS_IMAGE_REGEX = r"url\s*\(" + EXTERNAL_IMAGE_URL + r"\)"