Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support multiline regex in text filtering (#2857) #2889

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 34 additions & 13 deletions changedetectionio/html_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,22 +363,41 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# wordlist - list of regex's (str) or words (str)
# Preserves all linefeeds and other whitespacing, its not the job of this to remove that
def strip_ignore_text(content, wordlist, mode="content"):
i = 0
output = []
ignore_text = []
ignore_regex = []
ignored_line_numbers = []
ignore_regex_multiline = []
ignored_lines = []

for k in wordlist:
# Is it a regex?
res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
if res:
ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
res = re.compile(perl_style_slash_enclosed_regex_to_options(k))
if res.flags & re.DOTALL or res.flags & re.MULTILINE:
ignore_regex_multiline.append(res)
else:
ignore_regex.append(res)
else:
ignore_text.append(k.strip())

for line in content.splitlines(keepends=True):
i += 1
for r in ignore_regex_multiline:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain what the approach is here so i dont have to manually compile it in my brain?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It finds all matches over the whole content using finditer, without splitting it into lines.
For each match it calculates the start/end line number by using splitlines on part of the content before the match end, and the number of lines in the matched content.

Instead of accumulating output lines to an array, the line numbers are added to ignored_lines array which is then converted to a set to remove duplicates. This way multiple filtering methods can mark lines as ignored, and the output is constructed as the last step.

for match in r.finditer(content):
content_lines = content[:match.end()].splitlines(keepends=True)
match_lines = content[match.start():match.end()].splitlines(keepends=True)

end_line = len(content_lines)
start_line = end_line - len(match_lines)

if end_line - start_line <= 1:
# Match is empty or in the middle of the line
ignored_lines.append(start_line)
else:
for i in range(start_line, end_line):
ignored_lines.append(i)

line_index = 0
lines = content.splitlines(keepends=True)
for line in lines:
# Always ignore blank lines in this mode. (when this function gets called)
got_match = False
for l in ignore_text:
Expand All @@ -390,17 +409,19 @@ def strip_ignore_text(content, wordlist, mode="content"):
if r.search(line):
got_match = True

if not got_match:
# Not ignored, and should preserve "keepends"
output.append(line)
else:
ignored_line_numbers.append(i)
if got_match:
ignored_lines.append(line_index)

line_index += 1

ignored_lines = set([i for i in ignored_lines if i >= 0 and i < len(lines)])

# Used for finding out what to highlight
if mode == "line numbers":
return ignored_line_numbers
return [i + 1 for i in ignored_lines]

return ''.join(output)
output_lines = set(range(len(lines))) - ignored_lines
return ''.join([lines[i] for i in output_lines])

def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
from xml.sax.saxutils import escape as xml_escape
Expand Down
41 changes: 40 additions & 1 deletion changedetectionio/tests/test_ignore_regex_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def test_strip_regex_text_func():
]

stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

assert "but 1 lines" in stripped_content
assert "igNORe-cAse text" not in stripped_content
assert "but 1234 lines" not in stripped_content
Expand All @@ -42,6 +41,46 @@ def test_strip_regex_text_func():
# Check line number reporting
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
assert stripped_content == [2, 5, 6, 7, 8, 10]

stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'])
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for bumping the tests!

assert "but 1 lines" not in stripped_content
assert "skip 5 lines" not in stripped_content

stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'], mode="line numbers")
assert stripped_content == [4, 5]

stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'])
assert stripped_content == ""

stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'], mode="line numbers")
assert stripped_content == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'])
assert "but 1 lines" not in stripped_content
assert "skip 5 lines" not in stripped_content

stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'], mode="line numbers")
assert stripped_content == [4, 5]

stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'])
assert "but sometimes we want to remove the lines." not in stripped_content
assert "but not always." not in stripped_content

stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'], mode="line numbers")
assert stripped_content == [2, 11]

stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'])
assert "but sometimes we want to remove the lines." not in stripped_content
assert "but 1 lines" not in stripped_content
assert "but 1234 lines" not in stripped_content
assert "igNORe-cAse text we dont want to keep" not in stripped_content
assert "but not always." not in stripped_content

stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'], mode="line numbers")
assert stripped_content == [2, 3, 4, 9, 10, 11]

stripped_content = html_tools.strip_ignore_text("\n\ntext\n\ntext\n\n", ['/^$/ms'], mode="line numbers")
assert stripped_content == [1, 2, 4, 6]

# Check that linefeeds are preserved when there are is no matching ignores
content = "some text\n\nand other text\n"
Expand Down
Loading