class RegexFindall(FilterBase):
    """Pick out regular expressions using Python's re.findall"""

    # Every non-overlapping match of "pattern" in the input is expanded
    # through the "repl" template (re.sub-style backreferences such as \1 or
    # \g<name>), and the expanded matches are joined with newlines.

    __kind__ = 're.findall'

    __supported_subfilters__ = {
        'pattern': 'Regular expression to search for (required)',
        'repl': 'Template to expand each match with (default: the full match)',
    }

    __default_subfilter__ = 'pattern'

    def filter(self, data, subfilter):
        """Return all matches of the pattern in ``data``, one per line.

        ``subfilter`` must contain ``pattern``; ``repl`` is optional and is
        used as the template passed to ``Match.expand`` for every match.
        Returns an empty string when nothing matches.

        Raises ValueError if no pattern was configured.
        """
        if 'pattern' not in subfilter:
            raise ValueError('{} needs a pattern'.format(self.__kind__))

        # Default: emit each match unchanged (\g<0> is the whole match) if no
        # "repl" value is set. Looked up once, not once per match.
        template = subfilter.get('repl', r'\g<0>')

        # finditer (rather than findall) is used because Match objects are
        # needed to expand group references in the template.
        return "\n".join(
            match.expand(template)
            for match in re.finditer(subfilter['pattern'], data)
        )
items""" diff --git a/lib/urlwatch/tests/data/filter_tests.yaml b/lib/urlwatch/tests/data/filter_tests.yaml index 081cf5e4..72629d23 100644 --- a/lib/urlwatch/tests/data/filter_tests.yaml +++ b/lib/urlwatch/tests/data/filter_tests.yaml @@ -326,6 +326,32 @@ re_sub_multiline: One Line Another Line +re_findall: + filter: + - re.findall: '-[a-z][a-z][a-z]-' + data: |- + Some-abc-things-def-on-ghi-this-line-and + some-jkl-more-mno-here + expected_result: |- + -abc- + -def- + -ghi- + -jkl- + -mno- +re_findall_repl: + filter: + - re.findall: + pattern: '-([a-z])([a-z])([a-z])-' + repl: '\3\2\1' + data: |- + Some-abc-things-def-on-ghi-this-line-and + some-jkl-more-mno-here + expected_result: |- + cba + fed + ihg + lkj + onm strip: filter: strip data: " The rose is red; \n\nthe violet's blue.\nSugar is sweet, \nand so are you. "