Merge branch 'master' into 1678-notifications-for-errors

dgtlmoon · Nov 13, 2023 · 10f1aff · 10f1aff
2 parents 9708262 + f7f9894
commit 10f1aff
Show file tree

Hide file tree

Showing 46 changed files with 1,274 additions and 340 deletions.
diff --git a/.github/workflows/containers.yml b/.github/workflows/containers.yml
@@ -96,8 +96,9 @@ jobs:
           tags: |
             ${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:dev,ghcr.io/${{ github.repository }}:dev
           platforms: linux/amd64,linux/arm64,linux/arm/v6,linux/arm/v7,linux/arm/v8
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
 # Looks like this was disabled
 #          provenance: false
 
@@ -116,18 +117,11 @@ jobs:
             ${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:latest
             ghcr.io/dgtlmoon/changedetection.io:latest
           platforms: linux/amd64,linux/arm64,linux/arm/v6,linux/arm/v7,linux/arm/v8
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
 # Looks like this was disabled
 #          provenance: false
 
       - name: Image digest
         run: echo step SHA ${{ steps.vars.outputs.sha_short }} tag ${{steps.vars.outputs.tag}} branch ${{steps.vars.outputs.branch}} digest ${{ steps.docker_build.outputs.digest }}
 
-      - name: Cache Docker layers
-        uses: actions/cache@v3
-        with:
-          path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
-          restore-keys: |
-            ${{ runner.os }}-buildx-
diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml
@@ -30,7 +30,10 @@ jobs:
 
           # Selenium+browserless
           docker run --network changedet-network -d --hostname selenium  -p 4444:4444 --rm --shm-size="2g"  selenium/standalone-chrome:4
-          docker run --network changedet-network -d --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.60-chrome-stable
+          docker run --network changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.60-chrome-stable
+          
+          # For accessing custom browser tests
+          docker run --network changedet-network -d --name browserless-custom-url --hostname browserless-custom-url -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm --shm-size="2g"  browserless/chrome:1.60-chrome-stable
 
       - name: Build changedetection.io container for testing
         run: |         
@@ -86,6 +89,12 @@ jobs:
           # And again with PLAYWRIGHT_DRIVER_URL=..
           cd ..
 
+      - name: Test custom browser URL
+        run: |
+          cd changedetectionio
+          ./run_custom_browser_url_tests.sh
+          cd ..
+
       - name: Test changedetection.io container starts+runs basically without error
         run: |
           docker run -p 5556:5000 -d test-changedetectionio

diff --git a/Dockerfile b/Dockerfile
@@ -20,11 +20,6 @@ WORKDIR /install
 
 COPY requirements.txt /requirements.txt
 
-# Instructing pip to fetch wheels from piwheels.org" on ARMv6 and ARMv7 machines
-RUN if [ "$(dpkg --print-architecture)" = "armhf" ] || [ "$(dpkg --print-architecture)" = "armel" ]; then \
-      printf "[global]\nextra-index-url=https://www.piwheels.org/simple\n" > /etc/pip.conf; \
-    fi;
-
 RUN pip install --target=/dependencies -r /requirements.txt
 
 # Playwright is an alternative to Selenium

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -16,3 +16,4 @@ global-exclude venv
 
 global-exclude test-datastore
 global-exclude changedetection.io*dist-info
+global-exclude changedetectionio/tests/proxy_socks5/test-datastore
diff --git a/README.md b/README.md
@@ -268,3 +268,7 @@ I offer commercial support, this software is depended on by network security, ae
 [license-shield]: https://img.shields.io/github/license/dgtlmoon/changedetection.io.svg?style=for-the-badge
 [release-link]: https://github.com/dgtlmoon/changedetection.io/releases
 [docker-link]: https://hub.docker.com/r/dgtlmoon/changedetection.io
+
+## Third-party licenses
+
+changedetectionio.html_tools.elementpath_tostring: Copyright (c), 2018-2021, SISSA (Scuola Internazionale Superiore di Studi Avanzati), Licensed under [MIT license](https://github.com/sissaschool/elementpath/blob/master/LICENSE)
diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
@@ -38,7 +38,7 @@
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1
 
-__version__ = '0.45.5'
+__version__ = '0.45.7.3'
 
 from changedetectionio.store import BASE_URL_NOT_SET_TEXT
 
@@ -105,6 +105,10 @@ def get_darkmode_state():
     css_dark_mode = request.cookies.get('css_dark_mode', 'false')
     return 'true' if css_dark_mode and strtobool(css_dark_mode) else 'false'
 
+@app.template_global()
+def get_css_version():
+    return __version__
+
 # We use the whole watch object from the store/JSON so we can see if there's some related status in terms of a thread
 # running or something similar.
 @app.template_filter('format_last_checked_time')
@@ -610,6 +614,8 @@ def edit_page(uuid):
         # For the form widget tag uuid lookup
         form.tags.datastore = datastore # in _value
 
+        for p in datastore.extra_browsers:
+            form.fetch_backend.choices.append(p)
 
         form.fetch_backend.choices.append(("system", 'System settings default'))
 
@@ -710,7 +716,7 @@ def edit_page(uuid):
             system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
 
             is_html_webdriver = False
-            if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver':
+            if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
                 is_html_webdriver = True
 
             # Only works reliably with Playwright
@@ -815,6 +821,16 @@ def settings_page():
 
         return output
 
+    @app.route("/settings/reset-api-key", methods=['GET'])
+    @login_optionally_required
+    def settings_reset_api_key():
+        import secrets
+        secret = secrets.token_hex(16)
+        datastore.data['settings']['application']['api_access_token'] = secret
+        datastore.needs_write_urgent = True
+        flash("API Key was regenerated.")
+        return redirect(url_for('settings_page')+'#api')
+
     @app.route("/import", methods=['GET', "POST"])
     @login_optionally_required
     def import_page():
@@ -973,7 +989,7 @@ def diff_history_page(uuid):
         system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
 
         is_html_webdriver = False
-        if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver':
+        if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
             is_html_webdriver = True
 
         password_enabled_and_share_is_off = False
@@ -1027,7 +1043,7 @@ def preview_page(uuid):
 
 
         is_html_webdriver = False
-        if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver':
+        if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
             is_html_webdriver = True
 
         # Never requested successfully, but we detected a fetch error
@@ -1208,8 +1224,7 @@ def static_content(group, filename):
             # These files should be in our subdirectory
             try:
                 # set nocache, set content-type
-                watch_dir = datastore_o.datastore_path + "/" + filename
-                response = make_response(send_from_directory(filename="elements.json", directory=watch_dir, path=watch_dir + "/elements.json"))
+                response = make_response(send_from_directory(os.path.join(datastore_o.datastore_path, filename), "elements.json"))
                 response.headers['Content-type'] = 'application/json'
                 response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
                 response.headers['Pragma'] = 'no-cache'

diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py
@@ -40,8 +40,8 @@ def long_task(uuid, preferred_proxy):
         contents = ''
         now = time.time()
         try:
-            update_handler = text_json_diff.perform_site_check(datastore=datastore)
-            changed_detected, update_obj, contents = update_handler.run(uuid, preferred_proxy=preferred_proxy, skip_when_checksum_same=False)
+            update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
+            update_handler.call_browser()
         # title, size is len contents not len xfer
         except content_fetcher.Non200ErrorCodeReceived as e:
             if e.status_code == 404:

diff --git a/changedetectionio/blueprint/tags/templates/edit-tag.html b/changedetectionio/blueprint/tags/templates/edit-tag.html
@@ -69,11 +69,12 @@
                                 {% endif %}
                             </ul>
                         </li>
-                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
+                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash. To force a rule to be treated as XPath, or when the rule starts with an XPath function, prefix it with <code>xpath:</code>
                             <ul>
-                                <li>Example:  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
+                                <li>Example:  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath:count(//*[contains(@class, 'sametext')])</code>, <a
                                 href="http://xpather.com/" target="new">test your XPath here</a></li>
                                 <li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
+                                <li>To use XPath 1.0: Prefix with <code>xpath1:</code></li>
                             </ul>
                             </li>
                     </ul>

diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
@@ -96,6 +96,7 @@ class Fetcher():
     content = None
     error = None
     fetcher_description = "No description"
+    browser_connection_url = None
     headers = {}
     status_code = None
     webdriver_js_execute_code = None
@@ -251,14 +252,16 @@ class base_html_playwright(Fetcher):
 
     proxy = None
 
-    def __init__(self, proxy_override=None):
+    def __init__(self, proxy_override=None, browser_connection_url=None):
         super().__init__()
-        # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
+
         self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
-        self.command_executor = os.getenv(
-            "PLAYWRIGHT_DRIVER_URL",
-            'ws://playwright-chrome:3000'
-        ).strip('"')
+
+        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value in double quotes
+        if not browser_connection_url:
+            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')
+        else:
+            self.browser_connection_url = browser_connection_url
 
         # If any proxy settings are enabled, then we should setup the proxy object
         proxy_args = {}
@@ -419,11 +422,7 @@ def run(self,
             is_binary=False):
 
         # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
-        has_browser_steps = self.browser_steps and list(filter(
-                lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
-                self.browser_steps))
-
-        if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+        if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
             if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
                 # Temporary backup solution until we rewrite the playwright code
                 return self.run_fetch_browserless_puppeteer(
@@ -448,7 +447,7 @@ def run(self,
             # Seemed to cause a connection Exception even tho I can see it connect
             # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
             # 60,000 connection timeout only
-            browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000)
+            browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000)
 
             # SOCKS5 with authentication is not supported (yet)
             # https://github.com/microsoft/playwright/issues/10567
@@ -508,7 +507,11 @@ def run(self,
             self.status_code = response.status
 
             if self.status_code != 200 and not ignore_status_codes:
-                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code)
+
+                screenshot=self.page.screenshot(type='jpeg', full_page=True,
+                                     quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
+
+                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
 
             if len(self.page.content().strip()) == 0:
                 context.close()
@@ -559,21 +562,22 @@ class base_html_webdriver(Fetcher):
     else:
         fetcher_description = "WebDriver Chrome/Javascript"
 
-    command_executor = ''
-
     # Configs for Proxy setup
     # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
     selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                         'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
                                         'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
     proxy = None
 
-    def __init__(self, proxy_override=None):
+    def __init__(self, proxy_override=None, browser_connection_url=None):
         super().__init__()
         from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
 
         # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
-        self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
+        if not browser_connection_url:
+            self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
+        else:
+            self.browser_connection_url = browser_connection_url
 
         # If any proxy settings are enabled, then we should setup the proxy object
         proxy_args = {}
@@ -615,7 +619,7 @@ def run(self,
             options.proxy = self.proxy
 
         self.driver = webdriver.Remote(
-            command_executor=self.command_executor,
+            command_executor=self.browser_connection_url,
             options=options)
 
         try:
@@ -670,8 +674,10 @@ def quit(self):
 class html_requests(Fetcher):
     fetcher_description = "Basic fast Plaintext/HTTP Client"
 
-    def __init__(self, proxy_override=None):
+    def __init__(self, proxy_override=None, browser_connection_url=None):
+        super().__init__()
         self.proxy_override = proxy_override
+        # browser_connection_url is accepted but left as None because this fetcher is always 'launched locally'
 
     def run(self,
             url,

diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
@@ -168,7 +168,9 @@ def __init__(self, message=None):
     def __call__(self, form, field):
         import urllib3.exceptions
         from changedetectionio import content_fetcher
+        return
 
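+        # @todo Validation is skipped by the early return above: an 'extra_browser_*' backend name is not an
+        # attribute of content_fetcher, so the getattr() lookup below fails with, for example:
+        # AttributeError: module 'changedetectionio.content_fetcher' has no attribute 'extra_browser_unlocked<>ASDF213r123r'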
         # Better would be a radiohandler that keeps a reference to each class
         if field.data is not None and field.data != 'system':
             klass = getattr(content_fetcher, field.data)
@@ -326,11 +328,30 @@ def __call__(self, form, field):
                 return
 
             # Does it look like XPath?
-            if line.strip()[0] == '/':
+            if line.strip()[0] == '/' or line.strip().startswith('xpath:'):
                 if not self.allow_xpath:
                     raise ValidationError("XPath not permitted in this field!")
                 from lxml import etree, html
+                import elementpath
+                # xpath 2.0-3.1
+                from elementpath.xpath3 import XPath3Parser
                 tree = html.fromstring("<html></html>")
+                line = line.replace('xpath:', '')
+
+                try:
+                    elementpath.select(tree, line.strip(), parser=XPath3Parser)
+                except elementpath.ElementPathError as e:
+                    message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
+                    raise ValidationError(message % (line, str(e)))
+                except:
+                    raise ValidationError("A system-error occurred when validating your XPath expression")
+
+            if line.strip().startswith('xpath1:'):
+                if not self.allow_xpath:
+                    raise ValidationError("XPath not permitted in this field!")
+                from lxml import etree, html
+                tree = html.fromstring("<html></html>")
+                line = re.sub(r'^xpath1:', '', line)
 
                 try:
                     tree.xpath(line.strip())
@@ -497,6 +518,12 @@ class SingleExtraProxy(Form):
     proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "socks5:// or regular proxy http://user:pass@...:3128", "size":50})
     # @todo do the validation here instead
 
+class SingleExtraBrowser(Form):
+    browser_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"})
+    browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
+    # @todo do the validation here instead
+
+
 # datastore.data['settings']['requests']..
 class globalSettingsRequestForm(Form):
     time_between_check = FormField(TimeBetweenCheckForm)
@@ -505,6 +532,7 @@ class globalSettingsRequestForm(Form):
                                   render_kw={"style": "width: 5em;"},
                                   validators=[validators.NumberRange(min=0, message="Should contain zero or more seconds")])
     extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5)
+    extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5)
 
     def validate_extra_proxies(self, extra_validators=None):
         for e in self.data['extra_proxies']:
Original file line number	Diff line number	Diff line change
Expand Up		@@ -16,3 +16,4 @@ global-exclude venv

		global-exclude test-datastore
		global-exclude changedetection.io*dist-info
		global-exclude changedetectionio/tests/proxy_socks5/test-datastore