Skip to content

Commit

Permalink
Merge branch 'master' into 1678-notifications-for-errors
Browse files Browse the repository at this point in the history
  • Loading branch information
dgtlmoon authored Nov 13, 2023
2 parents 9708262 + f7f9894 commit 10f1aff
Show file tree
Hide file tree
Showing 46 changed files with 1,274 additions and 340 deletions.
16 changes: 5 additions & 11 deletions .github/workflows/containers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,9 @@ jobs:
tags: |
${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:dev,ghcr.io/${{ github.repository }}:dev
platforms: linux/amd64,linux/arm64,linux/arm/v6,linux/arm/v7,linux/arm/v8
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache
cache-from: type=gha
cache-to: type=gha,mode=max

# Looks like this was disabled
# provenance: false

Expand All @@ -116,18 +117,11 @@ jobs:
${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:latest
ghcr.io/dgtlmoon/changedetection.io:latest
platforms: linux/amd64,linux/arm64,linux/arm/v6,linux/arm/v7,linux/arm/v8
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache
cache-from: type=gha
cache-to: type=gha,mode=max
# Looks like this was disabled
# provenance: false

- name: Image digest
run: echo step SHA ${{ steps.vars.outputs.sha_short }} tag ${{steps.vars.outputs.tag}} branch ${{steps.vars.outputs.branch}} digest ${{ steps.docker_build.outputs.digest }}

- name: Cache Docker layers
uses: actions/cache@v3
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-
11 changes: 10 additions & 1 deletion .github/workflows/test-only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ jobs:
# Selenium+browserless
docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4
docker run --network changedet-network -d --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.60-chrome-stable
docker run --network changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.60-chrome-stable
# For accessing custom browser tests
docker run --network changedet-network -d --name browserless-custom-url --hostname browserless-custom-url -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm --shm-size="2g" browserless/chrome:1.60-chrome-stable
- name: Build changedetection.io container for testing
run: |
Expand Down Expand Up @@ -86,6 +89,12 @@ jobs:
# And again with PLAYWRIGHT_DRIVER_URL=..
cd ..
- name: Test custom browser URL
run: |
cd changedetectionio
./run_custom_browser_url_tests.sh
cd ..
- name: Test changedetection.io container starts+runs basically without error
run: |
docker run -p 5556:5000 -d test-changedetectionio
Expand Down
5 changes: 0 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,6 @@ WORKDIR /install

COPY requirements.txt /requirements.txt

# Instructing pip to fetch wheels from piwheels.org" on ARMv6 and ARMv7 machines
RUN if [ "$(dpkg --print-architecture)" = "armhf" ] || [ "$(dpkg --print-architecture)" = "armel" ]; then \
printf "[global]\nextra-index-url=https://www.piwheels.org/simple\n" > /etc/pip.conf; \
fi;

RUN pip install --target=/dependencies -r /requirements.txt

# Playwright is an alternative to Selenium
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ global-exclude venv

global-exclude test-datastore
global-exclude changedetection.io*dist-info
global-exclude changedetectionio/tests/proxy_socks5/test-datastore
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,7 @@ I offer commercial support, this software is depended on by network security, ae
[license-shield]: https://img.shields.io/github/license/dgtlmoon/changedetection.io.svg?style=for-the-badge
[release-link]: https://github.com/dgtlmoon/changedetection.io/releases
[docker-link]: https://hub.docker.com/r/dgtlmoon/changedetection.io

## Third-party licenses

changedetectionio.html_tools.elementpath_tostring: Copyright (c), 2018-2021, SISSA (Scuola Internazionale Superiore di Studi Avanzati), Licensed under [MIT license](https://github.com/sissaschool/elementpath/blob/master/LICENSE)
27 changes: 21 additions & 6 deletions changedetectionio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from changedetectionio import html_tools
from changedetectionio.api import api_v1

__version__ = '0.45.5'
__version__ = '0.45.7.3'

from changedetectionio.store import BASE_URL_NOT_SET_TEXT

Expand Down Expand Up @@ -105,6 +105,10 @@ def get_darkmode_state():
css_dark_mode = request.cookies.get('css_dark_mode', 'false')
return 'true' if css_dark_mode and strtobool(css_dark_mode) else 'false'

@app.template_global()
def get_css_version():
return __version__

# We use the whole watch object from the store/JSON so we can see if there's some related status in terms of a thread
# running or something similar.
@app.template_filter('format_last_checked_time')
Expand Down Expand Up @@ -610,6 +614,8 @@ def edit_page(uuid):
# For the form widget tag uuid lookup
form.tags.datastore = datastore # in _value

for p in datastore.extra_browsers:
form.fetch_backend.choices.append(p)

form.fetch_backend.choices.append(("system", 'System settings default'))

Expand Down Expand Up @@ -710,7 +716,7 @@ def edit_page(uuid):
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'

is_html_webdriver = False
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver':
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
is_html_webdriver = True

# Only works reliably with Playwright
Expand Down Expand Up @@ -815,6 +821,16 @@ def settings_page():

return output

@app.route("/settings/reset-api-key", methods=['GET'])
@login_optionally_required
def settings_reset_api_key():
import secrets
secret = secrets.token_hex(16)
datastore.data['settings']['application']['api_access_token'] = secret
datastore.needs_write_urgent = True
flash("API Key was regenerated.")
return redirect(url_for('settings_page')+'#api')

@app.route("/import", methods=['GET', "POST"])
@login_optionally_required
def import_page():
Expand Down Expand Up @@ -973,7 +989,7 @@ def diff_history_page(uuid):
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'

is_html_webdriver = False
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver':
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
is_html_webdriver = True

password_enabled_and_share_is_off = False
Expand Down Expand Up @@ -1027,7 +1043,7 @@ def preview_page(uuid):


is_html_webdriver = False
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver':
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
is_html_webdriver = True

# Never requested successfully, but we detected a fetch error
Expand Down Expand Up @@ -1208,8 +1224,7 @@ def static_content(group, filename):
# These files should be in our subdirectory
try:
# set nocache, set content-type
watch_dir = datastore_o.datastore_path + "/" + filename
response = make_response(send_from_directory(filename="elements.json", directory=watch_dir, path=watch_dir + "/elements.json"))
response = make_response(send_from_directory(os.path.join(datastore_o.datastore_path, filename), "elements.json"))
response.headers['Content-type'] = 'application/json'
response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
response.headers['Pragma'] = 'no-cache'
Expand Down
4 changes: 2 additions & 2 deletions changedetectionio/blueprint/check_proxies/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def long_task(uuid, preferred_proxy):
contents = ''
now = time.time()
try:
update_handler = text_json_diff.perform_site_check(datastore=datastore)
changed_detected, update_obj, contents = update_handler.run(uuid, preferred_proxy=preferred_proxy, skip_when_checksum_same=False)
update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
update_handler.call_browser()
# title, size is len contents not len xfer
except content_fetcher.Non200ErrorCodeReceived as e:
if e.status_code == 404:
Expand Down
5 changes: 3 additions & 2 deletions changedetectionio/blueprint/tags/templates/edit-tag.html
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,12 @@
{% endif %}
</ul>
</li>
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash. To specify XPath to be used explicitly or the XPath rule starts with an XPath function: Prefix with <code>xpath:</code>
<ul>
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath:count(//*[contains(@class, 'sametext')])</code>, <a
href="http://xpather.com/" target="new">test your XPath here</a></li>
<li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
<li>To use XPath1.0: Prefix with <code>xpath1:</code></li>
</ul>
</li>
</ul>
Expand Down
44 changes: 25 additions & 19 deletions changedetectionio/content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class Fetcher():
content = None
error = None
fetcher_description = "No description"
browser_connection_url = None
headers = {}
status_code = None
webdriver_js_execute_code = None
Expand Down Expand Up @@ -251,14 +252,16 @@ class base_html_playwright(Fetcher):

proxy = None

def __init__(self, proxy_override=None):
def __init__(self, proxy_override=None, browser_connection_url=None):
super().__init__()
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value

self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
self.command_executor = os.getenv(
"PLAYWRIGHT_DRIVER_URL",
'ws://playwright-chrome:3000'
).strip('"')

# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
if not browser_connection_url:
self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')
else:
self.browser_connection_url = browser_connection_url

# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
Expand Down Expand Up @@ -419,11 +422,7 @@ def run(self,
is_binary=False):

# For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
has_browser_steps = self.browser_steps and list(filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
self.browser_steps))

if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
# Temporary backup solution until we rewrite the playwright code
return self.run_fetch_browserless_puppeteer(
Expand All @@ -448,7 +447,7 @@ def run(self,
# Seemed to cause a connection Exception even tho I can see it connect
# self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
# 60,000 connection timeout only
browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000)
browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000)

# SOCKS5 with authentication is not supported (yet)
# https://github.com/microsoft/playwright/issues/10567
Expand Down Expand Up @@ -508,7 +507,11 @@ def run(self,
self.status_code = response.status

if self.status_code != 200 and not ignore_status_codes:
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code)

screenshot=self.page.screenshot(type='jpeg', full_page=True,
quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))

raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

if len(self.page.content().strip()) == 0:
context.close()
Expand Down Expand Up @@ -559,21 +562,22 @@ class base_html_webdriver(Fetcher):
else:
fetcher_description = "WebDriver Chrome/Javascript"

command_executor = ''

# Configs for Proxy setup
# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None

def __init__(self, proxy_override=None):
def __init__(self, proxy_override=None, browser_connection_url=None):
super().__init__()
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
if not browser_connection_url:
self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
else:
self.browser_connection_url = browser_connection_url

# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
Expand Down Expand Up @@ -615,7 +619,7 @@ def run(self,
options.proxy = self.proxy

self.driver = webdriver.Remote(
command_executor=self.command_executor,
command_executor=self.browser_connection_url,
options=options)

try:
Expand Down Expand Up @@ -670,8 +674,10 @@ def quit(self):
class html_requests(Fetcher):
fetcher_description = "Basic fast Plaintext/HTTP Client"

def __init__(self, proxy_override=None):
def __init__(self, proxy_override=None, browser_connection_url=None):
super().__init__()
self.proxy_override = proxy_override
# browser_connection_url is none because its always 'launched locally'

def run(self,
url,
Expand Down
30 changes: 29 additions & 1 deletion changedetectionio/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ def __init__(self, message=None):
def __call__(self, form, field):
import urllib3.exceptions
from changedetectionio import content_fetcher
return

# AttributeError: module 'changedetectionio.content_fetcher' has no attribute 'extra_browser_unlocked<>ASDF213r123r'
# Better would be a radiohandler that keeps a reference to each class
if field.data is not None and field.data != 'system':
klass = getattr(content_fetcher, field.data)
Expand Down Expand Up @@ -326,11 +328,30 @@ def __call__(self, form, field):
return

# Does it look like XPath?
if line.strip()[0] == '/':
if line.strip()[0] == '/' or line.strip().startswith('xpath:'):
if not self.allow_xpath:
raise ValidationError("XPath not permitted in this field!")
from lxml import etree, html
import elementpath
# xpath 2.0-3.1
from elementpath.xpath3 import XPath3Parser
tree = html.fromstring("<html></html>")
line = line.replace('xpath:', '')

try:
elementpath.select(tree, line.strip(), parser=XPath3Parser)
except elementpath.ElementPathError as e:
message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
raise ValidationError(message % (line, str(e)))
except:
raise ValidationError("A system-error occurred when validating your XPath expression")

if line.strip().startswith('xpath1:'):
if not self.allow_xpath:
raise ValidationError("XPath not permitted in this field!")
from lxml import etree, html
tree = html.fromstring("<html></html>")
line = re.sub(r'^xpath1:', '', line)

try:
tree.xpath(line.strip())
Expand Down Expand Up @@ -497,6 +518,12 @@ class SingleExtraProxy(Form):
proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "socks5:// or regular proxy http://user:pass@...:3128", "size":50})
# @todo do the validation here instead

class SingleExtraBrowser(Form):
browser_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"})
browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
# @todo do the validation here instead


# datastore.data['settings']['requests']..
class globalSettingsRequestForm(Form):
time_between_check = FormField(TimeBetweenCheckForm)
Expand All @@ -505,6 +532,7 @@ class globalSettingsRequestForm(Form):
render_kw={"style": "width: 5em;"},
validators=[validators.NumberRange(min=0, message="Should contain zero or more seconds")])
extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5)
extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5)

def validate_extra_proxies(self, extra_validators=None):
for e in self.data['extra_proxies']:
Expand Down
Loading

0 comments on commit 10f1aff

Please sign in to comment.