Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(playwright): add functionality to fetch paginated content #2780

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions changedetectionio/blueprint/browser_steps/browser_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ def action_wait_for_text_in_element(self, selector, value):
v = json.dumps(value)
self.page.wait_for_function(f'document.querySelector({s}).innerText.includes({v});', timeout=30000)

def action_wait_for_load_state(self, selector, value='networkidle'):
    """Block until the page reaches the requested Playwright load state.

    Args:
        selector: Unused; accepted so the signature matches the other
            ``action_*`` handlers, which all take ``(selector, value)``.
        value: Load state name handed to ``page.wait_for_load_state``.
            ``None`` or a blank string falls back to ``'networkidle'``.
    """
    # The declared default only applies when the argument is omitted; an
    # explicit None or blank string must get the same fallback instead of
    # being forwarded to Playwright as-is.
    state = value.strip() if isinstance(value, str) else value
    self.page.wait_for_load_state(state or 'networkidle')

# @todo - in the future make some popout interface to capture what needs to be set
# https://playwright.dev/python/docs/api/class-keyboard
def action_press_enter(self, selector, value):
Expand All @@ -190,6 +193,9 @@ def action_check_checkbox(self, selector, value):
def action_uncheck_checkbox(self, selector, value):
self.page.locator(selector, timeout=1000).uncheck(timeout=1000)

def get_locator(self, selector):
    """Return the locator that the current page builds for ``selector``."""
    current_page = self.page
    locator = current_page.locator(selector)
    return locator


# Responsible for maintaining a live 'context' with the chrome CDP
# @todo - how long do contexts live for anyway?
Expand Down
3 changes: 3 additions & 0 deletions changedetectionio/content_fetchers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class Fetcher():
instock_data_js = ""
status_code = None
webdriver_js_execute_code = None
webdriver_enable_pagination = False
webdriver_paginated_js_execute_each_page = None
webdriver_paginated_next_selector = None
xpath_data = None
xpath_element_js = ""

Expand Down
5 changes: 5 additions & 0 deletions changedetectionio/content_fetchers/exceptions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,8 @@ def __init__(self, status_code, url, screenshot=None, has_filters=False, html_co
self.html_content = html_content
self.xpath_data = xpath_data
return


class PaginatedContentMisconfigured(Exception):
    """Raised when paginated fetching is enabled but not fully configured.

    Attributes:
        message: Human-readable description of the problem.
        screenshot: Optional screenshot payload, ``None`` unless supplied.
            Always defined so that handlers which read ``e.screenshot``
            do not fail with ``AttributeError``.
    """

    def __init__(self, message="Paginated content fetching is not configured properly", screenshot=None):
        # Initialise the base class so str(e) and e.args carry the message.
        super().__init__(message)
        self.message = message
        self.screenshot = screenshot
68 changes: 64 additions & 4 deletions changedetectionio/content_fetchers/playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from loguru import logger

from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, PaginatedContentMisconfigured, ScreenshotUnavailable

class fetcher(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
Expand Down Expand Up @@ -133,10 +133,13 @@ def run(self,
browser.close()
logger.debug("Content Fetcher > Response object from the browser communication was none")
raise EmptyReply(url=url, status_code=None)

try:
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
if self.webdriver_enable_pagination == True:
self.run_paginated(url=url)
else:
self.run_normal(browsersteps_interface=browsersteps_interface)
except playwright._impl._errors.TimeoutError as e:
context.close()
browser.close()
Expand All @@ -147,7 +150,7 @@ def run(self,
context.close()
browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))

extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
self.page.wait_for_timeout(extra_wait * 1000)

Expand Down Expand Up @@ -209,3 +212,60 @@ def run(self,
finally:
context.close()
browser.close()

def run_normal(self, browsersteps_interface):
    """
    Run normal (non-paginated) content extraction.

    Executes the configured ``webdriver_js_execute_code`` through the given
    browser-steps interface. Nothing is executed when no code is configured.

    Args:
        browsersteps_interface: Object exposing
            ``action_execute_js(value=..., selector=...)``.
    """
    # NOTE(review): the visible call site in run() appears to execute the
    # same code right before calling this method - confirm whether running
    # it a second time here is intended.
    js_code = self.webdriver_js_execute_code
    if not js_code:
        # None or an empty string: there is nothing to hand to the browser.
        return
    browsersteps_interface.action_execute_js(value=js_code, selector=None)

def run_paginated(self, url):
    """
    Run paginated content extraction.

    Steps:
      1. Execute the initial JS code (when configured) and wait for the
         page to settle.
      2. On every page:
         a. Execute the per-page JS code and collect its result.
         b. Wait for the "next page" element and click it.
         c. Stop as soon as waiting for or clicking that element times out.
      3. Write the collected results, comma separated, to a hidden input
         element with ID "cd_data" on the page reached last.

    Args:
        url: Start URL handed to the browser-steps interface.

    Raises:
        PaginatedContentMisconfigured: If the per-page JS code or the
            "next page" selector is missing or empty.
    """
    if not self.webdriver_paginated_js_execute_each_page or not self.webdriver_paginated_next_selector:
        raise PaginatedContentMisconfigured()

    import json

    from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
    from playwright._impl._errors import TimeoutError

    browsersteps_interface = steppable_browser_interface(start_url=url)
    # Reuse the page this fetcher already holds instead of opening a new one.
    browsersteps_interface.page = self.page

    # The initial script is optional; only run it when something is configured.
    if self.webdriver_js_execute_code:
        browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
    browsersteps_interface.action_wait_for_load_state(selector=None)

    page_results = []
    step_n = 1
    while True:
        logger.debug(f"Paginated content > Page {step_n}")
        result = browsersteps_interface.action_execute_js(
            value=self.webdriver_paginated_js_execute_each_page,
            selector=None,
        )
        # The per-page script may return nothing or a non-string value;
        # normalise so that joining the pages never raises TypeError.
        if result is None:
            result = ""
        elif not isinstance(result, str):
            result = json.dumps(result)
        page_results.append(result)

        try:
            next_button = browsersteps_interface.get_locator(self.webdriver_paginated_next_selector)
            next_button.wait_for()
            next_button.click()
            browsersteps_interface.action_wait_for_load_state(selector=None)
        except TimeoutError:
            # A timeout here is treated as "no further page".
            logger.debug("Paginated content > Next button could not be found")
            break
        step_n += 1

    # One comma-separated field per visited page.
    data = ",".join(page_results)

    self.page.evaluate('''(data) => {
        const el = document.createElement('input');
        el.id = 'cd_data';
        el.type = 'hidden';
        el.value = data;
        document.body.appendChild(el);
    }''', data)
5 changes: 5 additions & 0 deletions changedetectionio/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,12 @@ class processor_text_json_diff_form(commonSettingsForm):
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()])

webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])

webdriver_enable_pagination = BooleanField('Enable paginated mode', default=False)
webdriver_paginated_js_execute_each_page = TextAreaField('(Paginated) Execute JavaScript on each page', render_kw={"rows": "5"}, validators=[validators.Optional()])
webdriver_paginated_next_selector = TextAreaField('(Paginated) Next page button selector', validators=[validators.Optional()])

save_button = SubmitField('Save', render_kw={"class": "pure-button button-small pure-button-primary"})

Expand Down
3 changes: 3 additions & 0 deletions changedetectionio/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def __init__(self, *arg, **kw):
'uuid': str(uuid.uuid4()),
'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection
'webdriver_enable_pagination': False, # Run before change-detection
'webdriver_paginated_js_execute_each_page': None, # Run before change-detection
'webdriver_paginated_next_selector': None, # Run before change-detection
})

super(watch_base, self).__init__(*arg, **kw)
Expand Down
6 changes: 6 additions & 0 deletions changedetectionio/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ def call_browser(self, preferred_proxy_id=None):
if self.watch.get('webdriver_js_execute_code') is not None and self.watch.get('webdriver_js_execute_code').strip():
self.fetcher.webdriver_js_execute_code = self.watch.get('webdriver_js_execute_code')

self.fetcher.webdriver_enable_pagination = self.watch.get('webdriver_enable_pagination', False)
if self.watch.get('webdriver_paginated_js_execute_each_page') is not None and self.watch.get('webdriver_paginated_js_execute_each_page').strip():
self.fetcher.webdriver_paginated_js_execute_each_page = self.watch.get('webdriver_paginated_js_execute_each_page')
if self.watch.get('webdriver_paginated_next_selector') is not None and self.watch.get('webdriver_paginated_next_selector').strip():
self.fetcher.webdriver_paginated_next_selector = self.watch.get('webdriver_paginated_next_selector')

# Requests for PDF's, images etc should be passwd the is_binary flag
is_binary = self.watch.is_pdf

Expand Down
20 changes: 18 additions & 2 deletions changedetectionio/static/js/vis.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,39 @@ $(document).ready(function () {

// Lazy Hide/Show elements mechanism
$('[data-visible-for]').hide();

function show_related_elem(e) {
    // Build the "name=value" key of the given input and reveal every
    // element whose data-visible-for attribute lists that key.
    const $input = $(e);
    let key = $input.attr('name') + "=" + $input.val();
    // The "system" backend is mapped onto the configured default backend.
    if (key === 'fetch_backend=system') {
        key = "fetch_backend=" + default_system_fetch_backend;
    }
    $(`[data-visible-for~="${key}"]`).show();
}

function toggle_related_elem(e) {
    // Show or hide every element whose data-visible-for attribute lists
    // this input's "name=value" key, following the input's checked state.
    var n = $(e).attr('name') + "=" + $(e).val();
    if (n === 'fetch_backend=system') {
        n = "fetch_backend=" + default_system_fetch_backend;
    }
    // Drive visibility from the actual checked state rather than flipping
    // it blindly: other handlers hide all [data-visible-for] elements, so
    // a blind toggle can end up inverted relative to the checkbox.
    $(`[data-visible-for~="${n}"]`).toggle($(e).is(':checked'));
}

$(':radio').on('keyup keypress blur change click', function (e) {
$(`[data-visible-for]`).hide();
$('.advanced-options').hide();
show_related_elem(this);
});

$(':radio:checked').each(function (e) {
show_related_elem(this);
})
});

$(':checkbox').on('change', function (e) {
toggle_related_elem(this);
});
$(':checkbox:checked').each(function (e) {
show_related_elem(this);
});

// Show advanced
$('.show-advanced').click(function (e) {
Expand Down
8 changes: 8 additions & 0 deletions changedetectionio/static/styles/scss/styles.scss
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,10 @@ footer {
.pure-controls {
padding-bottom: 1em;

&.spacing-top {
padding-top: 1rem;
}

div {
margin: 0px;
}
Expand All @@ -609,6 +613,10 @@ footer {
legend {
color: var(--color-text-legend);
}

pre {
margin-bottom: 0;
}
}

/* The input fields with errors */
Expand Down
8 changes: 8 additions & 0 deletions changedetectionio/static/styles/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -925,6 +925,10 @@ footer {
.pure-form .pure-group,
.pure-form .pure-controls {
padding-bottom: 1em; }
.pure-form .pure-control-group.spacing-top,
.pure-form .pure-group.spacing-top,
.pure-form .pure-controls.spacing-top {
padding-top: 1rem; }
.pure-form .pure-control-group div,
.pure-form .pure-group div,
.pure-form .pure-controls div {
Expand All @@ -942,6 +946,10 @@ footer {
.pure-form .pure-group legend,
.pure-form .pure-controls legend {
color: var(--color-text-legend); }
.pure-form .pure-control-group pre,
.pure-form .pure-group pre,
.pure-form .pure-controls pre {
margin-bottom: 0; }
.pure-form .error input {
background-color: var(--color-error-input); }
.pure-form ul.errors {
Expand Down
3 changes: 3 additions & 0 deletions changedetectionio/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,9 @@ def add_watch(self, url, tag='', extras=None, tag_uuids=None, write_to_disk_now=
'trigger_text',
'url',
'webdriver_js_execute_code',
'webdriver_enable_pagination',
'webdriver_paginated_js_execute_each_page',
'webdriver_paginated_next_selector',
]:
if res.get(k):
if k != 'css_filter':
Expand Down
21 changes: 21 additions & 0 deletions changedetectionio/templates/edit.html
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,27 @@
href="https://github.com/dgtlmoon/changedetection.io/wiki/Run-JavaScript-before-change-detection">More
help and examples here</a>
</div>

<div class="pure-control-group spacing-top">
{{ render_checkbox_field(form.webdriver_enable_pagination) }}
<div class="pure-form-message-inline">
Running in pagination mode will extract data from each page and inject it into the last page in the following element:
<pre>&lt;input type="hidden" id="cd_data" value="&lt;extracted data here&gt;"&gt;</pre>
</div>
</div>

<div data-visible-for="webdriver_enable_pagination=y" style="display: none;">
{{ render_field(form.webdriver_paginated_js_execute_each_page) }}
<div class="pure-form-message-inline">
This code will be executed on each page to extract the data.
</div>

{{ render_field(form.webdriver_paginated_next_selector) }}
<div class="pure-form-message-inline">
This selector defines the element that should be clicked to navigate to the next page. If this element cannot be found,
the watch will interpret the current page as the last page.
</div>
</div>
</div>
</fieldset>
<!-- html requests always -->
Expand Down
10 changes: 10 additions & 0 deletions changedetectionio/update_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,16 @@ def run(self):
if e.message:
err_text = "{} - {}".format(err_text, e.message)

if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)

self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False
except content_fetchers_exceptions.PaginatedContentMisconfigured as e:
err_text = "Paginated content fetching is not configured properly. Did you fill in all fields?"

if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)

Expand Down