From 6a897b6804a3eb6885e4f6516439376e192cf722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= Date: Mon, 12 Aug 2024 11:40:10 +0200 Subject: [PATCH 1/2] feat: ignore SSL and HTTPS errors --- code/src/crawler.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/src/crawler.ts b/code/src/crawler.ts index 3533b6e..6040d12 100644 --- a/code/src/crawler.ts +++ b/code/src/crawler.ts @@ -11,8 +11,8 @@ export const createCrawler = async (config: Config) => { const crawler = new PlaywrightCrawler({ launchContext: { launchOptions: { - // TODO: Just headless - headless: true, + /** We intentionally ignore certificate errors, because some broken websites would otherwise not be scraped */ + args: ['--ignore-certificate-errors'], }, }, /** From 04007dcf2e8219bf93064eb7890a55a58e725ec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= Date: Mon, 12 Aug 2024 11:42:51 +0200 Subject: [PATCH 2/2] docs: update CHANGELOG.md --- shared/CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/shared/CHANGELOG.md b/shared/CHANGELOG.md index 8aa4ae3..4be8892 100644 --- a/shared/CHANGELOG.md +++ b/shared/CHANGELOG.md @@ -1,10 +1,11 @@ This changelog tracks updates to both GTP Scraper and Extended GPT Scraper actors. -# 2024-07-30 +# 2024-08-12 *Features* - Added support for GPT-4o-mini model. (Extended GPT scraper) - Set this model as the default one for the the *Pay Per Result* scraper with a set token limit. - With this, the maximum token limit for the *Pay Per Result* scraper was increased by 150%. +- Ignore HTTPS certificate errors, which allows the scraper to work on broken websites with invalid certificates. *Fixes* - Fixed concurrency scaling issues that were causing the Actor to fail due to scaling too quickly.