From 6b3c0664bf809912f9e6e3d9dbdebd23aba8132d Mon Sep 17 00:00:00 2001 From: Sviat Date: Tue, 30 Jan 2024 16:42:05 +0200 Subject: [PATCH 1/5] fix(core): use try/catch in `.evaluate()` to avoid errors --- packages/gpt-scraper-core/src/processors.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/gpt-scraper-core/src/processors.ts b/packages/gpt-scraper-core/src/processors.ts index 7d3a662..bc7c43c 100644 --- a/packages/gpt-scraper-core/src/processors.ts +++ b/packages/gpt-scraper-core/src/processors.ts @@ -15,7 +15,13 @@ export const shrinkHtml = async (html: string, page: Page, removeElementsCssSele if (removeSelector) { const elements = doc.querySelectorAll(removeSelector); for (const element of elements) { - element.remove(); + // there have been some cases when the page's own scripts cause errors and running this line + // causes them to reemerge, so we wrap it in try/catch + try { + element.remove(); + } catch (err) { + /* ignore */ + } } } return doc.documentElement.outerHTML; @@ -34,7 +40,7 @@ export const htmlToMarkdown = (html: string) => { return htmlToMarkdownProcessor.turndown(html); }; -const chunkText = (text:string, maxLength: number) => { +const chunkText = (text: string, maxLength: number) => { const numChunks = Math.ceil(text.length / maxLength); const chunks = new Array(numChunks); From fdd2bda0ef46758dfb854b83316a7aec5a5bcc05 Mon Sep 17 00:00:00 2001 From: Sviat Date: Wed, 31 Jan 2024 12:18:28 +0200 Subject: [PATCH 2/5] docs(changelog) --- shared/CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/shared/CHANGELOG.md b/shared/CHANGELOG.md index 7a25865..52d6b63 100644 --- a/shared/CHANGELOG.md +++ b/shared/CHANGELOG.md @@ -1,5 +1,9 @@ This changelog tracks updates to both GTP Scraper and Extended GPT Scraper actors. 
+### 2024-01-31 +*Fixes* +- Fixed a bug where the scraper would fail on some sites that contain erroneous JavaScript + ### 2023-01-26 *Fixes* - Fixed "max pages per run" not working correctly on specific websites. @@ -31,4 +35,4 @@ This changelog tracks updates to both GTP Scraper and Extended GPT Scraper actor *Changes* - Use LangChain to connect to GPT models. This means some error messages are different. -- The default model `temperature` is now set to `0` instead of `1`. This should improve the reliability of scraping. While this is technically a breaking change, it should mostly behave as an improvement so we don't consider need to release a separate version. \ No newline at end of file +- The default model `temperature` is now set to `0` instead of `1`. This should improve the reliability of scraping. While this is technically a breaking change, it should mostly behave as an improvement so we don't consider need to release a separate version. From 10ab10b26c56dabd06438eb0e2728aa9c8736e2e Mon Sep 17 00:00:00 2001 From: Sviat Date: Wed, 31 Jan 2024 12:20:35 +0200 Subject: [PATCH 3/5] fix: remove duplicate import lines --- packages/gpt-scraper-core/src/models/openai.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/gpt-scraper-core/src/models/openai.ts b/packages/gpt-scraper-core/src/models/openai.ts index 432a969..d93fbc4 100644 --- a/packages/gpt-scraper-core/src/models/openai.ts +++ b/packages/gpt-scraper-core/src/models/openai.ts @@ -3,8 +3,7 @@ import { log, sleep } from 'crawlee'; import { ChatOpenAI } from 'langchain/chat_models/openai'; import { OpenAI } from 'langchain/llms/openai'; import { LLMResult } from 'langchain/schema'; -import { REPETITIVE_PROMPT_ERROR_MESSAGE } from '../errors.js'; -import { NonRetryableOpenaiAPIError, OpenaiAPIError, OpenaiAPIErrorToExitActor, RateLimitedError } from '../errors.js'; +import { NonRetryableOpenaiAPIError, OpenaiAPIError, OpenaiAPIErrorToExitActor, RateLimitedError, 
REPETITIVE_PROMPT_ERROR_MESSAGE } from '../errors.js'; import { tryToParseJsonFromString } from '../processors.js'; import { ProcessInstructionsOptions } from '../types/model.js'; import { OpenAIModelSettings } from '../types/models.js'; From 072502b4cf38deb1953254c8f35319d7cf0e0f91 Mon Sep 17 00:00:00 2001 From: Sviat Date: Mon, 5 Feb 2024 15:24:58 +0200 Subject: [PATCH 4/5] fix(core): wait for navigation so that meta tag redirects work --- packages/gpt-scraper-core/src/crawler.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts index 4ed342a..5af2982 100644 --- a/packages/gpt-scraper-core/src/crawler.ts +++ b/packages/gpt-scraper-core/src/crawler.ts @@ -92,7 +92,12 @@ export const createCrawler = async ({ input }: { input: Input }) => { } }, ], - + postNavigationHooks: [ + async ({ page }) => { + // see https://github.com/apify/crawlee/issues/2314 + await page.waitForNavigation(); + }, + ], async requestHandler({ request, page, enqueueLinks, closeCookieModals }) { const { depth = 0 } = request.userData; const state = await crawler.useState(DEFAULT_STATE); From 10e6335748a985ff40901847775a83e155b093bc Mon Sep 17 00:00:00 2001 From: Sviat Date: Mon, 5 Feb 2024 15:52:28 +0200 Subject: [PATCH 5/5] fix(core): better handle meta redirects --- packages/gpt-scraper-core/src/crawler.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts index 5af2982..6cf4207 100644 --- a/packages/gpt-scraper-core/src/crawler.ts +++ b/packages/gpt-scraper-core/src/crawler.ts @@ -95,7 +95,11 @@ export const createCrawler = async ({ input }: { input: Input }) => { postNavigationHooks: [ async ({ page }) => { // see https://github.com/apify/crawlee/issues/2314 - await page.waitForNavigation(); + // will solve client-side redirects through meta tags + await page.waitForSelector('body', 
{ + state: 'attached', + timeout: 60_000, + }); }, ], async requestHandler({ request, page, enqueueLinks, closeCookieModals }) {