diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts
index 4ed342a..6cf4207 100644
--- a/packages/gpt-scraper-core/src/crawler.ts
+++ b/packages/gpt-scraper-core/src/crawler.ts
@@ -92,7 +92,16 @@ export const createCrawler = async ({ input }: { input: Input }) => {
                 }
             },
         ],
-
+        postNavigationHooks: [
+            async ({ page }) => {
+                // see https://github.com/apify/crawlee/issues/2314
+                // solves client-side redirects through meta tags
+                await page.waitForSelector('body', {
+                    state: 'attached',
+                    timeout: 60_000,
+                });
+            },
+        ],
         async requestHandler({ request, page, enqueueLinks, closeCookieModals }) {
             const { depth = 0 } = request.userData;
             const state = await crawler.useState(DEFAULT_STATE);
diff --git a/packages/gpt-scraper-core/src/models/openai.ts b/packages/gpt-scraper-core/src/models/openai.ts
index 432a969..d93fbc4 100644
--- a/packages/gpt-scraper-core/src/models/openai.ts
+++ b/packages/gpt-scraper-core/src/models/openai.ts
@@ -3,8 +3,7 @@ import { log, sleep } from 'crawlee';
 import { ChatOpenAI } from 'langchain/chat_models/openai';
 import { OpenAI } from 'langchain/llms/openai';
 import { LLMResult } from 'langchain/schema';
-import { REPETITIVE_PROMPT_ERROR_MESSAGE } from '../errors.js';
-import { NonRetryableOpenaiAPIError, OpenaiAPIError, OpenaiAPIErrorToExitActor, RateLimitedError } from '../errors.js';
+import { NonRetryableOpenaiAPIError, OpenaiAPIError, OpenaiAPIErrorToExitActor, RateLimitedError, REPETITIVE_PROMPT_ERROR_MESSAGE } from '../errors.js';
 import { tryToParseJsonFromString } from '../processors.js';
 import { ProcessInstructionsOptions } from '../types/model.js';
 import { OpenAIModelSettings } from '../types/models.js';
diff --git a/packages/gpt-scraper-core/src/processors.ts b/packages/gpt-scraper-core/src/processors.ts
index 7d3a662..bc7c43c 100644
--- a/packages/gpt-scraper-core/src/processors.ts
+++ b/packages/gpt-scraper-core/src/processors.ts
@@ -15,7 +15,13 @@ export const shrinkHtml = async (html: string, page: Page, removeElementsCssSele
         if (removeSelector) {
             const elements = doc.querySelectorAll(removeSelector);
             for (const element of elements) {
-                element.remove();
+                // there have been cases where the page's own scripts cause errors, and running this line
+                // makes them re-emerge, so wrap it in try/catch
+                try {
+                    element.remove();
+                } catch (err) {
+                    /* ignore */
+                }
             }
         }
         return doc.documentElement.outerHTML;
@@ -34,7 +40,7 @@ export const htmlToMarkdown = (html: string) => {
     return htmlToMarkdownProcessor.turndown(html);
 };
 
-const chunkText = (text:string, maxLength: number) => {
+const chunkText = (text: string, maxLength: number) => {
     const numChunks = Math.ceil(text.length / maxLength);
     const chunks = new Array(numChunks);
 
diff --git a/shared/CHANGELOG.md b/shared/CHANGELOG.md
index 7a25865..52d6b63 100644
--- a/shared/CHANGELOG.md
+++ b/shared/CHANGELOG.md
@@ -1,5 +1,9 @@
 This changelog tracks updates to both GTP Scraper and Extended GPT Scraper actors.
 
+### 2023-01-31
+*Fixes*
+- Fixed a bug where the scraper would fail on sites that contain erroneous JavaScript.
+
 ### 2023-01-26
 *Fixes*
 - Fixed "max pages per run" not working correctly on specific websites.
@@ -31,4 +35,4 @@ This changelog tracks updates to both GTP Scraper and Extended GPT Scraper actor
 
 *Changes*
 - Use LangChain to connect to GPT models. This means some error messages are different.
-- The default model `temperature` is now set to `0` instead of `1`. This should improve the reliability of scraping. While this is technically a breaking change, it should mostly behave as an improvement so we don't consider need to release a separate version.
\ No newline at end of file
+- The default model `temperature` is now set to `0` instead of `1`. This should improve the reliability of scraping. While this is technically a breaking change, it should mostly behave as an improvement, so we don't consider it necessary to release a separate version.