diff --git a/code/src/configuration.ts b/code/src/configuration.ts
index cc9ce4f..248a340 100644
--- a/code/src/configuration.ts
+++ b/code/src/configuration.ts
@@ -4,10 +4,12 @@ import { Cookie, RequestList, log } from 'crawlee';
 import { Page } from 'playwright';
 
 import { getModelConfigByName } from './models/models.js';
+import { LABELS } from './routes/router.js';
 import { Config } from './types/config.js';
 import { Input, PAGE_FORMAT } from './types/input.js';
 import { ModelConfig } from './types/model.js';
 import { OpenAIModelSettings } from './types/models.js';
+import { CrawlRouteUserData } from './types/user-data.js';
 
 // eslint-disable-next-line new-cap
 const ajv = new Ajv.default();
@@ -44,6 +46,10 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
     const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);
 
     const { requests } = await RequestList.open({ sources: startUrls });
+    requests.forEach((request) => {
+        request.userData = { depth: 0, startUrl: request.url } satisfies CrawlRouteUserData;
+        request.label = LABELS.CRAWL;
+    });
 
     const totalMaxItems = Number(process.env.ACTOR_MAX_PAID_DATASET_ITEMS) || Number.POSITIVE_INFINITY;
     const maxPagesPerCrawl = Math.min(input.maxPagesPerCrawl || Number.POSITIVE_INFINITY, totalMaxItems);
diff --git a/code/src/crawler.ts b/code/src/crawler.ts
index ed824c8..ecced20 100644
--- a/code/src/crawler.ts
+++ b/code/src/crawler.ts
@@ -1,17 +1,11 @@
-import {
-    Dataset,
-    NonRetryableError,
-    PlaywrightCrawler,
-    PlaywrightCrawlingContext,
-    createRequestDebugInfo,
-    log,
-} from 'crawlee';
+import { NonRetryableError, PlaywrightCrawler, PlaywrightCrawlingContext, createRequestDebugInfo } from 'crawlee';
 
 import { initialCookiesHook } from './hooks/initial-cookies.js';
 import { LABELS, router } from './routes/router.js';
 import { Config } from './types/config.js';
 import { CrawlerState } from './types/crawler-state.js';
-import { ERROR_TYPE } from './utils.js';
+import { UserData } from './types/user-data.js';
+import { ERROR_TYPE, saveErrorResult } from './utils.js';
 
 export const createCrawler = async (config: Config) => {
     const { maxPagesPerCrawl, proxyConfiguration, requests } = config;
@@ -59,23 +53,19 @@
             },
         ],
 
-        async failedRequestHandler({ request }, error: Error) {
-            if (error.name === ERROR_TYPE.LIMIT_ERROR) {
-                return;
-            }
-            const errorMessage = error.message || 'no error';
-            const url = request.loadedUrl || request.url;
-            log.error(`Request ${url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
-            if (error.name === 'UserFacedError') {
-                await Dataset.pushData({
-                    url,
-                    answer: `ERROR: ${errorMessage}`,
-                });
-                return;
-            }
-            await Dataset.pushData({
-                '#error': true,
-                '#debug': createRequestDebugInfo(request),
+        async failedRequestHandler(context, error: Error) {
+            const { request } = context;
+
+            if (error.name === ERROR_TYPE.LIMIT_ERROR) return;
+
+            const state = await crawler.useState<CrawlerState>();
+            if (state.pagesOpened >= maxPagesPerCrawl) return;
+
+            state.pagesOpened++;
+            await saveErrorResult(context as PlaywrightCrawlingContext<UserData>, {
+                error: 'failed_to_load_page',
+                errorDescription: 'The page failed to load, reaching the maximum number of retries.',
+                debugInfo: createRequestDebugInfo(request),
             });
         },
     });
diff --git a/code/src/types/user-data.ts b/code/src/types/user-data.ts
index bbba337..c1211fa 100644
--- a/code/src/types/user-data.ts
+++ b/code/src/types/user-data.ts
@@ -4,7 +4,7 @@ export type UserData = {
 
 export type CrawlRouteUserData = UserData & {
     depth?: number;
-    wasOpenedKey: string;
+    wasOpenedKey?: string;
 };
 
 export type GptRequestUserData = {
diff --git a/code/src/utils.ts b/code/src/utils.ts
index df3f172..81bb1f2 100644
--- a/code/src/utils.ts
+++ b/code/src/utils.ts
@@ -1,6 +1,8 @@
-import { GlobInput } from 'crawlee';
+import { Dictionary, GlobInput, PlaywrightCrawlingContext } from 'crawlee';
 import { minimatch } from 'minimatch';
 
+import { UserData } from './types/user-data.js';
+
 export const doesUrlMatchGlobs = (url: string, globs: GlobInput[]): boolean => {
     return globs.some((glob) => doesUrlMatchGlob(url, glob));
 };
@@ -14,3 +16,19 @@
 export enum ERROR_TYPE {
     LIMIT_ERROR = 'LimitError',
 }
+
+export const saveErrorResult = async (
+    context: PlaywrightCrawlingContext<UserData>,
+    additionalData: { error: string; errorDescription: string; debugInfo: Dictionary },
+) => {
+    const { request, crawler } = context;
+    const { startUrl } = request.userData;
+
+    const errorItem = {
+        url: request.loadedUrl || request.url,
+        startUrl,
+        ...additionalData,
+    };
+
+    await crawler.pushData(errorItem);
+};
diff --git a/shared/CHANGELOG.md b/shared/CHANGELOG.md
index fd5a4b2..665b121 100644
--- a/shared/CHANGELOG.md
+++ b/shared/CHANGELOG.md
@@ -1,5 +1,10 @@
 This changelog tracks updates to both GPT Scraper and Extended GPT Scraper actors.
 
+# 2024-11-17
+*Features*
+- Improved GPT call handling; GPT calls should now be parallelized with the crawling more effectively.
+- Added error results to the output; each error item contains the failed website URL to help with debugging and error handling.
+
 # 2024-10-07
 *Fixes*
 - Fixed initial cookies not being set correctly from input.
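Note: based on the `saveErrorResult` helper added above, an error result item pushed to the dataset should look roughly like the sketch below. The URLs are hypothetical examples, and `debugInfo` holds whatever `createRequestDebugInfo(request)` returns for the failed request.

```ts
// Sketch of one error result item, assuming a crawl started from https://example.com
// whose /pricing page exhausted its retries (example URLs are hypothetical).
const exampleErrorItem = {
    url: 'https://example.com/pricing', // request.loadedUrl || request.url
    startUrl: 'https://example.com', // carried over from request.userData
    error: 'failed_to_load_page',
    errorDescription: 'The page failed to load, reaching the maximum number of retries.',
    debugInfo: { /* output of createRequestDebugInfo(request), e.g. retry count and error messages */ },
};
```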