diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts
index d782a28..8756f57 100644
--- a/packages/gpt-scraper-core/src/crawler.ts
+++ b/packages/gpt-scraper-core/src/crawler.ts
@@ -10,7 +10,7 @@ import { Input, PAGE_FORMAT } from './types/input.js';
 import { parseInput, validateInput, validateInputCssSelectors } from './input.js';
 import { NonRetryableOpenaiAPIError } from './errors.js';
 import { OpenAIModelSettings } from './types/models.js';
-import { doesUrlMatchGlobs } from './utils.js';
+import { doesUrlMatchGlobs, ERROR_TYPE } from './utils.js';
 
 interface State {
     pagesOpened: number;
@@ -87,7 +87,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
             const state = await crawler.useState<State>(DEFAULT_STATE);
             if (state.pagesOpened >= input.maxPagesPerCrawl) {
                 const err = new NonRetryableError('Skipping this page');
-                err.name = 'LimitError';
+                err.name = ERROR_TYPE.LIMIT_ERROR;
                 throw err;
             }
         },
@@ -97,7 +97,27 @@ export const createCrawler = async ({ input }: { input: Input }) => {
             const { depth = 0 } = request.userData;
             const state = await crawler.useState<State>(DEFAULT_STATE);
             const isFirstPage = state.pagesOpened === 0;
-            state.pagesOpened++;
+            // explicitly check (by the request key) whether this request has already updated the counters,
+            // so that requests inheriting userData via `...userData` cannot pass the check by accident
+            if (request.userData.wasOpenedKey !== request.uniqueKey) {
+                if (state.pagesOpened >= input.maxPagesPerCrawl) {
+                    // the check in the preNavigationHook helps to prevent extra requests,
+                    // but because the counters are incremented later, in a different async function,
+                    // a race condition may occur when multiple pages are opened at the same time;
+                    // double-checking here, synchronously and right before touching the counters,
+                    // ensures that this race condition is avoided
+                    const err = new NonRetryableError('Skipping this page');
+                    err.name = ERROR_TYPE.LIMIT_ERROR;
+                    throw err;
+                }
+                // increment the counter only once per page (guaranteed by the outer `if`);
+                // also, do not increment in the preNavigationHook: the page might not exist,
+                // and it should not be counted before a successful navigation
+                state.pagesOpened++;
+                // this flag makes the limit checks skip a page that was already allowed to open,
+                // so the `pagesOpened` counter handles possible retries correctly
+                request.userData.wasOpenedKey = request.uniqueKey;
+            }
 
             const url = request.loadedUrl || request.url;
             if (isFirstPage) await validateInputCssSelectors(input, page);
@@ -117,6 +137,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
                 userData: {
                     depth: depth + 1,
                 },
+                limit: input.maxPagesPerCrawl - state.pagesOpened,
             });
             const enqueuedLinks = processedRequests.filter(({ wasAlreadyPresent }) => !wasAlreadyPresent);
             const alreadyPresentLinksCount = processedRequests.length - enqueuedLinks.length;
@@ -240,7 +261,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         },
 
         async failedRequestHandler({ request }, error: Error) {
-            if (error.name === 'LimitError') {
+            if (error.name === ERROR_TYPE.LIMIT_ERROR) {
                 return;
             }
             const errorMessage = error.message || 'no error';
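Reviewer note: the guard added to requestHandler above can be read in isolation. Below is a minimal TypeScript sketch of the same check-then-count pattern outside of Crawlee; CrawlState, CrawlRequest, PageLimitError, and markPageOpened are illustrative names, not part of this diff.

```ts
// Standalone sketch of the guard in requestHandler above (illustrative names).
interface CrawlState {
    pagesOpened: number;
}

interface CrawlRequest {
    uniqueKey: string;
    userData: { wasOpenedKey?: string };
}

class PageLimitError extends Error {
    name = 'LimitError'; // mirrors ERROR_TYPE.LIMIT_ERROR
}

const markPageOpened = (state: CrawlState, request: CrawlRequest, maxPages: number): void => {
    // a request that already passed through the counter (e.g. a retry) is let through
    if (request.userData.wasOpenedKey === request.uniqueKey) return;
    // the limit is re-checked synchronously, immediately before the increment,
    // so two handlers racing through their async hooks cannot both slip past it
    if (state.pagesOpened >= maxPages) throw new PageLimitError('Skipping this page');
    state.pagesOpened++;
    request.userData.wasOpenedKey = request.uniqueKey;
};
```

Because JavaScript is single-threaded, a check performed with no `await` between it and the increment cannot be interleaved with another handler's increment, which is what makes the double check sufficient.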
diff --git a/packages/gpt-scraper-core/src/input.ts b/packages/gpt-scraper-core/src/input.ts
index 2b7864c..ebaed28 100644
--- a/packages/gpt-scraper-core/src/input.ts
+++ b/packages/gpt-scraper-core/src/input.ts
@@ -6,7 +6,7 @@ import { Input } from './types/input';
 /**
  * Parses the Actor input. Fails the Actor run if the input is invalid.
  */
-export const parseInput = async (input: Input) => {
+export const parseInput = async (input: Input): Promise<Input> => {
     // OpenAI defaults to 1, but we want the crawlers to be deterministic
     const temperatureOptions = { default: 0, range: { min: 0, max: 2 } };
     const temperature = await parseNumberInRange(input.temperature, 'temperature', temperatureOptions);
@@ -22,6 +22,9 @@
 
     return {
         ...input,
+        // make sure to change 0 (unlimited) to a very high number, because these values are used in arithmetic and comparisons
+        maxPagesPerCrawl: input.maxPagesPerCrawl || 999999,
+        maxCrawlingDepth: input.maxCrawlingDepth || 999999,
         temperature,
         topP,
         frequencyPenalty,
diff --git a/packages/gpt-scraper-core/src/utils.ts b/packages/gpt-scraper-core/src/utils.ts
index bf2dc61..df3f172 100644
--- a/packages/gpt-scraper-core/src/utils.ts
+++ b/packages/gpt-scraper-core/src/utils.ts
@@ -10,3 +10,7 @@ const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => {
 
     return minimatch(url, globString, { nocase: true });
 };
+
+export enum ERROR_TYPE {
+    LIMIT_ERROR = 'LimitError',
+}
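Reviewer note: a rough usage sketch (not part of the diff) showing the effect of the new limit normalization in parseInput; the input literal is abridged and cast, since the full Input shape is not shown here.

```ts
import { parseInput } from './input';
import { Input } from './types/input';

// 0 means "unlimited" in the Actor input; parseInput replaces it with a high
// sentinel so that expressions such as `input.maxPagesPerCrawl - state.pagesOpened`
// and the `>=` comparisons against `state.pagesOpened` stay well-defined
const parsed = await parseInput({
    startUrls: [{ url: 'https://example.com' }],
    instructions: 'Summarize the page',
    maxPagesPerCrawl: 0,
} as unknown as Input); // cast because this sketch fills in only a few fields

console.log(parsed.maxPagesPerCrawl); // 999999
console.log(parsed.maxCrawlingDepth); // 999999 (an undefined depth is normalized the same way)
```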