fix: add +1 check for max pages and improve the counter keeping #47

Merged
1 commit merged on Jan 10, 2024
29 changes: 25 additions & 4 deletions packages/gpt-scraper-core/src/crawler.ts
@@ -10,7 +10,7 @@
import { parseInput, validateInput, validateInputCssSelectors } from './input.js';
import { NonRetryableOpenaiAPIError } from './errors.js';
import { OpenAIModelSettings } from './types/models.js';
-import { doesUrlMatchGlobs } from './utils.js';
+import { doesUrlMatchGlobs, ERROR_TYPE } from './utils.js';

interface State {
pagesOpened: number;
@@ -33,7 +33,7 @@
addFormats(validator);
validator.compile(schema);
return schema;
} catch (e: any) {

[GitHub Actions / lint warning on line 36 in packages/gpt-scraper-core/src/crawler.ts: Unexpected any. Specify a different type]
log.error(`Schema is not valid: ${e.message}`, { error: e });
await Actor.fail('Schema is not valid. Go to Actor run log, '
+ 'where you can find error details or disable "Use JSON schema to format answer" option.');
@@ -87,7 +87,7 @@
const state = await crawler.useState<State>(DEFAULT_STATE);
if (state.pagesOpened >= input.maxPagesPerCrawl) {
const err = new NonRetryableError('Skipping this page');
-err.name = 'LimitError';
+err.name = ERROR_TYPE.LIMIT_ERROR;
throw err;
}
},
@@ -97,7 +97,27 @@
const { depth = 0 } = request.userData;
const state = await crawler.useState<State>(DEFAULT_STATE);
const isFirstPage = state.pagesOpened === 0;
-state.pagesOpened++;
+// explicitly check by the request key whether this request has already updated the counters,
+// so that requests inheriting userData via `...userData` do not skip the check by accident
+if (request.userData.wasOpenedKey !== request.uniqueKey) {
+    if (state.pagesOpened >= input.maxPagesPerCrawl) {
+        // the check in the preNavigationHook helps to prevent extra requests,
+        // but because the counters are incremented only later, in a different async function,
+        // a race condition may occur when multiple pages are opened at the same time;
+        // repeating the check here, synchronously just before updating the counters,
+        // ensures that this race condition is avoided
+        const err = new NonRetryableError('Skipping this page');
+        err.name = ERROR_TYPE.LIMIT_ERROR;
+        throw err;
+    }
+    // only increment this counter once for each page (guaranteed by the outer `if`);
+    // also, do not increment in the preNavigationHook, because the page might fail to open
+    // and should not be counted before successful navigation
+    state.pagesOpened++;
+    // this flag marks the request as already counted, so the limit checks above ignore
+    // retries of a page that was already allowed to open
+    request.userData.wasOpenedKey = request.uniqueKey;
+}
const url = request.loadedUrl || request.url;

if (isFirstPage) await validateInputCssSelectors(input, page);
@@ -117,6 +137,7 @@
userData: {
depth: depth + 1,
},
+limit: input.maxPagesPerCrawl - state.pagesOpened,
});
const enqueuedLinks = processedRequests.filter(({ wasAlreadyPresent }) => !wasAlreadyPresent);
const alreadyPresentLinksCount = processedRequests.length - enqueuedLinks.length;
@@ -197,7 +218,7 @@
answer = answerResult.answer;
jsonAnswer = answerResult.jsonAnswer;
model.updateApiCallUsage(answerResult.usage);
} catch (error: any) {

[GitHub Actions / lint warning on line 221 in packages/gpt-scraper-core/src/crawler.ts: Unexpected any. Specify a different type]
if (error instanceof NonRetryableOpenaiAPIError) {
throw await Actor.fail(error.message);
}
@@ -240,7 +261,7 @@
},

async failedRequestHandler({ request }, error: Error) {
-if (error.name === 'LimitError') {
+if (error.name === ERROR_TYPE.LIMIT_ERROR) {
return;
}
const errorMessage = error.message || 'no error';
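The crawler.ts changes above boil down to a double-check pattern: the page limit is re-checked synchronously right before `pagesOpened` is incremented, and a per-request `wasOpenedKey` flag keeps retries (and requests inheriting `userData`) from being counted twice. A minimal standalone sketch of that pattern, outside Crawlee and with hypothetical names (`State`, `FakeRequest`, `handlePage`, a local `maxPagesPerCrawl` constant), could look like this:

// Minimal sketch (not the actual Crawlee-based code) of the counter double-check used above.
// `State`, `FakeRequest`, `maxPagesPerCrawl`, and `handlePage` are illustrative stand-ins.
enum ERROR_TYPE {
    LIMIT_ERROR = 'LimitError',
}

interface State {
    pagesOpened: number;
}

interface FakeRequest {
    uniqueKey: string;
    userData: Record<string, unknown>;
}

const state: State = { pagesOpened: 0 };
const maxPagesPerCrawl = 5;

const handlePage = (request: FakeRequest): void => {
    // skip the bookkeeping for requests that were already counted (e.g. retries of the same page)
    if (request.userData.wasOpenedKey !== request.uniqueKey) {
        // synchronous re-check: an earlier async check may have raced with other concurrently opened pages
        if (state.pagesOpened >= maxPagesPerCrawl) {
            const err = new Error('Skipping this page');
            err.name = ERROR_TYPE.LIMIT_ERROR;
            throw err;
        }
        state.pagesOpened++;
        // remember that this exact request was counted, so retries do not count it again
        request.userData.wasOpenedKey = request.uniqueKey;
    }
    // ... scrape the page ...
};

The important property is that the check and the increment happen in the same synchronous block, so two concurrently handled pages cannot both slip past the limit before either one updates the counter.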
5 changes: 4 additions & 1 deletion packages/gpt-scraper-core/src/input.ts
@@ -6,7 +6,7 @@ import { Input } from './types/input';
/**
* Parses the Actor input. Throws an Actor fail if the input is invalid.
*/
-export const parseInput = async (input: Input) => {
+export const parseInput = async (input: Input): Promise<Input> => {
// OpenAI defaults to 1, but we want the crawlers to be deterministic
const temperatureOptions = { default: 0, range: { min: 0, max: 2 } };
const temperature = await parseNumberInRange(input.temperature, 'temperature', temperatureOptions);
@@ -22,6 +22,9 @@ export const parseInput = async (input: Input) => {

return {
...input,
+// replace 0 (unlimited) with a very high number, because these values are used in arithmetic and comparisons
+maxPagesPerCrawl: input.maxPagesPerCrawl || 999999,
+maxCrawlingDepth: input.maxCrawlingDepth || 999999,
temperature,
topP,
frequencyPenalty,
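In input.ts, the 0-means-unlimited convention is normalized to a large sentinel before the value is ever used, so comparisons like `state.pagesOpened >= input.maxPagesPerCrawl` and the `limit: input.maxPagesPerCrawl - state.pagesOpened` passed to `enqueueLinks` stay well defined. A rough sketch of that normalization, with hypothetical `RawInput`/`parseLimits` names rather than the Actor's real types:

// Rough sketch of the normalization idea; `RawInput`, `ParsedInput`, and `parseLimits`
// are hypothetical names, not the Actor's real types.
interface RawInput {
    maxPagesPerCrawl?: number; // 0 or undefined means "unlimited"
    maxCrawlingDepth?: number;
}

type ParsedInput = Required<RawInput>;

const UNLIMITED = 999999;

const parseLimits = (input: RawInput): ParsedInput => ({
    // `|| UNLIMITED` turns both 0 and undefined into the sentinel, so later
    // comparisons and subtractions against these limits remain meaningful
    maxPagesPerCrawl: input.maxPagesPerCrawl || UNLIMITED,
    maxCrawlingDepth: input.maxCrawlingDepth || UNLIMITED,
});

console.log(parseLimits({ maxPagesPerCrawl: 0 })); // { maxPagesPerCrawl: 999999, maxCrawlingDepth: 999999 }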
4 changes: 4 additions & 0 deletions packages/gpt-scraper-core/src/utils.ts
@@ -10,3 +10,7 @@ const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => {

return minimatch(url, globString, { nocase: true });
};

+export enum ERROR_TYPE {
+    LIMIT_ERROR = 'LimitError',
+}