diff --git a/actors/extended-gpt-scraper/.actor/input_schema.json b/actors/extended-gpt-scraper/.actor/input_schema.json index 724397b..c3efd9b 100644 --- a/actors/extended-gpt-scraper/.actor/input_schema.json +++ b/actors/extended-gpt-scraper/.actor/input_schema.json @@ -98,6 +98,14 @@ "default": 10, "unit": "pages" }, + "skipGptGlobs": { + "title": "Skip GPT processing for Globs", + "type": "array", + "description": "This setting allows you to specify certain page URLs to skip GPT instructions for. Pages matching these glob patterns will only be crawled for links, excluding them from GPT processing. Useful for intermediary pages used for navigation or undesired content.", + "editor": "globs", + "default": [], + "prefill": [] + }, "useStructureOutput": { "sectionCaption": "Formatted output", "sectionDescription": "By default, the scraper outputs text answers for each page. If you want to get data in a structured format, you can define a JSON schema. The scraper uses [function](https://platform.openai.com/docs/api-reference/chat/create#chat/create-functions), which is called for each page. The function receives the page content and returns the answer in the defined JSON format.", diff --git a/actors/gpt-scraper/src/main.ts b/actors/gpt-scraper/src/main.ts index 0b1ad13..fb9f702 100644 --- a/actors/gpt-scraper/src/main.ts +++ b/actors/gpt-scraper/src/main.ts @@ -29,14 +29,16 @@ if (process.env.ACTOR_MAX_PAID_DATASET_ITEMS) { await updateDeprecatedInput(input); if (process.env.OPENAI_API_KEY) { - const crawler = await createCrawler({ - input: { - ...input, - maxPagesPerCrawl: maxRequestsPerCrawl, - model: DEFAULT_PEY_PER_RESULT_OPENAI_MODEL, - openaiApiKey: process.env.OPENAI_API_KEY, - }, - }); + /** Input for the PPR Actor. We explicitly override some of these values, so that users don't change them. 
*/ + const adjustedPayPerResultInput = { + ...input, + maxPagesPerCrawl: maxRequestsPerCrawl, + skipGptGlobs: [], + model: DEFAULT_PEY_PER_RESULT_OPENAI_MODEL, + openaiApiKey: process.env.OPENAI_API_KEY, + }; + + const crawler = await createCrawler({ input: adjustedPayPerResultInput }); // We explicitly remove it so we are sure the key is only passed through params to remove double source of truth delete process.env.OPENAI_API_KEY; diff --git a/package-lock.json b/package-lock.json index e03c389..8eb3ad6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10316,6 +10316,7 @@ "html-to-text": "^9.0.5", "joplin-turndown-plugin-gfm": "^1.0.12", "langchain": "^0.0.209", + "minimatch": "^9.0.3", "openai": "^3.3.0", "playwright": "*", "turndown": "^7.1.2" @@ -12649,6 +12650,7 @@ "jest": "^29.5.0", "joplin-turndown-plugin-gfm": "^1.0.12", "langchain": "^0.0.209", + "minimatch": "^9.0.3", "openai": "^3.3.0", "playwright": "*", "rimraf": "^5.0.1", diff --git a/packages/gpt-scraper-core/package.json b/packages/gpt-scraper-core/package.json index 478288e..1a976ad 100644 --- a/packages/gpt-scraper-core/package.json +++ b/packages/gpt-scraper-core/package.json @@ -27,6 +27,7 @@ "gpt-3-encoder": "^1.1.4", "joplin-turndown-plugin-gfm": "^1.0.12", "langchain": "^0.0.209", + "minimatch": "^9.0.3", "openai": "^3.3.0", "playwright": "*", "turndown": "^7.1.2" diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts index e7e5bd9..0859a93 100644 --- a/packages/gpt-scraper-core/src/crawler.ts +++ b/packages/gpt-scraper-core/src/crawler.ts @@ -7,10 +7,10 @@ import addFormats from 'ajv-formats'; import { getModelByName } from './models/models.js'; import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from './processors.js'; import { Input, PAGE_FORMAT } from './types/input.js'; -import { parseInput } from './input.js'; -import { NonRetryableOpenaiAPIError } from './errors.js'; import { parseInput, 
validateInput, validateInputCssSelectors } from './input.js'; +import { NonRetryableOpenaiAPIError } from './errors.js'; import { OpenAIModelSettings } from './types/models.js'; +import { doesUrlMatchGlobs } from './utils.js'; interface State { pageOutputted: number; @@ -118,6 +118,13 @@ export const createCrawler = async ({ input }: { input: Input }) => { ); } + const skipGptProcessing = input.skipGptGlobs && doesUrlMatchGlobs(url, input.skipGptGlobs); + if (skipGptProcessing) { + log.info(`Skipping page from GPT processing because it matched 'skipGptGlobs', crawling only.`, { url }); + /* Stop handling this page here so it is never passed on to GPT processing. NOTE(review): assumes link enqueueing for this page happens earlier in the handler - confirm. */ + return; + } + // A function to be evaluated by Playwright within the browser context. let originContentHtml; if (input.targetSelector) { diff --git a/packages/gpt-scraper-core/src/types/index.ts b/packages/gpt-scraper-core/src/types/index.ts index 593f251..4106c4b 100644 --- a/packages/gpt-scraper-core/src/types/index.ts +++ b/packages/gpt-scraper-core/src/types/index.ts @@ -1,2 +1,2 @@ -export { HTML_TAGS_TO_IGNORE, type Input, PAGE_FORMAT } from './input.js'; +export { type Input, PAGE_FORMAT } from './input.js'; export { type OpenAIModelSettings } from './models.js'; diff --git a/packages/gpt-scraper-core/src/types/input.ts b/packages/gpt-scraper-core/src/types/input.ts index 3f9cbdc..60096d9 100644 --- a/packages/gpt-scraper-core/src/types/input.ts +++ b/packages/gpt-scraper-core/src/types/input.ts @@ -21,6 +21,7 @@ export interface Input extends OpenAIModelSettings { useStructureOutput?: boolean; pageFormatInRequest?: PAGE_FORMAT; saveSnapshots?: boolean; + skipGptGlobs?: GlobInput[]; initialCookies?: Cookie[]; removeElementsCssSelector?: string; } diff --git a/packages/gpt-scraper-core/src/utils.ts b/packages/gpt-scraper-core/src/utils.ts new file mode 100644 index 0000000..bf2dc61 --- /dev/null +++ b/packages/gpt-scraper-core/src/utils.ts @@ -0,0 +1,22 @@ +import { GlobInput } from 'crawlee'; +import { minimatch } from 'minimatch'; + +/** + * Checks whether a URL matches at least one of the given glob patterns. + * + * @param url - URL to test. + * @param globs - Patterns to test against; each is either a plain string or an object holding the pattern in its `glob` property. + * @returns `true` if any pattern matches, `false` otherwise (including for an empty list). + */ +export const doesUrlMatchGlobs = (url: string, 
globs: GlobInput[]): boolean => { + return globs.some((glob) => doesUrlMatchGlob(url, glob)); +}; + +/** + * Matches a URL against a single glob pattern using `minimatch`, ignoring letter case (`nocase: true`). + */ +const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => { + const globString = typeof glob === 'string' ? glob : glob.glob; + + return minimatch(url, globString, { nocase: true }); +};