From b3bf66e705d431b66e5a606b32cd862605317b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= Date: Tue, 26 Dec 2023 22:15:54 +0100 Subject: [PATCH 1/5] feat: implement `skipGptGlobs` logic and input --- actors/gpt-scraper/src/main.ts | 17 +++++++++-------- package-lock.json | 2 ++ packages/gpt-scraper-core/package.json | 1 + packages/gpt-scraper-core/src/crawler.ts | 12 ++++++++++++ packages/gpt-scraper-core/src/types/input.ts | 1 + packages/gpt-scraper-core/src/utils.ts | 12 ++++++++++++ 6 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 packages/gpt-scraper-core/src/utils.ts diff --git a/actors/gpt-scraper/src/main.ts b/actors/gpt-scraper/src/main.ts index dcb105a..72d8d39 100644 --- a/actors/gpt-scraper/src/main.ts +++ b/actors/gpt-scraper/src/main.ts @@ -27,14 +27,15 @@ if (process.env.ACTOR_MAX_PAID_DATASET_ITEMS) { } if (process.env.OPENAI_API_KEY) { - const crawler = await createCrawler({ - input: { - ...input, - maxPagesPerCrawl: maxRequestsPerCrawl, - model: DEFAULT_PEY_PER_RESULT_OPENAI_MODEL, - openaiApiKey: process.env.OPENAI_API_KEY, - }, - }); + const adjustedPayPerResultInput = { + ...input, + maxPagesPerCrawl: maxRequestsPerCrawl, + skipGptGlobs: [], + model: DEFAULT_PEY_PER_RESULT_OPENAI_MODEL, + openaiApiKey: process.env.OPENAI_API_KEY, + }; + + const crawler = await createCrawler({ input: adjustedPayPerResultInput }); // We explicitly remove it so we are sure the key is only passed through params to remove double source of truth delete process.env.OPENAI_API_KEY; diff --git a/package-lock.json b/package-lock.json index 4deb17a..1c1dd8c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10289,6 +10289,7 @@ "html-to-text": "^9.0.5", "joplin-turndown-plugin-gfm": "^1.0.12", "langchain": "^0.0.197-rc.1", + "minimatch": "^9.0.3", "openai": "^3.3.0", "playwright": "*", "turndown": "^7.1.2" @@ -12486,6 +12487,7 @@ "jest": "^29.5.0", "joplin-turndown-plugin-gfm": "^1.0.12", "langchain": 
"^0.0.197-rc.1", + "minimatch": "^9.0.3", "openai": "^3.3.0", "playwright": "*", "rimraf": "^5.0.1", diff --git a/packages/gpt-scraper-core/package.json b/packages/gpt-scraper-core/package.json index 674f8c0..0a354a2 100644 --- a/packages/gpt-scraper-core/package.json +++ b/packages/gpt-scraper-core/package.json @@ -29,6 +29,7 @@ "html-to-text": "^9.0.5", "joplin-turndown-plugin-gfm": "^1.0.12", "langchain": "^0.0.197-rc.1", + "minimatch": "^9.0.3", "openai": "^3.3.0", "playwright": "*", "turndown": "^7.1.2" diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts index cd2c4e1..055a108 100644 --- a/packages/gpt-scraper-core/src/crawler.ts +++ b/packages/gpt-scraper-core/src/crawler.ts @@ -11,6 +11,7 @@ import { Input, PAGE_FORMAT } from './types/input.js'; import { parseInput } from './input.js'; import { OpenaiAPIError } from './errors.js'; import { OpenAIModelSettings } from './types/models.js'; +import { doesUrlMatchGlobs } from './utils.js'; interface State { pageOutputted: number; @@ -113,6 +114,11 @@ export const createCrawler = async ({ input }: { input: Input }) => { ); } + const skipGptProcessing = shouldSkipGptProcessing(url, input.skipGptGlobs); + if (skipGptProcessing) { + return log.info(`Skipping page '${url}' from GPT processing, crawling only.`); + } + // A function to be evaluated by Playwright within the browser context. 
let originContentHtml; if (input.targetSelector) { @@ -248,3 +254,9 @@ export const createCrawler = async ({ input }: { input: Input }) => { return crawler; }; + +const shouldSkipGptProcessing = (url: string, skipGptGlobs: Input['skipGptGlobs']): boolean => { + if (!skipGptGlobs) return false; + + return doesUrlMatchGlobs(url, skipGptGlobs); +}; diff --git a/packages/gpt-scraper-core/src/types/input.ts b/packages/gpt-scraper-core/src/types/input.ts index 44db4f0..fe4f7b8 100644 --- a/packages/gpt-scraper-core/src/types/input.ts +++ b/packages/gpt-scraper-core/src/types/input.ts @@ -20,6 +20,7 @@ export interface Input extends OpenAIModelSettings { useStructureOutput?: boolean; pageFormatInRequest?: PAGE_FORMAT; saveSnapshots?: boolean; + skipGptGlobs?: GlobInput[]; } export const HTML_TAGS_TO_IGNORE = ['script', 'style', 'noscript']; diff --git a/packages/gpt-scraper-core/src/utils.ts b/packages/gpt-scraper-core/src/utils.ts new file mode 100644 index 0000000..bf2dc61 --- /dev/null +++ b/packages/gpt-scraper-core/src/utils.ts @@ -0,0 +1,12 @@ +import { GlobInput } from 'crawlee'; +import { minimatch } from 'minimatch'; + +export const doesUrlMatchGlobs = (url: string, globs: GlobInput[]): boolean => { + return globs.some((glob) => doesUrlMatchGlob(url, glob)); +}; + +const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => { + const globString = typeof glob === 'string' ? 
glob : glob.glob; + + return minimatch(url, globString, { nocase: true }); +}; From 1e3245ea62b3a0f579c45a4c3953a7a501eea7a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= Date: Tue, 26 Dec 2023 22:32:14 +0100 Subject: [PATCH 2/5] docs: add `skipGptGlobs` to input_schema.json --- actors/extended-gpt-scraper/.actor/input_schema.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/actors/extended-gpt-scraper/.actor/input_schema.json b/actors/extended-gpt-scraper/.actor/input_schema.json index 0256224..3d144fc 100644 --- a/actors/extended-gpt-scraper/.actor/input_schema.json +++ b/actors/extended-gpt-scraper/.actor/input_schema.json @@ -74,6 +74,14 @@ "default": 10, "unit": "pages" }, + "skipGptGlobs": { + "title": "Skip GPT processing for Globs", + "type": "array", + "description": "This setting allows you to specify certain page URLs to skip GPT instructions for. Pages matching these glob patterns will only be crawled for links, excluding them from GPT processing. Useful for intermediary pages used for navigation or undesired content.", + "editor": "globs", + "default": [], + "prefill": [] + }, "useStructureOutput": { "sectionCaption": "Formatted output", "sectionDescription": "By default, the scraper outputs text answers for each page. If you want to get data in a structured format, you can define a JSON schema. The scraper uses [function](https://platform.openai.com/docs/api-reference/chat/create#chat/create-functions), which is called for each page. 
The function receives the page content and returns the answer in the defined JSON format.", From b8e76d89f4f786aea6d668b619cc30c53379aa79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= Date: Thu, 4 Jan 2024 18:04:24 +0100 Subject: [PATCH 3/5] refactor: fix PR remarks --- actors/gpt-scraper/src/main.ts | 1 + packages/gpt-scraper-core/src/crawler.ts | 10 ++-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/actors/gpt-scraper/src/main.ts b/actors/gpt-scraper/src/main.ts index 72d8d39..27035ad 100644 --- a/actors/gpt-scraper/src/main.ts +++ b/actors/gpt-scraper/src/main.ts @@ -27,6 +27,7 @@ if (process.env.ACTOR_MAX_PAID_DATASET_ITEMS) { } if (process.env.OPENAI_API_KEY) { + /** Input for the PPR Actor. We explicitly override some of these values, so that users don't change them. */ const adjustedPayPerResultInput = { ...input, maxPagesPerCrawl: maxRequestsPerCrawl, diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts index 055a108..dacf2bb 100644 --- a/packages/gpt-scraper-core/src/crawler.ts +++ b/packages/gpt-scraper-core/src/crawler.ts @@ -114,9 +114,9 @@ export const createCrawler = async ({ input }: { input: Input }) => { ); } - const skipGptProcessing = shouldSkipGptProcessing(url, input.skipGptGlobs); + const skipGptProcessing = input.skipGptGlobs && doesUrlMatchGlobs(url, input.skipGptGlobs); if (skipGptProcessing) { - return log.info(`Skipping page '${url}' from GPT processing, crawling only.`); + log.info(`Skipping page from GPT processing because it matched 'skipGptGlobs', crawling only.`, { url }); } // A function to be evaluated by Playwright within the browser context. 
@@ -254,9 +254,3 @@ export const createCrawler = async ({ input }: { input: Input }) => { return crawler; }; - -const shouldSkipGptProcessing = (url: string, skipGptGlobs: Input['skipGptGlobs']): boolean => { - if (!skipGptGlobs) return false; - - return doesUrlMatchGlobs(url, skipGptGlobs); -}; From c7ba5718ec14e0233256f2eae2bd32c9fa35e472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= Date: Thu, 4 Jan 2024 18:08:08 +0100 Subject: [PATCH 4/5] refactor: fix linting --- packages/gpt-scraper-core/src/crawler.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/gpt-scraper-core/src/crawler.ts b/packages/gpt-scraper-core/src/crawler.ts index 577e4a2..0859a93 100644 --- a/packages/gpt-scraper-core/src/crawler.ts +++ b/packages/gpt-scraper-core/src/crawler.ts @@ -7,9 +7,8 @@ import addFormats from 'ajv-formats'; import { getModelByName } from './models/models.js'; import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from './processors.js'; import { Input, PAGE_FORMAT } from './types/input.js'; -import { parseInput } from './input.js'; -import { NonRetryableOpenaiAPIError } from './errors.js'; import { parseInput, validateInput, validateInputCssSelectors } from './input.js'; +import { NonRetryableOpenaiAPIError } from './errors.js'; import { OpenAIModelSettings } from './types/models.js'; import { doesUrlMatchGlobs } from './utils.js'; From 171f1b2f630a783400255d5c2739d3574504fcf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= Date: Thu, 4 Jan 2024 18:10:02 +0100 Subject: [PATCH 5/5] fix: incorrect merge --- packages/gpt-scraper-core/src/types/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/gpt-scraper-core/src/types/index.ts b/packages/gpt-scraper-core/src/types/index.ts index 593f251..4106c4b 100644 --- a/packages/gpt-scraper-core/src/types/index.ts +++ b/packages/gpt-scraper-core/src/types/index.ts @@ -1,2 +1,2 
@@ -export { HTML_TAGS_TO_IGNORE, type Input, PAGE_FORMAT } from './input.js'; +export { type Input, PAGE_FORMAT } from './input.js'; export { type OpenAIModelSettings } from './models.js';