From dee78cbedf750e46cdfc7973c1b7c87ed3f07926 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Pr=C5=AF=C5=A1a?= <87543374+Patai5@users.noreply.github.com> Date: Sun, 22 Sep 2024 13:22:50 +0200 Subject: [PATCH] fix: shrink HTML with cheerio (#74) * fix: shrink HTML with cheerio * test(unit): add shrink HTML tests * docs: update changelog --- code/package-lock.json | 103 ++++++++++++++++-------------- code/package.json | 1 + code/src/processors.ts | 44 ++++--------- code/src/routes/crawl-route.ts | 2 +- code/test/unit/processors.test.ts | 35 ++++++++++ shared/CHANGELOG.md | 4 ++ 6 files changed, 108 insertions(+), 81 deletions(-) create mode 100644 code/test/unit/processors.test.ts diff --git a/code/package-lock.json b/code/package-lock.json index 981ad92..f86ad39 100644 --- a/code/package-lock.json +++ b/code/package-lock.json @@ -16,6 +16,7 @@ "ajv-formats": "^2.1.1", "apify": "^3.1.16", "apify-client": "^2.9.3", + "cheerio": "^1.0.0", "crawlee": "^3.8.1", "gpt-3-encoder": "^1.1.4", "joplin-turndown-plugin-gfm": "^1.0.12", @@ -314,24 +315,6 @@ "node": ">=16.0.0" } }, - "node_modules/@crawlee/cheerio/node_modules/htmlparser2": { - "version": "9.1.0", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", - "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", - "funding": [ - "https://github.com/fb55/htmlparser2?sponsor=1", - { - "type": "github", - "url": "https://github.com/sponsors/fb55" - } - ], - "dependencies": { - "domelementtype": "^2.3.0", - "domhandler": "^5.0.3", - "domutils": "^3.1.0", - "entities": "^4.5.0" - } - }, "node_modules/@crawlee/cli": { "version": "3.8.1", "resolved": "https://registry.npmjs.org/@crawlee/cli/-/cli-3.8.1.tgz", @@ -3048,20 +3031,25 @@ } }, "node_modules/cheerio": { - "version": "1.0.0-rc.12", - "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz", - "integrity": 
"sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0.tgz", + "integrity": "sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==", + "license": "MIT", "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", "domhandler": "^5.0.3", - "domutils": "^3.0.1", - "htmlparser2": "^8.0.1", - "parse5": "^7.0.0", - "parse5-htmlparser2-tree-adapter": "^7.0.0" + "domutils": "^3.1.0", + "encoding-sniffer": "^0.2.0", + "htmlparser2": "^9.1.0", + "parse5": "^7.1.2", + "parse5-htmlparser2-tree-adapter": "^7.0.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^6.19.5", + "whatwg-mimetype": "^4.0.0" }, "engines": { - "node": ">= 6" + "node": ">=18.17" }, "funding": { "url": "https://github.com/cheeriojs/cheerio?sponsor=1" @@ -3734,6 +3722,19 @@ "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", "dev": true }, + "node_modules/encoding-sniffer": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz", + "integrity": "sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==", + "license": "MIT", + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, "node_modules/enhanced-resolve": { "version": "5.16.0", "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.16.0.tgz", @@ -5471,9 +5472,9 @@ } }, "node_modules/htmlparser2": { - "version": "8.0.2", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", - "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "version": "9.1.0", + "resolved": 
"https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", "funding": [ "https://github.com/fb55/htmlparser2?sponsor=1", { @@ -5481,11 +5482,12 @@ "url": "https://github.com/sponsors/fb55" } ], + "license": "MIT", "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", - "domutils": "^3.0.1", - "entities": "^4.4.0" + "domutils": "^3.1.0", + "entities": "^4.5.0" } }, "node_modules/http-cache-semantics": { @@ -6587,24 +6589,6 @@ "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz", "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==" }, - "node_modules/linkedom/node_modules/htmlparser2": { - "version": "9.1.0", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", - "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", - "funding": [ - "https://github.com/fb55/htmlparser2?sponsor=1", - { - "type": "github", - "url": "https://github.com/sponsors/fb55" - } - ], - "dependencies": { - "domelementtype": "^2.3.0", - "domhandler": "^5.0.3", - "domutils": "^3.1.0", - "entities": "^4.5.0" - } - }, "node_modules/lint-staged": { "version": "15.2.2", "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-15.2.2.tgz", @@ -7794,6 +7778,18 @@ "url": "https://github.com/inikulin/parse5?sponsor=1" } }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "license": "MIT", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/path-exists": { "version": "4.0.0", "resolved": 
"https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", @@ -9271,6 +9267,15 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/undici": { + "version": "6.19.8", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.19.8.tgz", + "integrity": "sha512-U8uCCl2x9TK3WANvmBavymRzxbfFYG+tAu+fgx3zxQy3qdagQqBLwJVrdyO1TBfUXvfKveMKJZhpvUYoOjM+4g==", + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", diff --git a/code/package.json b/code/package.json index 8b64dc7..8637f45 100644 --- a/code/package.json +++ b/code/package.json @@ -13,6 +13,7 @@ "ajv-formats": "^2.1.1", "apify": "^3.1.16", "apify-client": "^2.9.3", + "cheerio": "^1.0.0", "crawlee": "^3.8.1", "gpt-3-encoder": "^1.1.4", "joplin-turndown-plugin-gfm": "^1.0.12", diff --git a/code/src/processors.ts b/code/src/processors.ts index 465137b..b36890a 100644 --- a/code/src/processors.ts +++ b/code/src/processors.ts @@ -1,5 +1,5 @@ +import { load } from 'cheerio'; import { encode } from 'gpt-3-encoder'; -import { Page } from 'playwright'; import { htmlToMarkdownProcessor } from './markdown.js'; @@ -7,45 +7,27 @@ const JSON_REGEX = /\{(?:[^{}]|())*\}/; /** * Shrinks HTML by removing css targeted elements and extra spaces - * @param html */ export const shrinkHtml = async ( html: string, - page: Page, options: { removeLinkUrls: boolean; removeElementsCssSelector?: string }, ) => { const { removeElementsCssSelector, removeLinkUrls } = options; - const stripped = await page.evaluate( - // eslint-disable-next-line @typescript-eslint/no-shadow - ([unstripped, removeSelector, removeLinkUrls]) => { - const doc = new DOMParser().parseFromString(unstripped, 'text/html'); - if (removeSelector) { - const elements = doc.querySelectorAll(removeSelector); - for (const element of elements) { - // there have been some cases when the page's own scripts cause errors 
and running this line - // causes them to reemerge, so what in try/cartch - try { - element.remove(); - } catch (err) { - /* ignore */ - } - } - } + const $ = load(html); - if (removeLinkUrls) { - const linkEls = doc.querySelectorAll('a'); - for (const linkEl of linkEls) { - linkEl.removeAttribute('href'); - } - } + if (removeElementsCssSelector) { + $(removeElementsCssSelector).map((_, el) => $(el).remove()); + } + if (removeLinkUrls) { + $('a').map((_, el) => $(el).removeAttr('href')); + } - return doc.documentElement.outerHTML; - }, - [html, removeElementsCssSelector, removeLinkUrls] as const, - ); - return stripped.replace(/\s{2,}/g, ' ') // remove extra spaces - .replace(/>\s+</g, '><'); // remove all spaces between tags + const stripped = $.html(); + return stripped + .replace(/\s{2,}/g, ' ') // remove extra spaces + .replace(/>\s+</g, '><') // remove all spaces between tags + .replace(/^<!DOCTYPE[^>]*>/i, ''); // remove doctype }; /** diff --git a/code/src/routes/crawl-route.ts b/code/src/routes/crawl-route.ts index 9d8c664..52ae5b5 100644 --- a/code/src/routes/crawl-route.ts +++ b/code/src/routes/crawl-route.ts @@ -114,7 +114,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => { originContentHtml = await page.content(); } - const shrunkHtml = await shrinkHtml(originContentHtml, page, { removeLinkUrls, removeElementsCssSelector }); + const shrunkHtml = await shrinkHtml(originContentHtml, { removeLinkUrls, removeElementsCssSelector }); const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? 
htmlToMarkdown(shrunkHtml) : shrunkHtml; const instructionTokenLength = getNumberOfTextTokens(instructions); diff --git a/code/test/unit/processors.test.ts b/code/test/unit/processors.test.ts new file mode 100644 index 0000000..fce240a --- /dev/null +++ b/code/test/unit/processors.test.ts @@ -0,0 +1,35 @@ +import { describe, expect, test } from 'vitest'; + +import { shrinkHtml } from '../../src/processors'; + +describe('shrinkHtml', () => { + test('should shrink additional spaces', async () => { + const html = ` Title1 end

text 1

`; + const result = await shrinkHtml(html, { removeLinkUrls: false }); + + expect(result).toBe(`Title1 end

text 1

`); + }); + + test('should ignore doctype', async () => { + const html = `

Test

`; + const result = await shrinkHtml(html, { removeLinkUrls: false }); + + expect(result).toBe(`

Test

`); + }); + + test('should remove link urls', async () => { + const html = `Link

Test

`; + const result = await shrinkHtml(html, { removeLinkUrls: true }); + + expect(result).toBe(`Link

Test

`); + }); + + test('should remove elements by css selector', async () => { + const html = `Link

Test1

Test2

`; + const result = await shrinkHtml(html, { removeLinkUrls: false, removeElementsCssSelector: '.remove' }); + + expect(result).toBe( + `Link

Test1

`, + ); + }); +}); diff --git a/shared/CHANGELOG.md b/shared/CHANGELOG.md index 4be8892..8850913 100644 --- a/shared/CHANGELOG.md +++ b/shared/CHANGELOG.md @@ -1,5 +1,9 @@ This changelog tracks updates to both GTP Scraper and Extended GPT Scraper actors. +# 2024-09-22 +*Fixes* +- Fixed a bug where HTML minimization was failing on some specific websites. + # 2024-08-12 *Features* - Added support for GPT-4o-mini model. (Extended GPT scraper)