diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7621a50..4f189b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+# [v8.2.0](https://github.com/coder-hxl/x-crawl/compare/v8.1.1...v8.2.0) (2023-09-07)
+
+### 🚀 Features
+
+- Added the crawlHTML API for crawling static HTML pages.
+
+---
+
+### 🚀 特征
+
+- 新增 crawlHTML API ,用于爬取静态 HTML 页面。
+
 # [v8.1.1](https://github.com/coder-hxl/x-crawl/compare/v8.1.0...v8.1.1) (2023-09-01)
 
 ### 🐞 Bug fixes
diff --git a/README.md b/README.md
index 09938b3..ec344d6 100644
--- a/README.md
+++ b/README.md
@@ -9,16 +9,15 @@ x-crawl is a flexible Node.js multifunctional crawler library. Flexible usage an
 ## Features
 
 - **🔥 Asynchronous Synchronous** - Just change the mode property to toggle asynchronous or synchronous crawling mode.
-- **⚙️ Multiple purposes** - It can crawl pages, crawl interfaces, crawl files and poll crawls to meet the needs of various scenarios.
-- **☁️ Crawl SPA** - Crawl SPA (Single Page Application) to generate pre-rendered content (aka "SSR" (Server Side Rendering)).
-- **⚒️ Control Page** - Automate form submission, UI testing, keyboard input, event manipulation, open browser, etc.
+- **⚙️ Multiple Uses** - Supports crawling dynamic pages, static pages, interface data, files and polling operations.
+- **⚒️ Control Page** - Crawling dynamic pages supports automated operations, keyboard input, event operations, etc.
 - **🖋️ Flexible writing style** - The same crawling API can be adapted to multiple configurations, and each configuration method is very unique.
 - **⏱️ Interval Crawling** - No interval, fixed interval and random interval to generate or avoid high concurrent crawling.
 - **🔄 Failed Retry** - Avoid crawling failure due to short-term problems, and customize the number of retries.
 - **➡️ Proxy Rotation** - Auto-rotate proxies with failure retry, custom error times and HTTP status codes.
 - **👀 Device Fingerprinting** - Zero configuration or custom configuration, avoid fingerprinting to identify and track us from different locations.
 - **🚀 Priority Queue** - According to the priority of a single crawling target, it can be crawled ahead of other targets.
-- **🧾 Capture Record** - Capture and record crawling, and use colored strings to remind in the terminal.
+- **🧾 Crawl Log** - Records each crawl, with colored-string reminders in the terminal.
 - **🦾 TypeScript** - Own types, implement complete types through generics.
 
 ## Sponsor
@@ -41,12 +40,15 @@ x-crawl is an open source project under the MIT license, completely free to use.
     - [life Cycle](#life-cycle)
       - [onCrawlItemComplete](#oncrawlitemcomplete)
     - [Open Browser](#open-browser)
-  - [Crawl Interface](#crawl-interface)
+  - [Crawl HTML](#crawl-html)
     - [life Cycle](#life-cycle-1)
      - [onCrawlItemComplete](#oncrawlitemcomplete-1)
-  - [Crawl Files](#crawl-files)
-    - [life Cycle](#life-cycle)
+  - [Crawl Interface](#crawl-interface)
+    - [life Cycle](#life-cycle-2)
       - [onCrawlItemComplete](#oncrawlitemcomplete-2)
+  - [Crawl Files](#crawl-files)
+    - [life Cycle](#life-cycle-3)
+      - [onCrawlItemComplete](#oncrawlitemcomplete-3)
     - [onBeforeSaveItemFile](#onbeforesaveitemfile)
   - [Start Polling](#start-polling)
   - [Config Priority](#config-priority)
@@ -69,33 +71,43 @@ x-crawl is an open source project under the MIT license, completely free to use.
      - [Detailed target config - CrawlPageDetailTargetConfig](#detailed-target-config---crawlpagedetailtargetconfig)
       - [Mixed target array config - (string | CrawlPageDetailTargetConfig)[]](#mixed-target-array-config---string--crawlpagedetailtargetconfig)
       - [Advanced config - CrawlPageAdvancedConfig](#advanced-config---crawlpageadvancedconfig)
-  - [crawlData](#crawldata)
+  - [crawlHTML](#crawlhtml)
     - [Type](#type-2)
     - [Example](#example-3)
     - [Config](#config-1)
       - [Simple target config - string](#simple-target-config---string-1)
+      - [Detailed target config - CrawlHTMLDetailTargetConfig](#detailed-target-config---crawlhtmldetailtargetconfig)
+      - [Mixed target array config - (string | CrawlHTMLDetailTargetConfig)[]](#mixed-target-array-config---string--crawlhtmldetailtargetconfig)
+      - [Advanced config - CrawlHTMLAdvancedConfig](#advanced-config---crawlhtmladvancedconfig)
+  - [crawlData](#crawldata)
+    - [Type](#type-3)
+    - [Example](#example-4)
+    - [Config](#config-2)
+      - [Simple target config - string](#simple-target-config---string-2)
       - [Detailed target config - CrawlDataDetailTargetConfig](#detailed-target-config---crawldatadetailtargetconfig)
       - [Mixed target array config - (string | CrawlDataDetailTargetConfig)[]](#mixed-target-array-config---string--crawldatadetailtargetconfig)
       - [Advanced config - CrawlDataAdvancedConfig](#advanced-config---crawldataadvancedconfig)
   - [crawlFile](#crawlfile)
-    - [Type](#type-3)
-    - [Example](#example-4)
-    - [Config](#config-2)
+    - [Type](#type-4)
+    - [Example](#example-5)
+    - [Config](#config-3)
       - [Detailed target config - CrawlFileDetailTargetConfig](#detailed-target-config---crawlfiledetailtargetconfig)
       - [Detailed target array config - CrawlFileDetailTargetConfig[]](#detailed-target-array-config---crawlfiledetailtargetconfig)
       - [Advanced config - CrawlFileAdvancedConfig](#advanced-config---crawlfileadvancedconfig)
   - [crawlPolling](#crawlpolling)
-    - [Type](#type-4)
-    - [Example](#example-5)
+    - [Type](#type-5)
+    - [Example](#example-6)
 - [Types](#types)
   - [API Config](#api-config)
     - [XCrawlConfig](#xcrawlconfig)
     - [Detail Target Config](#detail-target-config)
       - [CrawlPageDetailTargetConfig](#crawlpagedetailtargetconfig)
+      - [CrawlHTMLDetailTargetConfig](#crawlhtmldetailtargetconfig)
       - [CrawlDataDetailTargetConfig](#crawldatadetailtargetconfig)
       - [CrawlFileDetailTargetConfig](#crawlfiledetailtargetconfig)
     - [Advanced Config](#advanced-config)
       - [CrawlPageAdvancedConfig](#crawlpageadvancedconfig)
+      - [CrawlHTMLAdvancedConfig](#crawlhtmladvancedconfig)
       - [CrawlDataAdvancedConfig](#crawldataadvancedconfig)
       - [CrawlFileAdvancedConfig](#crawlfileadvancedconfig)
     - [StartPollingConfig](#startpollingconfig)
@@ -111,6 +123,7 @@ x-crawl is an open source project under the MIT license, completely free to use.
     - [XCrawlInstance](#xcrawlinstance)
     - [CrawlCommonResult](#crawlcommonresult)
     - [CrawlPageSingleResult](#crawlpagesingleresult)
+    - [CrawlHTMLSingleResult](#crawlhtmlsingleresult)
     - [CrawlDataSingleResult](#crawldatasingleresult)
     - [CrawlFileSingleResult](#crawlfilesingleresult)
   - [API Other](#api-other)
@@ -337,6 +350,35 @@ const myXCrawl = xCrawl({
 myXCrawl.crawlPage('https://www.example.com').then((res) => {})
 ```
+
+### Crawl HTML
+
+Crawl static HTML pages through [crawlHTML()](#crawlHTML).
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })
+
+myXCrawl
+  .crawlHTML([
+    'https://www.example.com/html-1',
+    'https://www.example.com/html-2'
+  ])
+  .then((res) => {
+    // deal with
+  })
+```
+
+#### life Cycle
+
+Life cycle functions owned by the crawlHTML API:
+
+- onCrawlItemComplete: Called back when each crawl target completes
+
+##### onCrawlItemComplete
+
+In the onCrawlItemComplete function, you can get the result of each crawled target in advance.
+
 ### Crawl Interface
 
 Crawl interface data through [crawlData()](#crawlData) .
@@ -759,7 +801,7 @@ Each crawl target will generate a detail object, which will contain the followin
 
 If it is a specific configuration, it will automatically determine whether the details object is stored in an array according to the configuration method you choose, and return the array, otherwise return the details object. Already fits types perfectly in TypeScript.
 
-Details about configuration methods and results are as follows: [crawlPage config](#config), [crawlData config](#config-1), [crawlFile config](#config-2).
+Details about configuration methods and results are as follows: [crawlPage config](#config), [crawlHTML config](#config-1), [crawlData config](#config-2), [crawlFile config](#config-3).
 
 ### TypeScript
 
@@ -771,7 +813,7 @@ x-crawl itself is written in TypeScript and supports TypeScript. Comes with a ty
 
 ### xCrawl
 
-Create a crawler instance via call xCrawl. The crawl target queue is maintained by the instance method itself, not by the instance itself.
+Create a crawler instance by calling xCrawl. The crawl target queue is maintained internally by each instance method, not by the instance itself.
 
 #### Type
 
@@ -806,7 +848,7 @@ const myXCrawl = xCrawl({
 
 ### crawlPage
 
-crawlPage is the method of the crawler instance, usually used to crawl page.
+crawlPage is a method of the crawler instance, usually used to crawl a dynamic page.
 
 #### Type
 
@@ -955,6 +997,157 @@ More configuration options can view [CrawlPageAdvancedConfig](#CrawlPageAdvanced
 
 More information about the results can be found at [About results](#About-results) , which can be selected according to the actual situation.
 
+### crawlHTML
+
+crawlHTML is a method of the crawler instance, usually used to crawl static HTML pages.
+
+#### Type
+
+The crawlHTML API is a function. Its type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) that can be called (in terms of types) with different configuration parameters.
+
+```ts
+type crawlHTML = {
+  (
+    config: string,
+    callback?: (res: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  (
+    config: CrawlHTMLDetailTargetConfig,
+    callback?: (res: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  (
+    config: (string | CrawlHTMLDetailTargetConfig)[],
+    callback?: (res: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+
+  (
+    config: CrawlHTMLAdvancedConfig,
+    callback?: (res: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+}
+```
+
+**Parameter Type:**
+
+- Look at the [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig) type
+- Look at the [CrawlHTMLAdvancedConfig](#CrawlHTMLAdvancedConfig) type
+
+**Return value type:**
+
+- Look at the [CrawlHTMLSingleResult](#CrawlHTMLSingleResult) type
+
+#### Example
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+// crawlHTML API
+myXCrawl.crawlHTML('https://www.example.com').then((res) => {
+  // res.data contains the statusCode, headers and html of the crawled page
+})
+```
+
+#### Config
+
+There are 4 types:
+
+- Simple target config - string
+- Detailed target config - CrawlHTMLDetailTargetConfig
+- Mixed target array config - (string | CrawlHTMLDetailTargetConfig)[]
+- Advanced config - CrawlHTMLAdvancedConfig
+
+##### Simple target config - string
+
+This is a simple target configuration. If you just want to simply crawl this static HTML page, you can try this way of writing:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl.crawlHTML('https://www.example.com').then((res) => {})
+```
+
+The res you get will be an object.
+
+##### Detailed target config - CrawlHTMLDetailTargetConfig
+
+This is the detailed target configuration. If you want to crawl this static HTML page and need to retry on failure, you can try this way of writing:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML({
+    url: 'https://www.example.com',
+    proxy: { urls: ['xxx'] },
+    maxRetry: 1
+  })
+  .then((res) => {})
+```
+
+The res you get will be an object.
+
+More configuration options can view [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig).
+
+##### Mixed target array config - (string | CrawlHTMLDetailTargetConfig)[]
+
+This is a mixed target array configuration. If you want to crawl multiple static HTML pages, and some of them need failure retries, you can try this way of writing:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML([
+    'https://www.example.com/page-1',
+    { url: 'https://www.example.com/page-2', maxRetry: 2 }
+  ])
+  .then((res) => {})
+```
+
+The res you get will be an array of objects.
+
+More configuration options can view [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig).
+
+##### Advanced config - CrawlHTMLAdvancedConfig
+
+This is an advanced configuration, where targets is a mixed target array configuration. If you want to crawl multiple static HTML pages without repeating the crawl target configuration (proxy, headers, retry, etc.) for each one, and also need interval time, device fingerprint, lifecycle, and so on, try this:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML({
+    targets: [
+      'https://www.example.com/page-1',
+      { url: 'https://www.example.com/page-2', maxRetry: 6 }
+    ],
+    intervalTime: { max: 3000, min: 1000 },
+    headers: { cookie: 'xxx' },
+    maxRetry: 1
+  })
+  .then((res) => {})
+```
+
+The res you get will be an array of objects.
+
+More configuration options can view [CrawlHTMLAdvancedConfig](#CrawlHTMLAdvancedConfig).
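+
+As a closing illustration, here is a minimal sketch of consuming the returned html string. The `<title>` extraction below is only an illustrative assumption, not part of the API:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl.crawlHTML('https://www.example.com').then((res) => {
+  // res.data is null when the crawl fails, so guard before using it
+  if (res.isSuccess && res.data) {
+    // Pull the page title out of the raw HTML (illustrative only)
+    const match = res.data.html.match(/<title>(.*?)<\/title>/)
+    console.log(match ? match[1] : 'no title found')
+  }
+})
+```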
+
+More information about the results can be found at [About results](#About-results) , which can be selected according to the actual situation.
+
 ### crawlData
 
 crawlData is the method of the crawler instance, which is usually used to crawl APIs to obtain JSON data and so on.
@@ -1342,6 +1535,24 @@ export interface CrawlPageDetailTargetConfig extends CrawlCommonConfig {
 - viewport: undefined
 - fingerprint: undefined
 
+##### CrawlHTMLDetailTargetConfig
+
+```ts
+export interface CrawlHTMLDetailTargetConfig extends CrawlCommonConfig {
+  url: string
+  headers?: AnyObject | null
+  priority?: number
+  fingerprint?: DetailTargetFingerprintCommon | null
+}
+```
+
+**Default Value**
+
+- url: undefined
+- headers: undefined
+- priority: undefined
+- fingerprint: undefined
+
 ##### CrawlDataDetailTargetConfig
 
 ```ts
@@ -1423,6 +1634,28 @@ export interface CrawlPageAdvancedConfig extends CrawlCommonConfig {
 - viewport: undefined
 - onCrawlItemComplete: undefined
 
+##### CrawlHTMLAdvancedConfig
+
+```ts
+export interface CrawlHTMLAdvancedConfig extends CrawlCommonConfig {
+  targets: (string | CrawlHTMLDetailTargetConfig)[]
+  intervalTime?: IntervalTime
+  fingerprints?: DetailTargetFingerprintCommon[]
+
+  headers?: AnyObject
+
+  onCrawlItemComplete?: (crawlHTMLSingleResult: CrawlHTMLSingleResult) => void
+}
+```
+
+**Default Value**
+
+- targets: undefined
+- intervalTime: undefined
+- fingerprints: undefined
+- headers: undefined
+- onCrawlItemComplete: undefined
+
 ##### CrawlDataAdvancedConfig
 
 ```ts
@@ -1642,6 +1875,28 @@ export interface XCrawlInstance {
     ): Promise<CrawlPageSingleResult[]>
   }
 
+  crawlHTML: {
+    (
+      config: string,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: CrawlHTMLDetailTargetConfig,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: (string | CrawlHTMLDetailTargetConfig)[],
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+
+    (
+      config: CrawlHTMLAdvancedConfig,
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+  }
+
   crawlData: {
     (
       config: CrawlDataDetailTargetConfig,
@@ -1720,6 +1975,18 @@ export interface CrawlPageSingleResult extends CrawlCommonResult {
 }
 ```
 
+#### CrawlHTMLSingleResult
+
+```ts
+export interface CrawlHTMLSingleResult extends CrawlCommonResult {
+  data: {
+    statusCode: number | undefined
+    headers: IncomingHttpHeaders
+    html: string
+  } | null
+}
+```
+
 #### CrawlDataSingleResult
 
 ```ts
diff --git a/docs/cn.md b/docs/cn.md
index 1501bb2..5f7b790 100644
--- a/docs/cn.md
+++ b/docs/cn.md
@@ -9,16 +9,15 @@ x-crawl 是一个灵活的 Node.js 多功能爬虫库。灵活的使用方式和
 ## 特征
 
 - **🔥 异步同步** - 只需更改一下 mode 属性即可切换异步或同步爬取模式。
-- **⚙️ 多种用途** - 可爬页面、爬接口、爬文件以及轮询爬,满足各种场景需求。
-- **☁️ 爬取 SPA** - 爬取 SPA(单页应用程序)生成预渲染内容(即“SSR”(服务器端渲染))。
-- **⚒️ 控制页面** - 自动化表单提交、UI 测试、键盘输入、事件操作、打开浏览器等。
+- **⚙️ 多种用途** - 支持爬动态页面、静态页面、接口数据、文件以及轮询操作。
+- **⚒️ 控制页面** - 爬取动态页面支持自动化操作、键盘输入、事件操作等。
 - **🖋️ 写法灵活** - 同种爬取 API 适配多种配置,每种配置方式都非常独特。
 - **⏱️ 间隔爬取** - 无间隔、固定间隔以及随机间隔,产生或避免高并发爬取。
 - **🔄 失败重试** - 避免因短暂的问题而造成爬取失败,自定义重试次数。
 - **➡️ 轮换代理** - 配合失败重试,自定义错误次数以及 HTTP 状态码自动轮换代理。
 - **👀 设备指纹** - 零配置或自定义配置,避免指纹识别从不同位置识别并跟踪我们。
 - **🚀 优先队列** - 根据单个爬取目标的优先级可以优先于其他目标提前爬取。
-- **🧾 捕获记录** - 对爬取进行捕获记录,并在终端使用彩色字符串提醒。
+- **🧾 爬取记录** - 对爬取进行记录,并在终端使用彩色字符串提醒。
 - **🦾 TypeScript** - 拥有类型,通过泛型实现完整的类型。
 
 ## 赞助
@@ -41,12 +40,15 @@ x-crawl 是采用 MIT 许可的开源项目,使用完全免费。如果你在
     - [生命周期](#生命周期)
       - [onCrawlItemComplete](#onCrawlItemComplete)
     - [打开浏览器](#打开浏览器)
-  - [爬取接口](#爬取接口)
+  - [爬取 HTML](#爬取-html)
     - [生命周期](#生命周期-1)
       - [onCrawlItemComplete](#onCrawlItemComplete-1)
-  - [爬取文件](#爬取文件)
+  - [爬取接口](#爬取接口)
+    - [生命周期](#生命周期-2)
       - [onCrawlItemComplete](#onCrawlItemComplete-2)
+  - [爬取文件](#爬取文件)
+    - [生命周期](#生命周期-3)
+      - [onCrawlItemComplete](#onCrawlItemComplete-3)
     - [onBeforeSaveItemFile](#onBeforeSaveItemFile)
   - [启动轮询](#启动轮询)
   - [配置优先级](#配置优先级)
@@ -69,34 +71,43 @@ x-crawl 是采用 MIT 许可的开源项目,使用完全免费。如果你在
       - [详细目标配置 - CrawlPageDetailTargetConfig](#详细目标配置---CrawlPageDetailTargetConfig)
       - [混合目标数组配置 - (string | CrawlPageDetailTargetConfig)[]](#混合目标数组配置---string--CrawlPageDetailTargetConfig)
       - [进阶配置 - CrawlPageAdvancedConfig](#进阶配置---CrawlPageAdvancedConfig)
-  - [crawlData](#crawlData)
+  - [crawlHTML](#crawlHTML)
     - [类型](#类型-2)
     - [示例](#示例-3)
     - [配置](#配置-1)
       - [简单目标配置 - string](#简单目标配置---string-1)
+      - [详细目标配置 - CrawlHTMLDetailTargetConfig](#详细目标配置---CrawlHTMLDetailTargetConfig)
+      - [混合目标数组配置 - (string | CrawlHTMLDetailTargetConfig)[]](#混合目标数组配置---string--CrawlHTMLDetailTargetConfig)
+      - [进阶配置 - CrawlHTMLAdvancedConfig](#进阶配置---CrawlHTMLAdvancedConfig)
+  - [crawlData](#crawlData)
+    - [类型](#类型-3)
+    - [示例](#示例-4)
+    - [配置](#配置-2)
+      - [简单目标配置 - string](#简单目标配置---string-2)
       - [详细目标配置 - CrawlDataDetailTargetConfig](#详细目标配置---CrawlDataDetailTargetConfig)
       - [混合目标数组配置 - (string | CrawlDataDetailTargetConfig)[]](#混合目标数组配置---string--CrawlDataDetailTargetConfig)
       - [进阶配置 - CrawlDataAdvancedConfig](#进阶配置---CrawlDataAdvancedConfig)
   - [crawlFile](#crawlFile)
-    - [类型](#类型-3)
-    - [示例](#示例-4)
-    - [配置](#配置-2)
+    - [类型](#类型-4)
+    - [示例](#示例-5)
+    - [配置](#配置-3)
       - [详细目标配置 - CrawlFileDetailTargetConfig](#详细目标配置---CrawlFileDetailTargetConfig)
       - [详细目标数组配置 - CrawlFileDetailTargetConfig[]](#详细目标数组配置---CrawlFileDetailTargetConfig)
       - [进阶配置 - CrawlFileAdvancedConfig](#进阶配置---CrawlFileAdvancedConfig)
   - [startPolling](#startPolling)
-    - [类型](#类型-4)
-    - [示例](#示例-5)
-- [类型](#类型-5)
+    - [类型](#类型-5)
+    - [示例](#示例-6)
+- [类型](#类型-6)
 - [API Config](#API-config)
   - [XCrawlConfig](#XCrawlConfig)
   - [Detail Target Config](#Detail-Target-Config)
     - [CrawlPageDetailTargetConfig](#CrawlPageDetailTargetConfig)
+    - [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig)
     - [CrawlDataDetailTargetConfig](#CrawlDataDetailTargetConfig)
     - [CrawlFileDetailTargetConfig](#CrawlFileDetailTargetConfig)
   - [Advanced Config](#Advanced-Config)
     - [CrawlPageAdvancedConfig](#CrawlPageAdvancedConfig)
+    - [CrawlHTMLAdvancedConfig](#CrawlHTMLAdvancedConfig)
     - [CrawlDataAdvancedConfig](#CrawlDataAdvancedConfig)
     - [CrawlFileAdvancedConfig](#CrawlFileAdvancedConfig)
   - [StartPollingConfig](#StartPollingConfig)
@@ -112,6 +123,7 @@ x-crawl 是采用 MIT 许可的开源项目,使用完全免费。如果你在
   - [XCrawlInstance](#XCrawlInstance)
   - [CrawlCommonResult](#CrawlCommonResult)
   - [CrawlPageSingleResult](#CrawlPageSingleResult)
+  - [CrawlHTMLSingleResult](#CrawlHTMLSingleResult)
   - [CrawlDataSingleResult](#CrawlDataSingleResult)
   - [CrawlFileSingleResult](#CrawlFileSingleResult)
   - [API Other](#API-Other)
@@ -335,6 +347,35 @@ const myXCrawl = xCrawl({
 myXCrawl.crawlPage('https://www.example.com').then((res) => {})
 ```
+
+### 爬取 HTML
+
+通过 [crawlHTML()](#crawlHTML) 爬取静态 HTML。
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })
+
+myXCrawl
+  .crawlHTML([
+    'https://www.example.com/html-1',
+    'https://www.example.com/html-2'
+  ])
+  .then((res) => {
+    // 处理
+  })
+```
+
+#### 生命周期
+
+crawlHTML API 拥有的生命周期函数:
+
+- onCrawlItemComplete: 当每个爬取目标完成后会回调
+
+##### onCrawlItemComplete
+
+在 onCrawlItemComplete 函数中你可以提前拿到每次爬取目标的结果。
+
 ### 爬取接口
 
 通过 [crawlData()](#crawlData) 爬取接口数据。
@@ -754,7 +795,7 @@ priority 属性的值越大就在当前爬取队列中越优先。
 
 如果是特定的配置,会自动根据你选用的配置方式决定详情对象是否存放在一个数组中,并把该数组返回,否则返回详情对象。已经在 TypeScript 中类型完美适配。
-相关的配置方式和结果详情查看:[crawlPage 配置](#配置)、[crawlData 配置](#配置-1)、[crawlFile 配置](#配置-2) 。
+相关的配置方式和结果详情查看:[crawlPage 配置](#配置)、[crawlHTML 配置](#配置-1)、[crawlData 配置](#配置-2)、[crawlFile 配置](#配置-3) 。
 
 ### TypeScript
 
@@ -766,7 +807,7 @@ x-crawl 本身就是用 TypeScript 编写的,并对 TypeScript 提供了支持
 
 ### xCrawl
 
-通过调用 xCrawl 创建一个爬虫实例。爬取目标是由实例方法内部自己维护,并非由实例自己维护。
+通过调用 xCrawl 创建一个爬虫实例。爬取目标是由实例方法内部维护,并非由实例维护。
 
 #### 类型
 
@@ -799,7 +840,7 @@ const myXCrawl = xCrawl({
 
 ### crawlPage
 
-crawlPage 是爬虫实例的方法,通常用于爬取页面。
+crawlPage 是爬虫实例的方法,通常用于爬取动态页面。
 
 #### 类型
 
@@ -948,6 +989,152 @@ myXCrawl
 
 关于结果的更多信息可查看 [关于结果](#关于结果) ,可以根据实际情况选用即可。
 
+### crawlHTML
+
+crawlHTML 是爬虫实例的方法,通常用于爬取静态 HTML 页面。
+
+#### 类型
+
+crawlHTML API 是一个函数。类型是 [重载函数](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads),可以通过不同的配置参数调用该函数(在类型方面)。
+
+```ts
+type crawlHTML = {
+  (
+    config: string,
+    callback?: (res: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  (
+    config: CrawlHTMLDetailTargetConfig,
+    callback?: (res: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  (
+    config: (string | CrawlHTMLDetailTargetConfig)[],
+    callback?: (res: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+
+  (
+    config: CrawlHTMLAdvancedConfig,
+    callback?: (res: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+}
+```
+
+**参数类型:**
+
+- 查看 [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig) 类型
+- 查看 [CrawlHTMLAdvancedConfig](#CrawlHTMLAdvancedConfig) 类型
+
+**返回值类型:**
+
+- 查看 [CrawlHTMLSingleResult](#CrawlHTMLSingleResult) 类型
+
+#### 示例
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+// crawlHTML API
+myXCrawl.crawlHTML('https://www.example.com').then((res) => {})
+```
+
+#### 配置
+
+一共有 4 种:
+
+- 简单目标配置 - string
+- 详细目标配置 - CrawlHTMLDetailTargetConfig
+- 混合目标数组配置 - (string | CrawlHTMLDetailTargetConfig)[]
+- 进阶配置 - CrawlHTMLAdvancedConfig
+
+##### 简单目标配置 - string
+
+这是简单目标配置。如果你只想单纯爬一下这个静态 HTML 页面,可以试试这种写法:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl.crawlHTML('https://www.example.com').then((res) => {})
+```
+
+拿到的 res 将是一个对象。
+
+##### 详细目标配置 - CrawlHTMLDetailTargetConfig
+
+这是详细目标配置。如果你想爬一下这个静态 HTML 页面,并且需要失败重试之类的,可以试试这种写法:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML({
+    url: 'https://www.example.com',
+    proxy: { urls: ['xxx'] },
+    maxRetry: 1
+  })
+  .then((res) => {})
+```
+
+拿到的 res 将是一个对象。
+
+更多配置选项可以查看 [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig) 。
+
+##### 混合目标数组配置 - (string | CrawlHTMLDetailTargetConfig)[]
+
+这是混合目标数组配置。如果你想爬取多个静态 HTML 页面,并且有些静态 HTML 页面需要失败重试之类的,可以试试这种写法:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML([
+    'https://www.example.com/page-1',
+    { url: 'https://www.example.com/page-2', maxRetry: 2 }
+  ])
+  .then((res) => {})
+```
+
+拿到的 res 将是一个数组,里面是对象。
+
+更多配置选项可以查看 [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig) 。
+
+##### 进阶配置 - CrawlHTMLAdvancedConfig
+
+这是进阶配置,targets 是混合目标数组配置。如果你想爬取多个静态 HTML 页面,并且爬取目标配置(proxy、headers、重试等等)不想重复写,还需要间隔时间、设备指纹以及生命周期等等,可以试试这种写法:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML({
+    targets: [
+      'https://www.example.com/page-1',
+      { url: 'https://www.example.com/page-2', maxRetry: 6 }
+    ],
+    intervalTime: { max: 3000, min: 1000 },
+    headers: { cookie: 'xxx' },
+    maxRetry: 1
+  })
+  .then((res) => {})
+```
+
+拿到的 res 将是一个数组,里面是对象。
+
+更多配置选项可以查看 [CrawlHTMLAdvancedConfig](#CrawlHTMLAdvancedConfig) 。
+
+关于结果的更多信息可查看 [关于结果](#关于结果) ,可以根据实际情况选用即可。
+
 ### crawlData
 
 crawlData 是爬虫实例的方法,通常用于爬取 API ,可获取 JSON 数据等等。
@@ -1336,6 +1523,24 @@ export interface CrawlPageDetailTargetConfig extends CrawlCommonConfig {
 
 - viewport: undefined
 - fingerprint: undefined
 
+##### CrawlHTMLDetailTargetConfig
+
+```ts
+export interface CrawlHTMLDetailTargetConfig extends CrawlCommonConfig {
+  url: string
+  headers?: AnyObject | null
+  priority?: number
+  fingerprint?: DetailTargetFingerprintCommon | null
+}
+```
+
+**默认值**
+
+- url: undefined
+- headers: undefined
+- priority: undefined
+- fingerprint: undefined
+
 ##### CrawlDataDetailTargetConfig
 
 ```ts
@@ -1417,6 +1622,28 @@ export interface CrawlPageAdvancedConfig extends CrawlCommonConfig {
 - viewport: undefined
 - onCrawlItemComplete: undefined
 
+##### CrawlHTMLAdvancedConfig
+
+```ts
+export interface CrawlHTMLAdvancedConfig extends CrawlCommonConfig {
+  targets: (string | CrawlHTMLDetailTargetConfig)[]
+  intervalTime?: IntervalTime
+  fingerprints?: DetailTargetFingerprintCommon[]
+
+  headers?: AnyObject
+
+  onCrawlItemComplete?: (crawlHTMLSingleResult: CrawlHTMLSingleResult) => void
+}
+```
+
+**默认值**
+
+- targets: undefined
+- intervalTime: undefined
+- fingerprints: undefined
+- headers: undefined
+- onCrawlItemComplete: undefined
+
 ##### CrawlDataAdvancedConfig
 
 ```ts
@@ -1636,6 +1863,28 @@ export interface XCrawlInstance {
     ): Promise<CrawlPageSingleResult[]>
   }
 
+  crawlHTML: {
+    (
+      config: string,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: CrawlHTMLDetailTargetConfig,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: (string | CrawlHTMLDetailTargetConfig)[],
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+
+    (
+      config: CrawlHTMLAdvancedConfig,
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+  }
+
   crawlData: {
     (
       config: CrawlDataDetailTargetConfig,
@@ -1714,6 +1963,18 @@ export interface CrawlPageSingleResult extends CrawlCommonResult {
 }
 ```
 
+#### CrawlHTMLSingleResult
+
+```ts
+export interface CrawlHTMLSingleResult extends CrawlCommonResult {
+  data: {
+    statusCode: number | undefined
+    headers: IncomingHttpHeaders
+    html: string
+  } | null
+}
+```
+
 #### CrawlDataSingleResult
 
 ```ts
diff --git a/package.json b/package.json
index 17b7b4a..b2d281e 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "private": true,
   "name": "x-crawl",
-  "version": "8.1.1",
+  "version": "8.2.0",
   "author": "coderHXL",
   "description": "x-crawl is a flexible Node.js multifunctional crawler library.",
   "license": "MIT",
@@ -22,7 +22,7 @@
     "build-dts": "tsc && prettier --write ./publish/src",
     "build-strict": "pnpm test-dev && pnpm build && pnpm test-pro",
     "start": "rollup --config script/start.mjs",
-    "start-server": "rollup --config script/server.mjs",
+    "start-server": "rollup --watch --config script/server.mjs",
     "test-dev": "jest test/environment/test.ts dev",
     "test-pro": "jest test/environment/test.ts pro",
     "test-crawlPage": "jest test/environment/api/crawlPage.test.ts dev",
diff --git a/publish/README.md b/publish/README.md
index b21d476..52587e8 100644
--- a/publish/README.md
+++ b/publish/README.md
@@ -9,16 +9,15 @@ x-crawl is a flexible Node.js multifunctional crawler library. Flexible usage an
 ## Features
 
 - **🔥 Asynchronous Synchronous** - Just change the mode property to toggle asynchronous or synchronous crawling mode.
-- **⚙️ Multiple purposes** - It can crawl pages, crawl interfaces, crawl files and poll crawls to meet the needs of various scenarios.
-- **☁️ Crawl SPA** - Crawl SPA (Single Page Application) to generate pre-rendered content (aka "SSR" (Server Side Rendering)).
-- **⚒️ Control Page** - Automate form submission, UI testing, keyboard input, event manipulation, open browser, etc.
+- **⚙️ Multiple Uses** - Supports crawling dynamic pages, static pages, interface data, files and polling operations.
+- **⚒️ Control Page** - Crawling dynamic pages supports automated operations, keyboard input, event operations, etc.
 - **🖋️ Flexible writing style** - The same crawling API can be adapted to multiple configurations, and each configuration method is very unique.
 - **⏱️ Interval Crawling** - No interval, fixed interval and random interval to generate or avoid high concurrent crawling.
 - **🔄 Failed Retry** - Avoid crawling failure due to short-term problems, and customize the number of retries.
 - **➡️ Proxy Rotation** - Auto-rotate proxies with failure retry, custom error times and HTTP status codes.
 - **👀 Device Fingerprinting** - Zero configuration or custom configuration, avoid fingerprinting to identify and track us from different locations.
 - **🚀 Priority Queue** - According to the priority of a single crawling target, it can be crawled ahead of other targets.
-- **🧾 Capture Record** - Capture and record crawling, and use colored strings to remind in the terminal.
+- **🧾 Crawl Log** - Records each crawl, with colored-string reminders in the terminal.
 - **🦾 TypeScript** - Own types, implement complete types through generics.
 
 ## Sponsor
@@ -27,100 +26,114 @@ x-crawl is an open source project under the MIT license, completely free to use.
 
 # Table of Contents
 
-- [Install](#Install)
-- [Example](#Example)
-- [Core Concepts](#Core-Concepts)
-  - [Create Application](#Create-Application)
-    - [An Example of a Crawler Application](#An-Example-of-a-Crawler-Application)
-    - [Crawl Mode](#Crawl-Mode)
-    - [Default Device Fingerprint](#Default-Device-Fingerprint)
-    - [Multiple Crawler Application Anstances](#Multiple-Crawler-Application-Instances)
-  - [Crawl Page](#Crawl-Page)
-    - [Browser Instance](#Browser-Instance)
-    - [Page Instance](#Page-Instance)
-    - [life Cycle](#life-Cycle)
-      - [onCrawlItemComplete](#onCrawlItemComplete)
-    - [Open Browser](#Open-Browser)
-  - [Crawl Interface](#Crawl-Interface)
-    - [life Cycle](#life-Cycle-1)
-      - [onCrawlItemComplete](#onCrawlItemComplete-1)
-  - [Crawl Files](#Crawl-Files)
-    - [life Cycle](#life-Cycle)
-      - [onCrawlItemComplete](#onCrawlItemComplete-2)
-    - [onBeforeSaveItemFile](#onBeforeSaveItemFile)
-  - [Start Polling](#Start-Polling)
-  - [Config Priority](#Config-Priority)
-  - [Interval Time](#Interval-Time)
-  - [Fail Retry](#Fail-Retry)
-  - [Rotate Proxy](#Rotate-Proxy)
-  - [Custom Device Fingerprint](#Custom-Device-Fingerprint)
-  - [Priority Queue](#Priority-Queue)
-  - [About Results](#About-Results)
-  - [TypeScript](#TypeScript)
-- [API](#API)
-  - [xCrawl](#xCrawl)
-    - [Type](#Type)
-    - [Example](#Example-1)
-  - [crawlPage](#crawlPage)
-    - [Type](#Type-1)
-    - [Example](#Example-2)
-    - [Config](#Config)
-      - [Simple target config - string](#Simple-target-config---string)
-      - [Detailed target config - CrawlPageDetailTargetConfig](#Detailed-target-config---CrawlPageDetailTargetConfig)
-      - [Mixed target array config - (string | CrawlPageDetailTargetConfig)[]](#Mixed-target-array-config---string--CrawlPageDetailTargetConfig)
-      - [Advanced config - CrawlPageAdvancedConfig](#Advanced-config---CrawlPageAdvancedConfig)
-  - [crawlData](#crawlData)
-    - [Type](#Type-2)
-    - [Example](#Example-3)
-    - [Config](#Config-1)
-      - [Simple target config - string](#Simple-target-config---string-1)
-      - [Detailed target config - CrawlDataDetailTargetConfig](#Detailed-target-config---CrawlDataDetailTargetConfig)
-      - [Mixed target array config - (string | CrawlDataDetailTargetConfig)[]](#Mixed-target-array-config---string--CrawlDataDetailTargetConfig)
-      - [Advanced config - CrawlDataAdvancedConfig](#Advanced-config---CrawlDataAdvancedConfig)
-  - [crawlFile](#crawlFile)
-    - [Type](#Type-3)
-    - [Example](#Example-4)
-    - [Config](#Config-2)
-      - [Detailed target config - CrawlFileDetailTargetConfig](#Detailed-target-config---CrawlFileDetailTargetConfig)
-      - [Detailed target array config - CrawlFileDetailTargetConfig[]](#Detailed-target-array-config---CrawlFileDetailTargetConfig)
-      - [Advanced config - CrawlFileAdvancedConfig](#Advanced-config-CrawlFileAdvancedConfig)
-  - [crawlPolling](#crawlPolling)
-    - [Type](#Type-4)
-    - [Example](#Example-5)
-- [Types](#Types)
-  - [API Config](#API-Config)
-    - [XCrawlConfig](#XCrawlConfig)
-    - [Detail Target Config](#Detail-Target-Config)
-      - [CrawlPageDetailTargetConfig](#CrawlPageDetailTargetConfig)
-      - [CrawlDataDetailTargetConfig](#CrawlDataDetailTargetConfig)
-      - [CrawlFileDetailTargetConfig](#CrawlFileDetailTargetConfig)
-    - [Advanced Config](#Advanced-Config)
-      - [CrawlPageAdvancedConfig](#CrawlPageAdvancedConfig)
-      - [CrawlDataAdvancedConfig](#CrawlDataAdvancedConfig)
-      - [CrawlFileAdvancedConfig](#CrawlFileAdvancedConfig)
-    - [StartPollingConfig](#StartPollingConfig)
-    - [Crawl Other Config](#Crawl-Other-Config)
-      - [CrawlCommonConfig](#CrawlCommonConfig)
-      - [DetailTargetFingerprintCommon](#DetailTargetFingerprintCommon)
-      - [Mobile](#Mobile)
-      - [Platform](#Platform)
-      - [PageCookies](#PageCookies)
-      - [Method](#Method)
-      - [IntervalTime](#IntervalTime)
-  - [API Result](#API-Result)
-    - [XCrawlInstance](#XCrawlInstance)
-    - [CrawlCommonResult](#CrawlCommonResult)
-    - [CrawlPageSingleResult](#CrawlPageSingleResult)
-    - [CrawlDataSingleResult](#CrawlDataSingleResult)
-    - [CrawlFileSingleResult](#CrawlFileSingleResult)
-  - [API Other](#API-Other)
-    - [AnyObject](#AnyObject)
-- [FAQ](#FAQ)
-  - [The relationship between crawlPage API and puppeteer](#The-relationship-between-crawlPage-API-and-puppeteer)
-- [More](#More)
-  - [Community](#Community)
-  - [Issues](#Issues)
-  - [Sponsor](#Sponsor-1)
+- [Install](#install)
+- [Example](#example)
+- [Core Concepts](#core-concepts)
+  - [Create Application](#create-application)
+    - [An Example of a Crawler Application](#an-example-of-a-crawler-application)
+    - [Crawl Mode](#crawl-mode)
+    - [Default Device Fingerprint](#default-device-fingerprint)
+    - [Multiple Crawler Application Instances](#multiple-crawler-application-instances)
+  - [Crawl Page](#crawl-page)
+    - [Browser Instance](#browser-instance)
+    - [Page Instance](#page-instance)
+    - [life Cycle](#life-cycle)
+      - [onCrawlItemComplete](#oncrawlitemcomplete)
+    - [Open Browser](#open-browser)
+  - [Crawl HTML](#crawl-html)
+    - [life Cycle](#life-cycle-1)
+      - [onCrawlItemComplete](#oncrawlitemcomplete-1)
+  - [Crawl Interface](#crawl-interface)
+    - [life Cycle](#life-cycle-2)
+      - [onCrawlItemComplete](#oncrawlitemcomplete-2)
+  - [Crawl Files](#crawl-files)
+    - [life Cycle](#life-cycle-3)
+      - [onCrawlItemComplete](#oncrawlitemcomplete-3)
+    - [onBeforeSaveItemFile](#onbeforesaveitemfile)
+  - [Start Polling](#start-polling)
+  - [Config Priority](#config-priority)
+  - [Interval Time](#interval-time)
+  - [Fail Retry](#fail-retry)
+  - [Rotate Proxy](#rotate-proxy)
+  - [Custom Device Fingerprint](#custom-device-fingerprint)
+  - [Priority Queue](#priority-queue)
+  - [About Results](#about-results)
+  - [TypeScript](#typescript)
+- [API](#api)
+  - [xCrawl](#xcrawl)
+    - [Type](#type)
+    - [Example](#example-1)
+  - [crawlPage](#crawlpage)
+    - [Type](#type-1)
+    - [Example](#example-2)
+    - [Config](#config)
+      - [Simple target config - string](#simple-target-config---string)
+      - [Detailed target config - CrawlPageDetailTargetConfig](#detailed-target-config---crawlpagedetailtargetconfig)
+      - [Mixed target array config - (string | CrawlPageDetailTargetConfig)[]](#mixed-target-array-config---string--crawlpagedetailtargetconfig)
+      - [Advanced config - CrawlPageAdvancedConfig](#advanced-config---crawlpageadvancedconfig)
+  - [crawlHTML](#crawlhtml)
+    - [Type](#type-2)
+    - [Example](#example-3)
+    - [Config](#config-1)
+      - [Simple target config - string](#simple-target-config---string-1)
+      - [Detailed target config - CrawlHTMLDetailTargetConfig](#detailed-target-config---crawlhtmldetailtargetconfig)
+      - [Mixed target array config - (string | CrawlHTMLDetailTargetConfig)[]](#mixed-target-array-config---string--crawlhtmldetailtargetconfig)
+      - [Advanced config - CrawlHTMLAdvancedConfig](#advanced-config---crawlhtmladvancedconfig)
+  - [crawlData](#crawldata)
+    - [Type](#type-3)
+    - [Example](#example-4)
+    - [Config](#config-2)
+      - [Simple target config - string](#simple-target-config---string-2)
+      - [Detailed target config - CrawlDataDetailTargetConfig](#detailed-target-config---crawldatadetailtargetconfig)
+      - [Mixed target array config - (string | CrawlDataDetailTargetConfig)[]](#mixed-target-array-config---string--crawldatadetailtargetconfig)
+      - [Advanced config - CrawlDataAdvancedConfig](#advanced-config---crawldataadvancedconfig)
+  - [crawlFile](#crawlfile)
+    - [Type](#type-4)
+    - [Example](#example-5)
+    - [Config](#config-3)
+      - [Detailed target config - CrawlFileDetailTargetConfig](#detailed-target-config---crawlfiledetailtargetconfig)
+      - [Detailed target array config - CrawlFileDetailTargetConfig[]](#detailed-target-array-config---crawlfiledetailtargetconfig)
+      - [Advanced config - CrawlFileAdvancedConfig](#advanced-config---crawlfileadvancedconfig)
+  - [crawlPolling](#crawlpolling)
+    - [Type](#type-5)
+    - [Example](#example-6)
+- [Types](#types)
+  - [API Config](#api-config)
+    - [XCrawlConfig](#xcrawlconfig)
+    - [Detail Target Config](#detail-target-config)
+      - [CrawlPageDetailTargetConfig](#crawlpagedetailtargetconfig)
+      - [CrawlHTMLDetailTargetConfig](#crawlhtmldetailtargetconfig)
+      - [CrawlDataDetailTargetConfig](#crawldatadetailtargetconfig)
+      - [CrawlFileDetailTargetConfig](#crawlfiledetailtargetconfig)
+    - [Advanced Config](#advanced-config)
+      - [CrawlPageAdvancedConfig](#crawlpageadvancedconfig)
+      - [CrawlHTMLAdvancedConfig](#crawlhtmladvancedconfig)
+      - [CrawlDataAdvancedConfig](#crawldataadvancedconfig)
+      - [CrawlFileAdvancedConfig](#crawlfileadvancedconfig)
+    - [StartPollingConfig](#startpollingconfig)
+    - [Crawl Other Config](#crawl-other-config)
+      - [CrawlCommonConfig](#crawlcommonconfig)
+      - [DetailTargetFingerprintCommon](#detailtargetfingerprintcommon)
+      - [Mobile](#mobile)
+      - [Platform](#platform)
+      - [PageCookies](#pagecookies)
+      - [Method](#method)
+      - [IntervalTime](#intervaltime)
+  - [API Result](#api-result)
+    - [XCrawlInstance](#xcrawlinstance)
+    - [CrawlCommonResult](#crawlcommonresult)
+    - [CrawlPageSingleResult](#crawlpagesingleresult)
+    - [CrawlHTMLSingleResult](#crawlhtmlsingleresult)
+    - [CrawlDataSingleResult](#crawldatasingleresult)
+    - [CrawlFileSingleResult](#crawlfilesingleresult)
+  - [API Other](#api-other)
+    - [AnyObject](#anyobject)
+- [FAQ](#faq)
+  - [The relationship between crawlPage API and puppeteer](#the-relationship-between-crawlpage-api-and-puppeteer)
+- [More](#more)
+  - [Community](#community)
+  - [Issues](#issues)
+  - [Sponsor](#sponsor-1)
 
 ## Install
 
@@ -337,6 +350,35 @@ const myXCrawl = xCrawl({
 myXCrawl.crawlPage('https://www.example.com').then((res) => {})
 ```
+
+### Crawl HTML
+
+Crawl static HTML pages through [crawlHTML()](#crawlHTML).
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })
+
+myXCrawl
+  .crawlHTML([
+    'https://www.example.com/html-1',
+    'https://www.example.com/html-2'
+  ])
+  .then((res) => {
+    // deal with
+  })
+```
+
+#### life Cycle
+
+Life cycle functions owned by the crawlHTML API:
+
+- onCrawlItemComplete: Called back when each crawl target completes
+
+##### onCrawlItemComplete
+
+In the onCrawlItemComplete function, you can get the result of each crawled target in advance.
+
 ### Crawl Interface
 
 Crawl interface data through [crawlData()](#crawlData) .
@@ -759,7 +801,7 @@ Each crawl target will generate a detail object, which will contain the followin
 
 If it is a specific configuration, it will automatically determine whether the details object is stored in an array according to the configuration method you choose, and return the array, otherwise return the details object. Already fits types perfectly in TypeScript.
 
-Details about configuration methods and results are as follows: [crawlPage config](#config), [crawlData config](#config-1), [crawlFile config](#config-2).
+Details about configuration methods and results are as follows: [crawlPage config](#config), [crawlHTML config](#config-1), [crawlData config](#config-2), [crawlFile config](#config-3).
 
 ### TypeScript
 
@@ -771,7 +813,7 @@ x-crawl itself is written in TypeScript and supports TypeScript. Comes with a ty
 
 ### xCrawl
 
-Create a crawler instance via call xCrawl. The crawl target queue is maintained by the instance method itself, not by the instance itself.
+Create a crawler instance by calling xCrawl. The crawl target queue is maintained internally by each instance method, not by the instance itself.
 
 #### Type
 
@@ -806,7 +848,7 @@ const myXCrawl = xCrawl({
 
 ### crawlPage
 
-crawlPage is the method of the crawler instance, usually used to crawl page.
+crawlPage is a method of the crawler instance, usually used to crawl a dynamic page.
 
 #### Type
 
@@ -955,6 +997,157 @@ More configuration options can view [CrawlPageAdvancedConfig](#CrawlPageAdvanced
 
 More information about the results can be found at [About results](#About-results) , which can be selected according to the actual situation.
 
+### crawlHTML
+
+crawlHTML is a method of the crawler instance, usually used to crawl static HTML pages.
+
+#### Type
+
+The crawlHTML API is a function. Its type is an [overloaded function](https://www.typescriptlang.org/docs/handbook/2/functions.html#function-overloads) that can be called (in terms of types) with different configuration parameters.
+
+```ts
+type crawlHTML = {
+  (
+    config: string,
+    callback?: (res: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  (
+    config: CrawlHTMLDetailTargetConfig,
+    callback?: (res: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  (
+    config: (string | CrawlHTMLDetailTargetConfig)[],
+    callback?: (res: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+
+  (
+    config: CrawlHTMLAdvancedConfig,
+    callback?: (res: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+}
+```
+
+**Parameter Type:**
+
+- Look at the [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig) type
+- Look at the [CrawlHTMLAdvancedConfig](#CrawlHTMLAdvancedConfig) type
+
+**Return value type:**
+
+- Look at the [CrawlHTMLSingleResult](#CrawlHTMLSingleResult) type
+
+#### Example
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+// crawlHTML API
+myXCrawl.crawlHTML('https://www.example.com').then((res) => {
+  // res.data contains the statusCode, headers and html of the crawled page
+})
+```
+
+#### Config
+
+There are 4 types:
+
+- Simple target config - string
+- Detailed target config - CrawlHTMLDetailTargetConfig
+- Mixed target array config - (string | CrawlHTMLDetailTargetConfig)[]
+- Advanced config - CrawlHTMLAdvancedConfig
+
+##### Simple target config - string
+
+This is a simple target configuration. If you just want to simply crawl this static HTML page, you can try this way of writing:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl.crawlHTML('https://www.example.com').then((res) => {})
+```
+
+The res you get will be an object.
+
+##### Detailed target config - CrawlHTMLDetailTargetConfig
+
+This is the detailed target configuration. If you want to crawl this static HTML page and need to retry on failure, you can try this way of writing:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML({
+    url: 'https://www.example.com',
+    proxy: { urls: ['xxx'] },
+    maxRetry: 1
+  })
+  .then((res) => {})
+```
+
+The res you get will be an object.
+
+More configuration options can view [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig).
+
+##### Mixed target array config - (string | CrawlHTMLDetailTargetConfig)[]
+
+This is a mixed target array configuration. If you want to crawl multiple static HTML pages, and some of them need failure retries, you can try this way of writing:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML([
+    'https://www.example.com/page-1',
+    { url: 'https://www.example.com/page-2', maxRetry: 2 }
+  ])
+  .then((res) => {})
+```
+
+The res you get will be an array of objects.
+
+More configuration options can view [CrawlHTMLDetailTargetConfig](#CrawlHTMLDetailTargetConfig).
+
+##### Advanced config - CrawlHTMLAdvancedConfig
+
+This is an advanced configuration, where targets is a mixed target array configuration. If you want to crawl multiple static HTML pages without repeating the crawl target configuration (proxy, headers, retry, etc.) for each one, and also need interval time, device fingerprint, lifecycle, and so on, try this:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML({
+    targets: [
+      'https://www.example.com/page-1',
+      { url: 'https://www.example.com/page-2', maxRetry: 6 }
+    ],
+    intervalTime: { max: 3000, min: 1000 },
+    headers: { cookie: 'xxx' },
+    maxRetry: 1
+  })
+  .then((res) => {})
+```
+
+The res you get will be an array of objects.
+
+More configuration options can view [CrawlHTMLAdvancedConfig](#CrawlHTMLAdvancedConfig).
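+
+To tie the advanced config together with the life cycle described earlier, here is a minimal sketch (the URLs are placeholders) that logs each target as it finishes, before the returned Promise resolves:
+
+```js
+import xCrawl from 'x-crawl'
+
+const myXCrawl = xCrawl()
+
+myXCrawl
+  .crawlHTML({
+    targets: [
+      'https://www.example.com/page-1',
+      'https://www.example.com/page-2'
+    ],
+    maxRetry: 1,
+    // Called once per crawl target as soon as it completes
+    onCrawlItemComplete(crawlHTMLSingleResult) {
+      const { isSuccess, data } = crawlHTMLSingleResult
+      if (isSuccess && data) {
+        console.log(data.statusCode, data.html.length)
+      }
+    }
+  })
+  .then((res) => {})
+```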
+
+More information about the results can be found at [About results](#About-results) , which can be selected according to the actual situation.
+
 ### crawlData
 
 crawlData is the method of the crawler instance, which is usually used to crawl APIs to obtain JSON data and so on.
@@ -1342,6 +1535,24 @@ export interface CrawlPageDetailTargetConfig extends CrawlCommonConfig {
 - viewport: undefined
 - fingerprint: undefined
 
+##### CrawlHTMLDetailTargetConfig
+
+```ts
+export interface CrawlHTMLDetailTargetConfig extends CrawlCommonConfig {
+  url: string
+  headers?: AnyObject | null
+  priority?: number
+  fingerprint?: DetailTargetFingerprintCommon | null
+}
+```
+
+**Default Value**
+
+- url: undefined
+- headers: undefined
+- priority: undefined
+- fingerprint: undefined
+
 ##### CrawlDataDetailTargetConfig
 
 ```ts
@@ -1423,6 +1634,28 @@ export interface CrawlPageAdvancedConfig extends CrawlCommonConfig {
 - viewport: undefined
 - onCrawlItemComplete: undefined
 
+##### CrawlHTMLAdvancedConfig
+
+```ts
+export interface CrawlHTMLAdvancedConfig extends CrawlCommonConfig {
+  targets: (string | CrawlHTMLDetailTargetConfig)[]
+  intervalTime?: IntervalTime
+  fingerprints?: DetailTargetFingerprintCommon[]
+
+  headers?: AnyObject
+
+  onCrawlItemComplete?: (crawlHTMLSingleResult: CrawlHTMLSingleResult) => void
+}
+```
+
+**Default Value**
+
+- targets: undefined
+- intervalTime: undefined
+- fingerprints: undefined
+- headers: undefined
+- onCrawlItemComplete: undefined
+
 ##### CrawlDataAdvancedConfig
 
 ```ts
@@ -1642,6 +1875,28 @@ export interface XCrawlInstance {
     ): Promise<CrawlPageSingleResult[]>
   }
 
+  crawlHTML: {
+    (
+      config: string,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: CrawlHTMLDetailTargetConfig,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: (string | CrawlHTMLDetailTargetConfig)[],
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+
+    (
+      config: CrawlHTMLAdvancedConfig,
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+  }
+
   crawlData: {
     (
       config: CrawlDataDetailTargetConfig,
@@ -1720,6 +1975,18 @@ export interface CrawlPageSingleResult extends CrawlCommonResult {
 }
 ```
 
+#### CrawlHTMLSingleResult
+
+```ts
+export interface CrawlHTMLSingleResult extends CrawlCommonResult {
+  data: {
+    statusCode: number | undefined
+    headers: IncomingHttpHeaders
+    html: string
+  } | null
+}
+```
+
 #### CrawlDataSingleResult
 
 ```ts
diff --git a/publish/package.json b/publish/package.json
index 5fb1625..86f2481 100644
--- a/publish/package.json
+++ b/publish/package.json
@@ -1,6 +1,6 @@
 {
   "name": "x-crawl",
-  "version": "8.1.1",
+  "version": "8.2.0",
   "author": "coderHXL",
   "description": "x-crawl is a flexible Node.js multifunctional crawler library.",
   "license": "MIT",
diff --git a/src/api.ts b/src/api.ts
index 41a606e..c567df6 100644
--- a/src/api.ts
+++ b/src/api.ts
@@ -33,7 +33,10 @@ import {
   CrawlFileAdvancedConfig,
   CrawlDataAdvancedConfig,
   IntervalTime,
-  DetailTargetFingerprintCommon
+  DetailTargetFingerprintCommon,
+  CrawlHTMLSingleResult,
+  CrawlHTMLDetailTargetConfig,
+  CrawlHTMLAdvancedConfig
 } from './types/api'
 import { LoaderXCrawlConfig } from './types'
 import { fingerprints } from './default'
@@ -42,7 +45,7 @@ import { fingerprints } from './default'
 // Extra config
 
 export interface ExtraCommonConfig {
-  type: 'page' | 'data' | 'file'
+  type: 'page' | 'html' | 'data' | 'file'
 
   intervalTime: IntervalTime | undefined
 }
@@ -54,6 +57,12 @@ interface ExtraPageConfig extends ExtraCommonConfig {
     | undefined
 }
 
+interface ExtraHTMLConfig extends ExtraCommonConfig {
+  onCrawlItemComplete:
+    | ((crawlHTMLSingleResult: CrawlHTMLSingleResult) => void)
+    | undefined
+}
+
 interface ExtraDataConfig extends ExtraCommonConfig {
   onCrawlItemComplete:
     | ((crawlDataSingleResult: CrawlDataSingleResult) => void)
@@ -101,6 +110,10 @@ export type LoaderCrawlPageDetail = LoaderCommonConfig &
   LoaderHasConfig &
   CrawlPageDetailTargetConfig
 
+export type LoaderCrawlHTMLDetail = LoaderCommonConfig &
+  LoaderHasConfig &
+  CrawlHTMLDetailTargetConfig
+
 export type LoaderCrawlDataDetail = LoaderCommonConfig &
   LoaderHasConfig &
   CrawlDataDetailTargetConfig
@@ -114,6 +127,10 @@ interface CrawlPageAdvancedDetailTargetsConfig extends CrawlPageAdvancedConfig {
   detailTargets: CrawlPageDetailTargetConfig[]
 }
 
+interface CrawlHTMLAdvancedDetailTargetsConfig extends CrawlHTMLAdvancedConfig {
+  detailTargets: CrawlHTMLDetailTargetConfig[]
+}
+
 interface CrawlDataAdvancedDetailTargetsConfig extends CrawlDataAdvancedConfig {
   detailTargets: CrawlDataDetailTargetConfig[]
 }
@@ -135,6 +152,17 @@ interface CrawlPageConfig {
     | undefined
 }
 
+interface CrawlHTMLConfig {
+  detailTargets: LoaderCrawlHTMLDetail[]
+  intervalTime: IntervalTime | undefined
+
+  selectFingerprintIndexs: number[]
+
+  onCrawlItemComplete:
+    | ((crawlHTMLSingleResult: CrawlHTMLSingleResult) => void)
+    | undefined
+}
+
 interface CrawlDataConfig {
   detailTargets: LoaderCrawlDataDetail[]
   intervalTime: IntervalTime | undefined
@@ -172,6 +200,12 @@ type UniteCrawlPageConfig =
   | (string | CrawlPageDetailTargetConfig)[]
   | CrawlPageAdvancedConfig
 
+type UniteCrawlHTMLConfig =
+  | string
+  | CrawlHTMLDetailTargetConfig
+  | (string | CrawlHTMLDetailTargetConfig)[]
+  | CrawlHTMLAdvancedConfig
+
 type UniteCrawlDataConfig =
   | string
   | CrawlDataDetailTargetConfig
@@ -361,9 +395,14 @@ function loaderCommonConfigToCrawlConfig(
   xCrawlConfig: LoaderXCrawlConfig,
   advancedDetailTargetsConfig:
     | CrawlPageAdvancedDetailTargetsConfig
+    | CrawlHTMLAdvancedDetailTargetsConfig
     | CrawlDataAdvancedDetailTargetsConfig
     | CrawlFileAdvancedDetailTargetsConfig,
-  crawlConfig: CrawlPageConfig | CrawlDataConfig | CrawlFileConfig
+  crawlConfig:
+    | CrawlPageConfig
+    | CrawlHTMLConfig
+    | CrawlDataConfig
+    | CrawlFileConfig
 ) {
   // 1.detailTargets
   crawlConfig.detailTargets = advancedDetailTargetsConfig.detailTargets.map(
@@ -567,6 +606,55 @@ function createCrawlPageConfig(
   return crawlPageConfig
 }
 
+function createCrawlHTMLConfig(
+  xCrawlConfig: LoaderXCrawlConfig,
+  originalConfig: UniteCrawlHTMLConfig
+): CrawlHTMLConfig {
+  const crawlHTMLConfig: CrawlHTMLConfig = {
+    detailTargets: [],
+    intervalTime: undefined,
+
+    selectFingerprintIndexs: [],
+
+    onCrawlItemComplete: undefined
+  }
+
+  let advancedDetailTargetsConfig: CrawlHTMLAdvancedDetailTargetsConfig = {
+    targets: [],
+    detailTargets: []
+  }
+
+  if (isObject(originalConfig) && Object.hasOwn(originalConfig, 'targets')) {
+    // CrawlHTMLAdvancedConfig
+    const { targets } = originalConfig as CrawlHTMLAdvancedConfig
+
+    advancedDetailTargetsConfig = {
+      ...advancedDetailTargetsConfig,
+      ...(originalConfig as CrawlHTMLAdvancedConfig)
+    }
+
+    advancedDetailTargetsConfig.detailTargets =
+      transformTargetToDetailTargets(targets)
+  } else {
+    // string | CrawlHTMLDetailTargetConfig | (string | CrawlHTMLDetailTargetConfig)[]
+
+    advancedDetailTargetsConfig.detailTargets = transformTargetToDetailTargets(
+      originalConfig as
+        | string
+        | CrawlHTMLDetailTargetConfig
+        | (string | CrawlHTMLDetailTargetConfig)[]
+    )
+  }
+
+  loaderCommonConfigToCrawlConfig(
+    xCrawlConfig,
+    advancedDetailTargetsConfig,
+    crawlHTMLConfig
+  )
+
+  return crawlHTMLConfig
+}
+
 function createCrawlDataConfig(
   xCrawlConfig: LoaderXCrawlConfig,
   originalConfig: UniteCrawlDataConfig
@@ -772,9 +860,12 @@ async function pageSingleCrawlHandle(
   }
 }
 
-async function dataAndFileSingleCrawlHandle(
-  device: Device,
-  extraConfig: ExtraDataConfig | ExtraFileConfig
+async function useRequestFnSingleCrawlHandle(
+  device: Device<
+    LoaderCrawlHTMLDetail | LoaderCrawlDataDetail | LoaderCrawlFileDetail,
+    Request
+  >,
+  extraConfig: ExtraHTMLConfig | ExtraDataConfig | ExtraFileConfig
 ) {
   const { detailTargetConfig, crawlErrorQueue, maxRetry, retryCount } = device
   const notAllowRetry = maxRetry === retryCount
@@ -800,7 +891,9 @@ async function dataAndFileSingleCrawlHandle(
   if (isSuccess || notAllowRetry) {
     device.isHandle = true
 
-    if (extraConfig.type === 'data') {
+    if (extraConfig.type === 'html') {
+      HTMLSingleResultHandle(device, extraConfig as ExtraHTMLConfig)
+    } else if (extraConfig.type === 'data') {
       dataSingleResultHandle(device, extraConfig as ExtraDataConfig)
     } else if (extraConfig.type === 'file') {
       fileSingleResultHandle(device, extraConfig as ExtraFileConfig)
@@ -835,6 +928,27 @@ function pageSingleResultHandle(
   }
 }
 
+function HTMLSingleResultHandle(
+  device: Device<LoaderCrawlHTMLDetail, Request>,
+  extraConfig: ExtraHTMLConfig
+) {
+  const { isSuccess, detailTargetResult, result } = device
+  const { onCrawlItemComplete } = extraConfig
+
+  handleResultEssentialOtherValue(device)
+
+  if (isSuccess && detailTargetResult) {
+    const { data, headers, statusCode } = detailTargetResult
+    const html = data.toString()
+
+    result.data = { statusCode, headers, html }
+  }
+
+  if (onCrawlItemComplete) {
+    onCrawlItemComplete(result as CrawlHTMLSingleResult)
+  }
+}
+
 function dataSingleResultHandle(
   device: Device<LoaderCrawlDataDetail, Request>,
   extraConfig: ExtraDataConfig
@@ -1029,6 +1143,62 @@ export function createCrawlPage(xCrawlConfig: LoaderXCrawlConfig) {
   return crawlPage
 }
 
+export function createCrawlHTML(xCrawlConfig: LoaderXCrawlConfig) {
+  function crawlHTML(
+    config: string,
+    callback?: (result: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  function crawlHTML(
+    config: CrawlHTMLDetailTargetConfig,
+    callback?: (result: CrawlHTMLSingleResult) => void
+  ): Promise<CrawlHTMLSingleResult>
+
+  function crawlHTML(
+    config: (string | CrawlHTMLDetailTargetConfig)[],
+    callback?: (result: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+
+  function crawlHTML(
+    config: CrawlHTMLAdvancedConfig,
+    callback?: (result: CrawlHTMLSingleResult[]) => void
+  ): Promise<CrawlHTMLSingleResult[]>
+
+  async function crawlHTML(
+    config: UniteCrawlHTMLConfig,
+    callback?: (result: any) => void
+  ): Promise<CrawlHTMLSingleResult | CrawlHTMLSingleResult[]> {
+    const { detailTargets, intervalTime, onCrawlItemComplete } =
+      createCrawlHTMLConfig(xCrawlConfig, config)
+
+    const extraConfig: ExtraHTMLConfig = {
+      type: 'html',
+      intervalTime,
+      onCrawlItemComplete
+    }
+
+    const crawlResultArr = (await controller(
+      xCrawlConfig.mode,
+      detailTargets,
+      extraConfig,
+      useRequestFnSingleCrawlHandle
+    )) as CrawlHTMLSingleResult[]
+
+    const crawlResult =
+      isArray(config) || (isObject(config) && Object.hasOwn(config, 'targets'))
crawlResultArr + : crawlResultArr[0] + + if (callback) { + callback(crawlResult) + } + + return crawlResult + } + + return crawlHTML +} + export function createCrawlData(xCrawlConfig: LoaderXCrawlConfig) { function crawlData( config: string, @@ -1067,7 +1237,7 @@ export function createCrawlData(xCrawlConfig: LoaderXCrawlConfig) { xCrawlConfig.mode, detailTargets, extraConfig, - dataAndFileSingleCrawlHandle + useRequestFnSingleCrawlHandle )) as CrawlDataSingleResult[] const crawlResult = @@ -1127,7 +1297,7 @@ export function createCrawlFile(xCrawlConfig: LoaderXCrawlConfig) { xCrawlConfig.mode, detailTargets, extraConfig, - dataAndFileSingleCrawlHandle + useRequestFnSingleCrawlHandle )) as CrawlFileSingleResult[] const { saveFilePendingQueue, saveFileErrorArr } = extraConfig diff --git a/src/controller.ts b/src/controller.ts index 78be3b3..bbf6cf8 100644 --- a/src/controller.ts +++ b/src/controller.ts @@ -5,6 +5,7 @@ import { ExtraCommonConfig, LoaderCrawlDataDetail, LoaderCrawlFileDetail, + LoaderCrawlHTMLDetail, LoaderCrawlPageDetail, ProxyDetails } from './api' @@ -25,6 +26,7 @@ import { CrawlCommonResult } from './types/api' export type CrawlDetail = | LoaderCrawlPageDetail + | LoaderCrawlHTMLDetail | LoaderCrawlDataDetail | LoaderCrawlFileDetail diff --git a/src/index.ts b/src/index.ts index d2e05ff..6261e9f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,7 @@ import { createCrawlData, createCrawlFile, + createCrawlHTML, createCrawlPage, startPolling } from './api' @@ -33,6 +34,7 @@ function loaderBaseConfig( function createnInstance(baseConfig: LoaderXCrawlConfig): XCrawlInstance { const instance: XCrawlInstance = { crawlPage: createCrawlPage(baseConfig), + crawlHTML: createCrawlHTML(baseConfig), crawlData: createCrawlData(baseConfig), crawlFile: createCrawlFile(baseConfig), startPolling diff --git a/src/types/api.ts b/src/types/api.ts index cec1274..cd6ccde 100644 --- a/src/types/api.ts +++ b/src/types/api.ts @@ -95,6 +95,13 @@ export interface CrawlPageDetailTargetConfig extends CrawlCommonConfig { | null } +export interface CrawlHTMLDetailTargetConfig extends CrawlCommonConfig { + url: string + headers?: AnyObject | null + priority?: number + fingerprint?: DetailTargetFingerprintCommon | null +} + export interface CrawlDataDetailTargetConfig extends CrawlCommonConfig { url: string method?: Method @@ -108,7 +115,6 @@ export interface CrawlDataDetailTargetConfig extends CrawlCommonConfig { export interface CrawlFileDetailTargetConfig extends CrawlCommonConfig { url: string headers?: AnyObject | null - priority?: number storeDir?: string | null fileName?: string | null extension?: string | null @@ -133,6 +139,16 @@ export interface CrawlPageAdvancedConfig extends CrawlCommonConfig { onCrawlItemComplete?: (crawlPageSingleResult: CrawlPageSingleResult) => void } +export interface CrawlHTMLAdvancedConfig extends CrawlCommonConfig { + targets: (string | CrawlHTMLDetailTargetConfig)[] + intervalTime?: IntervalTime + fingerprints?: DetailTargetFingerprintCommon[] + + headers?: AnyObject + + onCrawlItemComplete?: (crawlDataSingleResult: CrawlHTMLSingleResult) => void +} + export interface CrawlDataAdvancedConfig extends CrawlCommonConfig { targets: (string | CrawlDataDetailTargetConfig)[] intervalTime?: IntervalTime @@ -188,6 +204,14 @@ export interface CrawlPageSingleResult extends CrawlCommonResult { } } +export interface CrawlHTMLSingleResult extends CrawlCommonResult { + data: { + statusCode: number | undefined + headers: IncomingHttpHeaders + html: string + } | null +} + export 
 export interface CrawlDataSingleResult<D = any> extends CrawlCommonResult {
   data: {
     statusCode: number | undefined
diff --git a/src/types/index.ts b/src/types/index.ts
index 1d0865a..74a0dfb 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -11,7 +11,10 @@ import {
   CrawlDataAdvancedConfig,
   CrawlPageDetailTargetConfig,
   CrawlPageAdvancedConfig,
-  CrawlCommonConfig
+  CrawlCommonConfig,
+  CrawlHTMLSingleResult,
+  CrawlHTMLDetailTargetConfig,
+  CrawlHTMLAdvancedConfig
 } from './api'
 
 export interface XCrawlConfig extends CrawlCommonConfig {
@@ -54,6 +57,28 @@ export interface XCrawlInstance {
     ): Promise<CrawlPageSingleResult[]>
   }
 
+  crawlHTML: {
+    (
+      config: string,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: CrawlHTMLDetailTargetConfig,
+      callback?: (result: CrawlHTMLSingleResult) => void
+    ): Promise<CrawlHTMLSingleResult>
+
+    (
+      config: (string | CrawlHTMLDetailTargetConfig)[],
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+
+    (
+      config: CrawlHTMLAdvancedConfig,
+      callback?: (result: CrawlHTMLSingleResult[]) => void
+    ): Promise<CrawlHTMLSingleResult[]>
+  }
+
   crawlData: {
     <T = any>(
       config: CrawlDataDetailTargetConfig,
diff --git a/test/environment/api/crawlData.test.ts b/test/environment/api/crawlData.test.ts
index 778de77..618c4d7 100644
--- a/test/environment/api/crawlData.test.ts
+++ b/test/environment/api/crawlData.test.ts
@@ -20,7 +20,10 @@ async function testCrawlData() {
   const testXCrawl = xCrawl()
 
   const res = await testXCrawl.crawlData({
-    targets: ['http://localhost:8888', { url: 'http://localhost:8888' }]
+    targets: [
+      'http://localhost:8888/data',
+      { url: 'http://localhost:8888/data' }
+    ]
   })
 
   return res.reduce((prev, item) => prev && item.isSuccess, true)
diff --git a/test/environment/api/crawlHTML.test.ts b/test/environment/api/crawlHTML.test.ts
new file mode 100644
index 0000000..8f161c5
--- /dev/null
+++ b/test/environment/api/crawlHTML.test.ts
@@ -0,0 +1,35 @@
+import process from 'node:process'
+import { expect, test, jest } from '@jest/globals'
+import chalk from 'chalk'
+
+import IXCrawl from 'src/'
+
+const args = process.argv.slice(3)
+const environment = args[0]
+
+let xCrawl: typeof IXCrawl
+if (environment === 'dev') {
+  xCrawl = require('src/').default
+} else if (environment === 'pro') {
+  xCrawl = require('publish/')
+}
+
+jest.setTimeout(60000)
+
+async function testCrawlHTML() {
+  const testXCrawl = xCrawl({ proxy: { urls: ['http://localhost:14892'] } })
+
+  const res = await testXCrawl.crawlHTML({
+    targets: [
+      'http://localhost:8888/html',
+      { url: 'http://localhost:8888/html' }
+    ]
+  })
+
+  return res.reduce((prev, item) => prev && item.isSuccess, true)
+}
+
+test('crawlHTML', async () => {
+  console.log(chalk.bgGreen('================ crawlHTML ================'))
+  await expect(testCrawlHTML()).resolves.toBe(true)
+})
diff --git a/test/environment/api/crawlPage.test.ts b/test/environment/api/crawlPage.test.ts
index 36e4b0f..7d97870 100644
--- a/test/environment/api/crawlPage.test.ts
+++ b/test/environment/api/crawlPage.test.ts
@@ -21,8 +21,8 @@ async function testCrawlPage() {
 
   const res = await testXCrawl.crawlPage({
     targets: [
-      'https://github.com/coder-hxl/x-crawl',
-      { url: 'https://github.com/coder-hxl/x-crawl' }
+      'http://localhost:8888/html',
+      { url: 'http://localhost:8888/html' }
     ]
   })
 
diff --git a/test/environment/arguments/fingerprint.test.ts b/test/environment/arguments/fingerprint.test.ts
index ad5dbab..e3fb135 100644
--- a/test/environment/arguments/fingerprint.test.ts
+++ b/test/environment/arguments/fingerprint.test.ts
@@ -21,10 +21,10 @@ async function fingerprint() {
   const res = await testXCrawl.crawlPage({
     targets: [
-      'http://localhost:8888',
-      { url: 'http://localhost:8888', fingerprint: null },
+      'http://localhost:8888/html',
+      { url: 'http://localhost:8888/html', fingerprint: null },
       {
-        url: 'http://localhost:8888',
+        url: 'http://localhost:8888/html',
         fingerprint: {
           maxWidth: 1024,
           maxHeight: 800,
diff --git a/test/environment/arguments/mode.test.ts b/test/environment/arguments/mode.test.ts
index 97b37d6..a06539e 100644
--- a/test/environment/arguments/mode.test.ts
+++ b/test/environment/arguments/mode.test.ts
@@ -20,8 +20,8 @@ async function async() {
   const testXCrawl = xCrawl()
 
   const res = await testXCrawl.crawlData([
-    'http://localhost:8888',
-    'http://localhost:8888'
+    'http://localhost:8888/data',
+    'http://localhost:8888/data'
   ])
 
   return res.reduce((prev, item) => prev && item.isSuccess, true)
@@ -31,8 +31,8 @@ async function sync() {
   const testXCrawl = xCrawl({ mode: 'sync' })
 
   const res = await testXCrawl.crawlData([
-    'http://localhost:8888',
-    'http://localhost:8888'
+    'http://localhost:8888/data',
+    'http://localhost:8888/data'
   ])
 
   return res.reduce((prev, item) => prev && item.isSuccess, true)
diff --git a/test/environment/arguments/proxy.test.ts b/test/environment/arguments/proxy.test.ts
index d9cd84b..d4aaa4d 100644
--- a/test/environment/arguments/proxy.test.ts
+++ b/test/environment/arguments/proxy.test.ts
@@ -20,7 +20,7 @@ async function proxy() {
   const testXCrawl = xCrawl()
 
   const res = await testXCrawl.crawlPage({
-    targets: ['https://', 'http://localhost:8888'],
+    targets: ['https://', 'http://localhost:8888/html'],
     maxRetry: 3,
     proxy: {
       urls: ['http://localhost:129032', 'http://localhost:14892'],
diff --git a/test/environment/test.ts b/test/environment/test.ts
index 39b9ba0..809fdf9 100644
--- a/test/environment/test.ts
+++ b/test/environment/test.ts
@@ -3,12 +3,14 @@ import './arguments/fingerprint.test'
 import './arguments/proxy.test'
 
 import './written/crawlPage.test'
+import './written/crawlHTML.test'
 import './written/crawlData.test'
 import './written/crawlFile.test'
 
 import './functions/errorCollect.test'
 
 import './api/crawlPage.test'
+import './api/crawlHTML.test'
 import './api/crawlData.test'
 import './api/crawlFile.test'
 import './api/startPolling.test'
diff --git a/test/environment/written/crawlData.test.ts b/test/environment/written/crawlData.test.ts
index 76ee6fe..d0034c8 100644
--- a/test/environment/written/crawlData.test.ts
+++ b/test/environment/written/crawlData.test.ts
@@ -21,7 +21,7 @@ jest.setTimeout(60000)
 async function writtenString() {
   const testXCrawl = xCrawl()
 
-  const res = await testXCrawl.crawlData('http://localhost:8888')
+  const res = await testXCrawl.crawlData('http://localhost:8888/data')
 
   return res.isSuccess
 }
@@ -31,7 +31,7 @@ async function writtenCrawlDataDetailConfig() {
   const testXCrawl = xCrawl()
 
   const res = await testXCrawl.crawlData({
-    url: 'http://localhost:8888'
+    url: 'http://localhost:8888/data'
   })
 
   return res.isSuccess
@@ -42,8 +42,8 @@ async function writtenStringAndCrawlDataDetailConfigArr() {
   const testXCrawl = xCrawl()
 
   const res = await testXCrawl.crawlData([
-    'http://localhost:8888',
-    { url: 'http://localhost:8888' }
+    'http://localhost:8888/data',
+    { url: 'http://localhost:8888/data' }
   ])
 
   return res.reduce((prev, item) => prev && item.isSuccess, true)
@@ -54,7 +54,10 @@ async function writtenCrawlDataAdvancedConfig() {
   const testXCrawl = xCrawl()
 
   const res = await testXCrawl.crawlData({
-    targets: ['http://localhost:8888', { url: 'http://localhost:8888' }]
+    targets: [
+      'http://localhost:8888/data',
+      { url: 'http://localhost:8888/data' }
+    ]
   })
 
   return res.reduce((prev, item) => prev && item.isSuccess, true)
@@ -71,7 +74,7 @@ async function loaderBaseConfig() {
     maxRetry: 0
   })
 
-  const res = await testXCrawl.crawlData(['/', '/'])
+  const res = await testXCrawl.crawlData(['/data', '/data'])
 
   return res.reduce((prev, item) => prev && item.isSuccess, true)
 }
@@ -83,7 +86,7 @@ async function loaderAdvancedConfig() {
   })
 
   const res = await testXCrawl.crawlData({
-    targets: ['/', '/'],
+    targets: ['/data', '/data'],
     proxy: { urls: ['http://localhost:14892'] },
     timeout: 10000,
     intervalTime: { max: 1000 },
diff --git a/test/environment/written/crawlHTML.test.ts b/test/environment/written/crawlHTML.test.ts
new file mode 100644
index 0000000..53533b4
--- /dev/null
+++ b/test/environment/written/crawlHTML.test.ts
@@ -0,0 +1,150 @@
+import process from 'node:process'
+import { expect, test, jest } from '@jest/globals'
+import chalk from 'chalk'
+
+import IXCrawl from 'src/'
+
+const args = process.argv.slice(3)
+const environment = args[0]
+
+let xCrawl: typeof IXCrawl
+if (environment === 'dev') {
+  xCrawl = require('src/').default
+} else if (environment === 'pro') {
+  xCrawl = require('publish/')
+}
+
+jest.setTimeout(60000)
+
+/* 1.Written */
+// 1.1.written string
+async function writtenString() {
+  const testXCrawl = xCrawl()
+
+  const res = await testXCrawl.crawlHTML('http://localhost:8888/html')
+
+  return res.isSuccess
+}
+
+// 1.2.written CrawlHTMLDetailConfig
+async function writtenCrawlHTMLDetailConfig() {
+  const testXCrawl = xCrawl()
+
+  const res = await testXCrawl.crawlHTML({
+    url: 'http://localhost:8888/html'
+  })
+
+  return res.isSuccess
+}
+
+// 1.3.written (string | CrawlHTMLDetailConfig)[]
+async function writtenStringAndCrawlHTMLDetailConfigArr() {
+  const testXCrawl = xCrawl()
+
+  const res = await testXCrawl.crawlHTML([
+    'http://localhost:8888/html',
+    { url: 'http://localhost:8888/html' }
+  ])
+
+  return res.reduce((prev, item) => prev && item.isSuccess, true)
+}
+
+// 1.4.written CrawlHTMLAdvancedConfig
+async function writtenCrawlHTMLAdvancedConfig() {
+  const testXCrawl = xCrawl()
+
+  const res = await testXCrawl.crawlHTML({
+    targets: [
+      'http://localhost:8888/html',
+      { url: 'http://localhost:8888/html' }
+    ]
+  })
+
+  return res.reduce((prev, item) => prev && item.isSuccess, true)
+}
+
+/* 2.Loader Config */
+// 2.1.Loader Base Config
+async function loaderBaseConfig() {
+  const testXCrawl = xCrawl({
+    baseUrl: 'http://localhost:8888',
+    proxy: { urls: ['http://localhost:14892'] },
+    timeout: 10000,
+    intervalTime: { max: 1000 },
+    maxRetry: 0
+  })
+
+  const res = await testXCrawl.crawlHTML(['/html', '/html'])
+
+  return res.reduce((prev, item) => prev && item.isSuccess, true)
+}
+
+// 2.2.Loader Advanced Config
+async function loaderAdvancedConfig() {
+  const testXCrawl = xCrawl({
+    baseUrl: 'http://localhost:8888'
+  })
+
+  const res = await testXCrawl.crawlHTML({
+    targets: ['/html', '/html'],
+    proxy: { urls: ['http://localhost:14892'] },
+    timeout: 10000,
+    intervalTime: { max: 1000 },
+    maxRetry: 0
+  })
+
+  return res.reduce((prev, item) => prev && item.isSuccess, true)
+}
+
+test('crawlHTML - writtenString', async () => {
+  console.log(
+    chalk.bgGreen('================ crawlHTML - writtenString ================')
+  )
+  await expect(writtenString()).resolves.toBe(true)
+})
+
+test('crawlHTML - writtenCrawlHTMLDetailConfig', async () => {
+  console.log(
+    chalk.bgGreen(
+      '================ crawlHTML - writtenCrawlHTMLDetailConfig ================'
+    )
+  )
+  await expect(writtenCrawlHTMLDetailConfig()).resolves.toBe(true)
+})
+
+test('crawlHTML - writtenStringAndCrawlHTMLDetailConfigArr', async () => {
+  console.log(
+    chalk.bgGreen(
+      '================ crawlHTML - writtenStringAndCrawlHTMLDetailConfigArr ================'
+    )
+  )
+  await expect(writtenStringAndCrawlHTMLDetailConfigArr()).resolves.toBe(true)
+})
+
+test('crawlHTML - writtenCrawlHTMLAdvancedConfig', async () => {
+  console.log(
+    chalk.bgGreen(
+      '================ crawlHTML - writtenCrawlHTMLAdvancedConfig ================'
+    )
+  )
+  await expect(writtenCrawlHTMLAdvancedConfig()).resolves.toBe(true)
+})
+
+/* 2.Loader Config */
+test('crawlHTML - loaderBaseConfig', async () => {
+  console.log(
+    chalk.bgGreen(
+      '================ crawlHTML - loaderBaseConfig ================'
+    )
+  )
+  await expect(loaderBaseConfig()).resolves.toBe(true)
+})
+
+test('crawlHTML - loaderAdvancedConfig', async () => {
+  console.log(
+    chalk.bgGreen(
+      '================ crawlHTML - loaderAdvancedConfig ================'
+    )
+  )
+  await expect(loaderAdvancedConfig()).resolves.toBe(true)
+})
diff --git a/test/environment/written/crawlPage.test.ts b/test/environment/written/crawlPage.test.ts
index 75a99e8..b8e0898 100644
--- a/test/environment/written/crawlPage.test.ts
+++ b/test/environment/written/crawlPage.test.ts
@@ -21,7 +21,7 @@ jest.setTimeout(60000)
 async function writtenString() {
   const testXCrawl = xCrawl()
 
-  const res = await testXCrawl.crawlPage('https://gitee.com/coderhxl')
+  const res = await testXCrawl.crawlPage('http://localhost:8888/html')
 
   await res.data.browser.close()
 
@@ -33,7 +33,7 @@ async function writtenCrawlPageDetailConfig() {
   const testXCrawl = xCrawl({ proxy: { urls: ['http://localhost:14892'] } })
 
   const res = await testXCrawl.crawlPage({
-    url: 'https://github.com/coder-hxl/x-crawl'
+    url: 'http://localhost:8888/html'
   })
 
   await res.data.browser.close()
 
@@ -46,8 +46,8 @@ async function writtenStringAndCrawlPageDetailConfigArr() {
   const testXCrawl = xCrawl({ proxy: { urls: ['http://localhost:14892'] } })
 
   const res = await testXCrawl.crawlPage([
-    'https://github.com/coder-hxl/x-crawl',
-    { url: 'https://github.com/coder-hxl/x-crawl' }
+    'http://localhost:8888/html',
+    { url: 'http://localhost:8888/html' }
   ])
 
   await res[0].data.browser.close()
 
@@ -61,8 +61,8 @@ async function writtenCrawlPageAdvancedConfig() {
 
   const res = await testXCrawl.crawlPage({
     targets: [
-      'https://github.com/coder-hxl/x-crawl',
-      { url: 'https://github.com/coder-hxl/x-crawl' }
+      'http://localhost:8888/html',
+      { url: 'http://localhost:8888/html' }
     ]
   })
 
@@ -75,14 +75,14 @@ async function writtenCrawlPageAdvancedConfig() {
 // 2.1.Loader Base Config
 async function loaderBaseConfig() {
   const testXCrawl = xCrawl({
-    baseUrl: 'https://github.com',
+    baseUrl: 'http://localhost:8888',
     proxy: { urls: ['http://localhost:14892'] },
     timeout: 10000,
     intervalTime: { max: 1000 },
     maxRetry: 0
   })
 
-  const res = await testXCrawl.crawlPage(['/coder-hxl', '/coder-hxl/x-crawl'])
+  const res = await testXCrawl.crawlPage(['/html', '/html'])
 
   await res[0].data.browser.close()
 
@@ -91,10 +91,10 @@ async function loaderBaseConfig() {
 // 2.2.Loader Advanced Config
 async function loaderAdvancedConfig() {
-  const testXCrawl = xCrawl({ baseUrl: 'https://github.com' })
+  const testXCrawl = xCrawl({ baseUrl: 'http://localhost:8888' })
 
   const res = await testXCrawl.crawlPage({
-    targets: ['/coder-hxl', '/coder-hxl/x-crawl'],
+    targets: ['/html', '/html'],
     proxy: { urls: ['http://localhost:14892'] },
     timeout: 10000,
     intervalTime: { max: 1000 },
diff --git a/test/server/index.js b/test/server/index.js
index e9acf41..ac6f836 100644
--- a/test/server/index.js
+++ b/test/server/index.js
@@ -1 +1 @@
-"use strict";require("node:http").createServer(((e,t)=>{console.log(e.headers),t.setHeader("Content-Type","text/plain"),t.end("success")})).listen(8888,(()=>{console.log("服务器在 8888 端口启动成功~")}));
+"use strict";var e=require("node:http");e.createServer(((e,t)=>{const{url:n,method:o}=e;console.log(o,n);let r="text/plain",s="Please select /html or /data";"/html"===n?(r="text/html; charset=utf-8",s='[inline HTML page whose body renders "Hi"; markup lost in extraction]'):"/data"===n&&(r="application/json",s={code:200,message:"Hi"}),t.setHeader("Content-Type",r),t.end("string"==typeof s?s:JSON.stringify(s))})).listen(8888,(()=>{console.log("服务器在 8888 端口启动成功~")}));

diff --git a/test/server/index.ts b/test/server/index.ts
--- a/test/server/index.ts
+++ b/test/server/index.ts
+const html = `[HTML template for a minimal page whose body renders "Hi"; markup lost in extraction]
+`
+
 http
   .createServer((req, res) => {
-    console.log(req.headers)
+    const { url, method } = req
+    console.log(method, url)
+
+    let contentType = 'text/plain'
+    let content: any = 'Please select /html or /data'
+
+    if (url === '/html') {
+      contentType = 'text/html; charset=utf-8'
+      content = html
+    } else if (url === '/data') {
+      contentType = 'application/json'
+      content = { code: 200, message: 'Hi' }
+    }
 
-    res.setHeader('Content-Type', 'text/plain')
-    res.end('success')
+    res.setHeader('Content-Type', contentType)
+    res.end(typeof content === 'string' ? content : JSON.stringify(content))
   })
   .listen(8888, () => {
     console.log(`服务器在 8888 端口启动成功~`)
diff --git a/test/start/index.js b/test/start/index.js
index a5c2e0a..715f9c1 100644
--- a/test/start/index.js
+++ b/test/start/index.js
@@ -1 +1 @@
-[single-line minified bundle of the previous test/start demo (a crawlData POST to http://localhost:8888); mangled beyond recovery in extraction, elided]
+[single-line minified bundle of the rebuilt demo; it registers crawlHTML alongside crawlPage/crawlData/crawlFile and ends with K.crawlHTML("http://localhost:8888/html").then((e)=>{console.log(e.data?.html)}); mangled beyond recovery in extraction, elided]
diff --git a/test/start/index.ts b/test/start/index.ts
index 2d0bf33..eb898fa 100644
--- a/test/start/index.ts
+++ b/test/start/index.ts
@@ -2,15 +2,6 @@ import xCrawl from 'x-crawl'
 
 const testXCrawl = xCrawl()
 
-testXCrawl
-  .crawlData({
-    url: 'http://localhost:8888',
-    method: 'post',
-    headers: {
-      'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
-    },
-    data: { name: 'hxl', age: 19 }
-  })
-  .then((res) => {
-    console.log(res)
-  })
+testXCrawl.crawlHTML('http://localhost:8888/html').then((res) => {
+  console.log(res.data?.html)
+})
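
For orientation, here is a minimal usage sketch of the crawlHTML API introduced by this change, written against the `CrawlHTMLAdvancedConfig` and `CrawlHTMLSingleResult` types above. It assumes the same local test server on port 8888 (with its `/html` route) used throughout these tests, and the option values are illustrative only:

```ts
import xCrawl from 'x-crawl'

// Create a crawler instance; every option here is optional.
const myXCrawl = xCrawl({ maxRetry: 2, intervalTime: { max: 2000, min: 1000 } })

// crawlHTML accepts a URL string, a CrawlHTMLDetailTargetConfig, an array of
// either, or (as here) a CrawlHTMLAdvancedConfig with a `targets` array.
myXCrawl
  .crawlHTML({
    targets: [
      'http://localhost:8888/html',
      { url: 'http://localhost:8888/html', priority: 1 }
    ]
  })
  .then((results) => {
    // Each CrawlHTMLSingleResult carries { statusCode, headers, html } in
    // `data` (null when the crawl failed) plus the common isSuccess flag.
    for (const res of results) {
      if (res.isSuccess && res.data) {
        console.log(res.data.statusCode, res.data.html.length)
      }
    }
  })
```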