From 1a31ddded6432252af1b0eb2cf0528c9a928993c Mon Sep 17 00:00:00 2001 From: coderhxl Date: Thu, 16 Feb 2023 19:45:08 +0800 Subject: [PATCH] other --- README.md | 3 ++- document/cn.md | 3 ++- package.json | 2 +- publish/README.md | 5 +++-- publish/package.json | 2 +- test/start/index.js | 2 +- test/start/index.ts | 5 ++++- 7 files changed, 14 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 0704f62..0ed27da 100644 --- a/README.md +++ b/README.md @@ -310,7 +310,8 @@ interface IFetchDataConfig extends IFetchBaseConifg { ```ts interface IFetchFileConfig extends IFetchBaseConifg { fileConfig: { - storeDir: string + storeDir: string // store folder + extension?: string // filename extension } } ``` diff --git a/document/cn.md b/document/cn.md index 5cbc27d..f9dbdc5 100644 --- a/document/cn.md +++ b/document/cn.md @@ -322,7 +322,8 @@ interface IFetchDataConfig extends IFetchBaseConifg { ```ts interface IFetchFileConfig extends IFetchBaseConifg { fileConfig: { - storeDir: string + storeDir: string // 存放文件夹 + extension?: string // 文件扩展名 } } ``` diff --git a/package.json b/package.json index 6580827..5967a80 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "x-crawl", - "version": "0.3.2", + "version": "0.4.0", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/publish/README.md b/publish/README.md index 8f75e51..0ed27da 100644 --- a/publish/README.md +++ b/publish/README.md @@ -17,7 +17,7 @@ XCrawl is a Nodejs multifunctional crawler library. - [Install](#Install) - [Example](#Example) -- [Core concepts](#Core concepts) +- [Core concepts](#Core-concepts) * [XCrawl](#XCrawl) + [Type](#Type-1) + [Example](#Example-1) @@ -310,7 +310,8 @@ interface IFetchDataConfig extends IFetchBaseConifg { ```ts interface IFetchFileConfig extends IFetchBaseConifg { fileConfig: { - storeDir: string + storeDir: string // store folder + extension?: string // filename extension } } ``` diff --git a/publish/package.json b/publish/package.json index ebde1b5..c661115 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,6 +1,6 @@ { "name": "x-crawl", - "version": "0.3.1", + "version": "0.4.0", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library.", "license": "MIT", diff --git a/test/start/index.js b/test/start/index.js index 539e422..4b8bb0a 100644 --- a/test/start/index.js +++ b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),r=require("node:https"),s=require("node:url"),a=require("https-proxy-agent"),i=require("chalk");const c=console.log,u=i.hex("#a57fff"),l=i.green,h=i.red,f=i.yellow;function d(e){return void 0===e}function g(e){return"number"==typeof e}function m(e){return Array.isArray(e)}function p(e,t){let o=e?`${e}`:"?";if(t)for(const e in t){o+=`&${e}=${t[e]}`}else o=e;return o}function y(e){const{protocol:t,hostname:o,port:i,pathname:c,search:u}=new s.URL(e.url),l="http:"===t,h={agent:e.proxy?a(e.proxy):l?new n.Agent:new r.Agent,protocol:t,hostname:o,port:i,path:c,search:p(u,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return h.headers=function(e,t){const o={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(o["Content-Type"]="application/json",o["Content-Length"]=Buffer.byteLength(e.data)),o}(e,h),h}function w(e){return new Promise(((t,o)=>{const s=d(e.data);e.data=s?e.data:JSON.stringify(e.data);const a=y(e);function i(e){const{statusCode:o,headers:n}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:o,headers:n,data:e})}))}let c;c="http:"===a.protocol?n.request(a,i):r.request(a,i),c.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),c.on("error",(e=>{o(e)})),"POST"!==a.method||s||c.write(e.data),c.end()}))}async function q(e,t,o,n){if(e&&n>1){const e=t?o:function(e,t=0){let o=Math.floor(Math.random()*e);for(;osetTimeout(t,e)))}(e)}else c(`Request ${u(n)} does not need to sleep, send immediately`)}const $=new class{baseConfig;constructor(e={}){this.baseConfig=e}mergeConfig(e){const t=this.baseConfig,o=structuredClone(e),n=m(o.requestConifg)?o.requestConifg:[o.requestConifg];for(const e of n){const{url:o,timeout:n,proxy:r}=e;d(t.baseUrl)||(e.url=t.baseUrl+o),d(n)&&(e.timeout=t.timeout),d(r)&&(e.proxy=t.proxy)}return d(o.intervalTime)&&(o.intervalTime=t.intervalTime),o}async useBatchRequestByMode(e,t){const o=m(e)?e:[e];let n=[];return n="sync"!==this.baseConfig.mode?await async function(e,t){const o=!d(t),n=g(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);const r=[];let s=0;for(const a of e){const e=++s;await q(o,n,t,e);const i=w(a).catch((t=>`Request ${e} is an error: ${t.message}`)).then((t=>"string"==typeof t?t:{id:e,...t}));r.push(i)}c(l("All requests have been sent!"));const a=await Promise.all(r),i=[],f=[];return a.forEach((e=>{if("string"==typeof e)return f.push(e);i.push(e)})),f.forEach((e=>c(h(e)))),c(`requestsTotal: ${u(e.length)}, success: ${l(i.length)}, error: ${h(f.length)}`),i}(o,t):await async function(e,t){const o=!d(t),n=g(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let r=0,s=0,a=0;const i=[];for(const f of e){r++,await q(o,n,t,r);try{const e=await w(f);i.push({id:r,...e}),c(l(`Request ${u(r)} is an success`)),s++}catch(e){c(h(`Request ${r} is an error: ${e.message}`)),a++}}return c(l("All requests are over!")),c(`requestsTotal: ${u(e.length)}, success: ${l(s)}, error: ${h(a)}`),i}(o,t),n}async fetchHTML(e){const{requestConifg:t}=this.mergeConfig({requestConifg:(n=e,"string"==typeof n?{url:e}:e)});var n;const r=await w(t),s=r.data.toString();return{...r,data:{raw:s,jsdom:new o.JSDOM(s)}}}async fetchData(e){const{requestConifg:t,intervalTime:o}=this.mergeConfig(e),n=await this.useBatchRequestByMode(t,o),r=[];return n.forEach((e=>{const t=e.headers["content-type"]??"",o=e.data,n=t.includes("text")?o.toString():JSON.parse(o.toString());r.push({...e,data:n})})),r}async fetchFile(o){const{requestConifg:n,intervalTime:r,fileConfig:s}=this.mergeConfig(o),a=await this.useBatchRequestByMode(n,r),i=[];a.forEach((o=>{const{id:n,headers:r,data:a}=o,u=r["content-type"]??"",l=u.split("/").pop(),f=(new Date).getTime().toString(),d=e.resolve(s.storeDir,`${f}.${l}`);try{t.writeFileSync(d,a),i.push({...o,data:{fileName:f,mimeType:u,size:a.length,filePath:d}})}catch(e){c(h(`File save error at id ${n}: ${e.message}`))}}));const f=a.length,d=i.length,g=f-d;return c(`saveTotal: ${u(f)}, success: ${l(d)}, error: ${h(g)}`),i}fetchPolling(e,t){const{Y:o,M:n,d:r,h:s,m:a}=e,i=(d(o)?0:1e3*o*60*60*24*365)+(d(n)?0:1e3*n*60*60*24*30)+(d(r)?0:1e3*r*60*60*24)+(d(s)?0:1e3*s*60*60)+(d(a)?0:1e3*a*60);let c=0;function u(){console.log(f(`Start the ${f.bold(++c)} polling`)),t(c)}u(),setInterval(u,i)}}({timeout:1e4,intervalTime:{max:2e3,min:1e3},mode:"async"});$.fetchHTML({url:"https://www.google.com.hk/",proxy:"http://127.0.0.1:14892"}).then((t=>{console.log(t.statusCode);const{jsdom:o}=t.data,n=o.window.document.querySelector(".lnXdpd");$.fetchFile({requestConifg:{url:"https://www.google.com.hk/"+n.src,proxy:"http://127.0.0.1:14892"},fileConfig:{storeDir:e.resolve(__dirname,"./upload")}})})); +"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),r=require("node:https"),s=require("node:url"),a=require("https-proxy-agent"),i=require("chalk");const c=console.log,u=i.hex("#a57fff"),l=i.green,h=i.red,f=i.yellow;function d(e){return void 0===e}function g(e){return"number"==typeof e}function m(e){return Array.isArray(e)}function p(e,t){let o=e?`${e}`:"?";if(t)for(const e in t){o+=`&${e}=${t[e]}`}else o=e;return o}function y(e){const{protocol:t,hostname:o,port:i,pathname:c,search:u}=new s.URL(e.url),l="http:"===t,h={agent:e.proxy?a(e.proxy):l?new n.Agent:new r.Agent,protocol:t,hostname:o,port:i,path:c,search:p(u,e.params),method:e.method?.toLocaleUpperCase()??"GET",headers:{},timeout:e.timeout};return h.headers=function(e,t){const o={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",...e.headers??{}};return"POST"===t.method&&e.data&&(o["Content-Type"]="application/json",o["Content-Length"]=Buffer.byteLength(e.data)),o}(e,h),h}function q(e){return new Promise(((t,o)=>{const s=d(e.data);e.data=s?e.data:JSON.stringify(e.data);const a=y(e);function i(e){const{statusCode:o,headers:n}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:o,headers:n,data:e})}))}let c;c="http:"===a.protocol?n.request(a,i):r.request(a,i),c.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),c.on("error",(e=>{o(e)})),"POST"!==a.method||s||c.write(e.data),c.end()}))}async function w(e,t,o,n){if(e&&n>1){const e=t?o:function(e,t=0){let o=Math.floor(Math.random()*e);for(;osetTimeout(t,e)))}(e)}else c(`Request ${u(n)} does not need to sleep, send immediately`)}const $=new class{baseConfig;constructor(e={}){this.baseConfig=e}mergeConfig(e){const t=this.baseConfig,o=structuredClone(e),n=m(o.requestConifg)?o.requestConifg:[o.requestConifg];for(const e of n){const{url:o,timeout:n,proxy:r}=e;d(t.baseUrl)||(e.url=t.baseUrl+o),d(n)&&(e.timeout=t.timeout),d(r)&&(e.proxy=t.proxy)}return d(o.intervalTime)&&(o.intervalTime=t.intervalTime),o}async useBatchRequestByMode(e,t){const o=m(e)?e:[e];let n=[];return n="sync"!==this.baseConfig.mode?await async function(e,t){const o=!d(t),n=g(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);const r=[];let s=0;for(const a of e){const e=++s;await w(o,n,t,e);const i=q(a).catch((t=>`Request ${e} is an error: ${t.message}`)).then((t=>"string"==typeof t?t:{id:e,...t}));r.push(i)}c(l("All requests have been sent!"));const a=await Promise.all(r),i=[],f=[];return a.forEach((e=>{if("string"==typeof e)return f.push(e);i.push(e)})),f.forEach((e=>c(h(e)))),c(`requestsTotal: ${u(e.length)}, success: ${l(i.length)}, error: ${h(f.length)}`),i}(o,t):await async function(e,t){const o=!d(t),n=g(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let r=0,s=0,a=0;const i=[];for(const f of e){r++,await w(o,n,t,r);try{const e=await q(f);i.push({id:r,...e}),c(l(`Request ${u(r)} is an success`)),s++}catch(e){c(h(`Request ${r} is an error: ${e.message}`)),a++}}return c(l("All requests are over!")),c(`requestsTotal: ${u(e.length)}, success: ${l(s)}, error: ${h(a)}`),i}(o,t),n}async fetchHTML(e){const{requestConifg:t}=this.mergeConfig({requestConifg:(n=e,"string"==typeof n?{url:e}:e)});var n;const r=await q(t),s=r.data.toString();return{...r,data:{html:s,jsdom:new o.JSDOM(s)}}}async fetchData(e){const{requestConifg:t,intervalTime:o}=this.mergeConfig(e),n=await this.useBatchRequestByMode(t,o),r=[];return n.forEach((e=>{const t=e.headers["content-type"]??"",o=e.data,n=t.includes("text")?o.toString():JSON.parse(o.toString());r.push({...e,data:n})})),r}async fetchFile(o){const{requestConifg:n,intervalTime:r,fileConfig:s}=this.mergeConfig(o),a=await this.useBatchRequestByMode(n,r),i=[];a.forEach((o=>{const{id:n,headers:r,data:a}=o,u=r["content-type"]??"",l=s.extension??u.split("/").pop(),f=(new Date).getTime().toString(),d=e.resolve(s.storeDir,`${f}.${l}`);try{t.writeFileSync(d,a),i.push({...o,data:{fileName:f,mimeType:u,size:a.length,filePath:d}})}catch(e){c(h(`File save error at id ${n}: ${e.message}`))}}));const f=a.length,d=i.length,g=f-d;return c(`saveTotal: ${u(f)}, success: ${l(d)}, error: ${h(g)}`),i}fetchPolling(e,t){const{Y:o,M:n,d:r,h:s,m:a}=e,i=(d(o)?0:1e3*o*60*60*24*365)+(d(n)?0:1e3*n*60*60*24*30)+(d(r)?0:1e3*r*60*60*24)+(d(s)?0:1e3*s*60*60)+(d(a)?0:1e3*a*60);let c=0;function u(){console.log(f(`Start the ${f.bold(++c)} polling`)),t(c)}u(),setInterval(u,i)}}({timeout:1e4,intervalTime:{max:2e3,min:1e3},mode:"async"});$.fetchHTML({url:"https://www.google.com.hk/",proxy:"http://127.0.0.1:14892"}).then((t=>{console.log(t.statusCode);const{jsdom:o}=t.data,n=o.window.document.querySelector(".lnXdpd");$.fetchFile({requestConifg:{url:"https://www.google.com.hk/"+n.src,proxy:"http://127.0.0.1:14892"},fileConfig:{storeDir:e.resolve(__dirname,"./upload"),extension:"jpg"}})})); diff --git a/test/start/index.ts b/test/start/index.ts index 9b0ad1c..acb072c 100644 --- a/test/start/index.ts +++ b/test/start/index.ts @@ -54,6 +54,9 @@ testXCrawl url: 'https://www.google.com.hk/' + imgEl!.src, proxy: 'http://127.0.0.1:14892' }, - fileConfig: { storeDir: path.resolve(__dirname, './upload') } + fileConfig: { + storeDir: path.resolve(__dirname, './upload'), + extension: 'jpg' + } }) })