From 0edc0532690d1187297a7274b75e8fbb8444e3e3 Mon Sep 17 00:00:00 2001 From: coderhxl Date: Sun, 5 Feb 2023 19:05:37 +0800 Subject: [PATCH] Fix fetchFile API file save is lost --- package.json | 2 +- publish/package.json | 2 +- src/index.ts | 75 ++++++++++++++++++--------------- src/request.ts | 4 +- test/start/index.js | 2 +- test/start/index.ts | 99 +++++++++++++++++++++++++++----------------- 6 files changed, 108 insertions(+), 76 deletions(-) diff --git a/package.json b/package.json index 32e38e8..5842b51 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "x-crawl", - "version": "0.1.4", + "version": "0.1.5", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration.", "license": "MIT", diff --git a/publish/package.json b/publish/package.json index 261bc3a..f2cb608 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,6 +1,6 @@ { "name": "x-crawl", - "version": "0.1.4", + "version": "0.1.5", "author": "CoderHxl", "description": "XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration.", "license": "MIT", diff --git a/src/index.ts b/src/index.ts index 67db678..c86c793 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,7 +3,15 @@ import path from 'node:path' import { JSDOM } from 'jsdom' import { batchRequest, syncBatchRequest, request } from './request' -import { isArray, isString, isUndefined, log, logError } from './utils' +import { + isArray, + isString, + isUndefined, + log, + logError, + logNumber, + logSuccess +} from './utils' import { IXCrawlBaseConifg, @@ -135,39 +143,40 @@ export default class XCrawl { intervalTime ) - return new Promise((resolve) => { - const container: IFetchCommon = [] - - requestRes.forEach((requestResItem, index) => { - const { id, statusCode, headers, data } = requestResItem - - const mimeType = headers['content-type'] ?? '' - const suffix = mimeType.split('/').pop() - const fileName = new Date().getTime().toString() - const filePath = path.resolve( - fileConfig.storeDir, - `${fileName}.${suffix}` - ) - - fs.createWriteStream(filePath, 'binary').write(data, (err) => { - if (err) { - log(logError(`File save error at id ${id}: ${err.message}`)) - } else { - const fileInfo: IFileInfo = { - fileName, - mimeType, - size: data.length, - filePath - } - - container.push({ id, statusCode, headers, data: fileInfo }) - } - - if (index === requestRes.length - 1) { - resolve(container) - } + const container: IFetchCommon = [] + + requestRes.forEach((requestResItem) => { + const { id, headers, data } = requestResItem + + const mimeType = headers['content-type'] ?? '' + const suffix = mimeType.split('/').pop() + const fileName = new Date().getTime().toString() + const filePath = path.resolve( + fileConfig.storeDir, + `${fileName}.${suffix}` + ) + + try { + fs.writeFileSync(filePath, data) + + container.push({ + ...requestResItem, + data: { fileName, mimeType, size: data.length, filePath } }) - }) + } catch (error: any) { + log(logError(`File save error at id ${id}: ${error.message}`)) + } }) + + const saveTotal = requestRes.length + const success = container.length + const error = requestRes.length - container.length + log( + `saveTotal: ${logNumber(saveTotal)}, success: ${logSuccess( + success + )}, error: ${logError(error)}` + ) + + return container } } diff --git a/src/request.ts b/src/request.ts index 5303de5..1cda9cc 100644 --- a/src/request.ts +++ b/src/request.ts @@ -208,7 +208,7 @@ export async function batchRequest( error.forEach((message) => log(logError(message))) log( - `total: ${logNumber(requestConifgs.length)}, success: ${logSuccess( + `requestsTotal: ${logNumber(requestConifgs.length)}, success: ${logSuccess( success.length )}, error: ${logError(error.length)}` ) @@ -255,7 +255,7 @@ export async function syncBatchRequest( log(logSuccess('All requests are over!')) log( - `total: ${logNumber(requestConifgs.length)}, success: ${logSuccess( + `requestsTotal: ${logNumber(requestConifgs.length)}, success: ${logSuccess( successTotal )}, error: ${logError(errorTotal)}` ) diff --git a/test/start/index.js b/test/start/index.js index b00606e..9822203 100644 --- a/test/start/index.js +++ b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:fs"),t=require("node:path"),n=require("jsdom"),o=require("node:http"),s=require("https"),r=require("node:url"),a=require("chalk");function i(e,t=0){let n=Math.floor(Math.random()*e);return n1){const e=t?n:i(n.max,n.min);c(`Request ${u(o)} needs to sleep for ${u(e+"ms")} milliseconds before sending`),await function(e){return new Promise((t=>setTimeout(t,e)))}(e)}else c(`Request ${u(o)} does not need to sleep, send immediately`)}function w(e){return new Promise(((t,n)=>{const s=d(e.data);e.data=s?e.data:JSON.stringify(e.data);const r=g(e),a=o.request(r,(e=>{const{statusCode:n,headers:o}=e,s=[];e.on("data",(e=>s.push(e))),e.on("end",(()=>{const e=Buffer.concat(s);t({statusCode:n,headers:o,data:e})}))}));a.on("timeout",(()=>{n(new Error(`Timeout ${e.timeout}ms`))})),a.on("error",(e=>{n(e)})),"POST"!==r.method||s||a.write(e.data),a.end()}))}function q(e,t){const{baseUrl:n,timeout:o,intervalTime:s}=e,{requestConifg:r,intervalTime:a}=t,i=m(r)?r:[r];for(const e of i){const{url:t,timeout:s}=e;d(n)||(e.url=n+t),d(s)&&!d(o)&&(e.timeout=o)}return d(a)&&!d(s)&&(t.intervalTime=s),t}new class{baseConfig;constructor(e={}){this.baseConfig=e}async useBatchRequestByMode(e,t){const n=m(e)?e:[e];let o=[];return o="sync"!==this.baseConfig.mode?await async function(e,t){const n=!d(t),o=f(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);const s=[];let r=0;for(const a of e){const e=++r;await y(n,o,t,e);const i=w(a).catch((t=>`Request ${e} is an error: ${t.message}`)).then((t=>"string"==typeof t?t:{id:e,...t}));s.push(i)}c(h("All requests have been sent!"));const a=await Promise.all(s),i=[],m=[];return a.forEach((e=>{if("string"==typeof e)return m.push(e);i.push(e)})),m.forEach((e=>c(l(e)))),c(`total: ${u(e.length)}, success: ${h(i.length)}, error: ${l(m.length)}`),i}(n,t):await async function(e,t){const n=!d(t),o=f(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let s=0,r=0,a=0;const i=[];for(const d of e){s++,await y(n,o,t,s);try{const e=await w(d);i.push({id:s,...e}),c(h(`Request ${u(s)} is an success`)),r++}catch(e){c(l(`Request ${s} is an error: ${e.message}`)),a++}}return c(h("All requests are over!")),c(`total: ${u(e.length)}, success: ${h(r)}, error: ${l(a)}`),i}(n,t),o}async fetchHTML(e){const t="string"==typeof e?{url:e}:e;const{requestConifg:o}=q(this.baseConfig,{requestConifg:t}),s=await w(o),r=s.data.toString();return{...s,data:{raw:r,jsdom:new n.JSDOM(r)}}}async fetchData(e){const{requestConifg:t,intervalTime:n}=q(this.baseConfig,e),o=await this.useBatchRequestByMode(t,n),s=[];return o.forEach((e=>{const t=e.headers["content-type"]??"",n=e.data,o=t.includes("text")?n.toString():JSON.parse(n.toString());s.push({...e,data:o})})),s}async fetchFile(n){const{requestConifg:o,intervalTime:s,fileConfig:r}=q(this.baseConfig,n),a=await this.useBatchRequestByMode(o,s);return new Promise((n=>{const o=[];a.forEach(((s,i)=>{const{id:u,statusCode:h,headers:d,data:f}=s,m=d["content-type"]??"",p=m.split("/").pop(),g=(new Date).getTime().toString(),y=t.resolve(r.storeDir,`${g}.${p}`);e.createWriteStream(y,"binary").write(f,(e=>{if(e)c(l(`File save error at id ${u}: ${e.message}`));else{const e={fileName:g,mimeType:m,size:f.length,filePath:y};o.push({id:u,statusCode:h,headers:d,data:e})}i===a.length-1&&n(o)}))}))}))}}({timeout:1e4,intervalTime:{max:3e3,min:2e3},mode:"async"}).fetchData({requestConifg:[{url:"http://localhost:3001/home"},{url:"http://localhost:9001/api/home/wonderfulplace"},{url:"http://localhost:9001/api/home/goodprice"},{url:"http://localhost:3001/home"},{url:"http://localhost:9001/ai/home/goodprice"}]}).then((e=>{})); +"use strict";var e=require("node:path"),t=require("node:fs"),n=require("jsdom"),o=require("node:http"),s=require("https"),r=require("node:url"),a=require("chalk");function i(e,t=0){let n=Math.floor(Math.random()*e);return n1){const e=t?n:i(n.max,n.min);c(`Request ${u(o)} needs to sleep for ${u(e+"ms")} milliseconds before sending`),await function(e){return new Promise((t=>setTimeout(t,e)))}(e)}else c(`Request ${u(o)} does not need to sleep, send immediately`)}function w(e){return new Promise(((t,n)=>{const s=d(e.data);e.data=s?e.data:JSON.stringify(e.data);const r=p(e),a=o.request(r,(e=>{const{statusCode:n,headers:o}=e,s=[];e.on("data",(e=>s.push(e))),e.on("end",(()=>{const e=Buffer.concat(s);t({statusCode:n,headers:o,data:e})}))}));a.on("timeout",(()=>{n(new Error(`Timeout ${e.timeout}ms`))})),a.on("error",(e=>{n(e)})),"POST"!==r.method||s||a.write(e.data),a.end()}))}function q(e,t){const{baseUrl:n,timeout:o,intervalTime:s}=e,{requestConifg:r,intervalTime:a}=t,i=m(r)?r:[r];for(const e of i){const{url:t,timeout:s}=e;d(n)||(e.url=n+t),d(s)&&!d(o)&&(e.timeout=o)}return d(a)&&!d(s)&&(t.intervalTime=s),t}const $=new class{baseConfig;constructor(e={}){this.baseConfig=e}async useBatchRequestByMode(e,t){const n=m(e)?e:[e];let o=[];return o="sync"!==this.baseConfig.mode?await async function(e,t){const n=!d(t),o=f(t);c(`Begin execution, mode: async, total: ${u(e.length)} `);const s=[];let r=0;for(const a of e){const e=++r;await y(n,o,t,e);const i=w(a).catch((t=>`Request ${e} is an error: ${t.message}`)).then((t=>"string"==typeof t?t:{id:e,...t}));s.push(i)}c(l("All requests have been sent!"));const a=await Promise.all(s),i=[],m=[];return a.forEach((e=>{if("string"==typeof e)return m.push(e);i.push(e)})),m.forEach((e=>c(h(e)))),c(`requestsTotal: ${u(e.length)}, success: ${l(i.length)}, error: ${h(m.length)}`),i}(n,t):await async function(e,t){const n=!d(t),o=f(t);c(`Begin execution, mode: sync, total: ${u(e.length)} `);let s=0,r=0,a=0;const i=[];for(const d of e){s++,await y(n,o,t,s);try{const e=await w(d);i.push({id:s,...e}),c(l(`Request ${u(s)} is an success`)),r++}catch(e){c(h(`Request ${s} is an error: ${e.message}`)),a++}}return c(l("All requests are over!")),c(`requestsTotal: ${u(e.length)}, success: ${l(r)}, error: ${h(a)}`),i}(n,t),o}async fetchHTML(e){const t="string"==typeof e?{url:e}:e;const{requestConifg:o}=q(this.baseConfig,{requestConifg:t}),s=await w(o),r=s.data.toString();return{...s,data:{raw:r,jsdom:new n.JSDOM(r)}}}async fetchData(e){const{requestConifg:t,intervalTime:n}=q(this.baseConfig,e),o=await this.useBatchRequestByMode(t,n),s=[];return o.forEach((e=>{const t=e.headers["content-type"]??"",n=e.data,o=t.includes("text")?n.toString():JSON.parse(n.toString());s.push({...e,data:o})})),s}async fetchFile(n){const{requestConifg:o,intervalTime:s,fileConfig:r}=q(this.baseConfig,n),a=await this.useBatchRequestByMode(o,s),i=[];a.forEach((n=>{const{id:o,headers:s,data:a}=n,u=s["content-type"]??"",l=u.split("/").pop(),d=(new Date).getTime().toString(),f=e.resolve(r.storeDir,`${d}.${l}`);try{t.writeFileSync(f,a),i.push({...n,data:{fileName:d,mimeType:u,size:a.length,filePath:f}})}catch(e){c(h(`File save error at id ${o}: ${e.message}`))}}));const d=a.length,f=i.length,m=a.length-i.length;return c(`saveTotal: ${u(d)}, success: ${l(f)}, error: ${h(m)}`),i}}({timeout:1e4,intervalTime:{max:1e3,min:500},mode:"async"});$.fetchHTML({url:"https://www.bilibili.com/"}).then((t=>{const{jsdom:n}=t.data,o=n.window.document.querySelectorAll(".bili-video-card__cover"),s=[];o.forEach(((e,t)=>{const n=e.lastChild;t%2?s.push("https:"+n.src):s.push(n.src)})),console.log(s);const r=s.map((e=>({url:e})));$.fetchFile({requestConifg:r,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}}).then((e=>{}))})); diff --git a/test/start/index.ts b/test/start/index.ts index 140a0fe..eb68eb6 100644 --- a/test/start/index.ts +++ b/test/start/index.ts @@ -3,54 +3,77 @@ import XCrawl from '../../src' const testXCrawl = new XCrawl({ timeout: 10000, - intervalTime: { - max: 3000, - min: 2000 - }, + intervalTime: { max: 2000, min: 1000 }, mode: 'async' }) -testXCrawl - .fetchData({ - requestConifg: [ - { url: 'http://localhost:3001/home' }, - { url: 'http://localhost:9001/api/home/wonderfulplace' }, - { url: 'http://localhost:9001/api/home/goodprice' }, - { url: 'http://localhost:3001/home' }, - { url: 'http://localhost:9001/ai/home/goodprice' } - ] - }) - .then((res) => { - // console.log(res) +// testXCrawl +// .fetchData({ +// requestConifg: [ +// { url: 'http://localhost:3001/home' }, +// { url: 'http://localhost:9001/api/home/wonderfulplace' }, +// { url: 'http://localhost:9001/api/home/goodprice' }, +// { url: 'http://localhost:3001/home' }, +// { url: 'http://localhost:9001/ai/home/goodprice' } +// ] +// }) +// .then((res) => { +// // console.log(res) +// }) + +testXCrawl.fetchHTML({ url: 'https://www.bilibili.com/' }).then((res) => { + const { jsdom } = res.data + + const document = jsdom.window.document + const imgBoxEl = document.querySelectorAll('.bili-video-card__cover') + + const imgUrls: string[] = [] + imgBoxEl.forEach((item, index) => { + const img = item.lastChild as HTMLImageElement + + if (index % 2) { + imgUrls.push('https:' + img.src) + } else { + imgUrls.push(img.src) + } }) -// testXCrawl.fetchHTML({ url: 'https://www.bilibili.com/' }).then((res) => { -// const { jsdom } = res.data + console.log(imgUrls) -// const document = jsdom.window.document -// const imgBoxEl = document.querySelectorAll('.bili-video-card__cover') + const requestConifg = imgUrls.map((url) => ({ url })) -// const imgUrls: string[] = [] -// imgBoxEl.forEach((item, index) => { -// const img = item.lastChild as HTMLImageElement + testXCrawl + .fetchFile({ + requestConifg, + fileConfig: { storeDir: path.resolve(__dirname, './upload') } + }) + .then((res) => { + // console.log(res) + }) +}) -// if (index % 2) { -// imgUrls.push('https:' + img.src) -// } else { -// imgUrls.push(img.src) +// testXCrawl +// .fetchData({ +// requestConifg: { +// url: 'http://localhost:9001/api/area/阳江市', +// method: 'POST', +// data: { +// type: 'plus', +// offset: 0, +// size: 20 +// } // } // }) +// .then((res) => { +// const room = res[0].data.data.list[0] +// const requestConifg = room.pictureUrls.map((item: any) => ({ +// url: item +// })) -// console.log(imgUrls) - -// const requestConifg = imgUrls.map((url) => ({ url })) - -// testXCrawl -// .fetchFile({ +// testXCrawl.fetchFile({ // requestConifg, -// fileConfig: { storeDir: path.resolve(__dirname, './upload') } -// }) -// .then((res) => { -// console.log(res) +// fileConfig: { +// storeDir: path.resolve(__dirname, './upload') +// } // }) -// }) +// })