From d93f0f117040dc3b9dde08d0896653c5e30ec465 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Fri, 6 Dec 2024 09:20:13 -0500 Subject: [PATCH] Add a CLI parameter for passing selenium hub --- cli/crawl-cli.js | 46 ++++++++++++++++++++++++++++++++++++++++++--- cli/crawlConfig.js | 6 +++++- crawler.js | 4 ++-- crawlerConductor.js | 28 +++++++++++++++++++++++---- 4 files changed, 74 insertions(+), 10 deletions(-) diff --git a/cli/crawl-cli.js b/cli/crawl-cli.js index 3cbe7e9f..ae0e44aa 100644 --- a/cli/crawl-cli.js +++ b/cli/crawl-cli.js @@ -29,6 +29,7 @@ program .option('--config ', 'crawl configuration file') .option('--autoconsent-action ', 'dismiss cookie popups. Possible values: optout, optin') .option('--chromium-version ', 'use custom version of chromium') + .option('--selenium-hub ', 'selenium hub endpoint to request browsers from') .parse(process.argv); /** @@ -100,8 +101,28 @@ function filterUrls(inputUrls, logFunction, outputPath) { * @param {number} maxLoadTimeMs * @param {number} extraExecutionTimeMs * @param {Object.} collectorFlags + * @param {string} seleniumHub */ -async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) { +async function run( + inputUrls, + outputPath, + verbose, + logPath, + numberOfCrawlers, + dataCollectors, + reporters, + forceOverwrite, + filterOutFirstParty, + emulateMobile, + proxyHost, + regionCode, + antiBotDetection, + chromiumVersion, + maxLoadTimeMs, + extraExecutionTimeMs, + collectorFlags, + seleniumHub +) { const startTime = new Date(); reporters.forEach(reporter => { @@ -189,6 +210,7 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, + seleniumHub }); log(chalk.green('\n✅ Finished successfully.')); } catch(e) { @@ -217,7 +239,6 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da }); } -// @ts-ignore const config = crawlConfig.figureOut(program.opts()); const collectorFlags = { autoconsentAction: program.opts().autoconsentAction, @@ -272,7 +293,26 @@ if (!config.urls || !config.output) { return item; }); - run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags); + run( + urls, + config.output, + config.verbose, + config.logPath, + config.crawlers || null, + dataCollectors, + reporters, + config.forceOverwrite, + config.filterOutFirstParty, + config.emulateMobile, + config.proxyConfig, + config.regionCode, + !config.disableAntiBot, + config.chromiumVersion, + config.maxLoadTimeMs, + config.extraExecutionTimeMs, + collectorFlags, + config.seleniumHub + ); } /** diff --git a/cli/crawlConfig.js b/cli/crawlConfig.js index 8749ba10..f3d19235 100644 --- a/cli/crawlConfig.js +++ b/cli/crawlConfig.js @@ -14,7 +14,7 @@ function addProtocolIfNeeded(url) { /** * Looks at CLI flags, JSON config etc. to figure out the final crawl config * - * @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags + * @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, seleniumHub?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags * @returns {CrawlConfig} */ function figureOut(flags) { @@ -65,6 +65,9 @@ function figureOut(flags) { if (flags.chromiumVersion) { crawlConfig.chromiumVersion = flags.chromiumVersion; } + if (flags.seleniumHub) { + crawlConfig.seleniumHub = flags.seleniumHub; + } // array settings if (flags.dataCollectors) { @@ -137,4 +140,5 @@ module.exports = { * @property {boolean} disableAntiBot * @property {number} maxLoadTimeMs * @property {number} extraExecutionTimeMs + * @property {string} seleniumHub */ \ No newline at end of file diff --git a/crawler.js b/crawler.js index 480f880c..476d9158 100644 --- a/crawler.js +++ b/crawler.js @@ -347,8 +347,7 @@ async function crawl(url, options) { log, options.proxyHost, options.executablePath, - // FIXME: this is a hardcoded value - 'http://10.100.9.21:4444' + options.seleniumHub, ); const browserConnection = options.browserConnection || await browser.getConnection(); @@ -409,6 +408,7 @@ async function crawl(url, options) { * @property {number=} maxLoadTimeMs * @property {number=} extraExecutionTimeMs * @property {Object.=} collectorFlags + * @property {string=} seleniumHub */ /** diff --git a/crawlerConductor.js b/crawlerConductor.js index 85349094..0b626825 100644 --- a/crawlerConductor.js +++ b/crawlerConductor.js @@ -22,8 +22,23 @@ const MAX_NUMBER_OF_RETRIES = 2; * @param {number} maxLoadTimeMs * @param {number} extraExecutionTimeMs * @param {Object.} collectorFlags + * @param {string} seleniumHub */ -async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) { +async function crawlAndSaveData( + urlString, + dataCollectors, + log, + filterOutFirstParty, + dataCallback, + emulateMobile, + proxyHost, + antiBotDetection, + executablePath, + maxLoadTimeMs, + extraExecutionTimeMs, + collectorFlags, + seleniumHub +) { const url = new URL(urlString); /** * @type {function(...any):void} @@ -52,13 +67,14 @@ async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstPa maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, + seleniumHub, }); dataCallback(url, data); } /** - * @param {{urls: Array, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.}} options + * @param {{urls: Array, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object., seleniumHub?: string}} options */ module.exports = async options => { const log = options.logFunction || (() => {}); @@ -75,7 +91,10 @@ module.exports = async options => { log(chalk.cyan(`Number of crawlers: ${numberOfCrawlers}\n`)); // make sure the browser is downloaded before we start parallel tasks - const executablePath = await downloadChrome(log, options.chromiumVersion); + let executablePath = null; + if (!options.seleniumHub) { + executablePath = await downloadChrome(log, options.chromiumVersion); + } /** @type {Set} */ const inProgress = new Set(); @@ -109,7 +128,8 @@ module.exports = async options => { executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, - options.collectorFlags + options.collectorFlags, + options.seleniumHub, ); asyncLib.retry(MAX_NUMBER_OF_RETRIES, task, err => {