Skip to content

Commit

Permalink
Add a CLI parameter for passing selenium hub
Browse files: browse the repository at this point in the history
  • Loading branch information
muodov committed Dec 6, 2024
1 parent da05532 commit d93f0f1
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 10 deletions.
46 changes: 43 additions & 3 deletions cli/crawl-cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ program
.option('--config <path>', 'crawl configuration file')
.option('--autoconsent-action <action>', 'dismiss cookie popups. Possible values: optout, optin')
.option('--chromium-version <version_number>', 'use custom version of chromium')
.option('--selenium-hub <url>', 'selenium hub endpoint to request browsers from')
.parse(process.argv);

/**
Expand Down Expand Up @@ -100,8 +101,28 @@ function filterUrls(inputUrls, logFunction, outputPath) {
* @param {number} maxLoadTimeMs
* @param {number} extraExecutionTimeMs
* @param {Object.<string, boolean>} collectorFlags
* @param {string} seleniumHub
*/
async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) {
async function run(
inputUrls,
outputPath,
verbose,
logPath,
numberOfCrawlers,
dataCollectors,
reporters,
forceOverwrite,
filterOutFirstParty,
emulateMobile,
proxyHost,
regionCode,
antiBotDetection,
chromiumVersion,
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
seleniumHub
) {
const startTime = new Date();

reporters.forEach(reporter => {
Expand Down Expand Up @@ -189,6 +210,7 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
seleniumHub
});
log(chalk.green('\n✅ Finished successfully.'));
} catch(e) {
Expand Down Expand Up @@ -217,7 +239,6 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
});
}

// @ts-ignore
const config = crawlConfig.figureOut(program.opts());
const collectorFlags = {
autoconsentAction: program.opts().autoconsentAction,
Expand Down Expand Up @@ -272,7 +293,26 @@ if (!config.urls || !config.output) {
return item;
});

run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags);
run(
urls,
config.output,
config.verbose,
config.logPath,
config.crawlers || null,
dataCollectors,
reporters,
config.forceOverwrite,
config.filterOutFirstParty,
config.emulateMobile,
config.proxyConfig,
config.regionCode,
!config.disableAntiBot,
config.chromiumVersion,
config.maxLoadTimeMs,
config.extraExecutionTimeMs,
collectorFlags,
config.seleniumHub
);
}

/**
Expand Down
6 changes: 5 additions & 1 deletion cli/crawlConfig.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function addProtocolIfNeeded(url) {
/**
* Looks at CLI flags, JSON config etc. to figure out the final crawl config
*
* @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags
* @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, seleniumHub?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags
* @returns {CrawlConfig}
*/
function figureOut(flags) {
Expand Down Expand Up @@ -65,6 +65,9 @@ function figureOut(flags) {
if (flags.chromiumVersion) {
crawlConfig.chromiumVersion = flags.chromiumVersion;
}
if (flags.seleniumHub) {
crawlConfig.seleniumHub = flags.seleniumHub;
}

// array settings
if (flags.dataCollectors) {
Expand Down Expand Up @@ -137,4 +140,5 @@ module.exports = {
* @property {boolean} disableAntiBot
* @property {number} maxLoadTimeMs
* @property {number} extraExecutionTimeMs
* @property {string} seleniumHub
*/
4 changes: 2 additions & 2 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -347,8 +347,7 @@ async function crawl(url, options) {
log,
options.proxyHost,
options.executablePath,
// FIXME: this is a hardcoded value
'http://10.100.9.21:4444'
options.seleniumHub,
);
const browserConnection = options.browserConnection || await browser.getConnection();

Expand Down Expand Up @@ -409,6 +408,7 @@ async function crawl(url, options) {
* @property {number=} maxLoadTimeMs
* @property {number=} extraExecutionTimeMs
* @property {Object.<string, string>=} collectorFlags
* @property {string=} seleniumHub
*/

/**
Expand Down
28 changes: 24 additions & 4 deletions crawlerConductor.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,23 @@ const MAX_NUMBER_OF_RETRIES = 2;
* @param {number} maxLoadTimeMs
* @param {number} extraExecutionTimeMs
* @param {Object.<string, string>} collectorFlags
* @param {string} seleniumHub
*/
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) {
async function crawlAndSaveData(
urlString,
dataCollectors,
log,
filterOutFirstParty,
dataCallback,
emulateMobile,
proxyHost,
antiBotDetection,
executablePath,
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
seleniumHub
) {
const url = new URL(urlString);
/**
* @type {function(...any):void}
Expand Down Expand Up @@ -52,13 +67,14 @@ async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstPa
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
seleniumHub,
});

dataCallback(url, data);
}

/**
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>}} options
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>, seleniumHub?: string}} options
*/
module.exports = async options => {
const log = options.logFunction || (() => {});
Expand All @@ -75,7 +91,10 @@ module.exports = async options => {
log(chalk.cyan(`Number of crawlers: ${numberOfCrawlers}\n`));

// make sure the browser is downloaded before we start parallel tasks
const executablePath = await downloadChrome(log, options.chromiumVersion);
let executablePath = null;
if (!options.seleniumHub) {
executablePath = await downloadChrome(log, options.chromiumVersion);
}

/** @type {Set<string>} */
const inProgress = new Set();
Expand Down Expand Up @@ -109,7 +128,8 @@ module.exports = async options => {
executablePath,
options.maxLoadTimeMs,
options.extraExecutionTimeMs,
options.collectorFlags
options.collectorFlags,
options.seleniumHub,
);

asyncLib.retry(MAX_NUMBER_OF_RETRIES, task, err => {
Expand Down

0 comments on commit d93f0f1

Please sign in to comment.