From c9cd7018dac72794249bb50d01b7a7a3edeb9786 Mon Sep 17 00:00:00 2001
From: Karthik UJ
Date: Sun, 16 Jun 2024 15:57:45 +0530
Subject: [PATCH 1/5] Initial changes for multi browser

Signed-off-by: Karthik UJ
---
 config/config.json               |   7 +-
 src/crawler/crawlStateManager.js |   6 +-
 src/crawler/crawler.js           | 193 ++++++++++++++++++-------------
 src/models/configModel.js        |   1 +
 4 files changed, 122 insertions(+), 85 deletions(-)

diff --git a/config/config.json b/config/config.json
index 561fcba..6ea57e0 100644
--- a/config/config.json
+++ b/config/config.json
@@ -6,13 +6,14 @@
       "enabled": false,
       "host": "127.0.0.1",
       "port": 8080
-    }
+    },
+    "instances": 1
   },
   "crawler": {
     "entryPoint": "https://security-crawl-maze.app/",
     "eventTimeout": 10000,
     "navigationTimeout": 30000,
-    "eventWait": 1000,
+    "eventWait": 0,
     "maxDuration": 0,
     "elements": [
       "a",
@@ -29,7 +30,7 @@
     },
     "recorderAuth": {
       "enabled": false,
-      "pptrRecording": "/path/to/pptrRecording.json"
+      "pptrRecording": "C:\\Users\\5up3r541y4n\\Dev\\login-recordings\\hypejab.pptr.json"
     }
   },
   "includeRegexes": [
diff --git a/src/crawler/crawlStateManager.js b/src/crawler/crawlStateManager.js
index c9b9794..ae277cf 100644
--- a/src/crawler/crawlStateManager.js
+++ b/src/crawler/crawlStateManager.js
@@ -8,6 +8,7 @@
    */
   constructor(rootState) {
     this.rootState = rootState ? rootState : null;
+    this.visitedActions = new Set();
   }
 
   /**
@@ -73,7 +74,10 @@ class CrawlStateManager {
       if (childState) {
         stack.push(...childState.getCrawlActions());
       } else {
-        return currentAction;
+        if (!this.visitedActions.has(currentAction.actionId)) {
+          this.visitedActions.add(currentAction.actionId);
+          return currentAction;
+        }
       }
     }
   }
diff --git a/src/crawler/crawler.js b/src/crawler/crawler.js
index b71d673..7342607 100644
--- a/src/crawler/crawler.js
+++ b/src/crawler/crawler.js
@@ -31,6 +31,7 @@ class Crawler {
     this.authInProgress = false;
     this.allUrls = new Set();
     this.allInteractables = [...this.crawlerConfig.elements].concat(CrawlInput.INPUT_FIELDS.map((element) => element.CSS_PATH));
+    this.endTime = null;
   }
 
   /**
@@ -345,127 +346,157 @@ class Crawler {
     appendFileSync(fullPath, url + '\n');
   }
 
+  async handleCrawl (crawlManager, browser, page, currentState) {
+    let nextCrawlAction;
+    while ((nextCrawlAction = crawlManager.getNextCrawlAction()) && (this.crawlerConfig.maxDuration === 0 || Date.now() < this.endTime)) {
+      const currentAction = nextCrawlAction;
+      // console.log('\nCurrent Action:');
+      // console.log(currentAction.cssPath);
+      await this.performAction(crawlManager, currentAction, page);
+      // console.log('Action performed');
+      const currentStateHash = await this.getPageHash(page);
+      // console.log('Current state hash:');
+      // console.log(currentStateHash);
+      const existingState = crawlManager.getStateByHash(currentStateHash);
+      if (existingState) {
+        // console.log('State already exists');
+        currentState = existingState;
+        this.removeCrawlActionFromState(currentAction);
+      } else {
+        if (this.inContext(page.url())) {
+          // console.log('State does not exist, creating...');
+          currentState = await this.getNewCrawlState(page, currentAction.getParentState().crawlDepth + 1, currentStateHash, crawlManager);
+          currentAction.childState = currentState;
+        } else {
+          this.removeCrawlActionFromState(currentAction);
+        }
+      }
+    }
+
+    await browser.close();
+  }
+
   /**
    * Starts the crawling process.
    */
   async startCrawling() {
     console.log(chalk.greenBright(`[INFO] Initializing browser...`));
-    const browser = await Browser.getBrowserInstance(this.config.browser);
-    console.log(chalk.greenBright(`[INFO] Browser initialized successfully!`));
+    // const browser = await Browser.getBrowserInstance(this.config.browser);
+    const browsers = await Promise.all(
+      Array.from({length: this.config.browser.instances}).map(
+        () => Browser.getBrowserInstance(this.config.browser),
+      ),
+    );
+    console.log(chalk.greenBright(`[INFO] Browser(s) initialized successfully!`));
     console.log(chalk.greenBright(`[INFO] Sasori will now start crawling from ${this.crawlerConfig.entryPoint}`));
-    browser.on('targetcreated', async (target)=>{
-      const targetBrowser = target.browser();
-      const allTargetBrowserPages = await targetBrowser.pages();
-      const targetPage = await target.page();
-      if (targetPage && target.type() === 'page' && allTargetBrowserPages.length > 1) {
-        await targetPage.close();
-      }
-    });
+    for (const browser of browsers) {
+      browser.on('targetcreated', async (target)=>{
+        const targetBrowser = target.browser();
+        const allTargetBrowserPages = await targetBrowser.pages();
+        const targetPage = await target.page();
+        if (targetPage && target.type() === 'page' && allTargetBrowserPages.length > 1) {
+          await targetPage.close();
+        }
+      });
+    }
 
     const startTime = Date.now();
-    const endTime = startTime + this.crawlerConfig.maxDuration;
+    this.endTime = startTime + this.crawlerConfig.maxDuration;
     if (this.crawlerConfig.maxDuration === 0) {
       console.log(chalk.greenBright(`[INFO] Max duration is set to 0, sasori will run indefinitely.`));
     } else {
       console.log(chalk.greenBright(`[INFO] Sasori will stop crawling at ${new Date(endTime).toTimeString()}`));
     }
-    const allPages = await browser.pages();
-    const page = allPages[0];
-    page.setDefaultTimeout(this.crawlerConfig.eventTimeout);
-    page.setDefaultNavigationTimeout(this.crawlerConfig.navigationTimeout);
-    await this.maximizeViewport(page);
+
+    const allPages = [];
+    for (const browser of browsers) {
+      const pages = await browser.pages();
+      const page = pages[0];
+      page.setDefaultTimeout(this.crawlerConfig.eventTimeout);
+      page.setDefaultNavigationTimeout(this.crawlerConfig.navigationTimeout);
+      await this.maximizeViewport(page);
+      allPages.push(page);
+    }
 
     // Authenticate if basic auth is enabled
     if (this.crawlerConfig.authentication.basicAuth && this.crawlerConfig.authentication.basicAuth.enabled) {
-      await page.authenticate({username: this.crawlerConfig.authentication.basicAuth.username, password: this.crawlerConfig.authentication.basicAuth.password});
+      for (const page of allPages) {
+        await page.authenticate({username: this.crawlerConfig.authentication.basicAuth.username, password: this.crawlerConfig.authentication.basicAuth.password});
+      }
     }
 
-    // Statically response to out-of-scope requests.
+    // Statically respond to out-of-scope requests.
     console.log(chalk.greenBright(`[INFO] Setting up scope manager...`));
-    await page.setRequestInterception(true);
-    page.on('request', async (interceptedRequest) => {
-      if (interceptedRequest.isInterceptResolutionHandled()) return;
-
-      if (this.inContext(interceptedRequest.url())) {
-        if (!this.allUrls.has(interceptedRequest.url())) {
-          console.log(chalk.magentaBright(`[URL] `) + chalk.green(interceptedRequest.url()));
-          this.allUrls.add(interceptedRequest.url());
-          if (this.crawlerConfig.outputFile) {
-            this.appendUrlToOutputFile(interceptedRequest.url());
-          }
-        }
-      }
-
-      if (this.authInProgress) {
-        const parsedUrl = new URL(interceptedRequest.url());
-        const authority = parsedUrl.host;
-        const includeRegex = `https?://${authority}(?:/.*|)`;
-        if (!this.crawlerConfig.includeRegexes.includes(includeRegex)) {
-          this.crawlerConfig.includeRegexes.push(includeRegex);
-        }
-      }
-
-      if ((this.authInProgress == false && !this.inContext(interceptedRequest.url()))) {
-        interceptedRequest.respond({
-          status: 403,
-          contentType: 'text/plain',
-          body: 'Out of Sasori\'s scope',
-        });
-      } else interceptedRequest.continue();
-    });
+    for (const page of allPages) {
+      await page.setRequestInterception(true);
+      page.on('request', async (interceptedRequest) => {
+        if (interceptedRequest.isInterceptResolutionHandled()) return;
+        
+        if (this.inContext(interceptedRequest.url())) {
+          if (!this.allUrls.has(interceptedRequest.url())) {
+            console.log(chalk.magentaBright(`[URL] `) + chalk.green(interceptedRequest.url()));
+            this.allUrls.add(interceptedRequest.url());
+            if (this.crawlerConfig.outputFile) {
+              this.appendUrlToOutputFile(interceptedRequest.url());
+            }
+          }
+        }
+        
+        if (this.authInProgress) {
+          const parsedUrl = new URL(interceptedRequest.url());
+          const authority = parsedUrl.host;
+          const includeRegex = `https?://${authority}(?:/.*|)`;
+          if (!this.crawlerConfig.includeRegexes.includes(includeRegex)) {
+            this.crawlerConfig.includeRegexes.push(includeRegex);
+          }
+        }
+        
+        if ((this.authInProgress == false && !this.inContext(interceptedRequest.url()))) {
+          interceptedRequest.respond({
+            status: 403,
+            contentType: 'text/plain',
+            body: 'Out of Sasori\'s scope',
+          });
+        } else interceptedRequest.continue();
+      });
+    };
 
     // Dismiss all alerts/popups
-    page.on('dialog', async (dialog) => {
-      await dialog.dismiss();
-    });
+    for (const page of allPages) {
+      page.on('dialog', async (dialog) => {
+        await dialog.dismiss();
+      });
+    };
 
     console.log(chalk.greenBright(`[INFO] Scope manager started successfully!`));
 
     // Start authentication if enabled.
     if (this.crawlerConfig.authentication.recorderAuth && this.crawlerConfig.authentication.recorderAuth.enabled) {
       console.log(chalk.greenBright(`[INFO] Running initial authentication...`));
-      await this.startAuthentication(browser, page);
+      await Promise.all(
+      Array.from({length: browsers.length}).map(
+        (v, i) => this.startAuthentication(browsers[i], allPages[i]),
+      ),
+      );
     }
 
     console.log(chalk.greenBright(`[INFO] Creating crawl state manager...`));
     const crawlManager = new CrawlStateManager();
-    await page.goto(this.crawlerConfig.entryPoint, {waitUntil: ['domcontentloaded', 'networkidle0']});
-    const rootStateHash = await this.getPageHash(page);
-    const rootState = await this.getNewCrawlState(page, 0, rootStateHash, crawlManager);
+    await allPages[0].goto(this.crawlerConfig.entryPoint, {waitUntil: ['domcontentloaded', 'networkidle0']});
+    const rootStateHash = await this.getPageHash(allPages[0]);
+    const rootState = await this.getNewCrawlState(allPages[0], 0, rootStateHash, crawlManager);
     crawlManager.rootState = rootState;
-    let currentState = rootState;
-    let nextCrawlAction = crawlManager.getNextCrawlAction();
-
-    while ((nextCrawlAction = crawlManager.getNextCrawlAction()) && (this.crawlerConfig.maxDuration === 0 || Date.now() < endTime)) {
-      const currentAction = nextCrawlAction;
-      // console.log('\nCurrent Action:');
-      // console.log(currentAction.cssPath);
-      await this.performAction(crawlManager, currentAction, page);
-      // console.log('Action performed');
-      const currentStateHash = await this.getPageHash(page);
-      // console.log('Current state hash:');
-      // console.log(currentStateHash);
-      const existingState = crawlManager.getStateByHash(currentStateHash);
-      if (existingState) {
-        // console.log('State already exists');
-        currentState = existingState;
-        this.removeCrawlActionFromState(currentAction);
-      } else {
-        if (this.inContext(page.url())) {
-          // console.log('State does not exist, creating...');
-          currentState = await this.getNewCrawlState(page, currentAction.getParentState().crawlDepth + 1, currentStateHash, crawlManager);
-          currentAction.childState = currentState;
-        } else {
-          this.removeCrawlActionFromState(currentAction);
-        }
-      }
-    }
+    // let nextCrawlAction = crawlManager.getNextCrawlAction();
+
+    await Promise.all(
+      Array.from({length: browsers.length}).map(
+        (v, i) => this.handleCrawl(crawlManager, browsers[i], allPages[i], rootState),
+      ),
+    );
 
     console.log(chalk.greenBright.bold('Scan completed'));
-    await browser.close();
   }
 
   /**
diff --git a/src/models/configModel.js b/src/models/configModel.js
index 4dca2f4..6c12943 100644
--- a/src/models/configModel.js
+++ b/src/models/configModel.js
@@ -61,6 +61,7 @@ const browserModel = Joi.object({
   headless: Joi.boolean(),
   maximize: Joi.boolean(),
   proxy: proxyModel,
+  instances: Joi.number().integer().required()
 });
 
 /**

From d7932322331f1638e5c1e931792ea40e4f50cdae Mon Sep 17 00:00:00 2001
From: Karthik UJ
Date: Sun, 16 Jun 2024 22:37:52 +0530
Subject: [PATCH 2/5] Minor changes

Signed-off-by: Karthik UJ
---
 config/config.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/config.json b/config/config.json
index 6ea57e0..8f39569 100644
--- a/config/config.json
+++ b/config/config.json
@@ -30,7 +30,7 @@
     },
     "recorderAuth": {
       "enabled": false,
-      "pptrRecording": "C:\\Users\\5up3r541y4n\\Dev\\login-recordings\\hypejab.pptr.json"
+      "pptrRecording": "/path/to/login/recording"
     }
   },
   "includeRegexes": [
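A condensed, self-contained sketch may help show the scheduling idea the series introduces up to this point. Several browser workers drain one shared action frontier, and a set of claimed `actionId`s plays the role that `visitedActions` plays inside `CrawlStateManager.getNextCrawlAction`: each action is handed out at most once, so two browsers never replay the same click. The `SharedFrontier` and `worker` names below are illustrative, not code from the repository.

```js
// Illustrative sketch of the shared-frontier idea behind `visitedActions`.
class SharedFrontier {
  constructor(actions) {
    this.pending = [...actions]; // crawl actions discovered so far
    this.claimed = new Set(); // actionIds already handed to a worker
  }

  // Called from every worker loop; an action is returned at most once.
  next() {
    while (this.pending.length > 0) {
      const action = this.pending.shift();
      if (!this.claimed.has(action.actionId)) {
        this.claimed.add(action.actionId);
        return action;
      }
    }
    return null;
  }
}

// Two workers draining the same frontier concurrently.
(async () => {
  const frontier = new SharedFrontier([
    {actionId: 'a1', cssPath: 'a#home'},
    {actionId: 'a2', cssPath: 'button#submit'},
  ]);
  const worker = async (id) => {
    let action;
    while ((action = frontier.next()) !== null) {
      console.log(`worker ${id} performs ${action.cssPath}`);
    }
  };
  await Promise.all([worker(1), worker(2)]);
})();
```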
From 90a779d9d7e275f05e29bd27b0c21509c677bf8a Mon Sep 17 00:00:00 2001
From: Karthik UJ
Date: Thu, 20 Jun 2024 17:10:03 +0530
Subject: [PATCH 3/5] Lints the code

Signed-off-by: Karthik UJ
---
 config/config.json        | 84 ++++++++++++++++----------------
 package.json              | 99 ++++++++++++++++++++------------------
 src/crawler/crawler.js    | 27 +++++++----
 src/models/configModel.js |  2 +-
 4 files changed, 110 insertions(+), 102 deletions(-)

diff --git a/config/config.json b/config/config.json
index 8f39569..d07d309 100644
--- a/config/config.json
+++ b/config/config.json
@@ -1,43 +1,43 @@
-{
-  "browser": {
-    "headless": false,
-    "maximize": true,
-    "proxy": {
-      "enabled": false,
-      "host": "127.0.0.1",
-      "port": 8080
-    },
-    "instances": 1
-  },
-  "crawler": {
-    "entryPoint": "https://security-crawl-maze.app/",
-    "eventTimeout": 10000,
-    "navigationTimeout": 30000,
-    "eventWait": 0,
-    "maxDuration": 0,
-    "elements": [
-      "a",
-      "button",
-      "input[type=\"submit\"]"
-    ],
-    "maxChildren": 0,
-    "maxDepth": 10,
-    "authentication": {
-      "basicAuth": {
-        "enabled": false,
-        "username": "username",
-        "password": "password"
-      },
-      "recorderAuth": {
-        "enabled": false,
-        "pptrRecording": "/path/to/login/recording"
-      }
-    },
-    "includeRegexes": [
-      "https?://security-crawl-maze.app(?:/.*|)"
-    ],
-    "excludeRegexes": [
-      ".*logout.*"
-    ]
-  }
+{
+  "browser": {
+    "headless": false,
+    "maximize": true,
+    "proxy": {
+      "enabled": false,
+      "host": "127.0.0.1",
+      "port": 8080
+    },
+    "instances": 1
+  },
+  "crawler": {
+    "entryPoint": "https://security-crawl-maze.app/",
+    "eventTimeout": 10000,
+    "navigationTimeout": 30000,
+    "eventWait": 0,
+    "maxDuration": 0,
+    "elements": [
+      "a",
+      "button",
+      "input[type=\"submit\"]"
+    ],
+    "maxChildren": 0,
+    "maxDepth": 10,
+    "authentication": {
+      "basicAuth": {
+        "enabled": false,
+        "username": "username",
+        "password": "password"
+      },
+      "recorderAuth": {
+        "enabled": false,
+        "pptrRecording": "/path/to/login/recording"
+      }
+    },
+    "includeRegexes": [
+      "https?://security-crawl-maze.app(?:/.*|)"
+    ],
+    "excludeRegexes": [
+      ".*logout.*"
+    ]
+  }
 }
\ No newline at end of file
diff --git a/package.json b/package.json
index 85cf0ac..7c8be6b 100644
--- a/package.json
+++ b/package.json
@@ -1,49 +1,50 @@
-{
-  "name": "sasori-crawl",
-  "version": "0.2.0",
-  "description": "Sasori is a dynamic web crawler powered by Puppeteer, designed for lightning-fast endpoint discovery.",
-  "main": "bin/index.js",
-  "author": "Karthik UJ",
-  "contributors": [
-    {
-      "name": "Karthik UJ",
-      "email": "karthikuj2001@gmail.com",
-      "url": "https://www.5up3r541y4n.tech/"
-    }
-  ],
-  "license": "MIT",
-  "repository": {
-    "type": "git",
-    "url": "git+https://github.com/karthikuj/sasori.git"
-  },
-  "bin": {
-    "sasori": "bin/index.js"
-  },
-  "scripts": {
-    "start": "node ."
-  },
-  "keywords": [
-    "crawler",
-    "crawling",
-    "scraping",
-    "endpoint-discovery",
-    "puppeteer",
-    "dynamic",
-    "automation",
-    "security",
-    "dast",
-    "infosec"
-  ],
-  "dependencies": {
-    "@puppeteer/replay": "^2.13.4",
-    "chalk": "^4",
-    "cheerio": "^1.0.0-rc.12",
-    "joi": "^17.12.2",
-    "puppeteer": "^21.5.1",
-    "yargs": "^17.7.2"
-  },
-  "devDependencies": {
-    "eslint": "^8.57.0",
-    "eslint-config-google": "^0.14.0"
-  }
-}
+{
+  "name": "sasori-crawl",
+  "version": "0.2.0",
+  "description": "Sasori is a dynamic web crawler powered by Puppeteer, designed for lightning-fast endpoint discovery.",
+  "main": "bin/index.js",
+  "author": "Karthik UJ",
+  "contributors": [
+    {
+      "name": "Karthik UJ",
+      "email": "karthikuj2001@gmail.com",
+      "url": "https://www.5up3r541y4n.tech/"
+    }
+  ],
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/karthikuj/sasori.git"
+  },
+  "bin": {
+    "sasori": "bin/index.js"
+  },
+  "scripts": {
+    "start": "node .",
+    "lint": "npx eslint . --fix"
+  },
+  "keywords": [
+    "crawler",
+    "crawling",
+    "scraping",
+    "endpoint-discovery",
+    "puppeteer",
+    "dynamic",
+    "automation",
+    "security",
+    "dast",
+    "infosec"
+  ],
+  "dependencies": {
+    "@puppeteer/replay": "^2.13.4",
+    "chalk": "^4",
+    "cheerio": "^1.0.0-rc.12",
+    "joi": "^17.12.2",
+    "puppeteer": "^21.5.1",
+    "yargs": "^17.7.2"
+  },
+  "devDependencies": {
+    "eslint": "^8.57.0",
+    "eslint-config-google": "^0.14.0"
+  }
+}
diff --git a/src/crawler/crawler.js b/src/crawler/crawler.js
index 7342607..7cbc27b 100644
--- a/src/crawler/crawler.js
+++ b/src/crawler/crawler.js
@@ -346,7 +346,14 @@ class Crawler {
     appendFileSync(fullPath, url + '\n');
   }
 
-  async handleCrawl (crawlManager, browser, page, currentState) {
+  /**
+   * This function handles all the asynchronous crawling.
+   * @param {CrawlStateManager} crawlManager
+   * @param {Browser} browser
+   * @param {Page} page
+   * @param {CrawlState} currentState
+   */
+  async handleCrawl(crawlManager, browser, page, currentState) {
     let nextCrawlAction;
     while ((nextCrawlAction = crawlManager.getNextCrawlAction()) && (this.crawlerConfig.maxDuration === 0 || Date.now() < this.endTime)) {
       const currentAction = nextCrawlAction;
@@ -432,7 +439,7 @@ class Crawler {
       await page.setRequestInterception(true);
       page.on('request', async (interceptedRequest) => {
         if (interceptedRequest.isInterceptResolutionHandled()) return;
-        
+
         if (this.inContext(interceptedRequest.url())) {
           if (!this.allUrls.has(interceptedRequest.url())) {
             console.log(chalk.magentaBright(`[URL] `) + chalk.green(interceptedRequest.url()));
@@ -442,7 +449,7 @@ class Crawler {
             }
           }
         }
-        
+
         if (this.authInProgress) {
           const parsedUrl = new URL(interceptedRequest.url());
           const authority = parsedUrl.host;
@@ -451,7 +458,7 @@ class Crawler {
             this.crawlerConfig.includeRegexes.push(includeRegex);
           }
         }
-        
+
         if ((this.authInProgress == false && !this.inContext(interceptedRequest.url()))) {
           interceptedRequest.respond({
             status: 403,
@@ -475,9 +482,9 @@ class Crawler {
     if (this.crawlerConfig.authentication.recorderAuth && this.crawlerConfig.authentication.recorderAuth.enabled) {
       console.log(chalk.greenBright(`[INFO] Running initial authentication...`));
       await Promise.all(
-      Array.from({length: browsers.length}).map(
-        (v, i) => this.startAuthentication(browsers[i], allPages[i]),
-      ),
+          Array.from({length: browsers.length}).map(
+              (v, i) => this.startAuthentication(browsers[i], allPages[i]),
+          ),
       );
     }
 
@@ -492,9 +499,9 @@ class Crawler {
     // let nextCrawlAction = crawlManager.getNextCrawlAction();
 
     await Promise.all(
-      Array.from({length: browsers.length}).map(
-        (v, i) => this.handleCrawl(crawlManager, browsers[i], allPages[i], rootState),
-      ),
+        Array.from({length: browsers.length}).map(
+            (v, i) => this.handleCrawl(crawlManager, browsers[i], allPages[i], rootState),
+        ),
     );
 
     console.log(chalk.greenBright.bold('Scan completed'));
diff --git a/src/models/configModel.js b/src/models/configModel.js
index 6c12943..6056661 100644
--- a/src/models/configModel.js
+++ b/src/models/configModel.js
@@ -61,7 +61,7 @@ const browserModel = Joi.object({
   headless: Joi.boolean(),
   maximize: Joi.boolean(),
   proxy: proxyModel,
-  instances: Joi.number().integer().required()
+  instances: Joi.number().integer().required(),
 });
 
 /**

From e0dcf3e9fb33f06e836a78e852182a06a321afa0 Mon Sep 17 00:00:00 2001
From: Karthik UJ
Date: Sat, 13 Jul 2024 14:01:43 +0530
Subject: [PATCH 4/5] Fixes instances limited to actions bug

Signed-off-by: Karthik UJ
---
 config/config.json         |  2 +-
 src/crawler/crawlAction.js |  2 +-
 src/crawler/crawler.js     | 36 +++++++++++++++++++++++-------------
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/config/config.json b/config/config.json
index d07d309..29a33ce 100644
--- a/config/config.json
+++ b/config/config.json
@@ -7,7 +7,7 @@
       "host": "127.0.0.1",
       "port": 8080
     },
-    "instances": 1
+    "instances": 4
   },
   "crawler": {
     "entryPoint": "https://security-crawl-maze.app/",
diff --git a/src/crawler/crawlAction.js b/src/crawler/crawlAction.js
index 9dabfdc..e466767 100644
--- a/src/crawler/crawlAction.js
+++ b/src/crawler/crawlAction.js
@@ -6,7 +6,7 @@ const {randomUUID} = require('crypto');
  */
 class CrawlAction {
   static {
-    this.ANCHOR = 'a';
+    this.ANCHOR = 'A';
   }
 
   /**
diff --git a/src/crawler/crawler.js b/src/crawler/crawler.js
index 7cbc27b..b7abfb7 100644
--- a/src/crawler/crawler.js
+++ b/src/crawler/crawler.js
@@ -32,6 +32,7 @@ class Crawler {
     this.allUrls = new Set();
     this.allInteractables = [...this.crawlerConfig.elements].concat(CrawlInput.INPUT_FIELDS.map((element) => element.CSS_PATH));
     this.endTime = null;
+    this.browserStartDelays = 1000; // delay between browser launches, in milliseconds (1000 ms = 1 s)
   }
 
   /**
@@ -119,11 +120,19 @@ class Crawler {
     for (const element of this.crawlerConfig.elements) {
       const cssPaths = await domPath.getCssPaths(element);
       for (const cssPath of cssPaths) {
-        const node = await page.$eval(cssPath, (el) => el.outerHTML);
-        const actionHash = createHash('sha256').update(node).digest('hex');
-        if (crawlManager.isCrawlActionUnique(cssPath, actionHash)) {
-          crawlActions.push(new CrawlAction(element, 'click', cssPath, actionHash, currentState));
+        const node = await page.$eval(cssPath, (el) => {
+          return {
+            outerHTML: el.outerHTML,
+            tagName: el.tagName,
+          };
+        });
+        const actionHash = createHash('sha256').update(node.outerHTML).digest('hex');
+        if (node.tagName === CrawlAction.ANCHOR) {
+          if (crawlManager.isCrawlActionUnique(cssPath, actionHash)) {
+            crawlActions.push(new CrawlAction(node.tagName, 'click', cssPath, actionHash, currentState));
+          }
         } else {
+          crawlActions.push(new CrawlAction(node.tagName, 'click', cssPath, actionHash, currentState));
         }
       }
     }
@@ -161,6 +170,7 @@ class Crawler {
     // console.log('Shortest path:');
     // console.log(shortestPath.map((action) => action.cssPath));
     await page.goto(this.crawlerConfig.entryPoint, {waitUntil: 'domcontentloaded'});
+    await page.waitForFunction(()=>document.readyState === 'complete', {timeout: this.crawlerConfig.navigationTimeout});
     for (const crawlAction of shortestPath) {
       if (crawlAction.element != CrawlAction.ANCHOR) {
         // console.log('All crawlinputs:');
@@ -413,7 +423,7 @@ class Crawler {
     if (this.crawlerConfig.maxDuration === 0) {
       console.log(chalk.greenBright(`[INFO] Max duration is set to 0, sasori will run indefinitely.`));
     } else {
-      console.log(chalk.greenBright(`[INFO] Sasori will stop crawling at ${new Date(endTime).toTimeString()}`));
+      console.log(chalk.greenBright(`[INFO] Sasori will stop crawling at ${new Date(this.endTime).toTimeString()}`));
     }
 
     const allPages = [];
@@ -501,18 +511,18 @@ class Crawler {
 
     console.log(chalk.greenBright(`[INFO] Creating crawl state manager...`));
     const crawlManager = new CrawlStateManager();
-    await allPages[0].goto(this.crawlerConfig.entryPoint, {waitUntil: ['domcontentloaded', 'networkidle0']});
+    await allPages[0].goto(this.crawlerConfig.entryPoint, {waitUntil: ['domcontentloaded']});
+    await allPages[0].waitForFunction(()=>document.readyState === 'complete', {timeout: this.crawlerConfig.navigationTimeout});
     const rootStateHash = await this.getPageHash(allPages[0]);
     const rootState = await this.getNewCrawlState(allPages[0], 0, rootStateHash, crawlManager);
     crawlManager.rootState = rootState;
-    // let nextCrawlAction = crawlManager.getNextCrawlAction();
-
-    await Promise.all(
-        Array.from({length: browsers.length}).map(
-            (v, i) => this.handleCrawl(crawlManager, browsers[i], allPages[i], rootState),
-        ),
-    );
+    const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
+    const handleCrawls = browsers.map((browser, index) => async () => {
+      await delay(index * this.browserStartDelays);
+      return this.handleCrawl(crawlManager, browser, allPages[index], rootState);
+    });
+    await Promise.all(handleCrawls.map((handleCrawl) => handleCrawl()));
 
     console.log(chalk.greenBright.bold('Scan completed'));
   }
 
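PATCH 4/5 above staggers each worker's entry into `handleCrawl` by `index * this.browserStartDelays`, giving the first browser time to seed the shared state graph before the others begin claiming actions. A minimal sketch of that start-up pattern follows, with stand-in workers in place of real Puppeteer browsers; `runStaggered` and `crawlWithWorker` are assumed names, not repository code.

```js
// Staggered start-up: worker i waits i * startDelayMs before crawling.
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function runStaggered(workers, startDelayMs, crawlWithWorker) {
  const tasks = workers.map(async (worker, index) => {
    await delay(index * startDelayMs); // 0 ms, 1000 ms, 2000 ms, ...
    return crawlWithWorker(worker);
  });
  return Promise.all(tasks); // still waits for every worker to finish
}

// Usage with dummy workers:
(async () => {
  await runStaggered(['chrome-0', 'chrome-1', 'chrome-2'], 1000, async (w) => {
    console.log(`${w} entered the crawl loop at ${Date.now()}`);
  });
})();
```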
From 300dafa80162bc86d0fce934cdcb576469acbb7c Mon Sep 17 00:00:00 2001
From: Karthik UJ
Date: Sat, 13 Jul 2024 14:14:15 +0530
Subject: [PATCH 5/5] Updates CHANGELOG

Signed-off-by: Karthik UJ
---
 CHANGELOG.md | 23 ++++++++++++++++-------
 package.json |  2 +-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 39e2483..ef8227a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,17 @@
-# Changelog
-
-All notable changes to this project will be documented in this file.
-
-## [0.1.0] - 2024-03-30
-
-### Added
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+## Unreleased
+
+### Added
+- Multi-browser support via the `browser.instances` config option.
+- Wait until `document.readyState` equals `complete` after navigation.
+
+### Fixed
+- `CrawlAction` element name bug (the anchor tag name is now the uppercase `A`, matching `Element.tagName`).
+- Check if a `CrawlAction` is unique only if it is a link; other elements are always queued.
+
+## [0.1.0] - 2024-03-30
+### Added
 - Initial release of the project.
\ No newline at end of file
diff --git a/package.json b/package.json
index 7c8be6b..7d767b8 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "sasori-crawl",
-  "version": "0.2.0",
+  "version": "1.0.0",
   "description": "Sasori is a dynamic web crawler powered by Puppeteer, designed for lightning-fast endpoint discovery.",
   "main": "bin/index.js",
   "author": "Karthik UJ",
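As a closing reference, here is a short sketch of how the `instances` option introduced by this series behaves under the Joi rule from `src/models/configModel.js`. The schema is trimmed to the scalar fields visible in the diffs (the real `browserModel` also nests `proxy`), so treat it as illustrative:

```js
const Joi = require('joi');

// Trimmed version of browserModel with the rule added in PATCH 1/5 and 3/5.
const browserModel = Joi.object({
  headless: Joi.boolean(),
  maximize: Joi.boolean(),
  instances: Joi.number().integer().required(),
});

// Passes: `instances` is an integer.
console.log(browserModel.validate({headless: false, instances: 4}).error); // undefined

// Fails: `instances` is required, so configs written before this series
// must add it before they validate again.
console.log(browserModel.validate({headless: true}).error.message); // "instances" is required
```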