From 990df3195d52010170f8df727ed1ccb48aa96f98 Mon Sep 17 00:00:00 2001 From: Luke Chavers Date: Mon, 24 Feb 2025 14:45:46 -0500 Subject: [PATCH] Update sitemap-xml scan to use axios for lastmod check --- libs/core-scanner/src/core-scanner.service.ts | 1 + .../src/pages/sitemap-xml.spec.ts | 24 ++++++++++--- libs/core-scanner/src/pages/sitemap-xml.ts | 35 ++++++++++++++++--- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/libs/core-scanner/src/core-scanner.service.ts b/libs/core-scanner/src/core-scanner.service.ts index 7ddc52aa..59dccd88 100644 --- a/libs/core-scanner/src/core-scanner.service.ts +++ b/libs/core-scanner/src/core-scanner.service.ts @@ -196,6 +196,7 @@ export class CoreScannerService pages.createSitemapXmlScanner( pageLogger, input, + this.httpService, ), ); diff --git a/libs/core-scanner/src/pages/sitemap-xml.spec.ts b/libs/core-scanner/src/pages/sitemap-xml.spec.ts index 06f82d44..85410a6b 100644 --- a/libs/core-scanner/src/pages/sitemap-xml.spec.ts +++ b/libs/core-scanner/src/pages/sitemap-xml.spec.ts @@ -1,10 +1,13 @@ import { mock, MockProxy } from 'jest-mock-extended'; import { Logger } from 'pino'; import { Page, HTTPRequest, HTTPResponse } from 'puppeteer'; +import { HttpService } from '@nestjs/axios'; +import { AxiosResponse } from 'axios'; import { CoreInputDto } from '../core.input.dto'; -import { createSitemapXmlScanner } from './sitemap-xml'; +import { createSitemapXmlScanner, getSitemapUsingAxios } from './sitemap-xml'; import { source } from './test-page-source'; +import { of } from 'rxjs'; describe('sitemap-xml scanner', () => { let mockPage: MockProxy; @@ -44,14 +47,27 @@ describe('sitemap-xml scanner', () => { mockResponse.url.mockReturnValue('https://18f.gsa.gov/sitemap.xml'); mockPage.goto.mockResolvedValue(mockResponse); redirectRequest.redirectChain.mockReturnValue([]); + const mockHttpService = mock(); + const axiosResponse: AxiosResponse = { + data: {}, + status: 404, + statusText: 'Not Found', + headers: {}, + config: { + headers: null, + }, + }; + jest + .spyOn(mockHttpService, 'get') + .mockImplementationOnce(() => of(axiosResponse)); - const scanner = createSitemapXmlScanner(mockLogger, input); + const scanner = createSitemapXmlScanner(mockLogger, input, mockHttpService); const result = await scanner(mockPage); expect(result).toEqual({ sitemapXmlScan: { sitemapXmlCount: undefined, - sitemapXmlFinalUrlFilesize: 39170, + sitemapXmlFinalUrlFilesize: 15, sitemapXmlPdfCount: 0, sitemapXmlFinalUrl: 'https://18f.gsa.gov/sitemap.xml', sitemapXmlFinalUrlLive: true, @@ -64,4 +80,4 @@ describe('sitemap-xml scanner', () => { }, }); }); -}); +}); \ No newline at end of file diff --git a/libs/core-scanner/src/pages/sitemap-xml.ts b/libs/core-scanner/src/pages/sitemap-xml.ts index c987fb51..6b727aa4 100644 --- a/libs/core-scanner/src/pages/sitemap-xml.ts +++ b/libs/core-scanner/src/pages/sitemap-xml.ts @@ -1,5 +1,7 @@ import { Logger } from 'pino'; import { Page, HTTPResponse } from 'puppeteer'; +import { HttpService } from '@nestjs/axios'; +import { lastValueFrom } from 'rxjs'; import { CoreInputDto } from '@app/core-scanner/core.input.dto'; import { SitemapXmlScan } from 'entities/scan-data.entity'; @@ -10,6 +12,7 @@ import { getHttpsUrl, getMIMEType, isLive, createRequestHandlers, getPageMd5Hash export const createSitemapXmlScanner = ( logger: Logger, input: CoreInputDto, + httpService: HttpService, ) => { const url = getHttpsUrl(input.url); return async (sitemapPage: Page): Promise => { @@ -17,19 +20,22 @@ export const createSitemapXmlScanner = ( // go to the sitemap page from the target url const sitemapUrl = new URL(url); sitemapUrl.pathname = 'sitemap.xml'; - logger.info('Going to sitemap.xml...'); + logger.info(`Going to sitemap.xml: ${sitemapUrl.toString()}`); + const sitemapResponse = await sitemapPage.goto(sitemapUrl.toString(), { waitUntil: 'networkidle2', }); logger.info('Got sitemap.xml!'); // extract the html page source - const sitemapText = await sitemapResponse.text(); - logger.info('Got sitemap.xml text!'); + //const sitemapText = await sitemapResponse.text(); + logger.info(`Got sitemap.xml text from: ${sitemapResponse.url()}`); + + const sitemapContents = await getSitemapUsingAxios(sitemapResponse.url(), httpService, logger); return { sitemapXmlScan: await buildSitemapResult( sitemapResponse, - sitemapText, + sitemapContents.data.toString(), sitemapPage, logger, ), @@ -72,6 +78,25 @@ const buildSitemapResult = async ( }; }; +/** + * Fetches the sitemap XML using Axios. + * + * @param url The URL to fetch the sitemap from + * @param httpService The HTTP service to use for making requests + * @param logger A logger instance for logging + * @returns The sitemap XML response as a string or null if an error occurs + */ +export async function getSitemapUsingAxios(url: string, httpService: HttpService, logger: Logger) { + try { + const response = await httpService.get(url); + logger.info(`Got sitemap.xml response using Axios`); + return await lastValueFrom(response) + } catch (error) { + logger.error({error}, 'Error fetching sitemap.xml using Axios'); + return null; + } +} + const getUrlCount = async (page: Page) => { const urlCount = await page.evaluate(() => { const urls = [...document.getElementsByTagName('url')]; @@ -120,9 +145,11 @@ async function getLastModDate(sitemapText: string, sitemapPage: Page, logger: Lo } let dates = getModDatesByLastmodTag(sitemapText); if (!dates) { + logger.info('No tags found, checking tags...'); dates = await getModDatesByTDTag(sitemapPage, logger); } if (!dates || dates.length === 0) { + logger.info('No valid dates found in or tags.'); return null; } const parsedDates = convertStringsToDates(dates, logger);