Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1365 - Iterate <lastmod> to capture ~html-esque sitemaps #439

Merged
merged 1 commit into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libs/core-scanner/src/core-scanner.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ export class CoreScannerService
pages.createSitemapXmlScanner(
pageLogger,
input,
this.httpService,
),
);

Expand Down
24 changes: 20 additions & 4 deletions libs/core-scanner/src/pages/sitemap-xml.spec.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import { mock, MockProxy } from 'jest-mock-extended';
import { Logger } from 'pino';
import { Page, HTTPRequest, HTTPResponse } from 'puppeteer';
import { HttpService } from '@nestjs/axios';
import { AxiosResponse } from 'axios';

import { CoreInputDto } from '../core.input.dto';
import { createSitemapXmlScanner } from './sitemap-xml';
import { createSitemapXmlScanner, getSitemapUsingAxios } from './sitemap-xml';
import { source } from './test-page-source';
import { of } from 'rxjs';

describe('sitemap-xml scanner', () => {
let mockPage: MockProxy<Page>;
Expand Down Expand Up @@ -44,14 +47,27 @@ describe('sitemap-xml scanner', () => {
mockResponse.url.mockReturnValue('https://18f.gsa.gov/sitemap.xml');
mockPage.goto.mockResolvedValue(mockResponse);
redirectRequest.redirectChain.mockReturnValue([]);
const mockHttpService = mock<HttpService>();
const axiosResponse: AxiosResponse<any> = {
data: {},
status: 404,
statusText: 'Not Found',
headers: {},
config: {
headers: null,
},
};
jest
.spyOn(mockHttpService, 'get')
.mockImplementationOnce(() => of(axiosResponse));

const scanner = createSitemapXmlScanner(mockLogger, input);
const scanner = createSitemapXmlScanner(mockLogger, input, mockHttpService);
const result = await scanner(mockPage);

expect(result).toEqual({
sitemapXmlScan: {
sitemapXmlCount: undefined,
sitemapXmlFinalUrlFilesize: 39170,
sitemapXmlFinalUrlFilesize: 15,
sitemapXmlPdfCount: 0,
sitemapXmlFinalUrl: 'https://18f.gsa.gov/sitemap.xml',
sitemapXmlFinalUrlLive: true,
Expand All @@ -64,4 +80,4 @@ describe('sitemap-xml scanner', () => {
},
});
});
});
});
35 changes: 31 additions & 4 deletions libs/core-scanner/src/pages/sitemap-xml.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { Logger } from 'pino';
import { Page, HTTPResponse } from 'puppeteer';
import { HttpService } from '@nestjs/axios';
import { lastValueFrom } from 'rxjs';

import { CoreInputDto } from '@app/core-scanner/core.input.dto';
import { SitemapXmlScan } from 'entities/scan-data.entity';
Expand All @@ -10,26 +12,30 @@ import { getHttpsUrl, getMIMEType, isLive, createRequestHandlers, getPageMd5Hash
export const createSitemapXmlScanner = (
logger: Logger,
input: CoreInputDto,
httpService: HttpService,
) => {
const url = getHttpsUrl(input.url);
return async (sitemapPage: Page): Promise<SitemapXmlPageScans> => {
createRequestHandlers(sitemapPage, logger);
// go to the sitemap page from the target url
const sitemapUrl = new URL(url);
sitemapUrl.pathname = 'sitemap.xml';
logger.info('Going to sitemap.xml...');
logger.info(`Going to sitemap.xml: ${sitemapUrl.toString()}`);

const sitemapResponse = await sitemapPage.goto(sitemapUrl.toString(), {
waitUntil: 'networkidle2',
});
logger.info('Got sitemap.xml!');
// extract the html page source
const sitemapText = await sitemapResponse.text();
logger.info('Got sitemap.xml text!');
//const sitemapText = await sitemapResponse.text();
logger.info(`Got sitemap.xml text from: ${sitemapResponse.url()}`);

const sitemapContents = await getSitemapUsingAxios(sitemapResponse.url(), httpService, logger);

return {
sitemapXmlScan: await buildSitemapResult(
sitemapResponse,
sitemapText,
sitemapContents.data.toString(),
sitemapPage,
logger,
),
Expand Down Expand Up @@ -72,6 +78,25 @@ const buildSitemapResult = async (
};
};

/**
 * Fetches the sitemap at `url` through the injected HTTP service.
 *
 * `httpService.get` hands back an Observable rather than a promise, so the
 * request is only complete once `lastValueFrom` resolves. Failures are logged
 * and reported as `null` instead of being thrown.
 *
 * @param url The URL to fetch the sitemap from
 * @param httpService The HTTP service to use for making requests
 * @param logger A logger instance for logging
 * @returns The full response emitted by `httpService.get` (the sitemap body is
 *   on its `data` property), or `null` if the request fails
 */
export async function getSitemapUsingAxios(url: string, httpService: HttpService, logger: Logger) {
  try {
    // Convert the Observable to a promise and wait for the actual response;
    // awaiting the Observable itself does nothing.
    const response = await lastValueFrom(httpService.get(url));
    // Log success only after the response has really arrived, so a failed
    // request is not preceded by a misleading "Got ..." message.
    logger.info(`Got sitemap.xml response using Axios`);
    return response;
  } catch (error) {
    logger.error({ error }, 'Error fetching sitemap.xml using Axios');
    return null;
  }
}

const getUrlCount = async (page: Page) => {
const urlCount = await page.evaluate(() => {
const urls = [...document.getElementsByTagName('url')];
Expand Down Expand Up @@ -120,9 +145,11 @@ async function getLastModDate(sitemapText: string, sitemapPage: Page, logger: Lo
}
let dates = getModDatesByLastmodTag(sitemapText);
if (!dates) {
logger.info('No <lastmod> tags found, checking <td> tags...');
dates = await getModDatesByTDTag(sitemapPage, logger);
}
if (!dates || dates.length === 0) {
logger.info('No valid dates found in <lastmod> or <td> tags.');
return null;
}
const parsedDates = convertStringsToDates(dates, logger);
Expand Down
Loading