Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1365 - iterate <lastmod> to capture ~html-esque sitemaps #437

Merged
merged 2 commits into from
Feb 21, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 160 additions & 7 deletions libs/core-scanner/src/pages/sitemap-xml.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ export const createSitemapXmlScanner = (
sitemapResponse,
sitemapText,
sitemapPage,
logger,
),
};
};
Expand All @@ -40,6 +41,7 @@ const buildSitemapResult = async (
sitemapResponse: HTTPResponse,
sitemapText: string,
sitemapPage: Page,
logger: Logger,
): Promise<SitemapXmlScan> => {
const sitemapUrl = new URL(sitemapResponse.url());
const sitemapLive = isLive(sitemapResponse);
Expand All @@ -63,19 +65,13 @@ const buildSitemapResult = async (
sitemapXmlFinalUrlFilesize: Buffer.byteLength(sitemapText, 'utf-8'),
sitemapXmlCount: await getUrlCount(sitemapPage),
sitemapXmlPdfCount: getPdfCount(sitemapText),
sitemapXmlLastMod: getLastModDate(sitemapText),
sitemapXmlLastMod: await getLastModDate(sitemapText, sitemapPage, logger),
sitemapXmlPageHash: await getPageMd5Hash(sitemapPage),
}
: {}),
};
};

/**
 * Returns the text of the last `<lastmod>` element that appears in the
 * sitemap text, or null when the text contains no `<lastmod>` element.
 *
 * "Last" means last in document order; the values are not compared as dates.
 */
function getLastModDate(sitemapText: string) {
  const lastmodPattern = /<lastmod>(.*?)<\/lastmod>/g;
  let lastValue: string | null = null;
  // Walk every occurrence and remember only the most recently seen capture.
  for (const hit of sitemapText.matchAll(lastmodPattern)) {
    lastValue = hit[1];
  }
  return lastValue;
}

const getUrlCount = async (page: Page) => {
const urlCount = await page.evaluate(() => {
const urls = [...document.getElementsByTagName('url')];
Expand All @@ -90,3 +86,160 @@ const getPdfCount = (sitemapText: string) => {
const occurrenceCount = [...sitemapText.matchAll(re)].length;
return occurrenceCount;
};

/**
 * Date formats to try when parsing dates.
 * Each format is represented by a regex and a parser function that turns the
 * regex match into a Date.
 *
 * Consumers walk this list in order and use the first entry whose regex
 * matches, so overlapping patterns must be ordered from most to least specific.
 * Parsers build ISO-8601 strings; date-only ISO strings are read as UTC.
 */
const dateFormats = [
  // MM/DD/YYYY (e.g., 02/20/2025). The first group is limited to 1-12 so that
  // day-first strings such as 20/02/2025 fall through to the DD/MM/YYYY entry
  // instead of matching here and producing an Invalid Date.
  {
    regex: /^(0?[1-9]|1[0-2])\/(\d{1,2})\/(\d{4})$/,
    parser: (m: RegExpMatchArray) =>
      new Date(`${m[3]}-${m[1].padStart(2, '0')}-${m[2].padStart(2, '0')}`),
  },
  // DD/MM/YYYY (e.g., 20/02/2025). Only reached when the first number cannot
  // be a month; ambiguous strings such as 03/04/2025 are read as MM/DD above.
  {
    regex: /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/,
    parser: (m: RegExpMatchArray) =>
      new Date(`${m[3]}-${m[2].padStart(2, '0')}-${m[1].padStart(2, '0')}`),
  },
  // YYYY-MM-DD (e.g., 2025-02-20); single-digit month/day are zero-padded.
  {
    regex: /^(\d{4})-(\d{1,2})-(\d{1,2})$/,
    parser: (m: RegExpMatchArray) =>
      new Date(`${m[1]}-${m[2].padStart(2, '0')}-${m[3].padStart(2, '0')}`),
  },
  // Date-time with a numeric UTC offset (e.g., 2025-02-20T01:00:02-05:00).
  {
    regex: /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})([-+]\d{2}:\d{2})$/,
    parser: (m: RegExpMatchArray) =>
      new Date(
        `${m[1]}-${m[2].padStart(2, '0')}-${m[3].padStart(2, '0')}` +
          `T${m[4].padStart(2, '0')}:${m[5].padStart(2, '0')}:${m[6].padStart(2, '0')}${m[7]}`,
      ),
  },
];

/**
 * Gets the last modification date from the sitemap XML.
 *
 * Dates are taken from `<lastmod>` tags in the sitemap text; if the text has
 * none, date-looking `<td>` cells on the page are used instead. The most
 * recent of the parsed dates is returned.
 *
 * @param sitemapText The text content of the sitemap XML
 * @param sitemapPage The Puppeteer page object for the sitemap
 * @param logger A logger instance for logging
 * @returns The most recent modification date as an ISO-8601 string, or null
 *   if either input is missing or no usable date is found
 */
async function getLastModDate(
  sitemapText: string,
  sitemapPage: Page,
  logger: Logger,
): Promise<string | null> {
  if (!sitemapText || !sitemapPage) {
    return null;
  }

  let dates = getModDatesByLastmodTag(sitemapText);
  if (!dates) {
    dates = await getModDatesByTDTag(sitemapPage, logger);
  }
  if (!dates || dates.length === 0) {
    return null;
  }

  // Drop Invalid Date values up front: toISOString() throws a RangeError on
  // them and their NaN timestamps make ordering comparisons meaningless.
  const parsedDates = convertStringsToDates(dates, logger).filter(
    (date) => !Number.isNaN(date.getTime()),
  );

  const mostRecentDate = getMostRecentDate(parsedDates, logger);
  if (!mostRecentDate) {
    // Nothing valid survived parsing; report "no date" instead of crashing.
    return null;
  }

  const lastMod = mostRecentDate.toISOString();
  logger.info({sitemapXmlLastMod: lastMod}, `Most recent date found: ${lastMod}`);
  return lastMod;
}

/**
 * Turns each date string into a Date object by passing it through parseDate.
 *
 * @param dates The date strings to convert
 * @param logger A logger instance, forwarded to parseDate
 * @returns One Date per input string, in the same order
 */
function convertStringsToDates(dates: string[], logger: Logger): Date[] {
  const converted: Date[] = [];
  for (const rawDate of dates) {
    converted.push(parseDate(rawDate, logger));
  }
  return converted;
}

/**
 * Finds the most recent date from an array of Date objects.
 *
 * The input array is not modified. Invalid Date entries (NaN timestamp) are
 * ignored. When several entries share the latest timestamp, the first one in
 * the array is returned.
 *
 * @param dates An array of Date objects
 * @param logger A logger instance for logging (currently unused; kept so the
 *   signature stays compatible with existing callers)
 * @returns The most recent valid Date object, or null if the array is empty
 *   or contains no valid date
 */
function getMostRecentDate(dates: Date[], logger: Logger): Date | null {
  let latest: Date | null = null;
  for (const candidate of dates) {
    const time = candidate.getTime();
    if (Number.isNaN(time)) {
      continue;
    }
    // Strict ">" keeps the earliest-positioned entry among equal timestamps.
    if (latest === null || time > latest.getTime()) {
      latest = candidate;
    }
  }
  return latest;
}

/**
 * Tells whether a string looks like a date in one of the known formats.
 *
 * Only the first entry of dateFormats whose regex matches is consulted: the
 * result is whether that entry's parser produced a valid Date. Later entries
 * are not tried once one regex has matched.
 *
 * @param dateStr The string to check
 * @param logger A logger instance for logging (not used by this function)
 * @returns True if the first matching format parses to a valid date, false if
 *   it does not or if no format matches
 */
function isDate(dateStr: string, logger: Logger): boolean {
  for (let i = 0; i < dateFormats.length; i++) {
    const format = dateFormats[i];
    const captured = dateStr.match(format.regex);
    if (captured === null) {
      continue;
    }
    const timestamp = format.parser(captured).getTime();
    return !Number.isNaN(timestamp);
  }
  return false;
}

/**
 * Collects the contents of every `<lastmod>` element in the sitemap text.
 *
 * Whitespace directly inside the tags is excluded from each captured value.
 *
 * @param sitemapText The text content of the sitemap XML
 * @returns The captured values in document order, or null when the text has
 *   no `<lastmod>` element
 */
function getModDatesByLastmodTag(sitemapText: string): string[] | null {
  const lastmodPattern = /<lastmod>\s*(.*?)\s*<\/lastmod>/g;
  const found: string[] = [];
  for (const hit of sitemapText.matchAll(lastmodPattern)) {
    found.push(hit[1]);
  }
  return found.length === 0 ? null : found;
}

/**
 * Extracts modification dates from `<td>` cells on the sitemap page.
 *
 * The trimmed text of every `<td>` is read from the page; cells that isDate
 * accepts are parsed with parseDate and returned as ISO-8601 strings.
 *
 * @param sitemapPage The Puppeteer page object for the sitemap
 * @param logger A logger instance, forwarded to isDate and parseDate
 * @returns The ISO strings of all date-like cells in page order, or null when
 *   there are no cells or none of them is a date
 */
async function getModDatesByTDTag(sitemapPage: Page, logger: Logger): Promise<string[] | null> {
  // The callback is handed to $$eval, so it is kept free of outer references.
  const cellTexts = await sitemapPage.$$eval('td', (cells) =>
    cells.map(cell => cell.textContent?.trim()).filter(text => text !== undefined)
  );
  if (!cellTexts || cellTexts.length === 0) {
    return null;
  }

  const isoDates: string[] = [];
  for (const cellText of cellTexts) {
    const candidate = cellText.trim();
    if (!isDate(candidate, logger)) {
      continue;
    }
    isoDates.push(parseDate(candidate, logger).toISOString());
  }

  return isoDates.length === 0 ? null : isoDates;
}

/**
 * Parses a date string into a Date object.
 *
 * The native Date constructor is tried first; if it yields an Invalid Date,
 * the string is handed to parseFallbackDate. If neither produces a valid
 * date, the Unix epoch (`new Date(0)`) is returned as a sentinel.
 *
 * @param dateStr The date string to parse
 * @param logger A logger instance for logging (not used by this function)
 * @returns The parsed Date object or a fallback date if parsing fails; the
 *   result is never an Invalid Date
 */
function parseDate(dateStr: string, logger: Logger): Date {
  const nativeDate = new Date(dateStr);
  if (!Number.isNaN(nativeDate.getTime())) {
    return nativeDate;
  }

  const fallbackDate = parseFallbackDate(dateStr);
  // A Date object is always truthy, even an Invalid Date, so the timestamp
  // has to be checked explicitly before trusting the fallback result.
  if (fallbackDate && !Number.isNaN(fallbackDate.getTime())) {
    return fallbackDate;
  }

  return new Date(0);
}

/**
 * Parses a date string using fallback formats.
 *
 * The entries of dateFormats are tried in order. An entry whose regex matches
 * but whose parser yields an Invalid Date is skipped, so a later entry with
 * an overlapping pattern still gets a chance to parse the string.
 *
 * @param dateStr The date string to parse
 * @returns The first valid parsed Date object, or null if no format produces
 *   a valid date
 */
function parseFallbackDate(dateStr: string): Date | null {
  for (const { regex, parser } of dateFormats) {
    const match = dateStr.match(regex);
    if (!match) {
      continue;
    }
    const parsed = parser(match);
    // Only accept real dates; an Invalid Date here would crash later
    // toISOString() calls instead of signalling "could not parse".
    if (!Number.isNaN(parsed.getTime())) {
      return parsed;
    }
  }

  return null;
}
Loading