From 347a249ec645146398530b519fc57c1dd976eb4a Mon Sep 17 00:00:00 2001 From: Andy Kuny Date: Mon, 15 Jan 2024 15:05:59 -0500 Subject: [PATCH] Add new seo fields --- entities/core-result.entity.ts | 22 ++++ entities/scan-data.entity.ts | 3 + libs/core-scanner/src/pages/primary.spec.ts | 4 + libs/core-scanner/src/pages/primary.ts | 3 +- libs/core-scanner/src/scans/seo.spec.ts | 13 ++- libs/core-scanner/src/scans/seo.ts | 105 +++++++++++++++--- .../core-results/core-result.service.spec.ts | 3 + .../src/core-results/core-result.service.ts | 3 + 8 files changed, 135 insertions(+), 21 deletions(-) diff --git a/entities/core-result.entity.ts b/entities/core-result.entity.ts index 4d10d94e..1b4c049a 100644 --- a/entities/core-result.entity.ts +++ b/entities/core-result.entity.ts @@ -391,6 +391,28 @@ export class CoreResult { @Expose({ name: 'viewport_meta_tag' }) viewportMetaTag: boolean; + @Column({ nullable: true }) + @Expose({ name: 'page_title' }) + @Exclude() + pageTitle?: string; + + @Column({ nullable: true }) + @Expose({ name: 'meta_description_content' }) + @Exclude() + metaDescriptionContent?: string; + + @Column({ nullable: true }) + @Expose({ name: 'hreflang_codes' }) + @Exclude() + @Transform((value: string) => { + if (value) { + return value.split(','); + } else { + return null; + } + }) + hreflangCodes?: string; + static getColumnNames(): string[] { // return class-transformer version of column names return Object.keys(classToPlain(new CoreResult())); diff --git a/entities/scan-data.entity.ts b/entities/scan-data.entity.ts index 455370b9..0d025844 100644 --- a/entities/scan-data.entity.ts +++ b/entities/scan-data.entity.ts @@ -27,6 +27,9 @@ export type SeoScan = { ogArticleModifiedFinalUrl: Date; mainElementFinalUrl: boolean; canonicalLink: string; + pageTitle: string; + metaDescriptionContent: string; + hreflangCodes: string; }; export type ThirdPartyScan = { diff --git a/libs/core-scanner/src/pages/primary.spec.ts b/libs/core-scanner/src/pages/primary.spec.ts index 1c608c7e..182bdfed 100644 --- a/libs/core-scanner/src/pages/primary.spec.ts +++ b/libs/core-scanner/src/pages/primary.spec.ts @@ -46,6 +46,10 @@ describe('primary scanner', () => { '18F builds effective, user-centric digital services focused on the interaction between government and the people and businesses it serves.', mainElementFinalUrl: true, canonicalLink: 'https://18f.gsa.gov/', + pageTitle: '18F: Digital service delivery | Home', + metaDescriptionContent: + '18F builds effective, user-centric digital services focused on the interaction between government and the people and businesses it serves.', + hreflangCodes: '', }, thirdPartyScan: { thirdPartyServiceDomains: diff --git a/libs/core-scanner/src/pages/primary.ts b/libs/core-scanner/src/pages/primary.ts index 7ac23bb6..a02c77da 100644 --- a/libs/core-scanner/src/pages/primary.ts +++ b/libs/core-scanner/src/pages/primary.ts @@ -40,7 +40,6 @@ const primaryScan = async ( const getCSSRequests = await createCSSRequestsExtractor(page, logger); const getOutboundRequests = createOutboundRequestsExtractor(page); - // goto url and wait until there are only 2 idle requests const response = await page.goto(url, { waitUntil: 'networkidle0', }); @@ -61,7 +60,7 @@ const primaryScan = async ( buildDapResult(logger, getOutboundRequests()), buildThirdPartyResult(response, getOutboundRequests()), buildCookieResult(page), - buildSeoResult(logger, page), + buildSeoResult(logger, page, response), createUswdsScanner({ logger, getCSSRequests }, page)(response), buildLoginResult(response), buildCloudDotGovPagesResult(response), diff --git a/libs/core-scanner/src/scans/seo.spec.ts b/libs/core-scanner/src/scans/seo.spec.ts index 061959eb..c7608a7c 100644 --- a/libs/core-scanner/src/scans/seo.spec.ts +++ b/libs/core-scanner/src/scans/seo.spec.ts @@ -1,13 +1,17 @@ import { mock } from 'jest-mock-extended'; import { Logger } from 'pino'; - +import { HTTPResponse } from 'puppeteer'; import { browserInstance, newTestPage } from '../test-helper'; import { buildSeoResult } from './seo'; describe('seo scan', () => { it('works', async () => { await newTestPage(async ({ page }) => { - const result = await buildSeoResult(mock(), page); + const result = await buildSeoResult( + mock(), + page, + mock(), + ); expect(result).toEqual({ mainElementFinalUrl: true, ogArticlePublishedFinalUrl: undefined, @@ -17,6 +21,11 @@ describe('seo scan', () => { ogTitleFinalUrl: "Investing in Appalachia's economic future. - Appalachian Regional Commission", canonicalLink: 'https://www.arc.gov/', + pageTitle: + "Investing in Appalachia's economic future. - Appalachian Regional Commission", + metaDescriptionContent: '', + + hreflangCodes: '', }); }, 'arc_gov_dump.mht'); }); diff --git a/libs/core-scanner/src/scans/seo.ts b/libs/core-scanner/src/scans/seo.ts index b9ab95b8..21d7c5a0 100644 --- a/libs/core-scanner/src/scans/seo.ts +++ b/libs/core-scanner/src/scans/seo.ts @@ -1,28 +1,44 @@ import { Logger } from 'pino'; import { Page } from 'puppeteer'; - +import { HTTPResponse } from 'puppeteer'; import { SeoScan } from 'entities/scan-data.entity'; export const buildSeoResult = async ( logger: Logger, page: Page, + response: HTTPResponse, ): Promise => { - // seo + const ogTitleFinalUrl = await findOpenGraphTag(page, 'og:title'); + const ogDescriptionFinalUrl = await findOpenGraphTag(page, 'og:description'); + const ogArticlePublishedFinalUrl = await findOpenGraphDates( + logger, + page, + 'article:published_time', + ); + const ogArticleModifiedFinalUrl = await findOpenGraphDates( + logger, + page, + 'article:modified_time', + ); + const mainElementFinalUrl = await findMainElement(page); + const canonicalLink = + (await findCanonicalLinkInHtml(page)) ?? + (await findCanonicalLInkInResponseHeaders(response)) ?? + null; + const pageTitle = await findPageTitleText(page); + const metaDescriptionContent = await findMetaDescriptionContent(page); + const hreflangCodes = await findHrefLangCodes(page); + return { - ogTitleFinalUrl: await findOpenGraphTag(page, 'og:title'), - ogDescriptionFinalUrl: await findOpenGraphTag(page, 'og:description'), - ogArticlePublishedFinalUrl: await findOpenGraphDates( - logger, - page, - 'article:published_time', - ), - ogArticleModifiedFinalUrl: await findOpenGraphDates( - logger, - page, - 'article:modified_time', - ), - mainElementFinalUrl: await findMainElement(page), - canonicalLink: await findCanonicalLink(page), + ogTitleFinalUrl, + ogDescriptionFinalUrl, + ogArticlePublishedFinalUrl, + ogArticleModifiedFinalUrl, + mainElementFinalUrl, + canonicalLink, + pageTitle, + metaDescriptionContent, + hreflangCodes, }; }; @@ -75,7 +91,7 @@ const findMainElement = async (page: Page) => { return main; }; -const findCanonicalLink = async (page: Page) => { +const findCanonicalLinkInHtml = async (page: Page): Promise => { const canonicalLinkResult = await page.evaluate(() => { const canonicalLink = document.querySelector( 'link[rel="canonical"]', @@ -85,3 +101,58 @@ const findCanonicalLink = async (page: Page) => { return canonicalLinkResult; }; + +const findCanonicalLInkInResponseHeaders = async ( + response: HTTPResponse, +): Promise => { + const headers = await response.headers(); + + for (const key in headers) { + if (key.toLowerCase() === 'link') { + const value = headers[key]; + if (value.toLowerCase().includes('rel=canonical')) { + const regex = /https?:\/\/[^;]+(?=; rel="canonical")/i; + const matches = value.match(regex); + return matches ? matches[0] : null; + } + } + } + + return null; +}; + +const findPageTitleText = async (page: Page): Promise => { + return await page.evaluate(() => document.title.trim()); +}; + +const findMetaDescriptionContent = async ( + page: Page, +): Promise => { + const content = await page.evaluate(() => { + const metaDescription = document.querySelector('meta[name="description"]'); + + if (metaDescription && metaDescription.hasAttribute('content')) { + return metaDescription.getAttribute('content').trim(); + } + + return null; + }); + + return content; +}; + +const findHrefLangCodes = async (page: Page): Promise => { + const languageCodes = await page.evaluate(() => { + const hreflangElements = document.querySelectorAll( + 'link[rel="alternate"][hreflang]', + ); + + const hreflangValues = Array.from(hreflangElements).map((el) => { + return el.getAttribute('hreflang').trim().toLowerCase(); + }); + + return hreflangValues; + }); + + return languageCodes.join(','); +}; diff --git a/libs/database/src/core-results/core-result.service.spec.ts b/libs/database/src/core-results/core-result.service.spec.ts index 973084c4..e453b25c 100644 --- a/libs/database/src/core-results/core-result.service.spec.ts +++ b/libs/database/src/core-results/core-result.service.spec.ts @@ -111,6 +111,9 @@ describe('CoreResultService', () => { ogArticleModifiedFinalUrl: null, mainElementFinalUrl: null, canonicalLink: null, + pageTitle: null, + metaDescriptionContent: null, + hreflangCodes: null, }, thirdPartyScan: { thirdPartyServiceDomains: null, diff --git a/libs/database/src/core-results/core-result.service.ts b/libs/database/src/core-results/core-result.service.ts index 02f05b4a..bd302e62 100644 --- a/libs/database/src/core-results/core-result.service.ts +++ b/libs/database/src/core-results/core-result.service.ts @@ -82,6 +82,9 @@ export class CoreResultService { coreResult.ogDescriptionFinalUrl = result.seoScan.ogDescriptionFinalUrl; coreResult.ogTitleFinalUrl = result.seoScan.ogTitleFinalUrl; coreResult.canonicalLink = result.seoScan.canonicalLink; + coreResult.pageTitle = result.seoScan.pageTitle; + coreResult.metaDescriptionContent = result.seoScan.metaDescriptionContent; + coreResult.hreflangCodes = result.seoScan.hreflangCodes; // Third-party scan coreResult.thirdPartyServiceCount =