Skip to content

Commit

Permalink
Merge pull request #297 from GSA/seo-updates
Browse files Browse the repository at this point in the history
Add new seo fields
akuny authored Jan 15, 2024
2 parents a8ff301 + 347a249 commit b2ad917
Showing 8 changed files with 135 additions and 21 deletions.
22 changes: 22 additions & 0 deletions entities/core-result.entity.ts
Original file line number Diff line number Diff line change
@@ -391,6 +391,28 @@ export class CoreResult {
@Expose({ name: 'viewport_meta_tag' })
viewportMetaTag: boolean;

@Column({ nullable: true })
@Expose({ name: 'page_title' })
@Exclude()
pageTitle?: string;

@Column({ nullable: true })
@Expose({ name: 'meta_description_content' })
@Exclude()
metaDescriptionContent?: string;

// Nullable column holding the page's hreflang codes as one comma-delimited string.
// Declared with the class-transformer name `hreflang_codes`.
@Column({ nullable: true })
@Expose({ name: 'hreflang_codes' })
@Exclude()
// On transformation the stored string is split on commas into an array;
// a missing or empty value becomes null.
// NOTE(review): assumes the installed class-transformer passes the raw value as
// the callback's first argument — confirm against the library version in use.
@Transform((value: string) => {
if (value) {
return value.split(',');
} else {
return null;
}
})
hreflangCodes?: string;

static getColumnNames(): string[] {
// return class-transformer version of column names
return Object.keys(classToPlain(new CoreResult()));
3 changes: 3 additions & 0 deletions entities/scan-data.entity.ts
Original file line number Diff line number Diff line change
@@ -27,6 +27,9 @@ export type SeoScan = {
ogArticleModifiedFinalUrl: Date;
mainElementFinalUrl: boolean;
canonicalLink: string;
pageTitle: string;
metaDescriptionContent: string;
hreflangCodes: string;
};

export type ThirdPartyScan = {
4 changes: 4 additions & 0 deletions libs/core-scanner/src/pages/primary.spec.ts
Original file line number Diff line number Diff line change
@@ -46,6 +46,10 @@ describe('primary scanner', () => {
'18F builds effective, user-centric digital services focused on the interaction between government and the people and businesses it serves.',
mainElementFinalUrl: true,
canonicalLink: 'https://18f.gsa.gov/',
pageTitle: '18F: Digital service delivery | Home',
metaDescriptionContent:
'18F builds effective, user-centric digital services focused on the interaction between government and the people and businesses it serves.',
hreflangCodes: '',
},
thirdPartyScan: {
thirdPartyServiceDomains:
3 changes: 1 addition & 2 deletions libs/core-scanner/src/pages/primary.ts
Original file line number Diff line number Diff line change
@@ -40,7 +40,6 @@ const primaryScan = async (
const getCSSRequests = await createCSSRequestsExtractor(page, logger);
const getOutboundRequests = createOutboundRequestsExtractor(page);

// go to the url and wait until there are no in-flight network connections (networkidle0)
const response = await page.goto(url, {
waitUntil: 'networkidle0',
});
@@ -61,7 +60,7 @@ const primaryScan = async (
buildDapResult(logger, getOutboundRequests()),
buildThirdPartyResult(response, getOutboundRequests()),
buildCookieResult(page),
buildSeoResult(logger, page),
buildSeoResult(logger, page, response),
createUswdsScanner({ logger, getCSSRequests }, page)(response),
buildLoginResult(response),
buildCloudDotGovPagesResult(response),
13 changes: 11 additions & 2 deletions libs/core-scanner/src/scans/seo.spec.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import { mock } from 'jest-mock-extended';
import { Logger } from 'pino';

import { HTTPResponse } from 'puppeteer';
import { browserInstance, newTestPage } from '../test-helper';
import { buildSeoResult } from './seo';

describe('seo scan', () => {
it('works', async () => {
await newTestPage(async ({ page }) => {
const result = await buildSeoResult(mock<Logger>(), page);
const result = await buildSeoResult(
mock<Logger>(),
page,
mock<HTTPResponse>(),
);
expect(result).toEqual({
mainElementFinalUrl: true,
ogArticlePublishedFinalUrl: undefined,
@@ -17,6 +21,11 @@ describe('seo scan', () => {
ogTitleFinalUrl:
"Investing in Appalachia's economic future. - Appalachian Regional Commission",
canonicalLink: 'https://www.arc.gov/',
pageTitle:
"Investing in Appalachia's economic future. - Appalachian Regional Commission",
metaDescriptionContent: '',

hreflangCodes: '',
});
}, 'arc_gov_dump.mht');
});
105 changes: 88 additions & 17 deletions libs/core-scanner/src/scans/seo.ts
Original file line number Diff line number Diff line change
@@ -1,28 +1,44 @@
import { Logger } from 'pino';
import { Page } from 'puppeteer';

import { HTTPResponse } from 'puppeteer';
import { SeoScan } from 'entities/scan-data.entity';

/**
 * Gathers the SEO-related fields for a scanned page into a `SeoScan`.
 *
 * @param logger - Passed through to `findOpenGraphDates`.
 * @param page - Page handed to each of the `find*` helpers.
 * @param response - Response consulted for a canonical link when the HTML lookup
 *   yields null or undefined.
 * @returns The assembled `SeoScan` object.
 */
export const buildSeoResult = async (
logger: Logger,
page: Page,
response: HTTPResponse,
): Promise<SeoScan> => {
// seo
const ogTitleFinalUrl = await findOpenGraphTag(page, 'og:title');
const ogDescriptionFinalUrl = await findOpenGraphTag(page, 'og:description');
const ogArticlePublishedFinalUrl = await findOpenGraphDates(
logger,
page,
'article:published_time',
);
const ogArticleModifiedFinalUrl = await findOpenGraphDates(
logger,
page,
'article:modified_time',
);
const mainElementFinalUrl = await findMainElement(page);
// Prefer the canonical link found in the HTML; fall back to the response
// headers, and finally to null.
const canonicalLink =
(await findCanonicalLinkInHtml(page)) ??
(await findCanonicalLInkInResponseHeaders(response)) ??
null;
const pageTitle = await findPageTitleText(page);
const metaDescriptionContent = await findMetaDescriptionContent(page);
const hreflangCodes = await findHrefLangCodes(page);

// NOTE(review): the six keys from ogTitleFinalUrl through canonicalLink appear
// twice in this literal — first with an inline `await`, then as shorthand
// references to the locals computed above. This looks like both sides of a
// diff kept together; confirm which set is intended.
return {
ogTitleFinalUrl: await findOpenGraphTag(page, 'og:title'),
ogDescriptionFinalUrl: await findOpenGraphTag(page, 'og:description'),
ogArticlePublishedFinalUrl: await findOpenGraphDates(
logger,
page,
'article:published_time',
),
ogArticleModifiedFinalUrl: await findOpenGraphDates(
logger,
page,
'article:modified_time',
),
mainElementFinalUrl: await findMainElement(page),
canonicalLink: await findCanonicalLink(page),
ogTitleFinalUrl,
ogDescriptionFinalUrl,
ogArticlePublishedFinalUrl,
ogArticleModifiedFinalUrl,
mainElementFinalUrl,
canonicalLink,
pageTitle,
metaDescriptionContent,
hreflangCodes,
};
};

@@ -75,7 +91,7 @@ const findMainElement = async (page: Page) => {
return main;
};

const findCanonicalLink = async (page: Page) => {
const findCanonicalLinkInHtml = async (page: Page): Promise<string | null> => {
const canonicalLinkResult = await page.evaluate(() => {
const canonicalLink = document.querySelector<Element>(
'link[rel="canonical"]',
@@ -85,3 +101,58 @@ const findCanonicalLink = async (page: Page) => {

return canonicalLinkResult;
};

/**
 * Extracts the canonical URL advertised through the HTTP `Link` response header.
 *
 * A `Link` header carries one or more comma-separated link-values of the form
 * `<target>; rel="relation types"; other=params`. The first link-value whose
 * `rel` parameter lists `canonical` wins.
 *
 * @param response - Response whose headers are inspected.
 * @returns The canonical target without its surrounding angle brackets, or
 *   `null` when no link-value declares the `canonical` relation.
 */
const findCanonicalLInkInResponseHeaders = async (
  response: HTTPResponse,
): Promise<string | null> => {
  // Test doubles may hand back undefined here; treat that as "no headers".
  const headers = response.headers() ?? {};

  for (const [name, value] of Object.entries(headers)) {
    if (name.toLowerCase() !== 'link') {
      continue;
    }

    // Each match is one link-value: group 1 is the bracketed target, group 2
    // holds its parameters (everything up to the next bracketed target).
    for (const [, target, params] of value.matchAll(/<([^>]*)>([^<]*)/g)) {
      // `rel` may be quoted (possibly listing several space-separated relation
      // types) or written as a bare token.
      const rel = /;\s*rel\s*=\s*(?:"([^"]*)"|([^\s;,]+))/i.exec(params);
      const relationTypes = (rel?.[1] ?? rel?.[2] ?? '')
        .toLowerCase()
        .split(/\s+/);

      if (relationTypes.includes('canonical')) {
        return target.trim();
      }
    }
  }

  return null;
};

/**
 * Reads the title of the document loaded in the page.
 *
 * @param page - Page whose browsing context is queried.
 * @returns `document.title` with surrounding whitespace trimmed.
 */
const findPageTitleText = async (page: Page): Promise<string> => {
  const trimmedTitle = await page.evaluate(() => {
    const { title } = document;
    return title.trim();
  });

  return trimmedTitle;
};

/**
 * Reads the `content` attribute of the page's `<meta name="description">` tag.
 *
 * @param page - Page whose document is queried.
 * @returns The trimmed attribute value, or `null` when the tag is missing or
 *   has no `content` attribute.
 */
const findMetaDescriptionContent = async (
  page: Page,
): Promise<string | null> => {
  return page.evaluate(() => {
    const descriptionTag = document.querySelector('meta[name="description"]');

    // Nothing to report without the tag or without its content attribute.
    if (!descriptionTag || !descriptionTag.hasAttribute('content')) {
      return null;
    }

    return descriptionTag.getAttribute('content').trim();
  });
};

/**
 * Collects the `hreflang` values of every `<link rel="alternate" hreflang>`
 * element in the page.
 *
 * @param page - Page whose document is queried.
 * @returns The trimmed, lower-cased codes joined with commas, in document
 *   order; an empty string when no such element exists.
 */
const findHrefLangCodes = async (page: Page): Promise<string> => {
  const codes = await page.evaluate(() =>
    Array.from(
      document.querySelectorAll('link[rel="alternate"][hreflang]'),
      (link) => link.getAttribute('hreflang').trim().toLowerCase(),
    ),
  );

  return codes.join(',');
};
3 changes: 3 additions & 0 deletions libs/database/src/core-results/core-result.service.spec.ts
Original file line number Diff line number Diff line change
@@ -111,6 +111,9 @@ describe('CoreResultService', () => {
ogArticleModifiedFinalUrl: null,
mainElementFinalUrl: null,
canonicalLink: null,
pageTitle: null,
metaDescriptionContent: null,
hreflangCodes: null,
},
thirdPartyScan: {
thirdPartyServiceDomains: null,
3 changes: 3 additions & 0 deletions libs/database/src/core-results/core-result.service.ts
Original file line number Diff line number Diff line change
@@ -82,6 +82,9 @@ export class CoreResultService {
coreResult.ogDescriptionFinalUrl = result.seoScan.ogDescriptionFinalUrl;
coreResult.ogTitleFinalUrl = result.seoScan.ogTitleFinalUrl;
coreResult.canonicalLink = result.seoScan.canonicalLink;
coreResult.pageTitle = result.seoScan.pageTitle;
coreResult.metaDescriptionContent = result.seoScan.metaDescriptionContent;
coreResult.hreflangCodes = result.seoScan.hreflangCodes;

// Third-party scan
coreResult.thirdPartyServiceCount =

0 comments on commit b2ad917

Please sign in to comment.