diff --git a/src/backlinks/handler.js b/src/backlinks/handler.js
index 3cd165ae..515d0f57 100644
--- a/src/backlinks/handler.js
+++ b/src/backlinks/handler.js
@@ -15,49 +15,25 @@ import {
 } from '@adobe/spacecat-shared-http-utils';
 import { composeAuditURL } from '@adobe/spacecat-shared-utils';
 import AhrefsAPIClient from '@adobe/spacecat-shared-ahrefs-client';
-import { AbortController, AbortError } from '@adobe/fetch';
 import { retrieveSiteBySiteId } from '../utils/data-access.js';
-import { enhanceBacklinksWithFixes, fetch } from '../support/utils.js';
-
-const TIMEOUT = 3000;
+import { enhanceBacklinksWithFixes, isStillBrokenURL } from '../support/utils.js';
 
+/**
+ * Filters a list of backlinks down to those whose target URL is still broken.
+ *
+ * @param {Array<{url_to: string}>} backlinks - Backlinks to check; each `url_to` is requested.
+ * @param {Object} log - The logging object passed through to the URL check.
+ * @returns {Promise<Array>} - The backlinks for which the check reported a broken URL.
+ */
 export async function filterOutValidBacklinks(backlinks, log) {
-  const fetchWithTimeout = async (url, timeout) => {
-    const controller = new AbortController();
-    const { signal } = controller;
-    const id = setTimeout(() => controller.abort(), timeout);
-
-    try {
-      const response = await fetch(url, { signal });
-      clearTimeout(id);
-      return response;
-    } catch (error) {
-      if (error instanceof AbortError) {
-        log.warn(`Request to ${url} timed out after ${timeout}ms`);
-        return { ok: false, status: 408 };
-      }
-    } finally {
-      clearTimeout(id);
-    }
-    return null;
-  };
-
-  const isStillBrokenBacklink = async (backlink) => {
-    try {
-      const response = await fetchWithTimeout(backlink.url_to, TIMEOUT);
-      if (!response.ok && response.status !== 404
-        && response.status >= 400 && response.status < 500) {
-        log.warn(`Backlink ${backlink.url_to} returned status ${response.status}`);
-      }
-      return !response.ok;
-    } catch (error) {
-      log.error(`Failed to check backlink ${backlink.url_to}: ${error.message}`);
-      return true;
-    }
-  };
-
-  const backlinkStatuses = await Promise.all(backlinks.map(isStillBrokenBacklink));
-  return backlinks.filter((_, index) => backlinkStatuses[index]);
+  const backlinkStatuses = await Promise.allSettled(
+    backlinks.map(async (backlink) => isStillBrokenURL(backlink.url_to, 'Backlink', log)),
+  );
+  // A rejected check (e.g. a malformed backlink entry) is treated as "not broken" and dropped.
+  return backlinks.filter((_, index) => {
+    const { status, value } = backlinkStatuses[index];
+    return status === 'fulfilled' && value;
+  });
 }
 
 export default async function auditBrokenBacklinks(message, context) {
diff --git a/src/support/utils.js b/src/support/utils.js
index 97c68c9e..531c988d 100644
--- a/src/support/utils.js
+++ b/src/support/utils.js
@@ -10,13 +10,16 @@
  * governing permissions and limitations under the License.
  */
 
-import { context as h2, h1 } from '@adobe/fetch';
+import {
+  AbortController, AbortError, context as h2, h1,
+} from '@adobe/fetch';
 import { hasText, prependSchema, resolveCustomerSecretsName } from '@adobe/spacecat-shared-utils';
 import URI from 'urijs';
 import { JSDOM } from 'jsdom';
 import { GetSecretValueCommand, SecretsManagerClient } from '@aws-sdk/client-secrets-manager';
 
 URI.preventInvalidHostname = true;
+const TIMEOUT = 3000;
 
 /* c8 ignore next 3 */
 export const { fetch } = process.env.HELIX_FETCH_FORCE_HTTP1
@@ -32,6 +35,60 @@ export async function getRUMUrl(url) {
   return finalUrl.endsWith('/') ? finalUrl.slice(0, -1) : /* c8 ignore next */ finalUrl;
 }
 
+/**
+ * Fetches a URL, aborting the request if it does not complete within the timeout.
+ *
+ * @async
+ * @param {string} url - The URL to fetch.
+ * @param {number} timeout - The timeout duration in milliseconds.
+ * @param {Object} log - The logging object; `warn` is called when the request times out.
+ * @returns {Promise<{ok: boolean, status: number}>} - The fetch response, or a synthetic
+ * `{ ok: false, status: 408 }` object when the request timed out.
+ * @throws {Error} Any fetch error that is not caused by the timeout abort.
+ */
+export const fetchWithTimeout = async (url, timeout, log) => {
+  const controller = new AbortController();
+  const { signal } = controller;
+  const id = setTimeout(() => controller.abort(), timeout);
+
+  try {
+    return await fetch(url, { signal });
+  } catch (error) {
+    if (error instanceof AbortError) {
+      log.warn(`Request to ${url} timed out after ${timeout}ms`);
+      return { ok: false, status: 408 };
+    }
+    // Rethrow so callers see the real failure instead of a `null` response.
+    throw error;
+  } finally {
+    clearTimeout(id);
+  }
+};
+
+/**
+ * Checks whether a URL still fails to respond successfully.
+ *
+ * @async
+ * @param {string} url - The URL to check.
+ * @param {string} label - Describes the kind of URL in log messages (e.g. 'Backlink').
+ * @param {Object} log - The logging object to record warnings and errors.
+ * @returns {Promise<boolean>} - `true` when the response is not ok, the request timed out,
+ * or the check itself failed; `false` otherwise.
+ */
+export const isStillBrokenURL = async (url, label, log) => {
+  try {
+    const response = await fetchWithTimeout(url, TIMEOUT, log);
+    if (!response.ok && response.status !== 404
+      && response.status >= 400 && response.status < 500) {
+      log.warn(`${label} ${url} returned status ${response.status}`);
+    }
+    return !response.ok;
+  } catch (error) {
+    log.error(`Failed to check ${label} ${url}: ${error.message}`);
+    return true;
+  }
+};
+
 /**
  * Checks if a given URL contains a domain with a non-www subdomain.
  *