From ece1784cbe0de62ca0e108df71eb388475050f70 Mon Sep 17 00:00:00 2001
From: rschneider <97682836+rainer-exxcellent@users.noreply.github.com>
Date: Fri, 7 Feb 2025 11:53:08 +0100
Subject: [PATCH] fix(performance): #201 segment text before call to hunspell and cache results

Strip URLs from each text, split the remainder into words with
Intl.Segmenter and spell check every word separately. The result of each
check is cached per dictionary and word, so a word is only passed to
hunspell once per language.

---
 lib/informativeTests/informativeTest_6_3_8.js | 61 ++++++++++++++++---
 scripts/test.js                               |  1 +
 tests/informativeTest_6_3_8.js                | 36 +++++++++++-
 3 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/lib/informativeTests/informativeTest_6_3_8.js b/lib/informativeTests/informativeTest_6_3_8.js
index b304de0..32080bc 100644
--- a/lib/informativeTests/informativeTest_6_3_8.js
+++ b/lib/informativeTests/informativeTest_6_3_8.js
@@ -3,6 +3,10 @@ import { execFile } from 'node:child_process'
 import bcp47 from 'bcp47'
 
 const ajv = new Ajv()
+// Cache of spell check results to improve performance. The cache lives as
+// long as the module, so its keys combine dictionary and word: the same
+// word can be correct in one language and a mistake in another.
+const spellCheckedWords2Result = new Map()
 
 const inputSchema = /** @type {const} */ ({
   additionalProperties: true,
@@ -241,10 +245,20 @@ export default async function informativeTest_6_3_8(
   }
 
   const lang = bcp47.parse(doc.document.lang)
-  if (!lang?.langtag.language.language) return ctx
+  if (!lang?.langtag.language.language) {
+    return ctx
+  }
   const dictionary = `${lang.langtag.language.language}${
     typeof lang.langtag.region === 'string' ? `_${lang.langtag.region}` : ''
   }`
+  // Intl only accepts BCP 47 tags ("en-US"); the dictionary name uses "_"
+  const segmenterLocale = dictionary.replace('_', '-')
+  // @ts-ignore
+  const segmenter = new Intl.Segmenter([segmenterLocale], {
+    granularity: 'word',
+  })
+  // global flag: a text may contain more than one URL
+  const urlPattern = /(https?|ftp):\/\/[^\s/$.?#].[^\s]*/gi
 
   for (const path of [
     '/document/acknowledgments[]/names[]',
@@ -357,17 +371,44 @@ export default async function informativeTest_6_3_8(
    */
   async function checkField(instancePath, text) {
     if (typeof text !== 'string') return
-    const result = await spellCheckString({
-      text,
-      dictionary,
-      hunspell: params.hunspell,
-    })
-    if (!result.ok) {
+
+    // URLs are not properly segmented. Remove them before segmentation
+    const textWithOutUrl = text.replace(urlPattern, '')
+
+    const segmentedText = segmenter.segment(textWithOutUrl)
+    const segments = [...segmentedText]
+      .filter((s) => s.isWordLike)
+      .map((s) => s.segment)
+
+    const checkResults = []
+    for (const segment of segments) {
+      // cache is shared across documents: key on dictionary and word
+      const cacheKey = `${dictionary}:${segment}`
+      let spellCheckResult = spellCheckedWords2Result.get(cacheKey)
+
+      if (!spellCheckResult) {
+        spellCheckResult = await spellCheckString({
+          // @ts-ignore
+          text: segment,
+          dictionary: dictionary,
+          hunspell: params.hunspell,
+        })
+        spellCheckedWords2Result.set(cacheKey, spellCheckResult)
+      }
+      if (!spellCheckResult.ok) {
+        checkResults.push(spellCheckResult)
+      }
+    }
+
+    if (checkResults.length > 0) {
+      const words = checkResults.flatMap((result) =>
+        result.mistakes.map((/** @type {{ word: any; }} */ m) => m.word)
+      )
       ctx.infos.push({
         instancePath,
-        message: `there are spelling mistakes in: ${result.mistakes
-          .map((m) => m.word)
-          .join(', ')}`,
+        message: `there are spelling mistakes in: ${[...new Set(words)].join(
+          ', '
+        )}`,
       })
     }
   }
diff --git a/scripts/test.js b/scripts/test.js
index be52050..54896a7 100644
--- a/scripts/test.js
+++ b/scripts/test.js
@@ -5,6 +5,7 @@ import { fileURLToPath } from 'url'
 
 spawn('mocha', ['tests', ...process.argv.slice(2)], {
   stdio: 'inherit',
+  shell: true,
   env: {
     ...process.env,
     DICPATH: fileURLToPath(new URL('../tests/dicts', import.meta.url)),
diff --git a/tests/informativeTest_6_3_8.js b/tests/informativeTest_6_3_8.js
index d1e84b2..262d404 100644
--- a/tests/informativeTest_6_3_8.js
+++ b/tests/informativeTest_6_3_8.js
@@ -6,13 +6,45 @@ const failingExamples = await readExampleFiles(
   new URL('informativeTest_6_3_8/failing', import.meta.url)
 )
 
+// Words must match those in test csaf/csaf_2.0/test/validator/data/informative/oasis_csaf_tc-csaf_2_0-2021-6-3-08-11.json
+// and csaf/csaf_2.0/test/validator/data/informative/oasis_csaf_tc-csaf_2_0-2021-6-3-08-01.json
+
+const hunspellMap = new Map([
+  ['Security', '*'],
+  ['researchers', '*'],
+  ['found', '*'],
+  ['multiple', '*'],
+  ['vulnerabilities', '*'],
+  ['in', '*'],
+  ['XYZ', '*'],
+  ['Secruity', '# error'],
+  ['OASIS', '*'],
+  ['CSAF', '*'],
+  ['TC', '*'],
+  ['Informative', '*'],
+  ['test', '*'],
+  ['Spell', '*'],
+  ['check', '*'],
+  ['valid', '*'],
+  ['example', '*'],
+  ['failing', '*'],
+  ['1', '*'],
+  ['Initial', '*'],
+  ['version', '*'],
+])
+
 describe('Informative test 6.3.8', function () {
   describe('failing examples', function () {
     for (const [title, failingExample] of failingExamples) {
       it(title, async function () {
         const result = await informativeTest_6_3_8(failingExample, {
-          async hunspell() {
-            return 'Hunspell vMOCK\n\n# wrongword 1'
+          async hunspell({ input }) {
+            const answer = hunspellMap.get(input)
+            if (answer) {
+              return 'Hunspell vMOCK\n\n' + answer
+            } else {
+              throw new Error('Hunspell vMOCK. Unknown word ' + input)
+            }
           },
         })
 