From ece1784cbe0de62ca0e108df71eb388475050f70 Mon Sep 17 00:00:00 2001
From: rschneider <97682836+rainer-exxcellent@users.noreply.github.com>
Date: Fri, 7 Feb 2025 11:53:08 +0100
Subject: [PATCH] fix(performance): #201 segment text before call to hunspell
 and cache results

---
 lib/informativeTests/informativeTest_6_3_8.js | 52 +++++++++++++++----
 scripts/test.js                               |  1 +
 tests/informativeTest_6_3_8.js                | 38 +++++++++++++-
 3 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/lib/informativeTests/informativeTest_6_3_8.js b/lib/informativeTests/informativeTest_6_3_8.js
index b304de0..32080bc 100644
--- a/lib/informativeTests/informativeTest_6_3_8.js
+++ b/lib/informativeTests/informativeTest_6_3_8.js
@@ -3,6 +3,8 @@ import { execFile } from 'node:child_process'
 import bcp47 from 'bcp47'
 
 const ajv = new Ajv()
+// cache spell-check results per word to improve performance
+const spellCheckedWords2Result = new Map()
 
 const inputSchema = /** @type {const} */ ({
   additionalProperties: true,
@@ -241,10 +243,15 @@ export default async function informativeTest_6_3_8(
   }
 
   const lang = bcp47.parse(doc.document.lang)
-  if (!lang?.langtag.language.language) return ctx
+  if (!lang?.langtag.language.language) {
+    return ctx
+  }
   const dictionary = `${lang.langtag.language.language}${
     typeof lang.langtag.region === 'string' ? `_${lang.langtag.region}` : ''
   }`
+  // @ts-ignore
+  const segmenter = new Intl.Segmenter([dictionary], { granularity: 'word' })
+  const urlPattern = /(https?|ftp):\/\/[^\s/$.?#].[^\s]*/i
 
   for (const path of [
     '/document/acknowledgments[]/names[]',
@@ -357,17 +364,42 @@ export default async function informativeTest_6_3_8(
    */
   async function checkField(instancePath, text) {
     if (typeof text !== 'string') return
-    const result = await spellCheckString({
-      text,
-      dictionary,
-      hunspell: params.hunspell,
-    })
-    if (!result.ok) {
+
+    // URLs are not segmented properly; remove the URL before segmentation
+    const textWithOutUrl = text.replace(urlPattern, '')
+
+    const segmentedText = segmenter.segment(textWithOutUrl)
+    const segments = [...segmentedText]
+      .filter((s) => s.isWordLike)
+      .map((s) => s.segment)
+
+    const checkResults = []
+    for (const segment of segments) {
+      let spellCheckResult = spellCheckedWords2Result.get(segment)
+
+      if (!spellCheckResult) {
+        spellCheckResult = await spellCheckString({
+          // @ts-ignore
+          text: segment,
+          dictionary: dictionary,
+          hunspell: params.hunspell,
+        })
+        spellCheckedWords2Result.set(segment, spellCheckResult)
+      }
+      if (!spellCheckResult.ok) {
+        checkResults.push(spellCheckResult)
+      }
+    }
+
+    if (checkResults.length > 0) {
+      const words = checkResults.flatMap((result) =>
+        result.mistakes.map((/** @type {{ word: any; }} */ m) => m.word)
+      )
       ctx.infos.push({
         instancePath,
-        message: `there are spelling mistakes in: ${result.mistakes
-          .map((m) => m.word)
-          .join(', ')}`,
+        message: `there are spelling mistakes in: ${[...new Set(words)].join(
+          ', '
+        )}`,
       })
     }
   }
diff --git a/scripts/test.js b/scripts/test.js
index be52050..54896a7 100644
--- a/scripts/test.js
+++ b/scripts/test.js
@@ -5,6 +5,7 @@ import { fileURLToPath } from 'url'
 
 spawn('mocha', ['tests', ...process.argv.slice(2)], {
   stdio: 'inherit',
+  shell: true,
   env: {
     ...process.env,
     DICPATH: fileURLToPath(new URL('../tests/dicts', import.meta.url)),
diff --git a/tests/informativeTest_6_3_8.js b/tests/informativeTest_6_3_8.js
index d1e84b2..262d404 100644
--- a/tests/informativeTest_6_3_8.js
+++ b/tests/informativeTest_6_3_8.js
@@ -6,13 +6,47 @@ const failingExamples = await readExampleFiles(
   new URL('informativeTest_6_3_8/failing', import.meta.url)
 )
 
+// The words below must match the text of csaf/csaf_2.0/test/validator/data/informative/oasis_csaf_tc-csaf_2_0-2021-6-3-08-11.json
+// and csaf/csaf_2.0/test/validator/data/informative/oasis_csaf_tc-csaf_2_0-2021-6-3-08-01.json
+
+const hunspellMap = new Map([
+  ['Security', '*'],
+  ['researchers', '*'],
+  ['found', '*'],
+  ['multiple', '*'],
+  ['vulnerabilities', '*'],
+  ['in', '*'],
+  ['XYZ', '*'],
+  ['Secruity', '# error'],
+  ['OASIS', '*'],
+  ['CSAF', '*'],
+  ['TC', '*'],
+  ['Informative', '*'],
+  ['test', '*'],
+  ['Spell', '*'],
+  ['check', '*'],
+  ['valid', '*'],
+  ['example', '*'],
+  ['failing', '*'],
+  ['1', '*'],
+  ['Initial', '*'],
+  ['version', '*'],
+  ['1', '*'],
+  ['1', '*'],
+])
+
 describe('Informative test 6.3.8', function () {
   describe('failing examples', function () {
     for (const [title, failingExample] of failingExamples) {
       it(title, async function () {
         const result = await informativeTest_6_3_8(failingExample, {
-          async hunspell() {
-            return 'Hunspell vMOCK\n\n# wrongword 1'
+          async hunspell({ dictionary, input }) {
+            const answer = hunspellMap.get(input)
+            if (answer) {
+              return 'Hunspell vMOCK\n\n' + answer
+            } else {
+              throw new Error('Hunspell vMOCK. Unknoww word ' + input)
+            }
           },
         })