Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(performance): #201 improve performance of informativeTest_6_3_8 #202

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
1 change: 0 additions & 1 deletion hunspell.js
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Die Datei wird im csaf-validator-service verwendet.

This file was deleted.

18 changes: 0 additions & 18 deletions hunspell/getHunspellAvailableLangs.js

This file was deleted.

154 changes: 134 additions & 20 deletions lib/informativeTests/informativeTest_6_3_8.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,26 @@
import Ajv from 'ajv/dist/jtd.js'
import { execFile } from 'node:child_process'
import bcp47 from 'bcp47'
import { spawn } from 'child_process'
import { createInterface } from 'node:readline/promises'
import { EOL } from 'node:os'

/**
* Timeout in milliseconds after which the connection to hunspell is declared
* dead. It is used as the delay of the watchdog timer that is started for
* every request written to the hunspell process.
*/
const HUNSPELL_TIMEOUT = 1000

// Ajv instance in its JSON Type Definition flavour (see the `ajv/dist/jtd.js`
// import above).
const ajv = new Ajv()
// Caches the spell check result of every checked word to improve performance.
// NOTE(review): the key is the word alone (not the dictionary) and the map
// lives at module level, so it is shared by all runs of the test — confirm
// that results cannot leak between documents written in different languages.
const spellCheckedWords2Result = new Map()
/**
* The hunspell child process. It is spawned lazily by the first request and
* reused by the following ones.
* NOTE(review): the visible cleanup code calls `kill()` on it but does not
* reset this variable — confirm that a later run cannot pick up a dead process.
*
* @type {import("child_process").ChildProcessWithoutNullStreams | null }
*/
let hunspellSpawn
/**
* Line-based reader on the stdout stream of the hunspell child process. It is
* created together with the process.
*
* @type {import("readline/promises").Interface}
*/
let hunspellInterface

const inputSchema = /** @type {const} */ ({
additionalProperties: true,
Expand Down Expand Up @@ -241,10 +259,17 @@ export default async function informativeTest_6_3_8(
}

const lang = bcp47.parse(doc.document.lang)
if (!lang?.langtag.language.language) return ctx
if (!lang?.langtag.language.language) {
return ctx
}
const dictionary = `${lang.langtag.language.language}${
typeof lang.langtag.region === 'string' ? `_${lang.langtag.region}` : ''
}`
// @ts-ignore
const segmenter = new Intl.Segmenter(lang.langtag.language, {
granularity: 'word',
})
const urlPattern = /(https?|ftp):\/\/[^\s/$.?#].[^\s]*/gi

for (const path of [
'/document/acknowledgments[]/names[]',
Expand Down Expand Up @@ -304,7 +329,7 @@ export default async function informativeTest_6_3_8(
`${prefix}${branchIndex}/product/name`,
branch.product?.name
)
checkBranches(
await checkBranches(
`${prefix}${branchIndex}/branches/`,
Array.isArray(branch.branches) ? branch.branches : []
)
Expand Down Expand Up @@ -351,27 +376,74 @@ export default async function informativeTest_6_3_8(
}
}

/**
* Splits a text into the pieces that are spell checked individually.
*
* URLs are cut out of the text before it is handed to the word segmenter,
* because the segmenter does not keep them in one piece. Every URL that was
* cut out is appended unchanged, as a single entry, after the regular words.
*
* @param {string} [text]
* @returns {string[]} the word-like segments followed by the URLs; empty for
*   a missing or empty text
*/
function segmentString(text) {
  if (!text) {
    return []
  }

  // Remember the URLs first, then segment whatever is left of the text.
  const urls = Array.from(text.matchAll(urlPattern), (match) => match[0])
  const strippedText = text.replace(urlPattern, '')

  const words = []
  for (const part of segmenter.segment(strippedText)) {
    // Skip whitespace and punctuation segments.
    if (part.isWordLike) {
      words.push(part.segment)
    }
  }
  return words.concat(urls)
}

/**
* @param {string} instancePath
* @param {string} [text]
*/
async function checkField(instancePath, text) {
if (typeof text !== 'string') return
const result = await spellCheckString({
text,
dictionary,
hunspell: params.hunspell,
})
if (!result.ok) {
if (typeof text !== 'string') {
return
}

const segments = segmentString(text)

const checkResults = []
for (const segment of segments) {
let spellCheckResult = spellCheckedWords2Result.get(segment)

if (!spellCheckResult) {
spellCheckResult = await spellCheckString({
// @ts-ignore
text: segment,
dictionary: dictionary,
hunspell: params.hunspell,
})
spellCheckedWords2Result.set(segment, spellCheckResult)
}
if (!spellCheckResult.ok) {
checkResults.push(spellCheckResult)
}
}

if (checkResults.length > 0) {
const words = checkResults.flatMap((result) =>
result.mistakes.map((/** @type {{ word: any; }} */ m) => m.word)
)
ctx.infos.push({
instancePath,
message: `there are spelling mistakes in: ${result.mistakes
.map((m) => m.word)
.join(', ')}`,
message: `there are spelling mistakes in: ${[...new Set(words)].join(
', '
)}`,
})
}
}

if (hunspellSpawn) {
hunspellInterface.close()
hunspellSpawn.kill()
}
return ctx
}

Expand All @@ -384,7 +456,8 @@ export default async function informativeTest_6_3_8(
async function spellCheckString({ text, dictionary, hunspell }) {
/** @type {string} */
const result = await hunspell({ dictionary, input: text })
const lines = result.split('\n').slice(1)

const lines = result.split('\n')
const errors = lines
.filter((l) => l.startsWith('# ') || l.startsWith('& '))
.map((l) => {
Expand All @@ -410,13 +483,54 @@ async function spellCheckString({ text, dictionary, hunspell }) {
* @returns
*/
async function runHunspell({ dictionary, input }) {
// Sends `input` to hunspell and resolves with the text hunspell answered.
//
// NOTE(review): this span contains both a one-shot `execFile` call and a
// persistent `spawn`-based path. In this view they look like the removed and
// the added side of a diff — confirm which one is meant to remain.

/** @type {string} */
// NOTE(review): leftover `debugger` statement; it pauses execution whenever a
// debugger is attached.
debugger
const result = await new Promise((resolve, reject) => {
// One-shot variant: start hunspell for the given dictionary, reject on a
// process error, resolve with everything written to stdout. The input is
// written to stdin and the stream is closed.
const child = execFile('hunspell', ['-d', dictionary], (err, stdout) => {
if (err) return reject(err)
resolve(stdout)
})
child.stdin?.end(input)
// Persistent variant: `signal` and `resolved` make sure this promise is
// settled by only one of the paths below.
const abortController = new AbortController()
const { signal } = abortController
let resolved = false

// Rejects this request unless it was already aborted or resolved.
const abortOnHunspellError = () => {
if (!signal.aborted && !resolved) {
abortController.abort()
return reject(new Error('Lost hunspell connection'))
}
}

// Spawn hunspell only once; later calls reuse the process and its line reader.
// Consequently the dictionary of the first call is the one the process runs
// with, and the error/exit listeners registered here belong to the promise of
// that first call.
// NOTE(review): later requests are therefore only guarded by the timeout
// below — confirm this is intended.
// NOTE(review): with `shell: true` the arguments are passed through a shell —
// confirm that `dictionary` can never contain shell metacharacters.
if (!hunspellSpawn) {
hunspellSpawn = spawn('hunspell', ['-d', dictionary], {
stdio: 'pipe',
shell: true,
})
.on('error', abortOnHunspellError)
.on('exit', abortOnHunspellError)
hunspellSpawn.stdin.on('error', abortOnHunspellError)
hunspellSpawn.stdout.on('error', abortOnHunspellError)
hunspellInterface = createInterface(hunspellSpawn.stdout)
}

/*
Here we install a timeout which is used as a watchdog to detect a broken
hunspell input connection. It is cleared as soon as the first line arrives,
so it only limits the time until hunspell starts to answer.
*/
const timeout = setTimeout(abortOnHunspellError, HUNSPELL_TIMEOUT)

// Non-empty lines received for this request so far.
/** @type {string[]} */
const buffer = []
// Collects the answer line by line. An empty line ends the answer: the
// listener is removed and the promise resolves with the collected lines joined
// by the platform line separator.
/** @param {string} line */
const handler = (line) => {
clearTimeout(timeout)
if (signal.aborted || resolved) return
if (line !== '') buffer.push(line)
else {
hunspellInterface.off('line', handler)
resolved = true
resolve(buffer.join(EOL))
}
}
hunspellInterface.on('line', handler)
// Send the request, terminated by a line break.
hunspellSpawn.stdin.write(input + EOL)
})

/** @type {string} */
return result
}
1 change: 1 addition & 0 deletions scripts/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { fileURLToPath } from 'url'

spawn('mocha', ['tests', ...process.argv.slice(2)], {
stdio: 'inherit',
shell: true,
env: {
...process.env,
DICPATH: fileURLToPath(new URL('../tests/dicts', import.meta.url)),
Expand Down
38 changes: 36 additions & 2 deletions tests/informativeTest_6_3_8.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,47 @@ const failingExamples = await readExampleFiles(
new URL('informativeTest_6_3_8/failing', import.meta.url)
)

// Canned hunspell answers for the mock below, keyed by the single word that is
// sent to hunspell. The words must match the text of the test documents
// csaf/csaf_2.0/test/validator/data/informative/oasis_csaf_tc-csaf_2_0-2021-6-3-08-11.json
// and csaf/csaf_2.0/test/validator/data/informative/oasis_csaf_tc-csaf_2_0-2021-6-3-08-01.json
//
// '*' is the answer for an accepted word, '# error' the answer for a word that
// is reported as misspelled. Every word is listed once: a Map keeps only the
// last value of a repeated key, so repeated entries would have no effect.
const hunspellMap = new Map([
  ['Security', '*'],
  ['researchers', '*'],
  ['found', '*'],
  ['multiple', '*'],
  ['vulnerabilities', '*'],
  ['in', '*'],
  ['XYZ', '*'],
  ['Secruity', '# error'],
  ['OASIS', '*'],
  ['CSAF', '*'],
  ['TC', '*'],
  ['Informative', '*'],
  ['test', '*'],
  ['Spell', '*'],
  ['check', '*'],
  ['valid', '*'],
  ['example', '*'],
  ['failing', '*'],
  ['1', '*'],
  ['Initial', '*'],
  ['version', '*'],
])

describe('Informative test 6.3.8', function () {
describe('failing examples', function () {
for (const [title, failingExample] of failingExamples) {
it(title, async function () {
const result = await informativeTest_6_3_8(failingExample, {
async hunspell() {
return 'Hunspell vMOCK\n\n# wrongword 1'
async hunspell({ dictionary, input }) {
const answer = hunspellMap.get(input)
if (answer) {
return 'Hunspell vMOCK\n\n' + answer
} else {
throw new Error('Hunspell vMOCK. Unknoww word ' + input)
}
},
})

Expand Down