From 6f716039d3c2b295e8b35e2f4a6a0a5afecb4288 Mon Sep 17 00:00:00 2001 From: Moonbase59 Date: Tue, 21 Nov 2023 16:41:29 +0100 Subject: [PATCH] Remove &shy; &zwj; &zwnj; from page before comparison so words with soft hyphens or ligature control characters can be compared and found. Since we're dealing with Markdown/HTML text, we must check for and remove all usual variants of coding these: &name; - &#decimal; - &#xhex; - \x{unicode} --- classes/GravTNTSearch.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/classes/GravTNTSearch.php b/classes/GravTNTSearch.php index 864856d..73203ff 100644 --- a/classes/GravTNTSearch.php +++ b/classes/GravTNTSearch.php @@ -214,7 +214,13 @@ public static function getCleanContent($page) $content = strip_tags($content); $content = preg_replace(['/[ \t]+/', '/\s*$^\s*/m'], [' ', "\n"], $content) ?? $content; - + // 2023-11-21 MCH - Remove some in-word Unicode that regularly breaks searches + $problematic = [ + '/­/i', '/­/', '/­/i', '/\x{00AD}/u', // soft hyphen + '/‍/i', '/‍/', '/‍/i', '/\x{200D}/u', // zero-width joiner + '/‌/i', '/‌/', '/‌/i', '/\x{200C}/u', // zero-width non-joiner + ]; + $content = preg_replace($problematic, '', $content) ?? $content; // Restore active page in Grav. unset($grav['page']); $grav['page'] = $activePage;