From 6573a3a02aaaf0e3a097d1f49f7fd82037f8c8c1 Mon Sep 17 00:00:00 2001
From: Alejandro Celaya <alejandro.celaya@hypothes.is>
Date: Thu, 6 Feb 2025 15:17:31 +0100
Subject: [PATCH] Ignore punctuation characters when looking for mentions

---
 src/sidebar/components/MarkdownEditor.tsx     |   8 +-
 src/sidebar/helpers/mentions.ts               |  64 +++++++-
 src/sidebar/helpers/test/mentions-test.js     | 148 +++++++++++++++++-
 src/sidebar/util/term-before-position.ts      |  43 -----
 .../util/test/term-before-position-test.js    |  90 -----------
 5 files changed, 207 insertions(+), 146 deletions(-)
 delete mode 100644 src/sidebar/util/term-before-position.ts
 delete mode 100644 src/sidebar/util/test/term-before-position-test.js

diff --git a/src/sidebar/components/MarkdownEditor.tsx b/src/sidebar/components/MarkdownEditor.tsx
index 809c7e9a60c..52a7d7fd2a0 100644
--- a/src/sidebar/components/MarkdownEditor.tsx
+++ b/src/sidebar/components/MarkdownEditor.tsx
@@ -29,6 +29,10 @@ import {
 } from 'preact/hooks';
 
 import { isMacOS } from '../../shared/user-agent';
+import {
+  getContainingWordOffsets,
+  termBeforePosition,
+} from '../helpers/mentions';
 import {
   LinkType,
   convertSelectionToLink,
@@ -36,10 +40,6 @@ import {
   toggleSpanStyle,
 } from '../markdown-commands';
 import type { EditorState } from '../markdown-commands';
-import {
-  getContainingWordOffsets,
-  termBeforePosition,
-} from '../util/term-before-position';
 import MarkdownView from './MarkdownView';
 import MentionPopover from './MentionPopover';
 
diff --git a/src/sidebar/helpers/mentions.ts b/src/sidebar/helpers/mentions.ts
index a09cdca1846..e2e39b1a493 100644
--- a/src/sidebar/helpers/mentions.ts
+++ b/src/sidebar/helpers/mentions.ts
@@ -11,19 +11,24 @@ import { buildAccountID } from './account-id';
  */
 export function wrapMentions(text: string, authority: string): string {
   return text.replace(
-    // Capture both the potential empty character before the mention (space, tab
-    // or new line), and the term following the `@` character.
-    // When we build the mention tag, we need to prepend that empty character to
+    // Capture both the potential empty char (space, tab or new line) or
+    // punctuation char before the mention, and the term following the `@`
+    // character.
+    // When we build the mention tag, we need to prepend that prev character to
     // avoid altering the spacing and structure of the text.
-    /(^|\s)@(\w+)(?=\s|$)/g,
-    (match, precedingWhitespace, username) => {
+    //
+    // To match the username, we only look for `A-Za-z0-9._` characters, which
+    // is what the server allows.
+    // See: https://github.com/hypothesis/h/blob/b8d0d4c/h/schemas/api/user.py#L21
+    /(^|[\s,.;:|?!'"\-()[\]{}])@([A-Za-z0-9._]+)(?=[\s,.;:|?!'"\-()[\]{}]|$)/g,
+    (match, precedingChar, username) => {
       const tag = document.createElement('a');
 
       tag.setAttribute('data-hyp-mention', '');
       tag.setAttribute('data-userid', buildAccountID(username, authority));
       tag.textContent = `@${username}`;
 
-      return `${precedingWhitespace}${tag.outerHTML}`;
+      return `${precedingChar}${tag.outerHTML}`;
     },
   );
 }
@@ -115,3 +120,50 @@ export function renderMentionTags(
 
   return foundMentions;
 }
+
+/**
+ * Returns the word at a specific position in a string, surrounded by empty
+ * characters or punctuation characters.
+ */
+export function termBeforePosition(text: string, position: number): string {
+  const { start } = getContainingWordOffsets(text, position);
+  return text.slice(start, position);
+}
+
+export type WordOffsets = {
+  start: number;
+  end: number;
+};
+
+/**
+ * Returns the `start` and `end` positions for the word that overlaps with
+ * provided reference position.
+ *
+ * For example, given the text "hello hypothesis", and the reference position 9
+ * (which corresponds to the `p` character) it will return the start and end of
+ * the word `hypothesis`, hence { start: 6, end: 16 }.
+ *
+ * Useful to get the offsets of the word matching the caret position in text
+ * inputs and textareas.
+ */
+export function getContainingWordOffsets(
+  text: string,
+  referencePosition: number,
+): WordOffsets {
+  const precedingText = text.slice(0, referencePosition);
+  const matches = [...precedingText.matchAll(/[\s,.;:|?!'"\-()[\]{}]/g)];
+  const precedingCharPos =
+    matches.length > 0 ? Math.max(...matches.map(match => match.index)) : -1;
+
+  const subsequentCharPos = text
+    .slice(referencePosition)
+    .search(/[\s,.;:|?!'"\-()[\]{}]/);
+
+  return {
+    start: precedingCharPos + 1,
+    end:
+      subsequentCharPos === -1
+        ? text.length
+        : referencePosition + subsequentCharPos,
+  };
+}
diff --git a/src/sidebar/helpers/test/mentions-test.js b/src/sidebar/helpers/test/mentions-test.js
index 802a3aebc1c..5ca676a8bb3 100644
--- a/src/sidebar/helpers/test/mentions-test.js
+++ b/src/sidebar/helpers/test/mentions-test.js
@@ -1,4 +1,10 @@
-import { renderMentionTags, unwrapMentions, wrapMentions } from '../mentions';
+import {
+  renderMentionTags,
+  unwrapMentions,
+  wrapMentions,
+  getContainingWordOffsets,
+  termBeforePosition,
+} from '../mentions';
 
 const mentionTag = (username, authority) =>
   `<a data-hyp-mention="" data-userid="acct:${username}@${authority}">@${username}</a>`;
@@ -40,9 +46,21 @@ look at ${mentionTag('foo', 'example.com')} comment`,
   },
   // Multiple mentions
   {
-    text: 'Hey @jane look at this quote from @rob',
+    text: 'Hey @jane, look at this quote from @rob',
     authority: 'example.com',
-    textWithTags: `Hey ${mentionTag('jane', 'example.com')} look at this quote from ${mentionTag('rob', 'example.com')}`,
+    textWithTags: `Hey ${mentionTag('jane', 'example.com')}, look at this quote from ${mentionTag('rob', 'example.com')}`,
+  },
+  // Mentions wrapped in punctuation chars
+  {
+    text: '(@jane) {@rob} and @john?',
+    authority: 'example.com',
+    textWithTags: `(${mentionTag('jane', 'example.com')}) {${mentionTag('rob', 'example.com')}} and ${mentionTag('john', 'example.com')}?`,
+  },
+  // username-like patterns with invalid chars should be ignored
+  {
+    text: 'Hello @not+a/user=name',
+    authority: 'example.com',
+    textWithTags: `Hello @not+a/user=name`,
   },
   // Email addresses should be ignored
   {
@@ -119,3 +137,127 @@ describe('renderMentionTags', () => {
     assert.equal(fourthMention, '@user_id_missing');
   });
 });
+
+// To make these tests more predictable, we place the `$` sign in the position
+// to be checked. That way it's easier to see what is the "word" preceding it.
+// The test will then get the `$` sign index and remove it from the text
+// before passing it to `termBeforePosition`.
+[
+  // First and last positions
+  {
+    text: '$Hello world',
+    expectedTerm: '',
+    expectedOffsets: { start: 0, end: 5 },
+  },
+  {
+    text: 'Hello world$',
+    expectedTerm: 'world',
+    expectedOffsets: { start: 6, end: 11 },
+  },
+
+  // Position in the middle of words
+  {
+    text: 'Hell$o world',
+    expectedTerm: 'Hell',
+    expectedOffsets: { start: 0, end: 5 },
+  },
+  {
+    text: 'Hello wor$ld',
+    expectedTerm: 'wor',
+    expectedOffsets: { start: 6, end: 11 },
+  },
+
+  // Position preceded by "empty space"
+  {
+    text: 'Hello $world',
+    expectedTerm: '',
+    expectedOffsets: { start: 6, end: 11 },
+  },
+  {
+    text: `Text with
+      multiple
+      $
+      lines
+      `,
+    expectedTerm: '',
+    expectedOffsets: { start: 31, end: 31 },
+  },
+
+  // Position preceded by/in the middle of a word for multi-line text
+  {
+    text: `Text with$
+      multiple
+
+      lines
+      `,
+    expectedTerm: 'with',
+    expectedOffsets: { start: 5, end: 9 },
+  },
+  {
+    text: `Text with
+      multiple
+
+      li$nes
+      `,
+    expectedTerm: 'li',
+    expectedOffsets: { start: 32, end: 37 },
+  },
+
+  // Including punctuation characters
+  ...[
+    ',',
+    '.',
+    ';',
+    ':',
+    '|',
+    '?',
+    '!',
+    "'",
+    '"',
+    '-',
+    '(',
+    ')',
+    '[',
+    ']',
+    '{',
+    '}',
+  ].flatMap(char => [
+    {
+      text: `Foo${char}$ bar`,
+      expectedTerm: '',
+      expectedOffsets: { start: 4, end: 4 },
+    },
+    {
+      text: `${char}Foo$ bar`,
+      expectedTerm: 'Foo',
+      expectedOffsets: { start: 1, end: 4 },
+    },
+    {
+      text: `hello ${char}fo$o${char} bar`,
+      expectedTerm: 'fo',
+      expectedOffsets: { start: 7, end: 10 },
+    },
+  ]),
+].forEach(({ text, expectedTerm, expectedOffsets }) => {
+  // Get the position of the `$` sign in the text, then remove it
+  const position = text.indexOf('$');
+  const textWithoutDollarSign = text.replace('$', '');
+
+  describe('termBeforePosition', () => {
+    it('returns the term right before provided position', () => {
+      assert.equal(
+        termBeforePosition(textWithoutDollarSign, position),
+        expectedTerm,
+      );
+    });
+  });
+
+  describe('getContainingWordOffsets', () => {
+    it('returns expected offsets', () => {
+      assert.deepEqual(
+        getContainingWordOffsets(textWithoutDollarSign, position),
+        expectedOffsets,
+      );
+    });
+  });
+});
diff --git a/src/sidebar/util/term-before-position.ts b/src/sidebar/util/term-before-position.ts
deleted file mode 100644
index 4540dec916f..00000000000
--- a/src/sidebar/util/term-before-position.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Returns the "word" right before a specific position in an input string.
- *
- * In this context, a word is anything between a space, newline or tab, and
- * provided position.
- */
-export function termBeforePosition(text: string, position: number): string {
-  return text.slice(0, position).match(/\S+$/)?.[0] ?? '';
-}
-
-export type WordOffsets = {
-  start: number;
-  end: number;
-};
-
-/**
- * Returns the `start` and `end` positions for the word that overlaps with
- * provided reference position.
- *
- * For example, given the text "hello hypothesis", and the reference position 9
- * (which corresponds to the `p` character) it will return the start and end of
- * the word `hypothesis`, hence { start: 6, end: 16 }.
- *
- * Useful to get the offsets of the word matching the caret position in text
- * inputs and textareas.
- */
-export function getContainingWordOffsets(
-  text: string,
-  referencePosition: number,
-): WordOffsets {
-  const precedingEmptyCharPos = text
-    .slice(0, referencePosition)
-    .search(/\s\S*$/);
-  const subsequentEmptyCharPos = text.slice(referencePosition).search(/\s/);
-
-  return {
-    start: (precedingEmptyCharPos === -1 ? -1 : precedingEmptyCharPos) + 1,
-    end:
-      subsequentEmptyCharPos === -1
-        ? text.length
-        : referencePosition + subsequentEmptyCharPos,
-  };
-}
diff --git a/src/sidebar/util/test/term-before-position-test.js b/src/sidebar/util/test/term-before-position-test.js
deleted file mode 100644
index f1f6a8c9a01..00000000000
--- a/src/sidebar/util/test/term-before-position-test.js
+++ /dev/null
@@ -1,90 +0,0 @@
-import {
-  getContainingWordOffsets,
-  termBeforePosition,
-} from '../term-before-position';
-
-describe('term-before-position', () => {
-  // To make these tests more predictable, we place the `$` sign in the position
-  // to be checked. That way it's easier to see what is the "word" preceding it.
-  // The test will then get the `$` sign index and remove it from the text
-  // before passing it to `termBeforePosition`.
-  [
-    // First and last positions
-    {
-      text: '$Hello world',
-      expectedTerm: '',
-      expectedOffsets: { start: 0, end: 5 },
-    },
-    {
-      text: 'Hello world$',
-      expectedTerm: 'world',
-      expectedOffsets: { start: 6, end: 11 },
-    },
-
-    // Position in the middle of words
-    {
-      text: 'Hell$o world',
-      expectedTerm: 'Hell',
-      expectedOffsets: { start: 0, end: 5 },
-    },
-    {
-      text: 'Hello wor$ld',
-      expectedTerm: 'wor',
-      expectedOffsets: { start: 6, end: 11 },
-    },
-
-    // Position preceded by "empty space"
-    {
-      text: 'Hello $world',
-      expectedTerm: '',
-      expectedOffsets: { start: 6, end: 11 },
-    },
-    {
-      text: `Text with
-      multiple
-      $
-      lines
-      `,
-      expectedTerm: '',
-      expectedOffsets: { start: 31, end: 31 },
-    },
-
-    // Position preceded by/in the middle of a word for multi-line text
-    {
-      text: `Text with$
-      multiple
-
-      lines
-      `,
-      expectedTerm: 'with',
-      expectedOffsets: { start: 5, end: 9 },
-    },
-    {
-      text: `Text with
-      multiple
-
-      li$nes
-      `,
-      expectedTerm: 'li',
-      expectedOffsets: { start: 32, end: 37 },
-    },
-  ].forEach(({ text, expectedTerm, expectedOffsets }) => {
-    // Get the position of the `$` sign in the text, then remove it
-    const position = text.indexOf('$');
-    const textWithoutDollarSign = text.replace('$', '');
-
-    it('`termBeforePosition` returns the term right before provided position', () => {
-      assert.equal(
-        termBeforePosition(textWithoutDollarSign, position),
-        expectedTerm,
-      );
-    });
-
-    it('`getContainingWordOffsets` returns expected offsets', () => {
-      assert.deepEqual(
-        getContainingWordOffsets(textWithoutDollarSign, position),
-        expectedOffsets,
-      );
-    });
-  });
-});