Ignore punctuation characters when trying to match mentions

hypothesis · Feb 10, 2025 · aa75693 · aa75693
1 parent 0a589c3
commit aa75693
Show file tree

Hide file tree

Showing 5 changed files with 226 additions and 157 deletions.
diff --git a/src/sidebar/components/MarkdownEditor.tsx b/src/sidebar/components/MarkdownEditor.tsx
@@ -29,17 +29,17 @@ import {
 } from 'preact/hooks';
 
 import { isMacOS } from '../../shared/user-agent';
+import {
+  getContainingMentionOffsets,
+  termBeforePosition,
+} from '../helpers/mentions';
 import {
   LinkType,
   convertSelectionToLink,
   toggleBlockStyle,
   toggleSpanStyle,
 } from '../markdown-commands';
 import type { EditorState } from '../markdown-commands';
-import {
-  getContainingWordOffsets,
-  termBeforePosition,
-} from '../util/term-before-position';
 import { getCaretCoordinates } from '../util/textarea-caret-position';
 import MarkdownView from './MarkdownView';
 import MentionPopover from './MentionPopover';
@@ -264,7 +264,7 @@ function TextArea({
     (suggestion: UserItem) => {
       const textarea = textareaRef.current!;
       const { value } = textarea;
-      const { start, end } = getContainingWordOffsets(
+      const { start, end } = getContainingMentionOffsets(
         value,
         textarea.selectionStart,
       );

diff --git a/src/sidebar/helpers/mentions.ts b/src/sidebar/helpers/mentions.ts
@@ -1,6 +1,19 @@
 import type { Mention } from '../../types/api';
 import { buildAccountID } from './account-id';
 
+// Pattern that matches characters treated as the boundary of a mention.
+const BOUNDARY_CHARS = String.raw`[\s,.;:|?!'"\-()[\]{}]`;
+
+// Pattern that matches Hypothesis usernames.
+// See https://github.com/hypothesis/h/blob/797d9a4/h/models/user.py#L25
+const USERNAME_PAT = '[A-Za-z0-9_][A-Za-z0-9._]+[A-Za-z0-9_]';
+
+// Pattern that finds user mentions in text.
+const MENTIONS_PAT = new RegExp(
+  `(^|${BOUNDARY_CHARS})@(${USERNAME_PAT})(?=${BOUNDARY_CHARS}|$)`,
+  'g',
+);
+
 /**
  * Wrap all occurrences of @mentions in provided text into the corresponding
  * special tag, as long as they are surrounded by "empty" space (space, tab, new
@@ -10,22 +23,15 @@ import { buildAccountID } from './account-id';
  *  `<a data-hyp-mention data-userid="acct:[email protected]">@someuser</a>`
  */
 export function wrapMentions(text: string, authority: string): string {
-  return text.replace(
-    // Capture both the potential empty character before the mention (space, tab
-    // or new line), and the term following the `@` character.
-    // When we build the mention tag, we need to prepend that empty character to
-    // avoid altering the spacing and structure of the text.
-    /(^|\s)@(\w+)(?=\s|$)/g,
-    (match, precedingWhitespace, username) => {
-      const tag = document.createElement('a');
-
-      tag.setAttribute('data-hyp-mention', '');
-      tag.setAttribute('data-userid', buildAccountID(username, authority));
-      tag.textContent = `@${username}`;
-
-      return `${precedingWhitespace}${tag.outerHTML}`;
-    },
-  );
+  return text.replace(MENTIONS_PAT, (match, precedingChar, username) => {
+    const tag = document.createElement('a');
+
+    tag.setAttribute('data-hyp-mention', '');
+    tag.setAttribute('data-userid', buildAccountID(username, authority));
+    tag.textContent = `@${username}`;
+
+    return `${precedingChar}${tag.outerHTML}`;
+  });
 }
 
 /**
@@ -115,3 +121,50 @@ export function renderMentionTags(
 
   return foundMentions;
 }
+
+/**
+ * Returns the word at a specific position in a string, surrounded by empty
+ * characters or punctuation characters.
+ */
+export function termBeforePosition(text: string, position: number): string {
+  const { start } = getContainingMentionOffsets(text, position);
+  return text.slice(start, position);
+}
+
+export type WordOffsets = {
+  start: number;
+  end: number;
+};
+
+/**
+ * Returns the `start` and `end` positions for the word or mention that overlaps
+ * with provided reference position.
+ *
+ * For example, given the text "hello @hypothesis", and the reference position 9
+ * (which corresponds to the `y` character) it will return the start and end of
+ * the `@hypothesis` mention, hence { start: 6, end: 17 }.
+ *
+ * Useful to get the offsets of the mention matching the caret position in text
+ * inputs and textareas.
+ */
+export function getContainingMentionOffsets(
+  text: string,
+  referencePosition: number,
+): WordOffsets {
+  const precedingText = text.slice(0, referencePosition);
+  const matches = [...precedingText.matchAll(new RegExp(BOUNDARY_CHARS, 'g'))];
+  const precedingCharPos =
+    matches.length > 0 ? Math.max(...matches.map(match => match.index)) : -1;
+
+  const subsequentCharPos = text
+    .slice(referencePosition)
+    .search(new RegExp(BOUNDARY_CHARS));
+
+  return {
+    start: precedingCharPos + 1,
+    end:
+      subsequentCharPos === -1
+        ? text.length
+        : referencePosition + subsequentCharPos,
+  };
+}
diff --git a/src/sidebar/helpers/test/mentions-test.js b/src/sidebar/helpers/test/mentions-test.js
@@ -1,4 +1,10 @@
-import { renderMentionTags, unwrapMentions, wrapMentions } from '../mentions';
+import {
+  renderMentionTags,
+  unwrapMentions,
+  wrapMentions,
+  getContainingMentionOffsets,
+  termBeforePosition,
+} from '../mentions';
 
 const mentionTag = (username, authority) =>
   `<a data-hyp-mention="" data-userid="acct:${username}@${authority}">@${username}</a>`;
@@ -40,16 +46,35 @@ look at ${mentionTag('foo', 'example.com')} comment`,
   },
   // Multiple mentions
   {
-    text: 'Hey @jane look at this quote from @rob',
+    text: 'Hey @jane, look at this quote from @rob',
     authority: 'example.com',
-    textWithTags: `Hey ${mentionTag('jane', 'example.com')} look at this quote from ${mentionTag('rob', 'example.com')}`,
+    textWithTags: `Hey ${mentionTag('jane', 'example.com')}, look at this quote from ${mentionTag('rob', 'example.com')}`,
+  },
+  // Mentions wrapped in punctuation chars
+  {
+    text: '(@jane) {@rob} and @john?',
+    authority: 'example.com',
+    textWithTags: `(${mentionTag('jane', 'example.com')}) {${mentionTag('rob', 'example.com')}} and ${mentionTag('john', 'example.com')}?`,
+  },
+  // username-like patterns with invalid chars should be ignored
+  {
+    text: 'Hello @not+a/user=name',
+    authority: 'example.com',
+    textWithTags: `Hello @not+a/user=name`,
   },
   // Email addresses should be ignored
   {
     text: 'Ignore email: [email protected]',
     authority: 'example.com',
     textWithTags: 'Ignore email: [email protected]',
   },
+  // Trailing dots should not be considered part of the mention, but dots
+  // in-between should
+  {
+    text: 'Hello @jane.doe.',
+    authority: 'example.com',
+    textWithTags: `Hello ${mentionTag('jane.doe', 'example.com')}.`,
+  },
 ].forEach(({ text, authority, textWithTags }) => {
   describe('wrapMentions', () => {
     it('wraps every mention in a mention tag', () => {
@@ -119,3 +144,127 @@ describe('renderMentionTags', () => {
     assert.equal(fourthMention, '@user_id_missing');
   });
 });
+
+// To make these tests more predictable, we place the `$` sign in the position
+// to be checked. That way it's easier to see what is the "word" preceding it.
+// The test will then get the `$` sign index and remove it from the text
+// before passing it to `termBeforePosition`.
+[
+  // First and last positions
+  {
+    text: '$Hello world',
+    expectedTerm: '',
+    expectedOffsets: { start: 0, end: 5 },
+  },
+  {
+    text: 'Hello world$',
+    expectedTerm: 'world',
+    expectedOffsets: { start: 6, end: 11 },
+  },
+
+  // Position in the middle of words
+  {
+    text: 'Hell$o world',
+    expectedTerm: 'Hell',
+    expectedOffsets: { start: 0, end: 5 },
+  },
+  {
+    text: 'Hello wor$ld',
+    expectedTerm: 'wor',
+    expectedOffsets: { start: 6, end: 11 },
+  },
+
+  // Position preceded by "empty space"
+  {
+    text: 'Hello $world',
+    expectedTerm: '',
+    expectedOffsets: { start: 6, end: 11 },
+  },
+  {
+    text: `Text with
+      multiple
+      $
+      lines
+      `,
+    expectedTerm: '',
+    expectedOffsets: { start: 31, end: 31 },
+  },
+
+  // Position preceded by/in the middle of a word for multi-line text
+  {
+    text: `Text with$
+      multiple
+
+      lines
+      `,
+    expectedTerm: 'with',
+    expectedOffsets: { start: 5, end: 9 },
+  },
+  {
+    text: `Text with
+      multiple
+
+      li$nes
+      `,
+    expectedTerm: 'li',
+    expectedOffsets: { start: 32, end: 37 },
+  },
+
+  // Including punctuation characters
+  ...[
+    ',',
+    '.',
+    ';',
+    ':',
+    '|',
+    '?',
+    '!',
+    "'",
+    '"',
+    '-',
+    '(',
+    ')',
+    '[',
+    ']',
+    '{',
+    '}',
+  ].flatMap(char => [
+    {
+      text: `Foo${char}$ bar`,
+      expectedTerm: '',
+      expectedOffsets: { start: 4, end: 4 },
+    },
+    {
+      text: `${char}Foo$ bar`,
+      expectedTerm: 'Foo',
+      expectedOffsets: { start: 1, end: 4 },
+    },
+    {
+      text: `hello ${char}fo$o${char} bar`,
+      expectedTerm: 'fo',
+      expectedOffsets: { start: 7, end: 10 },
+    },
+  ]),
+].forEach(({ text, expectedTerm, expectedOffsets }) => {
+  // Get the position of the `$` sign in the text, then remove it
+  const position = text.indexOf('$');
+  const textWithoutDollarSign = text.replace('$', '');
+
+  describe('termBeforePosition', () => {
+    it('returns the term right before provided position', () => {
+      assert.equal(
+        termBeforePosition(textWithoutDollarSign, position),
+        expectedTerm,
+      );
+    });
+  });
+
+  describe('getContainingMentionOffsets', () => {
+    it('returns expected offsets', () => {
+      assert.deepEqual(
+        getContainingMentionOffsets(textWithoutDollarSign, position),
+        expectedOffsets,
+      );
+    });
+  });
+});
diff --git a/src/sidebar/util/term-before-position.ts b/src/sidebar/util/term-before-position.ts