avoid allocating hats to the first letter of a token

We could get much fancier than this, but after running this with a day it appears to help some, and it is nice and simple. I propose that we declare that it fixes cursorless-dev#1658, at least for now.
josharian · Aug 2, 2023 · 0385b48 · 0385b48
1 parent 342a771
commit 0385b48
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 8 deletions.
diff --git a/packages/cursorless-engine/src/tokenGraphemeSplitter/tokenGraphemeSplitter.ts b/packages/cursorless-engine/src/tokenGraphemeSplitter/tokenGraphemeSplitter.ts
@@ -119,12 +119,26 @@ export class TokenGraphemeSplitter {
    * @param token The token to split
    * @returns A list of normalised graphemes in {@link token}
    */
-  getTokenGraphemes = (token: string): Grapheme[] =>
-    matchAll<Grapheme>(token, GRAPHEME_SPLIT_REGEX, (match) => ({
-      text: this.normalizeGrapheme(match[0]),
-      tokenStartOffset: match.index!,
-      tokenEndOffset: match.index! + match[0].length,
-    }));
+  getTokenGraphemes = (token: string): Grapheme[] => {
+    const graphemes = matchAll<Grapheme>(
+      token,
+      GRAPHEME_SPLIT_REGEX,
+      (match) => ({
+        text: this.normalizeGrapheme(match[0]),
+        tokenStartOffset: match.index!,
+        tokenEndOffset: match.index! + match[0].length,
+        isFirstLetterGrapheme: false,
+      }),
+    );
+    // iterate through the graphemes, marking the first letter
+    for (const grapheme of graphemes) {
+      if (grapheme.text.match(/[a-z]/)) {
+        grapheme.isFirstLetterGrapheme = true;
+        break;
+      }
+    }
+    return graphemes;
+  };
 
   /**
    * Normalizes the grapheme {@link rawGraphemeText} based on user
@@ -201,4 +215,12 @@ export interface Grapheme {
 
   /** The end offset of the grapheme within its containing token */
   tokenEndOffset: number;
+
+  /**
+   * Whether this grapheme is the first letter grapheme of the text
+   * See https://github.com/cursorless-dev/cursorless/issues/1658
+   * TODO: Consider instead whether this grapheme is the beginning of a word inside the text.
+   * This is more complicated, because the definition of a word can vary by language.
+   */
+  isFirstLetterGrapheme: boolean;
 }
diff --git a/packages/cursorless-engine/src/util/allocateHats/HatMetrics.ts b/packages/cursorless-engine/src/util/allocateHats/HatMetrics.ts
@@ -14,6 +14,12 @@ export type HatMetric = (hat: HatCandidate) => number;
  */
 export const negativePenalty: HatMetric = ({ penalty }) => -penalty;
 
+/**
+ * @returns A metric that penalizes graphemes that are the first letter of a token
+ */
+export const avoidFirstLetter: HatMetric = ({ grapheme }) =>
+  grapheme.isFirstLetterGrapheme ? -1 : 0;
+
 /**
  * @param hatOldTokenRanks A map from a hat candidate (grapheme+style combination) to the score of the
  * token that used the given hat in the previous hat allocation.

diff --git a/packages/cursorless-engine/src/util/allocateHats/allocateHats.ts b/packages/cursorless-engine/src/util/allocateHats/allocateHats.ts
@@ -151,7 +151,7 @@ function getTokenRemainingHatCandidates(
 }
 
 /**
- * @param token The token that recevied the hat
+ * @param token The token that received the hat
  * @param chosenHat The hat we chose for the token
  * @returns An object indicating the hat assigned to the token, along with the
  * range of the grapheme upon which it sits

diff --git a/packages/cursorless-engine/src/util/allocateHats/chooseTokenHat.ts b/packages/cursorless-engine/src/util/allocateHats/chooseTokenHat.ts
@@ -2,6 +2,7 @@ import { HatStability, TokenHat } from "@cursorless/common";
 import { HatCandidate } from "./allocateHats";
 import { RankingContext } from "./getHatRankingContext";
 import {
+  avoidFirstLetter,
   hatOldTokenRank,
   isOldTokenHat,
   minimumTokenRankContainingGrapheme,
@@ -71,7 +72,10 @@ export function chooseTokenHat(
     // 4. Narrow to the hats with the lowest penalty
     negativePenalty,
 
-    // 5. Prefer hats that sit on a grapheme that doesn't appear in any highly
+    // 5. Avoid the first grapheme of the token if possible
+    avoidFirstLetter,
+
+    // 6. Prefer hats that sit on a grapheme that doesn't appear in any highly
     //    ranked token
     minimumTokenRankContainingGrapheme(tokenRank, graphemeTokenRanks),
   ])!;