From 0385b48037d9fe1cc96d6c4238e93f9f9d975ab3 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Tue, 1 Aug 2023 17:29:14 -0700 Subject: [PATCH] avoid allocating hats to the first letter of a token We could get much fancier than this, but after running this with a day it appears to help some, and it is nice and simple. I propose that we declare that it fixes #1658, at least for now. --- .../tokenGraphemeSplitter.ts | 34 +++++++++++++++---- .../src/util/allocateHats/HatMetrics.ts | 6 ++++ .../src/util/allocateHats/allocateHats.ts | 2 +- .../src/util/allocateHats/chooseTokenHat.ts | 6 +++- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/packages/cursorless-engine/src/tokenGraphemeSplitter/tokenGraphemeSplitter.ts b/packages/cursorless-engine/src/tokenGraphemeSplitter/tokenGraphemeSplitter.ts index 116bcd75e5f..128b2ad7fb5 100644 --- a/packages/cursorless-engine/src/tokenGraphemeSplitter/tokenGraphemeSplitter.ts +++ b/packages/cursorless-engine/src/tokenGraphemeSplitter/tokenGraphemeSplitter.ts @@ -119,12 +119,26 @@ export class TokenGraphemeSplitter { * @param token The token to split * @returns A list of normalised graphemes in {@link token} */ - getTokenGraphemes = (token: string): Grapheme[] => - matchAll(token, GRAPHEME_SPLIT_REGEX, (match) => ({ - text: this.normalizeGrapheme(match[0]), - tokenStartOffset: match.index!, - tokenEndOffset: match.index! + match[0].length, - })); + getTokenGraphemes = (token: string): Grapheme[] => { + const graphemes = matchAll( + token, + GRAPHEME_SPLIT_REGEX, + (match) => ({ + text: this.normalizeGrapheme(match[0]), + tokenStartOffset: match.index!, + tokenEndOffset: match.index! + match[0].length, + isFirstLetterGrapheme: false, + }), + ); + // iterate through the graphemes, marking the first letter + for (const grapheme of graphemes) { + if (grapheme.text.match(/[a-z]/)) { + grapheme.isFirstLetterGrapheme = true; + break; + } + } + return graphemes; + }; /** * Normalizes the grapheme {@link rawGraphemeText} based on user @@ -201,4 +215,12 @@ export interface Grapheme { /** The end offset of the grapheme within its containing token */ tokenEndOffset: number; + + /** + * Whether this grapheme is the first letter grapheme of the text + * See https://github.com/cursorless-dev/cursorless/issues/1658 + * TODO: Consider instead whether this grapheme is the beginning of a word inside the text. + * This is more complicated, because the definition of a word can vary by language. + */ + isFirstLetterGrapheme: boolean; } diff --git a/packages/cursorless-engine/src/util/allocateHats/HatMetrics.ts b/packages/cursorless-engine/src/util/allocateHats/HatMetrics.ts index 3aae820a475..730bbb3ff7a 100644 --- a/packages/cursorless-engine/src/util/allocateHats/HatMetrics.ts +++ b/packages/cursorless-engine/src/util/allocateHats/HatMetrics.ts @@ -14,6 +14,12 @@ export type HatMetric = (hat: HatCandidate) => number; */ export const negativePenalty: HatMetric = ({ penalty }) => -penalty; +/** + * @returns A metric that penalizes graphemes that are the first letter of a token + */ +export const avoidFirstLetter: HatMetric = ({ grapheme }) => + grapheme.isFirstLetterGrapheme ? -1 : 0; + /** * @param hatOldTokenRanks A map from a hat candidate (grapheme+style combination) to the score of the * token that used the given hat in the previous hat allocation. diff --git a/packages/cursorless-engine/src/util/allocateHats/allocateHats.ts b/packages/cursorless-engine/src/util/allocateHats/allocateHats.ts index 5c22ffab461..64c7e1347f5 100644 --- a/packages/cursorless-engine/src/util/allocateHats/allocateHats.ts +++ b/packages/cursorless-engine/src/util/allocateHats/allocateHats.ts @@ -151,7 +151,7 @@ function getTokenRemainingHatCandidates( } /** - * @param token The token that recevied the hat + * @param token The token that received the hat * @param chosenHat The hat we chose for the token * @returns An object indicating the hat assigned to the token, along with the * range of the grapheme upon which it sits diff --git a/packages/cursorless-engine/src/util/allocateHats/chooseTokenHat.ts b/packages/cursorless-engine/src/util/allocateHats/chooseTokenHat.ts index c1e583677cd..5fc99e005c6 100644 --- a/packages/cursorless-engine/src/util/allocateHats/chooseTokenHat.ts +++ b/packages/cursorless-engine/src/util/allocateHats/chooseTokenHat.ts @@ -2,6 +2,7 @@ import { HatStability, TokenHat } from "@cursorless/common"; import { HatCandidate } from "./allocateHats"; import { RankingContext } from "./getHatRankingContext"; import { + avoidFirstLetter, hatOldTokenRank, isOldTokenHat, minimumTokenRankContainingGrapheme, @@ -71,7 +72,10 @@ export function chooseTokenHat( // 4. Narrow to the hats with the lowest penalty negativePenalty, - // 5. Prefer hats that sit on a grapheme that doesn't appear in any highly + // 5. Avoid the first grapheme of the token if possible + avoidFirstLetter, + + // 6. Prefer hats that sit on a grapheme that doesn't appear in any highly // ranked token minimumTokenRankContainingGrapheme(tokenRank, graphemeTokenRanks), ])!;