feat(core): condense multi-token Latin words and phrases #473

Open
wants to merge 4 commits into base: master
3 changes: 3 additions & 0 deletions harper-core/dictionary.dict
@@ -49774,3 +49774,6 @@ a8c/SM
a11n/1
a12s/9
intergenerational
etc.
vs.
et al.
125 changes: 67 additions & 58 deletions harper-core/src/document.rs
@@ -5,7 +5,9 @@ use std::fmt::Display;
use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{PatternExt, RepeatingPattern, SequencePattern};
use crate::patterns::{
DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
@@ -105,12 +107,13 @@ impl Document {
/// Should be run after every change to the underlying [`Self::source`].
fn parse(&mut self, dictionary: &impl Dictionary) {
self.condense_spaces();
self.condense_ellipsis();
self.condense_newlines();
self.newlines_to_breaks();
self.condense_contractions();
self.condense_dotted_initialisms();
self.condense_number_suffixes();
self.condense_ellipsis();
self.condense_latin();
self.match_quotes();

for token in self.tokens.iter_mut() {
@@ -326,6 +329,49 @@ impl Document {
self.tokens.remove_indices(remove_these);
}

thread_local! {
static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
}

fn uncached_latin_pattern() -> Lrc<EitherPattern> {
Lrc::new(EitherPattern::new(vec![
Box::new(
SequencePattern::default()
.then_word_set(WordSet::all(&["etc", "vs"]))
.then_period(),
),
Box::new(
SequencePattern::aco("et")
.then_whitespace()
.t_aco("al")
.then_period(),
),
]))
}

/// Assumes that the first matched token is the canonical one to be condensed into.
/// Takes a callback that can be used to retroactively edit the canonical token afterwards.
fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
where
F: Fn(&mut Token),
{
let matches = pattern.find_all_matches_in_doc(self);

let mut remove_indices = VecDeque::with_capacity(matches.len());

for m in matches {
remove_indices.extend(m.start + 1..m.end);
self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
edit(&mut self.tokens[m.start]);
}

self.tokens.remove_indices(remove_indices);
}

fn condense_latin(&mut self) {
self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
}

/// Searches for multiple sequential newline tokens and condenses them down
/// into one.
fn condense_newlines(&mut self) {
@@ -414,76 +460,39 @@ impl Document {

fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
let period = SequencePattern::default().then_period();
Lrc::new(RepeatingPattern::new(Box::new(period)))
Lrc::new(RepeatingPattern::new(Box::new(period), 2))
}

thread_local! {
static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
}

fn condense_ellipsis(&mut self) {
let found = Self::ELLIPSIS_PATTERN
.with(|v| v.clone())
.find_all_matches(&self.tokens, &self.source);
let mut to_remove = VecDeque::new();

for found_slice in found {
if found_slice.len() <= 1 {
continue;
}

let found_toks = &mut self.tokens[found_slice.start..found_slice.end];

let end_char = found_toks.last().unwrap().span.end;
let first = found_toks.first_mut().unwrap();
first.kind = TokenKind::Punctuation(Punctuation::Ellipsis);
first.span.end = end_char;
for i in found_slice.start + 1..found_slice.end {
to_remove.push_back(i)
}
}
let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
self.condense_pattern(&pattern, |tok| {
tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
});
}

fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
Lrc::new(
SequencePattern::default()
.then_any_word()
.then_apostrophe()
.then_any_word(),
)
}

self.tokens.remove_indices(to_remove);
thread_local! {
static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
}

/// Searches for contractions and condenses them down into single
/// tokens.
fn condense_contractions(&mut self) {
if self.tokens.len() < 3 {
return;
}

// Indices of the three token stretches we are going to condense.
let mut replace_starts = Vec::new();

for idx in 0..self.tokens.len() - 2 {
let a = self.tokens[idx];
let b = self.tokens[idx + 1];
let c = self.tokens[idx + 2];

if matches!(
(a.kind, b.kind, c.kind),
(
TokenKind::Word(..),
TokenKind::Punctuation(Punctuation::Apostrophe),
TokenKind::Word(..)
)
) {
// Ensure there is no overlapping between replacements
let should_replace = if let Some(last_idx) = replace_starts.last() {
*last_idx < idx - 2
} else {
true
};

if should_replace {
replace_starts.push(idx);
self.tokens[idx].span.end = c.span.end;
}
}
}
let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

self.condense_indices(&replace_starts, 3);
self.condense_pattern(&pattern, |_| {});
}
}

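For illustration only, here is a minimal sketch of what the new condensing pass means for callers, written in the style of harper-core's existing tests. It uses only items visible in this diff (Document::new_plain_english_curated and get_tokens); the before/after token counts are assumptions about how the plain-English parser tokenizes these strings, not something the PR itself asserts.

#[test]
fn latin_phrases_condense_to_single_tokens() {
    // Assumed tokenization: "etc." previously lexed as two tokens ("etc" + ".")
    // and "et al." as four ("et", space, "al", "."). With condense_latin in the
    // parse pipeline, each phrase should collapse into one token whose span
    // covers the whole phrase.
    let etc = Document::new_plain_english_curated("etc.");
    assert_eq!(etc.get_tokens().len(), 1);

    let et_al = Document::new_plain_english_curated("et al.");
    assert_eq!(et_al.get_tokens().len(), 1);
}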
4 changes: 1 addition & 3 deletions harper-core/src/patterns/mod.rs
@@ -67,8 +67,6 @@
return found;
}

found.sort_by_key(|s| s.start);

let mut remove_indices = VecDeque::new();

for i in 0..found.len() - 1 {
@@ -127,7 +125,7 @@
}
}

trait DocPattern {
pub trait DocPattern {
fn find_all_matches_in_doc(&self, document: &Document) -> Vec<Span>;
}

26 changes: 22 additions & 4 deletions harper-core/src/patterns/repeating_pattern.rs
@@ -6,25 +6,35 @@ use crate::Token;
/// Somewhat reminiscent of the `.*` operator in Regex.
pub struct RepeatingPattern {
inner: Box<dyn Pattern>,
required_repetitions: usize,
}

impl RepeatingPattern {
pub fn new(pattern: Box<dyn Pattern>) -> Self {
Self { inner: pattern }
pub fn new(pattern: Box<dyn Pattern>, required_repetitions: usize) -> Self {
Self {
inner: pattern,
required_repetitions,
}
}
}

impl Pattern for RepeatingPattern {
fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
let mut tok_cursor = 0;
let mut repetition = 0;

loop {
let match_len = self.inner.matches(&tokens[tok_cursor..], source);

if match_len == 0 {
return tok_cursor;
if repetition >= self.required_repetitions {
return tok_cursor;
} else {
return 0;
}
} else {
tok_cursor += match_len;
repetition += 1;
}
}
}
@@ -41,11 +51,19 @@ mod tests {
let doc = Document::new_plain_english_curated(
"This matcher will match the entirety of any document!",
);
let pat = RepeatingPattern::new(Box::new(AnyPattern));
let pat = RepeatingPattern::new(Box::new(AnyPattern), 0);

assert_eq!(
pat.matches(doc.get_tokens(), doc.get_source()),
doc.get_tokens().len()
)
}

#[test]
fn does_not_match_short() {
let doc = Document::new_plain_english_curated("No match");
let pat = RepeatingPattern::new(Box::new(AnyPattern), 4);

assert_eq!(pat.matches(doc.get_tokens(), doc.get_source()), 0)
}
}
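As a quick illustration of the new required_repetitions argument, here is a sketch in the same style as the tests above, reusing AnyPattern, Document, and the Pattern::matches signature from this file: the pattern still consumes as many repetitions as it can, but it now reports no match at all when fewer than the required minimum are found. The example sentence and its token count are assumptions made for illustration.

#[test]
fn requires_minimum_repetitions() {
    let doc = Document::new_plain_english_curated("One two three");

    // A low minimum behaves like before: the repeated AnyPattern consumes
    // every token in the document.
    let pat = RepeatingPattern::new(Box::new(AnyPattern), 2);
    assert_eq!(
        pat.matches(doc.get_tokens(), doc.get_source()),
        doc.get_tokens().len()
    );

    // Asking for more repetitions than there are tokens yields no match (0).
    let pat = RepeatingPattern::new(Box::new(AnyPattern), 100);
    assert_eq!(pat.matches(doc.get_tokens(), doc.get_source()), 0);
}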
2 changes: 1 addition & 1 deletion harper-core/src/patterns/sequence_pattern.rs
@@ -199,7 +199,7 @@ impl SequencePattern {

pub fn then_one_or_more(mut self, pat: Box<dyn Pattern>) -> Self {
self.token_patterns
.push(Box::new(RepeatingPattern::new(pat)));
.push(Box::new(RepeatingPattern::new(pat, 0)));
self
}

2 changes: 2 additions & 0 deletions harper-core/tests/run_tests.rs
@@ -48,6 +48,8 @@ create_test!(amazon_hostname.md, 0);
create_test!(issue_159.md, 1);
create_test!(issue_358.md, 0);
create_test!(issue_195.md, 0);
create_test!(issue_118.md, 0);
create_test!(lots_of_latin.md, 0);
create_test!(pr_452.md, 2);

// Make sure it doesn't panic
1 change: 1 addition & 0 deletions harper-core/tests/test_sources/issue_118.md
@@ -0,0 +1 @@
This is another sentence that says item 1, item 2, etc. in the middle of the sentence.
3 changes: 3 additions & 0 deletions harper-core/tests/test_sources/lots_of_latin.md
@@ -0,0 +1,3 @@
We had some issues with correctly parsing certain Latin terms.

It caused issues with phrases like, "it was Mike Tyson vs. Weird Al!" and "Mike Tyson et al. wrote this paper," etc., especially for scientific papers.