feat(core): condense multi-token Latin words and phrases #473

Open
wants to merge 4 commits into base: master
3 changes: 3 additions & 0 deletions harper-core/dictionary.dict
@@ -49774,3 +49774,6 @@ a8c/SM
a11n/1
a12s/9
intergenerational
etc.
vs.
et al.
125 changes: 67 additions & 58 deletions harper-core/src/document.rs
@@ -5,7 +5,9 @@ use std::fmt::Display;
use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{PatternExt, RepeatingPattern, SequencePattern};
use crate::patterns::{
DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
@@ -105,12 +107,13 @@ impl Document {
/// Should be run after every change to the underlying [`Self::source`].
fn parse(&mut self, dictionary: &impl Dictionary) {
self.condense_spaces();
self.condense_ellipsis();
self.condense_newlines();
self.newlines_to_breaks();
self.condense_contractions();
self.condense_dotted_initialisms();
self.condense_number_suffixes();
self.condense_ellipsis();
self.condense_latin();
self.match_quotes();

for token in self.tokens.iter_mut() {
@@ -326,6 +329,49 @@ impl Document {
self.tokens.remove_indices(remove_these);
}

thread_local! {
static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
}

fn uncached_latin_pattern() -> Lrc<EitherPattern> {
Lrc::new(EitherPattern::new(vec![
Box::new(
SequencePattern::default()
.then_word_set(WordSet::all(&["etc", "vs"]))
.then_period(),
),
Box::new(
SequencePattern::aco("et")
.then_whitespace()
.t_aco("al")
.then_period(),
),
]))
}

/// Assumes that the first matched token is the canonical one to be condensed into.
/// Takes a callback that can be used to retroactively edit the canonical token afterwards.
fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
where
F: Fn(&mut Token),
{
let matches = pattern.find_all_matches_in_doc(self);

let mut remove_indices = VecDeque::with_capacity(matches.len());

for m in matches {
remove_indices.extend(m.start + 1..m.end);
self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
edit(&mut self.tokens[m.start]);
}

self.tokens.remove_indices(remove_indices);
}

fn condense_latin(&mut self) {
self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
}

/// Searches for multiple sequential newline tokens and condenses them down
/// into one.
fn condense_newlines(&mut self) {
@@ -414,76 +460,39 @@ impl Document {

fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
let period = SequencePattern::default().then_period();
Lrc::new(RepeatingPattern::new(Box::new(period)))
Lrc::new(RepeatingPattern::new(Box::new(period), 2))
}

thread_local! {
static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
}

fn condense_ellipsis(&mut self) {
let found = Self::ELLIPSIS_PATTERN
.with(|v| v.clone())
.find_all_matches(&self.tokens, &self.source);
let mut to_remove = VecDeque::new();

for found_slice in found {
if found_slice.len() <= 1 {
continue;
}

let found_toks = &mut self.tokens[found_slice.start..found_slice.end];

let end_char = found_toks.last().unwrap().span.end;
let first = found_toks.first_mut().unwrap();
first.kind = TokenKind::Punctuation(Punctuation::Ellipsis);
first.span.end = end_char;
for i in found_slice.start + 1..found_slice.end {
to_remove.push_back(i)
}
}
let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
self.condense_pattern(&pattern, |tok| {
tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
});
}

fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
Lrc::new(
SequencePattern::default()
.then_any_word()
.then_apostrophe()
.then_any_word(),
)
}

self.tokens.remove_indices(to_remove);
thread_local! {
static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
}

/// Searches for contractions and condenses them down into single
/// tokens.
fn condense_contractions(&mut self) {
if self.tokens.len() < 3 {
return;
}

// Indices of the three token stretches we are going to condense.
let mut replace_starts = Vec::new();

for idx in 0..self.tokens.len() - 2 {
let a = self.tokens[idx];
let b = self.tokens[idx + 1];
let c = self.tokens[idx + 2];

if matches!(
(a.kind, b.kind, c.kind),
(
TokenKind::Word(..),
TokenKind::Punctuation(Punctuation::Apostrophe),
TokenKind::Word(..)
)
) {
// Ensure there is no overlapping between replacements
let should_replace = if let Some(last_idx) = replace_starts.last() {
*last_idx < idx - 2
} else {
true
};

if should_replace {
replace_starts.push(idx);
self.tokens[idx].span.end = c.span.end;
}
}
}
let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

self.condense_indices(&replace_starts, 3);
self.condense_pattern(&pattern, |_| {});
}
}

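For illustration only, here is a minimal sketch of what the new condensing pass means for callers, written in the style of harper-core's existing tests. It uses only items visible in this diff (Document::new_plain_english_curated and get_tokens); the before/after token counts are assumptions about how the plain-English parser tokenizes these strings, not something the PR itself asserts.

#[test]
fn latin_phrases_condense_to_single_tokens() {
    // Assumed tokenization: "etc." previously lexed as two tokens ("etc" + ".")
    // and "et al." as four ("et", space, "al", "."). With condense_latin in the
    // parse pipeline, each phrase should collapse into one token whose span
    // covers the whole phrase.
    let etc = Document::new_plain_english_curated("etc.");
    assert_eq!(etc.get_tokens().len(), 1);

    let et_al = Document::new_plain_english_curated("et al.");
    assert_eq!(et_al.get_tokens().len(), 1);
}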
4 changes: 1 addition & 3 deletions harper-core/src/patterns/mod.rs
@@ -67,8 +67,6 @@
return found;
}

found.sort_by_key(|s| s.start);

let mut remove_indices = VecDeque::new();

for i in 0..found.len() - 1 {
@@ -127,7 +125,7 @@
}
}

trait DocPattern {
pub trait DocPattern {
fn find_all_matches_in_doc(&self, document: &Document) -> Vec<Span>;
}

26 changes: 22 additions & 4 deletions harper-core/src/patterns/repeating_pattern.rs
@@ -6,25 +6,35 @@ use crate::Token;
/// Somewhat reminiscent of the `.*` operator in Regex.
pub struct RepeatingPattern {
inner: Box<dyn Pattern>,
required_repetitions: usize,
}

impl RepeatingPattern {
pub fn new(pattern: Box<dyn Pattern>) -> Self {
Self { inner: pattern }
pub fn new(pattern: Box<dyn Pattern>, required_repetitions: usize) -> Self {
Self {
inner: pattern,
required_repetitions,
}
}
}

impl Pattern for RepeatingPattern {
fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
let mut tok_cursor = 0;
let mut repetition = 0;

loop {
let match_len = self.inner.matches(&tokens[tok_cursor..], source);

if match_len == 0 {
return tok_cursor;
if repetition >= self.required_repetitions {
return tok_cursor;
} else {
return 0;
}
} else {
tok_cursor += match_len;
repetition += 1;
}
}
}
@@ -41,11 +51,19 @@ mod tests {
let doc = Document::new_plain_english_curated(
"This matcher will match the entirety of any document!",
);
let pat = RepeatingPattern::new(Box::new(AnyPattern));
let pat = RepeatingPattern::new(Box::new(AnyPattern), 0);

assert_eq!(
pat.matches(doc.get_tokens(), doc.get_source()),
doc.get_tokens().len()
)
}

#[test]
fn does_not_match_short() {
let doc = Document::new_plain_english_curated("No match");
let pat = RepeatingPattern::new(Box::new(AnyPattern), 4);

assert_eq!(pat.matches(doc.get_tokens(), doc.get_source()), 0)
}
}
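As a quick illustration of the new required_repetitions argument, here is a sketch in the same style as the tests above, reusing AnyPattern, Document, and the Pattern::matches signature from this file: the pattern still consumes as many repetitions as it can, but it now reports no match at all when fewer than the required minimum are found. The example sentence and its token count are assumptions made for illustration.

#[test]
fn requires_minimum_repetitions() {
    let doc = Document::new_plain_english_curated("One two three");

    // A low minimum behaves like before: the repeated AnyPattern consumes
    // every token in the document.
    let pat = RepeatingPattern::new(Box::new(AnyPattern), 2);
    assert_eq!(
        pat.matches(doc.get_tokens(), doc.get_source()),
        doc.get_tokens().len()
    );

    // Asking for more repetitions than there are tokens yields no match (0).
    let pat = RepeatingPattern::new(Box::new(AnyPattern), 100);
    assert_eq!(pat.matches(doc.get_tokens(), doc.get_source()), 0);
}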
2 changes: 1 addition & 1 deletion harper-core/src/patterns/sequence_pattern.rs
@@ -199,7 +199,7 @@ impl SequencePattern {

pub fn then_one_or_more(mut self, pat: Box<dyn Pattern>) -> Self {
self.token_patterns
.push(Box::new(RepeatingPattern::new(pat)));
.push(Box::new(RepeatingPattern::new(pat, 0)));
self
}

2 changes: 2 additions & 0 deletions harper-core/tests/run_tests.rs
@@ -48,6 +48,8 @@ create_test!(amazon_hostname.md, 0);
create_test!(issue_159.md, 1);
create_test!(issue_358.md, 0);
create_test!(issue_195.md, 0);
create_test!(issue_118.md, 0);
create_test!(lots_of_latin.md, 0);
create_test!(pr_452.md, 2);

// Make sure it doesn't panic
1 change: 1 addition & 0 deletions harper-core/tests/test_sources/issue_118.md
@@ -0,0 +1 @@
This is another sentence that says item 1, item 2, etc. in the middle of the sentence.
3 changes: 3 additions & 0 deletions harper-core/tests/test_sources/lots_of_latin.md
@@ -0,0 +1,3 @@
We had some issues with correctly parsing certain Latin terms.

It caused issues with phrases like, "it was Mike Tyson vs. Weird Al!" and "Mike Tyson et al. wrote this paper," etc., especially for scientific papers.