From b9d23ff93b37b8c2bc454ec259216124f20702c2 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Thu, 7 Mar 2024 17:58:35 +0100 Subject: [PATCH] split up lr1 constraint into exact and regular constraint --- src/grammar.rs | 14 +- text-utils-grammar/benches/benchmark.rs | 35 +- .../grammars/calc/examples/calc.txt | 2 +- text-utils-grammar/src/lib.rs | 5 +- text-utils-grammar/src/lr1.rs | 798 +++++++++++++----- text-utils-grammar/src/utils.rs | 16 +- 6 files changed, 646 insertions(+), 224 deletions(-) diff --git a/src/grammar.rs b/src/grammar.rs index abe0fdc..998acc9 100644 --- a/src/grammar.rs +++ b/src/grammar.rs @@ -5,7 +5,7 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; use text_utils_grammar::{ - Constraint, LR1GrammarConstraint, LR1GrammarParser, LR1NextState, LR1Parse, LR1State, + Constraint, ExactLR1GrammarConstraint, LR1GrammarParser, LR1NextState, LR1Parse, LR1State, RegularExpressionConstraint, RegularExpressionState, }; @@ -114,8 +114,8 @@ impl RegexConstraint { } #[pyclass] -struct LR1Constraint { - inner: Arc, +struct ExactLR1Constraint { + inner: Arc, state: LR1State, indices: Vec, is_match: bool, @@ -123,10 +123,10 @@ struct LR1Constraint { } #[pymethods] -impl LR1Constraint { +impl ExactLR1Constraint { #[new] fn new(grammar: &str, lexer: &str, continuations: Vec>) -> anyhow::Result { - let inner = LR1GrammarConstraint::new(grammar, lexer, continuations) + let inner = ExactLR1GrammarConstraint::new(grammar, lexer, continuations) .map_err(|e| anyhow!("failed to create LR(1) grammar constraint: {}", e))?; let state = inner.get_start_state(); let (indices, next_states) = inner.get_valid_continuations_with_state(&state); @@ -146,7 +146,7 @@ impl LR1Constraint { lexer_path: &str, continuations: Vec>, ) -> anyhow::Result { - let inner = LR1GrammarConstraint::from_files(grammar_path, lexer_path, continuations) + let inner = ExactLR1GrammarConstraint::from_files(grammar_path, lexer_path, continuations) .map_err(|e| { anyhow!( "failed to create LR(1) grammar constraint from files {} and {}: {}", @@ -350,7 +350,7 @@ fn parse_into_py( pub(super) fn add_submodule(py: Python, parent_module: &PyModule) -> PyResult<()> { let m = PyModule::new(py, "grammar")?; m.add_class::()?; - m.add_class::()?; + m.add_class::()?; m.add_class::()?; parent_module.add_submodule(m)?; diff --git a/text-utils-grammar/benches/benchmark.rs b/text-utils-grammar/benches/benchmark.rs index dcfecb5..da531f0 100644 --- a/text-utils-grammar/benches/benchmark.rs +++ b/text-utils-grammar/benches/benchmark.rs @@ -2,8 +2,9 @@ use std::fs::{self, read_to_string}; use std::path::PathBuf; use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use text_utils_grammar::lr1::LR1GrammarConstraint; use text_utils_grammar::{ - utils::optimized_prefix_order, Constraint, LR1GrammarConstraint, LR1GrammarParser, + utils::optimized_prefix_order, Constraint, ExactLR1GrammarConstraint, LR1GrammarParser, RegularExpressionConstraint, }; @@ -117,30 +118,48 @@ fn bench_lr1_constraint(c: &mut Criterion) { .to_str() .unwrap() .to_string(); - let lr1_constraint = LR1GrammarConstraint::from_files(grammar, tokens, conts.clone()).unwrap(); - let state = lr1_constraint.get_start_state(); + let exact_lr_constraint = + ExactLR1GrammarConstraint::from_files(&grammar, &tokens, conts.clone()).unwrap(); + let lr_constraint = LR1GrammarConstraint::from_files(grammar, tokens, conts.clone()).unwrap(); + let state = exact_lr_constraint.get_start_state(); + 
c.bench_function("exact_lr1_json_empty_get_valid_continuations", |b| { + b.iter(|| exact_lr_constraint.get_valid_continuations_with_state(&state)) + }); + let state = lr_constraint.get_start_state(); c.bench_function("lr1_json_empty_get_valid_continuations", |b| { - b.iter(|| lr1_constraint.get_valid_continuations_with_state(&state)) + b.iter(|| lr_constraint.get_valid_continuations_with_state(&state)) }); let input = read_to_string( PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("grammars/json/examples/numbers.json"), ) .unwrap(); - let state = lr1_constraint + let state = exact_lr_constraint + .get_state(&input.as_bytes()[..input.len() / 2]) + .unwrap(); + c.bench_function("exact_lr1_json_numbers_get_valid_continuations", |b| { + b.iter(|| exact_lr_constraint.get_valid_continuations_with_state(&state)) + }); + let state = lr_constraint .get_state(&input.as_bytes()[..input.len() / 2]) .unwrap(); c.bench_function("lr1_json_numbers_get_valid_continuations", |b| { - b.iter(|| lr1_constraint.get_valid_continuations_with_state(&state)) + b.iter(|| lr_constraint.get_valid_continuations_with_state(&state)) }); let input = read_to_string( PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("grammars/json/examples/example1.json"), ) .unwrap(); - let state = lr1_constraint + let state = exact_lr_constraint + .get_state(&input.as_bytes()[..input.len() / 2]) + .unwrap(); + c.bench_function("exact_lr1_json_example1_get_valid_continuations", |b| { + b.iter(|| exact_lr_constraint.get_valid_continuations_with_state(&state)) + }); + let state = lr_constraint .get_state(&input.as_bytes()[..input.len() / 2]) .unwrap(); c.bench_function("lr1_json_example1_get_valid_continuations", |b| { - b.iter(|| lr1_constraint.get_valid_continuations_with_state(&state)) + b.iter(|| lr_constraint.get_valid_continuations_with_state(&state)) }); } diff --git a/text-utils-grammar/grammars/calc/examples/calc.txt b/text-utils-grammar/grammars/calc/examples/calc.txt index 9f745a4..6a452c1 100644 --- a/text-utils-grammar/grammars/calc/examples/calc.txt +++ b/text-utils-grammar/grammars/calc/examples/calc.txt @@ -1 +1 @@ -(1+2)*3 +() diff --git a/text-utils-grammar/src/lib.rs b/text-utils-grammar/src/lib.rs index 97b574d..e82a806 100644 --- a/text-utils-grammar/src/lib.rs +++ b/text-utils-grammar/src/lib.rs @@ -8,7 +8,10 @@ pub mod utils; pub use re::RegularExpressionConstraint; pub use regex_automata::util::primitives::StateID as RegularExpressionState; -pub use lr1::{LR1GrammarConstraint, LR1GrammarParser, LR1NextState, LR1Parse, LR1State}; +pub use lr1::{ + ExactLR1GrammarConstraint, LR1GrammarConstraint, LR1GrammarParser, LR1NextState, LR1Parse, + LR1State, +}; pub trait Constraint { type State; diff --git a/text-utils-grammar/src/lr1.rs b/text-utils-grammar/src/lr1.rs index dd0c777..d5c53bc 100644 --- a/text-utils-grammar/src/lr1.rs +++ b/text-utils-grammar/src/lr1.rs @@ -218,61 +218,113 @@ fn load_grammar_and_pdfas( Ok((grammar, pdfas)) } -type Tokens = Vec>; +type Tokens = Vec>>; type Span = (usize, usize); type Spans = Vec; type Matching = Vec<(usize, StateID)>; -fn prefix_lexer( +enum TokenOrMatching { + Token(usize, Option>), + Matching(Matching), +} + +fn find_token_or_matching( prefix: &[u8], + matching: &Matching, + pdfas: &[(PrefixDFA, Option>)], +) -> Option { + let mut len = 0; + let mut token = None; + let mut found_token = false; + let mut prefix_matches = vec![]; + for &(pidx, state) in matching { + let (pdfa, tidx) = &pdfas[pidx]; + match pdfa.find_prefix_match(state, prefix) { + PrefixMatch::None => continue, + 
PrefixMatch::Maybe(state) => prefix_matches.push((pidx, state)), + PrefixMatch::UpTo(end, _) => { + if !found_token || end > len { + len = end; + token = tidx.as_ref().copied(); + found_token = true; + } + } + }; + } + if !prefix_matches.is_empty() { + if prefix_matches.iter().all(|(pidx, state)| { + let (pdfa, _) = &pdfas[*pidx]; + pdfa.is_final_match_state(*state) + }) { + let (pidx, _) = prefix_matches[0]; + let (_, token) = &pdfas[pidx]; + Some(TokenOrMatching::Token( + prefix.len(), + token.as_ref().copied(), + )) + } else { + Some(TokenOrMatching::Matching(prefix_matches)) + } + } else if found_token { + Some(TokenOrMatching::Token(len, token)) + } else { + None + } +} + +type PrefixLexerOutput = (Tokens, Spans, Matching, Span); + +#[inline] +fn prefix_lexer_with( + continuation: &[u8], pdfas: &[(PrefixDFA, Option>)], -) -> Result<(Tokens, Spans, Matching, Span), Box> { + mut prefix_matches: Matching, +) -> Result> { // returns a list of tokens and a list of indices of pdfas matching // the rest of the prefix, or None if no matching pdfa is found let mut tokens = vec![]; let mut spans = vec![]; - // initially all pdfas are in the matching state, the start state - let mut prefix_matches: Vec<_> = pdfas - .iter() - .enumerate() - .map(|(pidx, (pdfa, _))| (pidx, pdfa.get_start_state())) - .collect(); let mut i = 0; // logic is that longest match wins - while i < prefix.len() { - let mut longest = 0; - let mut matching = None; - prefix_matches.clear(); - for (pidx, (pdfa, tidx)) in pdfas.iter().enumerate() { - match pdfa.find_prefix_match(pdfa.get_start_state(), &prefix[i..]) { - PrefixMatch::None => continue, - PrefixMatch::Maybe(state) => prefix_matches.push((pidx, state)), - PrefixMatch::UpTo(end, _) => { - if end > longest { - longest = end; - matching = tidx.as_ref(); - } - } - }; - } - if !prefix_matches.is_empty() { - // there is at least one pdfa that matches the whole rest of prefix - break; - } else if longest > 0 { - if let Some(&tidx) = matching { - tokens.push(tidx); - spans.push((i, longest)); + while i < continuation.len() { + match find_token_or_matching(&continuation[i..], &prefix_matches, pdfas) { + Some(TokenOrMatching::Token(len, token)) => { + tokens.push(token); + spans.push((i, len)); + i += len; + prefix_matches = initial_prefix_matches(pdfas); + } + Some(TokenOrMatching::Matching(matching)) => { + prefix_matches = matching; + break; + } + None => { + return Err(format!( + "no matching token found from position {i}: '{}'", + String::from_utf8_lossy(&continuation[i..]) + ) + .into()); } - i += longest; - } else { - return Err(format!( - "no matching token found from position {i}: '{}'", - String::from_utf8_lossy(&prefix[i..]) - ) - .into()); } } - Ok((tokens, spans, prefix_matches, (i, prefix.len() - i))) + Ok((tokens, spans, prefix_matches, (i, continuation.len() - i))) +} + +fn initial_prefix_matches(pdfas: &[(PrefixDFA, Option>)]) -> Matching { + pdfas + .iter() + .enumerate() + .map(|(pidx, (pdfa, _))| (pidx, pdfa.get_start_state())) + .collect() +} + +fn prefix_lexer( + prefix: &[u8], + pdfas: &[(PrefixDFA, Option>)], +) -> Result> { + // initially all pdfas are in the potential prefix matches, the start state + let prefix_matches = initial_prefix_matches(pdfas); + prefix_lexer_with(prefix, pdfas, prefix_matches) } fn lexer( @@ -280,17 +332,19 @@ fn lexer( pdfas: &[(PrefixDFA, Option>)], ) -> Result<(Tokens, Spans), Box> { let (mut tokens, mut spans, last_matches, last_span) = prefix_lexer(text.as_bytes(), pdfas)?; - if let Some(&tidx) = 
last_matches.iter().find_map(|&(pidx, state)| { - let (pdfa, Some(tidx)) = &pdfas[pidx] else { - return None; - }; + if let Some(&token) = last_matches.iter().find_map(|&(pidx, state)| { + let (pdfa, token) = &pdfas[pidx]; if pdfa.is_match_state(state) { - Some(tidx) + Some(token) } else { None } }) { - tokens.push(tidx); + assert!( + last_span.1 > 0, + "last span should not be empty in this case" + ); + tokens.push(token); spans.push(last_span); } Ok((tokens, spans)) @@ -377,7 +431,7 @@ impl LR1GrammarParser { let (tokens, _) = lexer(text, &self.pdfas)?; Ok(tokens .into_iter() - .map(|tidx| self.grammar.token_name(tidx).unwrap()) + .filter_map(|tidx| tidx.and_then(|tidx| self.grammar.token_name(tidx))) .collect()) } @@ -390,7 +444,9 @@ impl LR1GrammarParser { tokens .into_iter() .zip(spans) - .map(|(tidx, (start, len))| Ok(DefaultLexeme::new(tidx.as_storaget(), start, len))) + .filter_map(|(tidx, (start, len))| { + tidx.map(|tidx| Ok(DefaultLexeme::new(tidx.as_storaget(), start, len))) + }) .collect(), nlc, ); @@ -452,7 +508,7 @@ impl LR1GrammarParser { } } -pub struct LR1GrammarConstraint { +pub struct ExactLR1GrammarConstraint { pub(crate) grammar: YaccGrammar, table: StateTable, pdfas: Vec<(PrefixDFA, Option>)>, @@ -467,7 +523,143 @@ enum LR1Action { None, } -impl LR1GrammarConstraint { +fn shift_reduce( + grammar: &YaccGrammar, + table: &StateTable, + stack: &[StIdx], + token: TIdx, +) -> LR1Action { + let Some(mut stidx) = stack.last().copied() else { + return LR1Action::None; + }; + // perform actions until the next shift, + // can be implemented without actually + // modifying the stack, because it will only ever + // get smaller by reduces + // stidx will always be the last element of the stack + // (at position stack_end) + let mut stack_end = stack.len() - 1; + loop { + match table.action(stidx, token) { + Action::Shift(next_stidx) => { + stidx = next_stidx; + break; + } + Action::Reduce(pidx) => { + let ridx = grammar.prod_to_rule(pidx); + let rlen = grammar.prod(pidx).len(); + stack_end -= rlen - 1; + let Some(new_stidx) = table.goto(stack[stack_end - 1], ridx) else { + return LR1Action::None; + }; + stidx = new_stidx; + } + Action::Accept => return LR1Action::Accept, + Action::Error => return LR1Action::None, + }; + } + LR1Action::ShiftReduce(stack_end + 1, stidx) +} + +fn matchable_pdfas<'pdfa>( + grammar: &YaccGrammar, + table: &StateTable, + pdfas: &'pdfa [(PrefixDFA, Option>)], + stack: &[StIdx], +) -> Vec<(usize, &'pdfa PrefixDFA)> { + let Some(&last) = stack.last() else { + return vec![]; + }; + let state_actions: Vec<_> = table.state_actions(last).collect(); + pdfas + .iter() + .enumerate() + .filter_map(|(i, (pdfa, tidx))| { + if let Some(tidx) = tidx { + if !state_actions.contains(tidx) + || !matches!( + shift_reduce(grammar, table, stack, *tidx), + LR1Action::ShiftReduce(..) + ) + { + return None; + } + } + Some((i, pdfa)) + }) + .collect() +} + +fn filter_matching( + matching: &mut Matching, + grammar: &YaccGrammar, + table: &StateTable, + pdfas: &[(PrefixDFA, Option>)], + stack: &[StIdx], +) { + matching.retain(|&(pidx, _)| { + let (_, token) = &pdfas[pidx]; + if let Some(token) = token { + if !matches!( + shift_reduce(grammar, table, stack, *token), + LR1Action::ShiftReduce(..) 
+ ) { + return false; + } + } + true + }) +} + +fn drive( + grammar: &YaccGrammar, + table: &StateTable, + mut stack: Vec>, + tokens: &[Option>], +) -> Option>> { + let mut idx = 0; + while idx < tokens.len() { + let stidx = stack.last()?; + let Some(tidx) = tokens[idx] else { + idx += 1; + continue; + }; + match table.action(*stidx, tidx) { + Action::Shift(stidx) => { + stack.push(stidx); + idx += 1; + } + Action::Reduce(pidx) => { + let ridx = grammar.prod_to_rule(pidx); + let keep = stack.len() - grammar.prod(pidx).len(); + stack.truncate(keep); + let stidx = table.goto(*stack.last()?, ridx)?; + stack.push(stidx); + } + Action::Accept => unreachable!("dont drive with eof token"), + Action::Error => return None, + } + } + Some(stack) +} + +fn only_skippable_matching(matching: &Matching, pdfas: &[(PrefixDFA, Option>)]) -> bool { + matching.iter().all(|&(pidx, pdfa_state)| { + let (pdfa, None) = &pdfas[pidx] else { + return false; + }; + pdfa.is_match_state(pdfa_state) + }) +} + +fn is_accept_state(grammar: &YaccGrammar, table: &StateTable, stack: &[StIdx]) -> bool { + matches!( + shift_reduce(grammar, table, stack, grammar.eof_token_idx()), + LR1Action::Accept + ) +} + +impl ExactLR1GrammarConstraint { pub fn new( grammar: &str, tokens: &str, @@ -502,67 +694,8 @@ impl LR1GrammarConstraint { Self::new(&grammar, &tokens, continuations) } - fn shift_reduce(&self, stack: &[StIdx], token: TIdx) -> LR1Action { - let Some(mut stidx) = stack.last().copied() else { - return LR1Action::None; - }; - // perform actions until the next shift, - // can be implemented without actually - // modifying the stack, because it will only ever - // get smaller by reduces - // stidx will always be the last element of the stack - // (at position stack_end) - let mut stack_end = stack.len() - 1; - loop { - match self.table.action(stidx, token) { - Action::Shift(next_stidx) => { - stidx = next_stidx; - break; - } - Action::Reduce(pidx) => { - let ridx = self.grammar.prod_to_rule(pidx); - let rlen = self.grammar.prod(pidx).len(); - stack_end -= rlen - 1; - let Some(new_stidx) = self.table.goto(stack[stack_end - 1], ridx) else { - return LR1Action::None; - }; - stidx = new_stidx; - } - Action::Accept => return LR1Action::Accept, - Action::Error => return LR1Action::None, - }; - } - LR1Action::ShiftReduce(stack_end + 1, stidx) - } - - fn matchable_pdfas(&self, stack: &[StIdx]) -> Vec<(usize, &PrefixDFA)> { - let Some(&last) = stack.last() else { - return vec![]; - }; - let state_actions: Vec<_> = self.table.state_actions(last).collect(); - self.pdfas - .iter() - .enumerate() - .filter_map(|(i, (pdfa, tidx))| { - if let Some(tidx) = tidx { - if !state_actions.contains(tidx) - || !matches!(self.shift_reduce(stack, *tidx), LR1Action::ShiftReduce(..)) - { - return None; - } - } - Some((i, pdfa)) - }) - .collect() - } - pub fn only_skippable_matching(&self, state: &LR1State) -> bool { - state.matching.iter().all(|&(pidx, pdfa_state)| { - let (pdfa, None) = &self.pdfas[pidx] else { - return false; - }; - pdfa.is_match_state(pdfa_state) - }) + only_skippable_matching(&state.matching, &self.pdfas) } } @@ -589,32 +722,61 @@ pub struct LR1NextState { matching: Matching, } -impl Constraint for LR1GrammarConstraint { +impl Constraint for ExactLR1GrammarConstraint { type State = LR1State; type NextState = LR1NextState; + fn get_state(&self, prefix: &[u8]) -> Option { + let (tokens, _, mut matching, _) = prefix_lexer(prefix, &self.pdfas).ok()?; + let stack = drive( + &self.grammar, + &self.table, + vec![self.table.start_state()], + 
&tokens, + )?; + // the matching returned by prefix lexer is not a matching + // that adheres to the grammar, so we need to filter it + // further to only contain pdfas that are allowed to match + // according to the grammar + filter_matching( + &mut matching, + &self.grammar, + &self.table, + &self.pdfas, + &stack, + ); + if matching.is_empty() { + return None; + } + Some(Self::State { stack, matching }) + } + + fn get_start_state(&self) -> Self::State { + self.get_state(b"").expect("should not happen") + } + fn is_match_state(&self, state: &Self::State) -> bool { - matches!( - self.shift_reduce(&state.stack, self.grammar.eof_token_idx()), - LR1Action::Accept - ) || state.matching.iter().any(|&(pidx, pdfa_state)| { - let (pdfa, Some(token)) = &self.pdfas[pidx] else { - return false; - }; - if !pdfa.is_match_state(pdfa_state) { - return false; - } - let LR1Action::ShiftReduce(keep, stidx) = self.shift_reduce(&state.stack, *token) - else { - return false; - }; - let mut stack = state.stack[..keep].to_vec(); - stack.push(stidx); - matches!( - self.shift_reduce(&stack, self.grammar.eof_token_idx()), - LR1Action::Accept - ) - }) + is_accept_state(&self.grammar, &self.table, &state.stack) + || state.matching.iter().any(|&(pidx, pdfa_state)| { + let (pdfa, token) = &self.pdfas[pidx]; + if !pdfa.is_match_state(pdfa_state) { + return false; + } + let Some(token) = token else { + // a skippable token would not change anything here, + // as the check for accept state would already have + // returned true + return false; + }; + let LR1Action::ShiftReduce(keep, stidx) = + shift_reduce(&self.grammar, &self.table, &state.stack, *token) + else { + return false; + }; + let mut stack = state.stack[..keep].to_vec(); + stack.push(stidx); + is_accept_state(&self.grammar, &self.table, &stack) + }) } fn get_valid_continuations_with_state( @@ -646,11 +808,16 @@ impl Constraint for LR1GrammarConstraint { None } }) - .map(|&tidx| (self.shift_reduce(&state.stack, tidx), tidx)) - { + .map(|&tidx| { + ( + shift_reduce(&self.grammar, &self.table, &state.stack, tidx), + tidx, + ) + }) { let mut next_stack = state.stack[..keep].to_vec(); next_stack.push(next_stidx); - let next_matchable_pdfas = self.matchable_pdfas(&next_stack); + let next_matchable_pdfas = + matchable_pdfas(&self.grammar, &self.table, &self.pdfas, &next_stack); let token_name = self.grammar.token_name(tidx).unwrap(); Some(( (keep, next_stidx, token_name.to_string()), @@ -660,8 +827,9 @@ impl Constraint for LR1GrammarConstraint { None }; - let only_skippable_matching = self.only_skippable_matching(state); - let matchable_pdfas = self.matchable_pdfas(&state.stack); + let only_skippable_matching = only_skippable_matching(&state.matching, &self.pdfas); + let matchable_pdfas = + matchable_pdfas(&self.grammar, &self.table, &self.pdfas, &state.stack); // now check all continuations let mut i = 0; @@ -741,36 +909,152 @@ impl Constraint for LR1GrammarConstraint { } conts.into_iter().unzip() } +} + +pub struct LR1GrammarConstraint { + grammar: YaccGrammar, + table: StateTable, + pdfas: Vec<(PrefixDFA, Option>)>, + continuations: Vec>, + permutation: Vec, + skips: Vec, +} + +impl LR1GrammarConstraint { + pub fn new( + grammar: &str, + tokens: &str, + continuations: Vec>, + ) -> Result> { + let (grammar, pdfas) = load_grammar_and_pdfas( + grammar, + YaccKind::Original(YaccOriginalActionKind::NoAction), + tokens, + )?; + let (_, table) = lrtable::from_yacc(&grammar, Minimiser::Pager)?; + let (permutation, skips) = optimized_prefix_order(&continuations); + Ok(Self { 
+ continuations, + grammar, + pdfas, + table, + permutation, + skips, + }) + } + + pub fn from_files( + grammar_path: impl AsRef, + tokens_path: impl AsRef, + continuations: Vec>, + ) -> Result> { + let file = File::open(grammar_path.as_ref())?; + let grammar = read_to_string(file)?; + let file = File::open(tokens_path.as_ref())?; + let tokens = read_to_string(file)?; + Self::new(&grammar, &tokens, continuations) + } +} + +impl Constraint for LR1GrammarConstraint { + type State = LR1State; + type NextState = LR1State; + + fn get_state(&self, prefix: &[u8]) -> Option { + let (tokens, _, mut matching, _) = prefix_lexer(prefix, &self.pdfas).ok()?; + let stack = drive( + &self.grammar, + &self.table, + vec![self.table.start_state()], + &tokens, + )?; + filter_matching( + &mut matching, + &self.grammar, + &self.table, + &self.pdfas, + &stack, + ); + if matching.is_empty() { + return None; + } + Some(Self::State { stack, matching }) + } fn get_start_state(&self) -> Self::State { self.get_state(b"").expect("should not happen") } - fn get_state(&self, prefix: &[u8]) -> Option { - // fix this by parsing prefix into tokens with the lexer - // and then driving the pda with these tokens - let (tokens, _, matching, _) = prefix_lexer(prefix, &self.pdfas).ok()?; - let mut stack = vec![self.table.start_state()]; - let mut idx = 0; - while idx < tokens.len() { - let stidx = stack.last()?; - let tidx = tokens[idx]; - match self.table.action(*stidx, tidx) { - Action::Shift(stidx) => { - stack.push(stidx); - idx += 1; - } - Action::Reduce(pidx) => { - let ridx = self.grammar.prod_to_rule(pidx); - let keep = stack.len() - self.grammar.prod(pidx).len(); - stack.truncate(keep); - let stidx = self.table.goto(*stack.last()?, ridx)?; - stack.push(stidx); + fn is_match_state(&self, state: &Self::State) -> bool { + is_accept_state(&self.grammar, &self.table, &state.stack) + || state.matching.iter().any(|&(pidx, pdfa_state)| { + let (pdfa, token) = &self.pdfas[pidx]; + if !pdfa.is_match_state(pdfa_state) { + return false; } - Action::Accept | Action::Error => return None, + let Some(token) = token else { + return false; + }; + let LR1Action::ShiftReduce(keep, stidx) = + shift_reduce(&self.grammar, &self.table, &state.stack, *token) + else { + return false; + }; + let mut stack = state.stack[..keep].to_vec(); + stack.push(stidx); + is_accept_state(&self.grammar, &self.table, &stack) + }) + } + + fn get_valid_continuations_with_state( + &self, + state: &Self::State, + ) -> (Vec, Vec) { + let mut conts = BTreeMap::new(); + + // now check all continuations + let mut i = 0; + while i < self.permutation.len() { + let skip = self.skips[i]; + let j = self.permutation[i]; + let cont = &self.continuations[j]; + i += 1; + + let Ok((tokens, _, mut next_matching, _)) = + prefix_lexer_with(cont, &self.pdfas, state.matching.clone()) + else { + i += skip; + continue; + }; + + let Some(next_stack) = drive(&self.grammar, &self.table, state.stack.clone(), &tokens) + else { + i += skip; + continue; + }; + + filter_matching( + &mut next_matching, + &self.grammar, + &self.table, + &self.pdfas, + &next_stack, + ); + if next_matching.is_empty() { + i += skip; + continue; } + + conts.insert( + j, + LR1State { + stack: next_stack, + matching: next_matching, + }, + ); } - Some(Self::State { stack, matching }) + + conts.into_iter().unzip() } } @@ -852,38 +1136,128 @@ mod test { assert!(lexer("2 - 1", &pdfas).is_err()); let (tokens, spans) = lexer("(1 + 28)*\n3", &pdfas).unwrap(); assert_eq!( - tokens.into_iter().map(|tidx| 
map[&tidx]).collect_vec(), + tokens + .into_iter() + .filter_map(|tidx| tidx.map(|tidx| map[&tidx])) + .collect_vec(), vec!["LP", "INT", "PLUS", "INT", "RP", "TIMES", "INT"] ); assert_eq!( spans, - vec![(0, 1), (1, 1), (3, 1), (5, 2), (7, 1), (8, 1), (10, 1)] + vec![ + (0, 1), + (1, 1), + (2, 1), + (3, 1), + (4, 1), + (5, 2), + (7, 1), + (8, 1), + (9, 1), + (10, 1) + ] ); let (pdfas, map) = get_ab_pdfas(); let (tokens, spans) = lexer("aabb", &pdfas).unwrap(); assert_eq!( - tokens.into_iter().map(|tidx| map[&tidx]).collect_vec(), + tokens + .into_iter() + .filter_map(|tidx| tidx.map(|tidx| map[&tidx])) + .collect_vec(), vec!["AA", "BB"] ); assert_eq!(spans, vec![(0, 2), (2, 2)]); let (tokens, spans) = lexer("abb", &pdfas).unwrap(); assert_eq!( - tokens.into_iter().map(|tidx| map[&tidx]).collect_vec(), + tokens + .into_iter() + .filter_map(|tidx| tidx.map(|tidx| map[&tidx])) + .collect_vec(), vec!["AB1", "B"] ); assert_eq!(spans, vec![(0, 2), (2, 1)]); assert!(lexer("abac", &pdfas).is_err()); } + fn combine_prefix_lexer_outputs( + output1: PrefixLexerOutput, + output2: PrefixLexerOutput, + ) -> PrefixLexerOutput { + let (mut combined_lexemes, mut combined_spans, _, mut last_span) = output1; + let (lexemes2, spans2, matching, last_span2) = output2; + combined_lexemes.extend(lexemes2); + if let Some(first2) = spans2.first() { + combined_spans.push((last_span.0, last_span.1 + first2.1)); + combined_spans.extend( + spans2 + .into_iter() + .skip(1) + .map(|(start, len)| (last_span.0 + last_span.1 + start, len)), + ); + last_span = (last_span.0 + last_span.1 + last_span2.0, last_span2.1); + } else { + assert!(last_span2.0 == 0); + last_span = (last_span.0, last_span.1 + last_span2.1); + } + (combined_lexemes, combined_spans, matching, last_span) + } + + #[test] + fn test_prefix_lexer_with() { + let (pdfas, _) = get_calc_pfdas(); + + let texts = [ + "(1 + 28)*\n3".as_bytes(), + b" 10 + 5", + b" ", + b"(((3 + 4)) * 6)", + ]; + + for text in texts { + let (lexemes, spans, matching, last_span) = prefix_lexer(text, &pdfas).unwrap(); + + for i in 0..=text.len() { + let output1 = prefix_lexer(&text[..i], &pdfas).unwrap(); + let output2 = prefix_lexer_with(&text[i..], &pdfas, output1.2.clone()).unwrap(); + let (combined_lexemes, combined_spans, combined_matching, combined_last_span) = + combine_prefix_lexer_outputs(output1, output2); + println!("text: {:?}", String::from_utf8_lossy(text)); + println!("text1: {:?}", String::from_utf8_lossy(&text[..i])); + println!("text2: {:?}", String::from_utf8_lossy(&text[i..])); + assert_eq!(lexemes, combined_lexemes); + assert_eq!(matching, combined_matching); + assert_eq!(spans, combined_spans); + assert_eq!(last_span, combined_last_span); + } + } + } + #[test] fn test_prefix_lexer() { let (pdfas, map) = get_calc_pfdas(); let (lexemes, spans, matching, last_span) = prefix_lexer(b"(1 + 28)*\n3", &pdfas).unwrap(); assert_eq!( - lexemes.into_iter().map(|tidx| map[&tidx]).collect_vec(), + lexemes + .iter() + .cloned() + .filter_map(|tidx| tidx.map(|tidx| map[&tidx])) + .collect_vec(), vec!["LP", "INT", "PLUS", "INT", "RP", "TIMES"] ); - assert_eq!(spans, vec![(0, 1), (1, 1), (3, 1), (5, 2), (7, 1), (8, 1)]); + assert_eq!( + spans, + vec![ + (0, 1), + (1, 1), + (2, 1), + (3, 1), + (4, 1), + (5, 2), + (7, 1), + (8, 1), + (9, 1) + ] + ); assert_eq!(matching.len(), 1); assert_eq!(last_span, (10, 1)); let (idx, state) = matching[0]; @@ -906,34 +1280,30 @@ mod test { assert_eq!(last_span, (0, 0)); let (lexemes, spans, matching, last_span) = prefix_lexer(b" (", 
&pdfas).unwrap(); - assert!(lexemes.is_empty()); - assert!(spans.is_empty()); - assert_eq!(matching.len(), 1); - let (idx, state) = matching[0]; - assert_eq!(idx, 0); - let (pdfa, tidx) = &pdfas[idx]; - assert_eq!(map[tidx.as_ref().unwrap()], "LP"); - assert!(pdfa.is_match_state(state)); - assert_eq!(last_span, (4, 1)); + assert_eq!(lexemes.into_iter().filter(|tidx| tidx.is_some()).count(), 1); + assert_eq!(spans.len(), 2); + assert_eq!(matching.len(), 7); + assert_eq!(last_span, (5, 0)); let (pdfas, map) = get_ab_pdfas(); let (lexemes, spans, matching, last_span) = prefix_lexer(b"aabb", &pdfas).unwrap(); assert_eq!( - lexemes.into_iter().map(|tidx| map[&tidx]).collect_vec(), - vec!["AA"] + lexemes + .into_iter() + .filter_map(|tidx| tidx.map(|tidx| map[&tidx])) + .collect_vec(), + vec!["AA", "BB"] ); - assert_eq!(spans, vec![(0, 2)]); - assert_eq!(matching.len(), 1); - let (idx, state) = matching[0]; - assert_eq!(idx, 4); - let (pdfa, tidx) = &pdfas[idx]; - assert_eq!(map[tidx.as_ref().unwrap()], "BB"); - assert!(pdfa.is_match_state(state)); - assert_eq!(last_span, (2, 2)); + assert_eq!(spans, vec![(0, 2), (2, 2)]); + assert_eq!(matching.len(), 8); + assert_eq!(last_span, (4, 0)); let (lexemes, spans, matching, last_span) = prefix_lexer(b"aab", &pdfas).unwrap(); assert_eq!( - lexemes.into_iter().map(|tidx| map[&tidx]).collect_vec(), + lexemes + .into_iter() + .filter_map(|tidx| tidx.map(|tidx| map[&tidx])) + .collect_vec(), vec!["AA"] ); assert_eq!(spans, vec![(0, 2)]); @@ -975,22 +1345,20 @@ mod test { println!("{}", parse.pretty(text, true)); } - #[test] - fn test_lrk_constraint() { - let conts = load_continuations(); + fn drive_with_tokens( + grammar: &YaccGrammar, + table: &StateTable, + tokens: &[Option>], + ) -> bool { + drive(grammar, table, vec![table.start_state()], tokens).is_some() + } - let (grammar, lexer, _) = load_lrk_grammar("json"); - let lrk = LR1GrammarConstraint::from_files(grammar, lexer, conts.clone()).unwrap(); - assert!(lrk.get_state(b"\"id\": \"1\"").is_none()); - assert!(lrk.get_state(b"\"id\"").is_some()); - let state = lrk.get_state(b"{\"id\": \"1\"").unwrap(); - assert!(!lrk.is_match_state(&state)); - let state = lrk.get_state(b"{\"id\": \"1\"}}").unwrap(); - let (cont_indices, _) = lrk.get_valid_continuations_with_state(&state); - assert!(!lrk.is_match_state(&state)); - assert!(cont_indices.is_empty()); - let state = lrk.get_state(b"{\"id\": \"1\"}").unwrap(); - assert!(lrk.is_match_state(&state)); + fn check_continuations( + lrk: &LR1GrammarConstraint, + prefix: &[u8], + continuations: &[Vec], + ) -> LR1State { + let state = lrk.get_state(prefix).unwrap(); let (cont_indices, _) = lrk.get_valid_continuations_with_state(&state); println!( "matching {}, {} conts: {:#?}", @@ -998,13 +1366,37 @@ mod test { cont_indices.len(), cont_indices .iter() - .map(|i| String::from_utf8_lossy(&conts[*i])) + .map(|i| String::from_utf8_lossy(&continuations[*i])) .collect_vec() ); - return; + for i in cont_indices { + let full: Vec<_> = prefix + .iter() + .copied() + .chain(continuations[i].clone()) + .collect(); + let (tokens, ..) 
= prefix_lexer(&full, &lrk.pdfas).unwrap(); + assert!(drive_with_tokens(&lrk.grammar, &lrk.table, &tokens)); + } + state + } - let (grammar, lexer, _) = load_lrk_grammar("calc"); + #[test] + fn test_lrk_constraint() { + let conts = load_continuations(); + + let (grammar, lexer, _) = load_lrk_grammar("json"); let lrk = LR1GrammarConstraint::from_files(grammar, lexer, conts.clone()).unwrap(); + assert!(lrk.get_state(b"\"id\": \"1\"").is_none()); + assert!(lrk.get_state(b"{\"id\": \"1\"}}").is_none()); + assert!(lrk.get_state(b"\"id\"").is_some()); + let state = check_continuations(&lrk, b"{\"id\": \"1\"", &conts); + assert!(!lrk.is_match_state(&state)); + let state = check_continuations(&lrk, b"{\"id\": \"1\"}", &conts); + assert!(lrk.is_match_state(&state)); + + let (grammar, lexer, _) = load_lrk_grammar("calc"); + let lrk = ExactLR1GrammarConstraint::from_files(grammar, lexer, conts.clone()).unwrap(); let state = lrk.get_start_state(); let (cont_indices, _) = lrk.get_valid_continuations_with_state(&state); println!( diff --git a/text-utils-grammar/src/utils.rs b/text-utils-grammar/src/utils.rs index dbc2981..739c332 100644 --- a/text-utils-grammar/src/utils.rs +++ b/text-utils-grammar/src/utils.rs @@ -67,6 +67,10 @@ impl PrefixDFA { self.dfa.is_match_state(self.dfa.next_eoi_state(state)) } + pub(crate) fn is_final_match_state(&self, state: StateID) -> bool { + self.is_match_state(state) && (0..=255).all(|b| self.drive(state, &[b]).is_none()) + } + #[inline] pub(crate) fn drive(&self, mut state: StateID, continuation: &[u8]) -> Option { for &b in continuation { @@ -80,7 +84,11 @@ impl PrefixDFA { #[inline] pub(crate) fn find_prefix_match(&self, mut state: StateID, prefix: &[u8]) -> PrefixMatch { - let mut last_match = None; + let mut last_match = if self.is_match_state(state) { + Some((0, state)) + } else { + None + }; for (i, &b) in prefix.iter().enumerate() { state = self.dfa.next_state(state, b); if self.is_match_state(state) { @@ -134,9 +142,9 @@ mod test { #[test] fn test_make_anchored() { - assert_eq!(make_anchored("a"), "^(a)$"); - assert_eq!(make_anchored("^a"), "^(a)$"); - assert_eq!(make_anchored("a$"), "^(a)$"); + assert_eq!(make_anchored("a"), "^(?:a)$"); + assert_eq!(make_anchored("^a"), "^(?:a)$"); + assert_eq!(make_anchored("a$"), "^(?:a)$"); assert_eq!(make_anchored("^a$"), "^a$"); } }
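
Usage sketch (illustrative, not part of the patch): a minimal example of the two constraint variants this commit splits apart. The grammar/lexer file names ("json.y", "json.l") and the toy continuation set are placeholder assumptions, not files from this repository; the constructors and `Constraint` trait calls follow the signatures added in the diff above.

    use text_utils_grammar::{Constraint, ExactLR1GrammarConstraint, LR1GrammarConstraint};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Toy continuation set (e.g. a tokenizer vocabulary); placeholder values.
        let conts: Vec<Vec<u8>> = ["{", "}", "\"a\"", ":", "1", " "]
            .iter()
            .map(|s| s.as_bytes().to_vec())
            .collect();

        // Exact variant: NextState is the compact LR1NextState, and valid
        // continuations advance the parser precisely (paths are placeholders).
        let exact = ExactLR1GrammarConstraint::from_files("json.y", "json.l", conts.clone())?;
        let state = exact.get_start_state();
        let (valid, _next) = exact.get_valid_continuations_with_state(&state);
        println!("exact: {} valid continuations at start", valid.len());

        // Regular variant: NextState is a full LR1State, computed by re-lexing
        // each continuation with prefix_lexer_with and driving the LR(1) table.
        let regular = LR1GrammarConstraint::from_files("json.y", "json.l", conts)?;
        let state = regular
            .get_state(b"{\"a\": ")
            .expect("prefix should lex under the grammar");
        let (valid, next) = regular.get_valid_continuations_with_state(&state);
        assert_eq!(valid.len(), next.len());
        assert!(!regular.is_match_state(&state)); // the JSON object is still open
        Ok(())
    }

Both types expose the same `Constraint` interface, so callers (as in src/grammar.rs and the benchmarks above) can swap one for the other; the exact variant pays more per `get_valid_continuations_with_state` call in exchange for precise next states.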