diff --git a/CHANGELOG.md b/CHANGELOG.md index e5b1c7c..4229a06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,10 @@ } ``` +- A new syntax added for right contexts. A right context is basically + lookahead, but can only be used in rules and cannot be nested inside regexes. + See README for details. (#29) + # 2021/11/30: 0.8.1 New version published to fix broken README pages for lexgen and lexgen_util in diff --git a/README.md b/README.md index d569bbc..d0b13a9 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,20 @@ You can use parenthesis for grouping, e.g. `('a' | 'b')*`. Example: `'a' 'b' | 'c'+` is the same as `(('a' 'b') | ('c'+))`. +## Right context (lookahead) + +A rule in a rule set can be followed by another regex using `> ` syntax, +for right context. Right context is basically a limited form of lookahead: they +can only appear after a top-level regex for a rule. They cannot be used nested +in a regex. + +For example, the rule left-hand side `'a' > (_ # 'b')` matches `'a'` as long as +it's not followed by `'b'`. + +See also [right context tests] for more examples. + +[right context tests]: https://github.com/osa1/lexgen/blob/main/tests/right_ctx.rs + ## Built-in regular expressions lexgen comes with a set of built-in regular expressions. 
Regular diff --git a/crates/lexgen/src/ast.rs b/crates/lexgen/src/ast.rs index 864f991..6a17f1c 100644 --- a/crates/lexgen/src/ast.rs +++ b/crates/lexgen/src/ast.rs @@ -22,7 +22,7 @@ pub struct Lexer { pub enum Rule { /// `let = ;` - Binding { var: Var, re: Regex }, + Binding { var: Var, re: RegexCtx }, /// `type Error = UserError;` ErrorType { @@ -41,10 +41,17 @@ pub enum Rule { } pub struct SingleRule { - pub lhs: Regex, + pub lhs: RegexCtx, pub rhs: SemanticActionIdx, } +/// Regular expression with optional right context (lookahead) +#[derive(Debug, Clone)] +pub struct RegexCtx { + pub re: Regex, + pub right_ctx: Option, +} + #[derive(Debug, Clone)] pub enum RuleRhs { None, @@ -135,13 +142,30 @@ pub enum CharOrRange { Range(char, char), } -/// Parses a regex terminated with: `=>` (used in rules with RHSs), `,` (used in rules without -/// RHSs), or `;` (used in let bindings) +/// Parses a regex with optional right context: `re_ctx -> re [> re]` +fn parse_regex_ctx(input: ParseStream) -> syn::Result { + let re = parse_regex(input)?; + if input.peek(syn::token::Gt) { + input.parse::()?; + let right_ctx = parse_regex(input)?; + Ok(RegexCtx { + re, + right_ctx: Some(right_ctx), + }) + } else { + Ok(RegexCtx { + re, + right_ctx: None, + }) + } +} + +/// Parses a regex fn parse_regex(input: ParseStream) -> syn::Result { parse_regex_0(input) } -// re_0 -> re_1 | re_1 `|` re_1 (alternation) +// re_0 -> re_1 | re_0 `|` re_1 (alternation) fn parse_regex_0(input: ParseStream) -> syn::Result { let mut re = parse_regex_1(input)?; @@ -154,7 +178,7 @@ fn parse_regex_0(input: ParseStream) -> syn::Result { Ok(re) } -// re_1 -> re_2 | re_2 re_2 +// re_1 -> re_2 | re_1 re_2 (concatenation) fn parse_regex_1(input: ParseStream) -> syn::Result { let mut re = parse_regex_2(input)?; @@ -213,7 +237,7 @@ fn parse_regex_4(input: ParseStream) -> syn::Result { if input.peek(syn::token::Paren) { let parenthesized; syn::parenthesized!(parenthesized in input); - parse_regex(&parenthesized) + 
parse_regex(&parenthesized) // no right ctx } else if input.peek(syn::token::Dollar) { let _ = input.parse::()?; if input.parse::().is_ok() { @@ -269,7 +293,7 @@ fn parse_single_rule( input: ParseStream, semantic_action_table: &mut SemanticActionTable, ) -> syn::Result { - let lhs = parse_regex(input)?; + let lhs = parse_regex_ctx(input)?; let rhs = if input.parse::().is_ok() { RuleRhs::None @@ -308,7 +332,7 @@ fn parse_rule( input.parse::()?; let var = input.parse::()?; input.parse::()?; - let re = parse_regex(input)?; + let re = parse_regex_ctx(input)?; input.parse::()?; Ok(Rule::Binding { var: Var(var.to_string()), diff --git a/crates/lexgen/src/dfa.rs b/crates/lexgen/src/dfa.rs index 91001f9..48603e7 100644 --- a/crates/lexgen/src/dfa.rs +++ b/crates/lexgen/src/dfa.rs @@ -5,6 +5,7 @@ pub mod simplify; pub mod simulate; use crate::collections::{Map, Set}; +use crate::nfa::AcceptingState; use crate::range_map::{Range, RangeMap}; use std::convert::TryFrom; @@ -38,7 +39,7 @@ pub struct State { range_transitions: RangeMap, any_transition: Option, end_of_input_transition: Option, - accepting: Option, + accepting: Vec>, // Predecessors of the state, used to inline code for a state with one predecessor in the // predecessor's code predecessors: Set, @@ -52,7 +53,7 @@ impl State { range_transitions: Default::default(), any_transition: None, end_of_input_transition: None, - accepting: None, + accepting: vec![], predecessors: Default::default(), } } @@ -81,12 +82,8 @@ impl DFA { StateIdx(0) } - pub fn make_state_accepting(&mut self, state: StateIdx, value: A) { - // Give first rule priority - let accepting = &mut self.states[state.0].accepting; - if accepting.is_none() { - *accepting = Some(value); - } + pub fn make_state_accepting(&mut self, state: StateIdx, accept: AcceptingState) { + self.states[state.0].accepting.push(accept); } pub fn new_state(&mut self) -> StateIdx { @@ -95,6 +92,11 @@ impl DFA { new_state_idx } + #[cfg(test)] + pub fn is_accepting_state(&self, 
state: StateIdx) -> bool { + !self.states[state.0].accepting.is_empty() + } + pub fn add_char_transition(&mut self, state: StateIdx, char: char, next: StateIdx) { let old = self.states[state.0].char_transitions.insert(char, next); assert!( @@ -238,7 +240,7 @@ impl Display for DFA { predecessors: _, } = state; - if accepting.is_some() { + if !accepting.is_empty() { if *initial { write!(f, "{:>5}:", format!("i*{}", state_idx))?; } else { diff --git a/crates/lexgen/src/dfa/codegen.rs b/crates/lexgen/src/dfa/codegen.rs index d34fd7e..ff7d4c1 100644 --- a/crates/lexgen/src/dfa/codegen.rs +++ b/crates/lexgen/src/dfa/codegen.rs @@ -7,8 +7,10 @@ use super::simplify::Trans; use super::{State, StateIdx, DFA}; use crate::ast::{RuleKind, RuleRhs}; -use crate::collections::Map; -use crate::range_map::RangeMap; +use crate::collections::{Map, Set}; +use crate::nfa::AcceptingState; +use crate::range_map::{Range, RangeMap}; +use crate::right_ctx::{RightCtxDFAs, RightCtxIdx}; use crate::semantic_action_table::{SemanticActionIdx, SemanticActionTable}; use std::convert::TryFrom; @@ -30,7 +32,8 @@ use quote::{quote, ToTokens}; const MAX_GUARD_SIZE: usize = 9; pub fn reify( - dfa: DFA, + dfa: DFA, SemanticActionIdx>, + right_ctx_dfas: &RightCtxDFAs, semantic_actions: SemanticActionTable, user_state_type: Option, user_error_type: Option, @@ -66,6 +69,26 @@ pub fn reify( let switch_method = generate_switch(&ctx, &rule_name_enum_name); + let token_type = ctx.token_type(); + + let error_type = match ctx.user_error_type() { + None => quote!(::std::convert::Infallible), + Some(error_type) => error_type.into_token_stream(), + }; + + let semantic_action_fn_ret_ty = match ctx.user_error_type() { + None => { + quote!(::lexgen_util::SemanticActionResult>) + } + Some(user_error_type) => { + quote!(::lexgen_util::SemanticActionResult>) + } + }; + + let semantic_action_fns = generate_semantic_action_fns(&ctx, &semantic_action_fn_ret_ty); + + let right_ctx_fns = generate_right_ctx_fns(&mut ctx, 
right_ctx_dfas); + let search_tables = ctx.take_search_tables(); let binary_search_fn = if search_tables.is_empty() { @@ -106,24 +129,8 @@ pub fn reify( }) .collect(); - let lexer_name = ctx.lexer_name(); let token_type = ctx.token_type(); - - let error_type = match ctx.user_error_type() { - None => quote!(::std::convert::Infallible), - Some(error_type) => error_type.into_token_stream(), - }; - - let semantic_action_fn_ret_ty = match ctx.user_error_type() { - None => { - quote!(::lexgen_util::SemanticActionResult>) - } - Some(user_error_type) => { - quote!(::lexgen_util::SemanticActionResult>) - } - }; - - let semantic_action_fns = generate_semantic_action_fns(&ctx, &semantic_action_fn_ret_ty); + let lexer_name = ctx.lexer_name(); quote!( // An enum for the rule sets in the DFA. `Init` is the initial, unnamed rule set. @@ -183,6 +190,7 @@ pub fn reify( #(#search_tables)* #binary_search_fn #semantic_action_fns + #(#right_ctx_fns)* impl<'input> Iterator for #lexer_name<'input> { type Item = Result<(::lexgen_util::Loc, #token_type, ::lexgen_util::Loc), ::lexgen_util::LexerError<#error_type>>; @@ -227,7 +235,10 @@ fn generate_switch(ctx: &CgCtx, enum_name: &syn::Ident) -> TokenStream { } /// Generate arms of `match self.__state { ... }` of a DFA. 
-fn generate_state_arms(ctx: &mut CgCtx, dfa: DFA) -> Vec { +fn generate_state_arms( + ctx: &mut CgCtx, + dfa: DFA, SemanticActionIdx>, +) -> Vec { let DFA { states } = dfa; let mut match_arms: Vec = vec![]; @@ -260,8 +271,8 @@ fn generate_state_arms(ctx: &mut CgCtx, dfa: DFA) -> V fn generate_state_arm( ctx: &mut CgCtx, state_idx: usize, - state: &State, - states: &[State], + state: &State, SemanticActionIdx>, + states: &[State, SemanticActionIdx>], ) -> TokenStream { let State { initial, @@ -285,7 +296,7 @@ fn generate_state_arm( // fail (backtrack or raise error) let default_action = any_transition .as_ref() - .map(|any_transition| generate_any_transition(ctx, states, any_transition)) + .map(|any_transition| generate_any_transition(ctx, states, any_transition, fail())) .unwrap_or_else(fail); let state_char_arms = generate_state_char_arms( @@ -306,7 +317,9 @@ fn generate_state_arm( let end_of_input_action = match end_of_input_transition { Some(end_of_input_transition) => match end_of_input_transition { - Trans::Accept(action) => generate_rhs_code(ctx, *action), + Trans::Accept(accepting_states) => { + test_right_ctxs(ctx, accepting_states, end_of_input_default_action) + } Trans::Trans(next_state) => { let StateIdx(next_state) = ctx.renumber_state(*next_state); quote!(self.0.__state = #next_state;) @@ -338,12 +351,40 @@ fn generate_state_arm( } } ) - } else if let Some(rhs) = accepting { + } else if !accepting.is_empty() { // Accepting state - let semantic_fn = ctx.semantic_action_fn_ident(*rhs); + let mut rhss: Vec<(TokenStream, TokenStream)> = Vec::with_capacity(accepting.len()); + + for AcceptingState { value, right_ctx } in accepting.iter() { + match right_ctx { + Some(right_ctx) => { + let right_ctx_fn = right_ctx_fn_name(ctx.lexer_name(), right_ctx); + let semantic_fn = ctx.semantic_action_fn_ident(*value); + rhss.push(( + quote!(#right_ctx_fn(self.0.__iter.clone())), + quote!(self.0.set_accepting_state(#semantic_fn)), + )); + } + None => { + let semantic_fn 
= ctx.semantic_action_fn_ident(*value); + rhss.push(( + quote!(true), + quote!(self.0.set_accepting_state(#semantic_fn)), + )); + break; + } + } + } + + let (last_cond, last_rhs) = rhss.pop().unwrap(); + let mut set_accepting_state = quote!(if #last_cond { #last_rhs }); + + for (cond, rhs) in rhss.into_iter().rev() { + set_accepting_state = quote!(if #cond { #rhs } else { #set_accepting_state }); + } quote!( - self.0.set_accepting_state(#semantic_fn); + #set_accepting_state match self.0.next() { None => { @@ -373,8 +414,9 @@ fn generate_state_arm( fn generate_any_transition( ctx: &mut CgCtx, - states: &[State], - trans: &Trans, + states: &[State, SemanticActionIdx>], + trans: &Trans, + fail: TokenStream, ) -> TokenStream { let action = match trans { Trans::Trans(StateIdx(next_state)) => { @@ -386,7 +428,7 @@ fn generate_any_transition( } } - Trans::Accept(action) => generate_rhs_code(ctx, *action), + Trans::Accept(accepting_states) => test_right_ctxs(ctx, accepting_states, fail), }; quote!( @@ -397,22 +439,22 @@ fn generate_any_transition( /// Generate arms for `match char { ... }` fn generate_state_char_arms( ctx: &mut CgCtx, - states: &[State], - char_transitions: &Map, - range_transitions: &RangeMap, + states: &[State, SemanticActionIdx>], + char_transitions: &Map>, + range_transitions: &RangeMap>, // RHS of the default alternative for this `match` (_ => ) default_rhs: &TokenStream, ) -> Vec { // Arms of the `match` for the current character let mut state_char_arms: Vec = vec![]; - // Add char transitions. 
Collect characters for next states, to be able to use or - // patterns in arms and reduce code size + // Collect characters for next states, to be able to use or patterns in arms and reduce code + // size let mut state_chars: Map> = Default::default(); for (char, next) in char_transitions { match next { - Trans::Accept(action) => { - let action_code = generate_rhs_code(ctx, *action); + Trans::Accept(accepting) => { + let action_code = test_right_ctxs(ctx, accepting, default_rhs.clone()); state_char_arms.push(quote!( #char => { #action_code @@ -423,6 +465,7 @@ fn generate_state_char_arms( } } + // Add char transitions for (StateIdx(next_state), chars) in state_chars.iter() { let pat = quote!(#(#chars)|*); @@ -442,18 +485,21 @@ fn generate_state_char_arms( )); } - // Add range transitions. Same as above, use chain of "or"s for ranges with same transition. + // Same as above for range transitions. Use chain of "or"s for ranges with same transition. let mut state_ranges: Map> = Default::default(); + for range in range_transitions.iter() { - match range.value { - Trans::Trans(state_idx) => state_ranges.entry(state_idx).or_default().push(( + match &range.value { + Trans::Trans(state_idx) => state_ranges.entry(*state_idx).or_default().push(( char::try_from(range.start).unwrap(), char::try_from(range.end).unwrap(), )), - Trans::Accept(action) => { - let action_code = generate_rhs_code(ctx, action); + Trans::Accept(accepting) => { + let action_code = test_right_ctxs(ctx, accepting, default_rhs.clone()); + let range_start = char::from_u32(range.start).unwrap(); let range_end = char::from_u32(range.end).unwrap(); + state_char_arms.push(quote!( x if x >= #range_start && x <= #range_end => { #action_code @@ -463,6 +509,7 @@ fn generate_state_char_arms( } } + // Add range transitions for (StateIdx(next_state), ranges) in state_ranges.into_iter() { let guard = if ranges.len() > MAX_GUARD_SIZE { let binary_search_table_id = ctx.add_search_table(ranges); @@ -578,3 +625,239 @@ fn 
generate_semantic_action_fns( quote!(#(#fns)*) } + +fn right_ctx_fn_name(lexer_name: &syn::Ident, idx: &RightCtxIdx) -> syn::Ident { + syn::Ident::new( + &format!("{}_RIGHT_CTX_{}", lexer_name, idx.as_usize()), + Span::call_site(), + ) +} + +fn generate_right_ctx_fns( + ctx: &mut CgCtx, + right_ctx_dfas: &RightCtxDFAs, +) -> Vec { + let mut fns = vec![]; + + let lexer_name = ctx.lexer_name().clone(); + + for (idx, dfa) in right_ctx_dfas.iter() { + let fn_name = right_ctx_fn_name(&lexer_name, &idx); + + let match_arms = generate_right_ctx_state_arms(ctx, dfa); + + fns.push( + quote!(fn #fn_name(mut input: std::iter::Peekable) -> bool { + let mut state: usize = 0; + + loop { + match state { + #(#match_arms)* + } + } + }), + ); + } + + fns +} + +fn generate_right_ctx_state_arms(ctx: &mut CgCtx, dfa: &DFA) -> Vec { + let DFA { states } = dfa; + + let mut match_arms: Vec = vec![]; + + let n_states = states.len(); + + for (state_idx, state) in states.iter().enumerate() { + let state_code: TokenStream = generate_right_ctx_state_arm(ctx, state, states); + + let state_idx_pat = if state_idx == n_states - 1 { + quote!(_) + } else { + quote!(#state_idx) + }; + + match_arms.push(quote!( + #state_idx_pat => { #state_code } + )); + } + + match_arms +} + +fn generate_right_ctx_state_arm( + ctx: &mut CgCtx, + state: &State, + states: &[State], +) -> TokenStream { + let State { + initial: _, + char_transitions, + range_transitions, + any_transition, + end_of_input_transition, + accepting, + predecessors: _, + } = state; + + let state_char_arms = + generate_right_ctx_state_char_arms(ctx, states, char_transitions, range_transitions); + + // Make sure right contexts don't have right contexts. We don't allow this in the syntax + // currently. 
+ for accepting_state in accepting { + assert_eq!(accepting_state.right_ctx, None); + } + + if !accepting.is_empty() { + return quote!(return true); + } + + let eof = match end_of_input_transition { + Some(StateIdx(eof_next)) => quote!(state = #eof_next), + None => quote!(return false), + }; + + let def = match any_transition { + Some(StateIdx(any_next)) => quote!(state = #any_next), + None => quote!(return false), + }; + + quote!( + match input.next() { + None => #eof, + Some(char) => { + match char { + #(#state_char_arms,)* + _ => #def, + } + } + } + ) +} + +// NB. Does not add default case +fn generate_right_ctx_state_char_arms( + ctx: &mut CgCtx, + states: &[State], + char_transitions: &Map, + range_transitions: &RangeMap, +) -> Vec { + // Arms of the `match` for the current character + let mut state_char_arms: Vec = vec![]; + + // Collect characters for next states, to be able to use or patterns in arms and reduce code + // size + let mut state_chars: Map> = Default::default(); + + // Set of chars that transition to an accepting state + let mut accept_chars: Set = Default::default(); + + for (char, next) in char_transitions { + if states[next.0].accepting.is_empty() { + state_chars.entry(*next).or_default().push(*char); + } else { + accept_chars.insert(*char); + } + } + + // Add char transitions + for (StateIdx(next_state), chars) in state_chars.iter() { + let pat = quote!(#(#chars)|*); + state_char_arms.push(quote!(#pat => self.state = #next_state)); + } + + if !accept_chars.is_empty() { + let accept_chars: Vec = accept_chars.into_iter().collect(); + state_char_arms.push(quote!(#(#accept_chars)|* => return true)); + } + + // Same as above for range transitions. Use chain of "or"s for ranges with same transition. 
+ let mut state_ranges: Map> = Default::default(); + let mut accept_ranges: Set<(char, char)> = Default::default(); + + for Range { + start, + end, + value: next, + } in range_transitions.iter() + { + let start = char::try_from(*start).unwrap(); + let end = char::try_from(*end).unwrap(); + + if states[next.0].accepting.is_empty() { + state_ranges.entry(*next).or_default().push((start, end)); + } else { + accept_ranges.insert((start, end)); + } + } + + // Add range transitions + for (StateIdx(next_state), ranges) in state_ranges.into_iter() { + let guard = if ranges.len() > MAX_GUARD_SIZE { + let binary_search_table_id = ctx.add_search_table(ranges); + + quote!(binary_search(x, &#binary_search_table_id)) + } else { + let range_checks: Vec = ranges + .into_iter() + .map(|(range_begin, range_end)| quote!((x >= #range_begin && x <= #range_end))) + .collect(); + + quote!(#(#range_checks)||*) + }; + + state_char_arms.push(quote!(x if #guard => state = #next_state)); + } + + if !accept_ranges.is_empty() { + let guard = if accept_ranges.len() > MAX_GUARD_SIZE { + let binary_search_table_id = ctx.add_search_table(accept_ranges.into_iter().collect()); + + quote!(binary_search(x, &#binary_search_table_id)) + } else { + let range_checks: Vec = accept_ranges + .into_iter() + .map(|(range_begin, range_end)| quote!((x >= #range_begin && x <= #range_end))) + .collect(); + + quote!(#(#range_checks)||*) + }; + + state_char_arms.push(quote!(x if #guard => return true)); + } + + state_char_arms +} + +fn test_right_ctxs( + ctx: &mut CgCtx, + accepting_states: &[AcceptingState], + default_rhs: TokenStream, +) -> TokenStream { + let mut alts: Vec<(TokenStream, TokenStream)> = Vec::with_capacity(accepting_states.len()); + let mut default = default_rhs; + + for AcceptingState { value, right_ctx } in accepting_states { + let action_code = generate_rhs_code(ctx, *value); + match right_ctx { + Some(right_ctx) => { + let right_ctx_fn = right_ctx_fn_name(ctx.lexer_name(), right_ctx); + 
alts.push((quote!(#right_ctx_fn(self.0.__iter.clone())), action_code)); + } + None => { + default = action_code; + break; + } + } + } + + let mut action_code = default; + + for (cond, rhs) in alts.into_iter().rev() { + action_code = quote!(if #cond { #rhs } else { #action_code }); + } + + action_code +} diff --git a/crates/lexgen/src/dfa/codegen/ctx.rs b/crates/lexgen/src/dfa/codegen/ctx.rs index 705720e..ce6f8f3 100644 --- a/crates/lexgen/src/dfa/codegen/ctx.rs +++ b/crates/lexgen/src/dfa/codegen/ctx.rs @@ -44,7 +44,7 @@ struct CgState { impl CgCtx { pub fn new( - dfa: &DFA, + dfa: &DFA, SemanticActionIdx>, semantic_action_table: SemanticActionTable, lexer_name: syn::Ident, token_type: syn::Type, diff --git a/crates/lexgen/src/dfa/simplify.rs b/crates/lexgen/src/dfa/simplify.rs index a845990..0847577 100644 --- a/crates/lexgen/src/dfa/simplify.rs +++ b/crates/lexgen/src/dfa/simplify.rs @@ -1,20 +1,21 @@ use super::{State, StateIdx, DFA}; use crate::collections::Map; -use crate::semantic_action_table::SemanticActionIdx; +use crate::nfa::AcceptingState; #[derive(Debug)] -pub enum Trans { - Accept(SemanticActionIdx), +pub enum Trans { + Accept(Vec>), Trans(StateIdx), } /// Removes accepting states with no transitions, makes the transitions to those states accepting. 
-pub fn simplify( - dfa: DFA, +pub fn simplify( + dfa: DFA, dfa_state_indices: &mut Map, -) -> DFA { - let mut empty_states: Vec<(StateIdx, Option)> = vec![]; - let mut non_empty_states: Vec<(StateIdx, State)> = vec![]; +) -> DFA, A> { + let mut empty_states: Vec<(StateIdx, Vec>)> = vec![]; + + let mut non_empty_states: Vec<(StateIdx, State)> = vec![]; for (state_idx, state) in dfa.into_state_indices() { if state.has_no_transitions() && !state.initial { @@ -31,14 +32,14 @@ pub fn simplify( *t = t.map(|i| i - idx); } - let map_transition = |t: StateIdx| -> Option { + let map_transition = |t: StateIdx| -> Trans { match empty_states.binary_search_by(|(state_idx, _action)| state_idx.cmp(&t)) { - Ok(idx) => empty_states[idx].1.map(Trans::Accept), - Err(idx) => Some(Trans::Trans(t.map(|i| i - idx))), + Ok(idx) => Trans::Accept(empty_states[idx].1.clone()), + Err(idx) => Trans::Trans(t.map(|i| i - idx)), } }; - let new_states: Vec> = non_empty_states + let new_states: Vec, A>> = non_empty_states .into_iter() .map(|(_state_idx, state)| { let State { @@ -53,19 +54,19 @@ pub fn simplify( let char_transitions = char_transitions .into_iter() - .filter_map(|(char, next)| map_transition(next).map(|next| (char, next))) + .map(|(char, next)| (char, map_transition(next))) .collect(); - let range_transitions = range_transitions.filter_map(map_transition); + let range_transitions = range_transitions.map(map_transition); - let any_transition = any_transition.and_then(map_transition); + let any_transition = any_transition.map(map_transition); - let end_of_input_transition = end_of_input_transition.and_then(map_transition); + let end_of_input_transition = end_of_input_transition.map(map_transition); let predecessors = predecessors .into_iter() .map(|pred| match map_transition(pred) { - Some(Trans::Trans(pred)) => pred, + Trans::Trans(pred) => pred, _ => { // This pass should only remove nodes without successors, so it's a bug if // we remove a predecessor diff --git 
a/crates/lexgen/src/dfa/simulate.rs b/crates/lexgen/src/dfa/simulate.rs index 9a9ac2c..5edd7da 100644 --- a/crates/lexgen/src/dfa/simulate.rs +++ b/crates/lexgen/src/dfa/simulate.rs @@ -1,10 +1,16 @@ use super::{StateIdx, DFA}; pub use crate::nfa::simulate::{ErrorLoc, Matches}; +use crate::nfa::AcceptingState; use crate::range_map::Range; +use crate::right_ctx::RightCtxDFAs; impl DFA { - pub fn simulate<'input>(&self, input: &'input str) -> (Matches<'input, A>, Option) { + pub fn simulate<'input>( + &self, + input: &'input str, + right_ctx_dfas: &RightCtxDFAs, + ) -> (Matches<'input, A>, Option) { let mut values: Matches<'input, A> = vec![]; // Current state @@ -52,8 +58,22 @@ impl DFA { state = next_state; // Check for accepting state - if let Some(value) = self.states[state.0].accepting { - last_match = Some((match_start, value, char_idx + char.len_utf8())); + for AcceptingState { value, right_ctx } in &self.states[state.0].accepting { + match right_ctx { + None => { + last_match = + Some((match_start, *value, char_idx + char.len_utf8())); + break; + } + Some(right_ctx_idx) => { + let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); + if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { + last_match = + Some((match_start, *value, char_idx + char.len_utf8())); + break; + } + } + } } } } @@ -63,9 +83,20 @@ impl DFA { if let Some(next) = next_end_of_input(self, state) { // Check for accepting state state = next; - if let Some(value) = self.states[state.0].accepting { - values.push((&input[match_start..], value)); - break; // 'outer + for AcceptingState { value, right_ctx } in &self.states[state.0].accepting { + match right_ctx { + None => { + values.push((&input[match_start..], *value)); + break 'outer; + } + Some(right_ctx_idx) => { + let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); + if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { + values.push((&input[match_start..], *value)); + break 'outer; + } + } + } } } @@ -120,3 +151,37 @@ fn 
next(dfa: &DFA, state: StateIdx, char: char) -> Option(dfa: &DFA, state: StateIdx) -> Option { dfa.states[state.0].end_of_input_transition } + +// Similar to `simulate`, but does not keep track of the last match as we don't need "longest +// match" semantics and backtracking +pub fn simulate_right_ctx( + dfa: &DFA, + mut char_indices: std::str::CharIndices, +) -> bool { + let mut state = dfa.initial_state(); + + if dfa.is_accepting_state(state) { + return true; + } + + while let Some((_, char)) = char_indices.next() { + match next(dfa, state, char) { + None => { + // Stuck + return false; + } + Some(next_state) => { + if dfa.is_accepting_state(next_state) { + return true; + } + + state = next_state; + } + } + } + + match next_end_of_input(dfa, state) { + None => false, + Some(next_state) => dfa.is_accepting_state(next_state), + } +} diff --git a/crates/lexgen/src/lib.rs b/crates/lexgen/src/lib.rs index fcde178..b7ad439 100644 --- a/crates/lexgen/src/lib.rs +++ b/crates/lexgen/src/lib.rs @@ -19,16 +19,18 @@ mod nfa; mod nfa_to_dfa; mod range_map; mod regex_to_nfa; +mod right_ctx; mod semantic_action_table; #[cfg(test)] mod tests; -use ast::{Lexer, Regex, Rule, SingleRule, Var}; +use ast::{Lexer, Regex, RegexCtx, Rule, SingleRule, Var}; use collections::Map; use dfa::{StateIdx as DfaStateIdx, DFA}; use nfa::NFA; use nfa_to_dfa::nfa_to_dfa; +use right_ctx::RightCtxDFAs; use semantic_action_table::{SemanticActionIdx, SemanticActionTable}; use std::collections::hash_map::Entry; @@ -54,6 +56,9 @@ pub fn lexer(input: TokenStream) -> TokenStream { // Maps DFA names to their initial states in the final DFA let mut dfas: Map = Default::default(); + // DFAs generated for right contexts + let mut right_ctx_dfas = RightCtxDFAs::new(); + let mut bindings: Map = Default::default(); let mut dfa: Option> = None; @@ -71,12 +76,13 @@ pub fn lexer(input: TokenStream) -> TokenStream { panic!("Variable {:?} is defined multiple times", entry.key().0); } Entry::Vacant(entry) => { - 
entry.insert(re); + // TODO: Check that regex doesn't have right context + entry.insert(re.re); } }, Rule::RuleSet { name, rules } => { if name == "Init" { - let dfa = dfa.insert(compile_rules(rules, &bindings)); + let dfa = dfa.insert(compile_rules(rules, &bindings, &mut right_ctx_dfas)); let initial_state = dfa.initial_state(); if dfas.insert(name.to_string(), initial_state).is_some() { @@ -87,7 +93,7 @@ pub fn lexer(input: TokenStream) -> TokenStream { .as_mut() .expect("First rule set should be named \"Init\""); - let dfa_ = compile_rules(rules, &bindings); + let dfa_ = compile_rules(rules, &bindings, &mut right_ctx_dfas); let dfa_idx = dfa.add_dfa(dfa_); @@ -106,7 +112,7 @@ pub fn lexer(input: TokenStream) -> TokenStream { ); } - let dfa = dfa.insert(compile_rules(rules, &bindings)); + let dfa = dfa.insert(compile_rules(rules, &bindings, &mut right_ctx_dfas)); let initial_state = dfa.initial_state(); dfas.insert("Init".to_owned(), initial_state); } @@ -131,6 +137,7 @@ pub fn lexer(input: TokenStream) -> TokenStream { dfa::codegen::reify( dfa, + &right_ctx_dfas, semantic_action_table, user_state_type, user_error_type, @@ -145,11 +152,18 @@ pub fn lexer(input: TokenStream) -> TokenStream { fn compile_rules( rules: Vec, bindings: &Map, + right_ctx_dfas: &mut RightCtxDFAs, ) -> DFA { let mut nfa: NFA = NFA::new(); for SingleRule { lhs, rhs } in rules { - nfa.add_regex(bindings, &lhs, rhs); + let RegexCtx { re, right_ctx } = lhs; + + let right_ctx = right_ctx + .as_ref() + .map(|right_ctx| right_ctx_dfas.new_right_ctx(bindings, right_ctx)); + + nfa.add_regex(bindings, &re, right_ctx, rhs); } nfa_to_dfa(&nfa) diff --git a/crates/lexgen/src/nfa.rs b/crates/lexgen/src/nfa.rs index 64d468b..c378678 100644 --- a/crates/lexgen/src/nfa.rs +++ b/crates/lexgen/src/nfa.rs @@ -6,6 +6,7 @@ use crate::collections::{Map, Set}; use crate::display::HashSetDisplay; use crate::range_map::{Range, RangeMap}; use crate::regex_to_nfa; +use crate::right_ctx::RightCtxIdx; /// 
Non-deterministic finite automate, parameterized on values of accepting states. #[derive(Debug)] @@ -24,7 +25,13 @@ struct State { empty_transitions: Set, any_transitions: Set, end_of_input_transitions: Set, - accepting: Option, + accepting: Option>, +} + +#[derive(Debug, Clone, Copy)] +pub struct AcceptingState { + pub value: A, + pub right_ctx: Option, } impl State { @@ -51,7 +58,7 @@ impl NFA { StateIdx(0) } - pub fn get_accepting_state(&self, state: StateIdx) -> Option<&A> { + pub fn get_accepting_state(&self, state: StateIdx) -> Option<&AcceptingState> { self.states[state.0].accepting.as_ref() } @@ -86,16 +93,23 @@ impl NFA { new_state_idx } - pub fn add_regex(&mut self, bindings: &Map, re: &Regex, value: A) { + pub fn add_regex( + &mut self, + bindings: &Map, + re: &Regex, + right_ctx: Option, + value: A, + ) { let re_accepting_state = self.new_state(); - self.make_state_accepting(re_accepting_state, value); + + self.make_state_accepting(re_accepting_state, value, right_ctx); let re_initial_state = self.new_state(); let nfa_initial_state = self.initial_state(); - regex_to_nfa::add_re(self, bindings, re, re_initial_state, re_accepting_state); - self.add_empty_transition(nfa_initial_state, re_initial_state); + + regex_to_nfa::add_re(self, bindings, re, re_initial_state, re_accepting_state); } pub fn add_char_transition(&mut self, state: StateIdx, char: char, next: StateIdx) { @@ -156,10 +170,12 @@ impl NFA { assert!(not_exists, "add_end_of_input_transition"); } - fn make_state_accepting(&mut self, state: StateIdx, value: A) { - let old = self.states[state.0].accepting.replace(value); + fn make_state_accepting(&mut self, state: StateIdx, value: A, right_ctx: Option) { + let old = self.states[state.0] + .accepting + .replace(AcceptingState { value, right_ctx }); - debug_assert!(old.is_none(), "make_state_accepting"); + assert!(old.is_none(), "make_state_accepting"); } pub fn compute_state_closure(&self, states: &Set) -> Set { @@ -203,10 +219,22 @@ impl Display 
for NFA { accepting, } = state; - if accepting.is_some() { - write!(f, "{:>4}", format!("*{}", state_idx))?; - } else { - write!(f, "{:>4}:", state_idx)?; + match accepting { + Some(AcceptingState { + value: _, + right_ctx, + }) => match right_ctx { + Some(right_ctx_idx) => { + write!(f, "{:>4}", format!("*{}", state_idx),)?; + write!(f, " (ctx {})", right_ctx_idx.as_usize())?; + } + None => { + write!(f, "{:>4}", format!("*{}", state_idx))?; + } + }, + None => { + write!(f, "{:>4}:", state_idx)?; + } } let mut first = true; diff --git a/crates/lexgen/src/nfa/simulate.rs b/crates/lexgen/src/nfa/simulate.rs index 5d45229..caecdde 100644 --- a/crates/lexgen/src/nfa/simulate.rs +++ b/crates/lexgen/src/nfa/simulate.rs @@ -1,12 +1,19 @@ -use super::{StateIdx, NFA}; +use super::{AcceptingState, StateIdx, NFA}; use crate::collections::Set; +use crate::dfa::simulate::simulate_right_ctx; +use crate::dfa::StateIdx as DfaStateIdx; +use crate::right_ctx::RightCtxDFAs; pub type Matches<'input, A> = Vec<(&'input str, A)>; pub type ErrorLoc = usize; impl NFA { - pub fn simulate<'input>(&self, input: &'input str) -> (Matches<'input, A>, Option) { + pub fn simulate<'input>( + &self, + input: &'input str, + right_ctx_dfas: &RightCtxDFAs, + ) -> (Matches<'input, A>, Option) { let mut values: Matches<'input, A> = vec![]; // If we skipped an accepting state because we were able to make progress with the next @@ -65,9 +72,24 @@ impl NFA { let mut states_sorted: Vec = states.iter().copied().collect(); states_sorted.sort(); for state in states_sorted { - if let Some(value) = self.states[state.0].accepting { - last_match = Some((match_start, value, char_idx + char.len_utf8())); - break; + if let Some(AcceptingState { value, right_ctx }) = + &self.states[state.0].accepting + { + match right_ctx { + None => { + last_match = + Some((match_start, *value, char_idx + char.len_utf8())); + break; + } + Some(right_ctx_idx) => { + let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); + if 
simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { + last_match = + Some((match_start, *value, char_idx + char.len_utf8())); + break; + } + } + } } } } @@ -81,9 +103,22 @@ impl NFA { states_sorted.sort(); for state in states_sorted { - if let Some(value) = self.states[state.0].accepting { - values.push((&input[match_start..], value)); - break 'outer; + if let Some(AcceptingState { value, right_ctx }) = + &self.states[state.0].accepting + { + match right_ctx { + None => { + values.push((&input[match_start..], *value)); + break 'outer; + } + Some(right_ctx_idx) => { + let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); + if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { + values.push((&input[match_start..], *value)); + break 'outer; + } + } + } } } } diff --git a/crates/lexgen/src/range_map.rs b/crates/lexgen/src/range_map.rs index 7682dbf..7fae7bf 100644 --- a/crates/lexgen/src/range_map.rs +++ b/crates/lexgen/src/range_map.rs @@ -61,21 +61,6 @@ impl RangeMap { self.ranges.is_empty() } - pub fn filter_map(self, mut f: F) -> RangeMap - where - F: FnMut(A) -> Option, - { - RangeMap { - ranges: self - .ranges - .into_iter() - .filter_map(|Range { start, end, value }| { - f(value).map(|value| Range { start, end, value }) - }) - .collect(), - } - } - pub fn map(self, mut f: F) -> RangeMap where F: FnMut(A) -> B, diff --git a/crates/lexgen/src/right_ctx.rs b/crates/lexgen/src/right_ctx.rs new file mode 100644 index 0000000..fa20111 --- /dev/null +++ b/crates/lexgen/src/right_ctx.rs @@ -0,0 +1,72 @@ +//! Stuff related to right contexts +//! +//! A right context is a limited version of lookahead. A rule can have at most one right context. +//! When a rule has right context, after the regex for the rule matches, we run the DFA for the +//! right context with cloned input stream. Only if it matches we consider the rule as a match. +//! This provides a simple "lookahead" support, which should be good enough when lexing programming +//! languages.
+ +use crate::ast::{Regex, Var}; +use crate::collections::Map; +// use crate::dfa::simplify::{simplify, Trans}; +use crate::dfa::{StateIdx, DFA}; +use crate::nfa::NFA; +use crate::nfa_to_dfa::nfa_to_dfa; + +#[derive(Debug)] +pub struct RightCtxDFAs { + dfas: Vec>, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct RightCtxIdx(usize); + +impl RightCtxIdx { + pub fn as_usize(&self) -> usize { + self.0 + } +} + +impl RightCtxDFAs { + pub fn new() -> Self { + RightCtxDFAs { dfas: vec![] } + } + + pub fn iter(&self) -> impl Iterator)> { + self.dfas + .iter() + .enumerate() + .map(|(i, dfa)| (RightCtxIdx(i), dfa)) + } +} + +impl RightCtxDFAs { + pub fn new_right_ctx(&mut self, bindings: &Map, right_ctx: &Regex) -> RightCtxIdx { + let idx = self.dfas.len(); + + let mut nfa: NFA<()> = NFA::new(); + nfa.add_regex(bindings, right_ctx, None, ()); + + let dfa = nfa_to_dfa(&nfa); + self.dfas.push(dfa); + + RightCtxIdx(idx) + } + + #[cfg(test)] + pub fn get(&self, right_ctx: &RightCtxIdx) -> &DFA { + &self.dfas[right_ctx.as_usize()] + } + + /* + pub fn simplify(self) -> RightCtxDFAs> { + RightCtxDFAs { + dfas: self + .dfas + .into_iter() + .map(|dfa| simplify::<(), ()>(dfa, &mut Default::default())) + .collect(), + } + } + */ +} diff --git a/crates/lexgen/src/tests.rs b/crates/lexgen/src/tests.rs index 6687b73..074bdf6 100644 --- a/crates/lexgen/src/tests.rs +++ b/crates/lexgen/src/tests.rs @@ -1,12 +1,22 @@ use crate::ast::{CharOrRange, CharSet, Regex, Var}; use crate::collections::Map; +use crate::dfa::StateIdx as DfaStateIdx; use crate::nfa::simulate::{ErrorLoc, Matches}; use crate::nfa::NFA; use crate::nfa_to_dfa::nfa_to_dfa; +use crate::right_ctx::RightCtxDFAs; fn test_simulate<'input, A: Copy + std::fmt::Debug + Eq>( nfa: &NFA, test_cases: Vec<(&'input str, Matches<'input, A>, Option)>, +) { + test_simulate_right_ctx(nfa, &RightCtxDFAs::new(), test_cases) +} + +fn test_simulate_right_ctx<'input, A: Copy + std::fmt::Debug + Eq>( + nfa: &NFA, + 
right_ctx_dfas: &RightCtxDFAs, + test_cases: Vec<(&'input str, Matches<'input, A>, Option)>, ) { println!("NFA=\n{}", nfa); @@ -18,14 +28,14 @@ fn test_simulate<'input, A: Copy + std::fmt::Debug + Eq>( let expected = (expected_matches, expected_error); assert_eq!( - &nfa.simulate(str), + &nfa.simulate(str, right_ctx_dfas), &expected, "NFA simulation failed for string: {:?}", str ); assert_eq!( - dfa.simulate(str), + dfa.simulate(str, right_ctx_dfas), expected, "DFA simulation failed for string: {:?}", str @@ -43,10 +53,11 @@ fn simulate_backtracking() { Box::new(Regex::OneOrMore(Box::new(Regex::Char('a')))), Box::new(Regex::Char('b')), ), + None, 1, ); - nfa.add_regex(&Default::default(), &Regex::Char('a'), 2); + nfa.add_regex(&Default::default(), &Regex::Char('a'), None, 2); test_simulate( &nfa, @@ -62,9 +73,24 @@ fn simulate_backtracking() { fn issue_16() { let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &Regex::String("xyzxyz".to_owned()), 1); - nfa.add_regex(&Default::default(), &Regex::String("xyz".to_owned()), 2); - nfa.add_regex(&Default::default(), &Regex::String("xya".to_owned()), 3); + nfa.add_regex( + &Default::default(), + &Regex::String("xyzxyz".to_owned()), + None, + 1, + ); + nfa.add_regex( + &Default::default(), + &Regex::String("xyz".to_owned()), + None, + 2, + ); + nfa.add_regex( + &Default::default(), + &Regex::String("xya".to_owned()), + None, + 3, + ); test_simulate( &nfa, @@ -85,7 +111,12 @@ fn stuck_1() { fn stuck_2() { let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &Regex::String("ab".to_owned()), 1); + nfa.add_regex( + &Default::default(), + &Regex::String("ab".to_owned()), + None, + 1, + ); test_simulate(&nfa, vec![("aba", vec![("ab", 1)], Some(2))]); } @@ -94,8 +125,13 @@ fn stuck_2() { fn stuck_3() { let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &Regex::String("aaab".to_owned()), 1); - nfa.add_regex(&Default::default(), &Regex::String("a".to_owned()), 2); + nfa.add_regex( + 
&Default::default(), + &Regex::String("aaab".to_owned()), + None, + 1, + ); + nfa.add_regex(&Default::default(), &Regex::String("a".to_owned()), None, 2); test_simulate(&nfa, vec![("aaabb", vec![("aaab", 1)], Some(4))]); } @@ -104,7 +140,7 @@ fn stuck_3() { fn simulate_char() { let re = Regex::Char('a'); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -119,7 +155,7 @@ fn simulate_char() { fn simulate_string() { let re = Regex::String("ab".to_owned()); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -138,7 +174,7 @@ fn simulate_char_set_char() { CharOrRange::Char('b'), ])); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -159,7 +195,7 @@ fn simulate_char_set_range() { CharOrRange::Range('0', '9'), ])); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -171,7 +207,7 @@ fn simulate_char_set_range() { fn simulate_zero_or_more() { let re = Regex::ZeroOrMore(Box::new(Regex::Char('a'))); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -189,7 +225,7 @@ fn simulate_zero_or_more() { fn simulate_one_or_more() { let re = Regex::OneOrMore(Box::new(Regex::Char('a'))); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -206,7 +242,7 @@ fn simulate_one_or_more() { fn simulate_zero_or_one() { let re = Regex::ZeroOrOne(Box::new(Regex::Char('a'))); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 
1); test_simulate( &nfa, @@ -223,7 +259,7 @@ fn simulate_zero_or_one() { fn simulate_concat() { let re = Regex::Concat(Box::new(Regex::Char('a')), Box::new(Regex::Char('b'))); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -239,7 +275,7 @@ fn simulate_concat() { fn simulate_or() { let re = Regex::Or(Box::new(Regex::Char('a')), Box::new(Regex::Char('b'))); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -258,7 +294,7 @@ fn simulate_or_one_or_more_char() { Box::new(Regex::Char('b')), ); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -275,8 +311,8 @@ fn simulate_multiple_accepting_states_1() { let re1 = Regex::String("aaaa".to_owned()); let re2 = Regex::String("aaab".to_owned()); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re1, 1); - nfa.add_regex(&Default::default(), &re2, 2); + nfa.add_regex(&Default::default(), &re1, None, 1); + nfa.add_regex(&Default::default(), &re2, None, 2); test_simulate( &nfa, @@ -296,8 +332,8 @@ fn multiple_accepting_states_2() { ); let re2 = Regex::CharSet(CharSet(vec![CharOrRange::Range('0', '9')])); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &re1, 1); - nfa.add_regex(&Default::default(), &re2, 2); + nfa.add_regex(&Default::default(), &re1, None, 1); + nfa.add_regex(&Default::default(), &re2, None, 2); test_simulate( &nfa, @@ -334,7 +370,7 @@ fn simulate_variables() { ))))), ); let mut nfa: NFA = NFA::new(); - nfa.add_regex(&bindings, &re, 1); + nfa.add_regex(&bindings, &re, None, 1); test_simulate( &nfa, @@ -355,7 +391,7 @@ fn zero_or_more_concat_confusion_1() { Box::new(Regex::Char('a')), ); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, 
None, 1); test_simulate( &nfa, @@ -372,7 +408,7 @@ fn zero_or_more_concat_confusion_2() { Box::new(Regex::String("ab".to_owned())), ); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -395,7 +431,7 @@ fn zero_or_more_concat_confusion_3() { Box::new(Regex::Char('a')), ); - nfa.add_regex(&Default::default(), &re, 1); + nfa.add_regex(&Default::default(), &re, None, 1); test_simulate( &nfa, @@ -411,8 +447,13 @@ fn zero_or_more_concat_confusion_3() { fn simulate_any_1() { let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &Regex::String("ab".to_owned()), 1); - nfa.add_regex(&Default::default(), &Regex::Any, 2); + nfa.add_regex( + &Default::default(), + &Regex::String("ab".to_owned()), + None, + 1, + ); + nfa.add_regex(&Default::default(), &Regex::Any, None, 2); test_simulate( &nfa, @@ -437,6 +478,7 @@ fn simulate_any_2() { Box::new(Regex::Char('\'')), )), ), + None, 1, ); @@ -460,6 +502,7 @@ fn simulate_end_of_input_1() { )), )), ), + None, 1, ); @@ -477,10 +520,11 @@ fn simulate_end_of_input_1() { fn simulate_end_of_input_2() { let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &Regex::EndOfInput, 1); + nfa.add_regex(&Default::default(), &Regex::EndOfInput, None, 1); nfa.add_regex( &Default::default(), &Regex::ZeroOrMore(Box::new(Regex::Any)), + None, 2, ); @@ -492,9 +536,24 @@ fn simulate_end_of_input_2() { fn simulate_multiple_accepting_states_3() { let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &Regex::String("aaa".to_owned()), 1); - nfa.add_regex(&Default::default(), &Regex::String("aaa".to_owned()), 2); - nfa.add_regex(&Default::default(), &Regex::String("aa".to_owned()), 3); + nfa.add_regex( + &Default::default(), + &Regex::String("aaa".to_owned()), + None, + 1, + ); + nfa.add_regex( + &Default::default(), + &Regex::String("aaa".to_owned()), + None, + 2, + ); + nfa.add_regex( + &Default::default(), + &Regex::String("aa".to_owned()), + 
None, + 3, + ); test_simulate( &nfa, @@ -509,12 +568,18 @@ fn simulate_multiple_accepting_states_3() { fn range_and_char_confusion() { let mut nfa: NFA = NFA::new(); - nfa.add_regex(&Default::default(), &Regex::String("ab".to_owned()), 1); + nfa.add_regex( + &Default::default(), + &Regex::String("ab".to_owned()), + None, + 1, + ); nfa.add_regex( &Default::default(), &Regex::OneOrMore(Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range( 'a', 'z', )])))), + None, 2, ); @@ -534,6 +599,7 @@ fn overlapping_ranges() { Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'b')]))), Box::new(Regex::Char('1')), ), + None, 1, ); nfa.add_regex( @@ -542,6 +608,7 @@ fn overlapping_ranges() { Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'c')]))), Box::new(Regex::Char('2')), ), + None, 2, ); @@ -550,3 +617,58 @@ fn overlapping_ranges() { vec![("a1", vec![("a1", 1)], None), ("a2", vec![("a2", 2)], None)], ); } + +#[test] +fn right_context_1() { + let mut nfa: NFA = NFA::new(); + let mut right_ctxs = RightCtxDFAs::new(); + + let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Char('a')); + nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1); + + test_simulate_right_ctx(&nfa, &right_ctxs, vec![("aa", vec![("a", 1)], Some(1))]); + test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![], Some(0))]); +} + +#[test] +fn right_context_2() { + let mut nfa: NFA = NFA::new(); + let mut right_ctxs = RightCtxDFAs::new(); + + let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Any); + nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1); + + test_simulate_right_ctx(&nfa, &right_ctxs, vec![("aa", vec![("a", 1)], Some(1))]); + test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![("a", 1)], Some(1))]); + test_simulate_right_ctx(&nfa, &right_ctxs, vec![("a", vec![], Some(0))]); +} + +#[test] +fn right_context_3() { + let mut nfa: NFA = NFA::new(); + let mut right_ctxs = 
RightCtxDFAs::new(); + + let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::EndOfInput); + nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1); + + test_simulate_right_ctx(&nfa, &right_ctxs, vec![("a", vec![("a", 1)], None)]); + test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![], Some(0))]); +} + +#[test] +fn right_context_4() { + let mut nfa: NFA = NFA::new(); + let mut right_ctxs = RightCtxDFAs::new(); + + let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Char('a')); + nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1); + + let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::EndOfInput); + nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 2); + + test_simulate_right_ctx( + &nfa, + &right_ctxs, + vec![("aa", vec![("a", 1), ("a", 2)], None)], + ); +} diff --git a/crates/lexgen/tests/right_ctx.rs b/crates/lexgen/tests/right_ctx.rs new file mode 100644 index 0000000..03d0457 --- /dev/null +++ b/crates/lexgen/tests/right_ctx.rs @@ -0,0 +1,226 @@ +mod test_utils; + +use lexgen::lexer; +use lexgen_util::{LexerError, LexerErrorKind}; +use test_utils::{loc, next}; + +#[test] +fn right_ctx_1() { + lexer! { + Lexer -> u32; + + 'a' > 'a' = 1, + } + + let mut lexer = Lexer::new("aa"); + assert_eq!(next(&mut lexer), Some(Ok(1))); + assert_eq!( + next(&mut lexer), + Some(Err(LexerError { + location: loc(0, 1, 1), + kind: LexerErrorKind::InvalidToken, + })) + ); + + let mut lexer = Lexer::new("ab"); + assert_eq!( + next(&mut lexer), + Some(Err(LexerError { + location: loc(0, 0, 0), + kind: LexerErrorKind::InvalidToken, + })) + ); +} + +#[test] +fn right_ctx_2() { + lexer! 
{ + Lexer -> u32; + + 'a' > _ = 1, + } + + let mut lexer = Lexer::new("aa"); + assert_eq!(next(&mut lexer), Some(Ok(1))); + assert_eq!( + next(&mut lexer), + Some(Err(LexerError { + location: loc(0, 1, 1), + kind: LexerErrorKind::InvalidToken, + })) + ); + + let mut lexer = Lexer::new("ab"); + assert_eq!(next(&mut lexer), Some(Ok(1))); + assert_eq!( + next(&mut lexer), + Some(Err(LexerError { + location: loc(0, 1, 1), + kind: LexerErrorKind::InvalidToken, + })) + ); + + let mut lexer = Lexer::new("a"); + assert_eq!( + next(&mut lexer), + Some(Err(LexerError { + location: loc(0, 0, 0), + kind: LexerErrorKind::InvalidToken, + })) + ); +} + +#[test] +fn right_ctx_3() { + lexer! { + Lexer -> u32; + + 'a' > $ = 1, + } + + let mut lexer = Lexer::new("a"); + assert_eq!(next(&mut lexer), Some(Ok(1))); + assert_eq!(next(&mut lexer), None); + + let mut lexer = Lexer::new("ab"); + assert_eq!( + next(&mut lexer), + Some(Err(LexerError { + location: loc(0, 0, 0), + kind: LexerErrorKind::InvalidToken, + })) + ); +} + +#[test] +fn right_ctx_4() { + lexer! { + Lexer -> u32; + + 'a' > 'a' = 1, + 'a' > $ = 2, + } + + let mut lexer = Lexer::new("a"); + assert_eq!(next(&mut lexer), Some(Ok(2))); + assert_eq!(next(&mut lexer), None); + + let mut lexer = Lexer::new("aa"); + assert_eq!(next(&mut lexer), Some(Ok(1))); + assert_eq!(next(&mut lexer), Some(Ok(2))); + assert_eq!(next(&mut lexer), None); +} + +#[test] +fn rust_single_line_comment() { + lexer! 
{ + Lexer -> &'input str; + + rule Init { + $$ascii_whitespace, + + "//" => |lexer| lexer.switch(LexerRule::SinglelineComment), + } + + rule SinglelineComment { + (_ # '\n')* > ('\n' | $) => |lexer| { + let comment = lexer.match_(); + lexer.switch_and_return(LexerRule::Init, comment) + }, + } + } + + // Terminated at the end of input (no newline) + let input = "// / "; + let mut lexer = Lexer::new(input); + assert_eq!(next(&mut lexer), Some(Ok(input))); + assert_eq!(next(&mut lexer), None); + + // Terminated with newlines + let input = "// / \n"; + let mut lexer = Lexer::new(input); + assert_eq!(next(&mut lexer), Some(Ok("// / "))); + assert_eq!(next(&mut lexer), None); + + // Empty comment, terminated with eof + let input = "//"; + let mut lexer = Lexer::new(input); + assert_eq!(next(&mut lexer), Some(Ok("//"))); + assert_eq!(next(&mut lexer), None); + + // Empty comment, terminated with eol + let input = "//\n"; + let mut lexer = Lexer::new(input); + assert_eq!(next(&mut lexer), Some(Ok("//"))); + assert_eq!(next(&mut lexer), None); +} + +#[test] +fn rust_float() { + #[derive(Debug, PartialEq, Eq)] + enum Token<'input> { + Float(&'input str), + Int(&'input str), + Range, + } + + lexer! { + Lexer -> Token<'input>; + + ['0'-'9']+ '.' > (_ # ('.' | '_' | $$XID_Start) | $) => |lexer| { + let match_ = lexer.match_(); + lexer.return_(Token::Float(match_)) + }, + + ['0'-'9']+ => |lexer| { + let match_ = lexer.match_(); + lexer.return_(Token::Int(match_)) + }, + + ".." = Token::Range, + } + + let mut lexer = Lexer::new("1."); + assert_eq!(next(&mut lexer), Some(Ok(Token::Float("1.")))); + assert_eq!(next(&mut lexer), None); + + let mut lexer = Lexer::new("1.."); + assert_eq!(next(&mut lexer), Some(Ok(Token::Int("1")))); + assert_eq!(next(&mut lexer), Some(Ok(Token::Range))); + assert_eq!(next(&mut lexer), None); +} + +#[test] +fn ligature_shaping() { + #[derive(Debug, PartialEq, Eq)] + enum Token<'input> { + Lig(&'input str), + NotLig(&'input str), + } + + lexer! 
{ + Lexer -> Token<'input>; + + "---" > ((_ # '-') | $) => |lexer| { + let match_ = lexer.match_(); + lexer.return_(Token::Lig(match_)) + }, + + _+ => |lexer| { + let match_ = lexer.match_(); + lexer.return_(Token::NotLig(match_)) + }, + } + + let mut lexer = Lexer::new("--"); + assert_eq!(next(&mut lexer), Some(Ok(Token::NotLig("--")))); + assert_eq!(next(&mut lexer), None); + + let mut lexer = Lexer::new("---"); + assert_eq!(next(&mut lexer), Some(Ok(Token::Lig("---")))); + assert_eq!(next(&mut lexer), None); + + let mut lexer = Lexer::new("----"); + assert_eq!(next(&mut lexer), Some(Ok(Token::NotLig("----")))); + assert_eq!(next(&mut lexer), None); +} diff --git a/crates/lexgen_util/src/lib.rs b/crates/lexgen_util/src/lib.rs index 6e68036..bc3c7db 100644 --- a/crates/lexgen_util/src/lib.rs +++ b/crates/lexgen_util/src/lib.rs @@ -78,7 +78,7 @@ pub struct Lexer<'input, Token, State, Error, Wrapper> { // Character iterator. `Peekable` is used in the handler's `peek` method. Note that we can't // use byte index returned by this directly, as we re-initialize this field when backtracking. // Add `iter_byte_idx` to the byte index before using. When resetting, update `iter_byte_idx`. 
- iter: std::iter::Peekable>, + pub __iter: std::iter::Peekable>, // Start of the current match current_match_start: Loc, @@ -113,7 +113,7 @@ impl<'input, T, S, E, W> Lexer<'input, T, S, E, W> { user_state: state, input, iter_loc: Loc::ZERO, - iter: input.chars().peekable(), + __iter: input.chars().peekable(), current_match_start: Loc::ZERO, current_match_end: Loc::ZERO, last_match: None, @@ -122,7 +122,7 @@ impl<'input, T, S, E, W> Lexer<'input, T, S, E, W> { // Read the next chracter pub fn next(&mut self) -> Option { - match self.iter.next() { + match self.__iter.next() { None => None, Some(char) => { self.current_match_end.byte_idx += char.len_utf8(); @@ -140,7 +140,7 @@ impl<'input, T, S, E, W> Lexer<'input, T, S, E, W> { } pub fn peek(&mut self) -> Option { - self.iter.peek().copied() + self.__iter.peek().copied() } // On success returns semantic action function for the last match @@ -157,7 +157,7 @@ impl<'input, T, S, E, W> Lexer<'input, T, S, E, W> { self.__done = false; self.current_match_start = match_start; self.current_match_end = match_end; - self.iter = self.input[match_end.byte_idx..].chars().peekable(); + self.__iter = self.input[match_end.byte_idx..].chars().peekable(); self.iter_loc = match_end; Ok(semantic_action) }