osa1 · osa1 · Jan 31, 2022 · Dec 9, 2021 · Dec 10, 2021 · Dec 10, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,10 @@
   }
   ```
 
+- New syntax added for right contexts. A right context is basically a
+  lookahead, but it can only be used in rules and cannot be nested inside
+  regexes. See README for details. (#29)
+
 # 2021/11/30: 0.8.1
 
 New version published to fix broken README pages for lexgen and lexgen_util in

diff --git a/README.md b/README.md
@@ -175,6 +175,20 @@ You can use parenthesis for grouping, e.g. `('a' | 'b')*`.
 
 Example: `'a' 'b' | 'c'+` is the same as `(('a' 'b') | ('c'+))`.
 
+## Right context (lookahead)
+
+The regex of a rule in a rule set can be followed by another regex, its right
+context, using the `> <regex>` syntax. A right context is a limited form of
+lookahead: it can only appear after the top-level regex of a rule, and it
+cannot be nested inside a regex.
+
+For example, the rule left-hand side `'a' > (_ # 'b')` matches `'a'` as long as
+it's not followed by `'b'`.
+
+See also [right context tests] for more examples.
+
+[right context tests]: https://github.com/osa1/lexgen/blob/main/tests/right_ctx.rs
+
 ## Built-in regular expressions
 
 lexgen comes with a set of built-in regular expressions. Regular

diff --git a/crates/lexgen/src/ast.rs b/crates/lexgen/src/ast.rs
@@ -22,7 +22,7 @@ pub struct Lexer {
 
 pub enum Rule {
     /// `let <ident> = <regex>;`
-    Binding { var: Var, re: Regex },
+    Binding { var: Var, re: RegexCtx },
 
     /// `type Error = UserError;`
     ErrorType {
@@ -41,10 +41,17 @@ pub enum Rule {
 }
 
 pub struct SingleRule {
-    pub lhs: Regex,
+    pub lhs: RegexCtx,
     pub rhs: SemanticActionIdx,
 }
 
+/// Regular expression with optional right context (lookahead)
+#[derive(Debug, Clone)]
+pub struct RegexCtx {
+    pub re: Regex,
+    pub right_ctx: Option<Regex>,
+}
+
 #[derive(Debug, Clone)]
 pub enum RuleRhs {
     None,
@@ -135,13 +142,30 @@ pub enum CharOrRange {
     Range(char, char),
 }
 
-/// Parses a regex terminated with: `=>` (used in rules with RHSs), `,` (used in rules without
-/// RHSs), or `;` (used in let bindings)
+/// Parses a regex with optional right context: `re_ctx -> re [> re]`
+fn parse_regex_ctx(input: ParseStream) -> syn::Result<RegexCtx> {
+    let re = parse_regex(input)?;
+    if input.peek(syn::token::Gt) {
+        input.parse::<syn::token::Gt>()?;
+        let right_ctx = parse_regex(input)?;
+        Ok(RegexCtx {
+            re,
+            right_ctx: Some(right_ctx),
+        })
+    } else {
+        Ok(RegexCtx {
+            re,
+            right_ctx: None,
+        })
+    }
+}
+
+/// Parses a regex
 fn parse_regex(input: ParseStream) -> syn::Result<Regex> {
     parse_regex_0(input)
 }
 
-// re_0 -> re_1 | re_1 `|` re_1 (alternation)
+// re_0 -> re_1 | re_0 `|` re_1 (alternation)
 fn parse_regex_0(input: ParseStream) -> syn::Result<Regex> {
     let mut re = parse_regex_1(input)?;
 
@@ -154,7 +178,7 @@ fn parse_regex_0(input: ParseStream) -> syn::Result<Regex> {
     Ok(re)
 }
 
-// re_1 -> re_2 | re_2 re_2
+// re_1 -> re_2 | re_1 re_2 (concatenation)
 fn parse_regex_1(input: ParseStream) -> syn::Result<Regex> {
     let mut re = parse_regex_2(input)?;
 
@@ -213,7 +237,7 @@ fn parse_regex_4(input: ParseStream) -> syn::Result<Regex> {
     if input.peek(syn::token::Paren) {
         let parenthesized;
         syn::parenthesized!(parenthesized in input);
-        parse_regex(&parenthesized)
+        parse_regex(&parenthesized) // no right ctx
     } else if input.peek(syn::token::Dollar) {
         let _ = input.parse::<syn::token::Dollar>()?;
         if input.parse::<syn::token::Dollar>().is_ok() {
@@ -269,7 +293,7 @@ fn parse_single_rule(
     input: ParseStream,
     semantic_action_table: &mut SemanticActionTable,
 ) -> syn::Result<SingleRule> {
-    let lhs = parse_regex(input)?;
+    let lhs = parse_regex_ctx(input)?;
 
     let rhs = if input.parse::<syn::token::Comma>().is_ok() {
         RuleRhs::None
@@ -308,7 +332,7 @@ fn parse_rule(
         input.parse::<syn::token::Let>()?;
         let var = input.parse::<syn::Ident>()?;
         input.parse::<syn::token::Eq>()?;
-        let re = parse_regex(input)?;
+        let re = parse_regex_ctx(input)?;
         input.parse::<syn::token::Semi>()?;
         Ok(Rule::Binding {
             var: Var(var.to_string()),

diff --git a/crates/lexgen/src/dfa.rs b/crates/lexgen/src/dfa.rs
@@ -5,6 +5,7 @@ pub mod simplify;
 pub mod simulate;
 
 use crate::collections::{Map, Set};
+use crate::nfa::AcceptingState;
 use crate::range_map::{Range, RangeMap};
 
 use std::convert::TryFrom;
@@ -38,7 +39,7 @@ pub struct State<T, A> {
     range_transitions: RangeMap<T>,
     any_transition: Option<T>,
     end_of_input_transition: Option<T>,
-    accepting: Option<A>,
+    accepting: Vec<AcceptingState<A>>,
     // Predecessors of the state, used to inline code for a state with one predecessor in the
     // predecessor's code
     predecessors: Set<StateIdx>,
@@ -52,7 +53,7 @@ impl<T, A> State<T, A> {
             range_transitions: Default::default(),
             any_transition: None,
             end_of_input_transition: None,
-            accepting: None,
+            accepting: vec![],
             predecessors: Default::default(),
         }
     }
@@ -81,12 +82,8 @@ impl<A> DFA<StateIdx, A> {
         StateIdx(0)
     }
 
-    pub fn make_state_accepting(&mut self, state: StateIdx, value: A) {
-        // Give first rule priority
-        let accepting = &mut self.states[state.0].accepting;
-        if accepting.is_none() {
-            *accepting = Some(value);
-        }
+    pub fn make_state_accepting(&mut self, state: StateIdx, accept: AcceptingState<A>) {
+        self.states[state.0].accepting.push(accept);
     }
 
     pub fn new_state(&mut self) -> StateIdx {
@@ -95,6 +92,11 @@ impl<A> DFA<StateIdx, A> {
         new_state_idx
     }
 
+    #[cfg(test)]
+    pub fn is_accepting_state(&self, state: StateIdx) -> bool {
+        !self.states[state.0].accepting.is_empty()
+    }
+
     pub fn add_char_transition(&mut self, state: StateIdx, char: char, next: StateIdx) {
         let old = self.states[state.0].char_transitions.insert(char, next);
         assert!(
@@ -238,7 +240,7 @@ impl<A> Display for DFA<StateIdx, A> {
                 predecessors: _,
             } = state;
 
-            if accepting.is_some() {
+            if !accepting.is_empty() {
                 if *initial {
                     write!(f, "{:>5}:", format!("i*{}", state_idx))?;
                 } else {