diff --git a/text-utils-grammar/src/lr1.rs b/text-utils-grammar/src/lr1.rs index 944c6ca..5800d06 100644 --- a/text-utils-grammar/src/lr1.rs +++ b/text-utils-grammar/src/lr1.rs @@ -19,76 +19,10 @@ use regex::{escape, Regex}; use regex_automata::util::primitives::StateID; use crate::{ - utils::{optimized_prefix_order, PrefixDFA, PrefixMatch}, + utils::{extract_parts, optimized_prefix_order, pattern_from_parts, PrefixDFA, PrefixMatch}, Constraint, }; -#[derive(Debug)] -enum Part { - Literal(String), - Regex(String), -} - -fn extract_parts(pattern: &str) -> Vec { - let mut parts = vec![]; - for part in pattern.split_whitespace() { - if (part.starts_with('\'') && part.ends_with('\'')) - || (part.starts_with('"') && part.ends_with('"')) - { - // treat part as literal - parts.push(Part::Literal(escape(&part[1..part.len() - 1]))); - } else { - // treat part as regular expression - parts.push(Part::Regex(part.to_string())); - } - } - parts -} - -// define function to recursively build pattern from parts -fn pattern_from_parts( - name: &str, - parts: &[Part], - name_regex: &Regex, - fragments: &HashMap<&str, Vec>, - tokens: &IndexMap<&str, Vec>, -) -> Result> { - let mut pattern = String::new(); - for part in parts { - match part { - Part::Literal(s) => pattern.push_str(s), - Part::Regex(s) => { - // find all tokens or framents in regex - // and replace them with their pattern - let mut replaced = String::new(); - let mut last_match = 0; - for caps in name_regex.captures_iter(s) { - let m = caps.get(0).unwrap(); - replaced.push_str(&s[last_match..m.start()]); - // surround token or fragment with parentheses to group it - replaced.push_str("(?:"); - let _name = caps.get(1).unwrap().as_str(); - if let Some(parts) = tokens.get(_name).or_else(|| fragments.get(_name)) { - let replacement = - pattern_from_parts(name, parts, name_regex, fragments, tokens)?; - replaced.push_str(&replacement); - } else { - return Err(format!( - "token or fragment {_name} within {name} not found in 
lexer" - ) - .into()); - } - replaced.push(')'); - last_match = m.end(); - } - replaced.push_str(&s[last_match..]); - pattern.push_str(&replaced); - } - } - } - Ok(pattern) -} - type PdfaList = Vec<(PrefixDFA, Option>)>; fn format_yacc_error(grammar: &str, e: &YaccGrammarError) -> String { diff --git a/text-utils-grammar/src/re.rs b/text-utils-grammar/src/re.rs index f33a2f6..25f6df5 100644 --- a/text-utils-grammar/src/re.rs +++ b/text-utils-grammar/src/re.rs @@ -1,6 +1,10 @@ -use std::{error::Error, fs::File, io::read_to_string, path::Path}; +use std::{collections::HashMap, error::Error, fs::File, io::read_to_string, path::Path}; -use crate::{utils::PrefixDFA, Constraint}; +use crate::{ + utils::{extract_parts, pattern_from_parts, Part, PrefixDFA}, + Constraint, +}; +use indexmap::IndexMap; use regex::Regex; use regex_automata::util::primitives::StateID; @@ -10,8 +14,38 @@ pub struct RegularExpressionConstraint { } impl RegularExpressionConstraint { - pub fn new(pattern: &str, continuations: Vec>) -> Result> { - let pdfa = PrefixDFA::new(pattern)?; + pub fn new(content: &str, continuations: Vec>) -> Result> { + let fragment_name = Regex::new(r"\{([A-Z][A-Z0-9_]*)\}")?; + let fragment_line = Regex::new(r"(?m)^([A-Z][A-Z0-9_]*)\s+(.+)$")?; + let sep = Regex::new("(?m)^%%$")?; + let pattern = if let Some(m) = sep.find(content) { + // parse fragements + let mut fragments = HashMap::new(); + for line in content[..m.start()].lines() { + if line.is_empty() || line.trim_start().starts_with("//") { + continue; + } + let cap = fragment_line + .captures(line) + .ok_or(format!("invalid fragment line: {line}"))?; + let name = cap.get(1).unwrap().as_str(); + let pattern = cap.get(2).unwrap().as_str(); + let parts = extract_parts(pattern); + if fragments.insert(name, parts).is_some() { + return Err(format!("duplicate fragment {name}").into()); + }; + } + pattern_from_parts( + "regular expression", + &[Part::Regex(content[m.end()..].to_string())], + &fragment_name, + &fragments, 
+ &IndexMap::new(), + )? + } else { + content.to_string() + }; + let pdfa = PrefixDFA::new(&pattern)?; Ok(RegularExpressionConstraint { pdfa, continuations, @@ -24,10 +58,7 @@ impl RegularExpressionConstraint { ) -> Result> { let file = File::open(path.as_ref())?; let content = read_to_string(file)?; - let sep = Regex::new("(?m)^%%$")?; - let m = sep.find(&content).ok_or("line with %% not found")?; - let pattern = &content[m.end()..]; - Self::new(pattern, continuations) + Self::new(&content, continuations) } } diff --git a/text-utils-grammar/src/utils.rs b/text-utils-grammar/src/utils.rs index 739c332..c137e53 100644 --- a/text-utils-grammar/src/utils.rs +++ b/text-utils-grammar/src/utils.rs @@ -1,12 +1,80 @@ -use std::{error::Error, fmt::Debug}; +use std::{collections::HashMap, error::Error, fmt::Debug}; +use indexmap::IndexMap; use itertools::Itertools; +use regex::{escape, Regex}; use regex_automata::{ dfa::{dense::DFA, Automaton}, util::primitives::StateID, Input, }; +#[derive(Debug)] +pub(crate) enum Part { + Literal(String), + Regex(String), +} + +pub(crate) fn extract_parts(pattern: &str) -> Vec { + let mut parts = vec![]; + for part in pattern.split_whitespace() { + if (part.starts_with('\'') && part.ends_with('\'')) + || (part.starts_with('"') && part.ends_with('"')) + { + // treat part as literal + parts.push(Part::Literal(escape(&part[1..part.len() - 1]))); + } else { + // treat part as regular expression + parts.push(Part::Regex(part.to_string())); + } + } + parts +} + +// define function to recursively build pattern from parts +pub(crate) fn pattern_from_parts( + name: &str, + parts: &[Part], + name_regex: &Regex, + fragments: &HashMap<&str, Vec>, + tokens: &IndexMap<&str, Vec>, +) -> Result> { + let mut pattern = String::new(); + for part in parts { + match part { + Part::Literal(s) => pattern.push_str(s), + Part::Regex(s) => { + // find all tokens or fragments in regex + // and replace them with their pattern + let mut replaced = String::new(); + let 
mut last_match = 0; + for caps in name_regex.captures_iter(s) { + let m = caps.get(0).unwrap(); + replaced.push_str(&s[last_match..m.start()]); + // surround token or fragment with parentheses to group it + replaced.push_str("(?:"); + let _name = caps.get(1).unwrap().as_str(); + if let Some(parts) = tokens.get(_name).or_else(|| fragments.get(_name)) { + let replacement = + pattern_from_parts(name, parts, name_regex, fragments, tokens)?; + replaced.push_str(&replacement); + } else { + return Err(format!( + "token or fragment {_name} within {name} not found in lexer" + ) + .into()); + } + replaced.push(')'); + last_match = m.end(); + } + replaced.push_str(&s[last_match..]); + pattern.push_str(&replaced); + } + } + } + Ok(pattern) +} + fn make_anchored(pat: &str) -> String { let pat: String = match (pat.starts_with('^'), pat.ends_with('$')) { (true, true) => return pat.to_string(),