update regex initialization
bastiscode committed Mar 11, 2024
1 parent 579253d commit 1b705ee
Showing 3 changed files with 109 additions and 76 deletions.
text-utils-grammar/src/lr1.rs (1 addition, 67 deletions)
@@ -19,76 +19,10 @@ use regex::{escape, Regex};
 use regex_automata::util::primitives::StateID;
 
 use crate::{
-    utils::{optimized_prefix_order, PrefixDFA, PrefixMatch},
+    utils::{extract_parts, optimized_prefix_order, pattern_from_parts, PrefixDFA, PrefixMatch},
     Constraint,
 };
 
-#[derive(Debug)]
-enum Part {
-    Literal(String),
-    Regex(String),
-}
-
-fn extract_parts(pattern: &str) -> Vec<Part> {
-    let mut parts = vec![];
-    for part in pattern.split_whitespace() {
-        if (part.starts_with('\'') && part.ends_with('\''))
-            || (part.starts_with('"') && part.ends_with('"'))
-        {
-            // treat part as literal
-            parts.push(Part::Literal(escape(&part[1..part.len() - 1])));
-        } else {
-            // treat part as regular expression
-            parts.push(Part::Regex(part.to_string()));
-        }
-    }
-    parts
-}
-
-// recursively build a regex pattern from parts, resolving token and fragment references
-fn pattern_from_parts(
-    name: &str,
-    parts: &[Part],
-    name_regex: &Regex,
-    fragments: &HashMap<&str, Vec<Part>>,
-    tokens: &IndexMap<&str, Vec<Part>>,
-) -> Result<String, Box<dyn Error>> {
-    let mut pattern = String::new();
-    for part in parts {
-        match part {
-            Part::Literal(s) => pattern.push_str(s),
-            Part::Regex(s) => {
-                // find all tokens or fragments in the regex
-                // and replace them with their pattern
-                let mut replaced = String::new();
-                let mut last_match = 0;
-                for caps in name_regex.captures_iter(s) {
-                    let m = caps.get(0).unwrap();
-                    replaced.push_str(&s[last_match..m.start()]);
-                    // surround token or fragment with parentheses to group it
-                    replaced.push_str("(?:");
-                    let _name = caps.get(1).unwrap().as_str();
-                    if let Some(parts) = tokens.get(_name).or_else(|| fragments.get(_name)) {
-                        let replacement =
-                            pattern_from_parts(name, parts, name_regex, fragments, tokens)?;
-                        replaced.push_str(&replacement);
-                    } else {
-                        return Err(format!(
-                            "token or fragment {_name} within {name} not found in lexer"
-                        )
-                        .into());
-                    }
-                    replaced.push(')');
-                    last_match = m.end();
-                }
-                replaced.push_str(&s[last_match..]);
-                pattern.push_str(&replaced);
-            }
-        }
-    }
-    Ok(pattern)
-}
-
 type PdfaList = Vec<(PrefixDFA, Option<TIdx<u32>>)>;
 
 fn format_yacc_error(grammar: &str, e: &YaccGrammarError) -> String {
text-utils-grammar/src/re.rs (39 additions, 8 deletions)
@@ -1,6 +1,10 @@
-use std::{error::Error, fs::File, io::read_to_string, path::Path};
+use std::{collections::HashMap, error::Error, fs::File, io::read_to_string, path::Path};
 
-use crate::{utils::PrefixDFA, Constraint};
+use crate::{
+    utils::{extract_parts, pattern_from_parts, Part, PrefixDFA},
+    Constraint,
+};
+use indexmap::IndexMap;
 use regex::Regex;
 use regex_automata::util::primitives::StateID;
 
@@ -10,8 +14,38 @@ pub struct RegularExpressionConstraint {
 }
 
 impl RegularExpressionConstraint {
-    pub fn new(pattern: &str, continuations: Vec<Vec<u8>>) -> Result<Self, Box<dyn Error>> {
-        let pdfa = PrefixDFA::new(pattern)?;
+    pub fn new(content: &str, continuations: Vec<Vec<u8>>) -> Result<Self, Box<dyn Error>> {
+        let fragment_name = Regex::new(r"\{([A-Z][A-Z0-9_]*)\}")?;
+        let fragment_line = Regex::new(r"(?m)^([A-Z][A-Z0-9_]*)\s+(.+)$")?;
+        let sep = Regex::new("(?m)^%%$")?;
+        let pattern = if let Some(m) = sep.find(content) {
+            // parse fragments
+            let mut fragments = HashMap::new();
+            for line in content[..m.start()].lines() {
+                if line.is_empty() || line.trim_start().starts_with("//") {
+                    continue;
+                }
+                let cap = fragment_line
+                    .captures(line)
+                    .ok_or(format!("invalid fragment line: {line}"))?;
+                let name = cap.get(1).unwrap().as_str();
+                let pattern = cap.get(2).unwrap().as_str();
+                let parts = extract_parts(pattern);
+                if fragments.insert(name, parts).is_some() {
+                    return Err(format!("duplicate fragment {name}").into());
+                };
+            }
+            pattern_from_parts(
+                "regular expression",
+                &[Part::Regex(content[m.end()..].to_string())],
+                &fragment_name,
+                &fragments,
+                &IndexMap::new(),
+            )?
+        } else {
+            content.to_string()
+        };
+        let pdfa = PrefixDFA::new(&pattern)?;
         Ok(RegularExpressionConstraint {
             pdfa,
             continuations,
@@ -24,10 +58,7 @@
     ) -> Result<Self, Box<dyn Error>> {
         let file = File::open(path.as_ref())?;
         let content = read_to_string(file)?;
-        let sep = Regex::new("(?m)^%%$")?;
-        let m = sep.find(&content).ok_or("line with %% not found")?;
-        let pattern = &content[m.end()..];
-        Self::new(pattern, continuations)
+        Self::new(&content, continuations)
     }
 }
 
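For illustration, a sketch of the input format the new constructor accepts, inferred from the regexes above: fragment lines of the form NAME pattern precede a line containing only %%, and the final pattern follows it. The helper fn and its continuations argument are hypothetical:

fn build(continuations: Vec<Vec<u8>>) -> Result<RegularExpressionConstraint, Box<dyn Error>> {
    // fragment and %% lines must start at column 0 so the (?m)^...$ regexes match
    let content = r"SIGN  '+'
DIGIT [0-9]
%%
{SIGN}?{DIGIT}+";
    RegularExpressionConstraint::new(content, continuations)
}

Here {SIGN} and {DIGIT} expand to (?:\+) and (?:[0-9]) before the prefix DFA is compiled. The text after %% is substituted verbatim, so the newline right after %% stays part of the pattern; without a %% separator, content is compiled as a plain regex, preserving the old behavior of new.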
text-utils-grammar/src/utils.rs (69 additions, 1 deletion)
@@ -1,12 +1,80 @@
-use std::{error::Error, fmt::Debug};
+use std::{collections::HashMap, error::Error, fmt::Debug};
 
+use indexmap::IndexMap;
 use itertools::Itertools;
+use regex::{escape, Regex};
 use regex_automata::{
     dfa::{dense::DFA, Automaton},
     util::primitives::StateID,
     Input,
 };
 
+#[derive(Debug)]
+pub(crate) enum Part {
+    Literal(String),
+    Regex(String),
+}
+
+pub(crate) fn extract_parts(pattern: &str) -> Vec<Part> {
+    let mut parts = vec![];
+    for part in pattern.split_whitespace() {
+        if (part.starts_with('\'') && part.ends_with('\''))
+            || (part.starts_with('"') && part.ends_with('"'))
+        {
+            // treat part as literal
+            parts.push(Part::Literal(escape(&part[1..part.len() - 1])));
+        } else {
+            // treat part as regular expression
+            parts.push(Part::Regex(part.to_string()));
+        }
+    }
+    parts
+}
+
+// recursively build a regex pattern from parts, resolving token and fragment references
+pub(crate) fn pattern_from_parts(
+    name: &str,
+    parts: &[Part],
+    name_regex: &Regex,
+    fragments: &HashMap<&str, Vec<Part>>,
+    tokens: &IndexMap<&str, Vec<Part>>,
+) -> Result<String, Box<dyn Error>> {
+    let mut pattern = String::new();
+    for part in parts {
+        match part {
+            Part::Literal(s) => pattern.push_str(s),
+            Part::Regex(s) => {
+                // find all tokens or fragments in the regex
+                // and replace them with their pattern
+                let mut replaced = String::new();
+                let mut last_match = 0;
+                for caps in name_regex.captures_iter(s) {
+                    let m = caps.get(0).unwrap();
+                    replaced.push_str(&s[last_match..m.start()]);
+                    // surround token or fragment with parentheses to group it
+                    replaced.push_str("(?:");
+                    let _name = caps.get(1).unwrap().as_str();
+                    if let Some(parts) = tokens.get(_name).or_else(|| fragments.get(_name)) {
+                        let replacement =
+                            pattern_from_parts(name, parts, name_regex, fragments, tokens)?;
+                        replaced.push_str(&replacement);
+                    } else {
+                        return Err(format!(
+                            "token or fragment {_name} within {name} not found in lexer"
+                        )
+                        .into());
+                    }
+                    replaced.push(')');
+                    last_match = m.end();
+                }
+                replaced.push_str(&s[last_match..]);
+                pattern.push_str(&replaced);
+            }
+        }
+    }
+    Ok(pattern)
+}
+
 fn make_anchored(pat: &str) -> String {
     let pat: String = match (pat.starts_with('^'), pat.ends_with('$')) {
         (true, true) => return pat.to_string(),
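A minimal test-style sketch of how the two moved helpers compose (hypothetical inputs; name_regex mirrors the fragment_name pattern from re.rs, and the sketch assumes it lives inside this crate, since both helpers are pub(crate)):

#[test]
fn sketch_extract_and_expand() {
    use std::collections::HashMap;

    use indexmap::IndexMap;
    use regex::Regex;

    // quoted parts become escaped literals; everything else stays a regex,
    // so "'+'" turns into Literal("\\+") and "[0-9]" into Regex("[0-9]")
    let parts = extract_parts("'+' [0-9]");
    assert!(matches!(parts.as_slice(), [Part::Literal(_), Part::Regex(_)]));

    // {NAME} references are resolved recursively and wrapped in a (?: ) group
    let name_regex = Regex::new(r"\{([A-Z][A-Z0-9_]*)\}").unwrap();
    let mut fragments = HashMap::new();
    fragments.insert("DIGIT", extract_parts("[0-9]"));
    let pattern = pattern_from_parts(
        "example",
        &[Part::Regex("{DIGIT}+".to_string())],
        &name_regex,
        &fragments,
        &IndexMap::new(),
    )
    .unwrap();
    assert_eq!(pattern, "(?:[0-9])+");
}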
