use std::collections::{HashMap, HashSet};
use std::iter;
use std::sync::atomic::{AtomicU64, Ordering};
use crate::helpers::{TrailError, consume_lexeme, format_error, get_env, is_escaped, peek};
use crate::regex::rex_parse;
enum SplitMarker {
QUOTE { index: usize },
SLASH { index: usize },
GROUP { index: usize },
}
pub fn split_definition_into_lexemes(definition: &str) -> Result<Vec<String>, TrailError> {
let quantifiers: HashMap<char, (Vec<&str>, Vec<&str>)> = [
('?', (vec!["["], vec!["]"])),
('+', (vec!["{"], vec!["}"])),
('*', (vec!["{", "["], vec!["]", "}"])),
]
.into_iter()
.collect();
let delimiters: Vec<char> = vec!['(', ')', '[', ']', '{', '}', '|'];
let mut lexemes: Vec<String> = Vec::new();
let mut in_quote = false;
let mut in_slash = false;
let mut markers: Vec<SplitMarker> = Vec::new();
let mut accumulate: Vec<char> = Vec::new();
let mut i = 0;
let chars: Vec<char> = definition.chars().collect();
while i < chars.len() {
let curr = chars[i];
if curr == '/' && !in_quote {
if !in_slash {
consume_lexeme(&mut lexemes, &mut accumulate);
accumulate.push(curr);
in_slash = true;
markers.push(SplitMarker::SLASH { index: i });
} else if !is_escaped(&chars, i) {
accumulate.push(curr);
let input: String = accumulate[1..accumulate.len() - 1].iter().collect();
let regex = rex_parse(&input)?;
lexemes.extend(
iter::once(String::from("("))
.chain(regex)
.chain(iter::once(String::from(")"))),
);
accumulate.clear();
let marker = markers
.pop()
.expect("Expected a `SplitMarker`, found `None` instead.");
assert!(
matches!(marker, SplitMarker::SLASH { .. }),
"`MarkerKind.SLASH` was not found."
);
in_slash = false;
}
}
else if curr == '|' && !in_quote && !in_slash {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
}
else if curr == '"' && !in_slash {
if !in_quote {
consume_lexeme(&mut lexemes, &mut accumulate);
accumulate.push(curr);
markers.push(SplitMarker::QUOTE { index: i });
in_quote = true;
} else if is_escaped(&chars, i) {
accumulate.pop();
accumulate.push(curr);
} else {
accumulate.push(curr);
consume_lexeme(&mut lexemes, &mut accumulate);
let marker = markers
.pop()
.expect("Expected a `SplitMarker`, found `None` instead.");
assert!(
matches!(marker, SplitMarker::QUOTE { .. }),
"`MarkerKind.QUOTE` was not found."
);
in_quote = false;
}
}
else if quantifiers.contains_key(&curr) && !in_quote && !in_slash {
let (prev, next) = (peek(&chars, i, -1), peek(&chars, i, 1));
if prev == ')' {
let (open_br, close_br) = quantifiers[&curr].clone();
let mut assembled: Vec<String> = Vec::new();
let mut depth = -1;
while let Some(last) = lexemes.pop() {
assembled.push(last.clone());
if last != "(" || depth != 0 {
depth += match last.as_str() {
")" => 1,
"(" => -1,
_ => 0,
};
} else {
break;
}
}
lexemes.extend(open_br.iter().map(|s| s.to_string()));
lexemes.extend(assembled.into_iter().rev());
lexemes.extend(close_br.iter().map(|s| s.to_string()));
} else if prev == '(' {
if curr == '?' && next == '<' {
accumulate.push(curr);
} else {
let context: String = chars[..i - 1].iter().collect();
let source = format!("{}{}", prev, curr);
return Err(TrailError(format_error(
"Invalid quantifier precedence.",
&context,
&source,
)));
}
} else if prev == '\0' {
return Err(TrailError(format_error(
"Interval quantifiers must be precedented by either an expression or a group.",
"",
&curr.to_string(),
)));
} else {
let lexeme = if accumulate.is_empty() {
lexemes.pop().expect("Vector should not be empty.")
} else {
accumulate.iter().collect()
};
accumulate.clear();
let (open_br, close_br) = quantifiers[&curr].clone();
lexemes.extend(open_br.iter().map(|s| s.to_string()));
lexemes.push(lexeme);
lexemes.extend(close_br.iter().map(|s| s.to_string()));
}
}
else if delimiters.contains(&curr) && !in_quote && !in_slash {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
match curr {
'(' => markers.push(SplitMarker::GROUP { index: i }),
')' => {
markers.pop().ok_or({
let context: String = chars[..i].iter().collect();
TrailError(format_error(
"Unexpected `)` - no matching opening parenthesis.",
&context,
")",
))
})?;
}
'|' => {}
_ => {
let context: String = chars[..i].iter().collect();
Err(TrailError(format_error(
"Delimiter reserved for internal use.",
&context,
&curr.to_string(),
)))?;
}
}
}
else if curr == '<' && !in_quote && !in_slash {
let (prevf, prevs) = (peek(&chars, i, -1), peek(&chars, i, -2));
if prevf == '$' || (prevf == '?' && prevs == '(') {
let mut k = 0;
let mut next = peek(&chars, i, k);
while next != '>' {
accumulate.push(next);
next = peek(&chars, i, k + 1);
k += 1;
}
accumulate.push(next);
consume_lexeme(&mut lexemes, &mut accumulate);
i += k as usize;
} else {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Reference has invalid precedence - `(?<alphanumeric>...)` to capture, and `$<alphanumeric>`.",
&context,
&curr.to_string(),
)));
}
}
else if curr == '\0' {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Reserved null character `\\0`.",
&context,
&curr.to_string(),
)));
}
else if curr == ' ' {
if in_quote || in_slash {
accumulate.push(curr);
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
}
}
else {
accumulate.push(curr);
}
i += 1;
}
consume_lexeme(&mut lexemes, &mut accumulate);
if let Some(marker) = markers.pop() {
match marker {
SplitMarker::QUOTE { index } => {
let context: String = chars[..index].iter().collect();
return Err(TrailError(format_error(
"Unclosed string literal - missing \" to terminate the string.",
&context,
"\"",
)));
}
SplitMarker::SLASH { index } => {
let context: String = chars[..index].iter().collect();
return Err(TrailError(format_error(
"Unterminated regex pattern starting with `/` - add closing delimiter or escape `/` as `\\/`.",
&context,
"/",
)));
}
SplitMarker::GROUP { index } => {
let context: String = chars[..index].iter().collect();
return Err(TrailError(format_error(
"Unmatched '(' - expected a closing ')'.",
&context,
"(",
)));
}
}
}
return Ok(iter::once(String::from("("))
.chain(lexemes.into_iter())
.chain(iter::once(String::from(")")))
.collect());
}
pub static UUID: AtomicU64 = AtomicU64::new(0);
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct UUId(u64);
impl UUId {
pub fn new() -> Self {
Self(UUID.fetch_add(1, Ordering::Relaxed))
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SymbolVar(pub String);
impl SymbolVar {
pub fn new(value: &str) -> Self {
Self(value.to_string())
}
pub fn rand() -> Self {
Self(format!("{}", UUID.fetch_add(1, Ordering::Relaxed)))
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SymbolTag(pub String);
impl SymbolTag {
pub fn new(value: &str) -> Self {
Self(value.to_string())
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Symbol {
TERMINAL {
id: UUId,
value: String,
tags: Vec<SymbolTag>,
},
VARIABLE {
id: UUId,
value: SymbolVar,
},
REFERENCE {
id: UUId,
value: SymbolTag,
},
END {
id: UUId,
},
}
#[derive(Debug, Clone)]
pub struct SymbolGraph {
pub nodes: HashMap<UUId, Symbol>,
pub heads: HashSet<UUId>,
pub edges: HashMap<UUId, HashSet<UUId>>,
pub tails: HashSet<UUId>,
}
impl Default for SymbolGraph {
fn default() -> Self {
Self {
nodes: HashMap::new(),
heads: HashSet::new(),
edges: HashMap::new(),
tails: HashSet::new(),
}
}
}
fn build_symbol_from_lexeme(lexeme: &str) -> (Symbol, UUId) {
if lexeme.starts_with("\"") && lexeme.ends_with("\"") {
let id = UUId::new();
return (
Symbol::TERMINAL {
id: id,
value: lexeme[1..lexeme.len() - 1].to_string(),
tags: Vec::new(),
},
id,
);
} else if lexeme.starts_with("$<") && lexeme.ends_with(">") {
let id = UUId::new();
return (
Symbol::REFERENCE {
id: id,
value: SymbolTag(lexeme[2..lexeme.len() - 1].to_string()),
},
id,
);
} else {
let id = UUId::new();
return (
Symbol::VARIABLE {
id: id,
value: SymbolVar(lexeme.to_string()),
},
id,
);
}
}
pub fn construct_symbol_graph(lexemes: &Vec<String>) -> SymbolGraph {
let mut graph = SymbolGraph::default();
if lexemes.is_empty() {
return graph;
}
let (pnode, mut pid) = build_symbol_from_lexeme(&lexemes[0]);
graph.nodes.insert(pid, pnode);
graph.heads.insert(pid);
graph.edges.insert(pid, HashSet::new());
for lexeme in &lexemes[1..] {
let (cnode, cid) = build_symbol_from_lexeme(lexeme);
graph.nodes.insert(cid, cnode);
graph.edges.insert(pid, HashSet::from([cid]));
pid = cid;
}
graph.tails.insert(pid);
return graph;
}
pub fn connect_symbol_graph(mut graph_lt: SymbolGraph, mut graph_rt: SymbolGraph) -> SymbolGraph {
if graph_lt.edges.is_empty() && graph_rt.edges.is_empty() {
return SymbolGraph::default();
} else if graph_lt.edges.is_empty() {
return graph_rt;
} else if graph_rt.edges.is_empty() {
return graph_lt;
} else {
graph_lt
.edges
.retain(|_, v: &mut HashSet<UUId>| !v.is_empty());
graph_rt
.edges
.retain(|_, v: &mut HashSet<UUId>| !v.is_empty());
let mut edges: HashMap<UUId, HashSet<UUId>> =
graph_lt.edges.into_iter().chain(graph_rt.edges).collect();
if get_env("SKIP_RULE", true) {
let (end_symbols_h, end_symbols_t): (Vec<UUId>, Vec<UUId>) = (
graph_lt
.heads
.iter()
.filter(|id| matches!(graph_lt.nodes[id], Symbol::END { .. }))
.map(|id| *id)
.collect(),
graph_lt
.tails
.iter()
.filter(|id| matches!(graph_lt.nodes[id], Symbol::END { .. }))
.map(|id| *id)
.collect(),
);
let (size_h, size_t) = (end_symbols_h.len(), end_symbols_t.len());
assert!(size_h <= 1, "Duplicate `END` head symbol.");
if size_h == 1 {
graph_lt.heads.remove(&end_symbols_h[0]);
graph_lt.heads.extend(graph_rt.heads.clone());
graph_lt.nodes.remove(&end_symbols_h[0]);
}
assert!(size_t <= 1, "Duplicate `END` tail symbol.");
if size_t == 1 {
let predecessors: Vec<UUId> = edges
.iter()
.filter(|(_, v)| v.contains(&end_symbols_t[0]))
.map(|(k, _)| *k)
.collect();
for predecessor in &predecessors {
edges
.get_mut(&predecessor)
.expect("Predecessor should exist.")
.remove(&end_symbols_t[0]);
}
graph_lt.tails.remove(&end_symbols_t[0]);
graph_lt.tails.extend(predecessors);
graph_lt.nodes.remove(&end_symbols_t[0]);
}
}
let (heads, nodes, mut tails) = (
graph_lt.heads,
graph_lt
.nodes
.into_iter()
.chain(graph_rt.nodes)
.collect::<HashMap<UUId, Symbol>>(),
graph_rt.tails,
);
for tail in &graph_lt.tails {
for head in &graph_rt.heads {
if matches!(nodes[&head], Symbol::END { .. }) {
tails.insert(*head);
}
edges
.entry(*tail)
.or_insert_with(HashSet::new)
.insert(*head);
}
}
return SymbolGraph {
nodes: nodes,
heads: heads,
edges: edges,
tails: tails,
};
}
}
fn union_symbol_graph(graph_lt: SymbolGraph, graph_rt: SymbolGraph) -> SymbolGraph {
if graph_lt.edges.is_empty() && graph_rt.edges.is_empty() {
return SymbolGraph::default();
} else if graph_lt.edges.is_empty() {
return graph_rt;
} else if graph_rt.edges.is_empty() {
return graph_lt;
} else {
let (heads_lt, mut heads_rt) = (graph_lt.heads, graph_rt.heads);
let (tails_lt, mut tails_rt) = (graph_lt.tails, graph_rt.tails);
let (nodes, mut edges): (HashMap<UUId, Symbol>, HashMap<UUId, HashSet<UUId>>) = (
graph_lt.nodes.into_iter().chain(graph_rt.nodes).collect(),
graph_lt.edges.into_iter().chain(graph_rt.edges).collect(),
);
let (end_symbols_hlt, end_symbols_hrt): (Vec<UUId>, Vec<UUId>) = (
heads_lt
.iter()
.filter(|x| matches!(nodes[x], Symbol::END { .. }))
.copied()
.collect(),
heads_rt
.iter()
.filter(|x| matches!(nodes[x], Symbol::END { .. }))
.copied()
.collect(),
);
let (size_hlt, size_hrt) = (end_symbols_hlt.len(), end_symbols_hrt.len());
assert!(
size_hlt <= 1 && size_hrt <= 1,
"Duplicate head `END` symbols."
);
if (size_hlt & size_hrt) == 1 {
heads_rt.remove(&end_symbols_hrt[0]);
}
let (end_symbols_tlt, end_symbols_trt): (Vec<UUId>, Vec<UUId>) = (
tails_lt
.iter()
.filter(|x| matches!(nodes[x], Symbol::END { .. }))
.copied()
.collect(),
tails_rt
.iter()
.filter(|x| matches!(nodes[x], Symbol::END { .. }))
.copied()
.collect(),
);
let (size_tlt, size_trt) = (end_symbols_tlt.len(), end_symbols_trt.len());
assert!(
size_tlt <= 1 && size_trt <= 1,
"Duplicate tail `END` symbols."
);
if (size_tlt & size_trt) == 1 {
let (end_symbol_tlt, end_symbol_trt) = (end_symbols_tlt[0], end_symbols_trt[0]);
let predecessors: Vec<UUId> = edges
.iter()
.filter(|(_, v)| v.contains(&end_symbol_trt))
.map(|(k, _)| *k)
.collect();
for predecessor in predecessors {
let childrens = edges.get_mut(&predecessor).unwrap();
childrens.remove(&end_symbol_trt);
childrens.insert(end_symbol_tlt);
}
tails_rt.remove(&end_symbol_trt);
}
let (heads, tails): (HashSet<UUId>, HashSet<UUId>) = (
heads_lt.into_iter().chain(heads_rt).collect(),
tails_lt.into_iter().chain(tails_rt).collect(),
);
return SymbolGraph {
nodes: nodes,
heads: heads,
edges: edges,
tails: tails,
};
}
}
mod bitflags {
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct DelimiterProperty(pub u32);
pub const NULL: DelimiterProperty = DelimiterProperty(0 << 0);
pub const STOP: DelimiterProperty = DelimiterProperty(1 << 0);
pub const LOOP: DelimiterProperty = DelimiterProperty(1 << 1);
pub const PIPE: DelimiterProperty = DelimiterProperty(1 << 2);
}
fn cast_symbol_graph(graph: &mut SymbolGraph, properties: bitflags::DelimiterProperty) {
let (nodes, heads, edges, tails) = (
&mut graph.nodes,
&mut graph.heads,
&mut graph.edges,
&mut graph.tails,
);
let (end_symbol_h, end_symbol_t) = (
heads
.iter()
.find_map(|x| (matches!(nodes[x], Symbol::END { .. })).then_some(nodes[x].clone())),
tails
.iter()
.find_map(|x| (matches!(nodes[x], Symbol::END { .. })).then_some(nodes[x].clone())),
);
let end_symbol: Symbol = end_symbol_h
.or(end_symbol_t)
.unwrap_or(Symbol::END { id: UUId::new() });
let end_id = if let Symbol::END { id } = end_symbol {
id
} else {
unreachable!()
};
if (properties.0 & bitflags::LOOP.0) > 0 {
if tails.contains(&end_id) {
let parents = edges
.iter()
.filter(|(_, v)| v.contains(&end_id))
.map(|(k, _)| *k);
tails.extend(parents);
}
for tail in &*tails {
for head in &*heads {
if !matches!(nodes[tail], Symbol::END { .. }) {
edges
.entry(*tail)
.or_insert_with(HashSet::new)
.insert(*head);
}
}
if !matches!(nodes[tail], Symbol::END { .. }) {
edges
.entry(*tail)
.or_insert_with(HashSet::new)
.insert(end_id);
}
}
*tails = HashSet::from([end_id]);
nodes.insert(end_id, end_symbol);
} else if (properties.0 & bitflags::STOP.0) > 0 {
heads.insert(end_id);
nodes.insert(end_id, end_symbol);
edges.insert(end_id, HashSet::new());
}
}
#[derive(Clone)]
struct TrailAccumulator {
kind: bitflags::DelimiterProperty,
graph: SymbolGraph,
tag: String,
}
impl Default for TrailAccumulator {
fn default() -> Self {
Self {
kind: bitflags::NULL,
graph: SymbolGraph::default(),
tag: String::new(),
}
}
}
pub fn build_symbol_graph(definition: &str) -> Result<SymbolGraph, TrailError> {
let bitflags: HashMap<String, bitflags::DelimiterProperty> = HashMap::from([
("(".to_string(), bitflags::NULL),
("[".to_string(), bitflags::STOP),
("{".to_string(), bitflags::LOOP),
("|".to_string(), bitflags::PIPE),
]);
let lexemes = split_definition_into_lexemes(definition)?;
let (mut state, mut accumulated): (Vec<TrailAccumulator>, Vec<String>) =
(vec![TrailAccumulator::default()], Vec::new());
let (mut i, l_size) = (0, lexemes.len());
while i < l_size {
let (lexeme, s_size) = (&lexemes[i], state.len());
if matches!(lexeme.as_str(), "(" | "[" | "{" | "|") {
let next_graph = construct_symbol_graph(&accumulated);
state[s_size - 1].graph =
connect_symbol_graph(state[s_size - 1].graph.clone(), next_graph);
state.push(TrailAccumulator {
kind: bitflags[lexeme],
..TrailAccumulator::default()
});
accumulated.clear();
} else if lexeme.starts_with("?<") && lexeme.ends_with(">") {
state[s_size - 1].tag = lexeme[2..lexeme.len() - 1].to_string();
} else if matches!(lexeme.as_str(), ")" | "]" | "}") {
let next_graph = construct_symbol_graph(&accumulated);
state[s_size - 1].graph =
connect_symbol_graph(state[s_size - 1].graph.clone(), next_graph);
accumulated.clear();
let mut accumulator = state.pop().expect("Build state should not be empty.");
while accumulator.kind == bitflags::PIPE {
let graph = &mut state.last_mut().unwrap().graph;
*graph = union_symbol_graph(graph.clone(), accumulator.graph);
accumulator = state.pop().expect("Build state should not be empty.");
}
cast_symbol_graph(&mut accumulator.graph, accumulator.kind);
let tag = accumulator.tag;
for symbol in accumulator.graph.nodes.values_mut() {
if let Symbol::TERMINAL { tags, .. } = symbol
&& !tag.is_empty()
{
tags.push(SymbolTag(tag.clone()))
}
}
let s_size = state.len();
state[s_size - 1].graph =
connect_symbol_graph(state[s_size - 1].graph.clone(), accumulator.graph);
} else {
accumulated.push(lexeme.to_string());
}
i += 1;
}
assert_eq!(state.len(), 1, "Only one TrailAccumulator should remain.");
return Ok(state.pop().unwrap().graph);
}