pub(crate) mod tests;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fmt::{Display, Formatter};
use std::marker::PhantomData;
use vectree::VecTree;
use lexigram_core::CollectJoin;
use lexigram_core::char_reader::escape_string;
use crate::{btreeset, indent_source, General, Normalized};
use crate::char_reader::escape_char;
use crate::lexer::{ModeId, ModeOption, StateId, Terminal};
use lexigram_core::log::{BufLog, LogReader, LogStatus, Logger};
use crate::build::{BuildErrorSource, BuildFrom, HasBuildErrorSource};
use crate::segments::Segments;
use crate::segmap::Seg;
use crate::take_until::TakeUntilIterator;
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ReType { #[default] Empty,
End(Box<Terminal>),
Char(char),
CharRange(Box<Segments>),
String(Box<String>),
Concat,
Star,
Plus,
Or,
Lazy,
}
#[test]
fn retype_size() {
let size = size_of::<ReType>();
assert!(size <= 16, "size of ReType is too big: {size}");
}
impl ReType {
pub fn is_empty(&self) -> bool {
matches!(self, ReType::Empty)
}
pub fn is_leaf(&self) -> bool {
matches!(self, ReType::Empty | ReType::End(_) | ReType::Char(_) | ReType::CharRange(_) | ReType::String(_))
}
pub fn is_nullable(&self) -> Option<bool> {
match self {
ReType::Empty | ReType::Star => Some(true),
ReType::End(_) | ReType::Char(_) | ReType::CharRange(_) | ReType::String(_) | ReType::Plus => Some(false),
ReType::Concat | ReType::Or | ReType::Lazy => None,
}
}
pub fn is_end(&self) -> bool {
matches!(self, ReType::End(_))
}
}
impl Display for ReType {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
ReType::Empty => write!(f, "-"),
ReType::End(t) => write!(f, "{t}"),
ReType::Char(c) => write!(f, "'{}'", escape_char(*c)),
ReType::CharRange(segments) => write!(f, "[{segments}]"),
ReType::String(s) => write!(f, "'{}'", escape_string(s)),
ReType::Concat => write!(f, "&"),
ReType::Star => write!(f, "*"),
ReType::Plus => write!(f, "+"),
ReType::Or => write!(f, "|"),
ReType::Lazy => write!(f, "??"),
}
}
}
type Id = u32;
#[derive(Clone, Debug, PartialEq, Default)]
pub struct ReNode {
id: Option<Id>,
op: ReType,
firstpos: HashSet<Id>, lastpos: HashSet<Id>, nullable: Option<bool>
}
impl ReNode {
pub fn new(node: ReType) -> ReNode {
ReNode { id: None, op: node, firstpos: HashSet::new(), lastpos: HashSet::new(), nullable: None }
}
pub fn empty() -> ReNode { ReNode::new(ReType::Empty) }
pub fn end(t: Terminal) -> ReNode { ReNode::new(ReType::End(Box::new(t))) }
pub fn char(c: char) -> ReNode { ReNode::new(ReType::Char(c)) }
pub fn char_range(s: Segments) -> ReNode { ReNode::new(ReType::CharRange(Box::new(s))) }
pub fn string<T: Into<String>>(s: T) -> ReNode { ReNode::new(ReType::String(Box::new(s.into()))) }
pub fn concat() -> ReNode { ReNode::new(ReType::Concat) }
pub fn star() -> ReNode { ReNode::new(ReType::Star) }
pub fn plus() -> ReNode { ReNode::new(ReType::Plus) }
pub fn or() -> ReNode { ReNode::new(ReType::Or) }
pub fn lazy() -> ReNode { ReNode::new(ReType::Lazy) }
pub fn is_leaf(&self) -> bool {
self.op.is_leaf()
}
pub fn is_empty(&self) -> bool {
self.op.is_empty()
}
pub fn get_type(&self) -> &ReType {
&self.op
}
pub fn is_nullable(&self) -> Option<bool> {
self.nullable
}
pub fn get_mut_type(&mut self) -> &mut ReType {
&mut self.op
}
}
impl Display for ReNode {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
if let Some(id) = self.id {
write!(f, "{id}:")?;
}
self.op.fmt(f)
}
}
pub type ReTree = VecTree<ReNode>;
#[derive(Clone)]
pub struct DfaBuilder {
re: ReTree,
followpos: HashMap<Id, HashSet<Id>>,
lazypos: HashSet<Id>,
ids: HashMap<Id, usize>,
log: BufLog
}
impl DfaBuilder {
pub fn new() -> Self {
Self::with_log(BufLog::new())
}
pub fn with_log(log: BufLog) -> Self {
DfaBuilder {
re: VecTree::new(),
followpos: HashMap::new(),
lazypos: HashSet::new(),
ids: HashMap::new(),
log
}
}
#[inline(always)]
pub fn get_re(&self) -> &ReTree {
&self.re
}
fn preprocess_re(&mut self) {
let mut nodes = vec![];
for mut inode in self.re.iter_post_depth_simple_mut() {
if matches!(inode.op, ReType::String(_)) {
if let ReType::String(s) = std::mem::take(&mut inode.op) {
if s.is_empty() {
self.log.add_error(format!("node #{} is an empty string", inode.index));
}
nodes.push((inode.index, s));
}
}
}
for (index, s) in nodes {
let node = self.re.get_mut(index);
match s.len() {
0 => node.op = ReType::Char('?'), 1 => node.op = ReType::Char(s.chars().nth(0).unwrap()),
_ => {
node.op = ReType::Concat;
for c in s.chars() {
self.re.add(Some(index), ReNode::char(c));
}
}
}
}
}
fn calc_node_pos(&mut self) {
let mut id = 0;
for mut inode in self.re.iter_post_depth_mut() {
if inode.is_leaf() {
if inode.num_children() > 0 {
self.log.add_error(format!("node #{} {} had {} child(ren) but shouldn't have any", inode.index, inode.op, inode.num_children()));
}
id += 1;
inode.id = Some(id);
if !inode.is_empty() {
inode.firstpos.insert(id);
inode.lastpos.insert(id);
}
self.ids.insert(id, inode.index);
if inode.op.is_end() {
self.followpos.insert(id, HashSet::new());
}
} else {
match inode.op {
ReType::Concat => {
if inode.num_children() > 0 {
let mut firstpos = HashSet::<Id>::new();
for child in inode.iter_children_simple().take_until(|&n| !n.nullable.unwrap()) {
firstpos.extend(&child.firstpos);
}
inode.firstpos.extend(firstpos);
let mut lastpos = HashSet::<Id>::new();
for child in inode.iter_children_simple().rev().take_until(|&n| !n.nullable.unwrap()) {
lastpos.extend(&child.lastpos);
}
inode.lastpos.extend(lastpos);
let mut iter = inode.iter_children_simple();
let a = iter.next().unwrap(); let mut lastpos = a.lastpos.clone();
for b in iter { for j in &lastpos {
if !self.followpos.contains_key(j) {
self.followpos.insert(*j, HashSet::new());
}
self.followpos.get_mut(j).unwrap().extend(&b.firstpos);
}
if b.nullable.unwrap() {
lastpos.extend(&b.lastpos);
} else {
lastpos = b.lastpos.clone();
}
}
} else {
self.log.add_error(format!("node #{} is Concat but has no children", inode.index))
}
}
ReType::Star | ReType::Plus => {
if inode.num_children() == 1 {
let firstpos = inode.iter_children_simple().next().unwrap().firstpos.iter().copied().to_vec();
inode.firstpos.extend(firstpos);
let lastpos = inode.iter_children_simple().next().unwrap().lastpos.iter().copied().to_vec();
inode.lastpos.extend(lastpos);
for i in &inode.lastpos {
if !self.followpos.contains_key(i) {
self.followpos.insert(*i, HashSet::new());
}
self.followpos.get_mut(i).unwrap().extend(&inode.firstpos);
}
} else {
self.log.add_error(format!("node #{} is {:?} but has {} child(ren) instead of 1 child",
inode.index, inode.op, inode.num_children()))
}
}
ReType::Or => {
if inode.num_children() > 0 {
let mut firstpos = HashSet::<Id>::new();
for child in inode.iter_children_simple() {
firstpos.extend(&child.firstpos);
}
inode.firstpos.extend(firstpos);
let mut lastpos = HashSet::<Id>::new();
for child in inode.iter_children_simple() {
lastpos.extend(&child.lastpos);
}
inode.lastpos.extend(lastpos);
} else {
self.log.add_error(format!("node #{} is Or but has no children", inode.index));
}
}
ReType::Lazy => {
if inode.num_children() == 1 {
let child = inode.iter_children_simple().next().unwrap();
let firstpos = child.firstpos.clone();
let lastpos = child.lastpos.clone();
inode.firstpos = firstpos;
inode.lastpos = lastpos;
for ichild in inode.iter_post_depth_simple().filter(|node| node.is_leaf()) {
self.lazypos.insert(ichild.id.unwrap());
}
} else {
self.log.add_error(format!("node #{} is Lazy but has {} child(ren) instead of 1 child",
inode.index, inode.num_children()));
}
}
_ => panic!("{:?}: no way to compute firstpos/...", &*inode)
}
}
if let Some(nullable) = inode.op.is_nullable() {
inode.nullable = Some(nullable);
} else {
inode.nullable = match &inode.op {
ReType::Concat | ReType::Lazy => Some(inode.iter_children_simple().all(|child| child.nullable.unwrap())),
ReType::Or => Some(inode.iter_children_simple().any(|child| child.nullable.unwrap())),
op => panic!("{:?} should have a fixed nullable property", op)
}
}
}
}
fn calc_states(&mut self) -> Dfa<General> {
const VERBOSE: bool = false;
const RESOLVE_END_STATES: bool = true;
const RM_LAZY_BRANCHES: bool = true;
let mut dfa = Dfa::new();
if !self.log.has_no_errors() {
return dfa;
}
if VERBOSE { println!("new DFA"); }
let mut current_id = 0;
let key = BTreeSet::from_iter(self.re.get(0).firstpos.iter().copied());
let mut new_states = BTreeSet::<BTreeSet<Id>>::new();
new_states.insert(key.clone());
let mut states = BTreeMap::<BTreeSet<Id>, StateId>::new();
states.insert(key, current_id);
dfa.initial_state = Some(current_id);
let mut conflicts = vec![];
let mut lazy_followpos = self.lazypos.iter().copied().collect::<BTreeSet<Id>>();
lazy_followpos.extend(self.lazypos.iter().filter_map(|id| self.followpos.get(id)).flatten());
if VERBOSE { println!("lazy_followpos = {{{}}}", lazy_followpos.iter().join(", ")); }
let mut symbols_part = Segments::empty();
for id in self.ids.values() {
let node = self.re.get_mut(*id);
if let ReType::Char(c) = node.op {
node.op = ReType::CharRange(Box::new(Segments::from(c)));
}
if let ReType::CharRange(segments) = &node.op {
symbols_part.add_partition(segments);
}
}
if VERBOSE { println!("symbols = {symbols_part:X}"); }
while let Some(s) = new_states.pop_first() {
let new_state_id = *states.get(&s).unwrap();
let is_lazy_state = s.iter().all(|id| lazy_followpos.contains(id));
if VERBOSE {
println!("- state {} = {{{}}}{}", new_state_id, states_to_string(&s), if is_lazy_state { ", lazy state" } else { "" });
}
let mut trans = BTreeMap::<Seg, BTreeSet<Id>>::new(); let mut terminals = BTreeMap::<Id, &Terminal>::new(); let mut first_terminal_id: Option<Id> = None; let mut id_transitions = BTreeSet::<Id>::new(); let mut id_terminal: Option<Id> = None; for (symbol, id) in s.iter().map(|id| (&self.re.get(self.ids[id]).op, *id)) {
if symbol.is_end() {
dfa.state_graph.entry(new_state_id).or_insert_with(|| {
if VERBOSE { println!(" + {symbol} => create state {new_state_id}"); }
BTreeMap::new()
});
if let ReType::End(t) = symbol {
id_terminal = Some(id);
if first_terminal_id.is_none() {
first_terminal_id = Some(id);
if new_state_id == 0 {
if t.is_only_skip() {
self.log.add_warning(format!("<skip> on initial state is a risk of infinite loop on bad input ({t})"));
} else if t.is_token() {
self.log.add_warning(format!("<token> on initial state returns a token on bad input ({t})"));
}
}
}
if RESOLVE_END_STATES {
if let Some(t2) = terminals.insert(id, t) {
panic!("overriding {id} -> {t2} with {id} -> {t} in end_states {}",
terminals.iter().map(|(id, t)| format!("{id} {t}")).join(", "));
}
} else if let std::collections::btree_map::Entry::Vacant(e) = dfa.end_states.entry(new_state_id) {
e.insert(*t.clone());
if VERBOSE { println!(" # end state: id {id} {t}"); }
} else if VERBOSE {
println!(" # end state: id {id} {t} ## DISCARDED since another one already taken");
}
} else {
panic!("unexpected END symbol: {symbol:?}");
}
} else if let ReType::CharRange(segments) = symbol {
if let Some(set) = self.followpos.get(&id) {
id_transitions.extend(set);
let cmp = segments.intersect(&symbols_part);
assert!(cmp.internal.is_empty(), "{symbols_part} # {segments} = {cmp}");
if VERBOSE { println!(" + {} to {}", &cmp.common, id); }
for segment in cmp.common.into_iter() {
if let Some(ids) = trans.get_mut(&segment) {
ids.insert(id);
} else {
trans.insert(segment, btreeset![id]);
}
}
} else {
self.log.add_error(format!("node #{id} is not in followpos; is an accepting state missing? Orphan segment: {segments}"));
}
}
}
if RESOLVE_END_STATES {
if terminals.len() > 1 {
if VERBOSE {
println!(" # {id_transitions:?}");
println!(" # terminal conflict: {}", terminals.iter()
.map(|(id, t)| format!("{id} -> {t} (has{} trans)", if id_transitions.contains(id) { "" } else { " no" }))
.join(", "));
}
let ts = terminals.iter().map(|(_, &t)| t).collect::<BTreeSet<_>>();
let (chosen, t) = terminals.pop_first().unwrap();
conflicts.push((new_state_id, ts, t));
id_terminal = Some(chosen);
if VERBOSE { println!(" selecting the first one of the list: id {chosen} {t}"); }
dfa.end_states.insert(new_state_id, t.to_owned());
} else if let Some((id, terminal)) = terminals.pop_first() {
id_terminal = Some(id);
if VERBOSE { println!(" # end state: id {id} {terminal}"); }
dfa.end_states.insert(new_state_id, terminal.clone());
}
}
let has_non_lazy_terminal = if let Some(id) = id_terminal {
!lazy_followpos.contains(&id)
} else {
false
};
let mut map = BTreeMap::<StateId, Segments>::new();
for (segment, ids) in trans {
if VERBOSE { print!(" - {} in {}: ", segment, states_to_string(&ids)); }
if RM_LAZY_BRANCHES && !is_lazy_state && has_non_lazy_terminal && ids.iter().all(|id| lazy_followpos.contains(id)) {
if VERBOSE { println!(" => lazy, removed"); }
continue;
}
let mut state = BTreeSet::new();
for id in ids {
state.extend(&self.followpos[&id]);
}
if VERBOSE { print!("follow = {{{}}}", states_to_string(&state)); }
let state_id = if let Some(state_id) = states.get(&state) {
if VERBOSE { println!(" => state {state_id}"); }
*state_id
} else {
new_states.insert(state.clone());
current_id += 1;
if VERBOSE { println!(" => new state {} = {{{}}}", current_id, states_to_string(&state)); }
states.insert(state, current_id);
current_id
};
if let Some(segments) = map.get_mut(&state_id) {
segments.insert(segment);
} else {
map.insert(state_id, Segments::new(segment));
}
}
for segments in map.values_mut() {
segments.normalize();
}
if VERBOSE {
for (st, int) in &map {
println!(" {} -> {}", int, st);
}
}
dfa.state_graph.insert(new_state_id, map.into_iter().map(|(id, segments)| (segments, id)).collect());
}
let state_terminals = dfa.end_states.values().collect::<BTreeSet<_>>();
let missing = self.ids.values()
.filter_map(|x| if let ReType::End(t) = &self.re.get(*x).op { Some(t.as_ref()) } else { None })
.filter(|x| !state_terminals.contains(x))
.collect::<BTreeSet<_>>();
if VERBOSE && !missing.is_empty() { println!("# missing: {}", missing.iter().map(|t| t.to_string()).join(", ")); }
for missing_t in missing {
let related = conflicts.iter()
.filter_map(|(s_id, ts, chosen)|
if ts.contains(missing_t) {
Some(format!("\n - conflict in state {s_id}: {} => {chosen} chosen", ts.iter().map(|t| t.to_string()).join(" <> ")))
} else {
None
})
.to_vec();
if !related.is_empty() {
self.log.add_error(format!(
"{missing_t} is never selected, but it was in conflict with (an)other token(s); check if re-ordering definitions solves the problem{}",
related.join("")));
} else {
self.log.add_error(format!("{missing_t} is never selected"));
}
}
dfa
}
fn build(mut self) -> Dfa<General> {
self.log.add_note("building DFA...");
self.calc_node_pos();
let mut dfa = self.calc_states();
dfa.log.extend(std::mem::replace(&mut self.log, BufLog::new()));
dfa
}
#[cfg(test)]
pub(crate) fn build_from_graph<T: IntoIterator<Item=(StateId, Terminal)>>(
&mut self, graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>, init_state: StateId, end_states: T,
) -> Option<Dfa<General>>
{
let mut dfa = Dfa::<General> {
state_graph: graph,
initial_state: Some(init_state),
end_states: BTreeMap::from_iter(end_states),
first_end_state: None,
log: BufLog::new(),
_phantom: PhantomData
};
dfa.first_end_state = dfa.end_states.keys().min().cloned();
Some(dfa)
}
fn build_from_dfa_modes<T, U>(self, dfas: T) -> Dfa<General>
where
T: IntoIterator<Item=(ModeId, Dfa<U>)>,
{
assert!(self.re.is_empty(), "build_from_dfa_modes() used on non-empty builder");
let mut dfa = Dfa::<General>::with_log(self.log);
dfa.log.add_note("combining DFA modes...");
let mut init_states = BTreeMap::new();
for (idx, new_dfa) in dfas {
dfa.log.add_note(format!("- DFA mode #{idx}"));
dfa.log.extend(new_dfa.log);
if new_dfa.state_graph.is_empty() {
dfa.log.add_error(format!("DFA mode #{idx} is empty"));
}
let offset = if init_states.is_empty() { 0 } else { 1 + dfa.state_graph.keys().max().unwrap() };
dfa.initial_state = dfa.initial_state.or(new_dfa.initial_state);
if let Some(initial_state) = new_dfa.initial_state {
if init_states.insert(idx, offset + initial_state).is_some() {
dfa.log.add_error(format!("DFA mode #{idx} defined multiple times"))
};
} else {
dfa.log.add_error(format!("DFA mode #{idx} has no initial state"));
}
for (st_from, mut map) in new_dfa.state_graph {
for (_, st_to) in map.iter_mut() {
*st_to += offset;
}
assert!(!dfa.state_graph.contains_key(&(st_from + offset)));
dfa.state_graph.insert(st_from + offset, map);
}
dfa.end_states.extend(new_dfa.end_states.into_iter().map(|(st, term)| (st + offset, term)));
}
if init_states.len() > 1 {
for (_, term) in dfa.end_states.iter_mut() {
term.mode_state = match term.mode {
ModeOption::None => None,
ModeOption::Mode(m) | ModeOption::Push(m) => {
let state_opt = init_states.get(&m);
if state_opt.is_none() {
dfa.log.add_error(format!("unknown mode {m} in combined DFA"));
}
state_opt.cloned()
}
};
}
dfa.first_end_state = None;
}
if dfa.log.has_no_errors() {
Dfa::<General> {
state_graph: dfa.state_graph,
initial_state: dfa.initial_state,
end_states: dfa.end_states,
first_end_state: dfa.first_end_state,
log: dfa.log,
_phantom: PhantomData,
}
} else {
Dfa::with_log(dfa.log)
}
}
}
impl Default for DfaBuilder {
fn default() -> Self {
Self::new()
}
}
impl LogReader for DfaBuilder {
type Item = BufLog;
fn get_log(&self) -> &Self::Item {
&self.log
}
fn give_log(self) -> Self::Item {
self.log
}
}
impl HasBuildErrorSource for DfaBuilder {
const SOURCE: BuildErrorSource = BuildErrorSource::DfaBuilder;
}
impl BuildFrom<ReTree> for DfaBuilder {
fn build_from(regex_tree: ReTree) -> Self {
let mut builder = DfaBuilder {
re: regex_tree,
followpos: HashMap::new(),
lazypos: HashSet::new(),
ids: HashMap::new(),
log: BufLog::new()
};
builder.preprocess_re();
builder
}
}
pub struct Dfa<T> {
state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
initial_state: Option<StateId>,
end_states: BTreeMap<StateId, Terminal>,
first_end_state: Option<StateId>,
pub(crate) log: BufLog,
_phantom: PhantomData<T>
}
impl<T> Dfa<T> {
pub fn with_log(log: BufLog) -> Self {
Dfa {
state_graph: BTreeMap::new(),
initial_state: None,
end_states: BTreeMap::new(),
first_end_state: None,
log,
_phantom: PhantomData
}
}
pub fn get_state_graph(&self) -> &BTreeMap<StateId, BTreeMap<Segments, StateId>> {
&self.state_graph
}
pub fn get_initial_state(&self) -> &Option<StateId> {
&self.initial_state
}
pub fn get_end_states(&self) -> &BTreeMap<StateId, Terminal> {
&self.end_states
}
pub fn get_first_end_state(&self) -> &Option<StateId> {
&self.first_end_state
}
pub fn is_normalized(&self) -> bool {
if self.state_graph.is_empty() {
return true;
}
let mut states = self.state_graph.keys().to_vec();
states.sort();
if *states[0] != 0 {
false
} else {
let mut last_end = self.end_states.contains_key(states[0]);
let mut last: StateId = 0;
for &st in states.iter().skip(1) {
let end = self.end_states.contains_key(st);
if (*st != last + 1) || (!end && last_end) {
return false;
}
last_end = end;
last = *st;
}
true
}
}
pub fn normalize(mut self) -> Dfa<Normalized> {
if !self.log.has_no_errors() {
return Dfa::<Normalized>::with_log(std::mem::take(&mut self.log));
}
self.log.add_note("normalizing DFA...");
let mut translate = BTreeMap::<StateId, StateId>::new();
let mut state_graph = BTreeMap::<StateId, BTreeMap<Segments, StateId>>::new();
let mut end_states = BTreeMap::<StateId, Terminal>::new();
let nbr_end = self.end_states.len();
let mut non_end_id = 0;
let mut end_id = self.state_graph.len() - nbr_end;
self.first_end_state = Some(end_id);
for &id in self.state_graph.keys() {
if let Some(terminal) = self.end_states.get(&id) {
translate.insert(id, end_id);
end_states.insert(end_id, terminal.clone());
end_id += 1;
} else {
translate.insert(id, non_end_id);
non_end_id += 1;
};
}
self.initial_state = self.initial_state.map(|st| *translate.get(&st).unwrap());
for s in end_states.iter_mut() {
if let Some(state) = s.1.mode_state {
s.1.mode_state = Some(translate[&state])
}
}
self.end_states = end_states;
for (id, mut trans) in std::mem::take(&mut self.state_graph) {
for (_, st) in trans.iter_mut() {
*st = translate[st];
}
state_graph.insert(translate[&id], trans);
}
self.state_graph = state_graph;
Dfa::<Normalized> {
state_graph: self.state_graph,
initial_state: self.initial_state,
end_states: self.end_states,
first_end_state: self.first_end_state,
log: self.log,
_phantom: PhantomData,
}
}
pub fn optimize(mut self) -> Dfa<Normalized> {
const VERBOSE: bool = false;
if !self.log.has_no_errors() {
return Dfa::<Normalized>::with_log(std::mem::take(&mut self.log));
}
self.log.add_note("optimizing DFA...");
let separate_end_states = true;
if VERBOSE { println!("-----------------------------------------------------------"); }
let mut groups = Vec::<BTreeSet<StateId>>::new();
let mut st_to_group = BTreeMap::<StateId, usize>::new();
let nbr_non_end_states = self.state_graph.len() - self.end_states.len();
let mut last_non_end_id = 0;
let first_ending_id = nbr_non_end_states + 1;
let mut group = BTreeSet::<StateId>::new();
for st in self.state_graph.keys().filter(|&st| !self.end_states.contains_key(st)) {
group.insert(*st);
st_to_group.insert(*st, 0);
}
groups.push(group);
for _ in 1..first_ending_id {
groups.push(BTreeSet::new());
}
if separate_end_states {
for st in self.end_states.keys() {
st_to_group.insert(*st, groups.len());
groups.push(BTreeSet::<StateId>::from([*st]));
}
} else {
st_to_group.extend(self.end_states.keys().map(|id| (*id, groups.len())));
groups.push(BTreeSet::from_iter(self.end_states.keys().copied()));
}
let mut last_ending_id = groups.len() - 1;
let mut change = true;
while change {
let mut changes = Vec::<(StateId, usize, usize)>::new(); for (id, p) in groups.iter().enumerate().filter(|(_, g)| !g.is_empty()) {
if VERBOSE { println!("group #{id}: {p:?}:"); }
let mut combinations = BTreeMap::<BTreeMap<&Segments, usize>, usize>::new();
for &st_id in p {
let combination = self.state_graph.get(&st_id).unwrap().iter()
.filter(|(_, st)| st_to_group.contains_key(st)) .map(|(s, st)| { (s, st_to_group[st]) })
.collect::<BTreeMap<_, _>>();
if VERBOSE { print!("- state {st_id}{}: {combination:?}", if self.end_states.contains_key(&st_id) { " <END>" } else { "" }) };
if combinations.is_empty() {
combinations.insert(combination, id); if VERBOSE { println!(" (1st, no change)"); }
} else if let Some(&group_id) = combinations.get(&combination) {
if group_id != id {
changes.push((st_id, id, group_id));
if VERBOSE { println!(" -> group #{group_id}"); }
} else if VERBOSE {
println!(" (no change)");
}
} else {
let new_id = if id < first_ending_id {
assert!(last_non_end_id + 1 < first_ending_id, "no more IDs for non-accepting state");
last_non_end_id += 1;
last_non_end_id
} else {
last_ending_id += 1;
last_ending_id
};
combinations.insert(combination, new_id);
changes.push((st_id, id, new_id));
if VERBOSE { println!(" -> new group #{new_id}"); }
}
}
}
change = !changes.is_empty();
for (st_id, old_group_id, new_group_id) in changes {
if new_group_id >= groups.len() {
groups.push(BTreeSet::<StateId>::new());
}
assert!(groups[old_group_id].remove(&st_id));
groups[new_group_id].insert(st_id);
assert_eq!(st_to_group.insert(st_id, new_group_id), Some(old_group_id));
}
if VERBOSE && change { println!("---"); }
}
if VERBOSE { println!("-----------------------------------------------------------"); }
let delta = first_ending_id - last_non_end_id - 1;
self.first_end_state = Some(last_non_end_id + 1);
if delta > 0 {
if VERBOSE {
println!("removing the gaps in group numbering: (delta={delta})");
println!("st_to_group: {st_to_group:?}");
println!("groups: {groups:?}");
}
for (_, new_st) in st_to_group.iter_mut() {
if *new_st > last_non_end_id {
*new_st -= delta;
}
}
groups = groups.into_iter().enumerate()
.filter(|(id, _)| *id <= last_non_end_id || *id >= first_ending_id)
.map(|(_, g)| g)
.to_vec();
if VERBOSE {
println!("st_to_group: {st_to_group:?}");
println!("groups: {groups:?}");
}
}
self.end_states = BTreeMap::<StateId, Terminal>::from_iter(
groups.iter().enumerate()
.filter_map(|(group_id, states)| states.iter()
.find_map(|s| self.end_states.get(s).map(|terminal| {
let mut new_terminal = terminal.clone();
if let Some(state) = &mut new_terminal.mode_state {
*state = st_to_group[state];
}
(group_id as StateId, new_terminal)
})))
);
self.initial_state = Some(st_to_group[&self.initial_state.unwrap()] as StateId);
self.state_graph = self.state_graph.iter()
.map(|(st_id, map_sym_st)| (
st_to_group[st_id] as StateId,
map_sym_st.iter().map(|(segs, st)| (segs.clone(), st_to_group[st] as StateId)).collect::<BTreeMap<_, _>>()))
.collect::<BTreeMap::<StateId, BTreeMap<Segments, StateId>>>();
if VERBOSE {
println!("new_graph: {:?}", self.state_graph);
println!("new_initial: {:?}", self.initial_state);
println!("new_end: {:?}", self.end_states);
println!("-----------------------------------------------------------");
}
debug_assert!(self.is_normalized(), "optimized state machine isn't regular\nend_states={:?}\ngraph={:?}",
self.end_states, self.state_graph);
Dfa::<Normalized> {
state_graph: self.state_graph,
initial_state: self.initial_state,
end_states: self.end_states,
first_end_state: self.first_end_state,
log: self.log,
_phantom: PhantomData,
}
}
}
impl<T> LogReader for Dfa<T> {
type Item = BufLog;
fn get_log(&self) -> &Self::Item {
&self.log
}
fn give_log(self) -> Self::Item {
self.log
}
}
impl<T> HasBuildErrorSource for Dfa<T> {
const SOURCE: BuildErrorSource = BuildErrorSource::Dfa;
}
impl Dfa<General> {
pub fn new() -> Dfa<General> {
Dfa::with_log(BufLog::new())
}
}
impl Default for Dfa<General> {
fn default() -> Self {
Self::new()
}
}
impl Dfa<Normalized> {
pub fn gen_tables_source_code(&self, indent: usize) -> String {
let mut source = Vec::<String>::new();
source.push("let dfa_tables = DfaTables::new(".to_string());
source.push(" btreemap![".to_string());
source.extend(graph_to_code(&self.state_graph, None, 8));
source.push(" ],".to_string());
source.push(format!(" {:?},", self.initial_state));
source.push(" btreemap![".to_string());
let es = self.end_states.iter().map(|(s, t)| format!("{} => {}", s, t.to_macro())).to_vec();
source.extend(es.chunks(6).map(|v| format!(" {},", v.join(", "))));
source.push(" ],".to_string());
source.push(format!(" {:?},", self.first_end_state));
source.push(");".to_string());
indent_source(vec![source], indent)
}
}
pub struct DfaBundle {
dfas: Vec<(ModeId, Dfa<General>)>,
log: Option<BufLog>,
}
impl DfaBundle {
pub fn new<T>(dfas: T) -> Self where T: IntoIterator<Item=(ModeId, Dfa<General>)> {
DfaBundle {
dfas: dfas.into_iter().collect(),
log: None
}
}
pub fn with_log<T>(log: BufLog, dfas: T) -> Self where T: IntoIterator<Item=(ModeId, Dfa<General>)> {
DfaBundle {
dfas: dfas.into_iter().collect(),
log: Some(log)
}
}
}
impl BuildFrom<DfaBundle> for Dfa<General> {
fn build_from(bundle: DfaBundle) -> Self {
let dfa_builder = DfaBuilder::with_log(bundle.log.unwrap_or_default());
dfa_builder.build_from_dfa_modes(bundle.dfas)
}
}
impl BuildFrom<DfaBuilder> for Dfa<General> {
fn build_from(dfa_builder: DfaBuilder) -> Self {
dfa_builder.build()
}
}
pub struct DfaTables {
state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
initial_state: Option<StateId>,
end_states: BTreeMap<StateId, Terminal>,
first_end_state: Option<StateId>,
}
impl DfaTables {
pub fn new(
state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
initial_state: Option<StateId>,
end_states: BTreeMap<StateId, Terminal>,
first_end_state: Option<StateId>,
) -> Self {
DfaTables { state_graph, initial_state, end_states, first_end_state }
}
}
impl BuildFrom<DfaTables> for Dfa<Normalized> {
fn build_from(source: DfaTables) -> Self {
Dfa {
state_graph: source.state_graph,
initial_state: source.initial_state,
end_states: source.end_states,
first_end_state: source.first_end_state,
log: BufLog::new(),
_phantom: PhantomData
}
}
}
fn states_to_string<T: Display>(s: &BTreeSet<T>) -> String {
s.iter().map(|id| id.to_string()).join(", ")
}
#[allow(dead_code)]
pub(crate) fn followpos_to_string(dfa_builder: &DfaBuilder) -> String {
let mut fpos = dfa_builder.followpos.iter()
.map(|(id, ids)| format!("{id:3} -> {}", ids.iter().map(|x| x.to_string()).join(", ")))
.to_vec();
fpos.sort();
fpos.join("\n")
}
fn node_to_string(tree: &ReTree, index: usize, basic: bool) -> String {
let node = tree.get(index);
let mut result = String::new();
if !basic {
if let Some(b) = node.nullable {
if b {
result.push('!');
}
} else {
result.push('?');
}
}
result.push_str(&node.to_string());
let children = tree.children(index);
if !children.is_empty() {
result.push('(');
result.push_str(&children.iter().map(|&c| node_to_string(tree, c, basic)).to_vec().join(","));
result.push(')');
}
result
}
pub fn retree_to_str(tree: &ReTree, node: Option<usize>, emphasis: Option<usize>, show_index: bool) -> String {
fn pr_join(children: Vec<(u32, String)>, str: &str, pr: u32) -> (u32, String) {
(pr,
children.into_iter()
.map(|(p_ch, ch)| if p_ch < pr { format!("({ch})") } else { ch })
.join(str))
}
fn pr_append(child: (u32, String), str: &str, pr: u32, force_par: bool) -> (u32, String) {
(pr, if force_par || child.0 < pr { format!("({}){str}", child.1) } else { format!("{}{str}", child.1) })
}
const PR_MATCH: u32 = 1;
const PR_TERM: u32 = 2;
const PR_REPEAT: u32 = 3;
const PR_ITEM: u32 = 4;
let mut children = vec![];
if tree.is_empty() {
return "<empty>".to_string();
}
let top = node.unwrap_or_else(|| tree.get_root().unwrap());
for node in tree.iter_post_depth_simple_at(top) {
let (pr, mut str) = match node.num_children() {
0 => {
match &node.op {
ReType::Empty => (PR_ITEM, "ε".to_string()),
ReType::End(t) => (PR_ITEM, t.to_string()),
ReType::Char(ch) => (PR_ITEM, format!("{ch:?}")),
ReType::CharRange(segments) => (PR_ITEM, format!("[{segments}]")),
ReType::String(str) => (PR_ITEM, format!("{str:?}")),
s => panic!("{s:?} should have children"),
}
}
n => {
let mut node_children = children.split_off(children.len() - n);
match &node.op {
ReType::Concat => pr_join(node_children, " ", PR_TERM),
ReType::Star => pr_append(node_children.pop().unwrap(), "*", PR_REPEAT, show_index),
ReType::Plus => pr_append(node_children.pop().unwrap(), "+", PR_REPEAT, show_index),
ReType::Or => pr_join(node_children, " | ", PR_MATCH),
ReType::Lazy => pr_append(node_children.pop().unwrap(), "?", PR_REPEAT, show_index),
s => panic!("{s:?} shouldn't have {n} child(ren)"),
}
}
};
if show_index {
str = format!("{}:{str}", node.index);
}
if Some(node.index) == emphasis {
str = format!(" ►►► {str} ◄◄◄ ");
}
children.push((pr, str));
}
children.pop().unwrap().1
}
pub fn tree_to_string(tree: &ReTree, root: Option<usize>, basic: bool) -> String {
if !tree.is_empty() {
node_to_string(tree, root.unwrap_or_else(|| tree.get_root().unwrap()), basic)
} else {
"None".to_string()
}
}
impl<T> Dfa<T> {
pub fn print(&self, indent: usize) {
println!("Initial state: {}", if let Some(st) = self.initial_state { st.to_string() } else { "none".to_string() });
println!("Graph:");
print_graph(&self.state_graph, Some(&self.end_states), indent);
println!("End states:\n{: >indent$}{}", " ",
self.end_states.iter().map(|(s, t)| format!("{} => {}", s, t.to_macro())).join(", "), indent=indent);
}
}
fn graph_to_code(
state_graph: &BTreeMap<StateId, BTreeMap<Segments, StateId>>,
end_states: Option<&BTreeMap<StateId, Terminal>>,
indent: usize,
) -> Vec<String> {
let s = String::from_utf8(vec![32; indent]).unwrap();
state_graph.iter().map(|(state, trans)| {
let mut has_neg = false;
let mut tr = trans.iter().map(|(sym, st)| {
let s = (sym.to_string(), st.to_string());
has_neg |= s.0.starts_with('~');
s
}).to_vec();
if has_neg {
for (sym, _) in &mut tr {
if !sym.ends_with(']') {
sym.insert(0, '[');
sym.push(']');
}
}
}
format!("{s}{} => branch!({}),{}",
state,
tr.into_iter().map(|(sym, st)| format!("{sym} => {st}")).join(", "),
end_states.and_then(|map| map.get(state).map(|token| format!(" // {}", token))).unwrap_or_default(),
)
}).collect()
}
pub fn print_graph(state_graph: &BTreeMap<StateId, BTreeMap<Segments, StateId>>, end_states: Option<&BTreeMap<StateId, Terminal>>, indent: usize) {
let src = graph_to_code(state_graph, end_states, indent);
println!("{}", src.join("\n"));
}
pub mod macros {
#[macro_export]
macro_rules! node {
(chr $char:expr) => { $crate::dfa::ReNode::char($char) };
(chr $char1:expr, $char2:expr $(;$char3:expr, $char4:expr)*) => { ($char1..=$char2)$(.chain($char3..=$char4))*.map(|c| $crate::dfa::ReNode::char(c)) };
([$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+]) => { $crate::dfa::ReNode::char_range($crate::segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
(~[$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+]) => { $crate::dfa::ReNode::char_range($crate::segments![~ $($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
(.) => { node!([DOT]) };
(str $str:expr) => { $crate::dfa::ReNode::string($str) };
(&) => { $crate::dfa::ReNode::concat() };
(|) => { $crate::dfa::ReNode::or() };
(*) => { $crate::dfa::ReNode::star() };
(+) => { $crate::dfa::ReNode::plus() };
(e) => { $crate::dfa::ReNode::empty() };
(??) => { $crate::dfa::ReNode::lazy() };
(= $id:expr) => { $crate::dfa::ReNode::end($crate::lexer::Terminal {
action: $crate::lexer::ActionOption::Token($id),
channel: 0,
mode: $crate::lexer::ModeOption::None,
mode_state: None,
pop: false
}) };
($id:expr) => { $crate::dfa::ReNode::end($id) };
([$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+]) => { node!([$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
(~[$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+]) => { node!(~ [$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
}
#[macro_export]
macro_rules! term {
(= $id:expr ) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Token($id),channel: 0, mode: $crate::lexer::ModeOption::None, mode_state: None, pop: false } };
(more) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::More, channel: 0, mode: $crate::lexer::ModeOption::None, mode_state: None, pop: false } };
(skip) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip, channel: 0, mode: $crate::lexer::ModeOption::None, mode_state: None, pop: false } };
(mode $id:expr) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip, channel: 0, mode: $crate::lexer::ModeOption::Mode($id), mode_state: None, pop: false } };
(push $id:expr) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip, channel: 0, mode: $crate::lexer::ModeOption::Push($id), mode_state: None, pop: false } };
(pushst $id:expr) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip, channel: 0, mode: $crate::lexer::ModeOption::None, mode_state: Some($id), pop: false } };
(pop) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip, channel: 0, mode: $crate::lexer::ModeOption::None, mode_state: None, pop: true } };
(# $id:expr) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip, channel: $id, mode: $crate::lexer::ModeOption::None, mode_state: None, pop: false } };
}
#[cfg(test)]
mod tests {
use crate::{branch, btreemap};
use crate::dfa::{graph_to_code, ReNode};
use crate::char_reader::{UTF8_HIGH_MIN, UTF8_LOW_MAX, UTF8_MAX, UTF8_MIN};
use crate::lexer::{ActionOption, ModeOption, Terminal};
use crate::segments::Segments;
use crate::segmap::Seg;
#[test]
fn macro_node() {
assert_eq!(node!([0 - LOW_MAX, HIGH_MIN - MAX]), ReNode::char_range(Segments::dot()));
assert_eq!(node!(~[0 - LOW_MAX, HIGH_MIN - MAX]), ReNode::char_range(Segments::empty()));
assert_eq!(node!(chr 'a'), ReNode::char('a'));
assert_eq!(node!(['a'-'z', '0'-'9']), ReNode::char_range(Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)])));
assert_eq!(node!(.), ReNode::char_range(Segments::from([Seg(UTF8_MIN, UTF8_LOW_MAX), Seg(UTF8_HIGH_MIN, UTF8_MAX)])));
assert_eq!(node!(str "new"), ReNode::string("new"));
assert_eq!(node!(=5), ReNode::end(Terminal { action: ActionOption::Token(5), channel: 0, mode: ModeOption::None, mode_state: None, pop: false }));
assert_eq!(node!(&), ReNode::concat());
assert_eq!(node!(|), ReNode::or());
assert_eq!(node!(*), ReNode::star());
assert_eq!(node!(+), ReNode::plus());
assert_eq!(node!(e), ReNode::empty());
}
#[test]
fn state_graph() {
let test = btreemap![
1 => branch!(),
2 => branch!('A' => 0),
3 => branch!('B' => 1, 'C' => 2),
4 => branch!('D'-'F' => 3),
5 => branch!('0'-'9', '_' => 4),
6 => branch!(DOT => 5),
7 => branch!(~['*'] => 6),
8 => branch!(~['+', '-'] => 7),
9 => branch!(~['*'] => 8, ['*'] => 9),
];
let s = graph_to_code(&test, None, 4);
assert_eq!(s, vec![
" 1 => branch!(),".to_string(),
" 2 => branch!('A' => 0),".to_string(),
" 3 => branch!('B' => 1, 'C' => 2),".to_string(),
" 4 => branch!('D'-'F' => 3),".to_string(),
" 5 => branch!('0'-'9', '_' => 4),".to_string(),
" 6 => branch!(DOT => 5),".to_string(),
" 7 => branch!(~['*'] => 6),".to_string(),
" 8 => branch!(~['+', '-'] => 7),".to_string(),
" 9 => branch!(~['*'] => 8, ['*'] => 9),".to_string(),
]);
}
}
}