lexigram_lib/dfa/
mod.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3pub(crate) mod tests;
4
5use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
6use std::fmt::{Display, Formatter};
7use std::marker::PhantomData;
8use vectree::VecTree;
9use lexigram_core::CollectJoin;
10use lexigram_core::char_reader::escape_string;
11use crate::{btreeset, indent_source, General, Normalized};
12use crate::char_reader::escape_char;
13use crate::lexer::{ModeId, ModeOption, StateId, Terminal};
14use lexigram_core::log::{BufLog, LogReader, LogStatus, Logger};
15use crate::build::{BuildErrorSource, BuildFrom, HasBuildErrorSource};
16use crate::segments::Segments;
17use crate::segmap::Seg;
18use crate::take_until::TakeUntilIterator;
19
20// ---------------------------------------------------------------------------------------------
21
22#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
23pub enum ReType { // todo: remove Boxes
24    #[default] Empty,
25    End(Box<Terminal>),
26    Char(char),
27    CharRange(Box<Segments>),
28    String(Box<String>),
29    Concat,
30    Star,
31    Plus,
32    Or,
33    Lazy,
34}
35
36#[test]
37fn retype_size() {
38    let size = size_of::<ReType>();
39    assert!(size <= 16, "size of ReType is too big: {size}");
40}
41
42impl ReType {
43    pub fn is_empty(&self) -> bool {
44        matches!(self, ReType::Empty)
45    }
46
47    pub fn is_leaf(&self) -> bool {
48        matches!(self, ReType::Empty | ReType::End(_) | ReType::Char(_) | ReType::CharRange(_) | ReType::String(_))
49    }
50
51    pub fn is_nullable(&self) -> Option<bool> {
52        match self {
53            ReType::Empty | ReType::Star => Some(true),
54            ReType::End(_) | ReType::Char(_) | ReType::CharRange(_) | ReType::String(_) | ReType::Plus => Some(false),
55            ReType::Concat | ReType::Or | ReType::Lazy => None,
56        }
57    }
58
59    pub fn is_end(&self) -> bool {
60        matches!(self, ReType::End(_))
61    }
62
63    // pub fn apply_chars<F: FnMut(char) -> ()>(&self, mut f: F) {
64    //     match self {
65    //         ReType::Char(c) => f(*c),
66    //         ReType::CharRange(i) => i.0.iter().flat_map(|(a, b)| (*a..=*b)).for_each(|code| f(char::from_u32(code).unwrap())),
67    //         _ => {}
68    //     }
69    // }
70}
71
72impl Display for ReType {
73    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
74        match self {
75            ReType::Empty => write!(f, "-"),
76            ReType::End(t) => write!(f, "{t}"),
77            ReType::Char(c) => write!(f, "'{}'", escape_char(*c)),
78            ReType::CharRange(segments) => write!(f, "[{segments}]"),
79            ReType::String(s) => write!(f, "'{}'", escape_string(&s)),
80            ReType::Concat => write!(f, "&"),
81            ReType::Star => write!(f, "*"),
82            ReType::Plus => write!(f, "+"),
83            ReType::Or => write!(f, "|"),
84            ReType::Lazy => write!(f, "??"),
85        }
86    }
87}
88
89type Id = u32;
90
91#[derive(Clone, Debug, PartialEq, Default)]
92pub struct ReNode {
93    id: Option<Id>,
94    op: ReType,
95    firstpos: HashSet<Id>,  // todo: use BTreeSet or Vec instead? Move to DfaBuilder?
96    lastpos: HashSet<Id>,   // todo: use BTreeSet or Vec instead? Move to DfaBuilder?
97    nullable: Option<bool>
98}
99
100impl ReNode {
101    pub fn new(node: ReType) -> ReNode {
102        ReNode { id: None, op: node, firstpos: HashSet::new(), lastpos: HashSet::new(), nullable: None }
103    }
104
105    pub fn empty() -> ReNode { ReNode::new(ReType::Empty) }
106
107    pub fn end(t: Terminal) -> ReNode { ReNode::new(ReType::End(Box::new(t))) }
108
109    pub fn char(c: char) -> ReNode { ReNode::new(ReType::Char(c)) }
110
111    pub fn char_range(s: Segments) -> ReNode { ReNode::new(ReType::CharRange(Box::new(s))) }
112
113    pub fn string<T: Into<String>>(s: T) -> ReNode { ReNode::new(ReType::String(Box::new(s.into()))) }
114
115    pub fn concat() -> ReNode { ReNode::new(ReType::Concat) }
116
117    pub fn star() -> ReNode { ReNode::new(ReType::Star) }
118
119    pub fn plus() -> ReNode { ReNode::new(ReType::Plus) }
120
121    pub fn or() -> ReNode { ReNode::new(ReType::Or) }
122
123    pub fn lazy() -> ReNode { ReNode::new(ReType::Lazy) }
124
125    pub fn is_leaf(&self) -> bool {
126        self.op.is_leaf()
127    }
128
129    pub fn is_empty(&self) -> bool {
130        self.op.is_empty()
131    }
132
133    pub fn get_type(&self) -> &ReType {
134        &self.op
135    }
136
137    pub fn is_nullable(&self) -> Option<bool> {
138        self.nullable
139    }
140
141    pub fn get_mut_type(&mut self) -> &mut ReType {
142        &mut self.op
143    }
144}
145
146impl Display for ReNode {
147    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
148        if let Some(id) = self.id {
149            write!(f, "{id}:")?;
150        }
151        self.op.fmt(f)
152    }
153}
154
155// ---------------------------------------------------------------------------------------------
156
157pub type ReTree = VecTree<ReNode>;
158
159#[derive(Clone)]
160pub struct DfaBuilder {
161    /// Regular Expression tree
162    re: ReTree,
163    /// `followpos` table, containing the `Id` -> `Id` graph of `re`
164    followpos: HashMap<Id, HashSet<Id>>,
165    /// `lazypos[id_child]` includes `id_lazy` when `id_child` is a child of a lazy operator `id_lazy`
166    lazypos: HashSet<Id>,
167    /// `Id` -> node index
168    ids: HashMap<Id, usize>,
169    log: BufLog
170}
171
172impl DfaBuilder {
173    pub fn new() -> Self {
174        Self::with_log(BufLog::new())
175    }
176
177    pub fn with_log(log: BufLog) -> Self {
178        DfaBuilder {
179            re: VecTree::new(),
180            followpos: HashMap::new(),
181            lazypos: HashSet::new(),
182            ids: HashMap::new(),
183            log
184        }
185    }
186
187    #[inline(always)]
188    pub fn get_re(&self) -> &ReTree {
189        &self.re
190    }
191
192    /// Replaces ReType::String(s) with a concatenation of ReType::Char(s[i])
193    fn preprocess_re(&mut self) {
194        let mut nodes = vec![];
195        for mut inode in self.re.iter_depth_simple_mut() {
196            if matches!(inode.op, ReType::String(_)) {
197                // we have to do it again to move the string
198                if let ReType::String(s) = std::mem::take(&mut inode.op) {
199                    if s.is_empty() {
200                        // will be replaced by an empty Concat, but shouldn't exist
201                        self.log.add_error(format!("node #{} is an empty string", inode.index));
202                    }
203                    nodes.push((inode.index, s));
204                }
205            }
206        }
207        for (index, s) in nodes {
208            let node = self.re.get_mut(index);
209            match s.len() {
210                0 => node.op = ReType::Char('?'), // empty string replaced by '?'
211                1 => node.op = ReType::Char(s.chars().nth(0).unwrap()),
212                _ => {
213                    node.op = ReType::Concat;
214                    for c in s.chars() {
215                        self.re.add(Some(index), ReNode::char(c));
216                    }
217                }
218            }
219        }
220    }
221
222    /// Calculates `firstpos`, `lastpost`, `nullable` for each node, and the `followpos` table.
223    fn calc_node_pos(&mut self) {
224        let mut id = 0;
225        for mut inode in self.re.iter_depth_mut() {
226            if inode.is_leaf() {
227                if inode.num_children() > 0 {
228                    self.log.add_error(format!("node #{} {} had {} child(ren) but shouldn't have any", inode.index, inode.op, inode.num_children()));
229                }
230                id += 1;
231                inode.id = Some(id);
232                if !inode.is_empty() {
233                    inode.firstpos.insert(id);
234                    inode.lastpos.insert(id);
235                }
236                self.ids.insert(id, inode.index);
237                if inode.op.is_end() {
238                    self.followpos.insert(id, HashSet::new());
239                }
240            } else {
241                match inode.op {
242                    ReType::Concat => {
243                        if inode.num_children() > 0 {
244                            // firstpos = union of all firstpos until the first non-nullable child (included)
245                            let mut firstpos = HashSet::<Id>::new();
246                            for child in inode.iter_children_simple().take_until(|&n| !n.nullable.unwrap()) {
247                                firstpos.extend(&child.firstpos);
248                            }
249                            inode.firstpos.extend(firstpos);
250                            // lastpos = union of all lastpos until the first non-nullable child (included), starting from the end
251                            let mut lastpos = HashSet::<Id>::new();
252                            for child in inode.iter_children_simple().rev().take_until(|&n| !n.nullable.unwrap()) {
253                                lastpos.extend(&child.lastpos);
254                            }
255                            inode.lastpos.extend(lastpos);
256                            // followpos:
257                            // for all pairs of consecutive children {c[i], c[i+1]},
258                            //     for all j in c[i].lastpos
259                            //         followpos[j].extend(c[i+1].firstpos)
260                            let mut iter = inode.iter_children_simple();
261                            let a = iter.next().unwrap();   // a is c[i]
262                            let mut lastpos = a.lastpos.clone();
263                            while let Some(b) = iter.next() {   // b is c[i+1]
264                                for j in &lastpos {
265                                    if !self.followpos.contains_key(j) {
266                                        self.followpos.insert(*j, HashSet::new());
267                                    }
268                                    self.followpos.get_mut(j).unwrap().extend(&b.firstpos);
269                                }
270                                // we must build the lastpos during the iteration
271                                // &(0,1,2,3) = &2(&1(&0(0,1),2),3) => &0.lpos=0.lpos, &1.lpos=lastpos("&",[&0,2])), ...
272                                if b.nullable.unwrap() {
273                                    lastpos.extend(&b.lastpos);
274                                } else {
275                                    lastpos = b.lastpos.clone();
276                                }
277                            }
278                        } else {
279                            self.log.add_error(format!("node #{} is Concat but has no children", inode.index))
280                        }
281                    }
282                    ReType::Star | ReType::Plus => {
283                        if inode.num_children() == 1 {
284                            // firstpos, lastpos identical to child's
285                            let firstpos = inode.iter_children_simple().next().unwrap().firstpos.iter().map(|&n| n).to_vec();
286                            inode.firstpos.extend(firstpos);
287                            let lastpos = inode.iter_children_simple().next().unwrap().lastpos.iter().map(|&n| n).to_vec();
288                            inode.lastpos.extend(lastpos);
289                            // followpos:
290                            // for all i in *.lastpos,
291                            //     followpos[i].extend(*.firstpos)
292
293                            for i in &inode.lastpos {
294                                if !self.followpos.contains_key(i) {
295                                    self.followpos.insert(*i, HashSet::new());
296                                }
297                                self.followpos.get_mut(i).unwrap().extend(&inode.firstpos);
298                            }
299                        } else {
300                            self.log.add_error(format!("node #{} is {:?} but has {} child(ren) instead of 1 child",
301                                                       inode.index, inode.op, inode.num_children()))
302                        }
303                    }
304                    ReType::Or => {
305                        if inode.num_children() > 0 {
306                            // firstpos, lastpost = union of children's
307                            let mut firstpos = HashSet::<Id>::new();
308                            for child in inode.iter_children_simple() {
309                                firstpos.extend(&child.firstpos);
310                            }
311                            inode.firstpos.extend(firstpos);
312                            let mut lastpos = HashSet::<Id>::new();
313                            for child in inode.iter_children_simple() {
314                                lastpos.extend(&child.lastpos);
315                            }
316                            inode.lastpos.extend(lastpos);
317                        } else {
318                            self.log.add_error(format!("node #{} is Or but has no children", inode.index));
319                        }
320                    }
321                    ReType::Lazy => {
322                        if inode.num_children() == 1 {
323                            let child = inode.iter_children_simple().next().unwrap();
324                            let firstpos = child.firstpos.clone();
325                            let lastpos = child.lastpos.clone();
326                            inode.firstpos = firstpos;
327                            inode.lastpos = lastpos;
328                            for ichild in inode.iter_depth_simple().filter(|node| node.is_leaf()) {
329                                self.lazypos.insert(ichild.id.unwrap());
330                            }
331                        } else {
332                            self.log.add_error(format!("node #{} is Lazy but has {} child(ren) instead of 1 child",
333                                                       inode.index, inode.num_children()));
334                        }
335                    }
336                    _ => panic!("{:?}: no way to compute firstpos/...", &*inode)
337                }
338            }
339            if let Some(nullable) = inode.op.is_nullable() {
340                inode.nullable = Some(nullable);
341            } else {
342                inode.nullable = match &inode.op {
343                    ReType::Concat | ReType::Lazy => Some(inode.iter_children_simple().all(|child| child.nullable.unwrap())),
344                    ReType::Or => Some(inode.iter_children_simple().any(|child| child.nullable.unwrap())),
345                    op => panic!("{:?} should have a fixed nullable property", op)
346                }
347            }
348        }
349    }
350
351    fn calc_states(&mut self) -> Dfa<General> {
352        const VERBOSE: bool = false;
353        const RESOLVE_END_STATES: bool = true;
354        const RM_LAZY_BRANCHES: bool = true;
355        // initial state from firstpos(top node)
356        let mut dfa = Dfa::new();
357        if !self.log.has_no_errors() {
358            return dfa;
359        }
360        if VERBOSE { println!("new DFA"); }
361        let mut current_id = 0;
362        let key = BTreeSet::from_iter(self.re.get(0).firstpos.iter().map(|&id| id));
363        let mut new_states = BTreeSet::<BTreeSet<Id>>::new();
364        new_states.insert(key.clone());
365        let mut states = BTreeMap::<BTreeSet<Id>, StateId>::new();
366        states.insert(key, current_id);
367        dfa.initial_state = Some(current_id);
368
369        // gathers lazy ids and their immediate followpos to remove phantom branches:
370        let mut lazy_followpos = self.lazypos.iter().map(|id| *id).collect::<BTreeSet<Id>>();
371        lazy_followpos.extend(self.lazypos.iter().filter_map(|id| self.followpos.get(id)).flatten());
372        if VERBOSE { println!("lazy_followpos = {{{}}}", lazy_followpos.iter().join(", ")); }
373
374        // gets a partition of the symbol segments and changes Char to CharRange
375        let mut symbols_part = Segments::empty();
376        for id in self.ids.values() {
377            let node = self.re.get_mut(*id);
378            if let ReType::Char(c) = node.op {
379                node.op = ReType::CharRange(Box::new(Segments::from(c)));
380            }
381            if let ReType::CharRange(segments) = &node.op {
382                symbols_part.add_partition(&segments);
383            }
384        }
385        if VERBOSE { println!("symbols = {symbols_part:X}"); }
386
387        // prepares the segments and their source ids
388        while let Some(s) = new_states.pop_first() {
389            let new_state_id = states.get(&s).unwrap().clone();
390            let is_lazy_state = s.iter().all(|id| lazy_followpos.contains(id));
391            if VERBOSE {
392                println!("- state {} = {{{}}}{}", new_state_id, states_to_string(&s), if is_lazy_state { ", lazy state" } else { "" });
393            }
394            let mut trans = BTreeMap::<Seg, BTreeSet<Id>>::new();   // transitions partitioned by `symbol_parts`
395            let mut terminals = BTreeMap::<Id, &Terminal>::new();   // all terminals (used if terminal conflicts)
396            let mut first_terminal_id: Option<Id> = None;   // first met terminal id, if any (used if terminal conflicts)
397            let mut id_transitions = BTreeSet::<Id>::new(); // ids that are destination from current state (used if terminal conflicts)
398            let mut id_terminal: Option<Id> = None;         // selected terminal id, if any (used to remove phantom branches)
399            for (symbol, id) in s.iter().map(|id| (&self.re.get(self.ids[id]).op, *id)) {
400                if symbol.is_end() {
401                    if !dfa.state_graph.contains_key(&new_state_id) {
402                        if VERBOSE { println!("  + {symbol} => create state {new_state_id}"); }
403                        dfa.state_graph.insert(new_state_id, BTreeMap::new());
404                    }
405                    if let ReType::End(t) = symbol {
406                        id_terminal = Some(id);
407                        if first_terminal_id.is_none() {
408                            first_terminal_id = Some(id);
409                            if new_state_id == 0 {
410                                if t.is_only_skip() {
411                                    self.log.add_warning(format!("<skip> on initial state is a risk of infinite loop on bad input ({t})"));
412                                } else if t.is_token() {
413                                    self.log.add_warning(format!("<token> on initial state returns a token on bad input ({t})"));
414                                }
415                            }
416                        }
417                        if RESOLVE_END_STATES {
418                            if let Some(t2) = terminals.insert(id, t) {
419                                panic!("overriding {id} -> {t2} with {id} -> {t} in end_states {}",
420                                       terminals.iter().map(|(id, t)| format!("{id} {t}")).join(", "));
421                            }
422                        } else {
423                            if !dfa.end_states.contains_key(&new_state_id) {
424                                dfa.end_states.insert(new_state_id, *t.clone());
425                                if VERBOSE { println!("  # end state: id {id} {t}"); }
426                            } else if VERBOSE {
427                                println!("  # end state: id {id} {t} ## DISCARDED since another one already taken");
428                            }
429                        }
430                    } else {
431                        panic!("unexpected END symbol: {symbol:?}");
432                    }
433                } else {
434                    if let ReType::CharRange(segments) = symbol {
435                        if let Some(set) = self.followpos.get(&id) {
436                            id_transitions.extend(set);
437                            let cmp = segments.intersect(&symbols_part);
438                            assert!(cmp.internal.is_empty(), "{symbols_part} # {segments} = {cmp}");
439                            if VERBOSE { println!("  + {} to {}", &cmp.common, id); }
440                            for segment in cmp.common.into_iter() {
441                                if let Some(ids) = trans.get_mut(&segment) {
442                                    ids.insert(id);
443                                } else {
444                                    trans.insert(segment, btreeset![id]);
445                                }
446                            }
447                        } else {
448                            self.log.add_error(format!("node #{id} is not in followpos; is an accepting state missing? Orphan segment: {segments}"));
449                        }
450                    }
451                }
452            }
453            if RESOLVE_END_STATES {
454                if terminals.len() > 1 {
455                    if VERBOSE {
456                        println!("  # {id_transitions:?}");
457                        println!("  # terminal conflict: {}", terminals.iter()
458                            .map(|(id, t)| format!("{id} -> {t} (has{} trans)", if id_transitions.contains(id) { "" } else { " no" }))
459                            .join(", "));
460                    }
461                    // The potential terminals are obtained by removing all terminals associated with an id that is already the destination
462                    // of at least one transition from this state. The idea is to favour terminals that don't have another chance to be
463                    // used, in case of terminal conflict.
464                    let mut potentials = terminals.keys().cloned().filter(|id| !id_transitions.contains(id)).collect::<BTreeSet<_>>();
465                    let chosen = match potentials.len() {
466                        0 => {
467                            if VERBOSE { println!("    all ids have transitions => AMBIGUOUS, selecting the first defined terminal"); }
468                            self.log.add_warning(format!("conflicting terminals for state {new_state_id}, none having other transitions: {}",
469                                                         terminals.iter().map(|(id, t)| format!("ID {id} -> terminal {t}")).join(", ")));
470                            first_terminal_id.unwrap()
471                        }
472                        1 => {
473                            if VERBOSE { println!("    only one id has no transitions => selecting it"); }
474                            potentials.pop_first().unwrap()
475                        }
476                        n => {
477                            self.log.add_warning(format!("conflicting terminals for state {new_state_id}, {n} having no other transition: {}",
478                                                         terminals.iter().map(|(id, t)| format!("ID {id} -> terminal {t}")).join(", ")));
479                            if potentials.contains(&first_terminal_id.unwrap()) {
480                                if VERBOSE { println!("    {n} ids have no transitions => AMBIGUOUS, selecting the first defined terminal"); }
481                                first_terminal_id.unwrap()
482                            } else {
483                                if VERBOSE { println!("    {n} ids have no transitions => AMBIGUOUS, selecting the first one of the list"); }
484                                potentials.pop_first().unwrap()
485                            }
486                        }
487                    };
488                    id_terminal = Some(chosen);
489                    let t = terminals.remove(&chosen).unwrap().clone();
490                    if VERBOSE { println!("    end state: id {chosen} {t}"); }
491                    dfa.end_states.insert(new_state_id, t);
492                } else if let Some((id, terminal)) = terminals.pop_first() {
493                    id_terminal = Some(id);
494                    if VERBOSE { println!("  # end state: id {id} {terminal}"); }
495                    dfa.end_states.insert(new_state_id, terminal.clone());
496                }
497            }
498
499            let has_non_lazy_terminal = id_terminal.map(|id| !lazy_followpos.contains(&id)).unwrap_or(false);
500
501            // finds the destination ids (creating new states if necessary), and populates the symbols for each destination
502            let mut map = BTreeMap::<StateId, Segments>::new();
503            for (segment, ids) in trans {
504                if VERBOSE { print!("  - {} in {}: ", segment, states_to_string(&ids)); }
505                if RM_LAZY_BRANCHES && !is_lazy_state && has_non_lazy_terminal && ids.iter().all(|id| lazy_followpos.contains(id)) {
506                    if VERBOSE { println!(" => lazy, removed"); }
507                    continue;
508                }
509                let mut state = BTreeSet::new();
510                for id in ids {
511                    state.extend(&self.followpos[&id]);
512                }
513                if VERBOSE { print!("follow = {{{}}}", states_to_string(&state)); }
514                let state_id = if let Some(state_id) = states.get(&state) {
515                    if VERBOSE { println!(" => state {state_id}"); }
516                    *state_id
517                } else {
518                    new_states.insert(state.clone());
519                    current_id += 1;
520                    if VERBOSE { println!(" => new state {} = {{{}}}", current_id, states_to_string(&state)); }
521                    states.insert(state, current_id);
522                    current_id
523                };
524                if let Some(segments) = map.get_mut(&state_id) {
525                    segments.insert(segment);
526                } else {
527                    map.insert(state_id, Segments::new(segment));
528                }
529            }
530            // regroups the symbols per destination
531            for segments in map.values_mut() {
532                segments.normalize();
533            }
534            if VERBOSE {
535                for (st, int) in &map {
536                    println!("  {} -> {}", int, st);
537                }
538            }
539            // finally, updates the graph with the reverse (symbol -> state) data
540            dfa.state_graph.insert(new_state_id, map.into_iter().map(|(id, segments)| (segments, id)).collect());
541        }
542        dfa
543    }
544
545    fn build(mut self) -> Dfa<General> {
546        self.log.add_note("building DFA...");
547        self.calc_node_pos();
548        let mut dfa = self.calc_states();
549        // transfers all the log messages to the Dfa
550        dfa.log.extend(std::mem::replace(&mut self.log, BufLog::new()));
551        dfa
552    }
553
554    #[cfg(test)]
555    pub(crate) fn build_from_graph<T: IntoIterator<Item=(StateId, Terminal)>>(
556        &mut self, graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>, init_state: StateId, end_states: T,
557    ) -> Option<Dfa<General>>
558    {
559        let mut dfa = Dfa::<General> {
560            state_graph: graph,
561            initial_state: Some(init_state),
562            end_states: BTreeMap::from_iter(end_states),
563            first_end_state: None,
564            log: BufLog::new(),
565            _phantom: PhantomData
566        };
567        dfa.first_end_state = dfa.end_states.keys().min().cloned();
568        // TODO: add checks
569        Some(dfa)
570    }
571
572    /// Merges several DFA graphs into one. The graphs represent different modes that are called with the
573    /// `Action::pushMode(id)` action.
574    fn build_from_dfa_modes<T, U>(self, dfas: T) -> Dfa<General>
575    where
576        T: IntoIterator<Item=(ModeId, Dfa<U>)>,
577    {
578        assert!(self.re.is_empty(), "build_from_dfa_modes() used on non-empty builder");
579        let mut dfa = Dfa::<General>::with_log(self.log);
580        dfa.log.add_note("combining DFA modes...");
581        let mut init_states = BTreeMap::new();
582        for (idx, new_dfa) in dfas {
583            dfa.log.add_note(format!("- DFA mode #{idx}"));
584            dfa.log.extend(new_dfa.log);
585            if new_dfa.state_graph.is_empty() {
586                dfa.log.add_error(format!("DFA mode #{idx} is empty"));
587            }
588            let offset = if init_states.is_empty() { 0 } else { 1 + dfa.state_graph.keys().max().unwrap() };
589            dfa.initial_state = dfa.initial_state.or(new_dfa.initial_state);
590            if let Some(initial_state) = new_dfa.initial_state {
591                if init_states.insert(idx, offset + initial_state).is_some() {
592                    dfa.log.add_error(format!("DFA mode #{idx} defined multiple times"))
593                };
594            } else {
595                dfa.log.add_error(format!("DFA mode #{idx} has no initial state"));
596            }
597            for (st_from, mut map) in new_dfa.state_graph {
598                for (_, st_to) in map.iter_mut() {
599                    *st_to += offset;
600                }
601                assert!(!dfa.state_graph.contains_key(&(st_from + offset)));
602                dfa.state_graph.insert(st_from + offset, map);
603            }
604            dfa.end_states.extend(new_dfa.end_states.into_iter().map(|(st, term)| (st + offset, term)));
605        }
606        if init_states.len() > 1 {
607            for (_, term) in dfa.end_states.iter_mut() {
608                term.mode_state = match term.mode {
609                    ModeOption::None => None,
610                    ModeOption::Mode(m) | ModeOption::Push(m) => {
611                        let state_opt = init_states.get(&m);
612                        if state_opt.is_none() {
613                            dfa.log.add_error(format!("unknown mode {m} in combined DFA"));
614                        }
615                        state_opt.cloned()
616                    }
617                };
618            }
619            dfa.first_end_state = None;
620        }
621        if dfa.log.has_no_errors() {
622            Dfa::<General> {
623                state_graph: dfa.state_graph,
624                initial_state: dfa.initial_state,
625                end_states: dfa.end_states,
626                first_end_state: dfa.first_end_state,
627                log: dfa.log,
628                _phantom: PhantomData,
629            }
630        } else {
631            Dfa::with_log(dfa.log)
632        }
633    }
634}
635
636impl LogReader for DfaBuilder {
637    type Item = BufLog;
638
639    fn get_log(&self) -> &Self::Item {
640        &self.log
641    }
642
643    fn give_log(self) -> Self::Item {
644        self.log
645    }
646}
647
648impl HasBuildErrorSource for DfaBuilder {
649    const SOURCE: BuildErrorSource = BuildErrorSource::DfaBuilder;
650}
651
652impl BuildFrom<ReTree> for DfaBuilder {
653    /// Creates a [`DfaBuilder`] from a tree of regular expression [`VecTree`]<[`ReNode`]>.
654    fn build_from(regex_tree: ReTree) -> Self {
655        let mut builder = DfaBuilder {
656            re: regex_tree,
657            followpos: HashMap::new(),
658            lazypos: HashSet::new(),
659            ids: HashMap::new(),
660            log: BufLog::new()
661        };
662        builder.preprocess_re();
663        builder
664    }
665}
666
667// ---------------------------------------------------------------------------------------------
668
669pub struct Dfa<T> {
670    state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
671    initial_state: Option<StateId>,
672    end_states: BTreeMap<StateId, Terminal>,
673    first_end_state: Option<StateId>,
674    pub(crate) log: BufLog,
675    _phantom: PhantomData<T>
676}
677
678impl<T> Dfa<T> {
679    pub fn with_log(log: BufLog) -> Self {
680        Dfa {
681            state_graph: BTreeMap::new(),
682            initial_state: None,
683            end_states: BTreeMap::new(),
684            first_end_state: None,
685            log,
686            _phantom: PhantomData
687        }
688    }
689
690    pub fn get_state_graph(&self) -> &BTreeMap<StateId, BTreeMap<Segments, StateId>> {
691        &self.state_graph
692    }
693
694    pub fn get_initial_state(&self) -> &Option<StateId> {
695        &self.initial_state
696    }
697
698    pub fn get_end_states(&self) -> &BTreeMap<StateId, Terminal> {
699        &self.end_states
700    }
701
702    pub fn get_first_end_state(&self) -> &Option<StateId> {
703        &self.first_end_state
704    }
705
706    /// Checks if the DFA is normalized: incremental state numbers, starting at 0, with all the accepting states
707    /// at the end.
708    pub fn is_normalized(&self) -> bool {
709        if self.state_graph.is_empty() {
710            return true;
711        }
712        let mut states = self.state_graph.keys().to_vec();
713        states.sort();
714        if *states[0] != 0 {
715            false
716        } else {
717            let mut last_end = self.end_states.contains_key(states[0]);
718            let mut last: StateId = 0;
719            for &st in states.iter().skip(1) {
720                let end = self.end_states.contains_key(st);
721                if (*st != last + 1) || (!end && last_end) {
722                    return false;
723                }
724                last_end = end;
725                last = *st;
726            }
727            true
728        }
729    }
730
731    /// Normalizes the DFA: incremental state number0, starting at 0, with all the accepting states
732    /// at the end.
733    pub fn normalize(mut self) -> Dfa<Normalized> {
734        if !self.log.has_no_errors() {
735            // if there are errors, we bypass any processing and return a shell containing the log
736            return Dfa::<Normalized>::with_log(std::mem::take(&mut self.log));
737        }
738        self.log.add_note("normalizing DFA...");
739        let mut translate = BTreeMap::<StateId, StateId>::new();
740        let mut state_graph = BTreeMap::<StateId, BTreeMap<Segments, StateId>>::new();
741        let mut end_states = BTreeMap::<StateId, Terminal>::new();
742        let nbr_end = self.end_states.len();
743        let mut non_end_id = 0;
744        let mut end_id = self.state_graph.len() - nbr_end;
745        self.first_end_state = Some(end_id);
746        for &id in self.state_graph.keys() {
747            if let Some(terminal) = self.end_states.get(&id) {
748                translate.insert(id, end_id);
749                end_states.insert(end_id, terminal.clone());
750                end_id += 1;
751            } else {
752                translate.insert(id, non_end_id);
753                non_end_id += 1;
754            };
755        }
756        self.initial_state = self.initial_state.map(|st| *translate.get(&st).unwrap());
757        for s in end_states.iter_mut() {
758            if let Some(state) = s.1.mode_state {
759                s.1.mode_state = Some(translate[&state])
760            }
761        }
762        self.end_states = end_states;
763        for (id, mut trans) in std::mem::take(&mut self.state_graph) {
764            for (_, st) in trans.iter_mut() {
765                *st = translate[st];
766            }
767            state_graph.insert(translate[&id], trans);
768        }
769        self.state_graph = state_graph;
770        Dfa::<Normalized> {
771            state_graph: self.state_graph,
772            initial_state: self.initial_state,
773            end_states: self.end_states,
774            first_end_state: self.first_end_state,
775            log: self.log,
776            _phantom: PhantomData,
777        }
778    }
779
780    /// Optimizes the number of states from `self.state_graph`. Returns a map to convert old
781    /// state ids to new state ids.
782    pub fn optimize(mut self) -> Dfa<Normalized> {
783        const VERBOSE: bool = false;
784        if !self.log.has_no_errors() {
785            // if there are errors, we bypass any processing and return a shell containing the log
786            return Dfa::<Normalized>::with_log(std::mem::take(&mut self.log));
787        }
788        self.log.add_note("optimizing DFA...");
789        // set `separate_end_states` = `true` if different end (accepting) states should be kept apart;
790        // for example, when it's important to differentiate tokens.
791        let separate_end_states = true;
792        if VERBOSE { println!("-----------------------------------------------------------"); }
793        let mut groups = Vec::<BTreeSet<StateId>>::new();
794        let mut st_to_group = BTreeMap::<StateId, usize>::new();
795        let nbr_non_end_states = self.state_graph.len() - self.end_states.len();
796        let mut last_non_end_id = 0;
797        let first_ending_id = nbr_non_end_states + 1;
798
799        // initial partition
800        // - all non-end states
801        let mut group = BTreeSet::<StateId>::new();
802        for st in self.state_graph.keys().filter(|&st| !self.end_states.contains_key(st)) {
803            group.insert(*st);
804            st_to_group.insert(*st, 0);
805        }
806        groups.push(group);
807        // - reserves a few empty groups for later non-ending groups:
808        for _ in 1..first_ending_id {
809            groups.push(BTreeSet::new());
810        }
811        // - end states
812        if separate_end_states {
813            for (st, _) in &self.end_states {
814                st_to_group.insert(*st, groups.len());
815                groups.push(BTreeSet::<StateId>::from([*st]));
816            }
817        } else {
818            st_to_group.extend(self.end_states.iter().map(|(id, _)| (*id, groups.len())));
819            groups.push(BTreeSet::from_iter(self.end_states.keys().map(|st| *st)));
820        }
821        let mut last_ending_id = groups.len() - 1;
822        let mut change = true;
823        while change {
824            let mut changes = Vec::<(StateId, usize, usize)>::new();   // (state, old group, new group)
825            for (id, p) in groups.iter().enumerate().filter(|(_, g)| !g.is_empty()) {
826                // do all states have the same destination group for the same symbol?
827                if VERBOSE { println!("group #{id}: {p:?}:"); }
828                // stores combination -> group index:
829                let mut combinations = BTreeMap::<BTreeMap<&Segments, usize>, usize>::new();
830                for &st_id in p {
831                    let combination = self.state_graph.get(&st_id).unwrap().iter()
832                        .filter(|(_, st)| st_to_group.contains_key(st)) // to avoid fake "end" states
833                        .map(|(s, st)| { (s, st_to_group[st]) })
834                        .collect::<BTreeMap<_, _>>();
835                    if VERBOSE { print!("- state {st_id}{}: {combination:?}", if self.end_states.contains_key(&st_id) { " <END>" } else { "" }) };
836                    if combinations.is_empty() {
837                        combinations.insert(combination, id);   // first one remains in this group
838                        if VERBOSE { println!(" (1st, no change)"); }
839                    } else {
840                        if let Some(&group_id) = combinations.get(&combination) {
841                            // programs the change if it's one of the new groups
842                            if group_id != id {
843                                changes.push((st_id, id, group_id));
844                                if VERBOSE { println!(" -> group #{group_id}"); }
845                            } else {
846                                if VERBOSE { println!(" (no change)"); }
847                            }
848                        } else {
849                            // creates a new group and programs the change
850                            let new_id = if id < first_ending_id {
851                                assert!(last_non_end_id + 1 < first_ending_id, "no more IDs for non-accepting state");
852                                last_non_end_id += 1;
853                                last_non_end_id
854                            } else {
855                                last_ending_id += 1;
856                                last_ending_id
857                            };
858                            combinations.insert(combination, new_id);
859                            changes.push((st_id, id, new_id));
860                            if VERBOSE { println!(" -> new group #{new_id}"); }
861                        }
862                    }
863                }
864            }
865            change = !changes.is_empty();
866            for (st_id, old_group_id, new_group_id) in changes {
867                if new_group_id >= groups.len() {
868                    groups.push(BTreeSet::<StateId>::new());
869                }
870                assert!(groups[old_group_id].remove(&st_id));
871                groups[new_group_id].insert(st_id);
872                assert_eq!(st_to_group.insert(st_id, new_group_id), Some(old_group_id));
873            }
874            if VERBOSE && change { println!("---"); }
875        }
876        if VERBOSE { println!("-----------------------------------------------------------"); }
877        // removes the gaps in group numbering
878        let delta = first_ending_id - last_non_end_id - 1;
879        self.first_end_state = Some(last_non_end_id + 1);
880        if delta > 0 {
881            if VERBOSE {
882                println!("removing the gaps in group numbering: (delta={delta})");
883                println!("st_to_group: {st_to_group:?}");
884                println!("groups: {groups:?}");
885            }
886            for (_, new_st) in st_to_group.iter_mut() {
887                if *new_st > last_non_end_id {
888                    *new_st -= delta;
889                }
890            }
891            groups = groups.into_iter().enumerate()
892                .filter(|(id, _)| *id <= last_non_end_id || *id >= first_ending_id)
893                .map(|(_, g)| g)
894                .to_vec();
895            if VERBOSE {
896                println!("st_to_group: {st_to_group:?}");
897                println!("groups: {groups:?}");
898            }
899        }
900        // stores the new states; note that tokens may be lost if `separate_end_states` is false
901        // since we take the token of the first accepting state in the group
902        self.end_states = BTreeMap::<StateId, Terminal>::from_iter(
903            groups.iter().enumerate()
904                .filter_map(|(group_id, states)| states.iter()
905                    .find_map(|s| self.end_states.get(s).map(|terminal| {
906                        let mut new_terminal = terminal.clone();
907                        if let Some(state) = &mut new_terminal.mode_state {
908                            *state = st_to_group[state];
909                        }
910                        (group_id as StateId, new_terminal)
911                    })))
912        );
913
914        self.initial_state = Some(st_to_group[&self.initial_state.unwrap()] as StateId);
915        self.state_graph = self.state_graph.iter()
916            .map(|(st_id, map_sym_st)| (
917                st_to_group[st_id] as StateId,
918                map_sym_st.iter().map(|(segs, st)| (segs.clone(), st_to_group[st] as StateId)).collect::<BTreeMap<_, _>>()))
919            .collect::<BTreeMap::<StateId, BTreeMap<Segments, StateId>>>();
920        if VERBOSE {
921            println!("new_graph:   {:?}", self.state_graph);
922            println!("new_initial: {:?}", self.initial_state);
923            println!("new_end:     {:?}", self.end_states);
924            println!("-----------------------------------------------------------");
925        }
926        debug_assert!(self.is_normalized(), "optimized state machine isn't regular\nend_states={:?}\ngraph={:?}",
927                      self.end_states, self.state_graph);
928        Dfa::<Normalized> {
929            state_graph: self.state_graph,
930            initial_state: self.initial_state,
931            end_states: self.end_states,
932            first_end_state: self.first_end_state,
933            log: self.log,
934            _phantom: PhantomData,
935        }
936    }
937}
938
939impl<T> LogReader for Dfa<T> {
940    type Item = BufLog;
941
942    fn get_log(&self) -> &Self::Item {
943        &self.log
944    }
945
946    fn give_log(self) -> Self::Item {
947        self.log
948    }
949}
950
951impl<T> HasBuildErrorSource for Dfa<T> {
952    const SOURCE: BuildErrorSource = BuildErrorSource::Dfa;
953}
954
955impl Dfa<General> {
956    pub fn new() -> Dfa<General> {
957        Dfa::with_log(BufLog::new())
958    }
959}
960
961impl Dfa<Normalized> {
962    pub fn gen_tables_source_code(&self, indent: usize) -> String {
963        let mut source = Vec::<String>::new();
964        source.push("let dfa_tables = DfaTables::new(".to_string());
965        source.push("    btreemap![".to_string());
966        source.extend(graph_to_code(&self.state_graph, None, 8));
967        source.push("    ],".to_string());
968        source.push(format!("    {:?},", self.initial_state));
969        source.push("    btreemap![".to_string());
970        let es = self.end_states.iter().map(|(s, t)| format!("{} => {}", s, t.to_macro())).to_vec();
971        source.extend(es.chunks(6).map(|v| format!("        {},", v.join(", "))));
972        source.push("    ],".to_string());
973        source.push(format!("    {:?},", self.first_end_state));
974        source.push(");".to_string());
975        indent_source(vec![source], indent)
976    }
977}
978
979// ---------------------------------------------------------------------------------------------
980
981pub struct DfaBundle {
982    /// DFAs and their respective mode IDs
983    dfas: Vec<(ModeId, Dfa<General>)>,
984    /// Log of any potential process that occurred before getting the DFAs
985    log: Option<BufLog>,
986}
987
988impl DfaBundle {
989    pub fn new<T>(dfas: T) -> Self where T: IntoIterator<Item=(ModeId, Dfa<General>)> {
990        DfaBundle {
991            dfas: dfas.into_iter().collect(),
992            log: None
993        }
994    }
995
996    pub fn with_log<T>(log: BufLog, dfas: T) -> Self where T: IntoIterator<Item=(ModeId, Dfa<General>)> {
997        DfaBundle {
998            dfas: dfas.into_iter().collect(),
999            log: Some(log)
1000        }
1001    }
1002}
1003
1004// impl<T> BuildFrom<T> for Dfa<General>
1005// where
1006//     T: IntoIterator<Item=(ModeId, Dfa<General>)>,
1007// {
1008//     /// Builds a [`Dfa::<General>`] from multiple DFAs, each related to one mode.
1009//     ///
1010//     /// If an error is encountered or was already encountered before, an empty shell object
1011//     /// is built with the log detailing the error(s).
1012//     fn build_from(dfas: T) -> Self {
1013//         let dfa_builder = DfaBuilder::new();
1014//         dfa_builder.build_from_dfa_modes(dfas)
1015//     }
1016// }
1017
1018impl BuildFrom<DfaBundle> for Dfa<General> {
1019    /// Builds a [`Dfa::<General>`] from multiple DFAs, each related to one mode.
1020    ///
1021    /// If an error is encountered or was already encountered before, an empty shell object
1022    /// is built with the log detailing the error(s).
1023    fn build_from(bundle: DfaBundle) -> Self {
1024        let dfa_builder = DfaBuilder::with_log(bundle.log.unwrap_or_default());
1025        dfa_builder.build_from_dfa_modes(bundle.dfas)
1026    }
1027}
1028
1029impl BuildFrom<DfaBuilder> for Dfa<General> {
1030    /// Builds a [`Dfa::<General>`] from a [`DfaBuilder`].
1031    ///
1032    /// If an error is encountered or was already encountered before, an empty shell object
1033    /// is built with the log detailing the error(s).
1034    fn build_from(dfa_builder: DfaBuilder) -> Self {
1035        dfa_builder.build()
1036    }
1037}
1038
1039// Dfa::optimize and Dfa::normalize both generate a Dfa::<Normalized>. Depending on the need,
1040// it's best to use those methods.
1041//
1042// impl From<Dfa<General>> for Dfa<Normalized> {
1043//     /// Builds a [`Dfa::<Normalized>`] from a [`Dfa::<General>`].
1044//     ///
1045//     /// If an error is encountered or was already encountered before, an empty shell object
1046//     /// is built with the log detailing the error(s).
1047//     fn from(dfa: Dfa<General>) -> Self {
1048//         dfa.normalize() // or optimize()?
1049//     }
1050// }
1051
1052// ---------------------------------------------------------------------------------------------
1053
1054pub struct DfaTables {
1055    state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
1056    initial_state: Option<StateId>,
1057    end_states: BTreeMap<StateId, Terminal>,
1058    first_end_state: Option<StateId>,
1059}
1060
1061impl DfaTables {
1062    pub fn new(
1063        state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
1064        initial_state: Option<StateId>,
1065        end_states: BTreeMap<StateId, Terminal>,
1066        first_end_state: Option<StateId>,
1067    ) -> Self {
1068        DfaTables { state_graph, initial_state, end_states, first_end_state }
1069    }
1070}
1071
1072impl BuildFrom<DfaTables> for Dfa<Normalized> {
1073    fn build_from(source: DfaTables) -> Self {
1074        Dfa {
1075            state_graph: source.state_graph,
1076            initial_state: source.initial_state,
1077            end_states: source.end_states,
1078            first_end_state: source.first_end_state,
1079            log: BufLog::new(),
1080            _phantom: PhantomData
1081        }
1082    }
1083}
1084
1085// ---------------------------------------------------------------------------------------------
1086// Supporting functions
1087
1088fn states_to_string<T: Display>(s: &BTreeSet<T>) -> String {
1089    s.iter().map(|id| id.to_string()).join(", ")
1090}
1091
1092#[allow(dead_code)]
1093pub(crate) fn followpos_to_string(dfa_builder: &DfaBuilder) -> String {
1094    let mut fpos = dfa_builder.followpos.iter()
1095        .map(|(id, ids)| format!("{id:3} -> {}", ids.iter().map(|x| x.to_string()).join(", ")))
1096        .to_vec();
1097    fpos.sort();
1098    fpos.join("\n")
1099}
1100
1101fn node_to_string(tree: &ReTree, index: usize, basic: bool) -> String {
1102    let node = tree.get(index);
1103    let mut result = String::new();
1104    if !basic {
1105        if node.nullable.is_none() {
1106            result.push('?');
1107        } else if node.nullable.unwrap() {
1108            result.push('!');
1109        }
1110    }
1111    result.push_str(&node.to_string());
1112    let children = tree.children(index);
1113    if !children.is_empty() {
1114        result.push_str("(");
1115        result.push_str(&children.iter().map(|&c| node_to_string(&tree, c, basic)).to_vec().join(","));
1116        result.push_str(")");
1117    }
1118    result
1119}
1120
1121pub fn retree_to_str(tree: &ReTree, node: Option<usize>, emphasis: Option<usize>, show_index: bool) -> String {
1122    fn pr_join(children: Vec<(u32, String)>, str: &str, pr: u32) -> (u32, String) {
1123        (pr,
1124         children.into_iter()
1125             .map(|(p_ch, ch)| if p_ch < pr { format!("({ch})") } else { ch })
1126             .join(str))
1127    }
1128
1129    fn pr_append(child: (u32, String), str: &str, pr: u32, force_par: bool) -> (u32, String) {
1130        (pr, if force_par || child.0 < pr { format!("({}){str}", child.1) } else { format!("{}{str}", child.1) })
1131    }
1132
1133    const PR_MATCH: u32 = 1;
1134    const PR_TERM: u32 = 2;
1135    const PR_REPEAT: u32 = 3;
1136    const PR_ITEM: u32 = 4;
1137
1138    let mut children = vec![];
1139    if tree.is_empty() {
1140        return "<empty>".to_string();
1141    }
1142    let top = node.unwrap_or_else(|| tree.get_root().unwrap());
1143    for node in tree.iter_depth_simple_at(top) {
1144        let (pr, mut str) = match node.num_children() {
1145            0 => {
1146                match &node.op {
1147                    ReType::Empty => (PR_ITEM, "ε".to_string()),
1148                    ReType::End(t) => (PR_ITEM, t.to_string()),
1149                    ReType::Char(ch) => (PR_ITEM, format!("{ch:?}")),
1150                    ReType::CharRange(segments) => (PR_ITEM, format!("[{segments}]")),
1151                    ReType::String(str) => (PR_ITEM, format!("{str:?}")),
1152                    s => panic!("{s:?} should have children"),
1153                }
1154            }
1155            n => {
1156                let mut node_children = children.split_off(children.len() - n);
1157                match &node.op {
1158                    ReType::Concat => pr_join(node_children, " ", PR_TERM),
1159                    ReType::Star => pr_append(node_children.pop().unwrap(), "*", PR_REPEAT, show_index),
1160                    ReType::Plus => pr_append(node_children.pop().unwrap(), "+", PR_REPEAT, show_index),
1161                    ReType::Or => pr_join(node_children, " | ", PR_MATCH),
1162                    ReType::Lazy => pr_append(node_children.pop().unwrap(), "?", PR_REPEAT, show_index),
1163                    s => panic!("{s:?} shouldn't have {n} child(ren)"),
1164                }
1165            }
1166        };
1167        if show_index {
1168            str = format!("{}:{str}", node.index);
1169        }
1170        if Some(node.index) == emphasis {
1171            str = format!(" ►►► {str} ◄◄◄ ");
1172        }
1173        children.push((pr, str));
1174    }
1175    children.pop().unwrap().1
1176}
1177
1178// ---------------------------------------------------------------------------------------------
1179// Supporting Debug functions
1180
1181/// Debug function to display the content of a tree.
1182pub fn tree_to_string(tree: &ReTree, root: Option<usize>, basic: bool) -> String {
1183    if tree.len() > 0 {
1184        node_to_string(tree, root.unwrap_or_else(|| tree.get_root().unwrap()), basic)
1185    } else {
1186        "None".to_string()
1187    }
1188}
1189
1190impl<T> Dfa<T> {
1191    /// Debug function to display the content of a Dfa.
1192    pub fn print(&self, indent: usize) {
1193        println!("Initial state: {}", if let Some(st) = self.initial_state { st.to_string() } else { "none".to_string() });
1194        println!("Graph:");
1195        print_graph(&self.state_graph, Some(&self.end_states), indent);
1196        println!("End states:\n{: >indent$}{}", " ",
1197                 self.end_states.iter().map(|(s, t)| format!("{} => {}", s, t.to_macro())).join(", "), indent=indent);
1198    }
1199}
1200
1201fn graph_to_code(
1202    state_graph: &BTreeMap<StateId, BTreeMap<Segments, StateId>>,
1203    end_states: Option<&BTreeMap<StateId, Terminal>>,
1204    indent: usize,
1205) -> Vec<String> {
1206    let s = String::from_utf8(vec![32; indent]).unwrap();
1207    state_graph.iter().map(|(state, trans)| {
1208        let mut has_neg = false;
1209        let mut tr = trans.iter().map(|(sym, st)| {
1210            let s = (sym.to_string(), st.to_string());
1211            has_neg |= s.0.starts_with('~');
1212            s
1213        }).to_vec();
1214        if has_neg {
1215            for (sym, _) in &mut tr {
1216                if !sym.ends_with(']') {
1217                    sym.insert(0, '[');
1218                    sym.push(']');
1219                }
1220            }
1221        }
1222        format!("{s}{} => branch!({}),{}",
1223                state,
1224                tr.into_iter().map(|(sym, st)| format!("{sym} => {st}")).join(", "),
1225                end_states.and_then(|map| map.get(&state).map(|token| format!(" // {}", token))).unwrap_or(String::new()),
1226        )
1227    }).collect()
1228}
1229
1230
1231/// Debug function to display the graph of a Dfa.
1232pub fn print_graph(state_graph: &BTreeMap<StateId, BTreeMap<Segments, StateId>>, end_states: Option<&BTreeMap<StateId, Terminal>>, indent: usize) {
1233    let src = graph_to_code(state_graph, end_states, indent);
1234    println!("{}", src.join("\n"));
1235}
1236
1237// ---------------------------------------------------------------------------------------------
1238// Macros
1239
1240pub mod macros {
1241    /// Generates an `ReNode` instance.
1242    ///
1243    /// # Examples
1244    /// ```
1245    /// # use std::collections::BTreeSet;
1246    /// # use lexigram_lib::{dfa::*, node, char_reader::{UTF8_MIN, UTF8_LOW_MAX, UTF8_HIGH_MIN, UTF8_MAX}};
1247    /// # use lexigram_lib::lexer::{ActionOption, ModeOption, Terminal};
1248    /// # use lexigram_lib::segmap::Seg;
1249    /// # use lexigram_lib::segments::Segments;
1250    /// assert_eq!(node!(chr 'a'), ReNode::char('a'));
1251    /// assert_eq!(node!(['a'-'z', '0'-'9']), ReNode::char_range(Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)])));
1252    /// assert_eq!(node!(.), ReNode::char_range(Segments::from([Seg(UTF8_MIN, UTF8_LOW_MAX), Seg(UTF8_HIGH_MIN, UTF8_MAX)])));
1253    /// assert_eq!(node!(str "new"), ReNode::string("new"));
1254    /// assert_eq!(node!(=5), ReNode::end(Terminal { action: ActionOption::Token(5), channel: 0, mode: ModeOption::None, mode_state: None, pop: false }));
1255    /// assert_eq!(node!(&), ReNode::concat());
1256    /// assert_eq!(node!(|), ReNode::or());
1257    /// assert_eq!(node!(*), ReNode::star());
1258    /// assert_eq!(node!(+), ReNode::plus());
1259    /// assert_eq!(node!(e), ReNode::empty());
1260    /// ```
1261    #[macro_export]
1262    macro_rules! node {
1263        (chr $char:expr) => { $crate::dfa::ReNode::char($char) };
1264        (chr $char1:expr, $char2:expr $(;$char3:expr, $char4:expr)*) => { ($char1..=$char2)$(.chain($char3..=$char4))*.map(|c| $crate::dfa::ReNode::char(c)) };
1265        ([$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+]) => { $crate::dfa::ReNode::char_range($crate::segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1266        (~[$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+]) => { $crate::dfa::ReNode::char_range($crate::segments![~ $($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1267        (.) => { node!([DOT]) };
1268        (str $str:expr) => { $crate::dfa::ReNode::string($str) };
1269        (&) => { $crate::dfa::ReNode::concat() };
1270        (|) => { $crate::dfa::ReNode::or() };
1271        (*) => { $crate::dfa::ReNode::star() };
1272        (+) => { $crate::dfa::ReNode::plus() };
1273        (e) => { $crate::dfa::ReNode::empty() };
1274        (??) => { $crate::dfa::ReNode::lazy() };
1275        // actions:
1276        (= $id:expr) => { $crate::dfa::ReNode::end($crate::lexer::Terminal {
1277            action: $crate::lexer::ActionOption::Token($id),
1278            channel: 0,
1279            mode: $crate::lexer::ModeOption::None,
1280            mode_state: None,
1281            pop: false
1282        }) };
1283        ($id:expr) => { $crate::dfa::ReNode::end($id) };
1284        //
1285        ([$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+]) => { node!([$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1286        (~[$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+]) => { node!(~ [$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1287    }
1288
1289    #[macro_export]
1290    macro_rules! term {
1291        (= $id:expr ) =>     { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Token($id),channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1292        (more) =>            { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::More,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1293        (skip) =>            { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1294        (mode $id:expr) =>   { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::Mode($id), mode_state: None,      pop: false } };
1295        (push $id:expr) =>   { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::Push($id), mode_state: None,      pop: false } };
1296        (pushst $id:expr) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: Some($id), pop: false } };
1297        (pop) =>             { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: true  } };
1298        (# $id:expr) =>      { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: $id, mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1299    }
1300
1301    #[cfg(test)]
1302    mod tests {
1303        use crate::{branch, btreemap};
1304        use crate::dfa::{graph_to_code, ReNode};
1305        use crate::char_reader::{UTF8_HIGH_MIN, UTF8_LOW_MAX, UTF8_MAX, UTF8_MIN};
1306        use crate::lexer::{ActionOption, ModeOption, Terminal};
1307        use crate::segments::Segments;
1308        use crate::segmap::Seg;
1309
1310        #[test]
1311        fn macro_node() {
1312            assert_eq!(node!([0 - LOW_MAX, HIGH_MIN - MAX]), ReNode::char_range(Segments::dot()));
1313            assert_eq!(node!(~[0 - LOW_MAX, HIGH_MIN - MAX]), ReNode::char_range(Segments::empty()));
1314
1315            assert_eq!(node!(chr 'a'), ReNode::char('a'));
1316            assert_eq!(node!(['a'-'z', '0'-'9']), ReNode::char_range(Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)])));
1317            assert_eq!(node!(.), ReNode::char_range(Segments::from([Seg(UTF8_MIN, UTF8_LOW_MAX), Seg(UTF8_HIGH_MIN, UTF8_MAX)])));
1318            assert_eq!(node!(str "new"), ReNode::string("new"));
1319            assert_eq!(node!(=5), ReNode::end(Terminal { action: ActionOption::Token(5), channel: 0, mode: ModeOption::None, mode_state: None, pop: false }));
1320            assert_eq!(node!(&), ReNode::concat());
1321            assert_eq!(node!(|), ReNode::or());
1322            assert_eq!(node!(*), ReNode::star());
1323            assert_eq!(node!(+), ReNode::plus());
1324            assert_eq!(node!(e), ReNode::empty());
1325        }
1326
1327        #[test]
1328        fn state_graph() {
1329            let test = btreemap![
1330            1 => branch!(),
1331            2 => branch!('A' => 0),
1332            3 => branch!('B' => 1, 'C' => 2),
1333            4 => branch!('D'-'F' => 3),
1334            5 => branch!('0'-'9', '_' => 4),
1335            6 => branch!(DOT => 5),
1336            7 => branch!(~['*'] => 6),
1337            8 => branch!(~['+', '-'] => 7),
1338            9 => branch!(~['*'] => 8, ['*'] => 9),
1339        ];
1340            let s = graph_to_code(&test, None, 4);
1341            assert_eq!(s, vec![
1342                "    1 => branch!(),".to_string(),
1343                "    2 => branch!('A' => 0),".to_string(),
1344                "    3 => branch!('B' => 1, 'C' => 2),".to_string(),
1345                "    4 => branch!('D'-'F' => 3),".to_string(),
1346                "    5 => branch!('0'-'9', '_' => 4),".to_string(),
1347                "    6 => branch!(DOT => 5),".to_string(),
1348                "    7 => branch!(~['*'] => 6),".to_string(),
1349                "    8 => branch!(~['+', '-'] => 7),".to_string(),
1350                "    9 => branch!(~['*'] => 8, ['*'] => 9),".to_string(),
1351            ]);
1352        }
1353    }
1354}