Skip to main content

lexigram_lib/dfa/
mod.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3pub(crate) mod tests;
4
5use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
6use std::fmt::{Display, Formatter};
7use std::marker::PhantomData;
8use vectree::VecTree;
9use lexigram_core::CollectJoin;
10use lexigram_core::char_reader::escape_string;
11use crate::{btreeset, indent_source, General, Normalized};
12use crate::char_reader::escape_char;
13use crate::lexer::{ModeId, ModeOption, StateId, Terminal};
14use lexigram_core::log::{BufLog, LogReader, LogStatus, Logger};
15use crate::build::{BuildErrorSource, BuildFrom, HasBuildErrorSource};
16use crate::segments::Segments;
17use crate::segmap::Seg;
18use crate::take_until::TakeUntilIterator;
19
20// ---------------------------------------------------------------------------------------------
21
22#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
23pub enum ReType { // todo: remove Boxes
24    #[default] Empty,
25    End(Box<Terminal>),
26    Char(char),
27    CharRange(Box<Segments>),
28    String(Box<String>),
29    Concat,
30    Star,
31    Plus,
32    Or,
33    Lazy,
34}
35
36#[test]
37fn retype_size() {
38    let size = size_of::<ReType>();
39    assert!(size <= 16, "size of ReType is too big: {size}");
40}
41
42impl ReType {
43    pub fn is_empty(&self) -> bool {
44        matches!(self, ReType::Empty)
45    }
46
47    pub fn is_leaf(&self) -> bool {
48        matches!(self, ReType::Empty | ReType::End(_) | ReType::Char(_) | ReType::CharRange(_) | ReType::String(_))
49    }
50
51    pub fn is_nullable(&self) -> Option<bool> {
52        match self {
53            ReType::Empty | ReType::Star => Some(true),
54            ReType::End(_) | ReType::Char(_) | ReType::CharRange(_) | ReType::String(_) | ReType::Plus => Some(false),
55            ReType::Concat | ReType::Or | ReType::Lazy => None,
56        }
57    }
58
59    pub fn is_end(&self) -> bool {
60        matches!(self, ReType::End(_))
61    }
62
63    // pub fn apply_chars<F: FnMut(char) -> ()>(&self, mut f: F) {
64    //     match self {
65    //         ReType::Char(c) => f(*c),
66    //         ReType::CharRange(i) => i.0.iter().flat_map(|(a, b)| (*a..=*b)).for_each(|code| f(char::from_u32(code).unwrap())),
67    //         _ => {}
68    //     }
69    // }
70}
71
72impl Display for ReType {
73    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
74        match self {
75            ReType::Empty => write!(f, "-"),
76            ReType::End(t) => write!(f, "{t}"),
77            ReType::Char(c) => write!(f, "'{}'", escape_char(*c)),
78            ReType::CharRange(segments) => write!(f, "[{segments}]"),
79            ReType::String(s) => write!(f, "'{}'", escape_string(s)),
80            ReType::Concat => write!(f, "&"),
81            ReType::Star => write!(f, "*"),
82            ReType::Plus => write!(f, "+"),
83            ReType::Or => write!(f, "|"),
84            ReType::Lazy => write!(f, "??"),
85        }
86    }
87}
88
89type Id = u32;
90
91#[derive(Clone, Debug, PartialEq, Default)]
92pub struct ReNode {
93    id: Option<Id>,
94    op: ReType,
95    firstpos: HashSet<Id>,  // todo: use BTreeSet or Vec instead? Move to DfaBuilder?
96    lastpos: HashSet<Id>,   // todo: use BTreeSet or Vec instead? Move to DfaBuilder?
97    nullable: Option<bool>
98}
99
100impl ReNode {
101    pub fn new(node: ReType) -> ReNode {
102        ReNode { id: None, op: node, firstpos: HashSet::new(), lastpos: HashSet::new(), nullable: None }
103    }
104
105    pub fn empty() -> ReNode { ReNode::new(ReType::Empty) }
106
107    pub fn end(t: Terminal) -> ReNode { ReNode::new(ReType::End(Box::new(t))) }
108
109    pub fn char(c: char) -> ReNode { ReNode::new(ReType::Char(c)) }
110
111    pub fn char_range(s: Segments) -> ReNode { ReNode::new(ReType::CharRange(Box::new(s))) }
112
113    pub fn string<T: Into<String>>(s: T) -> ReNode { ReNode::new(ReType::String(Box::new(s.into()))) }
114
115    pub fn concat() -> ReNode { ReNode::new(ReType::Concat) }
116
117    pub fn star() -> ReNode { ReNode::new(ReType::Star) }
118
119    pub fn plus() -> ReNode { ReNode::new(ReType::Plus) }
120
121    pub fn or() -> ReNode { ReNode::new(ReType::Or) }
122
123    pub fn lazy() -> ReNode { ReNode::new(ReType::Lazy) }
124
125    pub fn is_leaf(&self) -> bool {
126        self.op.is_leaf()
127    }
128
129    pub fn is_empty(&self) -> bool {
130        self.op.is_empty()
131    }
132
133    pub fn get_type(&self) -> &ReType {
134        &self.op
135    }
136
137    pub fn is_nullable(&self) -> Option<bool> {
138        self.nullable
139    }
140
141    pub fn get_mut_type(&mut self) -> &mut ReType {
142        &mut self.op
143    }
144}
145
146impl Display for ReNode {
147    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
148        if let Some(id) = self.id {
149            write!(f, "{id}:")?;
150        }
151        self.op.fmt(f)
152    }
153}
154
155// ---------------------------------------------------------------------------------------------
156
157pub type ReTree = VecTree<ReNode>;
158
159#[derive(Clone)]
160pub struct DfaBuilder {
161    /// Regular Expression tree
162    re: ReTree,
163    /// `followpos` table, containing the `Id` -> `Id` graph of `re`
164    followpos: HashMap<Id, HashSet<Id>>,
165    /// `lazypos[id_child]` includes `id_lazy` when `id_child` is a child of a lazy operator `id_lazy`
166    lazypos: HashSet<Id>,
167    /// `Id` -> node index
168    ids: HashMap<Id, usize>,
169    log: BufLog
170}
171
172impl DfaBuilder {
173    pub fn new() -> Self {
174        Self::with_log(BufLog::new())
175    }
176
177    pub fn with_log(log: BufLog) -> Self {
178        DfaBuilder {
179            re: VecTree::new(),
180            followpos: HashMap::new(),
181            lazypos: HashSet::new(),
182            ids: HashMap::new(),
183            log
184        }
185    }
186
187    #[inline(always)]
188    pub fn get_re(&self) -> &ReTree {
189        &self.re
190    }
191
192    /// Replaces ReType::String(s) with a concatenation of ReType::Char(s[i])
193    fn preprocess_re(&mut self) {
194        let mut nodes = vec![];
195        for mut inode in self.re.iter_post_depth_simple_mut() {
196            if matches!(inode.op, ReType::String(_)) {
197                // we have to do it again to move the string
198                if let ReType::String(s) = std::mem::take(&mut inode.op) {
199                    if s.is_empty() {
200                        // will be replaced by an empty Concat, but shouldn't exist
201                        self.log.add_error(format!("node #{} is an empty string", inode.index));
202                    }
203                    nodes.push((inode.index, s));
204                }
205            }
206        }
207        for (index, s) in nodes {
208            let node = self.re.get_mut(index);
209            match s.len() {
210                0 => node.op = ReType::Char('?'), // empty string replaced by '?'
211                1 => node.op = ReType::Char(s.chars().nth(0).unwrap()),
212                _ => {
213                    node.op = ReType::Concat;
214                    for c in s.chars() {
215                        self.re.add(Some(index), ReNode::char(c));
216                    }
217                }
218            }
219        }
220    }
221
222    /// Calculates `firstpos`, `lastpost`, `nullable` for each node, and the `followpos` table.
223    fn calc_node_pos(&mut self) {
224        let mut id = 0;
225        for mut inode in self.re.iter_post_depth_mut() {
226            if inode.is_leaf() {
227                if inode.num_children() > 0 {
228                    self.log.add_error(format!("node #{} {} had {} child(ren) but shouldn't have any", inode.index, inode.op, inode.num_children()));
229                }
230                id += 1;
231                inode.id = Some(id);
232                if !inode.is_empty() {
233                    inode.firstpos.insert(id);
234                    inode.lastpos.insert(id);
235                }
236                self.ids.insert(id, inode.index);
237                if inode.op.is_end() {
238                    self.followpos.insert(id, HashSet::new());
239                }
240            } else {
241                match inode.op {
242                    ReType::Concat => {
243                        if inode.num_children() > 0 {
244                            // firstpos = union of all firstpos until the first non-nullable child (included)
245                            let mut firstpos = HashSet::<Id>::new();
246                            for child in inode.iter_children_simple().take_until(|&n| !n.nullable.unwrap()) {
247                                firstpos.extend(&child.firstpos);
248                            }
249                            inode.firstpos.extend(firstpos);
250                            // lastpos = union of all lastpos until the first non-nullable child (included), starting from the end
251                            let mut lastpos = HashSet::<Id>::new();
252                            for child in inode.iter_children_simple().rev().take_until(|&n| !n.nullable.unwrap()) {
253                                lastpos.extend(&child.lastpos);
254                            }
255                            inode.lastpos.extend(lastpos);
256                            // followpos:
257                            // for all pairs of consecutive children {c[i], c[i+1]},
258                            //     for all j in c[i].lastpos
259                            //         followpos[j].extend(c[i+1].firstpos)
260                            let mut iter = inode.iter_children_simple();
261                            let a = iter.next().unwrap();   // a is c[i]
262                            let mut lastpos = a.lastpos.clone();
263                            for b in iter {   // b is c[i+1]
264                                for j in &lastpos {
265                                    if !self.followpos.contains_key(j) {
266                                        self.followpos.insert(*j, HashSet::new());
267                                    }
268                                    self.followpos.get_mut(j).unwrap().extend(&b.firstpos);
269                                }
270                                // we must build the lastpos during the iteration
271                                // &(0,1,2,3) = &2(&1(&0(0,1),2),3) => &0.lpos=0.lpos, &1.lpos=lastpos("&",[&0,2])), ...
272                                if b.nullable.unwrap() {
273                                    lastpos.extend(&b.lastpos);
274                                } else {
275                                    lastpos = b.lastpos.clone();
276                                }
277                            }
278                        } else {
279                            self.log.add_error(format!("node #{} is Concat but has no children", inode.index))
280                        }
281                    }
282                    ReType::Star | ReType::Plus => {
283                        if inode.num_children() == 1 {
284                            // firstpos, lastpos identical to child's
285                            let firstpos = inode.iter_children_simple().next().unwrap().firstpos.iter().copied().to_vec();
286                            inode.firstpos.extend(firstpos);
287                            let lastpos = inode.iter_children_simple().next().unwrap().lastpos.iter().copied().to_vec();
288                            inode.lastpos.extend(lastpos);
289                            // followpos:
290                            // for all i in *.lastpos,
291                            //     followpos[i].extend(*.firstpos)
292
293                            for i in &inode.lastpos {
294                                if !self.followpos.contains_key(i) {
295                                    self.followpos.insert(*i, HashSet::new());
296                                }
297                                self.followpos.get_mut(i).unwrap().extend(&inode.firstpos);
298                            }
299                        } else {
300                            self.log.add_error(format!("node #{} is {:?} but has {} child(ren) instead of 1 child",
301                                                       inode.index, inode.op, inode.num_children()))
302                        }
303                    }
304                    ReType::Or => {
305                        if inode.num_children() > 0 {
306                            // firstpos, lastpost = union of children's
307                            let mut firstpos = HashSet::<Id>::new();
308                            for child in inode.iter_children_simple() {
309                                firstpos.extend(&child.firstpos);
310                            }
311                            inode.firstpos.extend(firstpos);
312                            let mut lastpos = HashSet::<Id>::new();
313                            for child in inode.iter_children_simple() {
314                                lastpos.extend(&child.lastpos);
315                            }
316                            inode.lastpos.extend(lastpos);
317                        } else {
318                            self.log.add_error(format!("node #{} is Or but has no children", inode.index));
319                        }
320                    }
321                    ReType::Lazy => {
322                        if inode.num_children() == 1 {
323                            let child = inode.iter_children_simple().next().unwrap();
324                            let firstpos = child.firstpos.clone();
325                            let lastpos = child.lastpos.clone();
326                            inode.firstpos = firstpos;
327                            inode.lastpos = lastpos;
328                            for ichild in inode.iter_post_depth_simple().filter(|node| node.is_leaf()) {
329                                self.lazypos.insert(ichild.id.unwrap());
330                            }
331                        } else {
332                            self.log.add_error(format!("node #{} is Lazy but has {} child(ren) instead of 1 child",
333                                                       inode.index, inode.num_children()));
334                        }
335                    }
336                    _ => panic!("{:?}: no way to compute firstpos/...", &*inode)
337                }
338            }
339            if let Some(nullable) = inode.op.is_nullable() {
340                inode.nullable = Some(nullable);
341            } else {
342                inode.nullable = match &inode.op {
343                    ReType::Concat | ReType::Lazy => Some(inode.iter_children_simple().all(|child| child.nullable.unwrap())),
344                    ReType::Or => Some(inode.iter_children_simple().any(|child| child.nullable.unwrap())),
345                    op => panic!("{:?} should have a fixed nullable property", op)
346                }
347            }
348        }
349    }
350
351    fn calc_states(&mut self) -> Dfa<General> {
352        const VERBOSE: bool = false;
353        const RESOLVE_END_STATES: bool = true;
354        const RM_LAZY_BRANCHES: bool = true;
355        // initial state from firstpos(top node)
356        let mut dfa = Dfa::new();
357        if !self.log.has_no_errors() {
358            return dfa;
359        }
360        if VERBOSE { println!("new DFA"); }
361        let mut current_id = 0;
362        let key = BTreeSet::from_iter(self.re.get(0).firstpos.iter().copied());
363        let mut new_states = BTreeSet::<BTreeSet<Id>>::new();
364        new_states.insert(key.clone());
365        let mut states = BTreeMap::<BTreeSet<Id>, StateId>::new();
366        states.insert(key, current_id);
367        dfa.initial_state = Some(current_id);
368        let mut conflicts = vec![];
369
370        // gathers lazy ids and their immediate followpos to remove phantom branches:
371        let mut lazy_followpos = self.lazypos.iter().copied().collect::<BTreeSet<Id>>();
372        lazy_followpos.extend(self.lazypos.iter().filter_map(|id| self.followpos.get(id)).flatten());
373        if VERBOSE { println!("lazy_followpos = {{{}}}", lazy_followpos.iter().join(", ")); }
374
375        // gets a partition of the symbol segments and changes Char to CharRange
376        let mut symbols_part = Segments::empty();
377        for id in self.ids.values() {
378            let node = self.re.get_mut(*id);
379            if let ReType::Char(c) = node.op {
380                node.op = ReType::CharRange(Box::new(Segments::from(c)));
381            }
382            if let ReType::CharRange(segments) = &node.op {
383                symbols_part.add_partition(segments);
384            }
385        }
386        if VERBOSE { println!("symbols = {symbols_part:X}"); }
387
388        // prepares the segments and their source ids
389        while let Some(s) = new_states.pop_first() {
390            let new_state_id = *states.get(&s).unwrap();
391            let is_lazy_state = s.iter().all(|id| lazy_followpos.contains(id));
392            if VERBOSE {
393                println!("- state {} = {{{}}}{}", new_state_id, states_to_string(&s), if is_lazy_state { ", lazy state" } else { "" });
394            }
395            let mut trans = BTreeMap::<Seg, BTreeSet<Id>>::new();   // transitions partitioned by `symbol_parts`
396            let mut terminals = BTreeMap::<Id, &Terminal>::new();   // all terminals (used if terminal conflicts)
397            let mut first_terminal_id: Option<Id> = None;   // first met terminal id, if any (used if terminal conflicts)
398            let mut id_transitions = BTreeSet::<Id>::new(); // ids that are destination from current state (used if terminal conflicts)
399            let mut id_terminal: Option<Id> = None;         // selected terminal id, if any (used to remove phantom branches)
400            for (symbol, id) in s.iter().map(|id| (&self.re.get(self.ids[id]).op, *id)) {
401                if symbol.is_end() {
402                    dfa.state_graph.entry(new_state_id).or_insert_with(|| {
403                        if VERBOSE { println!("  + {symbol} => create state {new_state_id}"); }
404                        BTreeMap::new()
405                    });
406                    if let ReType::End(t) = symbol {
407                        id_terminal = Some(id);
408                        if first_terminal_id.is_none() {
409                            first_terminal_id = Some(id);
410                            if new_state_id == 0 {
411                                if t.is_only_skip() {
412                                    self.log.add_warning(format!("<skip> on initial state is a risk of infinite loop on bad input ({t})"));
413                                } else if t.is_token() {
414                                    self.log.add_warning(format!("<token> on initial state returns a token on bad input ({t})"));
415                                }
416                            }
417                        }
418                        if RESOLVE_END_STATES {
419                            if let Some(t2) = terminals.insert(id, t) {
420                                panic!("overriding {id} -> {t2} with {id} -> {t} in end_states {}",
421                                       terminals.iter().map(|(id, t)| format!("{id} {t}")).join(", "));
422                            }
423                        } else if let std::collections::btree_map::Entry::Vacant(e) = dfa.end_states.entry(new_state_id) {
424                            e.insert(*t.clone());
425                            if VERBOSE { println!("  # end state: id {id} {t}"); }
426                        } else if VERBOSE {
427                            println!("  # end state: id {id} {t} ## DISCARDED since another one already taken");
428                        }
429                    } else {
430                        panic!("unexpected END symbol: {symbol:?}");
431                    }
432                } else if let ReType::CharRange(segments) = symbol {
433                    if let Some(set) = self.followpos.get(&id) {
434                        id_transitions.extend(set);
435                        let cmp = segments.intersect(&symbols_part);
436                        assert!(cmp.internal.is_empty(), "{symbols_part} # {segments} = {cmp}");
437                        if VERBOSE { println!("  + {} to {}", &cmp.common, id); }
438                        for segment in cmp.common.into_iter() {
439                            if let Some(ids) = trans.get_mut(&segment) {
440                                ids.insert(id);
441                            } else {
442                                trans.insert(segment, btreeset![id]);
443                            }
444                        }
445                    } else {
446                        self.log.add_error(format!("node #{id} is not in followpos; is an accepting state missing? Orphan segment: {segments}"));
447                    }
448                }
449            }
450            if RESOLVE_END_STATES {
451                if terminals.len() > 1 {
452                    if VERBOSE {
453                        println!("  # {id_transitions:?}");
454                        println!("  # terminal conflict: {}", terminals.iter()
455                            .map(|(id, t)| format!("{id} -> {t} (has{} trans)", if id_transitions.contains(id) { "" } else { " no" }))
456                            .join(", "));
457                    }
458                    // let terminals_str = terminals.iter().map(|(_, t)| format!("{t}")).join(", ");
459                    let ts = terminals.iter().map(|(_, &t)| t).collect::<BTreeSet<_>>();
460                    let (chosen, t) = terminals.pop_first().unwrap();
461                    conflicts.push((new_state_id, ts, t));
462                    id_terminal = Some(chosen);
463                    // self.log.add_note(format!(
464                    //     "conflicting terminals in state {new_state_id}: {terminals_str}. Selecting first defined: {t}"));
465                    if VERBOSE { println!("    selecting the first one of the list: id {chosen} {t}"); }
466                    dfa.end_states.insert(new_state_id, t.to_owned());
467                } else if let Some((id, terminal)) = terminals.pop_first() {
468                    id_terminal = Some(id);
469                    if VERBOSE { println!("  # end state: id {id} {terminal}"); }
470                    dfa.end_states.insert(new_state_id, terminal.clone());
471                }
472            }
473            let has_non_lazy_terminal = if let Some(id) = id_terminal {
474                !lazy_followpos.contains(&id)
475            } else {
476                false
477            };
478
479            // finds the destination ids (creating new states if necessary), and populates the symbols for each destination
480            let mut map = BTreeMap::<StateId, Segments>::new();
481            for (segment, ids) in trans {
482                if VERBOSE { print!("  - {} in {}: ", segment, states_to_string(&ids)); }
483                if RM_LAZY_BRANCHES && !is_lazy_state && has_non_lazy_terminal && ids.iter().all(|id| lazy_followpos.contains(id)) {
484                    if VERBOSE { println!(" => lazy, removed"); }
485                    continue;
486                }
487                let mut state = BTreeSet::new();
488                for id in ids {
489                    state.extend(&self.followpos[&id]);
490                }
491                if VERBOSE { print!("follow = {{{}}}", states_to_string(&state)); }
492                let state_id = if let Some(state_id) = states.get(&state) {
493                    if VERBOSE { println!(" => state {state_id}"); }
494                    *state_id
495                } else {
496                    new_states.insert(state.clone());
497                    current_id += 1;
498                    if VERBOSE { println!(" => new state {} = {{{}}}", current_id, states_to_string(&state)); }
499                    states.insert(state, current_id);
500                    current_id
501                };
502                if let Some(segments) = map.get_mut(&state_id) {
503                    segments.insert(segment);
504                } else {
505                    map.insert(state_id, Segments::new(segment));
506                }
507            }
508            // regroups the symbols per destination
509            for segments in map.values_mut() {
510                segments.normalize();
511            }
512            if VERBOSE {
513                for (st, int) in &map {
514                    println!("  {} -> {}", int, st);
515                }
516            }
517            // finally, updates the graph with the reverse (symbol -> state) data
518            dfa.state_graph.insert(new_state_id, map.into_iter().map(|(id, segments)| (segments, id)).collect());
519        }
520
521        // checks if there are terminals that were never selected for accepting states:
522        let state_terminals = dfa.end_states.values().collect::<BTreeSet<_>>();
523        let missing = self.ids.values()
524            .filter_map(|x| if let ReType::End(t) = &self.re.get(*x).op { Some(t.as_ref()) } else { None })
525            .filter(|x| !state_terminals.contains(x))
526            .collect::<BTreeSet<_>>();
527        if VERBOSE && !missing.is_empty() { println!("# missing: {}", missing.iter().map(|t| t.to_string()).join(", ")); }
528        for missing_t in missing {
529            let related = conflicts.iter()
530                .filter_map(|(s_id, ts, chosen)|
531                    if ts.contains(missing_t) {
532                        Some(format!("\n    - conflict in state {s_id}: {} => {chosen} chosen", ts.iter().map(|t| t.to_string()).join(" <> ")))
533                    } else {
534                        None
535                    })
536                .to_vec();
537            if !related.is_empty() {
538                self.log.add_error(format!(
539                    "{missing_t} is never selected, but it was in conflict with (an)other token(s); check if re-ordering definitions solves the problem{}",
540                    related.join("")));
541            } else {
542                self.log.add_error(format!("{missing_t} is never selected"));
543            }
544        }
545
546        dfa
547    }
548
549    fn build(mut self) -> Dfa<General> {
550        self.log.add_note("building DFA...");
551        self.calc_node_pos();
552        let mut dfa = self.calc_states();
553        // transfers all the log messages to the Dfa
554        dfa.log.extend(std::mem::replace(&mut self.log, BufLog::new()));
555        dfa
556    }
557
558    #[cfg(test)]
559    pub(crate) fn build_from_graph<T: IntoIterator<Item=(StateId, Terminal)>>(
560        &mut self, graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>, init_state: StateId, end_states: T,
561    ) -> Option<Dfa<General>>
562    {
563        let mut dfa = Dfa::<General> {
564            state_graph: graph,
565            initial_state: Some(init_state),
566            end_states: BTreeMap::from_iter(end_states),
567            first_end_state: None,
568            log: BufLog::new(),
569            _phantom: PhantomData
570        };
571        dfa.first_end_state = dfa.end_states.keys().min().cloned();
572        // TODO: add checks
573        Some(dfa)
574    }
575
576    /// Merges several DFA graphs into one. The graphs represent different modes that are called with the
577    /// `Action::pushMode(id)` action.
578    fn build_from_dfa_modes<T, U>(self, dfas: T) -> Dfa<General>
579    where
580        T: IntoIterator<Item=(ModeId, Dfa<U>)>,
581    {
582        assert!(self.re.is_empty(), "build_from_dfa_modes() used on non-empty builder");
583        let mut dfa = Dfa::<General>::with_log(self.log);
584        dfa.log.add_note("combining DFA modes...");
585        let mut init_states = BTreeMap::new();
586        for (idx, new_dfa) in dfas {
587            dfa.log.add_note(format!("- DFA mode #{idx}"));
588            dfa.log.extend(new_dfa.log);
589            if new_dfa.state_graph.is_empty() {
590                dfa.log.add_error(format!("DFA mode #{idx} is empty"));
591            }
592            let offset = if init_states.is_empty() { 0 } else { 1 + dfa.state_graph.keys().max().unwrap() };
593            dfa.initial_state = dfa.initial_state.or(new_dfa.initial_state);
594            if let Some(initial_state) = new_dfa.initial_state {
595                if init_states.insert(idx, offset + initial_state).is_some() {
596                    dfa.log.add_error(format!("DFA mode #{idx} defined multiple times"))
597                };
598            } else {
599                dfa.log.add_error(format!("DFA mode #{idx} has no initial state"));
600            }
601            for (st_from, mut map) in new_dfa.state_graph {
602                for (_, st_to) in map.iter_mut() {
603                    *st_to += offset;
604                }
605                assert!(!dfa.state_graph.contains_key(&(st_from + offset)));
606                dfa.state_graph.insert(st_from + offset, map);
607            }
608            dfa.end_states.extend(new_dfa.end_states.into_iter().map(|(st, term)| (st + offset, term)));
609        }
610        if init_states.len() > 1 {
611            for (_, term) in dfa.end_states.iter_mut() {
612                term.mode_state = match term.mode {
613                    ModeOption::None => None,
614                    ModeOption::Mode(m) | ModeOption::Push(m) => {
615                        let state_opt = init_states.get(&m);
616                        if state_opt.is_none() {
617                            dfa.log.add_error(format!("unknown mode {m} in combined DFA"));
618                        }
619                        state_opt.cloned()
620                    }
621                };
622            }
623            dfa.first_end_state = None;
624        }
625        if dfa.log.has_no_errors() {
626            Dfa::<General> {
627                state_graph: dfa.state_graph,
628                initial_state: dfa.initial_state,
629                end_states: dfa.end_states,
630                first_end_state: dfa.first_end_state,
631                log: dfa.log,
632                _phantom: PhantomData,
633            }
634        } else {
635            Dfa::with_log(dfa.log)
636        }
637    }
638}
639
640impl Default for DfaBuilder {
641    fn default() -> Self {
642        Self::new()
643    }
644}
645
646impl LogReader for DfaBuilder {
647    type Item = BufLog;
648
649    fn get_log(&self) -> &Self::Item {
650        &self.log
651    }
652
653    fn give_log(self) -> Self::Item {
654        self.log
655    }
656}
657
658impl HasBuildErrorSource for DfaBuilder {
659    const SOURCE: BuildErrorSource = BuildErrorSource::DfaBuilder;
660}
661
662impl BuildFrom<ReTree> for DfaBuilder {
663    /// Creates a [`DfaBuilder`] from a tree of regular expression [`VecTree`]<[`ReNode`]>.
664    fn build_from(regex_tree: ReTree) -> Self {
665        let mut builder = DfaBuilder {
666            re: regex_tree,
667            followpos: HashMap::new(),
668            lazypos: HashSet::new(),
669            ids: HashMap::new(),
670            log: BufLog::new()
671        };
672        builder.preprocess_re();
673        builder
674    }
675}
676
677// ---------------------------------------------------------------------------------------------
678
679pub struct Dfa<T> {
680    state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
681    initial_state: Option<StateId>,
682    end_states: BTreeMap<StateId, Terminal>,
683    first_end_state: Option<StateId>,
684    pub(crate) log: BufLog,
685    _phantom: PhantomData<T>
686}
687
688impl<T> Dfa<T> {
689    pub fn with_log(log: BufLog) -> Self {
690        Dfa {
691            state_graph: BTreeMap::new(),
692            initial_state: None,
693            end_states: BTreeMap::new(),
694            first_end_state: None,
695            log,
696            _phantom: PhantomData
697        }
698    }
699
700    pub fn get_state_graph(&self) -> &BTreeMap<StateId, BTreeMap<Segments, StateId>> {
701        &self.state_graph
702    }
703
704    pub fn get_initial_state(&self) -> &Option<StateId> {
705        &self.initial_state
706    }
707
708    pub fn get_end_states(&self) -> &BTreeMap<StateId, Terminal> {
709        &self.end_states
710    }
711
712    pub fn get_first_end_state(&self) -> &Option<StateId> {
713        &self.first_end_state
714    }
715
716    /// Checks if the DFA is normalized: incremental state numbers, starting at 0, with all the accepting states
717    /// at the end.
718    pub fn is_normalized(&self) -> bool {
719        if self.state_graph.is_empty() {
720            return true;
721        }
722        let mut states = self.state_graph.keys().to_vec();
723        states.sort();
724        if *states[0] != 0 {
725            false
726        } else {
727            let mut last_end = self.end_states.contains_key(states[0]);
728            let mut last: StateId = 0;
729            for &st in states.iter().skip(1) {
730                let end = self.end_states.contains_key(st);
731                if (*st != last + 1) || (!end && last_end) {
732                    return false;
733                }
734                last_end = end;
735                last = *st;
736            }
737            true
738        }
739    }
740
741    /// Normalizes the DFA: incremental state number0, starting at 0, with all the accepting states
742    /// at the end.
743    pub fn normalize(mut self) -> Dfa<Normalized> {
744        if !self.log.has_no_errors() {
745            // if there are errors, we bypass any processing and return a shell containing the log
746            return Dfa::<Normalized>::with_log(std::mem::take(&mut self.log));
747        }
748        self.log.add_note("normalizing DFA...");
749        let mut translate = BTreeMap::<StateId, StateId>::new();
750        let mut state_graph = BTreeMap::<StateId, BTreeMap<Segments, StateId>>::new();
751        let mut end_states = BTreeMap::<StateId, Terminal>::new();
752        let nbr_end = self.end_states.len();
753        let mut non_end_id = 0;
754        let mut end_id = self.state_graph.len() - nbr_end;
755        self.first_end_state = Some(end_id);
756        for &id in self.state_graph.keys() {
757            if let Some(terminal) = self.end_states.get(&id) {
758                translate.insert(id, end_id);
759                end_states.insert(end_id, terminal.clone());
760                end_id += 1;
761            } else {
762                translate.insert(id, non_end_id);
763                non_end_id += 1;
764            };
765        }
766        self.initial_state = self.initial_state.map(|st| *translate.get(&st).unwrap());
767        for s in end_states.iter_mut() {
768            if let Some(state) = s.1.mode_state {
769                s.1.mode_state = Some(translate[&state])
770            }
771        }
772        self.end_states = end_states;
773        for (id, mut trans) in std::mem::take(&mut self.state_graph) {
774            for (_, st) in trans.iter_mut() {
775                *st = translate[st];
776            }
777            state_graph.insert(translate[&id], trans);
778        }
779        self.state_graph = state_graph;
780        Dfa::<Normalized> {
781            state_graph: self.state_graph,
782            initial_state: self.initial_state,
783            end_states: self.end_states,
784            first_end_state: self.first_end_state,
785            log: self.log,
786            _phantom: PhantomData,
787        }
788    }
789
790    /// Optimizes the number of states from `self.state_graph`. Returns a map to convert old
791    /// state ids to new state ids.
792    pub fn optimize(mut self) -> Dfa<Normalized> {
793        const VERBOSE: bool = false;
794        if !self.log.has_no_errors() {
795            // if there are errors, we bypass any processing and return a shell containing the log
796            return Dfa::<Normalized>::with_log(std::mem::take(&mut self.log));
797        }
798        self.log.add_note("optimizing DFA...");
799        // set `separate_end_states` = `true` if different end (accepting) states should be kept apart;
800        // for example, when it's important to differentiate tokens.
801        let separate_end_states = true;
802        if VERBOSE { println!("-----------------------------------------------------------"); }
803        let mut groups = Vec::<BTreeSet<StateId>>::new();
804        let mut st_to_group = BTreeMap::<StateId, usize>::new();
805        let nbr_non_end_states = self.state_graph.len() - self.end_states.len();
806        let mut last_non_end_id = 0;
807        let first_ending_id = nbr_non_end_states + 1;
808
809        // initial partition
810        // - all non-end states
811        let mut group = BTreeSet::<StateId>::new();
812        for st in self.state_graph.keys().filter(|&st| !self.end_states.contains_key(st)) {
813            group.insert(*st);
814            st_to_group.insert(*st, 0);
815        }
816        groups.push(group);
817        // - reserves a few empty groups for later non-ending groups:
818        for _ in 1..first_ending_id {
819            groups.push(BTreeSet::new());
820        }
821        // - end states
822        if separate_end_states {
823            for st in self.end_states.keys() {
824                st_to_group.insert(*st, groups.len());
825                groups.push(BTreeSet::<StateId>::from([*st]));
826            }
827        } else {
828            st_to_group.extend(self.end_states.keys().map(|id| (*id, groups.len())));
829            groups.push(BTreeSet::from_iter(self.end_states.keys().copied()));
830        }
831        let mut last_ending_id = groups.len() - 1;
832        let mut change = true;
833        while change {
834            let mut changes = Vec::<(StateId, usize, usize)>::new();   // (state, old group, new group)
835            for (id, p) in groups.iter().enumerate().filter(|(_, g)| !g.is_empty()) {
836                // do all states have the same destination group for the same symbol?
837                if VERBOSE { println!("group #{id}: {p:?}:"); }
838                // stores combination -> group index:
839                let mut combinations = BTreeMap::<BTreeMap<&Segments, usize>, usize>::new();
840                for &st_id in p {
841                    let combination = self.state_graph.get(&st_id).unwrap().iter()
842                        .filter(|(_, st)| st_to_group.contains_key(st)) // to avoid fake "end" states
843                        .map(|(s, st)| { (s, st_to_group[st]) })
844                        .collect::<BTreeMap<_, _>>();
845                    if VERBOSE { print!("- state {st_id}{}: {combination:?}", if self.end_states.contains_key(&st_id) { " <END>" } else { "" }) };
846                    if combinations.is_empty() {
847                        combinations.insert(combination, id);   // first one remains in this group
848                        if VERBOSE { println!(" (1st, no change)"); }
849                    } else if let Some(&group_id) = combinations.get(&combination) {
850                        // programs the change if it's one of the new groups
851                        if group_id != id {
852                            changes.push((st_id, id, group_id));
853                            if VERBOSE { println!(" -> group #{group_id}"); }
854                        } else if VERBOSE {
855                            println!(" (no change)");
856                        }
857                    } else {
858                        // creates a new group and programs the change
859                        let new_id = if id < first_ending_id {
860                            assert!(last_non_end_id + 1 < first_ending_id, "no more IDs for non-accepting state");
861                            last_non_end_id += 1;
862                            last_non_end_id
863                        } else {
864                            last_ending_id += 1;
865                            last_ending_id
866                        };
867                        combinations.insert(combination, new_id);
868                        changes.push((st_id, id, new_id));
869                        if VERBOSE { println!(" -> new group #{new_id}"); }
870                    }
871                }
872            }
873            change = !changes.is_empty();
874            for (st_id, old_group_id, new_group_id) in changes {
875                if new_group_id >= groups.len() {
876                    groups.push(BTreeSet::<StateId>::new());
877                }
878                assert!(groups[old_group_id].remove(&st_id));
879                groups[new_group_id].insert(st_id);
880                assert_eq!(st_to_group.insert(st_id, new_group_id), Some(old_group_id));
881            }
882            if VERBOSE && change { println!("---"); }
883        }
884        if VERBOSE { println!("-----------------------------------------------------------"); }
885        // removes the gaps in group numbering
886        let delta = first_ending_id - last_non_end_id - 1;
887        self.first_end_state = Some(last_non_end_id + 1);
888        if delta > 0 {
889            if VERBOSE {
890                println!("removing the gaps in group numbering: (delta={delta})");
891                println!("st_to_group: {st_to_group:?}");
892                println!("groups: {groups:?}");
893            }
894            for (_, new_st) in st_to_group.iter_mut() {
895                if *new_st > last_non_end_id {
896                    *new_st -= delta;
897                }
898            }
899            groups = groups.into_iter().enumerate()
900                .filter(|(id, _)| *id <= last_non_end_id || *id >= first_ending_id)
901                .map(|(_, g)| g)
902                .to_vec();
903            if VERBOSE {
904                println!("st_to_group: {st_to_group:?}");
905                println!("groups: {groups:?}");
906            }
907        }
908        // stores the new states; note that tokens may be lost if `separate_end_states` is false
909        // since we take the token of the first accepting state in the group
910        self.end_states = BTreeMap::<StateId, Terminal>::from_iter(
911            groups.iter().enumerate()
912                .filter_map(|(group_id, states)| states.iter()
913                    .find_map(|s| self.end_states.get(s).map(|terminal| {
914                        let mut new_terminal = terminal.clone();
915                        if let Some(state) = &mut new_terminal.mode_state {
916                            *state = st_to_group[state];
917                        }
918                        (group_id as StateId, new_terminal)
919                    })))
920        );
921
922        self.initial_state = Some(st_to_group[&self.initial_state.unwrap()] as StateId);
923        self.state_graph = self.state_graph.iter()
924            .map(|(st_id, map_sym_st)| (
925                st_to_group[st_id] as StateId,
926                map_sym_st.iter().map(|(segs, st)| (segs.clone(), st_to_group[st] as StateId)).collect::<BTreeMap<_, _>>()))
927            .collect::<BTreeMap::<StateId, BTreeMap<Segments, StateId>>>();
928        if VERBOSE {
929            println!("new_graph:   {:?}", self.state_graph);
930            println!("new_initial: {:?}", self.initial_state);
931            println!("new_end:     {:?}", self.end_states);
932            println!("-----------------------------------------------------------");
933        }
934        debug_assert!(self.is_normalized(), "optimized state machine isn't regular\nend_states={:?}\ngraph={:?}",
935                      self.end_states, self.state_graph);
936        Dfa::<Normalized> {
937            state_graph: self.state_graph,
938            initial_state: self.initial_state,
939            end_states: self.end_states,
940            first_end_state: self.first_end_state,
941            log: self.log,
942            _phantom: PhantomData,
943        }
944    }
945}
946
947impl<T> LogReader for Dfa<T> {
948    type Item = BufLog;
949
950    fn get_log(&self) -> &Self::Item {
951        &self.log
952    }
953
954    fn give_log(self) -> Self::Item {
955        self.log
956    }
957}
958
959impl<T> HasBuildErrorSource for Dfa<T> {
960    const SOURCE: BuildErrorSource = BuildErrorSource::Dfa;
961}
962
963impl Dfa<General> {
964    pub fn new() -> Dfa<General> {
965        Dfa::with_log(BufLog::new())
966    }
967}
968
969impl Default for Dfa<General> {
970    fn default() -> Self {
971        Self::new()
972    }
973}
974
975impl Dfa<Normalized> {
976    pub fn gen_tables_source_code(&self, indent: usize) -> String {
977        let mut source = Vec::<String>::new();
978        source.push("let dfa_tables = DfaTables::new(".to_string());
979        source.push("    btreemap![".to_string());
980        source.extend(graph_to_code(&self.state_graph, None, 8));
981        source.push("    ],".to_string());
982        source.push(format!("    {:?},", self.initial_state));
983        source.push("    btreemap![".to_string());
984        let es = self.end_states.iter().map(|(s, t)| format!("{} => {}", s, t.to_macro())).to_vec();
985        source.extend(es.chunks(6).map(|v| format!("        {},", v.join(", "))));
986        source.push("    ],".to_string());
987        source.push(format!("    {:?},", self.first_end_state));
988        source.push(");".to_string());
989        indent_source(vec![source], indent)
990    }
991}
992
993// ---------------------------------------------------------------------------------------------
994
995pub struct DfaBundle {
996    /// DFAs and their respective mode IDs
997    dfas: Vec<(ModeId, Dfa<General>)>,
998    /// Log of any potential process that occurred before getting the DFAs
999    log: Option<BufLog>,
1000}
1001
1002impl DfaBundle {
1003    pub fn new<T>(dfas: T) -> Self where T: IntoIterator<Item=(ModeId, Dfa<General>)> {
1004        DfaBundle {
1005            dfas: dfas.into_iter().collect(),
1006            log: None
1007        }
1008    }
1009
1010    pub fn with_log<T>(log: BufLog, dfas: T) -> Self where T: IntoIterator<Item=(ModeId, Dfa<General>)> {
1011        DfaBundle {
1012            dfas: dfas.into_iter().collect(),
1013            log: Some(log)
1014        }
1015    }
1016}
1017
1018// impl<T> BuildFrom<T> for Dfa<General>
1019// where
1020//     T: IntoIterator<Item=(ModeId, Dfa<General>)>,
1021// {
1022//     /// Builds a [`Dfa::<General>`] from multiple DFAs, each related to one mode.
1023//     ///
1024//     /// If an error is encountered or was already encountered before, an empty shell object
1025//     /// is built with the log detailing the error(s).
1026//     fn build_from(dfas: T) -> Self {
1027//         let dfa_builder = DfaBuilder::new();
1028//         dfa_builder.build_from_dfa_modes(dfas)
1029//     }
1030// }
1031
1032impl BuildFrom<DfaBundle> for Dfa<General> {
1033    /// Builds a [`Dfa::<General>`] from multiple DFAs, each related to one mode.
1034    ///
1035    /// If an error is encountered or was already encountered before, an empty shell object
1036    /// is built with the log detailing the error(s).
1037    fn build_from(bundle: DfaBundle) -> Self {
1038        let dfa_builder = DfaBuilder::with_log(bundle.log.unwrap_or_default());
1039        dfa_builder.build_from_dfa_modes(bundle.dfas)
1040    }
1041}
1042
1043impl BuildFrom<DfaBuilder> for Dfa<General> {
1044    /// Builds a [`Dfa::<General>`] from a [`DfaBuilder`].
1045    ///
1046    /// If an error is encountered or was already encountered before, an empty shell object
1047    /// is built with the log detailing the error(s).
1048    fn build_from(dfa_builder: DfaBuilder) -> Self {
1049        dfa_builder.build()
1050    }
1051}
1052
1053// Dfa::optimize and Dfa::normalize both generate a Dfa::<Normalized>. Depending on the need,
1054// it's best to use those methods.
1055//
1056// impl From<Dfa<General>> for Dfa<Normalized> {
1057//     /// Builds a [`Dfa::<Normalized>`] from a [`Dfa::<General>`].
1058//     ///
1059//     /// If an error is encountered or was already encountered before, an empty shell object
1060//     /// is built with the log detailing the error(s).
1061//     fn from(dfa: Dfa<General>) -> Self {
1062//         dfa.normalize() // or optimize()?
1063//     }
1064// }
1065
1066// ---------------------------------------------------------------------------------------------
1067
1068pub struct DfaTables {
1069    state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
1070    initial_state: Option<StateId>,
1071    end_states: BTreeMap<StateId, Terminal>,
1072    first_end_state: Option<StateId>,
1073}
1074
1075impl DfaTables {
1076    pub fn new(
1077        state_graph: BTreeMap<StateId, BTreeMap<Segments, StateId>>,
1078        initial_state: Option<StateId>,
1079        end_states: BTreeMap<StateId, Terminal>,
1080        first_end_state: Option<StateId>,
1081    ) -> Self {
1082        DfaTables { state_graph, initial_state, end_states, first_end_state }
1083    }
1084}
1085
1086impl BuildFrom<DfaTables> for Dfa<Normalized> {
1087    fn build_from(source: DfaTables) -> Self {
1088        Dfa {
1089            state_graph: source.state_graph,
1090            initial_state: source.initial_state,
1091            end_states: source.end_states,
1092            first_end_state: source.first_end_state,
1093            log: BufLog::new(),
1094            _phantom: PhantomData
1095        }
1096    }
1097}
1098
1099// ---------------------------------------------------------------------------------------------
1100// Supporting functions
1101
1102fn states_to_string<T: Display>(s: &BTreeSet<T>) -> String {
1103    s.iter().map(|id| id.to_string()).join(", ")
1104}
1105
1106#[allow(dead_code)]
1107pub(crate) fn followpos_to_string(dfa_builder: &DfaBuilder) -> String {
1108    let mut fpos = dfa_builder.followpos.iter()
1109        .map(|(id, ids)| format!("{id:3} -> {}", ids.iter().map(|x| x.to_string()).join(", ")))
1110        .to_vec();
1111    fpos.sort();
1112    fpos.join("\n")
1113}
1114
1115fn node_to_string(tree: &ReTree, index: usize, basic: bool) -> String {
1116    let node = tree.get(index);
1117    let mut result = String::new();
1118    if !basic {
1119        if let Some(b) = node.nullable {
1120            if b {
1121                result.push('!');
1122            }
1123        } else {
1124            result.push('?');
1125        }
1126    }
1127    result.push_str(&node.to_string());
1128    let children = tree.children(index);
1129    if !children.is_empty() {
1130        result.push('(');
1131        result.push_str(&children.iter().map(|&c| node_to_string(tree, c, basic)).to_vec().join(","));
1132        result.push(')');
1133    }
1134    result
1135}
1136
1137pub fn retree_to_str(tree: &ReTree, node: Option<usize>, emphasis: Option<usize>, show_index: bool) -> String {
1138    fn pr_join(children: Vec<(u32, String)>, str: &str, pr: u32) -> (u32, String) {
1139        (pr,
1140         children.into_iter()
1141             .map(|(p_ch, ch)| if p_ch < pr { format!("({ch})") } else { ch })
1142             .join(str))
1143    }
1144
1145    fn pr_append(child: (u32, String), str: &str, pr: u32, force_par: bool) -> (u32, String) {
1146        (pr, if force_par || child.0 < pr { format!("({}){str}", child.1) } else { format!("{}{str}", child.1) })
1147    }
1148
1149    const PR_MATCH: u32 = 1;
1150    const PR_TERM: u32 = 2;
1151    const PR_REPEAT: u32 = 3;
1152    const PR_ITEM: u32 = 4;
1153
1154    let mut children = vec![];
1155    if tree.is_empty() {
1156        return "<empty>".to_string();
1157    }
1158    let top = node.unwrap_or_else(|| tree.get_root().unwrap());
1159    for node in tree.iter_post_depth_simple_at(top) {
1160        let (pr, mut str) = match node.num_children() {
1161            0 => {
1162                match &node.op {
1163                    ReType::Empty => (PR_ITEM, "ε".to_string()),
1164                    ReType::End(t) => (PR_ITEM, t.to_string()),
1165                    ReType::Char(ch) => (PR_ITEM, format!("{ch:?}")),
1166                    ReType::CharRange(segments) => (PR_ITEM, format!("[{segments}]")),
1167                    ReType::String(str) => (PR_ITEM, format!("{str:?}")),
1168                    s => panic!("{s:?} should have children"),
1169                }
1170            }
1171            n => {
1172                let mut node_children = children.split_off(children.len() - n);
1173                match &node.op {
1174                    ReType::Concat => pr_join(node_children, " ", PR_TERM),
1175                    ReType::Star => pr_append(node_children.pop().unwrap(), "*", PR_REPEAT, show_index),
1176                    ReType::Plus => pr_append(node_children.pop().unwrap(), "+", PR_REPEAT, show_index),
1177                    ReType::Or => pr_join(node_children, " | ", PR_MATCH),
1178                    ReType::Lazy => pr_append(node_children.pop().unwrap(), "?", PR_REPEAT, show_index),
1179                    s => panic!("{s:?} shouldn't have {n} child(ren)"),
1180                }
1181            }
1182        };
1183        if show_index {
1184            str = format!("{}:{str}", node.index);
1185        }
1186        if Some(node.index) == emphasis {
1187            str = format!(" ►►► {str} ◄◄◄ ");
1188        }
1189        children.push((pr, str));
1190    }
1191    children.pop().unwrap().1
1192}
1193
1194// ---------------------------------------------------------------------------------------------
1195// Supporting Debug functions
1196
1197/// Debug function to display the content of a tree.
1198pub fn tree_to_string(tree: &ReTree, root: Option<usize>, basic: bool) -> String {
1199    if !tree.is_empty() {
1200        node_to_string(tree, root.unwrap_or_else(|| tree.get_root().unwrap()), basic)
1201    } else {
1202        "None".to_string()
1203    }
1204}
1205
1206impl<T> Dfa<T> {
1207    /// Debug function to display the content of a Dfa.
1208    pub fn print(&self, indent: usize) {
1209        println!("Initial state: {}", if let Some(st) = self.initial_state { st.to_string() } else { "none".to_string() });
1210        println!("Graph:");
1211        print_graph(&self.state_graph, Some(&self.end_states), indent);
1212        println!("End states:\n{: >indent$}{}", " ",
1213                 self.end_states.iter().map(|(s, t)| format!("{} => {}", s, t.to_macro())).join(", "), indent=indent);
1214    }
1215}
1216
1217fn graph_to_code(
1218    state_graph: &BTreeMap<StateId, BTreeMap<Segments, StateId>>,
1219    end_states: Option<&BTreeMap<StateId, Terminal>>,
1220    indent: usize,
1221) -> Vec<String> {
1222    let s = String::from_utf8(vec![32; indent]).unwrap();
1223    state_graph.iter().map(|(state, trans)| {
1224        let mut has_neg = false;
1225        let mut tr = trans.iter().map(|(sym, st)| {
1226            let s = (sym.to_string(), st.to_string());
1227            has_neg |= s.0.starts_with('~');
1228            s
1229        }).to_vec();
1230        if has_neg {
1231            for (sym, _) in &mut tr {
1232                if !sym.ends_with(']') {
1233                    sym.insert(0, '[');
1234                    sym.push(']');
1235                }
1236            }
1237        }
1238        format!("{s}{} => branch!({}),{}",
1239                state,
1240                tr.into_iter().map(|(sym, st)| format!("{sym} => {st}")).join(", "),
1241                end_states.and_then(|map| map.get(state).map(|token| format!(" // {}", token))).unwrap_or_default(),
1242        )
1243    }).collect()
1244}
1245
1246
1247/// Debug function to display the graph of a Dfa.
1248pub fn print_graph(state_graph: &BTreeMap<StateId, BTreeMap<Segments, StateId>>, end_states: Option<&BTreeMap<StateId, Terminal>>, indent: usize) {
1249    let src = graph_to_code(state_graph, end_states, indent);
1250    println!("{}", src.join("\n"));
1251}
1252
1253// ---------------------------------------------------------------------------------------------
1254// Macros
1255
1256pub mod macros {
1257    /// Generates an `ReNode` instance.
1258    ///
1259    /// # Examples
1260    /// ```
1261    /// # use std::collections::BTreeSet;
1262    /// # use lexigram_lib::{dfa::*, node, char_reader::{UTF8_MIN, UTF8_LOW_MAX, UTF8_HIGH_MIN, UTF8_MAX}};
1263    /// # use lexigram_lib::lexer::{ActionOption, ModeOption, Terminal};
1264    /// # use lexigram_lib::segmap::Seg;
1265    /// # use lexigram_lib::segments::Segments;
1266    /// assert_eq!(node!(chr 'a'), ReNode::char('a'));
1267    /// assert_eq!(node!(['a'-'z', '0'-'9']), ReNode::char_range(Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)])));
1268    /// assert_eq!(node!(.), ReNode::char_range(Segments::from([Seg(UTF8_MIN, UTF8_LOW_MAX), Seg(UTF8_HIGH_MIN, UTF8_MAX)])));
1269    /// assert_eq!(node!(str "new"), ReNode::string("new"));
1270    /// assert_eq!(node!(=5), ReNode::end(Terminal { action: ActionOption::Token(5), channel: 0, mode: ModeOption::None, mode_state: None, pop: false }));
1271    /// assert_eq!(node!(&), ReNode::concat());
1272    /// assert_eq!(node!(|), ReNode::or());
1273    /// assert_eq!(node!(*), ReNode::star());
1274    /// assert_eq!(node!(+), ReNode::plus());
1275    /// assert_eq!(node!(e), ReNode::empty());
1276    /// ```
1277    #[macro_export]
1278    macro_rules! node {
1279        (chr $char:expr) => { $crate::dfa::ReNode::char($char) };
1280        (chr $char1:expr, $char2:expr $(;$char3:expr, $char4:expr)*) => { ($char1..=$char2)$(.chain($char3..=$char4))*.map(|c| $crate::dfa::ReNode::char(c)) };
1281        ([$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+]) => { $crate::dfa::ReNode::char_range($crate::segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1282        (~[$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+]) => { $crate::dfa::ReNode::char_range($crate::segments![~ $($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1283        (.) => { node!([DOT]) };
1284        (str $str:expr) => { $crate::dfa::ReNode::string($str) };
1285        (&) => { $crate::dfa::ReNode::concat() };
1286        (|) => { $crate::dfa::ReNode::or() };
1287        (*) => { $crate::dfa::ReNode::star() };
1288        (+) => { $crate::dfa::ReNode::plus() };
1289        (e) => { $crate::dfa::ReNode::empty() };
1290        (??) => { $crate::dfa::ReNode::lazy() };
1291        // actions:
1292        (= $id:expr) => { $crate::dfa::ReNode::end($crate::lexer::Terminal {
1293            action: $crate::lexer::ActionOption::Token($id),
1294            channel: 0,
1295            mode: $crate::lexer::ModeOption::None,
1296            mode_state: None,
1297            pop: false
1298        }) };
1299        ($id:expr) => { $crate::dfa::ReNode::end($id) };
1300        //
1301        ([$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+]) => { node!([$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1302        (~[$($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+]) => { node!(~ [$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+]) };
1303    }
1304
1305    #[macro_export]
1306    macro_rules! term {
1307        (= $id:expr ) =>     { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Token($id),channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1308        (more) =>            { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::More,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1309        (skip) =>            { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1310        (mode $id:expr) =>   { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::Mode($id), mode_state: None,      pop: false } };
1311        (push $id:expr) =>   { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::Push($id), mode_state: None,      pop: false } };
1312        (pushst $id:expr) => { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: Some($id), pop: false } };
1313        (pop) =>             { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: 0,   mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: true  } };
1314        (# $id:expr) =>      { $crate::lexer::Terminal { action: $crate::lexer::ActionOption::Skip,      channel: $id, mode: $crate::lexer::ModeOption::None,      mode_state: None,      pop: false } };
1315    }
1316
1317    #[cfg(test)]
1318    mod tests {
1319        use crate::{branch, btreemap};
1320        use crate::dfa::{graph_to_code, ReNode};
1321        use crate::char_reader::{UTF8_HIGH_MIN, UTF8_LOW_MAX, UTF8_MAX, UTF8_MIN};
1322        use crate::lexer::{ActionOption, ModeOption, Terminal};
1323        use crate::segments::Segments;
1324        use crate::segmap::Seg;
1325
1326        #[test]
1327        fn macro_node() {
1328            assert_eq!(node!([0 - LOW_MAX, HIGH_MIN - MAX]), ReNode::char_range(Segments::dot()));
1329            assert_eq!(node!(~[0 - LOW_MAX, HIGH_MIN - MAX]), ReNode::char_range(Segments::empty()));
1330
1331            assert_eq!(node!(chr 'a'), ReNode::char('a'));
1332            assert_eq!(node!(['a'-'z', '0'-'9']), ReNode::char_range(Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)])));
1333            assert_eq!(node!(.), ReNode::char_range(Segments::from([Seg(UTF8_MIN, UTF8_LOW_MAX), Seg(UTF8_HIGH_MIN, UTF8_MAX)])));
1334            assert_eq!(node!(str "new"), ReNode::string("new"));
1335            assert_eq!(node!(=5), ReNode::end(Terminal { action: ActionOption::Token(5), channel: 0, mode: ModeOption::None, mode_state: None, pop: false }));
1336            assert_eq!(node!(&), ReNode::concat());
1337            assert_eq!(node!(|), ReNode::or());
1338            assert_eq!(node!(*), ReNode::star());
1339            assert_eq!(node!(+), ReNode::plus());
1340            assert_eq!(node!(e), ReNode::empty());
1341        }
1342
1343        #[test]
1344        fn state_graph() {
1345            let test = btreemap![
1346            1 => branch!(),
1347            2 => branch!('A' => 0),
1348            3 => branch!('B' => 1, 'C' => 2),
1349            4 => branch!('D'-'F' => 3),
1350            5 => branch!('0'-'9', '_' => 4),
1351            6 => branch!(DOT => 5),
1352            7 => branch!(~['*'] => 6),
1353            8 => branch!(~['+', '-'] => 7),
1354            9 => branch!(~['*'] => 8, ['*'] => 9),
1355        ];
1356            let s = graph_to_code(&test, None, 4);
1357            assert_eq!(s, vec![
1358                "    1 => branch!(),".to_string(),
1359                "    2 => branch!('A' => 0),".to_string(),
1360                "    3 => branch!('B' => 1, 'C' => 2),".to_string(),
1361                "    4 => branch!('D'-'F' => 3),".to_string(),
1362                "    5 => branch!('0'-'9', '_' => 4),".to_string(),
1363                "    6 => branch!(DOT => 5),".to_string(),
1364                "    7 => branch!(~['*'] => 6),".to_string(),
1365                "    8 => branch!(~['+', '-'] => 7),".to_string(),
1366                "    9 => branch!(~['*'] => 8, ['*'] => 9),".to_string(),
1367            ]);
1368        }
1369    }
1370}