ere_core/
simplified_tree.rs

1//! Implements a simplified intermediate representation of a regular expression.
2
3use crate::{config::Config, parse_tree::*};
4
5/// For translation between our parsed [`ERE`] and the [`crate::working_nfa::WorkingNFA`]
6#[derive(Debug, PartialEq, Eq, Clone)]
7pub enum SimplifiedTreeNode {
8    /// Translates to `epsilon`
9    Empty,
10    Symbol(Atom),
11    Union(Vec<SimplifiedTreeNode>),
12    /// `Capture(child, group_num)`
13    Capture(Box<SimplifiedTreeNode>, usize),
14    Concat(Vec<SimplifiedTreeNode>),
15    /// `Repeat(child, times)`
16    Repeat(Box<SimplifiedTreeNode>, usize),
17    /// `UpTo(child, max_times, longest)`
18    UpTo(Box<SimplifiedTreeNode>, usize, bool),
19    /// `Star(child, longest)`
20    Star(Box<SimplifiedTreeNode>, bool),
21    Start,
22    End,
23    Never,
24}
25impl SimplifiedTreeNode {
26    pub fn optional(self, longest: bool) -> SimplifiedTreeNode {
27        return self.upto(1, longest);
28    }
29    pub fn union(self, other: SimplifiedTreeNode) -> SimplifiedTreeNode {
30        if let SimplifiedTreeNode::Union(mut u) = self {
31            u.push(other);
32            return SimplifiedTreeNode::Union(u);
33        }
34        return SimplifiedTreeNode::Union(vec![self, other]);
35    }
36    pub fn capture(self, group_num: usize) -> SimplifiedTreeNode {
37        return SimplifiedTreeNode::Capture(Box::new(self), group_num);
38    }
39    pub fn concat(self, other: SimplifiedTreeNode) -> SimplifiedTreeNode {
40        if let SimplifiedTreeNode::Concat(mut c) = self {
41            c.push(other);
42            return SimplifiedTreeNode::Concat(c);
43        }
44        return SimplifiedTreeNode::Concat(vec![self, other]);
45    }
46    pub fn repeat(self, count: usize) -> SimplifiedTreeNode {
47        return SimplifiedTreeNode::Repeat(self.into(), count);
48    }
49    pub fn upto(self, count: usize, longest: bool) -> SimplifiedTreeNode {
50        return SimplifiedTreeNode::UpTo(self.into(), count, longest);
51    }
52    pub fn star(self, longest: bool) -> SimplifiedTreeNode {
53        return SimplifiedTreeNode::Star(self.into(), longest);
54    }
55
56    // /// A non-optimized, backtracking implementation
57    // /// - `start` is the index of the original `text` it starts at
58    // ///
59    // /// # Returns
60    // /// The number of matched symbols, or `None` if no match is found.
61    // fn _check(&self, text: &str, start: usize, variation: usize) -> Option<usize> {
62    //     return match self {
63    //         SimplifiedTreeNode::Empty => Some(0),
64    //         SimplifiedTreeNode::Symbol(atom) => atom.check(text.chars().next()?).then_some(1),
65    //         SimplifiedTreeNode::Union(vec) => todo!(),
66    //         SimplifiedTreeNode::Capture(node, _) => node._check(text, start),
67    //         SimplifiedTreeNode::Concat(vec) => {
68    //             let mut size = 0;
69    //             for node in vec {
70    //                 size += node._check(&text[size..], start + size)?;
71    //             }
72    //             Some(size)
73    //         }
74    //         SimplifiedTreeNode::Repeat(node, n) => {
75    //             let mut size = 0;
76    //             for _ in 0..(*n) {
77    //                 size += node._check(&text[size..], start + size)?;
78    //             }
79    //             Some(size)
80    //         }
81    //         SimplifiedTreeNode::UpTo(node, n) => todo!(),
82    //         SimplifiedTreeNode::Star(node) => todo!(),
83    //         SimplifiedTreeNode::Start if start == 0 => Some(0),
84    //         SimplifiedTreeNode::Start => None,
85    //         SimplifiedTreeNode::End if start == text.len() => Some(0),
86    //         SimplifiedTreeNode::End => None,
87    //         SimplifiedTreeNode::Never => None,
88    //     };
89    // }
90    // /// A non-optimized, backtracking implementation
91    // pub fn check(&self, text: &str) -> bool {
92    //     for i in 0..text.len() {
93    //         if let Some(_) = self._check(&text[..i], i) {
94    //             return true;
95    //         }
96    //     }
97    //     return self._check("", text.len()).is_some();
98    // }
99
100    /// An upper bound for matched text length, in bytes.
101    /// If possibly infinite, returns `None`.
102    pub fn max_bytes(&self) -> Option<usize> {
103        return match self {
104            SimplifiedTreeNode::Empty => Some(0),
105            SimplifiedTreeNode::Symbol(atom) => {
106                let range = atom.to_ranges().last()?.clone();
107                Some(range.end().len_utf8())
108            }
109            SimplifiedTreeNode::Union(nodes) if nodes.is_empty() => Some(0),
110            SimplifiedTreeNode::Union(nodes) => nodes
111                .iter()
112                .map(SimplifiedTreeNode::max_bytes)
113                .reduce(|a, b| Some(std::cmp::max(a?, b?)))?,
114            SimplifiedTreeNode::Capture(node, _) => node.max_bytes(),
115            SimplifiedTreeNode::Concat(nodes) if nodes.is_empty() => Some(0),
116            SimplifiedTreeNode::Concat(nodes) => {
117                nodes.iter().map(SimplifiedTreeNode::max_bytes).sum()
118            }
119            SimplifiedTreeNode::Repeat(node, times) => Some(node.max_bytes()? * times),
120            SimplifiedTreeNode::UpTo(node, times, _) => Some(node.max_bytes()? * times),
121            SimplifiedTreeNode::Star(_, _) => None,
122            SimplifiedTreeNode::Start => Some(0),
123            SimplifiedTreeNode::End => Some(0),
124            SimplifiedTreeNode::Never => Some(0),
125        };
126    }
127
128    /// A lower bound for matched text length, in bytes.
129    pub fn min_bytes(&self) -> usize {
130        return match self {
131            SimplifiedTreeNode::Empty => 0,
132            SimplifiedTreeNode::Symbol(atom) => {
133                let Some(range) = atom.to_ranges().first().cloned() else {
134                    return 0;
135                };
136                range.start().len_utf8()
137            }
138            SimplifiedTreeNode::Union(nodes) => nodes
139                .iter()
140                .map(SimplifiedTreeNode::min_bytes)
141                .min()
142                .unwrap_or(0),
143            SimplifiedTreeNode::Capture(node, _) => node.min_bytes(),
144            SimplifiedTreeNode::Concat(nodes) => {
145                nodes.iter().map(SimplifiedTreeNode::min_bytes).sum()
146            }
147            SimplifiedTreeNode::Repeat(node, times) => node.min_bytes() * times,
148            SimplifiedTreeNode::UpTo(node, times, _) => node.min_bytes() * times,
149            SimplifiedTreeNode::Star(_, _) => 0,
150            SimplifiedTreeNode::Start => 0,
151            SimplifiedTreeNode::End => 0,
152            SimplifiedTreeNode::Never => 0,
153        };
154    }
155}
156impl SimplifiedTreeNode {
157    fn from_sub_ere(
158        value: &ERE,
159        mut group_num: usize,
160        config: &Config,
161    ) -> (SimplifiedTreeNode, usize) {
162        let parts = value
163            .0
164            .iter()
165            .map(|part| {
166                let (new_node, new_group_num) =
167                    SimplifiedTreeNode::from_ere_branch(&part, group_num, config);
168                group_num = new_group_num;
169                new_node
170            })
171            .collect();
172        return (SimplifiedTreeNode::Union(parts), group_num);
173    }
174    fn from_ere_branch(
175        value: &EREBranch,
176        mut group_num: usize,
177        config: &Config,
178    ) -> (SimplifiedTreeNode, usize) {
179        let parts = value
180            .0
181            .iter()
182            .map(|part| {
183                let (new_node, new_group_num) =
184                    SimplifiedTreeNode::from_ere_part(&part, group_num, config);
185                group_num = new_group_num;
186                new_node
187            })
188            .collect();
189        return (SimplifiedTreeNode::Concat(parts), group_num);
190    }
191    fn from_ere_part(
192        value: &EREPart,
193        group_num: usize,
194        config: &Config,
195    ) -> (SimplifiedTreeNode, usize) {
196        return match value {
197            EREPart::Single(expr) => {
198                SimplifiedTreeNode::from_ere_expression(expr, group_num, config)
199            }
200            EREPart::Quantified(expr, quantifier) => {
201                let (child, group_num) =
202                    SimplifiedTreeNode::from_ere_expression(expr, group_num, config);
203                let longest = config.quantifiers_prefer_longest() ^ quantifier.alt;
204                let part = match &quantifier.quantifier {
205                    QuantifierType::Star => child.star(longest),
206                    QuantifierType::Plus => child.clone().concat(child.star(longest)),
207                    QuantifierType::QuestionMark => child.optional(longest),
208                    QuantifierType::Multiple(n) => child.repeat(*n as usize),
209                    QuantifierType::Range(n, None) => child
210                        .clone()
211                        .repeat(*n as usize)
212                        .concat(child.star(longest)),
213                    QuantifierType::Range(n, Some(m)) => match m.checked_sub(*n) {
214                        None => SimplifiedTreeNode::Never,
215                        Some(0) => child.repeat(*n as usize),
216                        Some(r) => child
217                            .clone()
218                            .repeat(*n as usize)
219                            .concat(child.upto(r as usize, longest)),
220                    },
221                };
222                (part, group_num)
223            }
224            EREPart::Start => (SimplifiedTreeNode::Start, group_num),
225            EREPart::End => (SimplifiedTreeNode::End, group_num),
226        };
227    }
228    fn from_ere_expression(
229        value: &EREExpression,
230        group_num: usize,
231        config: &Config,
232    ) -> (SimplifiedTreeNode, usize) {
233        return match value {
234            EREExpression::Atom(atom) => (atom.clone().into(), group_num),
235            EREExpression::Subexpression(ere) => {
236                let (capture, next_group_num) =
237                    SimplifiedTreeNode::from_sub_ere(ere, group_num + 1, config);
238                (
239                    SimplifiedTreeNode::Capture(capture.into(), group_num),
240                    next_group_num,
241                )
242            }
243        };
244    }
245    /// Returns the simplified tree, along with the number of capture groups (full expression is group 0)
246    pub fn from_ere(value: &ERE, config: &Config) -> (SimplifiedTreeNode, usize) {
247        let (root, groups) = SimplifiedTreeNode::from_sub_ere(value, 1, config);
248        return (SimplifiedTreeNode::Capture(Box::new(root), 0), groups);
249    }
250
251    /// [`SimplifiedTreeNode::from_ere`] except it doesn't wrap in the capture group 0
252    pub(crate) fn from_ere_no_group0(value: &ERE, config: &Config) -> (SimplifiedTreeNode, usize) {
253        return SimplifiedTreeNode::from_sub_ere(value, 1, config);
254    }
255}
256impl From<ERE> for SimplifiedTreeNode {
257    fn from(value: ERE) -> Self {
258        return SimplifiedTreeNode::from_ere(&value, &Config::default()).0;
259    }
260}
261impl From<Atom> for SimplifiedTreeNode {
262    fn from(value: Atom) -> Self {
263        return SimplifiedTreeNode::Symbol(value);
264    }
265}