ere_core/
simplified_tree.rs

1//! Implements a simplified intermediate representation of a regular expression.
2
3use std::num::NonZeroUsize;
4
5use crate::{config::Config, parse_tree::*};
6
7/// For translation between our parsed [`ERE`] and the [`crate::working_nfa::WorkingNFA`]
8#[derive(Debug, PartialEq, Eq, Clone)]
9pub enum SimplifiedTreeNode {
10    /// Translates to `epsilon`
11    Empty,
12    Symbol(Atom),
13    Union(Vec<SimplifiedTreeNode>),
14    /// `Capture(child, group_num)`
15    Capture(Box<SimplifiedTreeNode>, usize),
16    Concat(Vec<SimplifiedTreeNode>),
17    /// `Repeat(child, times)`
18    Repeat(Box<SimplifiedTreeNode>, NonZeroUsize),
19    /// `UpTo(child, max_times, longest)`
20    UpTo(Box<SimplifiedTreeNode>, NonZeroUsize, bool),
21    /// `Star(child, longest)`
22    Star(Box<SimplifiedTreeNode>, bool),
23    Start,
24    End,
25    Never,
26}
27impl SimplifiedTreeNode {
28    pub fn optional(self, longest: bool) -> SimplifiedTreeNode {
29        return self.upto(1, longest);
30    }
31    pub fn union(self, other: SimplifiedTreeNode) -> SimplifiedTreeNode {
32        if let SimplifiedTreeNode::Union(mut u) = self {
33            u.push(other);
34            return SimplifiedTreeNode::Union(u);
35        }
36        return SimplifiedTreeNode::Union(vec![self, other]);
37    }
38    pub fn capture(self, group_num: usize) -> SimplifiedTreeNode {
39        return SimplifiedTreeNode::Capture(Box::new(self), group_num);
40    }
41    pub fn concat(self, other: SimplifiedTreeNode) -> SimplifiedTreeNode {
42        if let SimplifiedTreeNode::Concat(mut c) = self {
43            c.push(other);
44            return SimplifiedTreeNode::Concat(c);
45        }
46        return SimplifiedTreeNode::Concat(vec![self, other]);
47    }
48    pub fn repeat(self, count: usize) -> SimplifiedTreeNode {
49        return match NonZeroUsize::new(count) {
50            Some(count) => SimplifiedTreeNode::Repeat(self.into(), count),
51            None => SimplifiedTreeNode::Empty,
52        };
53    }
54    pub fn upto(self, count: usize, longest: bool) -> SimplifiedTreeNode {
55        return match NonZeroUsize::new(count) {
56            Some(count) => SimplifiedTreeNode::UpTo(self.into(), count, longest),
57            None => SimplifiedTreeNode::Empty,
58        };
59    }
60    pub fn star(self, longest: bool) -> SimplifiedTreeNode {
61        return SimplifiedTreeNode::Star(self.into(), longest);
62    }
63
64    // /// A non-optimized, backtracking implementation
65    // /// - `start` is the index of the original `text` it starts at
66    // ///
67    // /// # Returns
68    // /// The number of matched symbols, or `None` if no match is found.
69    // fn _check(&self, text: &str, start: usize, variation: usize) -> Option<usize> {
70    //     return match self {
71    //         SimplifiedTreeNode::Empty => Some(0),
72    //         SimplifiedTreeNode::Symbol(atom) => atom.check(text.chars().next()?).then_some(1),
73    //         SimplifiedTreeNode::Union(vec) => todo!(),
74    //         SimplifiedTreeNode::Capture(node, _) => node._check(text, start),
75    //         SimplifiedTreeNode::Concat(vec) => {
76    //             let mut size = 0;
77    //             for node in vec {
78    //                 size += node._check(&text[size..], start + size)?;
79    //             }
80    //             Some(size)
81    //         }
82    //         SimplifiedTreeNode::Repeat(node, n) => {
83    //             let mut size = 0;
84    //             for _ in 0..(*n) {
85    //                 size += node._check(&text[size..], start + size)?;
86    //             }
87    //             Some(size)
88    //         }
89    //         SimplifiedTreeNode::UpTo(node, n) => todo!(),
90    //         SimplifiedTreeNode::Star(node) => todo!(),
91    //         SimplifiedTreeNode::Start if start == 0 => Some(0),
92    //         SimplifiedTreeNode::Start => None,
93    //         SimplifiedTreeNode::End if start == text.len() => Some(0),
94    //         SimplifiedTreeNode::End => None,
95    //         SimplifiedTreeNode::Never => None,
96    //     };
97    // }
98    // /// A non-optimized, backtracking implementation
99    // pub fn check(&self, text: &str) -> bool {
100    //     for i in 0..text.len() {
101    //         if let Some(_) = self._check(&text[..i], i) {
102    //             return true;
103    //         }
104    //     }
105    //     return self._check("", text.len()).is_some();
106    // }
107
108    /// An upper bound for matched text length, in bytes.
109    /// If possibly infinite, returns `None`.
110    pub fn max_bytes(&self) -> Option<usize> {
111        return match self {
112            SimplifiedTreeNode::Empty => Some(0),
113            SimplifiedTreeNode::Symbol(atom) => {
114                let range = atom.to_ranges().last()?.clone();
115                Some(range.end().len_utf8())
116            }
117            SimplifiedTreeNode::Union(nodes) if nodes.is_empty() => Some(0),
118            SimplifiedTreeNode::Union(nodes) => nodes
119                .iter()
120                .map(SimplifiedTreeNode::max_bytes)
121                .reduce(|a, b| Some(std::cmp::max(a?, b?)))?,
122            SimplifiedTreeNode::Capture(node, _) => node.max_bytes(),
123            SimplifiedTreeNode::Concat(nodes) if nodes.is_empty() => Some(0),
124            SimplifiedTreeNode::Concat(nodes) => {
125                nodes.iter().map(SimplifiedTreeNode::max_bytes).sum()
126            }
127            SimplifiedTreeNode::Repeat(node, times) => Some(node.max_bytes()? * times.get()),
128            SimplifiedTreeNode::UpTo(node, times, _) => Some(node.max_bytes()? * times.get()),
129            SimplifiedTreeNode::Star(_, _) => None,
130            SimplifiedTreeNode::Start => Some(0),
131            SimplifiedTreeNode::End => Some(0),
132            SimplifiedTreeNode::Never => Some(0),
133        };
134    }
135
136    /// A lower bound for matched text length, in bytes.
137    pub fn min_bytes(&self) -> usize {
138        return match self {
139            SimplifiedTreeNode::Empty => 0,
140            SimplifiedTreeNode::Symbol(atom) => {
141                let Some(range) = atom.to_ranges().first().cloned() else {
142                    return 0;
143                };
144                range.start().len_utf8()
145            }
146            SimplifiedTreeNode::Union(nodes) => nodes
147                .iter()
148                .map(SimplifiedTreeNode::min_bytes)
149                .min()
150                .unwrap_or(0),
151            SimplifiedTreeNode::Capture(node, _) => node.min_bytes(),
152            SimplifiedTreeNode::Concat(nodes) => {
153                nodes.iter().map(SimplifiedTreeNode::min_bytes).sum()
154            }
155            SimplifiedTreeNode::Repeat(node, times) => node.min_bytes() * times.get(),
156            SimplifiedTreeNode::UpTo(node, times, _) => node.min_bytes() * times.get(),
157            SimplifiedTreeNode::Star(_, _) => 0,
158            SimplifiedTreeNode::Start => 0,
159            SimplifiedTreeNode::End => 0,
160            SimplifiedTreeNode::Never => 0,
161        };
162    }
163}
164impl SimplifiedTreeNode {
165    fn from_sub_ere(
166        value: &ERE,
167        mut group_num: usize,
168        config: &Config,
169    ) -> (SimplifiedTreeNode, usize) {
170        let parts = value
171            .0
172            .iter()
173            .map(|part| {
174                let (new_node, new_group_num) =
175                    SimplifiedTreeNode::from_ere_branch(&part, group_num, config);
176                group_num = new_group_num;
177                new_node
178            })
179            .collect();
180        return (SimplifiedTreeNode::Union(parts), group_num);
181    }
182    fn from_ere_branch(
183        value: &EREBranch,
184        mut group_num: usize,
185        config: &Config,
186    ) -> (SimplifiedTreeNode, usize) {
187        let parts = value
188            .0
189            .iter()
190            .map(|part| {
191                let (new_node, new_group_num) =
192                    SimplifiedTreeNode::from_ere_part(&part, group_num, config);
193                group_num = new_group_num;
194                new_node
195            })
196            .collect();
197        return (SimplifiedTreeNode::Concat(parts), group_num);
198    }
199    fn from_ere_part(
200        value: &EREPart,
201        group_num: usize,
202        config: &Config,
203    ) -> (SimplifiedTreeNode, usize) {
204        return match value {
205            EREPart::Single(expr) => {
206                SimplifiedTreeNode::from_ere_expression(expr, group_num, config)
207            }
208            EREPart::Quantified(expr, quantifier) => {
209                let (child, group_num) =
210                    SimplifiedTreeNode::from_ere_expression(expr, group_num, config);
211                let longest = config.quantifiers_prefer_longest() ^ quantifier.alt;
212                let part = match &quantifier.quantifier {
213                    QuantifierType::Star => child.star(longest),
214                    QuantifierType::Plus => child.clone().concat(child.star(longest)),
215                    QuantifierType::QuestionMark => child.optional(longest),
216                    QuantifierType::Multiple(n) => child.repeat(*n as usize),
217                    QuantifierType::Range(n, None) => child
218                        .clone()
219                        .repeat(*n as usize)
220                        .concat(child.star(longest)),
221                    QuantifierType::Range(n, Some(m)) => match m.checked_sub(*n) {
222                        None => SimplifiedTreeNode::Never,
223                        Some(0) => child.repeat(*n as usize),
224                        Some(r) => child
225                            .clone()
226                            .repeat(*n as usize)
227                            .concat(child.upto(r as usize, longest)),
228                    },
229                };
230                (part, group_num)
231            }
232            EREPart::Start => (SimplifiedTreeNode::Start, group_num),
233            EREPart::End => (SimplifiedTreeNode::End, group_num),
234        };
235    }
236    fn from_ere_expression(
237        value: &EREExpression,
238        group_num: usize,
239        config: &Config,
240    ) -> (SimplifiedTreeNode, usize) {
241        return match value {
242            EREExpression::Atom(atom) => (atom.clone().into(), group_num),
243            EREExpression::Subexpression(ere) => {
244                let (capture, next_group_num) =
245                    SimplifiedTreeNode::from_sub_ere(ere, group_num + 1, config);
246                (
247                    SimplifiedTreeNode::Capture(capture.into(), group_num),
248                    next_group_num,
249                )
250            }
251        };
252    }
253    /// Returns the simplified tree, along with the number of capture groups (full expression is group 0)
254    pub fn from_ere(value: &ERE, config: &Config) -> (SimplifiedTreeNode, usize) {
255        let (root, groups) = SimplifiedTreeNode::from_sub_ere(value, 1, config);
256        return (SimplifiedTreeNode::Capture(Box::new(root), 0), groups);
257    }
258
259    /// [`SimplifiedTreeNode::from_ere`] except it doesn't wrap in the capture group 0
260    pub(crate) fn from_ere_no_group0(value: &ERE, config: &Config) -> (SimplifiedTreeNode, usize) {
261        return SimplifiedTreeNode::from_sub_ere(value, 1, config);
262    }
263}
264impl From<ERE> for SimplifiedTreeNode {
265    fn from(value: ERE) -> Self {
266        return SimplifiedTreeNode::from_ere(&value, &Config::default()).0;
267    }
268}
269impl From<Atom> for SimplifiedTreeNode {
270    fn from(value: Atom) -> Self {
271        return SimplifiedTreeNode::Symbol(value);
272    }
273}