safe_regex_compiler/
parser.rs

1//! A parser for regular expressions.
2//!
3//! # Features
4//! - Provides a [`parse`](fn.parse.html) function that converts a regular
5//!   expression string into a [`FinalNode`](struct.FinalNode.html)
6//!   struct which is the root of an abstract syntax tree
7//! - Implements a straightforward
8//!   [contex-free grammar parser](https://www.cs.umd.edu/class/summer2015/cmsc330/parsing/)
9//! - Parses in a single pass
10//! - No recursion, no risk of stack overflow
11//! - `forbid(unsafe)`
12//! - Depends only on `std`
13//! - Good test coverage (92%)
14//!
15//! # Limitations
16//! - Parses only raw byte strings, `br"..."`.
17//! - Allocates.  Uses `Vec` and `String`.
18//!
19//! # Alternatives
20//! - [`regex-syntax`](https://crates.io/crates/regex-syntax)
21//!   - Mature
22//!   - Popular
23//!   - Maintained by the core Rust language developers
24//!   - Full of features
25//! - [`regular-expression`](https://crates.io/crates/regular-expression)
26//!   - No documentation
27#![forbid(unsafe_code)]
28use crate::escape_ascii;
29
30/// An AST node used during parsing.
31#[derive(Clone, Debug, PartialOrd, PartialEq)]
32pub enum Node {
33    Final(FinalNode),
34    NonFinal(NonFinalNode),
35}
36impl Node {
37    #[allow(clippy::must_use_candidate)]
38    #[allow(clippy::match_wildcard_for_single_variants)]
39    #[allow(clippy::missing_panics_doc)]
40    pub fn unwrap_final(self) -> FinalNode {
41        match self {
42            Node::Final(node) => node,
43            other => panic!("unwrap_final() called on value: {:?}", other),
44        }
45    }
46
47    #[allow(clippy::must_use_candidate)]
48    #[allow(clippy::match_wildcard_for_single_variants)]
49    #[allow(clippy::missing_panics_doc)]
50    pub fn unwrap_non_final(self) -> NonFinalNode {
51        match self {
52            Node::NonFinal(node) => node,
53            other => panic!("unwrap_non_final() called on value: {:?}", other),
54        }
55    }
56}
57
58/// An element of a regular expression character class.
59///
60/// [`FinalNode::Class`](enum.FinalNode.html#variant.Class) uses this.
61#[derive(Copy, Clone, PartialOrd, PartialEq)]
62pub enum ClassItem {
63    Byte(u8),
64    ByteRange(u8, u8),
65}
66impl core::fmt::Debug for ClassItem {
67    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
68        match self {
69            ClassItem::Byte(b) => write!(f, "Byte({})", escape_ascii([*b])),
70            ClassItem::ByteRange(a, b) => {
71                write!(
72                    f,
73                    "ByteRange({}-{})",
74                    escape_ascii([*a]),
75                    escape_ascii([*b])
76                )
77            }
78        }
79    }
80}
81
82/// AST nodes used internally during parsing.
83#[derive(Clone, PartialOrd, PartialEq)]
84pub enum NonFinalNode {
85    Escape,
86    HexEscape0,
87    HexEscape1(u8),
88    OpenClass0,
89    OpenClassNeg,
90    OpenClass(/* inclusive */ bool, Vec<ClassItem>),
91    OpenByteRange(u8),
92    ByteRange(u8, u8),
93    OpenGroup,
94    OpenExtendedGroup,
95    OpenNonCapturingGroup,
96    OpenAlt(Vec<FinalNode>),
97    RepeatMin(String),
98    RepeatMax(String, String),
99    RepeatToken(String, usize, Option<usize>),
100}
101impl core::fmt::Debug for NonFinalNode {
102    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
103        match self {
104            NonFinalNode::Escape => write!(f, "Escape"),
105            NonFinalNode::HexEscape0 => write!(f, "HexEscape0"),
106            NonFinalNode::HexEscape1(b) => write!(f, "HexEscape1({})", escape_ascii([*b])),
107            NonFinalNode::OpenClass0 => write!(f, "OpenClass0"),
108            NonFinalNode::OpenClassNeg => write!(f, "OpenClassNeg"),
109            NonFinalNode::OpenClass(true, items) => {
110                write!(f, "OpenClass{items:?}")
111            }
112            NonFinalNode::OpenClass(false, items) => {
113                write!(f, "OpenClass^{items:?}")
114            }
115            NonFinalNode::OpenByteRange(b) => write!(f, "OpenByteRange({})", escape_ascii([*b])),
116            NonFinalNode::ByteRange(a, b) => write!(
117                f,
118                "ByteRange({}-{})",
119                escape_ascii([*a]),
120                escape_ascii([*b])
121            ),
122            NonFinalNode::OpenGroup => write!(f, "OpenGroup"),
123            NonFinalNode::OpenExtendedGroup => write!(f, "OpenExtendedGroup"),
124            NonFinalNode::OpenNonCapturingGroup => write!(f, "OpenNonCapturingGroup"),
125            NonFinalNode::OpenAlt(nodes) => write!(f, "OpenAlt{nodes:?}"),
126            NonFinalNode::RepeatMin(min) => write!(f, "RepeatMin({min})"),
127            NonFinalNode::RepeatMax(min, max) => write!(f, "RepeatMax({min},{max})"),
128            NonFinalNode::RepeatToken(printable, min, opt_max) => {
129                write!(f, "RepeatToken({printable:?},{min},{opt_max:?})")
130            }
131        }
132    }
133}
134impl NonFinalNode {
135    /// Parsing can fail when a `NonFinalNode` is not converted into a
136    /// `FinalNode`.  This function returns an explanation to show to the user.
137    #[must_use]
138    pub fn reason(&self) -> String {
139        match self {
140            NonFinalNode::Escape => "incomplete escape sequence: `\\`".to_string(),
141            NonFinalNode::HexEscape0 => "incomplete escape sequence: `\\x`".to_string(),
142            NonFinalNode::HexEscape1(d) => {
143                format!("incomplete escape sequence: `\\x{}`", escape_ascii([*d]))
144            }
145            NonFinalNode::OpenClass0
146            | NonFinalNode::OpenClassNeg
147            | NonFinalNode::OpenClass(..)
148            | NonFinalNode::ByteRange(..) => "missing closing `]`".to_string(),
149            NonFinalNode::OpenByteRange(b) => {
150                format!("missing byte to close range: `{}-`", escape_ascii([*b]))
151            }
152            NonFinalNode::OpenGroup
153            | NonFinalNode::OpenExtendedGroup
154            | NonFinalNode::OpenNonCapturingGroup => "missing closing `)`".to_string(),
155            NonFinalNode::OpenAlt(_) => "missing element after bar `|`".to_string(),
156            NonFinalNode::RepeatMin(min) => {
157                format!("missing closing `}}` symbol: `{{{min}`")
158            }
159            NonFinalNode::RepeatMax(min, max) => {
160                format!("missing closing `}}` symbol: `{{{min},{max}`")
161            }
162            NonFinalNode::RepeatToken(printable, _, _) => {
163                format!("missing element before repeat element: `{printable}`")
164            }
165        }
166    }
167
168    /// Returns the contents of this `NonFinalNode::OpenClass(..)`.
169    /// Panics if this is a different enum variant.
170    #[allow(clippy::must_use_candidate)]
171    #[allow(clippy::missing_panics_doc)]
172    pub fn unwrap_open_class(self) -> (bool, Vec<ClassItem>) {
173        match self {
174            NonFinalNode::OpenClass(incl, items) => (incl, items),
175            other => panic!("unwrap_open_class() called on value: {:?}", other),
176        }
177    }
178
179    /// Returns the contents of this `NonFinalNode::OpenAlt(..)`.
180    /// Panics if this is a different enum variant.
181    #[allow(clippy::must_use_candidate)]
182    #[allow(clippy::missing_panics_doc)]
183    pub fn unwrap_open_alt(self) -> Vec<FinalNode> {
184        match self {
185            NonFinalNode::OpenAlt(nodes) => nodes,
186            other => panic!("unwrap_open_alt() called on value: {:?}", other),
187        }
188    }
189
190    /// Returns the contents of this `NonFinalNode::RepeatMin(..)`.
191    /// Panics if this is a different enum variant.
192    #[allow(clippy::must_use_candidate)]
193    #[allow(clippy::missing_panics_doc)]
194    pub fn unwrap_repeat_min(self) -> String {
195        match self {
196            NonFinalNode::RepeatMin(min) => min,
197            other => panic!("unwrap_repeat_min() called on value: {:?}", other),
198        }
199    }
200
201    /// Returns the contents of this `NonFinalNode::RepeatMax(..)`.
202    /// Panics if this is a different enum variant.
203    #[allow(clippy::must_use_candidate)]
204    #[allow(clippy::missing_panics_doc)]
205    pub fn unwrap_repeat_max(self) -> (String, String) {
206        match self {
207            NonFinalNode::RepeatMax(min, max) => (min, max),
208            other => panic!("unwrap_repeat_max() called on value: {:?}", other),
209        }
210    }
211}
212
213/// A completely parsed element of a regular expression
214/// [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
215/// (AST).
216///
217/// A `FinalNode` could represent an entire regular expression (it is the root)
218/// or a small part of one.
219///
220/// Since a regular expression ultimately processes bytes, the leaves of the
221/// AST are nodes with bytes:
222/// - [`Byte`](#variant.Byte)
223/// - [`AnyByte`](#variant.AnyByte)
224///
225/// All other variants are edges of the AST:
226/// - [`Seq`](#variant.Seq)
227/// - [`Class`](#variant.Class)
228/// - [`Group`](#variant.Group)
229/// - [`Alt`](#variant.Alt)
230/// - [`Repeat`](#variant.Repeat)
231#[derive(Clone, PartialOrd, PartialEq)]
232pub enum FinalNode {
233    /// `Byte(u8)`
234    ///
235    /// A byte of input.
236    /// This may be a non-printable byte that was escaped
237    /// in the source regular expression.
238    ///
239    /// # Examples
240    /// ```
241    /// use safe_regex_compiler::parser::parse;
242    /// use safe_regex_compiler::parser::FinalNode;
243    /// assert_eq!(
244    ///     Ok(FinalNode::Byte(b'a')),
245    ///     parse(br"a"),
246    /// );
247    /// assert_eq!(
248    ///     Ok(FinalNode::Byte(b'\n')),
249    ///     parse(br"\n"),
250    /// );
251    /// assert_eq!(
252    ///     Ok(FinalNode::Byte(b'\\')),
253    ///     parse(br"\\"),
254    /// );
255    /// assert_eq!(
256    ///     Ok(FinalNode::Byte(0x12)),
257    ///     parse(br"\x12"),
258    /// );
259    /// ```
260    Byte(u8),
261
262    /// `AnyByte`
263    ///
264    /// Matches any byte of input.  This is the `.` operator.
265    ///
266    /// # Example
267    /// ```
268    /// use safe_regex_compiler::parser::parse;
269    /// use safe_regex_compiler::parser::FinalNode;
270    /// assert_eq!(
271    ///     Ok(FinalNode::AnyByte),
272    ///     parse(br"."),
273    /// );
274    /// ```
275    AnyByte,
276
277    /// `Seq(Vec<FinalNode>)`
278    ///
279    /// A sequence of nodes.
280    ///
281    /// # Example
282    /// ```
283    /// use safe_regex_compiler::parser::parse;
284    /// use safe_regex_compiler::parser::FinalNode;
285    /// assert_eq!(
286    ///     Ok(FinalNode::Seq(vec![
287    ///         FinalNode::Byte(b'a'),
288    ///         FinalNode::Byte(b'b')],
289    ///     )),
290    ///     parse(br"ab"),
291    /// );
292    /// ```
293    Seq(Vec<FinalNode>),
294
295    /// `Class(inclusive: bool, Vec<ClassItem>)`
296    ///
297    /// A character class.  Matches any byte in the class.
298    ///
299    /// See [`ClassItem`](enum.ClassItem.html)
300    ///
301    /// # Examples
302    /// ```
303    /// use safe_regex_compiler::parser::parse;
304    /// use safe_regex_compiler::parser::FinalNode;
305    /// use safe_regex_compiler::parser::ClassItem;
306    /// assert_eq!(
307    ///     Ok(FinalNode::Class(true, vec![
308    ///         ClassItem::Byte(b'a'),
309    ///         ClassItem::Byte(b'b'),
310    ///     ])),
311    ///     parse(br"[ab]"),
312    /// );
313    /// assert_eq!(
314    ///     Ok(FinalNode::Class(false, vec![
315    ///         ClassItem::Byte(b'a'),
316    ///         ClassItem::Byte(b'b'),
317    ///     ])),
318    ///     parse(br"[^ab]"),
319    /// );
320    /// assert_eq!(
321    ///     Ok(FinalNode::Class(true, vec![
322    ///         ClassItem::ByteRange(b'0', b'9'),
323    ///     ])),
324    ///     parse(br"[0-9]"),
325    /// );
326    /// ```
327    Class(/* inclusive */ bool, Vec<ClassItem>),
328
329    /// `Group(Box<FinalNode>)`
330    ///
331    /// A capturing group of nodes.
332    /// Regular expression authors use it to override operator precedence rules
333    /// and to capture sub-slices of input.
334    ///
335    /// # Examples
336    /// ```
337    /// use safe_regex_compiler::parser::parse;
338    /// use safe_regex_compiler::parser::FinalNode;
339    /// assert_eq!(
340    ///    Ok(FinalNode::Seq(vec![
341    ///       FinalNode::Byte(b'a'),
342    ///       FinalNode::Group(Box::new(
343    ///          FinalNode::Alt(vec![
344    ///             FinalNode::Byte(b'b'),
345    ///             FinalNode::Byte(b'c'),
346    ///          ])
347    ///       )),
348    ///    ])),
349    ///    parse(br"a(b|c)"),
350    /// );
351    /// assert_eq!(
352    ///    Ok(FinalNode::Repeat(
353    ///       Box::new(FinalNode::Group(Box::new(
354    ///           FinalNode::Seq(vec![
355    ///              FinalNode::Byte(b'a'),
356    ///              FinalNode::Byte(b'b'),
357    ///           ])
358    ///       ))),
359    ///       0,    // min
360    ///       None, // max
361    ///    )),
362    ///    parse(br"(ab)*"),
363    /// );
364    /// ```
365    Group(Box<FinalNode>),
366
367    /// `NonCapturingGroup(Box<FinalNode>)`
368    ///
369    /// A non-capturing group of nodes.
370    /// Regular expression authors use it to override operator precedence rules.
371    ///
372    /// # Examples
373    /// ```
374    /// use safe_regex_compiler::parser::parse;
375    /// use safe_regex_compiler::parser::FinalNode;
376    /// assert_eq!(
377    ///    Ok(FinalNode::Seq(vec![
378    ///       FinalNode::Byte(b'a'),
379    ///       FinalNode::NonCapturingGroup(Box::new(
380    ///          FinalNode::Alt(vec![
381    ///             FinalNode::Byte(b'b'),
382    ///             FinalNode::Byte(b'c'),
383    ///          ])
384    ///       )),
385    ///    ])),
386    ///    parse(br"a(?:b|c)"),
387    /// );
388    /// assert_eq!(
389    ///    Ok(FinalNode::Repeat(
390    ///       Box::new(FinalNode::NonCapturingGroup(Box::new(
391    ///           FinalNode::Seq(vec![
392    ///              FinalNode::Byte(b'a'),
393    ///              FinalNode::Byte(b'b'),
394    ///           ])
395    ///       ))),
396    ///       0,    // min
397    ///       None, // max
398    ///    )),
399    ///    parse(br"(?:ab)*"),
400    /// );
401    /// ```
402    NonCapturingGroup(Box<FinalNode>),
403
404    /// `Alt(Vec<FinalNode>)`
405    ///
406    /// A list of alternate nodes.  The input can match any of them.
407    ///
408    /// # Examples
409    /// ```
410    /// use safe_regex_compiler::parser::parse;
411    /// use safe_regex_compiler::parser::FinalNode;
412    /// assert_eq!(
413    ///     Ok(FinalNode::Alt(vec![
414    ///         FinalNode::Byte(b'a'),
415    ///         FinalNode::Byte(b'b'),
416    ///     ])),
417    ///     parse(br"a|b"),
418    /// );
419    /// assert_eq!(
420    ///     Ok(FinalNode::Alt(vec![
421    ///         FinalNode::Byte(b'a'),
422    ///         FinalNode::Byte(b'b'),
423    ///         FinalNode::Seq(vec![
424    ///             FinalNode::AnyByte,
425    ///             FinalNode::Byte(b'c'),
426    ///         ]),
427    ///     ])),
428    ///     parse(br"a|b|.c"),
429    /// );
430    /// ```
431    Alt(Vec<FinalNode>),
432
433    /// `Repeat(Box<FinalNode>, min: usize, max: Option<usize>)`
434    ///
435    /// A repetition of a node.  It contains the minimum number of repetitions
436    /// and an optional inclusive maximum number.
437    ///
438    /// # Examples
439    /// ```
440    /// use safe_regex_compiler::parser::parse;
441    /// use safe_regex_compiler::parser::FinalNode;
442    /// assert_eq!(
443    ///     Ok(FinalNode::Repeat(
444    ///         Box::new(FinalNode::Byte(b'a')),
445    ///         0,       // min
446    ///         Some(1), // max
447    ///     )),
448    ///     parse(br"a?"),
449    /// );
450    /// assert_eq!(
451    ///     Ok(FinalNode::Repeat(
452    ///         Box::new(FinalNode::Byte(b'a')),
453    ///         0,    // min
454    ///         None, // max
455    ///     )),
456    ///     parse(br"a*"),
457    /// );
458    /// assert_eq!(
459    ///     Ok(FinalNode::Repeat(
460    ///         Box::new(FinalNode::Byte(b'a')),
461    ///         1,    // min
462    ///         None, // max
463    ///     )),
464    ///     parse(br"a+"),
465    /// );
466    /// assert_eq!(
467    ///     Ok(FinalNode::Repeat(
468    ///         Box::new(FinalNode::Byte(b'a')),
469    ///         5,       // min
470    ///         Some(5), // max
471    ///     )),
472    ///     parse(br"a{5}"),
473    /// );
474    /// assert_eq!(
475    ///     Ok(FinalNode::Repeat(
476    ///         Box::new(FinalNode::Byte(b'a')),
477    ///         5,    // min
478    ///         None, // max
479    ///     )),
480    ///     parse(br"a{5,}"),
481    /// );
482    /// assert_eq!(
483    ///     Ok(FinalNode::Repeat(
484    ///         Box::new(FinalNode::Byte(b'a')),
485    ///         0,       // min
486    ///         Some(7), // max
487    ///     )),
488    ///     parse(br"a{,7}"),
489    /// );
490    /// assert_eq!(
491    ///     Ok(FinalNode::Repeat(
492    ///         Box::new(FinalNode::Byte(b'a')),
493    ///         5,       // min
494    ///         Some(7), // max
495    ///     )),
496    ///     parse(br"a{5,7}"),
497    /// );
498    /// assert_eq!(
499    ///     Ok(FinalNode::Repeat(
500    ///         Box::new(FinalNode::Byte(b'a')),
501    ///         0,    // min
502    ///         None, // max
503    ///     )),
504    ///     parse(br"a{,}"),
505    /// );
506    /// ```
507    Repeat(Box<FinalNode>, usize, Option<usize>),
508}
509impl FinalNode {
510    /// Assumes this is a `FinalNode::Alt(_)` and returns its contents.
511    /// Panics if this is a different enum variant.
512    #[allow(clippy::must_use_candidate)]
513    #[allow(clippy::missing_panics_doc)]
514    pub fn unwrap_alt(self) -> Vec<FinalNode> {
515        match self {
516            FinalNode::Alt(nodes) => nodes,
517            other => panic!("unwrap_alt() called on value: {:?}", other),
518        }
519    }
520}
521impl core::fmt::Debug for FinalNode {
522    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
523        match self {
524            FinalNode::Byte(b) => write!(f, "Byte({})", escape_ascii([*b])),
525            FinalNode::AnyByte => write!(f, "AnyByte"),
526            FinalNode::Seq(nodes) => write!(f, "Seq{nodes:?}"),
527            FinalNode::Class(true, items) => write!(f, "Class{items:?}"),
528            FinalNode::Class(false, items) => write!(f, "Class^{items:?}"),
529            FinalNode::Group(nodes) => write!(f, "Group({nodes:?})"),
530            FinalNode::NonCapturingGroup(nodes) => write!(f, "NonCapturingGroup({nodes:?})"),
531            FinalNode::Alt(nodes) => write!(f, "Alt{nodes:?}"),
532            FinalNode::Repeat(node, min, opt_max) => {
533                write!(f, "Repeat({node:?},{min}-{opt_max:?})")
534            }
535        }
536    }
537}
538
539/// Applies one parser rule.  Each rule may do any of:
540/// - Match against the next byte of input, `byte`
541/// - Consume the byte by calling `byte.take()`
542/// - Match against the node at the top of the stack, `last`
543/// - Consume the node by calling `last.take()`
544/// - Match against the second-from-top node on the stack, `prev`
545/// - Consume the node by calling `prev.take()`
546/// - Return a new node to put on the stack above `prev` and `last`
547/// - Return an error string with an explanation for the regex author
548///
549/// There are two kinds of parser rule:
550/// - A byte consumer rule consumes a byte of input and creates
551///   a new node or modifies an existing node.
552/// - A node combiner rule merges two nodes into one node.
553///   It may modify one node and discard the other, or replace both with
554///   one new node.
555///
556/// The parser must check all combiner rules before byte consumer
557/// rules.  This is necessary for operator precedence.
558#[allow(clippy::too_many_lines)]
559fn apply_rule_once(
560    mut prev: &mut Option<Node>,
561    mut last: &mut Option<Node>,
562    byte: &mut Option<u8>,
563) -> Result<Option<Node>, String> {
564    use FinalNode::{Alt, AnyByte, Byte, Class, Group, NonCapturingGroup, Repeat, Seq};
565    use Node::{Final, NonFinal};
566    use NonFinalNode::{
567        ByteRange, Escape, HexEscape0, HexEscape1, OpenAlt, OpenByteRange, OpenClass, OpenClass0,
568        OpenClassNeg, OpenExtendedGroup, OpenGroup, OpenNonCapturingGroup, RepeatMax, RepeatMin,
569        RepeatToken,
570    };
571    #[allow(clippy::match_same_arms, clippy::unnested_or_patterns)]
572    match (&mut prev, &mut last, byte.map(|b| b)) {
573        (Some(_), None, _) => unreachable!(),
574        // Combine class nodes
575        (Some(NonFinal(OpenByteRange(a))), Some(Final(Byte(b))), _) => {
576            let node = NonFinal(ByteRange(*a, *b));
577            prev.take();
578            last.take();
579            Ok(Some(node))
580        }
581        (Some(NonFinal(OpenClass0)), Some(Final(Byte(b))), _) => {
582            let node = NonFinal(OpenClass(true, vec![ClassItem::Byte(*b)]));
583            prev.take();
584            last.take();
585            Ok(Some(node))
586        }
587        (Some(NonFinal(OpenClassNeg)), Some(Final(Byte(b))), _) => {
588            let node = NonFinal(OpenClass(false, vec![ClassItem::Byte(*b)]));
589            prev.take();
590            last.take();
591            Ok(Some(node))
592        }
593        (Some(NonFinal(OpenClass(_, items))), Some(Final(Byte(b))), _) => {
594            let item = ClassItem::Byte(*b);
595            last.take();
596            items.push(item);
597            Ok(None)
598        }
599        (Some(NonFinal(OpenClass(_, items))), Some(NonFinal(ByteRange(a, b))), _) => {
600            let item = ClassItem::ByteRange(*a, *b);
601            last.take();
602            items.push(item);
603            Ok(None)
604        }
605
606        // Combine repeat tokens
607        (None, Some(NonFinal(RepeatToken(printable, _, _))), _)
608        | (Some(NonFinal(_)), Some(NonFinal(RepeatToken(printable, _, _))), _) => Err(format!(
609            "missing element before repeat element: `{printable}`"
610        )),
611        (Some(Final(Seq(items))), Some(NonFinal(RepeatToken(_, min, opt_max))), _) => {
612            let min_copy = *min;
613            let opt_max_copy = *opt_max;
614            last.take();
615            let last_item = items.pop().unwrap();
616            items.push(Repeat(Box::new(last_item), min_copy, opt_max_copy));
617            Ok(None)
618        }
619        (Some(Final(_)), Some(NonFinal(RepeatToken(_, min, opt_max))), _) => {
620            let inner = prev.take().unwrap().unwrap_final();
621            let node = Final(Repeat(Box::new(inner), *min, *opt_max));
622            last.take();
623            Ok(Some(node))
624        }
625
626        // Combine group tokens
627        (Some(NonFinal(OpenGroup)), Some(Final(_)), None) => Err(OpenGroup.reason()),
628        (Some(NonFinal(OpenExtendedGroup)), Some(Final(_)), None) => {
629            Err(OpenExtendedGroup.reason())
630        }
631        (Some(NonFinal(OpenNonCapturingGroup)), Some(Final(_)), None) => {
632            Err(OpenNonCapturingGroup.reason())
633        }
634        // Combine Seq tokens
635        (Some(Final(Seq(nodes))), Some(Final(_)), _) => {
636            let node = last.take().unwrap().unwrap_final();
637            nodes.push(node);
638            Ok(None)
639        }
640
641        // Combine alternation/or nodes
642        (Some(NonFinal(OpenAlt(_))), Some(Final(_)), b)
643            if b != Some(b'?') && b != Some(b'+') && b != Some(b'*') && b != Some(b'{') =>
644        {
645            let node = last.take().unwrap().unwrap_final();
646            let mut nodes = prev.take().unwrap().unwrap_non_final().unwrap_open_alt();
647            nodes.push(node);
648            Ok(Some(Final(Alt(nodes))))
649        }
650        (Some(Final(Alt(nodes))), Some(Final(_)), b)
651            if b != Some(b'?') && b != Some(b'+') && b != Some(b'*') && b != Some(b'{') =>
652        {
653            let node = last.take().unwrap().unwrap_final();
654            if let Seq(ref mut seq_nodes) = nodes.last_mut().unwrap() {
655                seq_nodes.push(node);
656            } else {
657                let prev_node = nodes.pop().unwrap();
658                nodes.push(Seq(vec![prev_node, node]));
659            }
660            Ok(None)
661        }
662
663        // Create Seq token
664        // For proper precedence, these rules must appear after all the
665        // specialized `Some(Final(something))` rules above.
666        (Some(Final(_)), Some(Final(_)), b)
667            if b != Some(b'?') && b != Some(b'+') && b != Some(b'*') && b != Some(b'{') =>
668        {
669            Ok(Some(Final(Seq(vec![
670                prev.take().unwrap().unwrap_final(),
671                last.take().unwrap().unwrap_final(),
672            ]))))
673        }
674
675        // Escape `\n`
676        (_, Some(NonFinal(Escape)), Some(b'\\')) => {
677            last.take();
678            byte.take();
679            Ok(Some(Final(Byte(b'\\'))))
680        }
681        (_, _, Some(b'\\')) => {
682            byte.take();
683            Ok(Some(NonFinal(Escape)))
684        }
685        (_, Some(NonFinal(Escape)), Some(b'n')) => {
686            last.take();
687            byte.take();
688            Ok(Some(Final(Byte(b'\n'))))
689        }
690        (_, Some(NonFinal(Escape)), Some(b'r')) => {
691            last.take();
692            byte.take();
693            Ok(Some(Final(Byte(b'\r'))))
694        }
695        (_, Some(NonFinal(Escape)), Some(b't')) => {
696            last.take();
697            byte.take();
698            Ok(Some(Final(Byte(b'\t'))))
699        }
700        (_, Some(NonFinal(Escape)), Some(b'0')) => {
701            last.take();
702            byte.take();
703            Ok(Some(Final(Byte(0))))
704        }
705        (_, Some(NonFinal(Escape)), Some(b)) if b"'\"?+.*^$|(){}[]".contains(&b) => {
706            let node = Final(Byte(b));
707            last.take();
708            byte.take();
709            Ok(Some(node))
710        }
711
712        // Hex escape `\x20`
713        (_, Some(NonFinal(Escape)), Some(b'x')) => {
714            last.take();
715            byte.take();
716            Ok(Some(NonFinal(HexEscape0)))
717        }
718        (_, Some(NonFinal(Escape)), Some(d)) => {
719            Err(format!("invalid escape sequence `\\{}`", escape_ascii([d])))
720        }
721        (_, Some(NonFinal(HexEscape0)), Some(d)) => {
722            last.take();
723            byte.take();
724            Ok(Some(NonFinal(HexEscape1(d))))
725        }
726        (_, Some(NonFinal(HexEscape1(d1))), Some(d0))
727            if d1.is_ascii_hexdigit() && d0.is_ascii_hexdigit() =>
728        {
729            let string = String::from_utf8(vec![*d1, d0]).unwrap();
730            let b = u8::from_str_radix(&string, 16).unwrap();
731            last.take();
732            byte.take();
733            Ok(Some(Final(Byte(b))))
734        }
735        (_, Some(NonFinal(HexEscape1(d1))), Some(d0)) => Err(format!(
736            "invalid escape sequence `\\x{}`",
737            escape_ascii([*d1, d0])
738        )),
739
740        // Class `[ab0-9]`, `[^-ab0-9]`
741        (_, Some(NonFinal(OpenClass0)), Some(b'['))
742        | (_, Some(NonFinal(OpenClassNeg)), Some(b'['))
743        | (_, Some(NonFinal(OpenClass(..))), Some(b'[')) => {
744            byte.take();
745            Ok(Some(Final(Byte(b'['))))
746        }
747        (_, _, Some(b'[')) => {
748            byte.take();
749            Ok(Some(NonFinal(OpenClass0)))
750        }
751        (_, Some(NonFinal(OpenClass0)), Some(b'^')) => {
752            last.take();
753            byte.take();
754            Ok(Some(NonFinal(OpenClassNeg)))
755        }
756        (_, Some(NonFinal(OpenClass(_, ref mut items))), Some(b'-')) => {
757            byte.take();
758            match items.pop().unwrap() {
759                // "[a-"
760                ClassItem::Byte(b) => Ok(Some(NonFinal(OpenByteRange(b)))),
761                // "[a-b-"
762                ClassItem::ByteRange(a, b) => Err(format!(
763                    "expected byte before '-' symbol, not range: `{}-{}-`",
764                    escape_ascii([a]),
765                    escape_ascii([b])
766                )),
767            }
768        }
769        (_, Some(NonFinal(OpenClass0)), Some(b']')) => {
770            last.take();
771            byte.take();
772            Ok(Some(Final(Class(true, Vec::new()))))
773        }
774        (_, Some(NonFinal(OpenClassNeg)), Some(b']')) => {
775            last.take();
776            byte.take();
777            Ok(Some(Final(Class(false, Vec::new()))))
778        }
779        (_, Some(NonFinal(OpenClass(..))), Some(b']')) => {
780            byte.take();
781            let (incl, items) = last.take().unwrap().unwrap_non_final().unwrap_open_class();
782            Ok(Some(Final(Class(incl, items))))
783        }
784        (Some(NonFinal(OpenClass(..))), Some(NonFinal(non_final)), Some(b']')) => {
785            Err(non_final.reason())
786        }
787
788        // Bytes inside classes.
789        // These must come before all of the generic `(_, _, b'X')` rules below.
790        (_, Some(NonFinal(OpenClass0)), Some(b))
791        | (_, Some(NonFinal(OpenClassNeg)), Some(b))
792        | (_, Some(NonFinal(OpenClass(..))), Some(b))
793            if b != b']' =>
794        {
795            byte.take();
796            Ok(Some(Final(Byte(b))))
797        }
798
799        // Group `(ab)` and NonCapturingGroup `(?:ab)`
800        (_, _, Some(b'(')) => {
801            byte.take();
802            Ok(Some(NonFinal(OpenGroup)))
803        }
804        (_, Some(NonFinal(OpenGroup)), Some(b')')) => {
805            last.take();
806            byte.take();
807            Ok(Some(Final(Group(Box::new(Seq(vec![]))))))
808        }
809        (_, Some(NonFinal(OpenGroup)), Some(b'?')) => {
810            last.take();
811            byte.take();
812            Ok(Some(NonFinal(OpenExtendedGroup)))
813        }
814        (_, Some(NonFinal(OpenExtendedGroup)), Some(b':')) => {
815            last.take();
816            byte.take();
817            Ok(Some(NonFinal(OpenNonCapturingGroup)))
818        }
819        (_, Some(NonFinal(OpenExtendedGroup)), Some(_)) => {
820            Err("unexpected symbol after `(?`".to_string())
821        }
822        (_, Some(NonFinal(OpenNonCapturingGroup)), Some(b')')) => {
823            last.take();
824            byte.take();
825            Ok(Some(Final(NonCapturingGroup(Box::new(Seq(vec![]))))))
826        }
827        (Some(NonFinal(OpenGroup)), Some(NonFinal(non_final)), Some(b')')) => {
828            Err(non_final.reason())
829        }
830        (Some(NonFinal(OpenNonCapturingGroup)), Some(NonFinal(non_final)), Some(b')')) => {
831            Err(non_final.reason())
832        }
833        (Some(NonFinal(OpenGroup)), Some(Final(_)), Some(b')')) => {
834            byte.take();
835            let node = last.take().unwrap().unwrap_final();
836            prev.take();
837            Ok(Some(Final(Group(Box::new(node)))))
838        }
839        (Some(NonFinal(OpenNonCapturingGroup)), Some(Final(_)), Some(b')')) => {
840            byte.take();
841            let node = last.take().unwrap().unwrap_final();
842            prev.take();
843            Ok(Some(Final(NonCapturingGroup(Box::new(node)))))
844        }
845
846        // Repeat, postfix operators `?` `+` `*` `{n}` `{n,}` `{,m}` `{n,m}`
847        (_, _, Some(b'?')) => {
848            byte.take();
849            Ok(Some(NonFinal(RepeatToken("?".to_string(), 0, Some(1)))))
850        }
851        (_, _, Some(b'*')) => {
852            byte.take();
853            Ok(Some(NonFinal(RepeatToken("*".to_string(), 0, None))))
854        }
855        (_, _, Some(b'+')) => {
856            byte.take();
857            Ok(Some(NonFinal(RepeatToken("+".to_string(), 1, None))))
858        }
859        (_, _, Some(b'{')) => {
860            byte.take();
861            Ok(Some(NonFinal(RepeatMin(String::new()))))
862        }
863        (_, Some(NonFinal(RepeatMin(_))), Some(b',')) => {
864            byte.take();
865            let min = last.take().unwrap().unwrap_non_final().unwrap_repeat_min();
866            Ok(Some(NonFinal(RepeatMax(min, String::new()))))
867        }
868        (_, Some(NonFinal(RepeatMin(_))), Some(b'}')) => {
869            byte.take();
870            let min = last.take().unwrap().unwrap_non_final().unwrap_repeat_min();
871            let min_usize = min
872                .parse::<usize>()
873                .map_err(|e| format!("invalid repetition value `{{{min}}}`: {e}"))?;
874            Ok(Some(NonFinal(RepeatToken(
875                format!("{{{min}}}"),
876                min_usize,
877                Some(min_usize),
878            ))))
879        }
880        (_, Some(NonFinal(RepeatMin(min))), Some(b)) => {
881            byte.take();
882            min.push(char::from(b));
883            Ok(None)
884        }
885        (_, Some(NonFinal(RepeatMax(..))), Some(b'}')) => {
886            byte.take();
887            let (min, max) = last.take().unwrap().unwrap_non_final().unwrap_repeat_max();
888            let min_usize = if min.is_empty() {
889                0
890            } else {
891                min.parse::<usize>()
892                    .map_err(|e| format!("invalid repetition value `{{{min},{max}}}`: {e}"))?
893            };
894            let max_opt_usize = if max.is_empty() {
895                None
896            } else {
897                let max_usize = max
898                    .parse::<usize>()
899                    .map_err(|e| format!("invalid repetition value `{{{min},{max}}}`: {e}"))?;
900                if max_usize < min_usize {
901                    return Err(format!(
902                        "repeating element has max that is smaller than min: `{{{min},{max}}}`"
903                    ));
904                }
905                Some(max_usize)
906            };
907            Ok(Some(NonFinal(RepeatToken(
908                format!("{{{min},{max}}}"),
909                min_usize,
910                max_opt_usize,
911            ))))
912        }
913        (_, Some(NonFinal(RepeatMax(_, ref mut max))), Some(b)) => {
914            max.push(char::from(b));
915            byte.take();
916            Ok(None)
917        }
918
919        // Any byte `.`
920        (_, _, Some(b'.')) => {
921            byte.take();
922            Ok(Some(Final(AnyByte)))
923        }
924
925        // Alternate `a|b|c`
926        (_, Some(Final(Alt(_))), Some(b'|')) => {
927            byte.take();
928            let nodes = last.take().unwrap().unwrap_final().unwrap_alt();
929            Ok(Some(NonFinal(OpenAlt(nodes))))
930        }
931        (_, Some(Final(_)), Some(b'|')) => {
932            byte.take();
933            let node = last.take().unwrap().unwrap_final();
934            Ok(Some(NonFinal(OpenAlt(vec![node]))))
935        }
936        (_, None, Some(b'|')) => Err("missing element before bar `|`".to_string()),
937
938        // Other bytes
939        (_, _, Some(b)) => {
940            byte.take();
941            Ok(Some(Final(Byte(b))))
942        }
943
944        // No more bytes.
945        (_, Some(NonFinal(node)), None) => Err(node.reason()),
946        (None, None, None) => unreachable!(),
947        (None, Some(Final(_)), None) => unreachable!(),
948
949        // These cases should be unreachable.
950        (Some(NonFinal(Escape)), Some(Final(_)), _) => unreachable!(),
951        (Some(NonFinal(HexEscape0)), Some(Final(_)), _) => unreachable!(),
952        (Some(NonFinal(HexEscape1(_))), Some(Final(_)), _) => unreachable!(),
953        (Some(NonFinal(RepeatMin(..))), Some(Final(_)), _) => unreachable!(),
954        (Some(NonFinal(RepeatMax(..))), Some(Final(_)), _) => unreachable!(),
955        (Some(NonFinal(OpenClass0)), Some(Final(_)), _) => unreachable!(),
956        (Some(NonFinal(OpenClassNeg)), Some(Final(_)), _) => unreachable!(),
957        (Some(NonFinal(OpenClass(..))), Some(Final(_)), _) => unreachable!(),
958        (Some(NonFinal(OpenAlt(_))), Some(Final(_)), _) => unreachable!(),
959        (Some(NonFinal(OpenByteRange(_))), Some(Final(_)), _) => unreachable!(),
960        (Some(NonFinal(ByteRange(..))), Some(Final(_)), _) => unreachable!(),
961        (Some(NonFinal(RepeatToken(..))), Some(Final(_)), _) => unreachable!(),
962        (Some(Final(_)), Some(Final(_)), _) => unreachable!(),
963    }
964}
965
966/// Parses `regex` as a regular expression.
967///
968/// Returns a [`FinalNode`](enum.FinalNode.htmls) which is the root of the
969/// [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
970/// (AST) of the expression.
971///
972/// # Errors
973/// On error, returns a string explaining the problem.
974///
975/// # Examples
976/// ```
977/// use safe_regex_compiler::parser::parse;
978/// use safe_regex_compiler::parser::FinalNode;
979/// assert_eq!(
980///     Ok(FinalNode::Byte(b'a')), parse(br"a")
981/// );
982/// assert_eq!(
983///     Ok(FinalNode::Alt(vec![
984///         FinalNode::Byte(b'a'),
985///         FinalNode::Byte(b'b'),
986///         FinalNode::Byte(b'c'),
987///     ])),
988///     parse(br"a|b|c"),
989/// );
990/// assert_eq!(
991///     Err("missing closing `)`".to_string()),
992///     parse(br"(a"),
993/// );
994/// ```
995/// See [`FinalNode`](enum.FinalNode.html) variants for more examples.
996#[allow(clippy::missing_panics_doc)]
997pub fn parse(regex: &[u8]) -> Result<FinalNode, String> {
998    if regex.is_empty() {
999        return Ok(FinalNode::Seq(Vec::new()));
1000    }
1001    let mut data_iter = regex.iter().copied().peekable();
1002    let mut stack: Vec<Node> = Vec::new();
1003    while data_iter.peek().is_some() || stack.len() > 1 {
1004        crate::dprintln!(
1005            "process {stack:?} next={:?}",
1006            data_iter.peek().map(|b| escape_ascii([*b]))
1007        );
1008        let mut byte = data_iter.peek().copied();
1009        let byte_was_some = byte.is_some();
1010        // Pull the top two items from the stack, so we can work with them and
1011        // keep the borrow checker happy.
1012        let mut last = stack.pop();
1013        let mut prev = stack.pop();
1014        // Anything put in here becomes the new top of the stack in the next loop.
1015        let mut to_push = apply_rule_once(&mut prev, &mut last, &mut byte)?;
1016        // Put items back in `stack`.
1017        if let Some(node) = prev.take() {
1018            stack.push(node);
1019        }
1020        if let Some(node) = last.take() {
1021            stack.push(node);
1022        }
1023        if let Some(node) = to_push.take() {
1024            stack.push(node);
1025        }
1026        if byte_was_some && byte.is_none() {
1027            data_iter.next().unwrap();
1028        }
1029    }
1030    crate::dprintln!("stack {stack:?}");
1031    // Check for incomplete elements.  Example: br"(ab"
1032    for node in stack.iter().rev() {
1033        if let Node::NonFinal(non_final) = node {
1034            return Err(non_final.reason());
1035        }
1036    }
1037    assert_eq!(1, stack.len());
1038    Ok(stack.pop().unwrap().unwrap_final())
1039}