ere_core/
parse_tree.rs

1//! Implements the [ERE](https://en.wikibooks.org/wiki/Regular_Expressions/POSIX-Extended_Regular_Expressions) parser
2//! and primitive types (like [`Atom`]).
3
4use std::{
5    fmt::{Display, Write},
6    ops::RangeInclusive,
7};
8
9use proc_macro2::TokenStream;
10use quote::{quote, ToTokens};
11
12/// On input string `text: &'a str` with options `{<prefix> => T, ..}`,
13/// it returns `Option<(&'a str, T)>` based on which literal prefix is matched
14///
15/// Example:
16/// ```
17/// let text = "asdf";
18/// let test = match_prefix!(text, {
19///     "fdsa" => 0,
20///     "qwerty" => 1,
21///     "as" => 2,
22///     "asd" => 3,
23/// });
24/// assert_eq!(test, Some("df", 2));
25/// ```
26macro_rules! match_prefix {
27    ($text:ident, { }) => (::core::option::Option::None);
28    ($text:ident, {
29        $x:literal => $y:expr,
30        $($xs:literal => $ys:expr,)*
31    }) => {
32        if let ::core::option::Option::Some(rest) = str::strip_prefix($text, $x) {
33            ::core::option::Option::Some((rest, $y))
34        } $(else if let ::core::option::Option::Some(rest) = str::strip_prefix($text, $xs) {
35            ::core::option::Option::Some((rest, $ys))
36        })* else {
37            ::core::option::Option::None
38        }
39    };
40}
41
42#[derive(Debug, thiserror::Error, PartialEq, Eq)]
43pub enum RegexParseError {
44    /// A capture group (e.g. `(abc)`) was unterminated
45    #[error("A capture group is not properly terminated.")]
46    UnterminatedCaptureGroup,
47    /// A character class set (e.g. `[a-z]`) was unterminated
48    #[error("A character class set is not properly terminated.")]
49    UnterminatedCharSet,
50    /// An escaped symbol was invalid.
51    #[error("An unknown regex escape (not rust string escape) was found: `\\{0}`.")]
52    InvalidEscapedChar(char),
53    #[error("An atom was expected but was invalid due to being a special character: `{0}`.")]
54    InvalidAtom(char),
55    /// Should only happen in debug settings, since it happens only when we call `Atom::take("")`
56    #[error("An atom was expected but was not found due to the end of the string.")]
57    UnexpectedEOF,
58}
59
60/// A represents a [POSIX-compliant ERE](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html).
61/// Primarily intended for use as a parser.
62#[derive(Clone)]
63pub struct ERE(
64    /// Should always have at least one branch, even if it is empty.
65    pub(crate) Vec<EREBranch>,
66);
67impl ERE {
68    /// Goes until a non-nested `)` or the end of the string (does not consume).
69    fn take<'a>(rest: &'a str) -> Result<(&'a str, ERE), RegexParseError> {
70        let mut branches = Vec::new();
71        let (mut rest, branch) = EREBranch::take(rest)?;
72        branches.push(branch);
73        loop {
74            if rest.is_empty() || rest.starts_with(')') {
75                break;
76            }
77            let Some(branch_start) = rest.strip_prefix('|') else {
78                break;
79            };
80            let (new_rest, branch) = EREBranch::take(branch_start)?;
81            rest = new_rest;
82            branches.push(branch);
83        }
84        return Ok((rest, ERE(branches)));
85    }
86
87    pub fn parse_str(input: &str) -> Result<Self, RegexParseError> {
88        let Ok(("", ere)) = ERE::take(&input) else {
89            // The only reason we would have `rest` left over is if
90            // we hit a ')'. Since there is no opening '(', this is treated as an invalid atom.
91            return Err(RegexParseError::InvalidAtom(')'));
92        };
93        return Ok(ere);
94    }
95    pub(crate) fn parse_str_syn(string: &str, span: proc_macro2::Span) -> syn::Result<Self> {
96        return ERE::parse_str(&string).map_err(|err| {
97            syn::Error::new(
98                span,
99                format_args!("While parsing regular expression:\n{err}"),
100            )
101        });
102    }
103}
104impl syn::parse::Parse for ERE {
105    fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
106        let literal: syn::LitStr = input.parse()?;
107        let string = literal.value();
108        return ERE::parse_str_syn(&string, literal.span());
109    }
110}
111impl Display for ERE {
112    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
113        let mut it = self.0.iter();
114        let Some(first) = it.next() else {
115            return Ok(());
116        };
117        write!(f, "{first}")?;
118        for part in it {
119            write!(f, "|{part}")?;
120        }
121        return Ok(());
122    }
123}
124
125#[derive(Clone)]
126pub(crate) struct EREBranch(
127    /// May be empty.
128    pub(crate) Vec<EREPart>,
129);
130impl EREBranch {
131    /// May be empty.
132    /// If it is passed `")"` it will return `Ok((")", EREBranch(vec![]))`,
133    /// so end of group should be checked before taking a branch.
134    ///
135    /// ## Returns
136    /// - `Ok((rest, branch))` if we successfully found a branch
137    /// - `Err(err)` if parsing failed (probably an invalid regex)
138    fn take<'a>(mut rest: &'a str) -> Result<(&'a str, EREBranch), RegexParseError> {
139        let mut parts = Vec::new();
140        while !rest.is_empty() && !rest.starts_with(')') && !rest.starts_with('|') {
141            let (new_rest, part) = EREPart::take(rest)?;
142            rest = new_rest;
143            parts.push(part);
144        }
145        return Ok((rest, EREBranch(parts)));
146    }
147}
148impl Display for EREBranch {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        for part in &self.0 {
151            write!(f, "{part}")?;
152        }
153        return Ok(());
154    }
155}
156
157#[derive(Clone)]
158pub(crate) enum EREPart {
159    Single(EREExpression),
160    Quantified(EREExpression, Quantifier),
161    Start,
162    End,
163}
164impl EREPart {
165    fn take<'a>(rest: &'a str) -> Result<(&'a str, EREPart), RegexParseError> {
166        if let Some(rest) = rest.strip_prefix('^') {
167            return Ok((rest, EREPart::Start));
168        } else if let Some(rest) = rest.strip_prefix('$') {
169            return Ok((rest, EREPart::End));
170        }
171
172        let (rest, expr) = if let Some(rest) = rest.strip_prefix('(') {
173            let (rest, ere) = ERE::take(rest)?;
174            let Some(rest) = rest.strip_prefix(')') else {
175                return Err(RegexParseError::UnterminatedCaptureGroup);
176            };
177            (rest, EREExpression::Subexpression(ere))
178        } else {
179            let (rest, atom) = Atom::take(rest)?;
180            (rest, EREExpression::Atom(atom))
181        };
182
183        let part = match Quantifier::take(rest) {
184            Some((rest, quantifier)) => (rest, EREPart::Quantified(expr, quantifier)),
185            None => (rest, EREPart::Single(expr)),
186        };
187        return Ok(part);
188    }
189}
190impl Display for EREPart {
191    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
192        return match self {
193            EREPart::Single(expr) => write!(f, "{expr}"),
194            EREPart::Quantified(expr, quantifier) => write!(f, "{expr}{quantifier}"),
195            EREPart::Start => f.write_char('^'),
196            EREPart::End => f.write_char('$'),
197        };
198    }
199}
200
201#[derive(Clone)]
202pub(crate) enum EREExpression {
203    Atom(Atom),
204    Subexpression(ERE),
205}
206impl Display for EREExpression {
207    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
208        return match self {
209            EREExpression::Atom(atom) => write!(f, "{atom}"),
210            EREExpression::Subexpression(ere) => write!(f, "({ere})"),
211        };
212    }
213}
214
215#[derive(Debug, PartialEq, Eq, Clone, Copy)]
216pub(crate) enum QuantifierType {
217    Star,
218    Plus,
219    QuestionMark,
220    /// The equivalent to a range specifier with fixed size
221    Multiple(u32),
222    /// `(u32, None)` is unbounded, `(u32, Some(u32))` is bounded
223    Range(u32, Option<u32>),
224}
225
226impl QuantifierType {
227    /// The minimum this quantifier matches, inclusive
228    #[inline]
229    const fn min(&self) -> u32 {
230        return match self {
231            QuantifierType::Star => 0,
232            QuantifierType::Plus => 1,
233            QuantifierType::QuestionMark => 0,
234            QuantifierType::Multiple(n) => *n,
235            QuantifierType::Range(n, _) => *n,
236        };
237    }
238    /// The maximum this quantifier matches, inclusive. If `None`, it is unbounded
239    #[inline]
240    const fn max(&self) -> Option<u32> {
241        return match self {
242            QuantifierType::Star => None,
243            QuantifierType::Plus => None,
244            QuantifierType::QuestionMark => Some(1),
245            QuantifierType::Multiple(n) => Some(*n),
246            QuantifierType::Range(_, m) => *m,
247        };
248    }
249}
250impl Display for QuantifierType {
251    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
252        return match self {
253            QuantifierType::Star => f.write_char('*'),
254            QuantifierType::Plus => f.write_char('+'),
255            QuantifierType::QuestionMark => f.write_char('?'),
256            QuantifierType::Multiple(n) => write!(f, "{{{n}}}"),
257            QuantifierType::Range(n, None) => write!(f, "{{{n},}}"),
258            QuantifierType::Range(n, Some(m)) => write!(f, "{{{n},{m}}}"),
259        };
260    }
261}
262
263#[derive(Debug, PartialEq, Eq, Clone, Copy)]
264pub(crate) struct Quantifier {
265    pub quantifier: QuantifierType,
266    /// By default, (unless the `REG_MINIMAL` flag is set), quantifiers should prefer the longest valid
267    /// match for quantifiers. However, if an additional `?` occurs after the quantifier, it should prefer the
268    /// shortest (or longest if `REG_MINIMAL` makes default shortest).
269    pub alt: bool,
270}
271impl Quantifier {
272    #[inline]
273    fn take<'a>(rest: &'a str) -> Option<(&'a str, Quantifier)> {
274        let mut it = rest.chars();
275        let (rest, quantifier) = match it.next() {
276            Some('*') => (it.as_str(), QuantifierType::Star),
277            Some('+') => (it.as_str(), QuantifierType::Plus),
278            Some('?') => (it.as_str(), QuantifierType::QuestionMark),
279            Some('{') => {
280                let (inside, rest) = it.as_str().split_once('}')?;
281                match inside.split_once(',') {
282                    None => (rest, QuantifierType::Multiple(inside.parse().ok()?)),
283                    Some((min, "")) => (rest, QuantifierType::Range(min.parse().ok()?, None)),
284                    Some((min, max)) => (
285                        rest,
286                        QuantifierType::Range(min.parse().ok()?, Some(max.parse().ok()?)),
287                    ),
288                }
289            }
290            _ => return None,
291        };
292        if let Some(rest) = rest.strip_prefix('?') {
293            return Some((
294                rest,
295                Quantifier {
296                    quantifier,
297                    alt: true,
298                },
299            ));
300        } else {
301            return Some((
302                rest,
303                Quantifier {
304                    quantifier,
305                    alt: false,
306                },
307            ));
308        }
309    }
310}
311impl Display for Quantifier {
312    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
313        self.quantifier.fmt(f)?;
314        if self.alt {
315            return f.write_char('?');
316        } else {
317            return Ok(());
318        }
319    }
320}
321impl From<QuantifierType> for Quantifier {
322    fn from(quantifier: QuantifierType) -> Self {
323        return Quantifier {
324            quantifier,
325            alt: false,
326        };
327    }
328}
329
330#[derive(Debug, PartialEq, Eq, Clone, Copy)]
331pub enum CharClass {
332    /// Matches anything but `NUL` (`'\0'`)
333    Dot,
334}
335impl CharClass {
336    pub const fn check(&self, c: char) -> bool {
337        return match self {
338            CharClass::Dot => c != '\0',
339        };
340    }
341}
342impl Display for CharClass {
343    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
344        return match self {
345            CharClass::Dot => f.write_char('.'),
346        };
347    }
348}
349impl ToTokens for CharClass {
350    fn to_tokens(&self, tokens: &mut TokenStream) {
351        match self {
352            CharClass::Dot => tokens.extend(quote! {::ere::parse_tree::CharClass::Dot}),
353        }
354    }
355}
356
357/// Represents a part of an [`ERE`] that matches a single character.
358/// For example, a single char `a`, a char class `.`, or a bracket expression `[a-z]`.
359///
360/// Equality checks are semantic:
361/// ```
362/// use ere_core::parse_tree::Atom;
363/// assert_eq!(
364///     "[abcd]".parse::<Atom>(),
365///     "[a-d]".parse::<Atom>(),
366/// );
367/// assert_eq!(
368///     "[a-z]".parse::<Atom>(),
369///     "[[:lower:]]".parse::<Atom>(),
370/// );
371/// ```
372#[derive(Debug, Clone)]
373pub enum Atom {
374    /// Includes normal char and escaped chars
375    NormalChar(char),
376    CharClass(CharClass),
377    /// A matching bracket expression
378    MatchingList(Vec<BracketExpressionTerm>),
379    /// A nonmatching bracket expression
380    NonmatchingList(Vec<BracketExpressionTerm>),
381}
382impl From<char> for Atom {
383    fn from(value: char) -> Self {
384        return Atom::NormalChar(value);
385    }
386}
387impl From<CharClass> for Atom {
388    fn from(value: CharClass) -> Self {
389        return Atom::CharClass(value);
390    }
391}
392impl std::str::FromStr for Atom {
393    type Err = ();
394
395    fn from_str(s: &str) -> Result<Self, Self::Err> {
396        let Ok(("", atom)) = Atom::take(s) else {
397            return Err(());
398        };
399        return Ok(atom);
400    }
401}
402
403impl Atom {
404    /// ## Returns
405    /// - `Ok((rest, atom))` if an atom was successfully found at the start
406    /// - `Err(err)` if the input does not start with a valid atom.
407    fn take<'a>(rest: &'a str) -> Result<(&'a str, Atom), RegexParseError> {
408        let mut it = rest.chars();
409        return match it.next() {
410            Some('\\') => match it.next() {
411                Some(c) if is_escapable_character(c) => Ok((it.as_str(), Atom::NormalChar(c))),
412                Some(c) => Err(RegexParseError::InvalidEscapedChar(c)),
413                None => Err(RegexParseError::InvalidAtom('\\')),
414            },
415            Some('.') => Ok((it.as_str(), CharClass::Dot.into())),
416            Some('[') => {
417                let mut rest = it.as_str();
418                let mut items = Vec::new();
419                let none_of = if let Some(new_rest) = rest.strip_prefix('^') {
420                    rest = new_rest;
421                    true
422                } else {
423                    false
424                };
425                if let Some(new_rest) = rest.strip_prefix(']') {
426                    rest = new_rest;
427                    items.push(BracketExpressionTerm::Single(']'));
428                }
429                loop {
430                    if let Some(new_rest) = rest.strip_prefix(']') {
431                        // End of the bracket expression
432                        rest = new_rest;
433                        break;
434                    } else if let Some((new_rest, class)) = BracketCharClass::take(rest) {
435                        // A bracket char class
436                        rest = new_rest;
437                        items.push(BracketExpressionTerm::CharClass(class));
438                    } else {
439                        // Normal
440                        let mut it = rest.chars();
441                        let first = it.next().ok_or(RegexParseError::UnterminatedCharSet)?;
442                        rest = it.as_str();
443                        if let '-' = it.next().ok_or(RegexParseError::UnterminatedCharSet)? {
444                            let second = it.next().ok_or(RegexParseError::UnterminatedCharSet)?;
445                            rest = it.as_str();
446                            if second == ']' {
447                                // it's just two characters at the end
448                                items.push(BracketExpressionTerm::Single(first));
449                                items.push(BracketExpressionTerm::Single('-'));
450                                break;
451                            } else {
452                                // it's a range
453                                items.push(BracketExpressionTerm::Range(first, second));
454                            }
455                        } else {
456                            items.push(BracketExpressionTerm::Single(first));
457                        }
458                    }
459                }
460
461                if none_of {
462                    return Ok((rest, Atom::NonmatchingList(items)));
463                } else {
464                    return Ok((rest, Atom::MatchingList(items)));
465                }
466            }
467            Some(c) if !is_special_character(c) => Ok((it.as_str(), Atom::NormalChar(c))),
468            Some(c) => Err(RegexParseError::InvalidAtom(c)),
469            None => Err(RegexParseError::UnexpectedEOF),
470        };
471    }
472    pub fn check(&self, c: char) -> bool {
473        return match self {
474            Atom::NormalChar(a) => *a == c,
475            Atom::CharClass(char_class) => char_class.check(c),
476            Atom::MatchingList(vec) => vec.into_iter().any(|b| b.check(c)),
477            Atom::NonmatchingList(vec) => !vec.into_iter().any(|b| b.check(c)),
478        };
479    }
480    pub(crate) fn serialize_check(&self) -> TokenStream {
481        let ranges = self.to_ranges();
482        let mut stream = TokenStream::new();
483        for range in ranges {
484            let start = range.start();
485            let end = range.end();
486            stream.extend(quote! { (#start <= c && c <= #end) || });
487        }
488        return quote! {(#stream false)};
489    }
490    /// Produces the sorted, minimal set of ranges to represent the Atom.
491    ///
492    /// Example:
493    /// ```
494    /// use ere_core::parse_tree::Atom;
495    /// assert_eq!(
496    ///     "[a-z2-9A-X0-1YZ[:xdigit:]]".parse::<Atom>().unwrap().to_ranges(),
497    ///     vec!['0'..='9', 'A'..='Z', 'a'..='z'],
498    /// );
499    /// ```
500    pub fn to_ranges(&self) -> Vec<RangeInclusive<char>> {
501        /// Sorts and combines ranges
502        fn combine_ranges_chars(
503            mut ranges: Vec<RangeInclusive<char>>,
504        ) -> Vec<RangeInclusive<char>> {
505            ranges.sort_by_key(|range| *range.start());
506            let Some((first_range, terms)) = ranges.split_first_chunk::<1>() else {
507                return Vec::new();
508            };
509            let mut reduced_terms = Vec::new();
510
511            let mut current_start = *first_range[0].start();
512            let mut current_end = *first_range[0].end();
513            for term in terms {
514                if term.is_empty() {
515                    continue;
516                } else if *term.start() <= current_end || (current_end..=*term.start()).count() == 2
517                {
518                    // the next term is overlapping (or starts immediately after) so combine them.
519                    current_end = std::cmp::max(current_end, *term.end());
520                } else {
521                    reduced_terms.push(current_start.clone()..=current_end.clone());
522                    current_start = *term.start();
523                    current_end = *term.end();
524                }
525            }
526            reduced_terms.push(current_start.clone()..=current_end.clone());
527            return reduced_terms;
528        }
529
530        return match self {
531            Atom::NormalChar(c) => vec![*c..=*c],
532            Atom::CharClass(CharClass::Dot) => vec!['\x01'..=char::MAX],
533            Atom::MatchingList(_terms) => {
534                let mut terms = Vec::new();
535                for term in _terms {
536                    match term {
537                        crate::parse_tree::BracketExpressionTerm::Single(c) => {
538                            terms.push(*c..=*c);
539                        }
540                        crate::parse_tree::BracketExpressionTerm::Range(a, b) => {
541                            terms.push(*a..=*b);
542                        }
543                        crate::parse_tree::BracketExpressionTerm::CharClass(class) => {
544                            terms.extend_from_slice(class.to_ranges());
545                        }
546                    }
547                }
548                combine_ranges_chars(terms)
549            }
550            Atom::NonmatchingList(terms) => {
551                let mut ranges = Vec::new();
552                ranges.push('\0'..=char::MAX);
553                for term in terms.iter().flat_map(|term| term.to_ranges()) {
554                    ranges = ranges
555                        .iter()
556                        .flat_map(|range| {
557                            if term.end() < range.start() || range.end() < term.start() {
558                                return vec![range.clone()]; // unchanged
559                            }
560                            let mut out = Vec::new();
561                            if range.start() < term.start() {
562                                let mut new_range = *range.start()..=*term.start();
563                                new_range.next_back();
564                                out.push(new_range); // part at start
565                            }
566                            if term.end() < range.end() {
567                                let mut new_range = *term.end()..=*range.end();
568                                new_range.next();
569                                out.push(new_range); // part at end
570                            }
571                            return out;
572                        })
573                        .collect();
574                }
575
576                ranges.sort_by_key(|range: &RangeInclusive<char>| *range.start());
577                ranges
578            }
579        };
580    }
581    pub fn max_bytes(&self) -> usize {
582        match self.to_ranges().last() {
583            None => 0,
584            Some(range) => range.end().len_utf8(),
585        }
586    }
587    pub fn min_bytes(&self) -> usize {
588        match self.to_ranges().first() {
589            None => 0,
590            Some(range) => range.start().len_utf8(),
591        }
592    }
593}
594impl Display for Atom {
595    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
596        return match self {
597            Atom::NormalChar(c) if is_escapable_character(*c) => write!(f, "\\{c}"),
598            Atom::NormalChar(c) => f.write_char(*c),
599            Atom::CharClass(c) => c.fmt(f),
600            Atom::MatchingList(vec) => {
601                f.write_char('[')?;
602                for term in vec {
603                    write!(f, "{term}")?;
604                }
605                f.write_char(']')
606            }
607            Atom::NonmatchingList(vec) => {
608                f.write_str("[^")?;
609                for term in vec {
610                    write!(f, "{term}")?;
611                }
612                f.write_char(']')
613            }
614        };
615    }
616}
617impl PartialEq for Atom {
618    fn eq(&self, other: &Self) -> bool {
619        return self.to_ranges() == other.to_ranges();
620    }
621}
622impl Eq for Atom {}
623
624/// From <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap09.html#tag_09_03_05>
625#[derive(Debug, PartialEq, Eq, Clone, Copy)]
626pub enum BracketCharClass {
627    /// `[:alnum:]`
628    Alphanumeric,
629    /// `[:cntrl:]`
630    Control,
631    /// `[:lower:]`
632    Lower,
633    /// `[:space:]`
634    Space,
635    /// `[:alpha:]`
636    Alphabet,
637    /// `[:digit:]`
638    Digit,
639    /// `[:print:]`
640    Print,
641    /// `[:upper:]`
642    Upper,
643    /// `[:blank:]`
644    Blank,
645    /// `[:graph:]`
646    Graphic,
647    /// `[:punct:]`
648    Punctuation,
649    /// `[:xdigit:]`
650    HexDigit,
651}
652impl BracketCharClass {
653    pub const fn to_ranges(&self) -> &'static [RangeInclusive<char>] {
654        return match self {
655            BracketCharClass::Alphanumeric => &['0'..='9', 'A'..='Z', 'a'..='z'],
656            BracketCharClass::Control => &['\0'..='\x1f', '\x7f'..='\x7f'],
657            BracketCharClass::Lower => &['a'..='z'],
658            BracketCharClass::Space => &['\t'..='\t', '\x0a'..='\x0d', ' '..=' '],
659            BracketCharClass::Alphabet => &['A'..='Z', 'a'..='z'],
660            BracketCharClass::Digit => &['0'..='9'],
661            BracketCharClass::Print => &['\x20'..='\x7E'],
662            BracketCharClass::Upper => &['A'..='Z'],
663            BracketCharClass::Blank => &['\t'..='\t', ' '..=' '],
664            BracketCharClass::Graphic => &['\x21'..='\x7e'],
665            BracketCharClass::Punctuation => &[
666                '\x21'..='\x2f',
667                '\x3a'..='\x40',
668                '\x5b'..='\x60',
669                '\x7b'..='\x7e',
670            ],
671            BracketCharClass::HexDigit => &['0'..='9', 'A'..='F', 'a'..='f'],
672        };
673    }
674    /// Checks matches to the char classes.
675    pub const fn check_ascii(&self, c: char) -> bool {
676        return match self {
677            BracketCharClass::Alphanumeric => c.is_ascii_alphanumeric(),
678            BracketCharClass::Control => c.is_ascii_control(),
679            BracketCharClass::Lower => c.is_ascii_lowercase(),
680            BracketCharClass::Space => c.is_ascii_whitespace() || c == '\x0b', // POSIX includes vertical tab
681            BracketCharClass::Alphabet => c.is_ascii_alphabetic(),
682            BracketCharClass::Digit => c.is_ascii_digit(),
683            BracketCharClass::Print => matches!(c, '\x20'..='\x7E'),
684            BracketCharClass::Upper => c.is_ascii_uppercase(),
685            BracketCharClass::Blank => c == ' ' || c == '\t',
686            BracketCharClass::Graphic => c.is_ascii_graphic(),
687            BracketCharClass::Punctuation => c.is_ascii_punctuation(),
688            BracketCharClass::HexDigit => c.is_ascii_hexdigit(),
689        };
690    }
691    fn take<'a>(rest: &'a str) -> Option<(&'a str, BracketCharClass)> {
692        let rest = rest.strip_prefix("[:")?;
693        return match_prefix!(rest, {
694            "alnum:]" => BracketCharClass::Alphanumeric,
695            "cntrl:]" => BracketCharClass::Control,
696            "lower:]" => BracketCharClass::Lower,
697            "space:]" => BracketCharClass::Space,
698            "alpha:]" => BracketCharClass::Alphabet,
699            "digit:]" => BracketCharClass::Digit,
700            "print:]" => BracketCharClass::Print,
701            "upper:]" => BracketCharClass::Upper,
702            "blank:]" => BracketCharClass::Blank,
703            "graph:]" => BracketCharClass::Graphic,
704            "punct:]" => BracketCharClass::Punctuation,
705            "xdigit:]" => BracketCharClass::HexDigit,
706        });
707    }
708}
709impl Display for BracketCharClass {
710    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
711        return match self {
712            BracketCharClass::Alphanumeric => f.write_str("[:alnum:]"),
713            BracketCharClass::Control => f.write_str("[:cntrl:]"),
714            BracketCharClass::Lower => f.write_str("[:lower:]"),
715            BracketCharClass::Space => f.write_str("[:space:]"),
716            BracketCharClass::Alphabet => f.write_str("[:alpha:]"),
717            BracketCharClass::Digit => f.write_str("[:digit:]"),
718            BracketCharClass::Print => f.write_str("[:print:]"),
719            BracketCharClass::Upper => f.write_str("[:upper:]"),
720            BracketCharClass::Blank => f.write_str("[:blank:]"),
721            BracketCharClass::Graphic => f.write_str("[:graph:]"),
722            BracketCharClass::Punctuation => f.write_str("[:punct:]"),
723            BracketCharClass::HexDigit => f.write_str("[:xdigit:]"),
724        };
725    }
726}
727impl ToTokens for BracketCharClass {
728    fn to_tokens(&self, tokens: &mut TokenStream) {
729        match self {
730            BracketCharClass::Alphanumeric => {
731                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Alphanumeric})
732            }
733            BracketCharClass::Control => {
734                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Control})
735            }
736            BracketCharClass::Lower => {
737                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Lower})
738            }
739            BracketCharClass::Space => {
740                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Space})
741            }
742            BracketCharClass::Alphabet => {
743                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Alphabet})
744            }
745            BracketCharClass::Digit => {
746                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Digit})
747            }
748            BracketCharClass::Print => {
749                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Print})
750            }
751            BracketCharClass::Upper => {
752                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Upper})
753            }
754            BracketCharClass::Blank => {
755                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Blank})
756            }
757            BracketCharClass::Graphic => {
758                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Graph})
759            }
760            BracketCharClass::Punctuation => {
761                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Punctuation})
762            }
763            BracketCharClass::HexDigit => {
764                tokens.extend(quote! {::ere::parse_tree::BracketCharClass::XDigit})
765            }
766        }
767    }
768}
769
770#[derive(Debug, PartialEq, Eq, Clone, Copy)]
771pub enum BracketExpressionTerm {
772    Single(char),
773    Range(char, char),
774    CharClass(BracketCharClass),
775}
776impl BracketExpressionTerm {
777    pub const fn check(&self, c: char) -> bool {
778        return match self {
779            BracketExpressionTerm::Single(a) => *a == c,
780            BracketExpressionTerm::Range(a, b) => *a <= c && c <= *b,
781            BracketExpressionTerm::CharClass(class) => class.check_ascii(c),
782        };
783    }
784    pub(crate) fn to_ranges(&self) -> Vec<RangeInclusive<char>> {
785        return match self {
786            BracketExpressionTerm::Single(c) => vec![*c..=*c],
787            BracketExpressionTerm::Range(a, b) => vec![*a..=*b],
788            BracketExpressionTerm::CharClass(bracket_char_class) => {
789                bracket_char_class.to_ranges().to_vec()
790            }
791        };
792    }
793}
794impl Display for BracketExpressionTerm {
795    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
796        return match self {
797            BracketExpressionTerm::Single(c) => f.write_char(*c),
798            BracketExpressionTerm::Range(first, second) => write!(f, "{first}-{second}"),
799            BracketExpressionTerm::CharClass(class) => class.fmt(f),
800        };
801    }
802}
803impl From<char> for BracketExpressionTerm {
804    fn from(value: char) -> Self {
805        return BracketExpressionTerm::Single(value);
806    }
807}
808impl From<BracketCharClass> for BracketExpressionTerm {
809    fn from(value: BracketCharClass) -> Self {
810        return BracketExpressionTerm::CharClass(value);
811    }
812}
813impl ToTokens for BracketExpressionTerm {
814    fn to_tokens(&self, tokens: &mut TokenStream) {
815        tokens.extend(match self {
816            BracketExpressionTerm::Single(c) => quote! {
817                ::ere::parse_tree::BracketExpressionTerm::Single(#c)
818            },
819            BracketExpressionTerm::Range(a, z) => quote! {
820                ::ere::parse_tree::BracketExpressionTerm::Range(#a, #z)
821            },
822            BracketExpressionTerm::CharClass(char_class) => quote! {
823                ::ere::parse_tree::BracketExpressionTerm::CharClass(#char_class)
824            },
825        });
826    }
827}
828
829/// The characters that can only occur if quoted
830#[inline]
831const fn is_special_character(c: char) -> bool {
832    return c == '^'
833        || c == '.'
834        || c == '['
835        || c == '$'
836        || c == '('
837        || c == ')'
838        || c == '|'
839        || c == '*'
840        || c == '+'
841        || c == '?'
842        || c == '{'
843        || c == '\\';
844}
845
846/// The characters that can only occur if quoted
847#[inline]
848pub(crate) const fn is_escapable_character(c: char) -> bool {
849    return is_special_character(c) || c == ']' || c == '}';
850}
851
852#[cfg(test)]
853mod tests {
854    use super::*;
855
856    #[test]
857    fn reconstruction() {
858        fn test_reconstruction(text: &str) {
859            let (rest, ere) = match ERE::take(text) {
860                Ok(ok) => ok,
861                Err(err) => panic!("Failed to parse `{text}`, got error: {err}"),
862            };
863            assert!(rest.is_empty(), "{text} did not get used (left {rest})");
864            let reconstructed = ere.to_string();
865            assert_eq!(text, &reconstructed);
866        }
867        test_reconstruction("asdf");
868        test_reconstruction("as+df123$");
869        test_reconstruction("asd?f*123");
870        test_reconstruction("^asdf123.*");
871
872        test_reconstruction("a[|]");
873        test_reconstruction("[^]a-z1-4A-X-]asdf");
874        test_reconstruction("cd[X-]er");
875
876        test_reconstruction("my word is [[:alnum:]_]");
877        test_reconstruction("my word is [[:lower:][:digit:]_]");
878
879        test_reconstruction("a|b");
880        test_reconstruction("a|b|c");
881        test_reconstruction("a(a(b|c)|d)|(g|f)b");
882        test_reconstruction("(a|b)|(c|d){3}");
883
884        test_reconstruction("a[y-z]{1,3}");
885        test_reconstruction("a{3,}");
886        test_reconstruction("a(efg){1,}");
887    }
888
889    #[test]
890    fn parse_quantifiers() {
891        assert_eq!(Quantifier::take(""), None);
892        assert_eq!(
893            Quantifier::take("+asdf"),
894            Some(("asdf", QuantifierType::Plus.into()))
895        );
896        assert_eq!(
897            Quantifier::take("*"),
898            Some(("", QuantifierType::Star.into()))
899        );
900        assert_eq!(Quantifier::take("e?"), None);
901        assert_eq!(Quantifier::take("{"), None);
902        assert_eq!(Quantifier::take("{}"), None);
903        assert_eq!(
904            Quantifier::take("{1}"),
905            Some(("", QuantifierType::Multiple(1).into()))
906        );
907        assert_eq!(
908            Quantifier::take("{9}ee"),
909            Some(("ee", QuantifierType::Multiple(9).into()))
910        );
911        assert_eq!(
912            Quantifier::take("{10,}ee"),
913            Some(("ee", QuantifierType::Range(10, None).into()))
914        );
915        assert_eq!(
916            Quantifier::take("{0,11}ef"),
917            Some(("ef", QuantifierType::Range(0, Some(11)).into()))
918        );
919        assert_eq!(Quantifier::take("{0,e11}ef"), None);
920        assert_eq!(Quantifier::take("{0;11}ef"), None);
921    }
922
923    #[test]
924    fn parse_atom_simple() {
925        assert_eq!(Atom::take("a"), Ok(("", Atom::NormalChar('a'))));
926        assert_eq!(Atom::take(r"abcd"), Ok(("bcd", Atom::NormalChar('a'))));
927        assert_eq!(Atom::take(r"\\"), Ok(("", Atom::NormalChar('\\'))));
928        assert_eq!(
929            Atom::take(r"\[asdf\]"),
930            Ok((r"asdf\]", Atom::NormalChar('[')))
931        );
932        assert_eq!(Atom::take(r"\."), Ok(("", Atom::NormalChar('.'))));
933        assert_eq!(Atom::take(r" "), Ok(("", Atom::NormalChar(' '))));
934        assert_eq!(Atom::take(r"\"), Err(RegexParseError::InvalidAtom('\\')));
935
936        assert_eq!(Atom::take("."), Ok(("", Atom::CharClass(CharClass::Dot))));
937        assert_eq!(Atom::take(".."), Ok((".", Atom::CharClass(CharClass::Dot))));
938    }
939
940    #[test]
941    fn parse_atom_brackets() {
942        assert_eq!(
943            Atom::take("[ab]"),
944            Ok((
945                "",
946                Atom::MatchingList(vec![
947                    BracketExpressionTerm::Single('a'),
948                    BracketExpressionTerm::Single('b'),
949                ])
950            ))
951        );
952        assert_eq!(
953            Atom::take("[]ab]"),
954            Ok((
955                "",
956                Atom::MatchingList(vec![
957                    BracketExpressionTerm::Single(']'),
958                    BracketExpressionTerm::Single('a'),
959                    BracketExpressionTerm::Single('b'),
960                ])
961            ))
962        );
963        assert_eq!(
964            Atom::take("[]ab-]"),
965            Ok((
966                "",
967                Atom::MatchingList(vec![
968                    BracketExpressionTerm::Single(']'),
969                    BracketExpressionTerm::Single('a'),
970                    BracketExpressionTerm::Single('b'),
971                    BracketExpressionTerm::Single('-'),
972                ])
973            ))
974        );
975
976        assert_eq!(
977            Atom::take("[]a-y]"),
978            Ok((
979                "",
980                Atom::MatchingList(vec![
981                    BracketExpressionTerm::Single(']'),
982                    BracketExpressionTerm::Range('a', 'y'),
983                ])
984            ))
985        );
986        assert_eq!(
987            Atom::take("[]+--]"),
988            Ok((
989                "",
990                Atom::MatchingList(vec![
991                    BracketExpressionTerm::Single(']'),
992                    BracketExpressionTerm::Range('+', '-'),
993                ])
994            ))
995        );
996
997        assert_eq!(
998            Atom::take("[^]a-y]"),
999            Ok((
1000                "",
1001                Atom::NonmatchingList(vec![
1002                    BracketExpressionTerm::Single(']'),
1003                    BracketExpressionTerm::Range('a', 'y'),
1004                ])
1005            ))
1006        );
1007        assert_eq!(
1008            Atom::take("[^]+--]"),
1009            Ok((
1010                "",
1011                Atom::NonmatchingList(vec![
1012                    BracketExpressionTerm::Single(']'),
1013                    BracketExpressionTerm::Range('+', '-'),
1014                ])
1015            ))
1016        );
1017    }
1018
1019    #[test]
1020    fn parse_atom_err() {
1021        assert_eq!(Atom::take(")"), Err(RegexParseError::InvalidAtom(')')));
1022        assert_eq!(Atom::take(r"\"), Err(RegexParseError::InvalidAtom('\\')));
1023        assert_eq!(
1024            Atom::take(r"\a"),
1025            Err(RegexParseError::InvalidEscapedChar('a'))
1026        );
1027    }
1028
1029    #[test]
1030    fn atom_to_ranges_normal_char() {
1031        let (_, atom) = Atom::take("a").unwrap();
1032        assert_eq!(atom.to_ranges(), vec!['a'..='a']);
1033    }
1034
1035    #[test]
1036    fn atom_to_ranges_matching_list() {
1037        let (_, atom) = Atom::take("[a-z]").unwrap();
1038        assert_eq!(atom.to_ranges(), vec!['a'..='z']);
1039
1040        let (_, atom) = Atom::take("[0-45-9]").unwrap();
1041        assert_eq!(atom.to_ranges(), vec!['0'..='9']);
1042
1043        let (_, atom) = Atom::take("[5-90-4]").unwrap();
1044        assert_eq!(atom.to_ranges(), vec!['0'..='9']);
1045
1046        let (_, atom) = Atom::take("[0-46-89]").unwrap();
1047        assert_eq!(atom.to_ranges(), vec!['0'..='4', '6'..='9']);
1048
1049        let (_, atom) = Atom::take("[96-80-4]").unwrap();
1050        assert_eq!(atom.to_ranges(), vec!['0'..='4', '6'..='9']);
1051    }
1052
1053    #[test]
1054    fn atom_to_ranges_nonmatching_list() {
1055        let (_, atom) = Atom::take("[^b-y]").unwrap();
1056        assert_eq!(atom.to_ranges(), vec!['\0'..='a', 'z'..=char::MAX]);
1057
1058        let (_, atom) = Atom::take("[^1-45-8]").unwrap();
1059        assert_eq!(atom.to_ranges(), vec!['\0'..='0', '9'..=char::MAX]);
1060
1061        let (_, atom) = Atom::take("[^5-81-4]").unwrap();
1062        assert_eq!(atom.to_ranges(), vec!['\0'..='0', '9'..=char::MAX]);
1063
1064        let (_, atom) = Atom::take("[^1-486-7]").unwrap();
1065        assert_eq!(
1066            atom.to_ranges(),
1067            vec!['\0'..='0', '5'..='5', '9'..=char::MAX]
1068        );
1069
1070        let (_, atom) = Atom::take("[^86-71-4]").unwrap();
1071        assert_eq!(
1072            atom.to_ranges(),
1073            vec!['\0'..='0', '5'..='5', '9'..=char::MAX]
1074        );
1075    }
1076
1077    #[test]
1078    fn atom_eq() {
1079        assert_eq!(
1080            Atom::take("[a-z]").unwrap().1,
1081            Atom::take("[a-kl-z]").unwrap().1
1082        );
1083
1084        assert_eq!(
1085            Atom::take("[a-z]").unwrap().1,
1086            Atom::take("[l-za-k]").unwrap().1
1087        );
1088
1089        assert_eq!(Atom::take("a").unwrap().1, Atom::take("[a]").unwrap().1);
1090
1091        assert_eq!(
1092            Atom::take("[1-8]").unwrap().1,
1093            Atom::take("[^\0-09-\u{10ffff}]").unwrap().1
1094        );
1095
1096        assert_eq!(
1097            Atom::take("[[:upper:]]").unwrap().1,
1098            Atom::take("[A-Z]").unwrap().1
1099        );
1100
1101        assert_ne!(
1102            Atom::take("[a-z]").unwrap().1,
1103            Atom::take("[A-Z]").unwrap().1
1104        )
1105    }
1106}