ere_core/
parse_tree.rs

1//! Implements the [ERE](https://en.wikibooks.org/wiki/Regular_Expressions/POSIX-Extended_Regular_Expressions) parser
2//! and primitive types (like [`Atom`]).
3
4use std::{
5    fmt::{Display, Write},
6    ops::RangeInclusive,
7};
8
9use proc_macro2::TokenStream;
10use quote::{quote, ToTokens};
11
12/// On input string `text: &'a str` with options `{<prefix> => T, ..}`,
13/// it returns `Option<(&'a str, T)>` based on which literal prefix is matched
14///
15/// Example:
16/// ```
17/// let text = "asdf";
18/// let test = match_prefix!(text, {
19///     "fdsa" => 0,
20///     "qwerty" => 1,
21///     "as" => 2,
22///     "asd" => 3,
23/// });
24/// assert_eq!(test, Some("df", 2));
25/// ```
26macro_rules! match_prefix {
27    ($text:ident, { }) => (::core::option::Option::None);
28    ($text:ident, {
29        $x:literal => $y:expr,
30        $($xs:literal => $ys:expr,)*
31    }) => {
32        if let ::core::option::Option::Some(rest) = str::strip_prefix($text, $x) {
33            ::core::option::Option::Some((rest, $y))
34        } $(else if let ::core::option::Option::Some(rest) = str::strip_prefix($text, $xs) {
35            ::core::option::Option::Some((rest, $ys))
36        })* else {
37            ::core::option::Option::None
38        }
39    };
40}
41
42/// A represents a [POSIX-compliant ERE](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html).
43/// Primarily intended for use as a parser.
44pub struct ERE(pub(crate) Vec<EREBranch>);
45impl ERE {
46    fn take<'a>(rest: &'a str) -> Option<(&'a str, ERE)> {
47        let mut branches = Vec::new();
48        let (mut rest, branch) = EREBranch::take(rest)?;
49        branches.push(branch);
50        while let Some(new_rest) = rest.strip_prefix('|') {
51            rest = new_rest;
52            let Some((new_rest, branch)) = EREBranch::take(rest) else {
53                break;
54            };
55            rest = new_rest;
56            branches.push(branch);
57        }
58        return Some((rest, ERE(branches)));
59    }
60
61    pub fn parse_str(input: &str) -> Option<Self> {
62        let Some(("", ere)) = ERE::take(&input) else {
63            return None;
64        };
65        return Some(ere);
66    }
67}
68impl syn::parse::Parse for ERE {
69    fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
70        let literal: syn::LitStr = input.parse()?;
71        let string = literal.value();
72        return ERE::parse_str(&string).ok_or_else(|| {
73            syn::Error::new(
74                literal.span(),
75                "Failed to parse POSIX Extended Regex Expression",
76            )
77        });
78    }
79}
80impl Display for ERE {
81    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
82        let mut it = self.0.iter();
83        let Some(first) = it.next() else {
84            return Ok(());
85        };
86        write!(f, "{first}")?;
87        for part in it {
88            write!(f, "|{part}")?;
89        }
90        return Ok(());
91    }
92}
93
94pub(crate) struct EREBranch(pub(crate) Vec<EREPart>);
95impl EREBranch {
96    fn take<'a>(rest: &'a str) -> Option<(&'a str, EREBranch)> {
97        let mut parts = Vec::new();
98        let (mut rest, part) = EREPart::take(rest)?;
99        parts.push(part);
100        while let Some((new_rest, part)) = EREPart::take(rest) {
101            rest = new_rest;
102            parts.push(part);
103        }
104        return Some((rest, EREBranch(parts)));
105    }
106}
107impl Display for EREBranch {
108    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
109        for part in &self.0 {
110            write!(f, "{part}")?;
111        }
112        return Ok(());
113    }
114}
115
116pub(crate) enum EREPart {
117    Single(EREExpression),
118    Quantified(EREExpression, Quantifier),
119    Start,
120    End,
121}
122impl EREPart {
123    fn take<'a>(rest: &'a str) -> Option<(&'a str, EREPart)> {
124        if let Some(rest) = rest.strip_prefix('^') {
125            return Some((rest, EREPart::Start));
126        } else if let Some(rest) = rest.strip_prefix('$') {
127            return Some((rest, EREPart::End));
128        }
129
130        let (rest, expr) = if let Some(rest) = rest.strip_prefix('(') {
131            let (rest, ere) = ERE::take(rest)?;
132            let rest = rest.strip_prefix(')')?;
133            (rest, EREExpression::Subexpression(ere))
134        } else {
135            let (rest, atom) = Atom::take(rest)?;
136            (rest, EREExpression::Atom(atom))
137        };
138
139        let part = match Quantifier::take(rest) {
140            Some((rest, quantifier)) => (rest, EREPart::Quantified(expr, quantifier)),
141            None => (rest, EREPart::Single(expr)),
142        };
143        return Some(part);
144    }
145}
146impl Display for EREPart {
147    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148        return match self {
149            EREPart::Single(expr) => write!(f, "{expr}"),
150            EREPart::Quantified(expr, quantifier) => write!(f, "{expr}{quantifier}"),
151            EREPart::Start => f.write_char('^'),
152            EREPart::End => f.write_char('$'),
153        };
154    }
155}
156
157pub(crate) enum EREExpression {
158    Atom(Atom),
159    Subexpression(ERE),
160}
161impl Display for EREExpression {
162    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
163        return match self {
164            EREExpression::Atom(atom) => write!(f, "{atom}"),
165            EREExpression::Subexpression(ere) => write!(f, "({ere})"),
166        };
167    }
168}
169
170#[derive(Debug, PartialEq, Eq, Clone, Copy)]
171pub(crate) enum QuantifierType {
172    Star,
173    Plus,
174    QuestionMark,
175    /// The equivalent to a range specifier with fixed size
176    Multiple(u32),
177    /// `(u32, None)` is unbounded, `(u32, Some(u32))` is bounded
178    Range(u32, Option<u32>),
179}
180
181impl QuantifierType {
182    /// The minimum this quantifier matches, inclusive
183    #[inline]
184    const fn min(&self) -> u32 {
185        return match self {
186            QuantifierType::Star => 0,
187            QuantifierType::Plus => 1,
188            QuantifierType::QuestionMark => 0,
189            QuantifierType::Multiple(n) => *n,
190            QuantifierType::Range(n, _) => *n,
191        };
192    }
193    /// The maximum this quantifier matches, inclusive. If `None`, it is unbounded
194    #[inline]
195    const fn max(&self) -> Option<u32> {
196        return match self {
197            QuantifierType::Star => None,
198            QuantifierType::Plus => None,
199            QuantifierType::QuestionMark => Some(1),
200            QuantifierType::Multiple(n) => Some(*n),
201            QuantifierType::Range(_, m) => *m,
202        };
203    }
204}
205impl Display for QuantifierType {
206    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
207        return match self {
208            QuantifierType::Star => f.write_char('*'),
209            QuantifierType::Plus => f.write_char('+'),
210            QuantifierType::QuestionMark => f.write_char('?'),
211            QuantifierType::Multiple(n) => write!(f, "{{{n}}}"),
212            QuantifierType::Range(n, None) => write!(f, "{{{n},}}"),
213            QuantifierType::Range(n, Some(m)) => write!(f, "{{{n},{m}}}"),
214        };
215    }
216}
217
218#[derive(Debug, PartialEq, Eq, Clone, Copy)]
219pub(crate) struct Quantifier {
220    pub quantifier: QuantifierType,
221    /// By default, (unless the `REG_MINIMAL` flag is set), quantifiers should prefer the longest valid
222    /// match for quantifiers. However, if an additional `?` occurs after the quantifier, it should prefer the
223    /// shortest (or longest if `REG_MINIMAL` makes default shortest).
224    pub alt: bool,
225}
226impl Quantifier {
227    #[inline]
228    fn take<'a>(rest: &'a str) -> Option<(&'a str, Quantifier)> {
229        let mut it = rest.chars();
230        let (rest, quantifier) = match it.next() {
231            Some('*') => (it.as_str(), QuantifierType::Star),
232            Some('+') => (it.as_str(), QuantifierType::Plus),
233            Some('?') => (it.as_str(), QuantifierType::QuestionMark),
234            Some('{') => {
235                let (inside, rest) = it.as_str().split_once('}')?;
236                match inside.split_once(',') {
237                    None => (rest, QuantifierType::Multiple(inside.parse().ok()?)),
238                    Some((min, "")) => (rest, QuantifierType::Range(min.parse().ok()?, None)),
239                    Some((min, max)) => (
240                        rest,
241                        QuantifierType::Range(min.parse().ok()?, Some(max.parse().ok()?)),
242                    ),
243                }
244            }
245            _ => return None,
246        };
247        if let Some(rest) = rest.strip_prefix('?') {
248            return Some((
249                rest,
250                Quantifier {
251                    quantifier,
252                    alt: true,
253                },
254            ));
255        } else {
256            return Some((
257                rest,
258                Quantifier {
259                    quantifier,
260                    alt: false,
261                },
262            ));
263        }
264    }
265}
266impl Display for Quantifier {
267    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
268        self.quantifier.fmt(f)?;
269        if self.alt {
270            return f.write_char('?');
271        } else {
272            return Ok(());
273        }
274    }
275}
276impl From<QuantifierType> for Quantifier {
277    fn from(quantifier: QuantifierType) -> Self {
278        return Quantifier {
279            quantifier,
280            alt: false,
281        };
282    }
283}
284
285#[derive(Debug, PartialEq, Eq, Clone, Copy)]
286pub enum CharClass {
287    /// Matches anything but `NUL` (`'\0'`)
288    Dot,
289}
290impl CharClass {
291    pub const fn check(&self, c: char) -> bool {
292        return match self {
293            CharClass::Dot => c != '\0',
294        };
295    }
296}
297impl Display for CharClass {
298    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
299        return match self {
300            CharClass::Dot => f.write_char('.'),
301        };
302    }
303}
304impl ToTokens for CharClass {
305    fn to_tokens(&self, tokens: &mut TokenStream) {
306        match self {
307            CharClass::Dot => tokens.extend(quote! {::ere_core::parse_tree::CharClass::Dot}),
308        }
309    }
310}
311
312/// Represents a part of an [`ERE`] that matches a single character.
313/// For example, a single char `a`, a char class `.`, or a bracket expression `[a-z]`.
314///
315/// Equality checks are semantic:
316/// ```
317/// use ere_core::parse_tree::Atom;
318/// assert_eq!(
319///     "[abcd]".parse::<Atom>(),
320///     "[a-d]".parse::<Atom>(),
321/// );
322/// assert_eq!(
323///     "[a-z]".parse::<Atom>(),
324///     "[[:lower:]]".parse::<Atom>(),
325/// );
326/// ```
327#[derive(Debug, Clone)]
328pub enum Atom {
329    /// Includes normal char and escaped chars
330    NormalChar(char),
331    CharClass(CharClass),
332    /// A matching bracket expression
333    MatchingList(Vec<BracketExpressionTerm>),
334    /// A nonmatching bracket expression
335    NonmatchingList(Vec<BracketExpressionTerm>),
336}
337impl From<char> for Atom {
338    fn from(value: char) -> Self {
339        return Atom::NormalChar(value);
340    }
341}
342impl From<CharClass> for Atom {
343    fn from(value: CharClass) -> Self {
344        return Atom::CharClass(value);
345    }
346}
347impl std::str::FromStr for Atom {
348    type Err = ();
349
350    fn from_str(s: &str) -> Result<Self, Self::Err> {
351        let Some(("", atom)) = Atom::take(s) else {
352            return Err(());
353        };
354        return Ok(atom);
355    }
356}
357
358impl Atom {
359    fn take<'a>(rest: &'a str) -> Option<(&'a str, Atom)> {
360        let mut it = rest.chars();
361        match it.next() {
362            Some('\\') => match it.next() {
363                Some(c) if is_escapable_character(c) => {
364                    return Some((it.as_str(), Atom::NormalChar(c)))
365                }
366                _ => return None,
367            },
368            Some('.') => return Some((it.as_str(), CharClass::Dot.into())),
369            Some('[') => {
370                let mut rest = it.as_str();
371                let mut items = Vec::new();
372                let none_of = if let Some(new_rest) = rest.strip_prefix('^') {
373                    rest = new_rest;
374                    true
375                } else {
376                    false
377                };
378                if let Some(new_rest) = rest.strip_prefix(']') {
379                    rest = new_rest;
380                    items.push(BracketExpressionTerm::Single(']'));
381                }
382                loop {
383                    if let Some(new_rest) = rest.strip_prefix(']') {
384                        // End of the bracket expression
385                        rest = new_rest;
386                        break;
387                    } else if let Some((new_rest, class)) = BracketCharClass::take(rest) {
388                        // A bracket char class
389                        rest = new_rest;
390                        items.push(BracketExpressionTerm::CharClass(class));
391                    } else {
392                        // Normal
393                        let mut it = rest.chars();
394                        let first = it.next()?;
395                        rest = it.as_str();
396                        if let '-' = it.next()? {
397                            let second = it.next()?;
398                            rest = it.as_str();
399                            if second == ']' {
400                                // it's just two characters at the end
401                                items.push(BracketExpressionTerm::Single(first));
402                                items.push(BracketExpressionTerm::Single('-'));
403                                break;
404                            } else {
405                                // it's a range
406                                items.push(BracketExpressionTerm::Range(first, second));
407                            }
408                        } else {
409                            items.push(BracketExpressionTerm::Single(first));
410                        }
411                    }
412                }
413
414                if none_of {
415                    return Some((rest, Atom::NonmatchingList(items)));
416                } else {
417                    return Some((rest, Atom::MatchingList(items)));
418                }
419            }
420            Some(c) if !is_special_character(c) => return Some((it.as_str(), Atom::NormalChar(c))),
421            None | Some(_) => return None,
422        }
423    }
424    pub fn check(&self, c: char) -> bool {
425        return match self {
426            Atom::NormalChar(a) => *a == c,
427            Atom::CharClass(char_class) => char_class.check(c),
428            Atom::MatchingList(vec) => vec.into_iter().any(|b| b.check(c)),
429            Atom::NonmatchingList(vec) => !vec.into_iter().any(|b| b.check(c)),
430        };
431    }
432    /// Produces the sorted, minimal set of ranges to represent the Atom.
433    ///
434    /// Example:
435    /// ```
436    /// use ere_core::parse_tree::Atom;
437    /// assert_eq!(
438    ///     "[a-z2-9A-X0-1YZ[:xdigit:]]".parse::<Atom>().unwrap().to_ranges(),
439    ///     vec!['0'..='9', 'A'..='Z', 'a'..='z'],
440    /// );
441    /// ```
442    pub fn to_ranges(&self) -> Vec<RangeInclusive<char>> {
443        /// Sorts and combines ranges
444        fn combine_ranges_chars(
445            mut ranges: Vec<RangeInclusive<char>>,
446        ) -> Vec<RangeInclusive<char>> {
447            ranges.sort_by_key(|range| *range.start());
448            let Some((first_range, terms)) = ranges.split_first_chunk::<1>() else {
449                return Vec::new();
450            };
451            let mut reduced_terms = Vec::new();
452
453            let mut current_start = *first_range[0].start();
454            let mut current_end = *first_range[0].end();
455            for term in terms {
456                if term.is_empty() {
457                    continue;
458                } else if *term.start() <= current_end || (current_end..=*term.start()).count() == 2
459                {
460                    // the next term is overlapping (or starts immediately after) so combine them.
461                    current_end = std::cmp::max(current_end, *term.end());
462                } else {
463                    reduced_terms.push(current_start.clone()..=current_end.clone());
464                    current_start = *term.start();
465                    current_end = *term.end();
466                }
467            }
468            reduced_terms.push(current_start.clone()..=current_end.clone());
469            return reduced_terms;
470        }
471
472        return match self {
473            Atom::NormalChar(c) => vec![*c..=*c],
474            Atom::CharClass(CharClass::Dot) => vec!['\x01'..=char::MAX],
475            Atom::MatchingList(_terms) => {
476                let mut terms = Vec::new();
477                for term in _terms {
478                    match term {
479                        crate::parse_tree::BracketExpressionTerm::Single(c) => {
480                            terms.push(*c..=*c);
481                        }
482                        crate::parse_tree::BracketExpressionTerm::Range(a, b) => {
483                            terms.push(*a..=*b);
484                        }
485                        crate::parse_tree::BracketExpressionTerm::CharClass(class) => {
486                            terms.extend_from_slice(class.to_ranges());
487                        }
488                    }
489                }
490                combine_ranges_chars(terms)
491            }
492            Atom::NonmatchingList(terms) => {
493                let mut ranges = Vec::new();
494                ranges.push('\0'..=char::MAX);
495                for term in terms.iter().flat_map(|term| term.to_ranges()) {
496                    ranges = ranges
497                        .iter()
498                        .flat_map(|range| {
499                            if term.end() < range.start() || range.end() < term.start() {
500                                return vec![range.clone()]; // unchanged
501                            }
502                            let mut out = Vec::new();
503                            if range.start() < term.start() {
504                                let mut new_range = *range.start()..=*term.start();
505                                new_range.next_back();
506                                out.push(new_range); // part at start
507                            }
508                            if term.end() < range.end() {
509                                let mut new_range = *term.end()..=*range.end();
510                                new_range.next();
511                                out.push(new_range); // part at end
512                            }
513                            return out;
514                        })
515                        .collect();
516                }
517
518                ranges.sort_by_key(|range: &RangeInclusive<char>| *range.start());
519                ranges
520            }
521        };
522    }
523}
524impl Display for Atom {
525    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
526        return match self {
527            Atom::NormalChar(c) if is_escapable_character(*c) => write!(f, "\\{c}"),
528            Atom::NormalChar(c) => f.write_char(*c),
529            Atom::CharClass(c) => c.fmt(f),
530            Atom::MatchingList(vec) => {
531                f.write_char('[')?;
532                for term in vec {
533                    write!(f, "{term}")?;
534                }
535                f.write_char(']')
536            }
537            Atom::NonmatchingList(vec) => {
538                f.write_str("[^")?;
539                for term in vec {
540                    write!(f, "{term}")?;
541                }
542                f.write_char(']')
543            }
544        };
545    }
546}
547impl PartialEq for Atom {
548    fn eq(&self, other: &Self) -> bool {
549        return self.to_ranges() == other.to_ranges();
550    }
551}
552impl Eq for Atom {}
553
554/// From <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap09.html#tag_09_03_05>
555#[derive(Debug, PartialEq, Eq, Clone, Copy)]
556pub enum BracketCharClass {
557    /// `[:alnum:]`
558    Alphanumeric,
559    /// `[:cntrl:]`
560    Control,
561    /// `[:lower:]`
562    Lower,
563    /// `[:space:]`
564    Space,
565    /// `[:alpha:]`
566    Alphabet,
567    /// `[:digit:]`
568    Digit,
569    /// `[:print:]`
570    Print,
571    /// `[:upper:]`
572    Upper,
573    /// `[:blank:]`
574    Blank,
575    /// `[:graph:]`
576    Graphic,
577    /// `[:punct:]`
578    Punctuation,
579    /// `[:xdigit:]`
580    HexDigit,
581}
582impl BracketCharClass {
583    pub const fn to_ranges(&self) -> &'static [RangeInclusive<char>] {
584        return match self {
585            BracketCharClass::Alphanumeric => &['0'..='9', 'A'..='Z', 'a'..='z'],
586            BracketCharClass::Control => &['\0'..='\x1f', '\x7f'..='\x7f'],
587            BracketCharClass::Lower => &['a'..='z'],
588            BracketCharClass::Space => &['\t'..='\t', '\x0a'..='\x0d', ' '..=' '],
589            BracketCharClass::Alphabet => &['A'..='Z', 'a'..='z'],
590            BracketCharClass::Digit => &['0'..='9'],
591            BracketCharClass::Print => &['\x20'..='\x7E'],
592            BracketCharClass::Upper => &['A'..='Z'],
593            BracketCharClass::Blank => &['\t'..='\t', ' '..=' '],
594            BracketCharClass::Graphic => &['\x21'..='\x7e'],
595            BracketCharClass::Punctuation => &[
596                '\x21'..='\x2f',
597                '\x3a'..='\x40',
598                '\x5b'..='\x60',
599                '\x7b'..='\x7e',
600            ],
601            BracketCharClass::HexDigit => &['0'..='9', 'A'..='F', 'a'..='f'],
602        };
603    }
604    /// Checks matches to the char classes.
605    pub const fn check_ascii(&self, c: char) -> bool {
606        return match self {
607            BracketCharClass::Alphanumeric => c.is_ascii_alphanumeric(),
608            BracketCharClass::Control => c.is_ascii_control(),
609            BracketCharClass::Lower => c.is_ascii_lowercase(),
610            BracketCharClass::Space => c.is_ascii_whitespace() || c == '\x0b', // POSIX includes vertical tab
611            BracketCharClass::Alphabet => c.is_ascii_alphabetic(),
612            BracketCharClass::Digit => c.is_ascii_digit(),
613            BracketCharClass::Print => matches!(c, '\x20'..='\x7E'),
614            BracketCharClass::Upper => c.is_ascii_uppercase(),
615            BracketCharClass::Blank => c == ' ' || c == '\t',
616            BracketCharClass::Graphic => c.is_ascii_graphic(),
617            BracketCharClass::Punctuation => c.is_ascii_punctuation(),
618            BracketCharClass::HexDigit => c.is_ascii_hexdigit(),
619        };
620    }
621    fn take<'a>(rest: &'a str) -> Option<(&'a str, BracketCharClass)> {
622        let rest = rest.strip_prefix("[:")?;
623        return match_prefix!(rest, {
624            "alnum:]" => BracketCharClass::Alphanumeric,
625            "cntrl:]" => BracketCharClass::Control,
626            "lower:]" => BracketCharClass::Lower,
627            "alpha:]" => BracketCharClass::Alphabet,
628            "digit:]" => BracketCharClass::Digit,
629            "print:]" => BracketCharClass::Print,
630            "upper:]" => BracketCharClass::Upper,
631            "blank:]" => BracketCharClass::Blank,
632            "graph:]" => BracketCharClass::Graphic,
633            "punct:]" => BracketCharClass::Punctuation,
634            "xdigit:]" => BracketCharClass::HexDigit,
635        });
636    }
637}
638impl Display for BracketCharClass {
639    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
640        return match self {
641            BracketCharClass::Alphanumeric => f.write_str("[:alnum:]"),
642            BracketCharClass::Control => f.write_str("[:cntrl:]"),
643            BracketCharClass::Lower => f.write_str("[:lower:]"),
644            BracketCharClass::Space => f.write_str("[:space:]"),
645            BracketCharClass::Alphabet => f.write_str("[:alpha:]"),
646            BracketCharClass::Digit => f.write_str("[:digit:]"),
647            BracketCharClass::Print => f.write_str("[:print:]"),
648            BracketCharClass::Upper => f.write_str("[:upper:]"),
649            BracketCharClass::Blank => f.write_str("[:blank:]"),
650            BracketCharClass::Graphic => f.write_str("[:graph:]"),
651            BracketCharClass::Punctuation => f.write_str("[:punct:]"),
652            BracketCharClass::HexDigit => f.write_str("[:xdigit:]"),
653        };
654    }
655}
656impl ToTokens for BracketCharClass {
657    fn to_tokens(&self, tokens: &mut TokenStream) {
658        match self {
659            BracketCharClass::Alphanumeric => {
660                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Alphanumeric})
661            }
662            BracketCharClass::Control => {
663                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Control})
664            }
665            BracketCharClass::Lower => {
666                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Lower})
667            }
668            BracketCharClass::Space => {
669                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Space})
670            }
671            BracketCharClass::Alphabet => {
672                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Alphabet})
673            }
674            BracketCharClass::Digit => {
675                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Digit})
676            }
677            BracketCharClass::Print => {
678                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Print})
679            }
680            BracketCharClass::Upper => {
681                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Upper})
682            }
683            BracketCharClass::Blank => {
684                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Blank})
685            }
686            BracketCharClass::Graphic => {
687                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Graph})
688            }
689            BracketCharClass::Punctuation => {
690                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::Punctuation})
691            }
692            BracketCharClass::HexDigit => {
693                tokens.extend(quote! {::ere_core::parse_tree::BracketCharClass::XDigit})
694            }
695        }
696    }
697}
698
699#[derive(Debug, PartialEq, Eq, Clone, Copy)]
700pub enum BracketExpressionTerm {
701    Single(char),
702    Range(char, char),
703    CharClass(BracketCharClass),
704}
705impl BracketExpressionTerm {
706    pub const fn check(&self, c: char) -> bool {
707        return match self {
708            BracketExpressionTerm::Single(a) => *a == c,
709            BracketExpressionTerm::Range(a, b) => *a <= c && c <= *b,
710            BracketExpressionTerm::CharClass(class) => class.check_ascii(c),
711        };
712    }
713    pub(crate) fn to_ranges(&self) -> Vec<RangeInclusive<char>> {
714        return match self {
715            BracketExpressionTerm::Single(c) => vec![*c..=*c],
716            BracketExpressionTerm::Range(a, b) => vec![*a..=*b],
717            BracketExpressionTerm::CharClass(bracket_char_class) => {
718                bracket_char_class.to_ranges().to_vec()
719            }
720        };
721    }
722}
723impl Display for BracketExpressionTerm {
724    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
725        return match self {
726            BracketExpressionTerm::Single(c) => f.write_char(*c),
727            BracketExpressionTerm::Range(first, second) => write!(f, "{first}-{second}"),
728            BracketExpressionTerm::CharClass(class) => class.fmt(f),
729        };
730    }
731}
732impl From<char> for BracketExpressionTerm {
733    fn from(value: char) -> Self {
734        return BracketExpressionTerm::Single(value);
735    }
736}
737impl From<BracketCharClass> for BracketExpressionTerm {
738    fn from(value: BracketCharClass) -> Self {
739        return BracketExpressionTerm::CharClass(value);
740    }
741}
742impl ToTokens for BracketExpressionTerm {
743    fn to_tokens(&self, tokens: &mut TokenStream) {
744        tokens.extend(match self {
745            BracketExpressionTerm::Single(c) => quote! {
746                ::ere_core::parse_tree::BracketExpressionTerm::Single(#c)
747            },
748            BracketExpressionTerm::Range(a, z) => quote! {
749                ::ere_core::parse_tree::BracketExpressionTerm::Range(#a, #z)
750            },
751            BracketExpressionTerm::CharClass(char_class) => quote! {
752                ::ere_core::parse_tree::BracketExpressionTerm::CharClass(#char_class)
753            },
754        });
755    }
756}
757
758/// The characters that can only occur if quoted
759#[inline]
760const fn is_special_character(c: char) -> bool {
761    return c == '^'
762        || c == '.'
763        || c == '['
764        || c == '$'
765        || c == '('
766        || c == ')'
767        || c == '|'
768        || c == '*'
769        || c == '+'
770        || c == '?'
771        || c == '{'
772        || c == '\\';
773}
774
775/// The characters that can only occur if quoted
776#[inline]
777pub(crate) const fn is_escapable_character(c: char) -> bool {
778    return is_special_character(c) || c == ']' || c == '}';
779}
780
781#[cfg(test)]
782mod tests {
783    use super::*;
784
785    #[test]
786    fn reconstruction() {
787        fn test_reconstruction(text: &str) {
788            let (rest, ere) = ERE::take(text).unwrap();
789            assert!(rest.is_empty(), "{text} did not get used (left {rest})");
790            let reconstructed = ere.to_string();
791            assert_eq!(text, &reconstructed);
792        }
793        test_reconstruction("asdf");
794        test_reconstruction("as+df123$");
795        test_reconstruction("asd?f*123");
796        test_reconstruction("^asdf123.*");
797
798        test_reconstruction("a[|]");
799        test_reconstruction("[^]a-z1-4A-X-]asdf");
800        test_reconstruction("cd[X-]er");
801
802        test_reconstruction("my word is [[:alnum:]_]");
803        test_reconstruction("my word is [[:lower:][:digit:]_]");
804
805        test_reconstruction("a|b");
806        test_reconstruction("a|b|c");
807        test_reconstruction("a(a(b|c)|d)|(g|f)b");
808        test_reconstruction("(a|b)|(c|d){3}");
809
810        test_reconstruction("a[y-z]{1,3}");
811        test_reconstruction("a{3,}");
812        test_reconstruction("a(efg){1,}");
813    }
814
815    #[test]
816    fn parse_quantifiers() {
817        assert_eq!(Quantifier::take(""), None);
818        assert_eq!(
819            Quantifier::take("+asdf"),
820            Some(("asdf", QuantifierType::Plus.into()))
821        );
822        assert_eq!(
823            Quantifier::take("*"),
824            Some(("", QuantifierType::Star.into()))
825        );
826        assert_eq!(Quantifier::take("e?"), None);
827        assert_eq!(Quantifier::take("{"), None);
828        assert_eq!(Quantifier::take("{}"), None);
829        assert_eq!(
830            Quantifier::take("{1}"),
831            Some(("", QuantifierType::Multiple(1).into()))
832        );
833        assert_eq!(
834            Quantifier::take("{9}ee"),
835            Some(("ee", QuantifierType::Multiple(9).into()))
836        );
837        assert_eq!(
838            Quantifier::take("{10,}ee"),
839            Some(("ee", QuantifierType::Range(10, None).into()))
840        );
841        assert_eq!(
842            Quantifier::take("{0,11}ef"),
843            Some(("ef", QuantifierType::Range(0, Some(11)).into()))
844        );
845        assert_eq!(Quantifier::take("{0,e11}ef"), None);
846        assert_eq!(Quantifier::take("{0;11}ef"), None);
847    }
848
849    #[test]
850    fn parse_atom_simple() {
851        assert_eq!(Atom::take("a"), Some(("", Atom::NormalChar('a'))));
852        assert_eq!(Atom::take(r"abcd"), Some(("bcd", Atom::NormalChar('a'))));
853        assert_eq!(Atom::take(r"\\"), Some(("", Atom::NormalChar('\\'))));
854        assert_eq!(
855            Atom::take(r"\[asdf\]"),
856            Some((r"asdf\]", Atom::NormalChar('[')))
857        );
858        assert_eq!(Atom::take(r"\."), Some(("", Atom::NormalChar('.'))));
859        assert_eq!(Atom::take(r" "), Some(("", Atom::NormalChar(' '))));
860        assert_eq!(Atom::take(r"\"), None);
861
862        assert_eq!(Atom::take("."), Some(("", Atom::CharClass(CharClass::Dot))));
863        assert_eq!(
864            Atom::take(".."),
865            Some((".", Atom::CharClass(CharClass::Dot)))
866        );
867    }
868
869    #[test]
870    fn parse_atom_brackets() {
871        assert_eq!(
872            Atom::take("[ab]"),
873            Some((
874                "",
875                Atom::MatchingList(vec![
876                    BracketExpressionTerm::Single('a'),
877                    BracketExpressionTerm::Single('b'),
878                ])
879            ))
880        );
881        assert_eq!(
882            Atom::take("[]ab]"),
883            Some((
884                "",
885                Atom::MatchingList(vec![
886                    BracketExpressionTerm::Single(']'),
887                    BracketExpressionTerm::Single('a'),
888                    BracketExpressionTerm::Single('b'),
889                ])
890            ))
891        );
892        assert_eq!(
893            Atom::take("[]ab-]"),
894            Some((
895                "",
896                Atom::MatchingList(vec![
897                    BracketExpressionTerm::Single(']'),
898                    BracketExpressionTerm::Single('a'),
899                    BracketExpressionTerm::Single('b'),
900                    BracketExpressionTerm::Single('-'),
901                ])
902            ))
903        );
904
905        assert_eq!(
906            Atom::take("[]a-y]"),
907            Some((
908                "",
909                Atom::MatchingList(vec![
910                    BracketExpressionTerm::Single(']'),
911                    BracketExpressionTerm::Range('a', 'y'),
912                ])
913            ))
914        );
915        assert_eq!(
916            Atom::take("[]+--]"),
917            Some((
918                "",
919                Atom::MatchingList(vec![
920                    BracketExpressionTerm::Single(']'),
921                    BracketExpressionTerm::Range('+', '-'),
922                ])
923            ))
924        );
925
926        assert_eq!(
927            Atom::take("[^]a-y]"),
928            Some((
929                "",
930                Atom::NonmatchingList(vec![
931                    BracketExpressionTerm::Single(']'),
932                    BracketExpressionTerm::Range('a', 'y'),
933                ])
934            ))
935        );
936        assert_eq!(
937            Atom::take("[^]+--]"),
938            Some((
939                "",
940                Atom::NonmatchingList(vec![
941                    BracketExpressionTerm::Single(']'),
942                    BracketExpressionTerm::Range('+', '-'),
943                ])
944            ))
945        );
946    }
947
948    #[test]
949    fn atom_to_ranges_normal_char() {
950        let (_, atom) = Atom::take("a").unwrap();
951        assert_eq!(atom.to_ranges(), vec!['a'..='a']);
952    }
953
954    #[test]
955    fn atom_to_ranges_matching_list() {
956        let (_, atom) = Atom::take("[a-z]").unwrap();
957        assert_eq!(atom.to_ranges(), vec!['a'..='z']);
958
959        let (_, atom) = Atom::take("[0-45-9]").unwrap();
960        assert_eq!(atom.to_ranges(), vec!['0'..='9']);
961
962        let (_, atom) = Atom::take("[5-90-4]").unwrap();
963        assert_eq!(atom.to_ranges(), vec!['0'..='9']);
964
965        let (_, atom) = Atom::take("[0-46-89]").unwrap();
966        assert_eq!(atom.to_ranges(), vec!['0'..='4', '6'..='9']);
967
968        let (_, atom) = Atom::take("[96-80-4]").unwrap();
969        assert_eq!(atom.to_ranges(), vec!['0'..='4', '6'..='9']);
970    }
971
972    #[test]
973    fn atom_to_ranges_nonmatching_list() {
974        let (_, atom) = Atom::take("[^b-y]").unwrap();
975        assert_eq!(atom.to_ranges(), vec!['\0'..='a', 'z'..=char::MAX]);
976
977        let (_, atom) = Atom::take("[^1-45-8]").unwrap();
978        assert_eq!(atom.to_ranges(), vec!['\0'..='0', '9'..=char::MAX]);
979
980        let (_, atom) = Atom::take("[^5-81-4]").unwrap();
981        assert_eq!(atom.to_ranges(), vec!['\0'..='0', '9'..=char::MAX]);
982
983        let (_, atom) = Atom::take("[^1-486-7]").unwrap();
984        assert_eq!(
985            atom.to_ranges(),
986            vec!['\0'..='0', '5'..='5', '9'..=char::MAX]
987        );
988
989        let (_, atom) = Atom::take("[^86-71-4]").unwrap();
990        assert_eq!(
991            atom.to_ranges(),
992            vec!['\0'..='0', '5'..='5', '9'..=char::MAX]
993        );
994    }
995
996    #[test]
997    fn atom_eq() {
998        assert_eq!(
999            Atom::take("[a-z]").unwrap().1,
1000            Atom::take("[a-kl-z]").unwrap().1
1001        );
1002
1003        assert_eq!(
1004            Atom::take("[a-z]").unwrap().1,
1005            Atom::take("[l-za-k]").unwrap().1
1006        );
1007
1008        assert_eq!(Atom::take("a").unwrap().1, Atom::take("[a]").unwrap().1);
1009
1010        assert_eq!(
1011            Atom::take("[1-8]").unwrap().1,
1012            Atom::take("[^\0-09-\u{10ffff}]").unwrap().1
1013        );
1014
1015        assert_eq!(
1016            Atom::take("[[:upper:]]").unwrap().1,
1017            Atom::take("[A-Z]").unwrap().1
1018        );
1019
1020        assert_ne!(
1021            Atom::take("[a-z]").unwrap().1,
1022            Atom::take("[A-Z]").unwrap().1
1023        )
1024    }
1025}