ere_core/
parse_tree.rs

1use std::fmt::{Display, Write};
2
3use proc_macro2::TokenStream;
4use quote::{quote, ToTokens};
5
6/// A represents a [POSIX-compliant ERE](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html).
7/// Primarily intended for use as a parser.
8pub struct ERE(pub(crate) Vec<EREBranch>);
9impl ERE {
10    fn take<'a>(rest: &'a str) -> Option<(&'a str, ERE)> {
11        let mut branches = Vec::new();
12        let (mut rest, branch) = EREBranch::take(rest)?;
13        branches.push(branch);
14        while let Some(new_rest) = rest.strip_prefix('|') {
15            rest = new_rest;
16            let Some((new_rest, branch)) = EREBranch::take(rest) else {
17                break;
18            };
19            rest = new_rest;
20            branches.push(branch);
21        }
22        return Some((rest, ERE(branches)));
23    }
24
25    pub fn parse_str(input: &str) -> Option<Self> {
26        let Some(("", ere)) = ERE::take(&input) else {
27            return None;
28        };
29        return Some(ere);
30    }
31}
32impl syn::parse::Parse for ERE {
33    fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
34        let literal: syn::LitStr = input.parse()?;
35        let string = literal.value();
36        return ERE::parse_str(&string).ok_or_else(|| {
37            syn::Error::new(
38                literal.span(),
39                "Failed to parse POSIX Extended Regex Expression",
40            )
41        });
42    }
43}
44impl Display for ERE {
45    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
46        let mut it = self.0.iter();
47        let Some(first) = it.next() else {
48            return Ok(());
49        };
50        write!(f, "{first}")?;
51        for part in it {
52            write!(f, "|{part}")?;
53        }
54        return Ok(());
55    }
56}
57
58pub(crate) struct EREBranch(pub(crate) Vec<EREPart>);
59impl EREBranch {
60    fn take<'a>(rest: &'a str) -> Option<(&'a str, EREBranch)> {
61        let mut parts = Vec::new();
62        let (mut rest, part) = EREPart::take(rest)?;
63        parts.push(part);
64        while let Some((new_rest, part)) = EREPart::take(rest) {
65            rest = new_rest;
66            parts.push(part);
67        }
68        return Some((rest, EREBranch(parts)));
69    }
70}
71impl Display for EREBranch {
72    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
73        for part in &self.0 {
74            write!(f, "{part}")?;
75        }
76        return Ok(());
77    }
78}
79
80pub(crate) enum EREPart {
81    Single(EREExpression),
82    Quantified(EREExpression, Quantifier),
83    Start,
84    End,
85}
86impl EREPart {
87    fn take<'a>(rest: &'a str) -> Option<(&'a str, EREPart)> {
88        if let Some(rest) = rest.strip_prefix('^') {
89            return Some((rest, EREPart::Start));
90        } else if let Some(rest) = rest.strip_prefix('$') {
91            return Some((rest, EREPart::End));
92        }
93
94        let (rest, expr) = if let Some(rest) = rest.strip_prefix('(') {
95            let (rest, ere) = ERE::take(rest)?;
96            let rest = rest.strip_prefix(')')?;
97            (rest, EREExpression::Subexpression(ere))
98        } else {
99            let (rest, atom) = Atom::take(rest)?;
100            (rest, EREExpression::Atom(atom))
101        };
102
103        let part = match Quantifier::take(rest) {
104            Some((rest, quantifier)) => (rest, EREPart::Quantified(expr, quantifier)),
105            None => (rest, EREPart::Single(expr)),
106        };
107        return Some(part);
108    }
109}
110impl Display for EREPart {
111    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
112        return match self {
113            EREPart::Single(expr) => write!(f, "{expr}"),
114            EREPart::Quantified(expr, quantifier) => write!(f, "{expr}{quantifier}"),
115            EREPart::Start => f.write_char('^'),
116            EREPart::End => f.write_char('$'),
117        };
118    }
119}
120
121pub(crate) enum EREExpression {
122    Atom(Atom),
123    Subexpression(ERE),
124}
125impl Display for EREExpression {
126    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
127        return match self {
128            EREExpression::Atom(atom) => write!(f, "{atom}"),
129            EREExpression::Subexpression(ere) => write!(f, "({ere})"),
130        };
131    }
132}
133
134#[derive(Debug, PartialEq, Eq, Clone, Copy)]
135pub(crate) enum Quantifier {
136    Star,
137    Plus,
138    QuestionMark,
139    /// The equivalent to a range specifier with fixed size
140    Multiple(u32),
141    /// `(u32, None)` is unbounded, `(u32, Some(u32))` is bounded
142    Range(u32, Option<u32>),
143}
144
145impl Quantifier {
146    /// The minimum this quantifier matches, inclusive
147    #[inline]
148    const fn min(&self) -> u32 {
149        return match self {
150            Quantifier::Star => 0,
151            Quantifier::Plus => 1,
152            Quantifier::QuestionMark => 0,
153            Quantifier::Multiple(n) => *n,
154            Quantifier::Range(n, _) => *n,
155        };
156    }
157    /// The maximum this quantifier matches, inclusive. If `None`, it is unbounded
158    #[inline]
159    const fn max(&self) -> Option<u32> {
160        return match self {
161            Quantifier::Star => None,
162            Quantifier::Plus => None,
163            Quantifier::QuestionMark => Some(1),
164            Quantifier::Multiple(n) => Some(*n),
165            Quantifier::Range(_, m) => *m,
166        };
167    }
168
169    #[inline]
170    fn take<'a>(rest: &'a str) -> Option<(&'a str, Quantifier)> {
171        let mut it = rest.chars();
172        match it.next() {
173            Some('*') => return Some((it.as_str(), Quantifier::Star)),
174            Some('+') => return Some((it.as_str(), Quantifier::Plus)),
175            Some('?') => return Some((it.as_str(), Quantifier::QuestionMark)),
176            Some('{') => {
177                let (inside, rest) = it.as_str().split_once('}')?;
178                match inside.split_once(',') {
179                    None => return Some((rest, Quantifier::Multiple(inside.parse().ok()?))),
180                    Some((min, "")) => {
181                        return Some((rest, Quantifier::Range(min.parse().ok()?, None)))
182                    }
183                    Some((min, max)) => {
184                        return Some((
185                            rest,
186                            Quantifier::Range(min.parse().ok()?, Some(max.parse().ok()?)),
187                        ))
188                    }
189                }
190            }
191            _ => return None,
192        }
193    }
194}
195impl Display for Quantifier {
196    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
197        return match self {
198            Quantifier::Star => f.write_char('*'),
199            Quantifier::Plus => f.write_char('+'),
200            Quantifier::QuestionMark => f.write_char('?'),
201            Quantifier::Multiple(n) => write!(f, "{{{n}}}"),
202            Quantifier::Range(n, None) => write!(f, "{{{n},}}"),
203            Quantifier::Range(n, Some(m)) => write!(f, "{{{n},{m}}}"),
204        };
205    }
206}
207
208#[derive(Debug, PartialEq, Eq, Clone, Copy)]
209pub enum CharClass {
210    Dot,
211}
212impl CharClass {
213    pub const fn check(&self, c: char) -> bool {
214        return match self {
215            CharClass::Dot => c != '\0',
216        };
217    }
218}
219impl Display for CharClass {
220    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
221        return match self {
222            CharClass::Dot => f.write_char('.'),
223        };
224    }
225}
226impl ToTokens for CharClass {
227    fn to_tokens(&self, tokens: &mut TokenStream) {
228        match self {
229            CharClass::Dot => tokens.extend(quote! {::ere_core::parse_tree::CharClass::Dot}),
230        }
231    }
232}
233
234#[derive(Debug, PartialEq, Eq, Clone)]
235pub enum Atom {
236    /// Includes normal char and escaped chars
237    NormalChar(char),
238    CharClass(CharClass), // TODO
239    /// A matching bracket expression
240    MatchingList(Vec<BracketExpressionTerm>),
241    /// A nonmatching bracket expression
242    NonmatchingList(Vec<BracketExpressionTerm>),
243}
244impl From<char> for Atom {
245    fn from(value: char) -> Self {
246        return Atom::NormalChar(value);
247    }
248}
249impl From<CharClass> for Atom {
250    fn from(value: CharClass) -> Self {
251        return Atom::CharClass(value);
252    }
253}
254
255impl Atom {
256    fn take<'a>(rest: &'a str) -> Option<(&'a str, Atom)> {
257        let mut it = rest.chars();
258        match it.next() {
259            Some('\\') => match it.next() {
260                Some(c) if is_escapable_character(c) => {
261                    return Some((it.as_str(), Atom::NormalChar(c)))
262                }
263                _ => return None,
264            },
265            Some('.') => return Some((it.as_str(), CharClass::Dot.into())),
266            Some('[') => {
267                let rest = it.as_str();
268                let (rest, end_index, none_of) = if rest.starts_with(']') {
269                    (rest, rest.match_indices(']').nth(1)?.0, false)
270                } else if let Some(rest) = rest.strip_prefix('^') {
271                    if rest.starts_with(']') {
272                        (rest, rest.match_indices(']').nth(1)?.0, true)
273                    } else {
274                        (rest, rest.find(']')?, true)
275                    }
276                } else {
277                    (rest, rest.find(']')?, false)
278                };
279
280                let inside = &rest[..end_index];
281                let rest = &rest[end_index + 1..];
282                let mut it = inside.chars();
283                let mut items = Vec::new();
284
285                // TODO: There's some strangeness with when `-` is allowed. We should handle that.
286                while let Some(first) = it.next() {
287                    match it.clone().next() {
288                        Some('-') => {
289                            it.next();
290                            if let Some(second) = it.next() {
291                                items.push(BracketExpressionTerm::Range(first, second));
292                            } else {
293                                items.push(BracketExpressionTerm::Single(first));
294                                items.push(BracketExpressionTerm::Single('-'));
295                            }
296                        }
297                        _ => {
298                            items.push(BracketExpressionTerm::Single(first));
299                        }
300                    }
301                }
302
303                if none_of {
304                    return Some((rest, Atom::NonmatchingList(items)));
305                } else {
306                    return Some((rest, Atom::MatchingList(items)));
307                }
308            }
309            Some(c) if !is_special_character(c) => return Some((it.as_str(), Atom::NormalChar(c))),
310            None | Some(_) => return None,
311        }
312    }
313    pub fn check(&self, c: char) -> bool {
314        return match self {
315            Atom::NormalChar(a) => *a == c,
316            Atom::CharClass(char_class) => char_class.check(c),
317            Atom::MatchingList(vec) => vec.into_iter().any(|b| b.check(c)),
318            Atom::NonmatchingList(vec) => !vec.into_iter().any(|b| b.check(c)),
319        };
320    }
321}
322impl Display for Atom {
323    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
324        return match self {
325            Atom::NormalChar(c) if is_escapable_character(*c) => write!(f, "\\{c}"),
326            Atom::NormalChar(c) => f.write_char(*c),
327            Atom::CharClass(c) => c.fmt(f),
328            Atom::MatchingList(vec) => {
329                f.write_char('[')?;
330                for term in vec {
331                    write!(f, "{term}")?;
332                }
333                f.write_char(']')
334            }
335            Atom::NonmatchingList(vec) => {
336                f.write_str("[^")?;
337                for term in vec {
338                    write!(f, "{term}")?;
339                }
340                f.write_char(']')
341            }
342        };
343    }
344}
345#[derive(Debug, PartialEq, Eq, Clone, Copy)]
346pub enum BracketExpressionTerm {
347    Single(char),
348    Range(char, char),
349    CharClass(CharClass),
350}
351impl BracketExpressionTerm {
352    pub const fn check(&self, c: char) -> bool {
353        return match self {
354            BracketExpressionTerm::Single(a) => *a == c,
355            BracketExpressionTerm::Range(a, b) => *a <= c && c <= *b,
356            BracketExpressionTerm::CharClass(class) => class.check(c),
357        };
358    }
359}
360impl Display for BracketExpressionTerm {
361    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
362        return match self {
363            BracketExpressionTerm::Single(c) => f.write_char(*c),
364            BracketExpressionTerm::Range(first, second) => write!(f, "{first}-{second}"),
365            BracketExpressionTerm::CharClass(class) => class.fmt(f),
366        };
367    }
368}
369impl From<char> for BracketExpressionTerm {
370    fn from(value: char) -> Self {
371        return BracketExpressionTerm::Single(value);
372    }
373}
374impl From<CharClass> for BracketExpressionTerm {
375    fn from(value: CharClass) -> Self {
376        return BracketExpressionTerm::CharClass(value);
377    }
378}
379impl ToTokens for BracketExpressionTerm {
380    fn to_tokens(&self, tokens: &mut TokenStream) {
381        tokens.extend(match self {
382            BracketExpressionTerm::Single(c) => quote! {
383                ::ere_core::parse_tree::BracketExpressionTerm::Single(#c)
384            },
385            BracketExpressionTerm::Range(a, z) => quote! {
386                ::ere_core::parse_tree::BracketExpressionTerm::Range(#a, #z)
387            },
388            BracketExpressionTerm::CharClass(char_class) => quote! {
389                ::ere_core::parse_tree::BracketExpressionTerm::CharClass(#char_class)
390            },
391        });
392    }
393}
394
395/// The characters that can only occur if quoted
396#[inline]
397const fn is_special_character(c: char) -> bool {
398    return c == '^'
399        || c == '.'
400        || c == '['
401        || c == '$'
402        || c == '('
403        || c == ')'
404        || c == '|'
405        || c == '*'
406        || c == '+'
407        || c == '?'
408        || c == '{'
409        || c == '\\';
410}
411
412/// The characters that can only occur if quoted
413#[inline]
414const fn is_escapable_character(c: char) -> bool {
415    return is_special_character(c) || c == ']' || c == '}';
416}
417
418#[cfg(test)]
419mod tests {
420    use super::*;
421
422    #[test]
423    fn reconstruction() {
424        fn test_reconstruction(text: &str) {
425            let (rest, ere) = ERE::take(text).unwrap();
426            assert!(rest.is_empty(), "{text} did not get used (left {rest})");
427            let reconstructed = ere.to_string();
428            assert_eq!(text, &reconstructed);
429        }
430        test_reconstruction("asdf");
431        test_reconstruction("as+df123$");
432        test_reconstruction("asd?f*123");
433        test_reconstruction("^asdf123.*");
434
435        test_reconstruction("a[|]");
436        test_reconstruction("[^]a-z1-4A-X-]asdf");
437        test_reconstruction("cd[X-]er");
438
439        test_reconstruction("a|b");
440        test_reconstruction("a|b|c");
441        test_reconstruction("a(a(b|c)|d)|(g|f)b");
442        test_reconstruction("(a|b)|(c|d){3}");
443
444        test_reconstruction("a[y-z]{1,3}");
445        test_reconstruction("a{3,}");
446        test_reconstruction("a(efg){1,}");
447    }
448
449    #[test]
450    fn parse_quantifiers() {
451        assert_eq!(Quantifier::take(""), None);
452        assert_eq!(Quantifier::take("+asdf"), Some(("asdf", Quantifier::Plus)));
453        assert_eq!(Quantifier::take("*"), Some(("", Quantifier::Star)));
454        assert_eq!(Quantifier::take("e?"), None);
455        assert_eq!(Quantifier::take("{"), None);
456        assert_eq!(Quantifier::take("{}"), None);
457        assert_eq!(Quantifier::take("{1}"), Some(("", Quantifier::Multiple(1))));
458        assert_eq!(
459            Quantifier::take("{9}ee"),
460            Some(("ee", Quantifier::Multiple(9)))
461        );
462        assert_eq!(
463            Quantifier::take("{10,}ee"),
464            Some(("ee", Quantifier::Range(10, None)))
465        );
466        assert_eq!(
467            Quantifier::take("{0,11}ef"),
468            Some(("ef", Quantifier::Range(0, Some(11))))
469        );
470        assert_eq!(Quantifier::take("{0,e11}ef"), None);
471        assert_eq!(Quantifier::take("{0;11}ef"), None);
472    }
473
474    #[test]
475    fn parse_atom_simple() {
476        assert_eq!(Atom::take("a"), Some(("", Atom::NormalChar('a'))));
477        assert_eq!(Atom::take(r"abcd"), Some(("bcd", Atom::NormalChar('a'))));
478        assert_eq!(Atom::take(r"\\"), Some(("", Atom::NormalChar('\\'))));
479        assert_eq!(
480            Atom::take(r"\[asdf\]"),
481            Some((r"asdf\]", Atom::NormalChar('[')))
482        );
483        assert_eq!(Atom::take(r"\."), Some(("", Atom::NormalChar('.'))));
484        assert_eq!(Atom::take(r" "), Some(("", Atom::NormalChar(' '))));
485        assert_eq!(Atom::take(r"\"), None);
486
487        assert_eq!(Atom::take("."), Some(("", Atom::CharClass(CharClass::Dot))));
488        assert_eq!(
489            Atom::take(".."),
490            Some((".", Atom::CharClass(CharClass::Dot)))
491        );
492    }
493
494    #[test]
495    fn parse_atom_brackets() {
496        assert_eq!(
497            Atom::take("[ab]"),
498            Some((
499                "",
500                Atom::MatchingList(vec![
501                    BracketExpressionTerm::Single('a'),
502                    BracketExpressionTerm::Single('b'),
503                ])
504            ))
505        );
506        assert_eq!(
507            Atom::take("[]ab]"),
508            Some((
509                "",
510                Atom::MatchingList(vec![
511                    BracketExpressionTerm::Single(']'),
512                    BracketExpressionTerm::Single('a'),
513                    BracketExpressionTerm::Single('b'),
514                ])
515            ))
516        );
517        assert_eq!(
518            Atom::take("[]ab-]"),
519            Some((
520                "",
521                Atom::MatchingList(vec![
522                    BracketExpressionTerm::Single(']'),
523                    BracketExpressionTerm::Single('a'),
524                    BracketExpressionTerm::Single('b'),
525                    BracketExpressionTerm::Single('-'),
526                ])
527            ))
528        );
529
530        assert_eq!(
531            Atom::take("[]a-y]"),
532            Some((
533                "",
534                Atom::MatchingList(vec![
535                    BracketExpressionTerm::Single(']'),
536                    BracketExpressionTerm::Range('a', 'y'),
537                ])
538            ))
539        );
540        assert_eq!(
541            Atom::take("[]+--]"),
542            Some((
543                "",
544                Atom::MatchingList(vec![
545                    BracketExpressionTerm::Single(']'),
546                    BracketExpressionTerm::Range('+', '-'),
547                ])
548            ))
549        );
550
551        assert_eq!(
552            Atom::take("[^]a-y]"),
553            Some((
554                "",
555                Atom::NonmatchingList(vec![
556                    BracketExpressionTerm::Single(']'),
557                    BracketExpressionTerm::Range('a', 'y'),
558                ])
559            ))
560        );
561        assert_eq!(
562            Atom::take("[^]+--]"),
563            Some((
564                "",
565                Atom::NonmatchingList(vec![
566                    BracketExpressionTerm::Single(']'),
567                    BracketExpressionTerm::Range('+', '-'),
568                ])
569            ))
570        );
571    }
572}