1use std::{
5 fmt::{Display, Write},
6 ops::RangeInclusive,
7};
8
9use proc_macro2::TokenStream;
10use quote::{quote, ToTokens};
11
12macro_rules! match_prefix {
27 ($text:ident, { }) => (::core::option::Option::None);
28 ($text:ident, {
29 $x:literal => $y:expr,
30 $($xs:literal => $ys:expr,)*
31 }) => {
32 if let ::core::option::Option::Some(rest) = str::strip_prefix($text, $x) {
33 ::core::option::Option::Some((rest, $y))
34 } $(else if let ::core::option::Option::Some(rest) = str::strip_prefix($text, $xs) {
35 ::core::option::Option::Some((rest, $ys))
36 })* else {
37 ::core::option::Option::None
38 }
39 };
40}
41
42#[derive(Debug, thiserror::Error, PartialEq, Eq)]
43pub enum RegexParseError {
44 #[error("A capture group is not properly terminated.")]
46 UnterminatedCaptureGroup,
47 #[error("A character class set is not properly terminated.")]
49 UnterminatedCharSet,
50 #[error("An unknown regex escape (not rust string escape) was found: `\\{0}`.")]
52 InvalidEscapedChar(char),
53 #[error("An atom was expected but was invalid due to being a special character: `{0}`.")]
54 InvalidAtom(char),
55 #[error("An atom was expected but was not found due to the end of the string.")]
57 UnexpectedEOF,
58}
59
60#[derive(Clone)]
63pub struct ERE(
64 pub(crate) Vec<EREBranch>,
66);
67impl ERE {
68 fn take<'a>(rest: &'a str) -> Result<(&'a str, ERE), RegexParseError> {
70 let mut branches = Vec::new();
71 let (mut rest, branch) = EREBranch::take(rest)?;
72 branches.push(branch);
73 loop {
74 if rest.is_empty() || rest.starts_with(')') {
75 break;
76 }
77 let Some(branch_start) = rest.strip_prefix('|') else {
78 break;
79 };
80 let (new_rest, branch) = EREBranch::take(branch_start)?;
81 rest = new_rest;
82 branches.push(branch);
83 }
84 return Ok((rest, ERE(branches)));
85 }
86
87 pub fn parse_str(input: &str) -> Result<Self, RegexParseError> {
88 let Ok(("", ere)) = ERE::take(&input) else {
89 return Err(RegexParseError::InvalidAtom(')'));
92 };
93 return Ok(ere);
94 }
95 pub(crate) fn parse_str_syn(string: &str, span: proc_macro2::Span) -> syn::Result<Self> {
96 return ERE::parse_str(&string).map_err(|err| {
97 syn::Error::new(
98 span,
99 format_args!("While parsing regular expression:\n{err}"),
100 )
101 });
102 }
103}
104impl syn::parse::Parse for ERE {
105 fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
106 let literal: syn::LitStr = input.parse()?;
107 let string = literal.value();
108 return ERE::parse_str_syn(&string, literal.span());
109 }
110}
111impl Display for ERE {
112 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
113 let mut it = self.0.iter();
114 let Some(first) = it.next() else {
115 return Ok(());
116 };
117 write!(f, "{first}")?;
118 for part in it {
119 write!(f, "|{part}")?;
120 }
121 return Ok(());
122 }
123}
124
125#[derive(Clone)]
126pub(crate) struct EREBranch(
127 pub(crate) Vec<EREPart>,
129);
130impl EREBranch {
131 fn take<'a>(mut rest: &'a str) -> Result<(&'a str, EREBranch), RegexParseError> {
139 let mut parts = Vec::new();
140 while !rest.is_empty() && !rest.starts_with(')') && !rest.starts_with('|') {
141 let (new_rest, part) = EREPart::take(rest)?;
142 rest = new_rest;
143 parts.push(part);
144 }
145 return Ok((rest, EREBranch(parts)));
146 }
147}
148impl Display for EREBranch {
149 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150 for part in &self.0 {
151 write!(f, "{part}")?;
152 }
153 return Ok(());
154 }
155}
156
157#[derive(Clone)]
158pub(crate) enum EREPart {
159 Single(EREExpression),
160 Quantified(EREExpression, Quantifier),
161 Start,
162 End,
163}
164impl EREPart {
165 fn take<'a>(rest: &'a str) -> Result<(&'a str, EREPart), RegexParseError> {
166 if let Some(rest) = rest.strip_prefix('^') {
167 return Ok((rest, EREPart::Start));
168 } else if let Some(rest) = rest.strip_prefix('$') {
169 return Ok((rest, EREPart::End));
170 }
171
172 let (rest, expr) = if let Some(rest) = rest.strip_prefix('(') {
173 let (rest, ere) = ERE::take(rest)?;
174 let Some(rest) = rest.strip_prefix(')') else {
175 return Err(RegexParseError::UnterminatedCaptureGroup);
176 };
177 (rest, EREExpression::Subexpression(ere))
178 } else {
179 let (rest, atom) = Atom::take(rest)?;
180 (rest, EREExpression::Atom(atom))
181 };
182
183 let part = match Quantifier::take(rest) {
184 Some((rest, quantifier)) => (rest, EREPart::Quantified(expr, quantifier)),
185 None => (rest, EREPart::Single(expr)),
186 };
187 return Ok(part);
188 }
189}
190impl Display for EREPart {
191 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
192 return match self {
193 EREPart::Single(expr) => write!(f, "{expr}"),
194 EREPart::Quantified(expr, quantifier) => write!(f, "{expr}{quantifier}"),
195 EREPart::Start => f.write_char('^'),
196 EREPart::End => f.write_char('$'),
197 };
198 }
199}
200
201#[derive(Clone)]
202pub(crate) enum EREExpression {
203 Atom(Atom),
204 Subexpression(ERE),
205}
206impl Display for EREExpression {
207 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
208 return match self {
209 EREExpression::Atom(atom) => write!(f, "{atom}"),
210 EREExpression::Subexpression(ere) => write!(f, "({ere})"),
211 };
212 }
213}
214
215#[derive(Debug, PartialEq, Eq, Clone, Copy)]
216pub(crate) enum QuantifierType {
217 Star,
218 Plus,
219 QuestionMark,
220 Multiple(u32),
222 Range(u32, Option<u32>),
224}
225
226impl QuantifierType {
227 #[inline]
229 const fn min(&self) -> u32 {
230 return match self {
231 QuantifierType::Star => 0,
232 QuantifierType::Plus => 1,
233 QuantifierType::QuestionMark => 0,
234 QuantifierType::Multiple(n) => *n,
235 QuantifierType::Range(n, _) => *n,
236 };
237 }
238 #[inline]
240 const fn max(&self) -> Option<u32> {
241 return match self {
242 QuantifierType::Star => None,
243 QuantifierType::Plus => None,
244 QuantifierType::QuestionMark => Some(1),
245 QuantifierType::Multiple(n) => Some(*n),
246 QuantifierType::Range(_, m) => *m,
247 };
248 }
249}
250impl Display for QuantifierType {
251 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
252 return match self {
253 QuantifierType::Star => f.write_char('*'),
254 QuantifierType::Plus => f.write_char('+'),
255 QuantifierType::QuestionMark => f.write_char('?'),
256 QuantifierType::Multiple(n) => write!(f, "{{{n}}}"),
257 QuantifierType::Range(n, None) => write!(f, "{{{n},}}"),
258 QuantifierType::Range(n, Some(m)) => write!(f, "{{{n},{m}}}"),
259 };
260 }
261}
262
263#[derive(Debug, PartialEq, Eq, Clone, Copy)]
264pub(crate) struct Quantifier {
265 pub quantifier: QuantifierType,
266 pub alt: bool,
270}
271impl Quantifier {
272 #[inline]
273 fn take<'a>(rest: &'a str) -> Option<(&'a str, Quantifier)> {
274 let mut it = rest.chars();
275 let (rest, quantifier) = match it.next() {
276 Some('*') => (it.as_str(), QuantifierType::Star),
277 Some('+') => (it.as_str(), QuantifierType::Plus),
278 Some('?') => (it.as_str(), QuantifierType::QuestionMark),
279 Some('{') => {
280 let (inside, rest) = it.as_str().split_once('}')?;
281 match inside.split_once(',') {
282 None => (rest, QuantifierType::Multiple(inside.parse().ok()?)),
283 Some((min, "")) => (rest, QuantifierType::Range(min.parse().ok()?, None)),
284 Some((min, max)) => (
285 rest,
286 QuantifierType::Range(min.parse().ok()?, Some(max.parse().ok()?)),
287 ),
288 }
289 }
290 _ => return None,
291 };
292 if let Some(rest) = rest.strip_prefix('?') {
293 return Some((
294 rest,
295 Quantifier {
296 quantifier,
297 alt: true,
298 },
299 ));
300 } else {
301 return Some((
302 rest,
303 Quantifier {
304 quantifier,
305 alt: false,
306 },
307 ));
308 }
309 }
310}
311impl Display for Quantifier {
312 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
313 self.quantifier.fmt(f)?;
314 if self.alt {
315 return f.write_char('?');
316 } else {
317 return Ok(());
318 }
319 }
320}
321impl From<QuantifierType> for Quantifier {
322 fn from(quantifier: QuantifierType) -> Self {
323 return Quantifier {
324 quantifier,
325 alt: false,
326 };
327 }
328}
329
330#[derive(Debug, PartialEq, Eq, Clone, Copy)]
331pub enum CharClass {
332 Dot,
334}
335impl CharClass {
336 pub const fn check(&self, c: char) -> bool {
337 return match self {
338 CharClass::Dot => c != '\0',
339 };
340 }
341}
342impl Display for CharClass {
343 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
344 return match self {
345 CharClass::Dot => f.write_char('.'),
346 };
347 }
348}
349impl ToTokens for CharClass {
350 fn to_tokens(&self, tokens: &mut TokenStream) {
351 match self {
352 CharClass::Dot => tokens.extend(quote! {::ere::parse_tree::CharClass::Dot}),
353 }
354 }
355}
356
357#[derive(Debug, Clone)]
373pub enum Atom {
374 NormalChar(char),
376 CharClass(CharClass),
377 MatchingList(Vec<BracketExpressionTerm>),
379 NonmatchingList(Vec<BracketExpressionTerm>),
381}
382impl From<char> for Atom {
383 fn from(value: char) -> Self {
384 return Atom::NormalChar(value);
385 }
386}
387impl From<CharClass> for Atom {
388 fn from(value: CharClass) -> Self {
389 return Atom::CharClass(value);
390 }
391}
392impl std::str::FromStr for Atom {
393 type Err = ();
394
395 fn from_str(s: &str) -> Result<Self, Self::Err> {
396 let Ok(("", atom)) = Atom::take(s) else {
397 return Err(());
398 };
399 return Ok(atom);
400 }
401}
402
403impl Atom {
404 fn take<'a>(rest: &'a str) -> Result<(&'a str, Atom), RegexParseError> {
408 let mut it = rest.chars();
409 return match it.next() {
410 Some('\\') => match it.next() {
411 Some(c) if is_escapable_character(c) => Ok((it.as_str(), Atom::NormalChar(c))),
412 Some(c) => Err(RegexParseError::InvalidEscapedChar(c)),
413 None => Err(RegexParseError::InvalidAtom('\\')),
414 },
415 Some('.') => Ok((it.as_str(), CharClass::Dot.into())),
416 Some('[') => {
417 let mut rest = it.as_str();
418 let mut items = Vec::new();
419 let none_of = if let Some(new_rest) = rest.strip_prefix('^') {
420 rest = new_rest;
421 true
422 } else {
423 false
424 };
425 if let Some(new_rest) = rest.strip_prefix(']') {
426 rest = new_rest;
427 items.push(BracketExpressionTerm::Single(']'));
428 }
429 loop {
430 if let Some(new_rest) = rest.strip_prefix(']') {
431 rest = new_rest;
433 break;
434 } else if let Some((new_rest, class)) = BracketCharClass::take(rest) {
435 rest = new_rest;
437 items.push(BracketExpressionTerm::CharClass(class));
438 } else {
439 let mut it = rest.chars();
441 let first = it.next().ok_or(RegexParseError::UnterminatedCharSet)?;
442 rest = it.as_str();
443 if let '-' = it.next().ok_or(RegexParseError::UnterminatedCharSet)? {
444 let second = it.next().ok_or(RegexParseError::UnterminatedCharSet)?;
445 rest = it.as_str();
446 if second == ']' {
447 items.push(BracketExpressionTerm::Single(first));
449 items.push(BracketExpressionTerm::Single('-'));
450 break;
451 } else {
452 items.push(BracketExpressionTerm::Range(first, second));
454 }
455 } else {
456 items.push(BracketExpressionTerm::Single(first));
457 }
458 }
459 }
460
461 if none_of {
462 return Ok((rest, Atom::NonmatchingList(items)));
463 } else {
464 return Ok((rest, Atom::MatchingList(items)));
465 }
466 }
467 Some(c) if !is_special_character(c) => Ok((it.as_str(), Atom::NormalChar(c))),
468 Some(c) => Err(RegexParseError::InvalidAtom(c)),
469 None => Err(RegexParseError::UnexpectedEOF),
470 };
471 }
472 pub fn check(&self, c: char) -> bool {
473 return match self {
474 Atom::NormalChar(a) => *a == c,
475 Atom::CharClass(char_class) => char_class.check(c),
476 Atom::MatchingList(vec) => vec.into_iter().any(|b| b.check(c)),
477 Atom::NonmatchingList(vec) => !vec.into_iter().any(|b| b.check(c)),
478 };
479 }
480 pub(crate) fn serialize_check(&self) -> TokenStream {
481 let ranges = self.to_ranges();
482 let mut stream = TokenStream::new();
483 for range in ranges {
484 let start = range.start();
485 let end = range.end();
486 stream.extend(quote! { (#start <= c && c <= #end) || });
487 }
488 return quote! {(#stream false)};
489 }
490 pub fn to_ranges(&self) -> Vec<RangeInclusive<char>> {
501 fn combine_ranges_chars(
503 mut ranges: Vec<RangeInclusive<char>>,
504 ) -> Vec<RangeInclusive<char>> {
505 ranges.sort_by_key(|range| *range.start());
506 let Some((first_range, terms)) = ranges.split_first_chunk::<1>() else {
507 return Vec::new();
508 };
509 let mut reduced_terms = Vec::new();
510
511 let mut current_start = *first_range[0].start();
512 let mut current_end = *first_range[0].end();
513 for term in terms {
514 if term.is_empty() {
515 continue;
516 } else if *term.start() <= current_end || (current_end..=*term.start()).count() == 2
517 {
518 current_end = std::cmp::max(current_end, *term.end());
520 } else {
521 reduced_terms.push(current_start.clone()..=current_end.clone());
522 current_start = *term.start();
523 current_end = *term.end();
524 }
525 }
526 reduced_terms.push(current_start.clone()..=current_end.clone());
527 return reduced_terms;
528 }
529
530 return match self {
531 Atom::NormalChar(c) => vec![*c..=*c],
532 Atom::CharClass(CharClass::Dot) => vec!['\x01'..=char::MAX],
533 Atom::MatchingList(_terms) => {
534 let mut terms = Vec::new();
535 for term in _terms {
536 match term {
537 crate::parse_tree::BracketExpressionTerm::Single(c) => {
538 terms.push(*c..=*c);
539 }
540 crate::parse_tree::BracketExpressionTerm::Range(a, b) => {
541 terms.push(*a..=*b);
542 }
543 crate::parse_tree::BracketExpressionTerm::CharClass(class) => {
544 terms.extend_from_slice(class.to_ranges());
545 }
546 }
547 }
548 combine_ranges_chars(terms)
549 }
550 Atom::NonmatchingList(terms) => {
551 let mut ranges = Vec::new();
552 ranges.push('\0'..=char::MAX);
553 for term in terms.iter().flat_map(|term| term.to_ranges()) {
554 ranges = ranges
555 .iter()
556 .flat_map(|range| {
557 if term.end() < range.start() || range.end() < term.start() {
558 return vec![range.clone()]; }
560 let mut out = Vec::new();
561 if range.start() < term.start() {
562 let mut new_range = *range.start()..=*term.start();
563 new_range.next_back();
564 out.push(new_range); }
566 if term.end() < range.end() {
567 let mut new_range = *term.end()..=*range.end();
568 new_range.next();
569 out.push(new_range); }
571 return out;
572 })
573 .collect();
574 }
575
576 ranges.sort_by_key(|range: &RangeInclusive<char>| *range.start());
577 ranges
578 }
579 };
580 }
581 pub fn max_bytes(&self) -> usize {
582 match self.to_ranges().last() {
583 None => 0,
584 Some(range) => range.end().len_utf8(),
585 }
586 }
587 pub fn min_bytes(&self) -> usize {
588 match self.to_ranges().first() {
589 None => 0,
590 Some(range) => range.start().len_utf8(),
591 }
592 }
593}
594impl Display for Atom {
595 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
596 return match self {
597 Atom::NormalChar(c) if is_escapable_character(*c) => write!(f, "\\{c}"),
598 Atom::NormalChar(c) => f.write_char(*c),
599 Atom::CharClass(c) => c.fmt(f),
600 Atom::MatchingList(vec) => {
601 f.write_char('[')?;
602 for term in vec {
603 write!(f, "{term}")?;
604 }
605 f.write_char(']')
606 }
607 Atom::NonmatchingList(vec) => {
608 f.write_str("[^")?;
609 for term in vec {
610 write!(f, "{term}")?;
611 }
612 f.write_char(']')
613 }
614 };
615 }
616}
617impl PartialEq for Atom {
618 fn eq(&self, other: &Self) -> bool {
619 return self.to_ranges() == other.to_ranges();
620 }
621}
622impl Eq for Atom {}
623
624#[derive(Debug, PartialEq, Eq, Clone, Copy)]
626pub enum BracketCharClass {
627 Alphanumeric,
629 Control,
631 Lower,
633 Space,
635 Alphabet,
637 Digit,
639 Print,
641 Upper,
643 Blank,
645 Graphic,
647 Punctuation,
649 HexDigit,
651}
652impl BracketCharClass {
653 pub const fn to_ranges(&self) -> &'static [RangeInclusive<char>] {
654 return match self {
655 BracketCharClass::Alphanumeric => &['0'..='9', 'A'..='Z', 'a'..='z'],
656 BracketCharClass::Control => &['\0'..='\x1f', '\x7f'..='\x7f'],
657 BracketCharClass::Lower => &['a'..='z'],
658 BracketCharClass::Space => &['\t'..='\t', '\x0a'..='\x0d', ' '..=' '],
659 BracketCharClass::Alphabet => &['A'..='Z', 'a'..='z'],
660 BracketCharClass::Digit => &['0'..='9'],
661 BracketCharClass::Print => &['\x20'..='\x7E'],
662 BracketCharClass::Upper => &['A'..='Z'],
663 BracketCharClass::Blank => &['\t'..='\t', ' '..=' '],
664 BracketCharClass::Graphic => &['\x21'..='\x7e'],
665 BracketCharClass::Punctuation => &[
666 '\x21'..='\x2f',
667 '\x3a'..='\x40',
668 '\x5b'..='\x60',
669 '\x7b'..='\x7e',
670 ],
671 BracketCharClass::HexDigit => &['0'..='9', 'A'..='F', 'a'..='f'],
672 };
673 }
674 pub const fn check_ascii(&self, c: char) -> bool {
676 return match self {
677 BracketCharClass::Alphanumeric => c.is_ascii_alphanumeric(),
678 BracketCharClass::Control => c.is_ascii_control(),
679 BracketCharClass::Lower => c.is_ascii_lowercase(),
680 BracketCharClass::Space => c.is_ascii_whitespace() || c == '\x0b', BracketCharClass::Alphabet => c.is_ascii_alphabetic(),
682 BracketCharClass::Digit => c.is_ascii_digit(),
683 BracketCharClass::Print => matches!(c, '\x20'..='\x7E'),
684 BracketCharClass::Upper => c.is_ascii_uppercase(),
685 BracketCharClass::Blank => c == ' ' || c == '\t',
686 BracketCharClass::Graphic => c.is_ascii_graphic(),
687 BracketCharClass::Punctuation => c.is_ascii_punctuation(),
688 BracketCharClass::HexDigit => c.is_ascii_hexdigit(),
689 };
690 }
691 fn take<'a>(rest: &'a str) -> Option<(&'a str, BracketCharClass)> {
692 let rest = rest.strip_prefix("[:")?;
693 return match_prefix!(rest, {
694 "alnum:]" => BracketCharClass::Alphanumeric,
695 "cntrl:]" => BracketCharClass::Control,
696 "lower:]" => BracketCharClass::Lower,
697 "space:]" => BracketCharClass::Space,
698 "alpha:]" => BracketCharClass::Alphabet,
699 "digit:]" => BracketCharClass::Digit,
700 "print:]" => BracketCharClass::Print,
701 "upper:]" => BracketCharClass::Upper,
702 "blank:]" => BracketCharClass::Blank,
703 "graph:]" => BracketCharClass::Graphic,
704 "punct:]" => BracketCharClass::Punctuation,
705 "xdigit:]" => BracketCharClass::HexDigit,
706 });
707 }
708}
709impl Display for BracketCharClass {
710 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
711 return match self {
712 BracketCharClass::Alphanumeric => f.write_str("[:alnum:]"),
713 BracketCharClass::Control => f.write_str("[:cntrl:]"),
714 BracketCharClass::Lower => f.write_str("[:lower:]"),
715 BracketCharClass::Space => f.write_str("[:space:]"),
716 BracketCharClass::Alphabet => f.write_str("[:alpha:]"),
717 BracketCharClass::Digit => f.write_str("[:digit:]"),
718 BracketCharClass::Print => f.write_str("[:print:]"),
719 BracketCharClass::Upper => f.write_str("[:upper:]"),
720 BracketCharClass::Blank => f.write_str("[:blank:]"),
721 BracketCharClass::Graphic => f.write_str("[:graph:]"),
722 BracketCharClass::Punctuation => f.write_str("[:punct:]"),
723 BracketCharClass::HexDigit => f.write_str("[:xdigit:]"),
724 };
725 }
726}
727impl ToTokens for BracketCharClass {
728 fn to_tokens(&self, tokens: &mut TokenStream) {
729 match self {
730 BracketCharClass::Alphanumeric => {
731 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Alphanumeric})
732 }
733 BracketCharClass::Control => {
734 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Control})
735 }
736 BracketCharClass::Lower => {
737 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Lower})
738 }
739 BracketCharClass::Space => {
740 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Space})
741 }
742 BracketCharClass::Alphabet => {
743 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Alphabet})
744 }
745 BracketCharClass::Digit => {
746 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Digit})
747 }
748 BracketCharClass::Print => {
749 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Print})
750 }
751 BracketCharClass::Upper => {
752 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Upper})
753 }
754 BracketCharClass::Blank => {
755 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Blank})
756 }
757 BracketCharClass::Graphic => {
758 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Graph})
759 }
760 BracketCharClass::Punctuation => {
761 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::Punctuation})
762 }
763 BracketCharClass::HexDigit => {
764 tokens.extend(quote! {::ere::parse_tree::BracketCharClass::XDigit})
765 }
766 }
767 }
768}
769
770#[derive(Debug, PartialEq, Eq, Clone, Copy)]
771pub enum BracketExpressionTerm {
772 Single(char),
773 Range(char, char),
774 CharClass(BracketCharClass),
775}
776impl BracketExpressionTerm {
777 pub const fn check(&self, c: char) -> bool {
778 return match self {
779 BracketExpressionTerm::Single(a) => *a == c,
780 BracketExpressionTerm::Range(a, b) => *a <= c && c <= *b,
781 BracketExpressionTerm::CharClass(class) => class.check_ascii(c),
782 };
783 }
784 pub(crate) fn to_ranges(&self) -> Vec<RangeInclusive<char>> {
785 return match self {
786 BracketExpressionTerm::Single(c) => vec![*c..=*c],
787 BracketExpressionTerm::Range(a, b) => vec![*a..=*b],
788 BracketExpressionTerm::CharClass(bracket_char_class) => {
789 bracket_char_class.to_ranges().to_vec()
790 }
791 };
792 }
793}
794impl Display for BracketExpressionTerm {
795 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
796 return match self {
797 BracketExpressionTerm::Single(c) => f.write_char(*c),
798 BracketExpressionTerm::Range(first, second) => write!(f, "{first}-{second}"),
799 BracketExpressionTerm::CharClass(class) => class.fmt(f),
800 };
801 }
802}
803impl From<char> for BracketExpressionTerm {
804 fn from(value: char) -> Self {
805 return BracketExpressionTerm::Single(value);
806 }
807}
808impl From<BracketCharClass> for BracketExpressionTerm {
809 fn from(value: BracketCharClass) -> Self {
810 return BracketExpressionTerm::CharClass(value);
811 }
812}
813impl ToTokens for BracketExpressionTerm {
814 fn to_tokens(&self, tokens: &mut TokenStream) {
815 tokens.extend(match self {
816 BracketExpressionTerm::Single(c) => quote! {
817 ::ere::parse_tree::BracketExpressionTerm::Single(#c)
818 },
819 BracketExpressionTerm::Range(a, z) => quote! {
820 ::ere::parse_tree::BracketExpressionTerm::Range(#a, #z)
821 },
822 BracketExpressionTerm::CharClass(char_class) => quote! {
823 ::ere::parse_tree::BracketExpressionTerm::CharClass(#char_class)
824 },
825 });
826 }
827}
828
829#[inline]
831const fn is_special_character(c: char) -> bool {
832 return c == '^'
833 || c == '.'
834 || c == '['
835 || c == '$'
836 || c == '('
837 || c == ')'
838 || c == '|'
839 || c == '*'
840 || c == '+'
841 || c == '?'
842 || c == '{'
843 || c == '\\';
844}
845
846#[inline]
848pub(crate) const fn is_escapable_character(c: char) -> bool {
849 return is_special_character(c) || c == ']' || c == '}';
850}
851
852#[cfg(test)]
853mod tests {
854 use super::*;
855
856 #[test]
857 fn reconstruction() {
858 fn test_reconstruction(text: &str) {
859 let (rest, ere) = match ERE::take(text) {
860 Ok(ok) => ok,
861 Err(err) => panic!("Failed to parse `{text}`, got error: {err}"),
862 };
863 assert!(rest.is_empty(), "{text} did not get used (left {rest})");
864 let reconstructed = ere.to_string();
865 assert_eq!(text, &reconstructed);
866 }
867 test_reconstruction("asdf");
868 test_reconstruction("as+df123$");
869 test_reconstruction("asd?f*123");
870 test_reconstruction("^asdf123.*");
871
872 test_reconstruction("a[|]");
873 test_reconstruction("[^]a-z1-4A-X-]asdf");
874 test_reconstruction("cd[X-]er");
875
876 test_reconstruction("my word is [[:alnum:]_]");
877 test_reconstruction("my word is [[:lower:][:digit:]_]");
878
879 test_reconstruction("a|b");
880 test_reconstruction("a|b|c");
881 test_reconstruction("a(a(b|c)|d)|(g|f)b");
882 test_reconstruction("(a|b)|(c|d){3}");
883
884 test_reconstruction("a[y-z]{1,3}");
885 test_reconstruction("a{3,}");
886 test_reconstruction("a(efg){1,}");
887 }
888
889 #[test]
890 fn parse_quantifiers() {
891 assert_eq!(Quantifier::take(""), None);
892 assert_eq!(
893 Quantifier::take("+asdf"),
894 Some(("asdf", QuantifierType::Plus.into()))
895 );
896 assert_eq!(
897 Quantifier::take("*"),
898 Some(("", QuantifierType::Star.into()))
899 );
900 assert_eq!(Quantifier::take("e?"), None);
901 assert_eq!(Quantifier::take("{"), None);
902 assert_eq!(Quantifier::take("{}"), None);
903 assert_eq!(
904 Quantifier::take("{1}"),
905 Some(("", QuantifierType::Multiple(1).into()))
906 );
907 assert_eq!(
908 Quantifier::take("{9}ee"),
909 Some(("ee", QuantifierType::Multiple(9).into()))
910 );
911 assert_eq!(
912 Quantifier::take("{10,}ee"),
913 Some(("ee", QuantifierType::Range(10, None).into()))
914 );
915 assert_eq!(
916 Quantifier::take("{0,11}ef"),
917 Some(("ef", QuantifierType::Range(0, Some(11)).into()))
918 );
919 assert_eq!(Quantifier::take("{0,e11}ef"), None);
920 assert_eq!(Quantifier::take("{0;11}ef"), None);
921 }
922
923 #[test]
924 fn parse_atom_simple() {
925 assert_eq!(Atom::take("a"), Ok(("", Atom::NormalChar('a'))));
926 assert_eq!(Atom::take(r"abcd"), Ok(("bcd", Atom::NormalChar('a'))));
927 assert_eq!(Atom::take(r"\\"), Ok(("", Atom::NormalChar('\\'))));
928 assert_eq!(
929 Atom::take(r"\[asdf\]"),
930 Ok((r"asdf\]", Atom::NormalChar('[')))
931 );
932 assert_eq!(Atom::take(r"\."), Ok(("", Atom::NormalChar('.'))));
933 assert_eq!(Atom::take(r" "), Ok(("", Atom::NormalChar(' '))));
934 assert_eq!(Atom::take(r"\"), Err(RegexParseError::InvalidAtom('\\')));
935
936 assert_eq!(Atom::take("."), Ok(("", Atom::CharClass(CharClass::Dot))));
937 assert_eq!(Atom::take(".."), Ok((".", Atom::CharClass(CharClass::Dot))));
938 }
939
940 #[test]
941 fn parse_atom_brackets() {
942 assert_eq!(
943 Atom::take("[ab]"),
944 Ok((
945 "",
946 Atom::MatchingList(vec![
947 BracketExpressionTerm::Single('a'),
948 BracketExpressionTerm::Single('b'),
949 ])
950 ))
951 );
952 assert_eq!(
953 Atom::take("[]ab]"),
954 Ok((
955 "",
956 Atom::MatchingList(vec![
957 BracketExpressionTerm::Single(']'),
958 BracketExpressionTerm::Single('a'),
959 BracketExpressionTerm::Single('b'),
960 ])
961 ))
962 );
963 assert_eq!(
964 Atom::take("[]ab-]"),
965 Ok((
966 "",
967 Atom::MatchingList(vec![
968 BracketExpressionTerm::Single(']'),
969 BracketExpressionTerm::Single('a'),
970 BracketExpressionTerm::Single('b'),
971 BracketExpressionTerm::Single('-'),
972 ])
973 ))
974 );
975
976 assert_eq!(
977 Atom::take("[]a-y]"),
978 Ok((
979 "",
980 Atom::MatchingList(vec![
981 BracketExpressionTerm::Single(']'),
982 BracketExpressionTerm::Range('a', 'y'),
983 ])
984 ))
985 );
986 assert_eq!(
987 Atom::take("[]+--]"),
988 Ok((
989 "",
990 Atom::MatchingList(vec![
991 BracketExpressionTerm::Single(']'),
992 BracketExpressionTerm::Range('+', '-'),
993 ])
994 ))
995 );
996
997 assert_eq!(
998 Atom::take("[^]a-y]"),
999 Ok((
1000 "",
1001 Atom::NonmatchingList(vec![
1002 BracketExpressionTerm::Single(']'),
1003 BracketExpressionTerm::Range('a', 'y'),
1004 ])
1005 ))
1006 );
1007 assert_eq!(
1008 Atom::take("[^]+--]"),
1009 Ok((
1010 "",
1011 Atom::NonmatchingList(vec![
1012 BracketExpressionTerm::Single(']'),
1013 BracketExpressionTerm::Range('+', '-'),
1014 ])
1015 ))
1016 );
1017 }
1018
1019 #[test]
1020 fn parse_atom_err() {
1021 assert_eq!(Atom::take(")"), Err(RegexParseError::InvalidAtom(')')));
1022 assert_eq!(Atom::take(r"\"), Err(RegexParseError::InvalidAtom('\\')));
1023 assert_eq!(
1024 Atom::take(r"\a"),
1025 Err(RegexParseError::InvalidEscapedChar('a'))
1026 );
1027 }
1028
1029 #[test]
1030 fn atom_to_ranges_normal_char() {
1031 let (_, atom) = Atom::take("a").unwrap();
1032 assert_eq!(atom.to_ranges(), vec!['a'..='a']);
1033 }
1034
1035 #[test]
1036 fn atom_to_ranges_matching_list() {
1037 let (_, atom) = Atom::take("[a-z]").unwrap();
1038 assert_eq!(atom.to_ranges(), vec!['a'..='z']);
1039
1040 let (_, atom) = Atom::take("[0-45-9]").unwrap();
1041 assert_eq!(atom.to_ranges(), vec!['0'..='9']);
1042
1043 let (_, atom) = Atom::take("[5-90-4]").unwrap();
1044 assert_eq!(atom.to_ranges(), vec!['0'..='9']);
1045
1046 let (_, atom) = Atom::take("[0-46-89]").unwrap();
1047 assert_eq!(atom.to_ranges(), vec!['0'..='4', '6'..='9']);
1048
1049 let (_, atom) = Atom::take("[96-80-4]").unwrap();
1050 assert_eq!(atom.to_ranges(), vec!['0'..='4', '6'..='9']);
1051 }
1052
1053 #[test]
1054 fn atom_to_ranges_nonmatching_list() {
1055 let (_, atom) = Atom::take("[^b-y]").unwrap();
1056 assert_eq!(atom.to_ranges(), vec!['\0'..='a', 'z'..=char::MAX]);
1057
1058 let (_, atom) = Atom::take("[^1-45-8]").unwrap();
1059 assert_eq!(atom.to_ranges(), vec!['\0'..='0', '9'..=char::MAX]);
1060
1061 let (_, atom) = Atom::take("[^5-81-4]").unwrap();
1062 assert_eq!(atom.to_ranges(), vec!['\0'..='0', '9'..=char::MAX]);
1063
1064 let (_, atom) = Atom::take("[^1-486-7]").unwrap();
1065 assert_eq!(
1066 atom.to_ranges(),
1067 vec!['\0'..='0', '5'..='5', '9'..=char::MAX]
1068 );
1069
1070 let (_, atom) = Atom::take("[^86-71-4]").unwrap();
1071 assert_eq!(
1072 atom.to_ranges(),
1073 vec!['\0'..='0', '5'..='5', '9'..=char::MAX]
1074 );
1075 }
1076
1077 #[test]
1078 fn atom_eq() {
1079 assert_eq!(
1080 Atom::take("[a-z]").unwrap().1,
1081 Atom::take("[a-kl-z]").unwrap().1
1082 );
1083
1084 assert_eq!(
1085 Atom::take("[a-z]").unwrap().1,
1086 Atom::take("[l-za-k]").unwrap().1
1087 );
1088
1089 assert_eq!(Atom::take("a").unwrap().1, Atom::take("[a]").unwrap().1);
1090
1091 assert_eq!(
1092 Atom::take("[1-8]").unwrap().1,
1093 Atom::take("[^\0-09-\u{10ffff}]").unwrap().1
1094 );
1095
1096 assert_eq!(
1097 Atom::take("[[:upper:]]").unwrap().1,
1098 Atom::take("[A-Z]").unwrap().1
1099 );
1100
1101 assert_ne!(
1102 Atom::take("[a-z]").unwrap().1,
1103 Atom::take("[A-Z]").unwrap().1
1104 )
1105 }
1106}