Skip to main content

fuzzy_regex/parser/
ast.rs

1//! Abstract Syntax Tree definitions for fuzzy regex patterns.
2
3#![allow(clippy::match_same_arms, clippy::too_many_lines)]
4// Note: enum_clone_variant is not a valid lint
5
6use crate::types::FuzzyLimits;
7
/// Pattern-level switches that change how matching is performed.
///
/// Every flag is `false` in the value returned by [`MatchFlags::new`] and by
/// `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct MatchFlags {
    /// `(?b)` BESTMATCH: look for the best match rather than the first one.
    pub best_match: bool,
    /// `(?e)` ENHANCEMATCH: improve the fit of fuzzy matches.
    pub enhance_match: bool,
    /// `(?p)` POSIX: leftmost-longest matching.
    pub posix: bool,
    /// `(?x)` verbose: whitespace is ignored and comments are allowed.
    pub verbose: bool,
    /// `(?s)` dot-all: `.` also matches newlines.
    pub dot_all: bool,
    /// `(?m)` multi-line: `^`/`$` match at line boundaries.
    pub multi_line: bool,
    /// `(?U)` ungreedy: flips the default greediness of quantifiers.
    pub ungreedy: bool,
    /// `(?i)` case-insensitive matching.
    pub case_insensitive: bool,
    /// `(?g)` global: report all matches instead of only the first.
    pub global: bool,
    /// `(?u)` Unicode: enable Unicode character classes.
    pub unicode: bool,
}

impl MatchFlags {
    /// Build a flag set in which every flag is cleared.
    #[must_use]
    pub fn new() -> Self {
        Self {
            best_match: false,
            enhance_match: false,
            posix: false,
            verbose: false,
            dot_all: false,
            multi_line: false,
            ungreedy: false,
            case_insensitive: false,
            global: false,
            unicode: false,
        }
    }

    /// Return a copy of these flags with BESTMATCH switched on.
    #[must_use]
    pub fn with_best_match(self) -> Self {
        Self {
            best_match: true,
            ..self
        }
    }

    /// Return a copy of these flags with ENHANCEMATCH switched on.
    #[must_use]
    pub fn with_enhance_match(self) -> Self {
        Self {
            enhance_match: true,
            ..self
        }
    }

    /// Return a copy of these flags with the POSIX flag switched on.
    #[must_use]
    pub fn with_posix(self) -> Self {
        Self {
            posix: true,
            ..self
        }
    }
}
61
/// Fuzziness specification attached to a pattern element.
///
/// In this AST it is carried by literals, non-capturing groups and
/// backreferences.
///
/// Supports two syntax styles:
/// 1. Simple: `hello~2` (allows 2 total edits)
/// 2. mrab-style: `(?:hello){i<=1,d<=2}` (max 1 insertion, 2 deletions)
///
/// The `Default` value is [`Fuzziness::Inherited`].
#[derive(Debug, Clone, PartialEq, Default)]
pub enum Fuzziness {
    /// Simple edit count: `hello~2` allows 2 total edits.
    Edits(u8),
    /// Detailed limits: `hello~{i=1,d=0,s=2}` or mrab-style `{i<=1,d<=2}`.
    Detailed(FuzzyLimits),
    /// mrab-style specification with optional cost constraints.
    /// `{i<=1,s<=2,2i+2d+1s<=4}`
    MrabStyle(MrabFuzziness),
    /// Inherit from global/parent settings.
    ///
    /// `to_limits` resolves this variant from its `default_edits` argument.
    #[default]
    Inherited,
    /// Exact match only (no fuzzy matching): `hello~0`.
    Exact,
}
82
/// mrab-regex style fuzziness specification.
///
/// Supports:
/// - Error type limits: `{i<=1,d<=2,s<=3,t<=1}`
/// - Total error limits: `{e<=5}`
/// - Cost constraints: `{2i+2d+1s+1t<=4}` or `{c<=4}` (equal cost)
/// - Ranges: `{1<=e<=3}`
/// - Exclusive bounds: `{i<3}` (fewer than 3)
/// - Character class restrictions: `{s<=2:[a-z]}` (restricts edit characters to class)
/// - Unlimited errors: `{e}` (any number of errors allowed)
///
/// Every field defaults to `None` / `false`. For each edit kind, `to_limits`
/// prefers the explicit `max_*` value over the matching `unlimited_*` flag.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct MrabFuzziness {
    /// Maximum insertions allowed (None = not specified).
    pub max_insertions: Option<u8>,
    /// Maximum deletions allowed (None = not specified).
    pub max_deletions: Option<u8>,
    /// Maximum substitutions allowed (None = not specified).
    pub max_substitutions: Option<u8>,
    /// Maximum transpositions allowed (None = not specified).
    pub max_transpositions: Option<u8>,
    /// Maximum total errors allowed (None = not specified).
    pub max_errors: Option<u8>,
    /// Minimum total errors required (None = not specified).
    pub min_errors: Option<u8>,
    /// Whether insertions are unlimited (e.g., `{i}` without a value).
    pub unlimited_insertions: bool,
    /// Whether deletions are unlimited.
    pub unlimited_deletions: bool,
    /// Whether substitutions are unlimited.
    pub unlimited_substitutions: bool,
    /// Whether transpositions are unlimited.
    pub unlimited_transpositions: bool,
    /// Whether total errors are unlimited (e.g., `{e}` without a value).
    pub unlimited_errors: bool,
    /// Cost for insertions (for cost-based constraints).
    ///
    /// `to_limits` treats a missing cost as 1 when inferring an edit budget.
    pub insertion_cost: Option<u8>,
    /// Cost for deletions.
    pub deletion_cost: Option<u8>,
    /// Cost for substitutions.
    pub substitution_cost: Option<u8>,
    /// Cost for transpositions.
    pub transposition_cost: Option<u8>,
    /// Maximum total cost.
    ///
    /// NOTE(review): a comment in `to_limits` says this is stored as N+1 for a
    /// `<=N` constraint; confirm against the parser that fills this field.
    pub max_cost: Option<u8>,
    /// Character class restriction for substitutions (e.g., `[a-z]`).
    /// Substituted characters must be in this class.
    pub substitution_chars: Option<CharClass>,
    /// Character class restriction for insertions.
    /// Inserted characters must be in this class.
    pub insertion_chars: Option<CharClass>,
    /// Character class restriction for deletions.
    /// Note: Deletions don't introduce new characters, so this is less meaningful.
    pub deletion_chars: Option<CharClass>,
}
137
138impl MrabFuzziness {
139    /// Create a new empty mrab-style fuzziness.
140    #[must_use]
141    pub fn new() -> Self {
142        Self::default()
143    }
144
145    /// Set maximum insertions.
146    #[must_use]
147    pub fn insertions(mut self, max: u8) -> Self {
148        self.max_insertions = Some(max);
149        self
150    }
151
152    /// Set maximum deletions.
153    #[must_use]
154    pub fn deletions(mut self, max: u8) -> Self {
155        self.max_deletions = Some(max);
156        self
157    }
158
159    /// Set maximum substitutions.
160    #[must_use]
161    pub fn substitutions(mut self, max: u8) -> Self {
162        self.max_substitutions = Some(max);
163        self
164    }
165
166    /// Set maximum total errors.
167    #[must_use]
168    pub fn errors(mut self, max: u8) -> Self {
169        self.max_errors = Some(max);
170        self
171    }
172
173    /// Set error range.
174    #[must_use]
175    pub fn error_range(mut self, min: u8, max: u8) -> Self {
176        self.min_errors = Some(min);
177        self.max_errors = Some(max);
178        self
179    }
180
181    /// Convert to `FuzzyLimits`.
182    #[must_use]
183    pub fn to_limits(&self) -> FuzzyLimits {
184        // Use 255 as "unlimited" since that's the max value for u8
185        const UNLIMITED: u8 = 255;
186
187        let mut limits = FuzzyLimits::new();
188
189        // Handle insertions
190        if let Some(i) = self.max_insertions {
191            limits = limits.insertions(i);
192        } else if self.unlimited_insertions {
193            limits = limits.insertions(UNLIMITED);
194        }
195
196        // Handle deletions
197        if let Some(d) = self.max_deletions {
198            limits = limits.deletions(d);
199        } else if self.unlimited_deletions {
200            limits = limits.deletions(UNLIMITED);
201        }
202
203        // Handle substitutions
204        if let Some(s) = self.max_substitutions {
205            limits = limits.substitutions(s);
206        } else if self.unlimited_substitutions {
207            limits = limits.substitutions(UNLIMITED);
208        }
209
210        // Handle transpositions
211        if let Some(t) = self.max_transpositions {
212            limits = limits.swaps(t);
213        } else if self.unlimited_transpositions {
214            limits = limits.swaps(UNLIMITED);
215        }
216
217        // Handle total errors
218        if let Some(e) = self.max_errors {
219            limits = limits.edits(e);
220        } else if self.unlimited_errors {
221            limits = limits.edits(UNLIMITED);
222        } else if let Some(max_cost) = self.max_cost {
223            // When cost constraint is used without explicit error limit,
224            // infer max_edits from cost constraint. Use max_cost divided by
225            // minimum operation cost (at least 1).
226            let min_cost = [
227                self.insertion_cost.unwrap_or(1),
228                self.deletion_cost.unwrap_or(1),
229                self.substitution_cost.unwrap_or(1),
230                self.transposition_cost.unwrap_or(1),
231            ]
232            .into_iter()
233            .filter(|&c| c > 0)
234            .min()
235            .unwrap_or(1);
236            // max_cost is stored as N+1 for <=N, so subtract 1 to get actual limit
237            let actual_max_cost = max_cost.saturating_sub(1);
238            let inferred_max_edits = actual_max_cost / min_cost;
239            limits = limits.edits(inferred_max_edits);
240        }
241
242        limits
243    }
244
245    /// Check if this fuzziness specification has any unlimited flags set.
246    #[must_use]
247    pub fn has_unlimited(&self) -> bool {
248        self.unlimited_insertions
249            || self.unlimited_deletions
250            || self.unlimited_substitutions
251            || self.unlimited_transpositions
252            || self.unlimited_errors
253    }
254}
255
256impl Fuzziness {
257    /// Convert to `FuzzyLimits`, using default if inherited.
258    #[must_use]
259    pub fn to_limits(&self, default_edits: u8) -> Option<FuzzyLimits> {
260        match self {
261            Fuzziness::Exact => Some(FuzzyLimits::new().edits(0)),
262            Fuzziness::Edits(n) => Some(FuzzyLimits::new().edits(*n)),
263            Fuzziness::Detailed(limits) => Some(limits.clone()),
264            Fuzziness::MrabStyle(mrab) => Some(mrab.to_limits()),
265            Fuzziness::Inherited => {
266                if default_edits > 0 {
267                    Some(FuzzyLimits::new().edits(default_edits))
268                } else {
269                    None
270                }
271            }
272        }
273    }
274
275    /// Get the minimum edits required (for exclusive lower bounds like `{0<e<5}`).
276    #[must_use]
277    pub fn min_edits(&self) -> Option<u8> {
278        match self {
279            Fuzziness::MrabStyle(mrab) => mrab.min_errors,
280            _ => None,
281        }
282    }
283}
284
/// AST node representing a parsed regex pattern.
///
/// The tree is recursive: composite variants own their children through
/// `Box<Ast>` or `Vec<Ast>`.
#[derive(Debug, Clone, PartialEq)]
pub enum Ast {
    /// Empty pattern.
    Empty,

    /// Literal string with optional fuzziness: `hello`, `hello~2`.
    Literal {
        /// The literal text to match.
        text: String,
        /// Fuzziness specification for approximate matching.
        fuzziness: Fuzziness,
    },

    /// Single character (from escape or plain char outside literals).
    Char(char),

    /// Character class: `[a-z]`, `[^abc]`, `\d`, `\w`, `\s`, `.`
    CharClass(CharClass),

    /// Concatenation of patterns.
    Concat(Vec<Ast>),

    /// Alternation: `a|b|c`.
    Alternation(Vec<Ast>),

    /// Quantified expression: `a*`, `a+`, `a?`, `a{n,m}`.
    Quantified {
        /// The expression being quantified.
        expr: Box<Ast>,
        /// The quantifier specifying repetition bounds.
        quantifier: Quantifier,
        /// Whether the quantifier is greedy (matches as much as possible).
        greedy: bool,
    },

    /// Capture group: `(expr)`.
    Group {
        /// The capture group index (1-based for user-facing, 0-based internally).
        index: usize,
        /// Optional name for named capture groups like `(?P<name>...)`.
        name: Option<String>,
        /// The expression contained in the group.
        expr: Box<Ast>,
    },

    /// Non-capturing group: `(?:expr)`.
    NonCapturingGroup {
        /// The expression contained in the group.
        expr: Box<Ast>,
        /// Fuzziness specification applied to this group.
        fuzziness: Fuzziness,
    },

    /// Anchor: `^`, `$`, `\b`, `\B`.
    Anchor(Anchor),

    /// Lookahead: `(?=...)`, `(?!...)`.
    Lookahead {
        /// True for positive lookahead `(?=...)`, false for negative `(?!...)`.
        positive: bool,
        /// The expression to match in the lookahead.
        expr: Box<Ast>,
    },

    /// Lookbehind: `(?<=...)`, `(?<!...)`.
    Lookbehind {
        /// True for positive lookbehind `(?<=...)`, false for negative `(?<!...)`.
        positive: bool,
        /// The expression to match in the lookbehind.
        expr: Box<Ast>,
    },

    /// Backreference: `\1`, `\2`, optionally with fuzziness `\1{e<=1}`.
    Backreference {
        /// The capture group number being referenced.
        group: usize,
        /// Fuzziness specification for approximate backreference matching.
        fuzziness: Fuzziness,
    },

    /// Named list reference: `\L<name>`.
    NamedList {
        /// The name of the list.
        name: String,
    },

    /// Reset match start: `\K`
    /// Resets the starting point of the match. Everything before \K is matched
    /// but not included in the final match result.
    ResetMatchStart,

    /// Atomic group: `(?>...)`
    /// Once the group matches, backtracking is disabled within the group.
    AtomicGroup {
        /// The expression contained in the atomic group.
        expr: Box<Ast>,
    },

    /// Recursive entire pattern: `(?R)`
    /// Recursively matches the entire pattern.
    RecursivePattern,

    /// Recursive numbered group: `(?1)`, `(?2)`, etc.
    /// Recursively matches a specific capture group.
    RecursiveGroup {
        /// The capture group number to recurse into.
        group: usize,
    },

    /// Recursive named group: `(?&name)` or `(?P>name)`
    /// Recursively matches a named capture group.
    RecursiveNamedGroup {
        /// The name of the capture group to recurse into.
        name: String,
    },
}
402
403impl Ast {
404    /// Create a literal AST node with inherited fuzziness.
405    pub fn literal(text: impl Into<String>) -> Self {
406        Ast::Literal {
407            text: text.into(),
408            fuzziness: Fuzziness::Inherited,
409        }
410    }
411
412    /// Create a literal AST node with specific fuzziness.
413    pub fn literal_fuzzy(text: impl Into<String>, fuzziness: Fuzziness) -> Self {
414        Ast::Literal {
415            text: text.into(),
416            fuzziness,
417        }
418    }
419
420    /// Create a quantified AST node.
421    #[must_use]
422    pub fn quantified(expr: Ast, quantifier: Quantifier, greedy: bool) -> Self {
423        Ast::Quantified {
424            expr: Box::new(expr),
425            quantifier,
426            greedy,
427        }
428    }
429
430    /// Create a capture group.
431    #[must_use]
432    pub fn group(index: usize, expr: Ast) -> Self {
433        Ast::Group {
434            index,
435            name: None,
436            expr: Box::new(expr),
437        }
438    }
439
440    /// Create a named capture group.
441    pub fn named_group(index: usize, name: impl Into<String>, expr: Ast) -> Self {
442        Ast::Group {
443            index,
444            name: Some(name.into()),
445            expr: Box::new(expr),
446        }
447    }
448
449    /// Check if this AST is empty.
450    #[must_use]
451    pub fn is_empty(&self) -> bool {
452        matches!(self, Ast::Empty)
453    }
454}
455
456/// Character class definition.
457#[derive(Debug, Clone, PartialEq)]
458pub struct CharClass {
459    /// Whether this is a negated class `[^...]`.
460    pub negated: bool,
461    /// The ranges/characters in this class.
462    pub items: Vec<CharClassItem>,
463}
464
465impl CharClass {
466    /// Create a new character class.
467    #[must_use]
468    pub fn new(negated: bool, items: Vec<CharClassItem>) -> Self {
469        CharClass { negated, items }
470    }
471
472    /// Create a character class matching any character except newlines (default `.`).
473    #[must_use]
474    pub fn any() -> Self {
475        CharClass {
476            negated: false,
477            items: vec![CharClassItem::Named(NamedClass::AnyExceptNewline)],
478        }
479    }
480
481    /// Create a character class matching any character including newlines (`dot_all` `.`).
482    #[must_use]
483    pub fn any_with_newlines() -> Self {
484        CharClass {
485            negated: false,
486            items: vec![CharClassItem::Named(NamedClass::Any)],
487        }
488    }
489
490    /// Create a digit class (`\d`).
491    #[must_use]
492    pub fn digit() -> Self {
493        CharClass {
494            negated: false,
495            items: vec![CharClassItem::Named(NamedClass::Digit)],
496        }
497    }
498
499    /// Create a word class (`\w`).
500    #[must_use]
501    pub fn word() -> Self {
502        CharClass {
503            negated: false,
504            items: vec![CharClassItem::Named(NamedClass::Word)],
505        }
506    }
507
508    /// Create a whitespace class (`\s`).
509    #[must_use]
510    pub fn whitespace() -> Self {
511        CharClass {
512            negated: false,
513            items: vec![CharClassItem::Named(NamedClass::Whitespace)],
514        }
515    }
516
517    /// Check if a character matches this class.
518    #[must_use]
519    pub fn matches(&self, ch: char) -> bool {
520        let in_class = self.items.iter().any(|item| item.matches(ch));
521        if self.negated { !in_class } else { in_class }
522    }
523
524    /// Check if a character matches this class with Unicode support.
525    #[must_use]
526    pub fn matches_unicode(&self, ch: char) -> bool {
527        let in_class = self
528            .items
529            .iter()
530            .any(|item| item.matches_with_unicode(ch, true));
531        if self.negated { !in_class } else { in_class }
532    }
533}
534
535/// An item in a character class.
536#[derive(Debug, Clone, PartialEq)]
537pub enum CharClassItem {
538    /// Single character.
539    Single(char),
540    /// Character range: `a-z`.
541    Range(char, char),
542    /// Named character class: `\d`, `\w`, `\s`.
543    Named(NamedClass),
544}
545
546impl CharClassItem {
547    /// Check if a character matches this item.
548    #[must_use]
549    pub fn matches(&self, ch: char) -> bool {
550        match self {
551            CharClassItem::Single(c) => *c == ch,
552            CharClassItem::Range(start, end) => ch >= *start && ch <= *end,
553            CharClassItem::Named(class) => class.matches(ch),
554        }
555    }
556
557    /// Check if a character matches this item with Unicode support.
558    #[must_use]
559    pub fn matches_with_unicode(&self, ch: char, unicode: bool) -> bool {
560        match self {
561            CharClassItem::Single(c) => *c == ch,
562            CharClassItem::Range(start, end) => ch >= *start && ch <= *end,
563            CharClassItem::Named(class) => class.matches_with_unicode(ch, unicode),
564        }
565    }
566}
567
568/// Named character class.
569#[derive(Debug, Clone, Copy, PartialEq, Eq)]
570pub enum NamedClass {
571    /// `\d` - digits.
572    Digit,
573    /// `\D` - non-digits.
574    NotDigit,
575    /// `\w` - word characters.
576    Word,
577    /// `\W` - non-word characters.
578    NotWord,
579    /// `\s` - whitespace.
580    Whitespace,
581    /// `\S` - non-whitespace.
582    NotWhitespace,
583    /// `.` - any character (including newlines, for `dot_all` mode).
584    Any,
585    /// `.` - any character except newlines (default mode).
586    AnyExceptNewline,
587}
588
589impl NamedClass {
590    /// Check if a character matches this named class (ASCII mode).
591    #[must_use]
592    pub fn matches(&self, ch: char) -> bool {
593        self.matches_with_unicode(ch, false)
594    }
595
596    /// Check if a character matches this named class.
597    /// When `unicode` is true, uses Unicode character classes.
598    #[must_use]
599    pub fn matches_with_unicode(&self, ch: char, unicode: bool) -> bool {
600        match self {
601            NamedClass::Digit => {
602                if unicode {
603                    ch.is_ascii_digit() || unicode_digit(ch)
604                } else {
605                    ch.is_ascii_digit()
606                }
607            }
608            NamedClass::NotDigit => {
609                if unicode {
610                    !(ch.is_ascii_digit() || unicode_digit(ch))
611                } else {
612                    !ch.is_ascii_digit()
613                }
614            }
615            NamedClass::Word => {
616                if unicode {
617                    ch.is_alphanumeric() || ch == '_' || unicode_word_char(ch)
618                } else {
619                    ch.is_ascii_alphanumeric() || ch == '_'
620                }
621            }
622            NamedClass::NotWord => {
623                if unicode {
624                    !(ch.is_alphanumeric() || ch == '_' || unicode_word_char(ch))
625                } else {
626                    !(ch.is_ascii_alphanumeric() || ch == '_')
627                }
628            }
629            NamedClass::Whitespace => {
630                if unicode {
631                    ch.is_whitespace() || unicode_whitespace(ch)
632                } else {
633                    ch.is_ascii_whitespace()
634                }
635            }
636            NamedClass::NotWhitespace => {
637                if unicode {
638                    !(ch.is_whitespace() || unicode_whitespace(ch))
639                } else {
640                    !ch.is_ascii_whitespace()
641                }
642            }
643            NamedClass::Any => true,
644            NamedClass::AnyExceptNewline => ch != '\n' && ch != '\r',
645        }
646    }
647}
648
/// Check if character is a Unicode digit (outside ASCII).
///
/// Only the scripts listed below are recognised; ASCII `0`-`9` are not
/// covered here and must be tested separately by the caller.
fn unicode_digit(ch: char) -> bool {
    matches!(ch,
        '\u{0660}'..='\u{0669}' |  // Arabic-Indic digits
        '\u{06F0}'..='\u{06F9}' |  // Extended Arabic-Indic digits
        '\u{0966}'..='\u{096F}' |  // Devanagari digits
        '\u{0E50}'..='\u{0E59}' |  // Thai digits
        '\u{FF10}'..='\u{FF19}' |  // Fullwidth digits
        // Osmanya digits. The range must stop at U+104A9: the code points
        // that follow are unassigned or Osage letters (U+104B0 onwards),
        // not digits.
        '\u{104A0}'..='\u{104A9}' |
        '\u{1D7CE}'..='\u{1D7FF}'  // Mathematical digits (bold and other styles)
    )
}
661
/// Check if character is a Unicode word character beyond ASCII.
///
/// Membership is decided by the block table below, with one exception: the
/// space and line/paragraph separator characters that live inside the General
/// Punctuation block are never word characters, so a character cannot be
/// classified as both "word" and "whitespace".
fn unicode_word_char(ch: char) -> bool {
    // Spaces and separators inside U+2000..=U+206F. Without this carve-out
    // the General Punctuation range below would classify e.g. EM SPACE
    // (U+2003) as a word character.
    let is_space_or_separator = matches!(ch,
        '\u{2000}'..='\u{200A}' |  // En Quad to Hair Space
        '\u{2028}' |  // Line Separator
        '\u{2029}' |  // Paragraph Separator
        '\u{202F}' |  // Narrow No-Break Space
        '\u{205F}'    // Medium Mathematical Space
    );
    if is_space_or_separator {
        return false;
    }

    // NOTE(review): several blocks below (arrows, box drawing, pictographs, ...)
    // are symbol blocks rather than letters or connectors; they are kept
    // unchanged here. Confirm whether treating them as word characters is
    // intended.
    matches!(ch,
        '\u{00C0}'..='\u{024F}' |  // Latin Extended
        '\u{0250}'..='\u{02AF}' |  // IPA Extensions
        '\u{02B0}'..='\u{02FF}' |  // Spacing Modifier Letters
        '\u{0300}'..='\u{036F}' |  // Combining Diacritical Marks
        '\u{0370}'..='\u{03FF}' |  // Greek
        '\u{0400}'..='\u{04FF}' |  // Cyrillic
        '\u{0500}'..='\u{052F}' |  // Cyrillic Supplement
        '\u{0530}'..='\u{058F}' |  // Armenian
        '\u{0590}'..='\u{05FF}' |  // Hebrew
        '\u{0600}'..='\u{06FF}' |  // Arabic
        '\u{0900}'..='\u{097F}' |  // Devanagari
        '\u{4E00}'..='\u{9FFF}' |  // CJK
        '\u{3400}'..='\u{4DBF}' |  // CJK Extension A
        '\u{F900}'..='\u{FAFF}' |  // CJK Compatibility
        '\u{2000}'..='\u{206F}' |  // General Punctuation (spaces excluded above)
        '\u{2070}'..='\u{209F}' |  // Superscripts/Subscripts
        '\u{20A0}'..='\u{20CF}' |  // Currency Symbols
        '\u{2100}'..='\u{214F}' |  // Letterlike Symbols
        '\u{2150}'..='\u{218F}' |  // Number Forms
        '\u{2190}'..='\u{21FF}' |  // Arrows
        '\u{2200}'..='\u{22FF}' |  // Mathematical Operators
        '\u{2300}'..='\u{23FF}' |  // Miscellaneous Technical
        '\u{2460}'..='\u{24FF}' |  // Enclosed Alphanumerics
        '\u{2500}'..='\u{257F}' |  // Box Drawing
        '\u{2580}'..='\u{259F}' |  // Block Elements
        '\u{25A0}'..='\u{25FF}' |  // Geometric Shapes
        '\u{2600}'..='\u{26FF}' |  // Miscellaneous Symbols
        '\u{2700}'..='\u{27BF}' |  // Dingbats
        '\u{FB00}'..='\u{FB4F}' |  // Alphabetic Presentation Forms
        '\u{FB50}'..='\u{FDFF}' |  // Arabic Presentation Forms A
        '\u{FE70}'..='\u{FEFF}' |  // Arabic Presentation Forms B
        '\u{FF00}'..='\u{FFEF}' |  // Halfwidth/Fullwidth Forms
        '\u{1F600}'..='\u{1F64F}' | // Emoticons
        '\u{1F300}'..='\u{1F5FF}' | // Misc Symbols and Pictographs
        '\u{1F680}'..='\u{1F6FF}' | // Transport and Map
        '\u{1F900}'..='\u{1F9FF}' | // Supplemental Symbols
        '\u{1FA00}'..='\u{1FA6F}' | // Chess Symbols
        '\u{1FA70}'..='\u{1FAFF}' | // Symbols and Pictographs Extended
        '\u{1F170}'..='\u{1F19A}' | // Enclosed Alphanumeric Supplement
        '\u{00B5}' // Micro sign
    )
}
708
/// Report whether `ch` is one of the non-ASCII whitespace code points listed
/// in the table below.
fn unicode_whitespace(ch: char) -> bool {
    // Inclusive `(first, last)` code point ranges; a single character is
    // written as a range whose two ends are equal.
    const EXTRA_SPACES: [(char, char); 9] = [
        ('\u{0085}', '\u{0085}'), // Next Line (NEL)
        ('\u{00A0}', '\u{00A0}'), // No-Break Space
        ('\u{1680}', '\u{1680}'), // Ogham Space Mark
        ('\u{2000}', '\u{200A}'), // En Quad to Hair Space
        ('\u{2028}', '\u{2028}'), // Line Separator
        ('\u{2029}', '\u{2029}'), // Paragraph Separator
        ('\u{202F}', '\u{202F}'), // Narrow No-Break Space
        ('\u{205F}', '\u{205F}'), // Medium Mathematical Space
        ('\u{3000}', '\u{3000}'), // Ideographic Space
    ];

    EXTRA_SPACES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
725
/// Repetition bounds attached to a quantified expression.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Quantifier {
    /// `*`: zero or more repetitions.
    ZeroOrMore,
    /// `+`: one or more repetitions.
    OneOrMore,
    /// `?`: zero or one repetition.
    ZeroOrOne,
    /// `{n}`: exactly `n` repetitions.
    Exactly(usize),
    /// `{n,}`: at least `n` repetitions.
    AtLeast(usize),
    /// `{n,m}`: between `n` and `m` repetitions, both inclusive.
    Between(usize, usize),
}

impl Quantifier {
    /// Lower bound on the number of repetitions.
    #[must_use]
    pub fn min(&self) -> usize {
        match *self {
            Self::Exactly(lower) | Self::AtLeast(lower) | Self::Between(lower, _) => lower,
            Self::OneOrMore => 1,
            Self::ZeroOrOne | Self::ZeroOrMore => 0,
        }
    }

    /// Upper bound on the number of repetitions, or `None` when unbounded.
    #[must_use]
    pub fn max(&self) -> Option<usize> {
        let upper = match *self {
            Self::ZeroOrOne => 1,
            Self::Exactly(upper) | Self::Between(_, upper) => upper,
            // `*`, `+` and `{n,}` have no upper limit.
            Self::ZeroOrMore | Self::OneOrMore | Self::AtLeast(_) => return None,
        };
        Some(upper)
    }
}
765
/// Anchor type.
///
/// Covers the line/string anchors `^` and `$` as well as the word-boundary
/// assertions `\b` and `\B`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Anchor {
    /// `^` - start of string/line.
    Start,
    /// `$` - end of string/line.
    End,
    /// `\b` - word boundary.
    WordBoundary,
    /// `\B` - non-word boundary.
    NotWordBoundary,
}