fuzzy_regex/parser/ast.rs
1//! Abstract Syntax Tree definitions for fuzzy regex patterns.
2
3#![allow(clippy::match_same_arms, clippy::too_many_lines)]
4// Note: enum_clone_variant is not a valid lint
5
6use crate::types::FuzzyLimits;
7
/// Pattern-level matching flags, as set by inline flag groups such as `(?b)`.
///
/// Every flag is `false` in the [`Default`] value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct MatchFlags {
    /// `(?b)` BESTMATCH: search for the best match instead of the first.
    pub best_match: bool,
    /// `(?e)` ENHANCEMATCH: improve the fit of fuzzy matches.
    pub enhance_match: bool,
    /// `(?p)` POSIX leftmost-longest: find the longest match at the leftmost position.
    pub posix: bool,
    /// `(?x)` verbose: ignore whitespace and allow comments.
    pub verbose: bool,
    /// `(?s)` dot-all: `.` also matches newlines.
    pub dot_all: bool,
    /// `(?m)` multi-line: `^`/`$` match at line boundaries.
    pub multi_line: bool,
    /// `(?U)` ungreedy: invert the default greediness of quantifiers.
    pub ungreedy: bool,
    /// `(?i)` case-insensitive matching.
    pub case_insensitive: bool,
    /// `(?g)` global: find all matches, not just the first.
    pub global: bool,
    /// `(?u)` Unicode: enable Unicode character classes.
    pub unicode: bool,
}

impl MatchFlags {
    /// Returns a flag set with every flag cleared.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a copy of `self` with the BESTMATCH flag switched on.
    #[must_use]
    pub fn with_best_match(self) -> Self {
        Self {
            best_match: true,
            ..self
        }
    }

    /// Returns a copy of `self` with the ENHANCEMATCH flag switched on.
    #[must_use]
    pub fn with_enhance_match(self) -> Self {
        Self {
            enhance_match: true,
            ..self
        }
    }

    /// Returns a copy of `self` with the POSIX flag switched on.
    #[must_use]
    pub fn with_posix(self) -> Self {
        Self {
            posix: true,
            ..self
        }
    }
}
61
/// Fuzziness specification for a literal segment.
///
/// Supports two syntax styles:
/// 1. Simple: `hello~2` (allows 2 total edits)
/// 2. mrab-style: `(?:hello){i<=1,d<=2}` (max 1 insertion, 2 deletions)
///
/// The [`Default`] value is [`Fuzziness::Inherited`].
#[derive(Debug, Clone, PartialEq, Default)]
pub enum Fuzziness {
    /// Simple edit count: `hello~2` allows 2 total edits.
    Edits(u8),
    /// Detailed per-operation limits: `hello~{i=1,d=0,s=2}` or mrab-style `{i<=1,d<=2}`.
    Detailed(FuzzyLimits),
    /// mrab-style specification with optional cost constraints,
    /// e.g. `{i<=1,s<=2,2i+2d+1s<=4}`.
    MrabStyle(MrabFuzziness),
    /// Inherit from global/parent settings (this is the default variant).
    #[default]
    Inherited,
    /// Exact match only (no fuzzy matching): `hello~0`.
    Exact,
}
82
83/// mrab-regex style fuzziness specification.
84///
85/// Supports:
86/// - Error type limits: `{i<=1,d<=2,s<=3,t<=1}`
87/// - Total error limits: `{e<=5}`
88/// - Cost constraints: `{2i+2d+1s+1t<=4}` or `{c<=4}` (equal cost)
89/// - Ranges: `{1<=e<=3}`
90/// - Exclusive bounds: `{i<3}` (fewer than 3)
91/// - Character class restrictions: `{s<=2:[a-z]}` (restricts edit characters to class)
92/// - Unlimited errors: `{e}` (any number of errors allowed)
93#[derive(Debug, Clone, PartialEq, Default)]
94pub struct MrabFuzziness {
95 /// Maximum insertions allowed (None = not specified).
96 pub max_insertions: Option<u8>,
97 /// Maximum deletions allowed.
98 pub max_deletions: Option<u8>,
99 /// Maximum substitutions allowed.
100 pub max_substitutions: Option<u8>,
101 /// Maximum transpositions allowed.
102 pub max_transpositions: Option<u8>,
103 /// Maximum total errors allowed.
104 pub max_errors: Option<u8>,
105 /// Minimum total errors required.
106 pub min_errors: Option<u8>,
107 /// Whether insertions are unlimited (e.g., `{i}` without a value).
108 pub unlimited_insertions: bool,
109 /// Whether deletions are unlimited.
110 pub unlimited_deletions: bool,
111 /// Whether substitutions are unlimited.
112 pub unlimited_substitutions: bool,
113 /// Whether transpositions are unlimited.
114 pub unlimited_transpositions: bool,
115 /// Whether total errors are unlimited (e.g., `{e}` without a value).
116 pub unlimited_errors: bool,
117 /// Cost for insertions (for cost-based constraints).
118 pub insertion_cost: Option<u8>,
119 /// Cost for deletions.
120 pub deletion_cost: Option<u8>,
121 /// Cost for substitutions.
122 pub substitution_cost: Option<u8>,
123 /// Cost for transpositions.
124 pub transposition_cost: Option<u8>,
125 /// Maximum total cost.
126 pub max_cost: Option<u8>,
127 /// Character class restriction for substitutions (e.g., `[a-z]`).
128 /// Substituted characters must be in this class.
129 pub substitution_chars: Option<CharClass>,
130 /// Character class restriction for insertions.
131 /// Inserted characters must be in this class.
132 pub insertion_chars: Option<CharClass>,
133 /// Character class restriction for deletions.
134 /// Note: Deletions don't introduce new characters, so this is less meaningful.
135 pub deletion_chars: Option<CharClass>,
136}
137
138impl MrabFuzziness {
139 /// Create a new empty mrab-style fuzziness.
140 #[must_use]
141 pub fn new() -> Self {
142 Self::default()
143 }
144
145 /// Set maximum insertions.
146 #[must_use]
147 pub fn insertions(mut self, max: u8) -> Self {
148 self.max_insertions = Some(max);
149 self
150 }
151
152 /// Set maximum deletions.
153 #[must_use]
154 pub fn deletions(mut self, max: u8) -> Self {
155 self.max_deletions = Some(max);
156 self
157 }
158
159 /// Set maximum substitutions.
160 #[must_use]
161 pub fn substitutions(mut self, max: u8) -> Self {
162 self.max_substitutions = Some(max);
163 self
164 }
165
166 /// Set maximum total errors.
167 #[must_use]
168 pub fn errors(mut self, max: u8) -> Self {
169 self.max_errors = Some(max);
170 self
171 }
172
173 /// Set error range.
174 #[must_use]
175 pub fn error_range(mut self, min: u8, max: u8) -> Self {
176 self.min_errors = Some(min);
177 self.max_errors = Some(max);
178 self
179 }
180
181 /// Convert to `FuzzyLimits`.
182 #[must_use]
183 pub fn to_limits(&self) -> FuzzyLimits {
184 // Use 255 as "unlimited" since that's the max value for u8
185 const UNLIMITED: u8 = 255;
186
187 let mut limits = FuzzyLimits::new();
188
189 // Handle insertions
190 if let Some(i) = self.max_insertions {
191 limits = limits.insertions(i);
192 } else if self.unlimited_insertions {
193 limits = limits.insertions(UNLIMITED);
194 }
195
196 // Handle deletions
197 if let Some(d) = self.max_deletions {
198 limits = limits.deletions(d);
199 } else if self.unlimited_deletions {
200 limits = limits.deletions(UNLIMITED);
201 }
202
203 // Handle substitutions
204 if let Some(s) = self.max_substitutions {
205 limits = limits.substitutions(s);
206 } else if self.unlimited_substitutions {
207 limits = limits.substitutions(UNLIMITED);
208 }
209
210 // Handle transpositions
211 if let Some(t) = self.max_transpositions {
212 limits = limits.swaps(t);
213 } else if self.unlimited_transpositions {
214 limits = limits.swaps(UNLIMITED);
215 }
216
217 // Handle total errors
218 if let Some(e) = self.max_errors {
219 limits = limits.edits(e);
220 } else if self.unlimited_errors {
221 limits = limits.edits(UNLIMITED);
222 } else if let Some(max_cost) = self.max_cost {
223 // When cost constraint is used without explicit error limit,
224 // infer max_edits from cost constraint. Use max_cost divided by
225 // minimum operation cost (at least 1).
226 let min_cost = [
227 self.insertion_cost.unwrap_or(1),
228 self.deletion_cost.unwrap_or(1),
229 self.substitution_cost.unwrap_or(1),
230 self.transposition_cost.unwrap_or(1),
231 ]
232 .into_iter()
233 .filter(|&c| c > 0)
234 .min()
235 .unwrap_or(1);
236 // max_cost is stored as N+1 for <=N, so subtract 1 to get actual limit
237 let actual_max_cost = max_cost.saturating_sub(1);
238 let inferred_max_edits = actual_max_cost / min_cost;
239 limits = limits.edits(inferred_max_edits);
240 }
241
242 limits
243 }
244
245 /// Check if this fuzziness specification has any unlimited flags set.
246 #[must_use]
247 pub fn has_unlimited(&self) -> bool {
248 self.unlimited_insertions
249 || self.unlimited_deletions
250 || self.unlimited_substitutions
251 || self.unlimited_transpositions
252 || self.unlimited_errors
253 }
254}
255
256impl Fuzziness {
257 /// Convert to `FuzzyLimits`, using default if inherited.
258 #[must_use]
259 pub fn to_limits(&self, default_edits: u8) -> Option<FuzzyLimits> {
260 match self {
261 Fuzziness::Exact => Some(FuzzyLimits::new().edits(0)),
262 Fuzziness::Edits(n) => Some(FuzzyLimits::new().edits(*n)),
263 Fuzziness::Detailed(limits) => Some(limits.clone()),
264 Fuzziness::MrabStyle(mrab) => Some(mrab.to_limits()),
265 Fuzziness::Inherited => {
266 if default_edits > 0 {
267 Some(FuzzyLimits::new().edits(default_edits))
268 } else {
269 None
270 }
271 }
272 }
273 }
274
275 /// Get the minimum edits required (for exclusive lower bounds like `{0<e<5}`).
276 #[must_use]
277 pub fn min_edits(&self) -> Option<u8> {
278 match self {
279 Fuzziness::MrabStyle(mrab) => mrab.min_errors,
280 _ => None,
281 }
282 }
283}
284
/// AST node representing a parsed regex pattern.
///
/// The type is recursive: composite nodes hold their sub-expressions either in a
/// `Vec<Ast>` (sequences and alternatives) or in a `Box<Ast>` (single child).
#[derive(Debug, Clone, PartialEq)]
pub enum Ast {
    /// Empty pattern.
    Empty,

    /// Literal string with optional fuzziness: `hello`, `hello~2`.
    Literal {
        /// The literal text to match.
        text: String,
        /// Fuzziness specification for approximate matching.
        fuzziness: Fuzziness,
    },

    /// Single character (from escape or plain char outside literals).
    Char(char),

    /// Character class: `[a-z]`, `[^abc]`, `\d`, `\w`, `\s`, `.`
    CharClass(CharClass),

    /// Concatenation of patterns, matched in order.
    Concat(Vec<Ast>),

    /// Alternation: `a|b|c`.
    Alternation(Vec<Ast>),

    /// Quantified expression: `a*`, `a+`, `a?`, `a{n,m}`.
    Quantified {
        /// The expression being quantified.
        expr: Box<Ast>,
        /// The quantifier specifying repetition bounds.
        quantifier: Quantifier,
        /// Whether the quantifier is greedy (matches as much as possible).
        greedy: bool,
    },

    /// Capture group: `(expr)`.
    Group {
        /// The capture group index (1-based for user-facing, 0-based internally).
        index: usize,
        /// Optional name for named capture groups like `(?P<name>...)`.
        name: Option<String>,
        /// The expression contained in the group.
        expr: Box<Ast>,
    },

    /// Non-capturing group: `(?:expr)`.
    NonCapturingGroup {
        /// The expression contained in the group.
        expr: Box<Ast>,
        /// Fuzziness specification applied to this group.
        fuzziness: Fuzziness,
    },

    /// Anchor: `^`, `$`, `\b`, `\B`.
    Anchor(Anchor),

    /// Lookahead: `(?=...)`, `(?!...)`.
    Lookahead {
        /// True for positive lookahead `(?=...)`, false for negative `(?!...)`.
        positive: bool,
        /// The expression to match in the lookahead.
        expr: Box<Ast>,
    },

    /// Lookbehind: `(?<=...)`, `(?<!...)`.
    Lookbehind {
        /// True for positive lookbehind `(?<=...)`, false for negative `(?<!...)`.
        positive: bool,
        /// The expression to match in the lookbehind.
        expr: Box<Ast>,
    },

    /// Backreference: `\1`, `\2`, optionally with fuzziness `\1{e<=1}`.
    Backreference {
        /// The capture group number being referenced.
        group: usize,
        /// Fuzziness specification for approximate backreference matching.
        fuzziness: Fuzziness,
    },

    /// Named list reference: `\L<name>`.
    NamedList {
        /// The name of the list.
        name: String,
    },

    /// Reset match start: `\K`
    /// Resets the starting point of the match. Everything before \K is matched
    /// but not included in the final match result.
    ResetMatchStart,

    /// Atomic group: `(?>...)`
    /// Once the group matches, backtracking is disabled within the group.
    AtomicGroup {
        /// The expression contained in the atomic group.
        expr: Box<Ast>,
    },

    /// Recursive entire pattern: `(?R)`
    /// Recursively matches the entire pattern.
    RecursivePattern,

    /// Recursive numbered group: `(?1)`, `(?2)`, etc.
    /// Recursively matches a specific capture group.
    RecursiveGroup {
        /// The capture group number to recurse into.
        group: usize,
    },

    /// Recursive named group: `(?&name)` or `(?P>name)`
    /// Recursively matches a named capture group.
    RecursiveNamedGroup {
        /// The name of the capture group to recurse into.
        name: String,
    },
}
402
403impl Ast {
404 /// Create a literal AST node with inherited fuzziness.
405 pub fn literal(text: impl Into<String>) -> Self {
406 Ast::Literal {
407 text: text.into(),
408 fuzziness: Fuzziness::Inherited,
409 }
410 }
411
412 /// Create a literal AST node with specific fuzziness.
413 pub fn literal_fuzzy(text: impl Into<String>, fuzziness: Fuzziness) -> Self {
414 Ast::Literal {
415 text: text.into(),
416 fuzziness,
417 }
418 }
419
420 /// Create a quantified AST node.
421 #[must_use]
422 pub fn quantified(expr: Ast, quantifier: Quantifier, greedy: bool) -> Self {
423 Ast::Quantified {
424 expr: Box::new(expr),
425 quantifier,
426 greedy,
427 }
428 }
429
430 /// Create a capture group.
431 #[must_use]
432 pub fn group(index: usize, expr: Ast) -> Self {
433 Ast::Group {
434 index,
435 name: None,
436 expr: Box::new(expr),
437 }
438 }
439
440 /// Create a named capture group.
441 pub fn named_group(index: usize, name: impl Into<String>, expr: Ast) -> Self {
442 Ast::Group {
443 index,
444 name: Some(name.into()),
445 expr: Box::new(expr),
446 }
447 }
448
449 /// Check if this AST is empty.
450 #[must_use]
451 pub fn is_empty(&self) -> bool {
452 matches!(self, Ast::Empty)
453 }
454}
455
456/// Character class definition.
457#[derive(Debug, Clone, PartialEq)]
458pub struct CharClass {
459 /// Whether this is a negated class `[^...]`.
460 pub negated: bool,
461 /// The ranges/characters in this class.
462 pub items: Vec<CharClassItem>,
463}
464
465impl CharClass {
466 /// Create a new character class.
467 #[must_use]
468 pub fn new(negated: bool, items: Vec<CharClassItem>) -> Self {
469 CharClass { negated, items }
470 }
471
472 /// Create a character class matching any character except newlines (default `.`).
473 #[must_use]
474 pub fn any() -> Self {
475 CharClass {
476 negated: false,
477 items: vec![CharClassItem::Named(NamedClass::AnyExceptNewline)],
478 }
479 }
480
481 /// Create a character class matching any character including newlines (`dot_all` `.`).
482 #[must_use]
483 pub fn any_with_newlines() -> Self {
484 CharClass {
485 negated: false,
486 items: vec![CharClassItem::Named(NamedClass::Any)],
487 }
488 }
489
490 /// Create a digit class (`\d`).
491 #[must_use]
492 pub fn digit() -> Self {
493 CharClass {
494 negated: false,
495 items: vec![CharClassItem::Named(NamedClass::Digit)],
496 }
497 }
498
499 /// Create a word class (`\w`).
500 #[must_use]
501 pub fn word() -> Self {
502 CharClass {
503 negated: false,
504 items: vec![CharClassItem::Named(NamedClass::Word)],
505 }
506 }
507
508 /// Create a whitespace class (`\s`).
509 #[must_use]
510 pub fn whitespace() -> Self {
511 CharClass {
512 negated: false,
513 items: vec![CharClassItem::Named(NamedClass::Whitespace)],
514 }
515 }
516
517 /// Check if a character matches this class.
518 #[must_use]
519 pub fn matches(&self, ch: char) -> bool {
520 let in_class = self.items.iter().any(|item| item.matches(ch));
521 if self.negated { !in_class } else { in_class }
522 }
523
524 /// Check if a character matches this class with Unicode support.
525 #[must_use]
526 pub fn matches_unicode(&self, ch: char) -> bool {
527 let in_class = self
528 .items
529 .iter()
530 .any(|item| item.matches_with_unicode(ch, true));
531 if self.negated { !in_class } else { in_class }
532 }
533}
534
535/// An item in a character class.
536#[derive(Debug, Clone, PartialEq)]
537pub enum CharClassItem {
538 /// Single character.
539 Single(char),
540 /// Character range: `a-z`.
541 Range(char, char),
542 /// Named character class: `\d`, `\w`, `\s`.
543 Named(NamedClass),
544}
545
546impl CharClassItem {
547 /// Check if a character matches this item.
548 #[must_use]
549 pub fn matches(&self, ch: char) -> bool {
550 match self {
551 CharClassItem::Single(c) => *c == ch,
552 CharClassItem::Range(start, end) => ch >= *start && ch <= *end,
553 CharClassItem::Named(class) => class.matches(ch),
554 }
555 }
556
557 /// Check if a character matches this item with Unicode support.
558 #[must_use]
559 pub fn matches_with_unicode(&self, ch: char, unicode: bool) -> bool {
560 match self {
561 CharClassItem::Single(c) => *c == ch,
562 CharClassItem::Range(start, end) => ch >= *start && ch <= *end,
563 CharClassItem::Named(class) => class.matches_with_unicode(ch, unicode),
564 }
565 }
566}
567
568/// Named character class.
569#[derive(Debug, Clone, Copy, PartialEq, Eq)]
570pub enum NamedClass {
571 /// `\d` - digits.
572 Digit,
573 /// `\D` - non-digits.
574 NotDigit,
575 /// `\w` - word characters.
576 Word,
577 /// `\W` - non-word characters.
578 NotWord,
579 /// `\s` - whitespace.
580 Whitespace,
581 /// `\S` - non-whitespace.
582 NotWhitespace,
583 /// `.` - any character (including newlines, for `dot_all` mode).
584 Any,
585 /// `.` - any character except newlines (default mode).
586 AnyExceptNewline,
587}
588
589impl NamedClass {
590 /// Check if a character matches this named class (ASCII mode).
591 #[must_use]
592 pub fn matches(&self, ch: char) -> bool {
593 self.matches_with_unicode(ch, false)
594 }
595
596 /// Check if a character matches this named class.
597 /// When `unicode` is true, uses Unicode character classes.
598 #[must_use]
599 pub fn matches_with_unicode(&self, ch: char, unicode: bool) -> bool {
600 match self {
601 NamedClass::Digit => {
602 if unicode {
603 ch.is_ascii_digit() || unicode_digit(ch)
604 } else {
605 ch.is_ascii_digit()
606 }
607 }
608 NamedClass::NotDigit => {
609 if unicode {
610 !(ch.is_ascii_digit() || unicode_digit(ch))
611 } else {
612 !ch.is_ascii_digit()
613 }
614 }
615 NamedClass::Word => {
616 if unicode {
617 ch.is_alphanumeric() || ch == '_' || unicode_word_char(ch)
618 } else {
619 ch.is_ascii_alphanumeric() || ch == '_'
620 }
621 }
622 NamedClass::NotWord => {
623 if unicode {
624 !(ch.is_alphanumeric() || ch == '_' || unicode_word_char(ch))
625 } else {
626 !(ch.is_ascii_alphanumeric() || ch == '_')
627 }
628 }
629 NamedClass::Whitespace => {
630 if unicode {
631 ch.is_whitespace() || unicode_whitespace(ch)
632 } else {
633 ch.is_ascii_whitespace()
634 }
635 }
636 NamedClass::NotWhitespace => {
637 if unicode {
638 !(ch.is_whitespace() || unicode_whitespace(ch))
639 } else {
640 !ch.is_ascii_whitespace()
641 }
642 }
643 NamedClass::Any => true,
644 NamedClass::AnyExceptNewline => ch != '\n' && ch != '\r',
645 }
646 }
647}
648
/// Check if character is a Unicode decimal digit outside ASCII.
///
/// ASCII `0`-`9` are not covered here and return `false`. The table lists the
/// decimal-digit runs of a number of scripts; it is not the complete Unicode `Nd`
/// category.
fn unicode_digit(ch: char) -> bool {
    matches!(ch,
        '\u{0660}'..='\u{0669}' | // Arabic-Indic digits
        '\u{06F0}'..='\u{06F9}' | // Extended Arabic-Indic digits
        '\u{07C0}'..='\u{07C9}' | // NKo digits
        '\u{0966}'..='\u{096F}' | // Devanagari digits
        '\u{09E6}'..='\u{09EF}' | // Bengali digits
        '\u{0A66}'..='\u{0A6F}' | // Gurmukhi digits
        '\u{0AE6}'..='\u{0AEF}' | // Gujarati digits
        '\u{0B66}'..='\u{0B6F}' | // Oriya digits
        '\u{0BE6}'..='\u{0BEF}' | // Tamil digits
        '\u{0C66}'..='\u{0C6F}' | // Telugu digits
        '\u{0CE6}'..='\u{0CEF}' | // Kannada digits
        '\u{0D66}'..='\u{0D6F}' | // Malayalam digits
        '\u{0E50}'..='\u{0E59}' | // Thai digits
        '\u{0ED0}'..='\u{0ED9}' | // Lao digits
        '\u{0F20}'..='\u{0F29}' | // Tibetan digits
        '\u{1040}'..='\u{1049}' | // Myanmar digits
        '\u{17E0}'..='\u{17E9}' | // Khmer digits
        '\u{1810}'..='\u{1819}' | // Mongolian digits
        '\u{FF10}'..='\u{FF19}' | // Fullwidth digits
        // Osmanya digits end at U+104A9; U+104B0 onwards are Osage *letters* and
        // must not be treated as digits.
        '\u{104A0}'..='\u{104A9}' |
        '\u{1D7CE}'..='\u{1D7FF}' // Mathematical digits (bold, double-struck, sans-serif, monospace)
    )
}
661
/// Check if character is a Unicode word character beyond ASCII.
///
/// Covers letters, combining marks, connector punctuation and join controls for a
/// fixed set of scripts. ASCII characters return `false`.
///
/// Most ranges are block-level approximations and may still include a few
/// non-letter code points. Blocks made up only of punctuation, whitespace, symbols
/// or pictographs are deliberately left out, so `\w` does not match characters such
/// as U+2003 EM SPACE, `—`, `€`, `→` or emoji.
fn unicode_word_char(ch: char) -> bool {
    matches!(ch,
        '\u{00B5}' | // Micro sign
        '\u{00C0}'..='\u{00D6}' | // Latin-1 letters (stops before U+00D7 multiplication sign)
        '\u{00D8}'..='\u{00F6}' | // Latin-1 letters (stops before U+00F7 division sign)
        '\u{00F8}'..='\u{024F}' | // Latin-1 letters, Latin Extended-A/B
        '\u{0250}'..='\u{02AF}' | // IPA Extensions
        '\u{02B0}'..='\u{02FF}' | // Spacing Modifier Letters
        '\u{0300}'..='\u{036F}' | // Combining Diacritical Marks
        '\u{0370}'..='\u{03FF}' | // Greek
        '\u{0400}'..='\u{04FF}' | // Cyrillic
        '\u{0500}'..='\u{052F}' | // Cyrillic Supplement
        '\u{0530}'..='\u{058F}' | // Armenian
        '\u{0590}'..='\u{05FF}' | // Hebrew
        '\u{0600}'..='\u{06FF}' | // Arabic
        '\u{0900}'..='\u{097F}' | // Devanagari
        // From General Punctuation only the join controls and connectors are word
        // characters; the rest of the block is spaces, dashes, quotes and bullets.
        '\u{200C}'..='\u{200D}' | // Zero width non-joiner / joiner
        '\u{203F}'..='\u{2040}' | // Undertie, character tie
        '\u{2054}' | // Inverted undertie
        '\u{2070}'..='\u{209F}' | // Superscripts/Subscripts
        '\u{2100}'..='\u{214F}' | // Letterlike Symbols
        '\u{2150}'..='\u{218F}' | // Number Forms
        '\u{2460}'..='\u{24FF}' | // Enclosed Alphanumerics
        '\u{2776}'..='\u{2793}' | // Dingbat circled digits
        '\u{3400}'..='\u{4DBF}' | // CJK Extension A
        '\u{4E00}'..='\u{9FFF}' | // CJK
        '\u{F900}'..='\u{FAFF}' | // CJK Compatibility
        '\u{FB00}'..='\u{FB4F}' | // Alphabetic Presentation Forms
        '\u{FB50}'..='\u{FDFF}' | // Arabic Presentation Forms A
        '\u{FE70}'..='\u{FEFC}' | // Arabic Presentation Forms B (stops before U+FEFF BOM)
        '\u{FF10}'..='\u{FF19}' | // Fullwidth digits
        '\u{FF21}'..='\u{FF3A}' | // Fullwidth Latin capitals
        '\u{FF3F}' | // Fullwidth low line
        '\u{FF41}'..='\u{FF5A}' | // Fullwidth Latin small letters
        '\u{FF66}'..='\u{FFDC}' | // Halfwidth Katakana and Hangul
        '\u{1F170}'..='\u{1F19A}' // Enclosed Alphanumeric Supplement
    )
}
708
/// Reports whether `ch` is one of the non-ASCII Unicode whitespace characters.
///
/// ASCII whitespace is not covered here and returns `false`.
fn unicode_whitespace(ch: char) -> bool {
    // Whitespace code points that do not form a contiguous run.
    const ISOLATED: [char; 8] = [
        '\u{0085}', // Next Line (NEL)
        '\u{00A0}', // No-Break Space
        '\u{1680}', // Ogham Space Mark
        '\u{2028}', // Line Separator
        '\u{2029}', // Paragraph Separator
        '\u{202F}', // Narrow No-Break Space
        '\u{205F}', // Medium Mathematical Space
        '\u{3000}', // Ideographic Space
    ];
    // En Quad through Hair Space.
    let in_space_run = ('\u{2000}'..='\u{200A}').contains(&ch);
    in_space_run || ISOLATED.contains(&ch)
}
725
/// Repetition bounds attached to a quantified expression.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Quantifier {
    /// `*` - zero or more.
    ZeroOrMore,
    /// `+` - one or more.
    OneOrMore,
    /// `?` - zero or one.
    ZeroOrOne,
    /// `{n}` - exactly n.
    Exactly(usize),
    /// `{n,}` - at least n.
    AtLeast(usize),
    /// `{n,m}` - between n and m (inclusive).
    Between(usize, usize),
}

impl Quantifier {
    /// Returns the smallest number of repetitions this quantifier accepts.
    #[must_use]
    pub fn min(&self) -> usize {
        match *self {
            Self::OneOrMore => 1,
            Self::Exactly(lower) | Self::AtLeast(lower) | Self::Between(lower, _) => lower,
            Self::ZeroOrMore | Self::ZeroOrOne => 0,
        }
    }

    /// Returns the largest number of repetitions this quantifier accepts,
    /// or `None` when it is unbounded.
    #[must_use]
    pub fn max(&self) -> Option<usize> {
        match *self {
            Self::ZeroOrOne => Some(1),
            Self::Exactly(upper) | Self::Between(_, upper) => Some(upper),
            Self::ZeroOrMore | Self::OneOrMore | Self::AtLeast(_) => None,
        }
    }
}
765
/// Anchor type: a zero-width assertion about the current position.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Anchor {
    /// `^` - start of string/line.
    Start,
    /// `$` - end of string/line.
    End,
    /// `\b` - word boundary.
    WordBoundary,
    /// `\B` - non-word boundary.
    NotWordBoundary,
}