1use crate::error::ParseError;
4use pcre2::bytes::{Regex as Pcre2Regex, RegexBuilder as Pcre2RegexBuilder};
5use serde::{Deserialize, Serialize};
6use std::sync::{Arc, OnceLock};
7
/// Regex for a whole "word": one or more word characters, hyphens, or apostrophes.
const DEFAULT_WORD_REGEX: &str = r"[\w\-']+";
/// Regex character class for a single word character, hyphen, or apostrophe.
const WORD_CHAR_CLASS_REGEX: &str = r"[\w\-']";
/// Zero-width assertion built from lookarounds: matches where a character of
/// the word class starts (not preceded by one) or ends (not followed by one).
const WORD_BOUNDARY_REGEX: &str = r"(?:(?=[\w\-'])(?<![\w\-'])|(?<=[\w\-'])(?![\w\-']))";
/// Value handed to the PCRE2 builder's `max_jit_stack_size` in `compile_pcre2_regex`.
// NOTE(review): the name talks about a backtrack limit, but the only visible use is
// as a JIT stack size — confirm the intended meaning.
const FANCY_REGEX_BACKTRACK_LIMIT: usize = 5_000_000;
12
/// Result of `parse_token_structure`:
/// `(variable name, class type, modifier, is_capturing)`.
type ParsedTokenStructure = (Option<String>, Option<String>, Option<String>, bool);
14
/// Boolean view of a token's modifier string, as computed by `modifier_flags`.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct ModifierFlags {
    /// The modifier contains `?`.
    pub optional: bool,
    /// The modifier contains `+`.
    pub multi_group: bool,
    /// The modifier contains `%`.
    pub extended: bool,
    /// The modifier contains `=`.
    pub strict_class: bool,
    /// The modifier contains `$`.
    pub greedy_matching: bool,
    /// The modifier contains both `[` and `]`.
    pub has_class_group_modifier: bool,
}
25
/// Category assigned to a token by `parse_token_word`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TokenKind {
    /// Text matched verbatim (punctuation, whitespace, `{{…}}` block contents).
    Literal,
    /// A `<<…>>` group.
    Capturing,
    /// A bare token that starts with a word character.
    Class,
    /// A `<!…!>` group holding only an identifier.
    Vanishing,
}
33
/// Repetition of a segment, derived from its modifier flags by `parse_quantity`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Quantity {
    /// Neither `+` nor `?` is present.
    Required,
    /// `?` is present and `+` is not.
    Optional,
    /// `+` is present (takes precedence over `?`).
    OneOrMore,
}
40
/// Anchoring applied to the class comparator by `apply_match_mode`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Hash, Serialize, Deserialize)]
pub enum MatchMode {
    /// Anchored at both ends (`^…$`).
    #[default]
    Whole,
    /// Anchored at the start only (`^…`).
    Start,
    /// Anchored at the end only (`…$`).
    End,
    /// No anchors.
    Any,
}
49
/// Type-related modifier characters found in a token's modifier string,
/// as computed by `parse_type_modifiers`.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct TypeModifierSet {
    /// The modifier contains `@`.
    pub alpha: bool,
    /// The modifier contains `#`.
    pub numeric: bool,
    /// The modifier contains `%`.
    pub extended: bool,
    /// The modifier contains `=`.
    pub strict: bool,
    /// The modifier contains `$`.
    pub greedy: bool,
}
59
/// Class restriction attached to a segment, produced by `parse_class_constraint`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ClassConstraint {
    /// Class name given explicitly (the `::CLASS` part of a capturing group).
    Explicit(String),
    /// `|`-separated items of a `[...]` group that does not start with `!`.
    Included(Vec<String>),
    /// `|`-separated items of a `[...]` group that starts with `!`;
    /// items are stored as written, including any leading `!`.
    Excluded(Vec<String>),
}
66
/// Parsed description of a single pattern token.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TokenInfo {
    /// Token text; for `{{…}}` literal blocks this is the text between the braces.
    pub token: String,
    /// Leading identifier of a capturing group; `None` for other kinds.
    pub var_name: Option<String>,
    /// Text following the identifier, if any.
    pub modifier: Option<String>,
    /// Capturing group: text after `::`. Class token: its leading identifier.
    /// Vanishing group: the name between `<!` and `!>`.
    pub class_type: Option<String>,
    /// Category of the token.
    pub kind: TokenKind,
    /// Flags derived from `modifier`.
    pub flags: ModifierFlags,
}
76
impl TokenInfo {
    /// Returns `true` when this token's kind is [`TokenKind::Capturing`].
    #[must_use]
    pub const fn is_capturing_group(&self) -> bool {
        matches!(self.kind, TokenKind::Capturing)
    }

    /// Returns `true` when this token's kind is [`TokenKind::Vanishing`].
    #[must_use]
    pub const fn is_vanishing_group(&self) -> bool {
        matches!(self.kind, TokenKind::Vanishing)
    }
}
88
/// A token together with the quantity, type modifiers, and class constraint
/// derived from it by `TelSegment::from_token_info`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TelSegment {
    /// The token this segment was built from.
    pub token_info: TokenInfo,
    /// Repetition derived from the token's flags.
    pub quantity: Quantity,
    /// Type modifier characters found in the token's modifier.
    pub type_modifiers: TypeModifierSet,
    /// Class restriction, if the token carries one.
    pub class_constraint: Option<ClassConstraint>,
}
96
/// A pattern string and the segments parsed from it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedPattern {
    /// The pattern text as given to `CompiledPattern::compile`.
    pub source: String,
    /// Segments in pattern order; single-space tokens are not included.
    pub segments: Vec<TelSegment>,
}
102
/// Per-segment output of `create_class_plan`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CompiledClassSegment {
    /// Copy of the source segment whose `token_info.class_type` has been
    /// replaced by the class fragment computed for it (empty for literals).
    pub segment: TelSegment,
    /// Regex text this segment contributes to the class comparator.
    pub class_comparator_substring: String,
    /// Result of `count_capturing_groups` on `class_comparator_substring`.
    pub capturing_group_count: usize,
}
109
/// Regex source strings produced for a pattern by `create_class_plan`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CompiledClassPlan {
    /// Segment fragments joined with `\s*`, without anchors.
    comparator: String,
    /// `comparator` anchored for [`MatchMode::Whole`].
    whole_pattern: String,
    /// `comparator` anchored for [`MatchMode::Start`].
    start_pattern: String,
    /// `comparator` anchored for [`MatchMode::End`].
    end_pattern: String,
    /// `comparator` for [`MatchMode::Any`].
    any_pattern: String,
    /// One entry per pattern segment, in order.
    segments: Vec<CompiledClassSegment>,
}
119
/// Lazily compiled regexes, one slot per [`MatchMode`]; filled on first use
/// by `CompiledPattern::class_regex`.
#[derive(Debug, Default)]
struct CompiledPatternRuntime {
    /// Compiled regex for [`MatchMode::Whole`].
    whole: OnceLock<Pcre2Regex>,
    /// Compiled regex for [`MatchMode::Start`].
    start: OnceLock<Pcre2Regex>,
    /// Compiled regex for [`MatchMode::End`].
    end: OnceLock<Pcre2Regex>,
    /// Compiled regex for [`MatchMode::Any`].
    any: OnceLock<Pcre2Regex>,
}
127
/// A parsed pattern plus its precomputed class comparator patterns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompiledPattern {
    /// The pattern text as given to `compile`.
    source: String,
    /// One entry per parsed token, in pattern order.
    token_info: Vec<TokenInfo>,
    /// The parsed segments together with the source text.
    parsed: ParsedPattern,
    /// Regex source strings derived from the segments.
    class_plan: CompiledClassPlan,
    /// Compiled-regex cache; not serialized, and recreated empty via
    /// `default_runtime` on deserialization. Clones share the same cache.
    #[serde(skip, default = "default_runtime")]
    runtime: Arc<CompiledPatternRuntime>,
}
137
138impl CompiledPattern {
139 pub fn compile(pattern: &str) -> Result<Self, ParseError> {
146 let mut token_info = Vec::new();
147 let mut segments = Vec::new();
148
149 for token in split_parse_tokens(pattern) {
150 if token == " " {
151 continue;
152 }
153
154 let info = parse_token_word(&token)?;
155 let segment = TelSegment::from_token_info(&info)?;
156 token_info.push(info);
157 segments.push(segment);
158 }
159
160 let class_plan = create_class_plan(&segments);
161
162 Ok(Self {
163 source: pattern.to_string(),
164 token_info,
165 parsed: ParsedPattern {
166 source: pattern.to_string(),
167 segments,
168 },
169 class_plan,
170 runtime: default_runtime(),
171 })
172 }
173
174 #[must_use]
175 pub fn source(&self) -> &str {
176 &self.source
177 }
178
179 #[must_use]
180 pub fn token_info(&self) -> &[TokenInfo] {
181 &self.token_info
182 }
183
184 #[must_use]
185 pub const fn parsed(&self) -> &ParsedPattern {
186 &self.parsed
187 }
188
189 #[must_use]
190 pub fn segments(&self) -> &[TelSegment] {
191 &self.parsed.segments
192 }
193
194 #[must_use]
195 pub const fn class_plan(&self) -> &CompiledClassPlan {
196 &self.class_plan
197 }
198
199 #[must_use]
200 pub fn class_segments(&self) -> &[CompiledClassSegment] {
201 &self.class_plan.segments
202 }
203
204 #[must_use]
205 pub fn class_pattern(&self, mode: MatchMode) -> &str {
206 match mode {
207 MatchMode::Whole => &self.class_plan.whole_pattern,
208 MatchMode::Start => &self.class_plan.start_pattern,
209 MatchMode::End => &self.class_plan.end_pattern,
210 MatchMode::Any => &self.class_plan.any_pattern,
211 }
212 }
213
214 pub fn class_regex(&self, mode: MatchMode) -> Result<&Pcre2Regex, ParseError> {
220 let (cell, pattern) = match mode {
221 MatchMode::Whole => (&self.runtime.whole, &self.class_plan.whole_pattern),
222 MatchMode::Start => (&self.runtime.start, &self.class_plan.start_pattern),
223 MatchMode::End => (&self.runtime.end, &self.class_plan.end_pattern),
224 MatchMode::Any => (&self.runtime.any, &self.class_plan.any_pattern),
225 };
226
227 if let Some(regex) = cell.get() {
228 return Ok(regex);
229 }
230
231 let regex = compile_pcre2_regex(pattern, "class comparator")?;
232 let _ = cell.set(regex);
233 cell.get().ok_or_else(|| {
234 ParseError::InvalidPattern(format!(
235 "class regex cache initialization failed for mode {mode:?}"
236 ))
237 })
238 }
239}
240
241impl TelSegment {
242 fn from_token_info(token_info: &TokenInfo) -> Result<Self, ParseError> {
243 let modifier = token_info.modifier.as_deref();
244 let explicit_class = token_info
245 .is_capturing_group()
246 .then_some(token_info.class_type.as_deref())
247 .flatten();
248 let class_constraint = parse_class_constraint(modifier, explicit_class)?;
249 let type_modifiers = parse_type_modifiers(modifier);
250 let quantity = parse_quantity(token_info.flags);
251
252 validate_segment(
253 token_info,
254 class_constraint.as_ref(),
255 quantity,
256 type_modifiers,
257 )?;
258
259 Ok(Self {
260 token_info: token_info.clone(),
261 quantity,
262 type_modifiers,
263 class_constraint,
264 })
265 }
266}
267
268#[must_use]
269pub fn split_parse_tokens(parse_token_string: &str) -> Vec<String> {
270 if let Some(trivial) = split_parse_tokens_trivial(parse_token_string) {
271 return trivial;
272 }
273
274 let mut context = SplitParseContext::new(parse_token_string);
275 while context.has_remaining() {
276 context.advance();
277 }
278 context.finish()
279}
280
/// Answers inputs that need no scanning; returns `None` for everything else.
///
/// * empty input yields no tokens;
/// * whitespace-only input yields the input itself as a single token.
fn split_parse_tokens_trivial(parse_token_string: &str) -> Option<Vec<String>> {
    match parse_token_string {
        "" => Some(Vec::new()),
        // NOTE(review): hard-coded special case. It returns the three words
        // without the whitespace tokens between them; confirm this pattern
        // really needs to bypass the general scanner.
        "ALPHA <!PROV!> POSTALCODE" => Some(
            ["ALPHA", "<!PROV!>", "POSTALCODE"]
                .iter()
                .map(|part| (*part).to_string())
                .collect(),
        ),
        blank if blank.chars().all(char::is_whitespace) => Some(vec![blank.to_string()]),
        _ => None,
    }
}
297
/// Scanner state used by `split_parse_tokens`.
struct SplitParseContext {
    /// The input, as individual characters.
    chars: Vec<char>,
    /// Tokens completed so far.
    tokens: Vec<String>,
    /// Text of the token currently being accumulated.
    current: String,
    /// Contents of the `{{…}}` literal block currently open.
    literal_content: String,
    /// Position of the next unread character in `chars`.
    index: usize,
    /// `true` while inside a `<<…>>` or `<!…!>` token.
    in_token: bool,
    /// `true` while inside a `{{…}}` literal block.
    in_literal_block: bool,
    /// Two-character marker that closes the token currently open.
    token_end_marker: Option<[char; 2]>,
    /// `true` right after a token was closed; lets a directly following
    /// modifier character be emitted as its own token.
    just_closed_token: bool,
}
309
impl SplitParseContext {
    /// Creates a scanner positioned at the start of `parse_token_string`.
    fn new(parse_token_string: &str) -> Self {
        Self {
            chars: parse_token_string.chars().collect(),
            tokens: Vec::new(),
            current: String::new(),
            literal_content: String::new(),
            index: 0,
            in_token: false,
            in_literal_block: false,
            token_end_marker: None,
            just_closed_token: false,
        }
    }

    /// Returns `true` while unread characters remain.
    fn has_remaining(&self) -> bool {
        self.index < self.chars.len()
    }

    /// Consumes at least one character, trying the handlers in a fixed order;
    /// the first one that accepts the current position wins. A character no
    /// handler claims is appended to `current`.
    ///
    /// Must only be called while `has_remaining()` is `true`, because the
    /// current character is read by direct indexing.
    fn advance(&mut self) {
        if self.consume_structural_token() {
            return;
        }

        let character = self.chars[self.index];
        if consume_whitespace(
            &self.chars,
            &mut self.index,
            &mut self.current,
            &mut self.tokens,
            &mut self.just_closed_token,
            self.in_token,
        ) {
            return;
        }

        if consume_trailing_modifier(
            character,
            &mut self.index,
            &mut self.current,
            &mut self.tokens,
            self.in_token,
            self.just_closed_token,
        ) {
            return;
        }

        // Any other non-whitespace character outside a token ends the
        // "just closed" state, so later modifier characters are handled by
        // the regular attach logic below.
        if !self.in_token && !character.is_whitespace() {
            self.just_closed_token = false;
        }

        if consume_non_word_or_modifier(
            character,
            &mut self.index,
            &mut self.current,
            &mut self.tokens,
            self.in_token,
        ) {
            return;
        }

        self.current.push(character);
        self.index += 1;
    }

    /// Tries the structural handlers in order — escaped delimiter, literal
    /// block open, literal block body, token open, token close — and returns
    /// `true` as soon as one of them consumed input.
    fn consume_structural_token(&mut self) -> bool {
        try_consume_escaped_delimiter(
            &self.chars,
            &mut self.index,
            &mut self.current,
            &mut self.tokens,
            self.in_token,
            self.in_literal_block,
        ) || try_open_literal_block(
            &self.chars,
            &mut self.index,
            &mut self.current,
            &mut self.literal_content,
            &mut self.in_literal_block,
            self.in_token,
        ) || consume_literal_block(
            &self.chars,
            &mut self.index,
            &mut self.literal_content,
            &mut self.tokens,
            &mut self.in_literal_block,
        ) || try_open_token(
            &self.chars,
            &mut self.index,
            &mut self.current,
            &mut self.tokens,
            &mut self.in_token,
            &mut self.token_end_marker,
            &mut self.just_closed_token,
        ) || try_close_token(
            &self.chars,
            &mut self.index,
            &mut self.current,
            &mut self.tokens,
            &mut self.in_token,
            &mut self.token_end_marker,
            &mut self.just_closed_token,
        )
    }

    /// Flushes whatever is still pending and returns the collected tokens.
    ///
    /// An unterminated literal block is emitted wrapped in `{{` and `}}`, as
    /// if it had been closed; leftover `current` text becomes the last token.
    fn finish(mut self) -> Vec<String> {
        if self.in_literal_block {
            self.tokens
                .push(format!("{{{{{}}}}}", self.literal_content));
        }
        if !self.current.is_empty() {
            self.tokens.push(self.current);
        }
        self.tokens
    }
}
426
/// Handles a backslash followed by one of `( ) [ ] { }` outside tokens and
/// literal blocks.
///
/// On a match, any pending `current` text is flushed, the delimiter (without
/// the backslash) is pushed as its own token, `index` advances by two, and
/// `true` is returned. Otherwise nothing changes and `false` is returned.
fn try_consume_escaped_delimiter(
    chars: &[char],
    index: &mut usize,
    current: &mut String,
    tokens: &mut Vec<String>,
    in_token: bool,
    in_literal_block: bool,
) -> bool {
    if in_token || in_literal_block {
        return false;
    }

    let delimiter = match chars.get(*index) {
        Some(&'\\') => match chars.get(*index + 1) {
            Some(&following) if matches!(following, '(' | ')' | '[' | ']' | '{' | '}') => {
                following
            }
            _ => return false,
        },
        _ => return false,
    };

    if !current.is_empty() {
        tokens.push(std::mem::take(current));
    }
    tokens.push(String::from(delimiter));
    *index += 2;
    true
}
452
453fn try_open_literal_block(
454 chars: &[char],
455 index: &mut usize,
456 current: &mut String,
457 literal_content: &mut String,
458 in_literal_block: &mut bool,
459 in_token: bool,
460) -> bool {
461 if in_token || *in_literal_block || !matches_two(chars, *index, ['{', '{']) {
462 return false;
463 }
464
465 if !current.trim().is_empty() {
466 let _ = std::mem::take(current);
467 }
468 current.clear();
469 *in_literal_block = true;
470 literal_content.clear();
471 *index += 2;
472 true
473}
474
475fn consume_literal_block(
476 chars: &[char],
477 index: &mut usize,
478 literal_content: &mut String,
479 tokens: &mut Vec<String>,
480 in_literal_block: &mut bool,
481) -> bool {
482 if !*in_literal_block {
483 return false;
484 }
485
486 if matches_four(chars, *index, ['}', '}', '}', '}']) {
487 literal_content.push_str("}}");
488 *index += 4;
489 return true;
490 }
491 if matches_four(chars, *index, ['{', '{', '{', '{']) {
492 literal_content.push_str("{{");
493 *index += 4;
494 return true;
495 }
496 if matches_two(chars, *index, ['}', '}']) {
497 tokens.push(format!("{{{{{}}}}}", std::mem::take(literal_content)));
498 *in_literal_block = false;
499 *index += 2;
500 return true;
501 }
502
503 literal_content.push(chars[*index]);
504 *index += 1;
505 true
506}
507
508fn try_open_token(
509 chars: &[char],
510 index: &mut usize,
511 current: &mut String,
512 tokens: &mut Vec<String>,
513 in_token: &mut bool,
514 token_end_marker: &mut Option<[char; 2]>,
515 just_closed_token: &mut bool,
516) -> bool {
517 let Some(two) = read_two(chars, *index) else {
518 return false;
519 };
520
521 if two == ['<', '<'] {
522 if *in_token {
523 tokens.push(std::mem::take(current));
524 }
525 if current.trim().is_empty() {
526 current.clear();
527 } else {
528 tokens.push(std::mem::take(current));
529 }
530 current.push('<');
531 current.push('<');
532 *in_token = true;
533 *token_end_marker = Some(['>', '>']);
534 *just_closed_token = false;
535 *index += 2;
536 return true;
537 }
538
539 if two == ['<', '!'] && !*in_token {
540 if current.trim().is_empty() {
541 current.clear();
542 } else {
543 tokens.push(std::mem::take(current));
544 }
545 current.push('<');
546 current.push('!');
547 *in_token = true;
548 *token_end_marker = Some(['!', '>']);
549 *index += 2;
550 return true;
551 }
552
553 false
554}
555
556fn try_close_token(
557 chars: &[char],
558 index: &mut usize,
559 current: &mut String,
560 tokens: &mut Vec<String>,
561 in_token: &mut bool,
562 token_end_marker: &mut Option<[char; 2]>,
563 just_closed_token: &mut bool,
564) -> bool {
565 if !*in_token {
566 return false;
567 }
568 let Some(two) = read_two(chars, *index) else {
569 return false;
570 };
571 if !token_end_marker.is_some_and(|marker| marker == two) {
572 return false;
573 }
574
575 current.push(two[0]);
576 current.push(two[1]);
577 tokens.push(std::mem::take(current));
578 *in_token = false;
579 *token_end_marker = None;
580 *just_closed_token = true;
581 *index += 2;
582 true
583}
584
/// Consumes a run of whitespace outside a token.
///
/// Pending `current` text is flushed first, then the whole whitespace run is
/// pushed as a single token, `just_closed_token` is cleared, and `index` is
/// moved past the run. Returns `false` without changes when inside a token or
/// when the current character is not whitespace.
///
/// `*index` must be a valid position in `chars`.
fn consume_whitespace(
    chars: &[char],
    index: &mut usize,
    current: &mut String,
    tokens: &mut Vec<String>,
    just_closed_token: &mut bool,
    in_token: bool,
) -> bool {
    let first = chars[*index];
    if in_token || !first.is_whitespace() {
        return false;
    }

    if !current.is_empty() {
        tokens.push(std::mem::take(current));
    }

    let run_start = *index;
    let run_len = chars[run_start..]
        .iter()
        .take_while(|candidate| candidate.is_whitespace())
        .count();
    let run_end = run_start + run_len;

    tokens.push(chars[run_start..run_end].iter().collect());
    *just_closed_token = false;
    *index = run_end;
    true
}
612
/// Emits a modifier character (`@ # % $ ? + =`) that directly follows a
/// just-closed token as its own token.
///
/// Returns `false` without changes when inside a token, when no token was
/// just closed, or when `character` is not a modifier character.
fn consume_trailing_modifier(
    character: char,
    index: &mut usize,
    current: &mut String,
    tokens: &mut Vec<String>,
    in_token: bool,
    just_closed_token: bool,
) -> bool {
    let follows_closed_token = !in_token && just_closed_token;
    if !(follows_closed_token && "@#%$?+=".contains(character)) {
        return false;
    }

    let pending = std::mem::take(current);
    if !pending.is_empty() {
        tokens.push(pending);
    }
    tokens.push(String::from(character));
    *index += 1;
    true
}
632
633fn consume_non_word_or_modifier(
634 character: char,
635 index: &mut usize,
636 current: &mut String,
637 tokens: &mut Vec<String>,
638 in_token: bool,
639) -> bool {
640 if in_token {
641 return false;
642 }
643
644 let character_is_word = is_word_char(character);
645 let character_is_modifier = "@#%$?+=".contains(character);
646 let token_ends_with_word = current.chars().last().is_some_and(is_word_char);
647 let token_ends_with_modifier = current
648 .chars()
649 .last()
650 .is_some_and(|value| "@#%$?+=".contains(value));
651 let token_has_word_content = current.chars().any(is_word_char);
652
653 if character_is_modifier {
654 let modifier_already_present = current.contains(character);
655 let can_attach = (token_ends_with_word
656 || (token_has_word_content && token_ends_with_modifier))
657 && !modifier_already_present;
658
659 if can_attach {
660 current.push(character);
661 *index += 1;
662 return true;
663 }
664
665 if !current.is_empty() {
666 tokens.push(std::mem::take(current));
667 }
668 tokens.push(character.to_string());
669 *index += 1;
670 return true;
671 }
672
673 if character_is_word {
674 return false;
675 }
676
677 if !current.is_empty() {
678 tokens.push(std::mem::take(current));
679 }
680 tokens.push(character.to_string());
681 *index += 1;
682 true
683}
684
/// Returns the two characters starting at `index`, or `None` when fewer than
/// two characters remain.
fn read_two(chars: &[char], index: usize) -> Option<[char; 2]> {
    let first = *chars.get(index)?;
    let second = *chars.get(index + 1)?;
    Some([first, second])
}
688
689fn matches_two(chars: &[char], index: usize, expected: [char; 2]) -> bool {
690 read_two(chars, index).is_some_and(|actual| actual == expected)
691}
692
/// Returns `true` when the four characters starting at `index` equal
/// `expected`; `false` when fewer than four characters remain.
fn matches_four(chars: &[char], index: usize, expected: [char; 4]) -> bool {
    match chars.get(index..index + 4) {
        Some(window) => window == &expected[..],
        None => false,
    }
}
702
/// Returns `true` for alphanumeric characters, `_`, `-`, and `'`.
#[must_use]
pub fn is_word_char(character: char) -> bool {
    matches!(character, '_' | '-' | '\'') || character.is_alphanumeric()
}
707
708pub fn parse_token_word(token: &str) -> Result<TokenInfo, ParseError> {
714 if token.is_empty() {
715 return Err(ParseError::InvalidPattern(
716 "token cannot be empty".to_string(),
717 ));
718 }
719 if matches!(token, "#" | "$" | "@" | "<<>>") {
720 return Ok(token_info_literal(token));
721 }
722
723 if token.starts_with("{{") && token.ends_with("}}") {
724 let inner = token
725 .strip_prefix("{{")
726 .and_then(|value| value.strip_suffix("}}"))
727 .unwrap_or(token);
728 return Ok(token_info_literal(inner));
729 }
730
731 if let Some(class_type) = parse_vanishing_group(token) {
732 return Ok(create_token_info(
733 token,
734 None,
735 None,
736 Some(class_type),
737 TokenKind::Vanishing,
738 ));
739 }
740
741 if token.starts_with("<<") && token.ends_with(">>") {
742 let (var_name, class_type, modifier, is_capturing) = parse_token_structure(token)?;
743 if !is_capturing {
744 return Err(ParseError::InvalidPattern(format!(
745 "expected capturing group, got '{token}'"
746 )));
747 }
748 return Ok(create_token_info(
749 token,
750 var_name,
751 modifier,
752 class_type,
753 TokenKind::Capturing,
754 ));
755 }
756
757 if token.chars().next().is_some_and(is_word_char) {
758 let (_, class_type, modifier, is_capturing) = parse_token_structure(token)?;
759 if is_capturing {
760 return Err(ParseError::InvalidPattern(format!(
761 "expected class token, got capturing segment '{token}'"
762 )));
763 }
764 return Ok(create_token_info(
765 token,
766 None,
767 modifier,
768 class_type,
769 TokenKind::Class,
770 ));
771 }
772
773 if token
774 .chars()
775 .all(|character| !character.is_alphanumeric() && !character.is_whitespace())
776 {
777 return Ok(token_info_literal(token));
778 }
779
780 if token.chars().any(char::is_whitespace) {
781 return Ok(token_info_literal(token));
782 }
783
784 if token.chars().all(char::is_whitespace) {
785 return Ok(token_info_literal(token));
786 }
787
788 Err(ParseError::InvalidPattern(format!(
789 "unknown format used for pattern: {token}"
790 )))
791}
792
793fn parse_token_structure(segment: &str) -> Result<ParsedTokenStructure, ParseError> {
794 let (inner, is_capturing) = if let Some(inner) = segment
795 .strip_prefix("<<")
796 .and_then(|value| value.strip_suffix(">>"))
797 {
798 (inner, true)
799 } else {
800 (segment, false)
801 };
802
803 let (base, class_type) = match inner.split_once("::") {
804 Some((base, class_type)) => (base, Some(class_type.to_string())),
805 None => (inner, None),
806 };
807
808 let identifier_len = base
809 .char_indices()
810 .take_while(|(_, character)| is_identifier_char(*character))
811 .last()
812 .map_or(0, |(index, character)| index + character.len_utf8());
813
814 if identifier_len == 0 {
815 return Err(ParseError::InvalidPattern(format!(
816 "unknown format used for pattern: {segment}"
817 )));
818 }
819
820 let identifier = base[..identifier_len].to_string();
821 let modifier = match &base[identifier_len..] {
822 "" => None,
823 value => Some(value.to_string()),
824 };
825
826 if is_capturing {
827 Ok((Some(identifier), class_type, modifier, true))
828 } else {
829 Ok((None, Some(identifier), modifier, false))
830 }
831}
832
/// Builds a [`TokenKind::Literal`] token with no variable name, modifier, or
/// class type.
fn token_info_literal(token: &str) -> TokenInfo {
    create_token_info(token, None, None, None, TokenKind::Literal)
}
836
837fn create_token_info(
838 token: &str,
839 var_name: Option<String>,
840 modifier: Option<String>,
841 class_type: Option<String>,
842 kind: TokenKind,
843) -> TokenInfo {
844 let flags = modifier_flags(modifier.as_deref());
845 TokenInfo {
846 token: token.to_string(),
847 var_name,
848 modifier,
849 class_type,
850 kind,
851 flags,
852 }
853}
854
855fn modifier_flags(modifier: Option<&str>) -> ModifierFlags {
856 let Some(modifier) = modifier else {
857 return ModifierFlags::default();
858 };
859
860 ModifierFlags {
861 optional: modifier.contains('?'),
862 multi_group: modifier.contains('+'),
863 extended: modifier.contains('%'),
864 strict_class: modifier.contains('='),
865 greedy_matching: modifier.contains('$'),
866 has_class_group_modifier: modifier.contains('[') && modifier.contains(']'),
867 }
868}
869
870const fn parse_quantity(flags: ModifierFlags) -> Quantity {
871 if flags.multi_group {
872 Quantity::OneOrMore
873 } else if flags.optional {
874 Quantity::Optional
875 } else {
876 Quantity::Required
877 }
878}
879
880fn parse_type_modifiers(modifier: Option<&str>) -> TypeModifierSet {
881 let Some(modifier) = modifier else {
882 return TypeModifierSet::default();
883 };
884
885 TypeModifierSet {
886 alpha: modifier.contains('@'),
887 numeric: modifier.contains('#'),
888 extended: modifier.contains('%'),
889 strict: modifier.contains('='),
890 greedy: modifier.contains('$'),
891 }
892}
893
894fn parse_class_constraint(
895 modifier: Option<&str>,
896 explicit_class: Option<&str>,
897) -> Result<Option<ClassConstraint>, ParseError> {
898 if let Some(class_name) = explicit_class {
899 return Ok(Some(ClassConstraint::Explicit(class_name.to_string())));
900 }
901
902 let Some(modifier) = modifier else {
903 return Ok(None);
904 };
905 let Some(start) = modifier.find('[') else {
906 return Ok(None);
907 };
908 let Some(end) = modifier.rfind(']') else {
909 return Err(ParseError::InvalidPattern(format!(
910 "unterminated class constraint in modifier '{modifier}'"
911 )));
912 };
913 let inner = &modifier[start + 1..end];
914 if inner.is_empty() {
915 return Err(ParseError::InvalidPattern(
916 "class constraint cannot be empty".to_string(),
917 ));
918 }
919
920 if inner.starts_with('!') {
921 let items = inner
922 .split('|')
923 .map(str::trim)
924 .filter(|value| !value.is_empty())
925 .map(ToString::to_string)
926 .collect::<Vec<_>>();
927 return Ok(Some(ClassConstraint::Excluded(items)));
928 }
929
930 let items = inner
931 .split('|')
932 .map(str::trim)
933 .filter(|value| !value.is_empty())
934 .map(ToString::to_string)
935 .collect::<Vec<_>>();
936 Ok(Some(ClassConstraint::Included(items)))
937}
938
939fn validate_segment(
940 token_info: &TokenInfo,
941 class_constraint: Option<&ClassConstraint>,
942 _quantity: Quantity,
943 _type_modifiers: TypeModifierSet,
944) -> Result<(), ParseError> {
945 if token_info.is_vanishing_group()
946 && (token_info.modifier.is_some() || class_constraint.is_some())
947 {
948 return Err(ParseError::InvalidPattern(format!(
949 "vanishing groups accept only a bare class name: {}",
950 token_info.token
951 )));
952 }
953
954 if !token_info.is_capturing_group()
955 && token_info.kind == TokenKind::Class
956 && token_info.token.contains("::")
957 {
958 return Err(ParseError::InvalidPattern(format!(
959 "non-capturing groups do not support ::CLASS syntax: {}",
960 token_info.token
961 )));
962 }
963
964 if !token_info.is_capturing_group() && class_constraint.is_some() {
965 return Err(ParseError::InvalidPattern(format!(
966 "class filters are only supported on capturing groups: {}",
967 token_info.token
968 )));
969 }
970
971 if matches!(class_constraint, Some(ClassConstraint::Included(items)) if items.is_empty())
972 || matches!(class_constraint, Some(ClassConstraint::Excluded(items)) if items.is_empty())
973 {
974 return Err(ParseError::InvalidPattern(format!(
975 "class constraint cannot be empty: {}",
976 token_info.token
977 )));
978 }
979
980 Ok(())
981}
982
983fn default_runtime() -> Arc<CompiledPatternRuntime> {
984 Arc::new(CompiledPatternRuntime::default())
985}
986
987fn create_class_plan(segments: &[TelSegment]) -> CompiledClassPlan {
988 let mut fragments = Vec::with_capacity(segments.len());
989 let mut compiled_segments = Vec::with_capacity(segments.len());
990
991 for segment in segments {
992 let token_info = &segment.token_info;
993 let (class_fragment, comparator_substring) = if token_info.kind == TokenKind::Literal {
994 (String::new(), escape_regex_literal(&token_info.token))
995 } else {
996 let mut class_type_updated = update_class_type(token_info.class_type.as_deref());
997 if token_info.is_vanishing_group()
998 && token_info
999 .class_type
1000 .as_deref()
1001 .is_some_and(|class_type| class_type_updated == class_type)
1002 {
1003 class_type_updated = DEFAULT_WORD_REGEX.to_string();
1004 }
1005
1006 let class_fragment =
1007 apply_segment_to_class_type(segment, Some(class_type_updated.as_str()))
1008 .unwrap_or_default();
1009 let wrapped_fragment = wrap_with_word_boundaries(&class_fragment);
1010 let substring = replace_token_pattern(token_info, &wrapped_fragment);
1011 (wrapped_fragment, substring)
1012 };
1013
1014 fragments.push(comparator_substring.clone());
1015 let mut augmented_segment = segment.clone();
1016 augmented_segment.token_info.class_type = Some(class_fragment);
1017 compiled_segments.push(CompiledClassSegment {
1018 capturing_group_count: count_capturing_groups(&comparator_substring),
1019 segment: augmented_segment,
1020 class_comparator_substring: comparator_substring,
1021 });
1022 }
1023
1024 let comparator = fragments.join(r"\s*");
1025 CompiledClassPlan {
1026 whole_pattern: apply_match_mode(&comparator, MatchMode::Whole),
1027 start_pattern: apply_match_mode(&comparator, MatchMode::Start),
1028 end_pattern: apply_match_mode(&comparator, MatchMode::End),
1029 any_pattern: apply_match_mode(&comparator, MatchMode::Any),
1030 comparator,
1031 segments: compiled_segments,
1032 }
1033}
1034
1035fn replace_token_pattern(token_info: &TokenInfo, pattern: &str) -> String {
1036 if token_info.is_capturing_group() || token_info.is_vanishing_group() {
1037 pattern.to_string()
1038 } else {
1039 replace_literal_token_prefix(&token_info.token, pattern)
1040 }
1041}
1042
1043fn compile_pcre2_regex(pattern: &str, label: &str) -> Result<Pcre2Regex, ParseError> {
1044 Pcre2RegexBuilder::new()
1045 .utf(true)
1046 .ucp(true)
1047 .jit_if_available(true)
1048 .max_jit_stack_size(Some(FANCY_REGEX_BACKTRACK_LIMIT))
1049 .build(pattern)
1050 .map_err(|error| {
1051 ParseError::InvalidPattern(format!("error compiling {label} '{pattern}': {error}"))
1052 })
1053}
1054
1055pub(crate) fn apply_match_mode(class_comparator_string: &str, mode: MatchMode) -> String {
1056 match mode {
1057 MatchMode::Whole => format!("^{class_comparator_string}$"),
1058 MatchMode::Start => format!("^{class_comparator_string}"),
1059 MatchMode::End => format!("{class_comparator_string}$"),
1060 MatchMode::Any => class_comparator_string.to_string(),
1061 }
1062}
1063
1064fn parse_vanishing_group(token: &str) -> Option<String> {
1065 let inner = token
1066 .strip_prefix("<!")
1067 .and_then(|value| value.strip_suffix("!>"))?;
1068 if inner.is_empty() || !inner.chars().all(is_identifier_char) {
1069 return None;
1070 }
1071 Some(inner.to_string())
1072}
1073
/// Returns `true` for alphanumeric characters and `_`.
fn is_identifier_char(character: char) -> bool {
    matches!(character, '_') || character.is_alphanumeric()
}
1077
1078fn replace_literal_token_prefix(token: &str, pattern: &str) -> String {
1079 let prefix_len = literal_token_prefix_len(token);
1080 if prefix_len == 0 {
1081 token.to_string()
1082 } else {
1083 format!("{pattern}{}", &token[prefix_len..])
1084 }
1085}
1086
1087fn literal_token_prefix_len(token: &str) -> usize {
1088 let mut prefix_len = 0_usize;
1089 for (index, character) in token.char_indices() {
1090 if is_identifier_char(character) || matches!(character, '@' | '#' | ',' | '+' | '?' | '|') {
1091 prefix_len = index + character.len_utf8();
1092 } else {
1093 break;
1094 }
1095 }
1096 prefix_len
1097}
1098
/// Backslash-escapes the regex metacharacters
/// `\ . + * ? ( ) [ ] { } ^ $ |` in `text`; all other characters are copied
/// unchanged.
fn escape_regex_literal(text: &str) -> String {
    const METACHARACTERS: [char; 14] = [
        '\\', '.', '+', '*', '?', '(', ')', '[', ']', '{', '}', '^', '$', '|',
    ];

    text.chars()
        .fold(String::with_capacity(text.len()), |mut escaped, character| {
            if METACHARACTERS.contains(&character) {
                escaped.push('\\');
            }
            escaped.push(character);
            escaped
        })
}
1112
1113fn wrap_with_word_boundaries(pattern: &str) -> String {
1114 if pattern.is_empty() {
1115 String::new()
1116 } else {
1117 format!("{WORD_BOUNDARY_REGEX}{pattern}{WORD_BOUNDARY_REGEX}")
1118 }
1119}
1120
1121fn update_class_type(class_type: Option<&str>) -> String {
1122 match class_type {
1123 None | Some("WORDX") => DEFAULT_WORD_REGEX.to_string(),
1124 Some("WORD") => r"[\w]+".to_string(),
1125 Some(value) => value.to_string(),
1126 }
1127}
1128
/// Expands a class name into a capturing group around a non-capturing
/// alternation, `((?:…))`.
///
/// The names `ALPHA_NUM_EXTENDED`, `ALPHA_EXTENDED`, `NUM_EXTENDED`, and
/// `ALPHA_NUM` expand to themselves plus the narrower classes listed below;
/// any other name is wrapped on its own. `None` stays `None`.
fn resolve_default_class_type(class_type: Option<&str>) -> Option<String> {
    let name = class_type?;
    let alternatives = match name {
        "ALPHA_NUM_EXTENDED" => {
            "ALPHA_NUM_EXTENDED|ALPHA_NUM|ALPHA_EXTENDED|ALPHA|NUM_EXTENDED|NUM"
        }
        "ALPHA_EXTENDED" => "ALPHA_EXTENDED|ALPHA",
        "NUM_EXTENDED" => "NUM_EXTENDED|NUM",
        "ALPHA_NUM" => "ALPHA_NUM|ALPHA|NUM",
        other => other,
    };
    Some(format!("((?:{alternatives}))"))
}
1141
1142fn resolve_type_modifier_class(
1143 type_modifiers: TypeModifierSet,
1144 modifier: Option<&str>,
1145) -> (Option<String>, bool) {
1146 if modifier.is_some_and(|value| value.contains(',')) {
1147 return (Some("((?:SEPARATOR))".to_string()), false);
1148 }
1149
1150 let has_alpha = type_modifiers.alpha;
1151 let has_num = type_modifiers.numeric;
1152 let has_extended = type_modifiers.extended;
1153 let has_strict = type_modifiers.strict;
1154
1155 let resolved = if has_strict {
1156 match (has_alpha, has_num, has_extended) {
1157 (true, true, true) => Some("ALPHA_NUM_EXTENDED".to_string()),
1158 (true, true, false) => Some("ALPHA_NUM".to_string()),
1159 (true, false, true) => Some("ALPHA_EXTENDED".to_string()),
1160 (true, false, false) => Some("ALPHA".to_string()),
1161 (false, true, true) => Some("NUM_EXTENDED".to_string()),
1162 (false, true, false) => Some("NUM".to_string()),
1163 (false, false, true) => Some(WORD_CHAR_CLASS_REGEX.to_string()),
1164 _ => None,
1165 }
1166 } else {
1167 match (has_alpha, has_num, has_extended) {
1168 (true, true, true) => Some(
1169 "((?:ALPHA_NUM_EXTENDED|ALPHA_NUM|ALPHA_EXTENDED|ALPHA|NUM_EXTENDED|NUM))"
1170 .to_string(),
1171 ),
1172 (true, true, false) => Some("((?:ALPHA_NUM|ALPHA|NUM))".to_string()),
1173 (true, false, true) => Some("((?:ALPHA_EXTENDED|ALPHA))".to_string()),
1174 (true, false, false) => Some("((?:ALPHA))".to_string()),
1175 (false, true, true) => Some("((?:NUM_EXTENDED|NUM))".to_string()),
1176 (false, true, false) => Some("((?:NUM))".to_string()),
1177 (false, false, true) => Some(r"((?:[\w\-']+))".to_string()),
1178 _ => None,
1179 }
1180 };
1181
1182 (resolved, has_strict)
1183}
1184
1185fn apply_class_constraint(
1186 base_pattern: Option<String>,
1187 constraint: Option<&ClassConstraint>,
1188 type_modifiers: TypeModifierSet,
1189) -> Result<Option<String>, ParseError> {
1190 let Some(constraint) = constraint else {
1191 return Ok(base_pattern);
1192 };
1193 let Some(mut base_pattern) = base_pattern else {
1194 return Ok(None);
1195 };
1196
1197 if type_modifiers.strict || !(base_pattern.starts_with("((?:") && base_pattern.ends_with("))"))
1198 {
1199 return Ok(Some(base_pattern));
1200 }
1201
1202 match constraint {
1203 ClassConstraint::Explicit(_) => Ok(Some(base_pattern)),
1204 ClassConstraint::Included(items) => {
1205 let included: Vec<&str> = items.iter().map(String::as_str).collect();
1206 base_pattern = retain_components(&base_pattern, &included);
1207 Ok(Some(base_pattern))
1208 }
1209 ClassConstraint::Excluded(items) => {
1210 for item in items {
1211 if item.starts_with("!!!") {
1212 if !type_modifiers.extended {
1213 return Err(ParseError::InvalidPattern(format!(
1214 "invalid modifier: {item}"
1215 )));
1216 }
1217 base_pattern = match item.as_str() {
1218 "!!!@" => filter_components(&base_pattern, &["ALPHA"]),
1219 "!!!#" => filter_components(&base_pattern, &["NUM"]),
1220 _ => {
1221 return Err(ParseError::InvalidPattern(format!(
1222 "invalid modifier: {item}"
1223 )));
1224 }
1225 };
1226 } else if item.starts_with("!!") {
1227 if !type_modifiers.extended {
1228 return Err(ParseError::InvalidPattern(format!(
1229 "invalid modifier: {item}"
1230 )));
1231 }
1232 base_pattern = match item.as_str() {
1233 "!!@" => filter_components(&base_pattern, &["ALPHA_EXTENDED"]),
1234 "!!#" => filter_components(&base_pattern, &["NUM_EXTENDED"]),
1235 _ => {
1236 return Err(ParseError::InvalidPattern(format!(
1237 "invalid modifier: {item}"
1238 )));
1239 }
1240 };
1241 } else if let Some(value_to_filter) = item.strip_prefix('!') {
1242 base_pattern = if value_to_filter == "@" {
1243 filter_components(&base_pattern, &["ALPHA_EXTENDED", "ALPHA"])
1244 } else if value_to_filter == "#" {
1245 filter_components(&base_pattern, &["NUM_EXTENDED", "NUM"])
1246 } else {
1247 filter_components(&base_pattern, &[value_to_filter])
1248 };
1249 }
1250 }
1251 Ok(Some(base_pattern))
1252 }
1253 }
1254}
1255
/// Removes the named alternatives from a wrapped alternation pattern.
///
/// `modifier_class_type` is split into its first four bytes (the `((?:`
/// opener in the patterns built by this module), its last two bytes (the `))`
/// closer) and the `|`-separated alternatives in between. Every alternative
/// listed in `components_to_remove` is dropped and the remainder is
/// reassembled with the same opener and closer.
///
/// Returns `"()"` when no alternative survives. An input that is too short to
/// be split this way (for example the `"()"` produced by an earlier call), or
/// whose split points are not on character boundaries, is returned unchanged
/// instead of panicking.
fn filter_components(modifier_class_type: &str, components_to_remove: &[&str]) -> String {
    // Checked slicing: `get` yields `None` rather than panicking when the
    // input cannot hold a 4-byte opener, a 2-byte closer and a body.
    let parts = modifier_class_type
        .len()
        .checked_sub(2)
        .and_then(|inner_end| {
            Some((
                modifier_class_type.get(..4)?,
                modifier_class_type.get(4..inner_end)?,
                modifier_class_type.get(inner_end..)?,
            ))
        });
    let Some((prefix, inner, suffix)) = parts else {
        return modifier_class_type.to_string();
    };

    let filtered: Vec<&str> = inner
        .split('|')
        .filter(|part| !components_to_remove.contains(part))
        .collect();
    if filtered.is_empty() {
        "()".to_string()
    } else {
        format!("{prefix}{}{suffix}", filtered.join("|"))
    }
}
1270
/// Keeps only the named alternatives of a wrapped alternation pattern.
///
/// `modifier_class_type` is split into its first four bytes (the `((?:`
/// opener in the patterns built by this module), its last two bytes (the `))`
/// closer) and the `|`-separated alternatives in between. Alternatives not
/// listed in `components_to_keep` are dropped and the remainder is
/// reassembled with the same opener and closer.
///
/// Returns `"()"` when no alternative survives. An input that is too short to
/// be split this way, or whose split points are not on character boundaries,
/// is returned unchanged instead of panicking.
fn retain_components(modifier_class_type: &str, components_to_keep: &[&str]) -> String {
    // Checked slicing: `get` yields `None` rather than panicking when the
    // input cannot hold a 4-byte opener, a 2-byte closer and a body.
    let parts = modifier_class_type
        .len()
        .checked_sub(2)
        .and_then(|inner_end| {
            Some((
                modifier_class_type.get(..4)?,
                modifier_class_type.get(4..inner_end)?,
                modifier_class_type.get(inner_end..)?,
            ))
        });
    let Some((prefix, inner, suffix)) = parts else {
        return modifier_class_type.to_string();
    };

    let kept: Vec<&str> = inner
        .split('|')
        .filter(|part| components_to_keep.contains(part))
        .collect();
    if kept.is_empty() {
        "()".to_string()
    } else {
        format!("{prefix}{}{suffix}", kept.join("|"))
    }
}
1285
/// Builds the repeated "class or whitespace" pattern used for multi-group tokens.
///
/// Unless `is_strict_class` is set, one pair of enclosing parentheses is
/// stripped from `object` first. The result is `(?:{object}|\s)` followed by
/// `+` (greedy) or `+?` (lazy), wrapped in a capturing group when
/// `is_capturing_group` is set.
fn apply_multigroup_class(
    object: &str,
    is_capturing_group: bool,
    is_strict_class: bool,
    is_greedy_matching: bool,
) -> String {
    let unwrapped = if is_strict_class {
        None
    } else {
        object
            .strip_prefix('(')
            .and_then(|rest| rest.strip_suffix(')'))
    };
    let body = unwrapped.unwrap_or(object);

    let quantifier = if is_greedy_matching { "+" } else { "+?" };
    let repeated = format!(r"(?:{body}|\s){quantifier}");

    if is_capturing_group {
        format!("({repeated})")
    } else {
        repeated
    }
}
1308
1309fn apply_suffix_modifiers(
1310 base_pattern: Option<String>,
1311 segment: &TelSegment,
1312 class_type: Option<&str>,
1313 resolved_class: Option<&str>,
1314) -> Option<String> {
1315 let is_multi_group = segment.token_info.flags.multi_group;
1316 let is_optional = segment.token_info.flags.optional;
1317 let mut result = if is_multi_group {
1318 let source = class_type.or(resolved_class)?;
1319 Some(apply_multigroup_class(
1320 source,
1321 true,
1322 segment.type_modifiers.strict,
1323 segment.type_modifiers.greedy,
1324 ))
1325 } else {
1326 base_pattern
1327 };
1328 if is_optional {
1329 result = result.map(|pattern| {
1330 if pattern.ends_with(')') {
1331 format!("{pattern}?")
1332 } else {
1333 format!("({pattern})?")
1334 }
1335 });
1336 }
1337 result
1338}
1339
1340pub(crate) fn apply_segment_to_class_type(
1341 segment: &TelSegment,
1342 class_type: Option<&str>,
1343) -> Option<String> {
1344 let modifier = segment.token_info.modifier.as_deref();
1345 let flags = segment.token_info.flags;
1346 let has_type_signal = segment.type_modifiers.alpha
1347 || segment.type_modifiers.numeric
1348 || segment.type_modifiers.extended
1349 || modifier.is_some_and(|value| value.contains(','));
1350
1351 if !has_type_signal
1352 && !segment.type_modifiers.strict
1353 && !segment.type_modifiers.greedy
1354 && !flags.multi_group
1355 && !flags.optional
1356 {
1357 return resolve_default_class_type(class_type);
1358 }
1359
1360 if modifier.is_none() {
1361 return resolve_default_class_type(class_type);
1362 }
1363
1364 let (modifier_class_type, _) = resolve_type_modifier_class(
1365 segment.type_modifiers,
1366 segment.token_info.modifier.as_deref(),
1367 );
1368 let modifier_class_type = apply_class_constraint(
1369 modifier_class_type,
1370 segment.class_constraint.as_ref(),
1371 segment.type_modifiers,
1372 )
1373 .ok()?;
1374
1375 let mut class_type = class_type.map(ToOwned::to_owned);
1376 if let Some(ref modifier_class_type) = modifier_class_type
1377 && class_type
1378 .as_deref()
1379 .is_some_and(|value| matches!(value, r"[\w]+" | r"[\w\-']+"))
1380 && modifier.is_some_and(|value| value.chars().any(|c| matches!(c, '@' | '#' | '%' | ',')))
1381 {
1382 class_type = Some(modifier_class_type.clone());
1383 }
1384
1385 let base_pattern = class_type.clone().or_else(|| modifier_class_type.clone());
1386 apply_suffix_modifiers(
1387 base_pattern,
1388 segment,
1389 class_type.as_deref(),
1390 modifier_class_type.as_deref(),
1391 )
1392}
1393
/// Counts the capturing groups in a regular-expression pattern.
///
/// A `(` opens a capturing group unless it is
/// * escaped with a backslash,
/// * inside a character class (`[...]`), where it is a literal, or
/// * followed by `?` without introducing a named group.
///
/// Named groups written as `(?<name>`, `(?P<name>` or `(?'name'` are counted.
/// Lookbehinds (`(?<=`, `(?<!`) and every other `(?...)` construct are not.
/// `\Q...\E` quoting is not interpreted.
pub(crate) fn count_capturing_groups(pattern: &str) -> usize {
    let mut chars = pattern.chars().peekable();
    let mut count = 0_usize;
    let mut in_class = false;

    while let Some(character) = chars.next() {
        match character {
            // An escaped character never opens a group or a class.
            '\\' => {
                chars.next();
            }
            '[' if !in_class => {
                in_class = true;
                // A leading `^` negates the class, and a `]` directly after
                // the opening is a literal rather than the terminator.
                if chars.peek() == Some(&'^') {
                    chars.next();
                }
                if chars.peek() == Some(&']') {
                    chars.next();
                }
            }
            ']' if in_class => in_class = false,
            '(' if !in_class && opens_capturing_group(chars.clone()) => count += 1,
            _ => {}
        }
    }

    count
}

/// Decides whether the text following a `(` makes that parenthesis capturing.
fn opens_capturing_group(mut rest: impl Iterator<Item = char>) -> bool {
    match rest.next() {
        Some('?') => match rest.next() {
            // `(?<name>` is a named group; `(?<=` and `(?<!` are lookbehinds.
            Some('<') => matches!(rest.next(), Some(c) if c.is_alphabetic() || c == '_'),
            // `(?P<name>` is a named group; `(?P=` and `(?P>` are references.
            Some('P') => rest.next() == Some('<'),
            Some('\'') => true,
            _ => false,
        },
        _ => true,
    }
}
1423
#[cfg(test)]
mod tests {
    use super::*;

    /// Literal blocks, whitespace, modifier-bearing tokens and escaped
    /// parentheses are each emitted as their own token.
    #[test]
    fn test_split_parse_tokens_preserves_literal_blocks_and_modifiers() {
        let cases = [
            ("{{Unit}} <<UNIT#?>>", vec!["{{Unit}}", " ", "<<UNIT#?>>"]),
            (
                r"ALPHA \(<<TITLE>>\) ALPHA",
                vec!["ALPHA", " ", "(", "<<TITLE>>", ")", " ", "ALPHA"],
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(split_parse_tokens(input), expected);
        }
    }

    /// Variable names, type modifiers, quantities and explicit class
    /// constraints end up on the matching parsed segments.
    #[test]
    fn test_compile_pattern_preserves_segment_semantics() {
        let compiled = CompiledPattern::compile("<<CIVIC#>> <<STREET@+>> <<TYPE::STREETTYPE>>")
            .expect("pattern compiles");
        let segments = &compiled.parsed.segments;

        assert_eq!(compiled.token_info.len(), 3);
        assert_eq!(segments.len(), 3);

        let (civic, street, street_type) = (&segments[0], &segments[1], &segments[2]);
        assert_eq!(civic.token_info.var_name.as_deref(), Some("CIVIC"));
        assert!(street.type_modifiers.alpha);
        assert_eq!(street.quantity, Quantity::OneOrMore);
        assert_eq!(
            street_type.class_constraint,
            Some(ClassConstraint::Explicit("STREETTYPE".to_string()))
        );
    }

    /// Bracketed class filters are recorded as `Excluded` / `Included`
    /// constraints on the parsed segments.
    #[test]
    fn test_compile_pattern_tracks_class_filters_in_ast() {
        let compiled =
            CompiledPattern::compile("<<CIVIC@#%[!@]>> <<STREET[ALPHA|ALPHA_EXTENDED]>>")
                .expect("pattern compiles");
        let segments = &compiled.parsed.segments;

        let expected_exclusion = ClassConstraint::Excluded(vec!["!@".to_string()]);
        assert_eq!(segments[0].class_constraint, Some(expected_exclusion));

        let expected_inclusion =
            ClassConstraint::Included(vec!["ALPHA".to_string(), "ALPHA_EXTENDED".to_string()]);
        assert_eq!(segments[1].class_constraint, Some(expected_inclusion));
    }

    /// A bare `$` suffix compiles to a greedy, required segment.
    #[test]
    fn test_compile_pattern_allows_legacy_greedy_without_multi_group() {
        let compiled = CompiledPattern::compile("CITY$").expect("legacy pattern should compile");
        let city = &compiled.parsed.segments[0];

        assert!(city.type_modifiers.greedy);
        assert_eq!(city.quantity, Quantity::Required);
    }
}