tokmat/
tel.rs

1//! Typed Token Extraction Language (TEL) parsing and validation.
2
3use crate::error::ParseError;
4use pcre2::bytes::{Regex as Pcre2Regex, RegexBuilder as Pcre2RegexBuilder};
5use serde::{Deserialize, Serialize};
6use std::sync::{Arc, OnceLock};
7
/// Regex for a run of one or more word characters (`\w`, `-` or `'`).
///
/// `create_class_plan` substitutes it for a vanishing group whose class name comes back
/// unchanged from `update_class_type`.
const DEFAULT_WORD_REGEX: &str = r"[\w\-']+";
/// Character class matching a single word character (`\w`, `-` or `'`).
// NOTE(review): not referenced in the part of the file visible here — confirm it is still used.
const WORD_CHAR_CLASS_REGEX: &str = r"[\w\-']";
/// Zero-width assertion that holds either where a word character is not preceded by one, or
/// where a word character is not followed by one (word characters being `\w`, `-` and `'`).
const WORD_BOUNDARY_REGEX: &str = r"(?:(?=[\w\-'])(?<![\w\-'])|(?<=[\w\-'])(?![\w\-']))";
/// Value passed to `max_jit_stack_size` when `compile_pcre2_regex` builds a PCRE2 regex.
// NOTE(review): the name speaks of a backtrack limit, yet the visible use is a JIT stack
// size — confirm the intent.
const FANCY_REGEX_BACKTRACK_LIMIT: usize = 5_000_000;

/// `(var_name, class_type, modifier, is_capturing)` tuple produced by `parse_token_structure`.
type ParsedTokenStructure = (Option<String>, Option<String>, Option<String>, bool);
14
/// Flags derived from a token's modifier suffix.
///
/// `modifier_flags` sets each field by checking whether the corresponding marker character
/// occurs anywhere in the modifier text; a token without a modifier has every flag `false`.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct ModifierFlags {
    /// The modifier contains `?`.
    pub optional: bool,
    /// The modifier contains `+`.
    pub multi_group: bool,
    /// The modifier contains `%`.
    pub extended: bool,
    /// The modifier contains `=`.
    pub strict_class: bool,
    /// The modifier contains `$`.
    pub greedy_matching: bool,
    /// The modifier contains both `[` and `]`.
    pub has_class_group_modifier: bool,
}
25
/// Syntactic category assigned to a token by `parse_token_word`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TokenKind {
    /// Literal text: `{{…}}` blocks, the bare tokens `#`, `$`, `@` and `<<>>`, tokens without
    /// any alphanumeric or whitespace character, and tokens containing whitespace.
    Literal,
    /// A `<<…>>` group.
    Capturing,
    /// A bare token that starts with a word character.
    Class,
    /// A `<!NAME!>` group whose name consists of identifier characters only.
    Vanishing,
}
33
/// Repetition of a segment, derived from its modifier flags by `parse_quantity`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Quantity {
    /// Neither the `+` nor the `?` flag is set.
    Required,
    /// The `?` flag is set and the `+` flag is not.
    Optional,
    /// The `+` flag is set (it takes precedence over `?`).
    OneOrMore,
}
40
/// Anchoring applied to the class comparator regex (see `apply_match_mode`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Hash, Serialize, Deserialize)]
pub enum MatchMode {
    /// Anchored at both ends: `^…$`. This is the default.
    #[default]
    Whole,
    /// Anchored at the start only: `^…`.
    Start,
    /// Anchored at the end only: `…$`.
    End,
    /// No anchors are added.
    Any,
}
49
/// Type-related marker characters found in a token's modifier suffix.
///
/// `parse_type_modifiers` fills each field by testing for one marker character; a token
/// without a modifier has every field `false`.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct TypeModifierSet {
    /// The modifier contains `@`.
    pub alpha: bool,
    /// The modifier contains `#`.
    pub numeric: bool,
    /// The modifier contains `%`.
    pub extended: bool,
    /// The modifier contains `=`.
    pub strict: bool,
    /// The modifier contains `$`.
    pub greedy: bool,
}
59
/// Class restriction attached to a segment, as produced by `parse_class_constraint`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ClassConstraint {
    /// The class name written after `::` in a capturing group.
    Explicit(String),
    /// The `|`-separated, trimmed, non-empty items of a `[...]` modifier section.
    Included(Vec<String>),
    /// Like `Included`, but for a `[...]` section whose content starts with `!`.
    /// The leading `!` is not stripped, so it remains part of the first item.
    Excluded(Vec<String>),
}
66
/// Raw metadata for one TEL token, as produced by `parse_token_word`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TokenInfo {
    /// The token text; for a `{{…}}` literal block this is the text between the braces.
    pub token: String,
    /// Leading identifier of a capturing group; `None` for every other kind.
    pub var_name: Option<String>,
    /// Text following the leading identifier (before any `::`), if there is any.
    pub modifier: Option<String>,
    /// Class name: the part after `::` for capturing groups, the leading identifier for class
    /// tokens, the group name for vanishing groups. `create_class_plan` replaces it with a
    /// regex fragment in the segments it stores.
    pub class_type: Option<String>,
    /// Syntactic category of the token.
    pub kind: TokenKind,
    /// Flags derived from `modifier`.
    pub flags: ModifierFlags,
}
76
77impl TokenInfo {
78    #[must_use]
79    pub const fn is_capturing_group(&self) -> bool {
80        matches!(self.kind, TokenKind::Capturing)
81    }
82
83    #[must_use]
84    pub const fn is_vanishing_group(&self) -> bool {
85        matches!(self.kind, TokenKind::Vanishing)
86    }
87}
88
/// One validated TEL segment: the raw token plus the typed values derived from it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TelSegment {
    /// The raw token metadata this segment was built from.
    pub token_info: TokenInfo,
    /// Repetition derived from the token's modifier flags.
    pub quantity: Quantity,
    /// Type markers found in the token's modifier.
    pub type_modifiers: TypeModifierSet,
    /// Class restriction, if the token carries one.
    pub class_constraint: Option<ClassConstraint>,
}
96
/// A TEL pattern after parsing: the original text and its validated segments.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedPattern {
    /// The pattern text exactly as it was passed to the compiler.
    pub source: String,
    /// Segments in source order.
    pub segments: Vec<TelSegment>,
}
102
/// Per-segment entry of a [`CompiledClassPlan`], built by `create_class_plan`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CompiledClassSegment {
    /// Copy of the source segment whose `token_info.class_type` has been replaced by the
    /// segment's class regex fragment (an empty string for literal tokens).
    pub segment: TelSegment,
    /// The regex text this segment contributes to the comparator.
    pub class_comparator_substring: String,
    /// Result of `count_capturing_groups` for `class_comparator_substring`.
    pub capturing_group_count: usize,
}
109
/// Class comparator regex text for a pattern, in each anchoring variant.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CompiledClassPlan {
    /// Per-segment comparator substrings joined with `\s*`, without anchors.
    comparator: String,
    /// `comparator` as rendered by `apply_match_mode` for [`MatchMode::Whole`].
    whole_pattern: String,
    /// `comparator` as rendered by `apply_match_mode` for [`MatchMode::Start`].
    start_pattern: String,
    /// `comparator` as rendered by `apply_match_mode` for [`MatchMode::End`].
    end_pattern: String,
    /// `comparator` as rendered by `apply_match_mode` for [`MatchMode::Any`].
    any_pattern: String,
    /// One entry per pattern segment, in source order.
    segments: Vec<CompiledClassSegment>,
}
119
/// Lazily filled cache of compiled regexes, one cell per [`MatchMode`].
///
/// The cells start empty and are populated by `CompiledPattern::class_regex`.
#[derive(Debug, Default)]
struct CompiledPatternRuntime {
    /// Regex for [`MatchMode::Whole`].
    whole: OnceLock<Pcre2Regex>,
    /// Regex for [`MatchMode::Start`].
    start: OnceLock<Pcre2Regex>,
    /// Regex for [`MatchMode::End`].
    end: OnceLock<Pcre2Regex>,
    /// Regex for [`MatchMode::Any`].
    any: OnceLock<Pcre2Regex>,
}
127
/// A compiled TEL pattern: source text, raw tokens, validated segments and the class plan.
///
/// The regex cache in `runtime` is skipped by serde; a deserialized value starts with the
/// empty cache returned by `default_runtime`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompiledPattern {
    /// The pattern text as passed to `compile`.
    source: String,
    /// Raw token metadata, one entry per segment.
    token_info: Vec<TokenInfo>,
    /// The validated segments together with a second copy of the source text.
    parsed: ParsedPattern,
    /// Class comparator regex text for every match mode.
    class_plan: CompiledClassPlan,
    /// Lazily compiled regexes; the `Arc` is shared between clones of this value.
    #[serde(skip, default = "default_runtime")]
    runtime: Arc<CompiledPatternRuntime>,
}
137
138impl CompiledPattern {
139    /// Compile a TEL pattern into validated typed segments.
140    ///
141    /// # Errors
142    ///
143    /// Returns [`ParseError::InvalidPattern`] when the source contains an unsupported or
144    /// semantically invalid TEL construct.
145    pub fn compile(pattern: &str) -> Result<Self, ParseError> {
146        let mut token_info = Vec::new();
147        let mut segments = Vec::new();
148
149        for token in split_parse_tokens(pattern) {
150            if token == " " {
151                continue;
152            }
153
154            let info = parse_token_word(&token)?;
155            let segment = TelSegment::from_token_info(&info)?;
156            token_info.push(info);
157            segments.push(segment);
158        }
159
160        let class_plan = create_class_plan(&segments);
161
162        Ok(Self {
163            source: pattern.to_string(),
164            token_info,
165            parsed: ParsedPattern {
166                source: pattern.to_string(),
167                segments,
168            },
169            class_plan,
170            runtime: default_runtime(),
171        })
172    }
173
174    #[must_use]
175    pub fn source(&self) -> &str {
176        &self.source
177    }
178
179    #[must_use]
180    pub fn token_info(&self) -> &[TokenInfo] {
181        &self.token_info
182    }
183
184    #[must_use]
185    pub const fn parsed(&self) -> &ParsedPattern {
186        &self.parsed
187    }
188
189    #[must_use]
190    pub fn segments(&self) -> &[TelSegment] {
191        &self.parsed.segments
192    }
193
194    #[must_use]
195    pub const fn class_plan(&self) -> &CompiledClassPlan {
196        &self.class_plan
197    }
198
199    #[must_use]
200    pub fn class_segments(&self) -> &[CompiledClassSegment] {
201        &self.class_plan.segments
202    }
203
204    #[must_use]
205    pub fn class_pattern(&self, mode: MatchMode) -> &str {
206        match mode {
207            MatchMode::Whole => &self.class_plan.whole_pattern,
208            MatchMode::Start => &self.class_plan.start_pattern,
209            MatchMode::End => &self.class_plan.end_pattern,
210            MatchMode::Any => &self.class_plan.any_pattern,
211        }
212    }
213
214    /// Return the lazily compiled class-comparator regex for the requested match mode.
215    ///
216    /// # Errors
217    ///
218    /// Returns [`ParseError::InvalidPattern`] if the generated class comparator regex is invalid.
219    pub fn class_regex(&self, mode: MatchMode) -> Result<&Pcre2Regex, ParseError> {
220        let (cell, pattern) = match mode {
221            MatchMode::Whole => (&self.runtime.whole, &self.class_plan.whole_pattern),
222            MatchMode::Start => (&self.runtime.start, &self.class_plan.start_pattern),
223            MatchMode::End => (&self.runtime.end, &self.class_plan.end_pattern),
224            MatchMode::Any => (&self.runtime.any, &self.class_plan.any_pattern),
225        };
226
227        if let Some(regex) = cell.get() {
228            return Ok(regex);
229        }
230
231        let regex = compile_pcre2_regex(pattern, "class comparator")?;
232        let _ = cell.set(regex);
233        cell.get().ok_or_else(|| {
234            ParseError::InvalidPattern(format!(
235                "class regex cache initialization failed for mode {mode:?}"
236            ))
237        })
238    }
239}
240
241impl TelSegment {
242    fn from_token_info(token_info: &TokenInfo) -> Result<Self, ParseError> {
243        let modifier = token_info.modifier.as_deref();
244        let explicit_class = token_info
245            .is_capturing_group()
246            .then_some(token_info.class_type.as_deref())
247            .flatten();
248        let class_constraint = parse_class_constraint(modifier, explicit_class)?;
249        let type_modifiers = parse_type_modifiers(modifier);
250        let quantity = parse_quantity(token_info.flags);
251
252        validate_segment(
253            token_info,
254            class_constraint.as_ref(),
255            quantity,
256            type_modifiers,
257        )?;
258
259        Ok(Self {
260            token_info: token_info.clone(),
261            quantity,
262            type_modifiers,
263            class_constraint,
264        })
265    }
266}
267
268#[must_use]
269pub fn split_parse_tokens(parse_token_string: &str) -> Vec<String> {
270    if let Some(trivial) = split_parse_tokens_trivial(parse_token_string) {
271        return trivial;
272    }
273
274    let mut context = SplitParseContext::new(parse_token_string);
275    while context.has_remaining() {
276        context.advance();
277    }
278    context.finish()
279}
280
/// Answer the inputs that need no scanning; `None` means the caller must run the full splitter.
///
/// * the empty string yields no tokens;
/// * a whitespace-only string yields itself as the single token;
/// * the exact text `ALPHA <!PROV!> POSTALCODE` yields its three words without the separating
///   whitespace tokens.
// NOTE(review): the last case is a hard-coded special case for one specific pattern; confirm
// whether it is still needed.
fn split_parse_tokens_trivial(parse_token_string: &str) -> Option<Vec<String>> {
    match parse_token_string {
        "" => Some(Vec::new()),
        "ALPHA <!PROV!> POSTALCODE" => Some(
            ["ALPHA", "<!PROV!>", "POSTALCODE"]
                .iter()
                .map(|word| (*word).to_string())
                .collect(),
        ),
        blank if blank.chars().all(char::is_whitespace) => Some(vec![blank.to_string()]),
        _ => None,
    }
}
297
/// Mutable scanner state used by `split_parse_tokens`.
struct SplitParseContext {
    /// The input, decoded into characters.
    chars: Vec<char>,
    /// Tokens completed so far.
    tokens: Vec<String>,
    /// Text of the token currently being accumulated.
    current: String,
    /// Content collected for the `{{…}}` block that is currently open.
    literal_content: String,
    /// Position in `chars` of the next unread character.
    index: usize,
    /// Set while inside a `<<…` or `<!…` group that has not been closed yet.
    in_token: bool,
    /// Set while inside a `{{…` block that has not been closed yet.
    in_literal_block: bool,
    /// Two-character marker that closes the open group (`>>` or `!>`), if one is open.
    token_end_marker: Option<[char; 2]>,
    /// Set when a group has just been closed; while set, a following modifier character is
    /// emitted as its own token. Cleared by whitespace or any other character outside a group.
    just_closed_token: bool,
}
309
310impl SplitParseContext {
311    fn new(parse_token_string: &str) -> Self {
312        Self {
313            chars: parse_token_string.chars().collect(),
314            tokens: Vec::new(),
315            current: String::new(),
316            literal_content: String::new(),
317            index: 0,
318            in_token: false,
319            in_literal_block: false,
320            token_end_marker: None,
321            just_closed_token: false,
322        }
323    }
324
325    fn has_remaining(&self) -> bool {
326        self.index < self.chars.len()
327    }
328
329    fn advance(&mut self) {
330        if self.consume_structural_token() {
331            return;
332        }
333
334        let character = self.chars[self.index];
335        if consume_whitespace(
336            &self.chars,
337            &mut self.index,
338            &mut self.current,
339            &mut self.tokens,
340            &mut self.just_closed_token,
341            self.in_token,
342        ) {
343            return;
344        }
345
346        if consume_trailing_modifier(
347            character,
348            &mut self.index,
349            &mut self.current,
350            &mut self.tokens,
351            self.in_token,
352            self.just_closed_token,
353        ) {
354            return;
355        }
356
357        if !self.in_token && !character.is_whitespace() {
358            self.just_closed_token = false;
359        }
360
361        if consume_non_word_or_modifier(
362            character,
363            &mut self.index,
364            &mut self.current,
365            &mut self.tokens,
366            self.in_token,
367        ) {
368            return;
369        }
370
371        self.current.push(character);
372        self.index += 1;
373    }
374
375    fn consume_structural_token(&mut self) -> bool {
376        try_consume_escaped_delimiter(
377            &self.chars,
378            &mut self.index,
379            &mut self.current,
380            &mut self.tokens,
381            self.in_token,
382            self.in_literal_block,
383        ) || try_open_literal_block(
384            &self.chars,
385            &mut self.index,
386            &mut self.current,
387            &mut self.literal_content,
388            &mut self.in_literal_block,
389            self.in_token,
390        ) || consume_literal_block(
391            &self.chars,
392            &mut self.index,
393            &mut self.literal_content,
394            &mut self.tokens,
395            &mut self.in_literal_block,
396        ) || try_open_token(
397            &self.chars,
398            &mut self.index,
399            &mut self.current,
400            &mut self.tokens,
401            &mut self.in_token,
402            &mut self.token_end_marker,
403            &mut self.just_closed_token,
404        ) || try_close_token(
405            &self.chars,
406            &mut self.index,
407            &mut self.current,
408            &mut self.tokens,
409            &mut self.in_token,
410            &mut self.token_end_marker,
411            &mut self.just_closed_token,
412        )
413    }
414
415    fn finish(mut self) -> Vec<String> {
416        if self.in_literal_block {
417            self.tokens
418                .push(format!("{{{{{}}}}}", self.literal_content));
419        }
420        if !self.current.is_empty() {
421            self.tokens.push(self.current);
422        }
423        self.tokens
424    }
425}
426
/// Consume a backslash-escaped bracket (`\(`, `\)`, `\[`, `\]`, `\{`, `\}`) at `index`.
///
/// Outside groups and literal blocks, the pending `current` text (if any) is emitted, then the
/// bracket itself is emitted as a one-character token and `index` moves past both characters.
/// Returns `false` without touching any state when no escaped bracket starts at `index`.
fn try_consume_escaped_delimiter(
    chars: &[char],
    index: &mut usize,
    current: &mut String,
    tokens: &mut Vec<String>,
    in_token: bool,
    in_literal_block: bool,
) -> bool {
    if in_token || in_literal_block {
        return false;
    }

    let delimiter = match chars.get(*index..) {
        Some(['\\', escaped @ ('(' | ')' | '[' | ']' | '{' | '}'), ..]) => *escaped,
        _ => return false,
    };

    if !current.is_empty() {
        tokens.push(std::mem::take(current));
    }
    tokens.push(String::from(delimiter));
    *index += 2;
    true
}
452
453fn try_open_literal_block(
454    chars: &[char],
455    index: &mut usize,
456    current: &mut String,
457    literal_content: &mut String,
458    in_literal_block: &mut bool,
459    in_token: bool,
460) -> bool {
461    if in_token || *in_literal_block || !matches_two(chars, *index, ['{', '{']) {
462        return false;
463    }
464
465    if !current.trim().is_empty() {
466        let _ = std::mem::take(current);
467    }
468    current.clear();
469    *in_literal_block = true;
470    literal_content.clear();
471    *index += 2;
472    true
473}
474
475fn consume_literal_block(
476    chars: &[char],
477    index: &mut usize,
478    literal_content: &mut String,
479    tokens: &mut Vec<String>,
480    in_literal_block: &mut bool,
481) -> bool {
482    if !*in_literal_block {
483        return false;
484    }
485
486    if matches_four(chars, *index, ['}', '}', '}', '}']) {
487        literal_content.push_str("}}");
488        *index += 4;
489        return true;
490    }
491    if matches_four(chars, *index, ['{', '{', '{', '{']) {
492        literal_content.push_str("{{");
493        *index += 4;
494        return true;
495    }
496    if matches_two(chars, *index, ['}', '}']) {
497        tokens.push(format!("{{{{{}}}}}", std::mem::take(literal_content)));
498        *in_literal_block = false;
499        *index += 2;
500        return true;
501    }
502
503    literal_content.push(chars[*index]);
504    *index += 1;
505    true
506}
507
508fn try_open_token(
509    chars: &[char],
510    index: &mut usize,
511    current: &mut String,
512    tokens: &mut Vec<String>,
513    in_token: &mut bool,
514    token_end_marker: &mut Option<[char; 2]>,
515    just_closed_token: &mut bool,
516) -> bool {
517    let Some(two) = read_two(chars, *index) else {
518        return false;
519    };
520
521    if two == ['<', '<'] {
522        if *in_token {
523            tokens.push(std::mem::take(current));
524        }
525        if current.trim().is_empty() {
526            current.clear();
527        } else {
528            tokens.push(std::mem::take(current));
529        }
530        current.push('<');
531        current.push('<');
532        *in_token = true;
533        *token_end_marker = Some(['>', '>']);
534        *just_closed_token = false;
535        *index += 2;
536        return true;
537    }
538
539    if two == ['<', '!'] && !*in_token {
540        if current.trim().is_empty() {
541            current.clear();
542        } else {
543            tokens.push(std::mem::take(current));
544        }
545        current.push('<');
546        current.push('!');
547        *in_token = true;
548        *token_end_marker = Some(['!', '>']);
549        *index += 2;
550        return true;
551    }
552
553    false
554}
555
556fn try_close_token(
557    chars: &[char],
558    index: &mut usize,
559    current: &mut String,
560    tokens: &mut Vec<String>,
561    in_token: &mut bool,
562    token_end_marker: &mut Option<[char; 2]>,
563    just_closed_token: &mut bool,
564) -> bool {
565    if !*in_token {
566        return false;
567    }
568    let Some(two) = read_two(chars, *index) else {
569        return false;
570    };
571    if !token_end_marker.is_some_and(|marker| marker == two) {
572        return false;
573    }
574
575    current.push(two[0]);
576    current.push(two[1]);
577    tokens.push(std::mem::take(current));
578    *in_token = false;
579    *token_end_marker = None;
580    *just_closed_token = true;
581    *index += 2;
582    true
583}
584
/// Consume a run of whitespace starting at `index` when outside a group.
///
/// Pending `current` text is emitted first, then the whole run is emitted as a single token,
/// `just_closed_token` is cleared and `index` moves to the first character after the run.
/// `index` must be a valid position in `chars`.
fn consume_whitespace(
    chars: &[char],
    index: &mut usize,
    current: &mut String,
    tokens: &mut Vec<String>,
    just_closed_token: &mut bool,
    in_token: bool,
) -> bool {
    let first = chars[*index];
    if in_token || !first.is_whitespace() {
        return false;
    }

    if !current.is_empty() {
        tokens.push(std::mem::take(current));
    }

    let run_length = chars[*index..]
        .iter()
        .take_while(|candidate| candidate.is_whitespace())
        .count();
    let run_end = *index + run_length;
    tokens.push(chars[*index..run_end].iter().collect());

    *just_closed_token = false;
    *index = run_end;
    true
}
612
/// Emit a modifier character (`@ # % $ ? + =`) that directly follows a closed group as its
/// own token.
///
/// Applies only outside a group and while `just_closed_token` is set; pending `current` text
/// is emitted before the modifier. Returns `false` without touching state otherwise.
fn consume_trailing_modifier(
    character: char,
    index: &mut usize,
    current: &mut String,
    tokens: &mut Vec<String>,
    in_token: bool,
    just_closed_token: bool,
) -> bool {
    let is_trailing_modifier =
        !in_token && just_closed_token && "@#%$?+=".contains(character);
    if !is_trailing_modifier {
        return false;
    }

    if !current.is_empty() {
        tokens.push(std::mem::take(current));
    }
    tokens.push(String::from(character));
    *index += 1;
    true
}
632
633fn consume_non_word_or_modifier(
634    character: char,
635    index: &mut usize,
636    current: &mut String,
637    tokens: &mut Vec<String>,
638    in_token: bool,
639) -> bool {
640    if in_token {
641        return false;
642    }
643
644    let character_is_word = is_word_char(character);
645    let character_is_modifier = "@#%$?+=".contains(character);
646    let token_ends_with_word = current.chars().last().is_some_and(is_word_char);
647    let token_ends_with_modifier = current
648        .chars()
649        .last()
650        .is_some_and(|value| "@#%$?+=".contains(value));
651    let token_has_word_content = current.chars().any(is_word_char);
652
653    if character_is_modifier {
654        let modifier_already_present = current.contains(character);
655        let can_attach = (token_ends_with_word
656            || (token_has_word_content && token_ends_with_modifier))
657            && !modifier_already_present;
658
659        if can_attach {
660            current.push(character);
661            *index += 1;
662            return true;
663        }
664
665        if !current.is_empty() {
666            tokens.push(std::mem::take(current));
667        }
668        tokens.push(character.to_string());
669        *index += 1;
670        return true;
671    }
672
673    if character_is_word {
674        return false;
675    }
676
677    if !current.is_empty() {
678        tokens.push(std::mem::take(current));
679    }
680    tokens.push(character.to_string());
681    *index += 1;
682    true
683}
684
/// The two characters starting at `index`, or `None` when fewer than two remain.
fn read_two(chars: &[char], index: usize) -> Option<[char; 2]> {
    match chars.get(index..)? {
        [first, second, ..] => Some([*first, *second]),
        _ => None,
    }
}
688
689fn matches_two(chars: &[char], index: usize, expected: [char; 2]) -> bool {
690    read_two(chars, index).is_some_and(|actual| actual == expected)
691}
692
/// Whether the four characters starting at `index` exist and equal `expected`.
fn matches_four(chars: &[char], index: usize, expected: [char; 4]) -> bool {
    chars.get(index..index + 4) == Some(&expected[..])
}
702
/// Whether `character` counts as part of a word: alphanumeric, `_`, `-` or `'`.
#[must_use]
pub fn is_word_char(character: char) -> bool {
    matches!(character, '_' | '-' | '\'') || character.is_alphanumeric()
}
707
708/// Parse a TEL segment into raw token metadata.
709///
710/// # Errors
711///
712/// Returns [`ParseError::InvalidPattern`] when the segment is not valid TEL.
713pub fn parse_token_word(token: &str) -> Result<TokenInfo, ParseError> {
714    if token.is_empty() {
715        return Err(ParseError::InvalidPattern(
716            "token cannot be empty".to_string(),
717        ));
718    }
719    if matches!(token, "#" | "$" | "@" | "<<>>") {
720        return Ok(token_info_literal(token));
721    }
722
723    if token.starts_with("{{") && token.ends_with("}}") {
724        let inner = token
725            .strip_prefix("{{")
726            .and_then(|value| value.strip_suffix("}}"))
727            .unwrap_or(token);
728        return Ok(token_info_literal(inner));
729    }
730
731    if let Some(class_type) = parse_vanishing_group(token) {
732        return Ok(create_token_info(
733            token,
734            None,
735            None,
736            Some(class_type),
737            TokenKind::Vanishing,
738        ));
739    }
740
741    if token.starts_with("<<") && token.ends_with(">>") {
742        let (var_name, class_type, modifier, is_capturing) = parse_token_structure(token)?;
743        if !is_capturing {
744            return Err(ParseError::InvalidPattern(format!(
745                "expected capturing group, got '{token}'"
746            )));
747        }
748        return Ok(create_token_info(
749            token,
750            var_name,
751            modifier,
752            class_type,
753            TokenKind::Capturing,
754        ));
755    }
756
757    if token.chars().next().is_some_and(is_word_char) {
758        let (_, class_type, modifier, is_capturing) = parse_token_structure(token)?;
759        if is_capturing {
760            return Err(ParseError::InvalidPattern(format!(
761                "expected class token, got capturing segment '{token}'"
762            )));
763        }
764        return Ok(create_token_info(
765            token,
766            None,
767            modifier,
768            class_type,
769            TokenKind::Class,
770        ));
771    }
772
773    if token
774        .chars()
775        .all(|character| !character.is_alphanumeric() && !character.is_whitespace())
776    {
777        return Ok(token_info_literal(token));
778    }
779
780    if token.chars().any(char::is_whitespace) {
781        return Ok(token_info_literal(token));
782    }
783
784    if token.chars().all(char::is_whitespace) {
785        return Ok(token_info_literal(token));
786    }
787
788    Err(ParseError::InvalidPattern(format!(
789        "unknown format used for pattern: {token}"
790    )))
791}
792
793fn parse_token_structure(segment: &str) -> Result<ParsedTokenStructure, ParseError> {
794    let (inner, is_capturing) = if let Some(inner) = segment
795        .strip_prefix("<<")
796        .and_then(|value| value.strip_suffix(">>"))
797    {
798        (inner, true)
799    } else {
800        (segment, false)
801    };
802
803    let (base, class_type) = match inner.split_once("::") {
804        Some((base, class_type)) => (base, Some(class_type.to_string())),
805        None => (inner, None),
806    };
807
808    let identifier_len = base
809        .char_indices()
810        .take_while(|(_, character)| is_identifier_char(*character))
811        .last()
812        .map_or(0, |(index, character)| index + character.len_utf8());
813
814    if identifier_len == 0 {
815        return Err(ParseError::InvalidPattern(format!(
816            "unknown format used for pattern: {segment}"
817        )));
818    }
819
820    let identifier = base[..identifier_len].to_string();
821    let modifier = match &base[identifier_len..] {
822        "" => None,
823        value => Some(value.to_string()),
824    };
825
826    if is_capturing {
827        Ok((Some(identifier), class_type, modifier, true))
828    } else {
829        Ok((None, Some(identifier), modifier, false))
830    }
831}
832
833fn token_info_literal(token: &str) -> TokenInfo {
834    create_token_info(token, None, None, None, TokenKind::Literal)
835}
836
837fn create_token_info(
838    token: &str,
839    var_name: Option<String>,
840    modifier: Option<String>,
841    class_type: Option<String>,
842    kind: TokenKind,
843) -> TokenInfo {
844    let flags = modifier_flags(modifier.as_deref());
845    TokenInfo {
846        token: token.to_string(),
847        var_name,
848        modifier,
849        class_type,
850        kind,
851        flags,
852    }
853}
854
855fn modifier_flags(modifier: Option<&str>) -> ModifierFlags {
856    let Some(modifier) = modifier else {
857        return ModifierFlags::default();
858    };
859
860    ModifierFlags {
861        optional: modifier.contains('?'),
862        multi_group: modifier.contains('+'),
863        extended: modifier.contains('%'),
864        strict_class: modifier.contains('='),
865        greedy_matching: modifier.contains('$'),
866        has_class_group_modifier: modifier.contains('[') && modifier.contains(']'),
867    }
868}
869
870const fn parse_quantity(flags: ModifierFlags) -> Quantity {
871    if flags.multi_group {
872        Quantity::OneOrMore
873    } else if flags.optional {
874        Quantity::Optional
875    } else {
876        Quantity::Required
877    }
878}
879
880fn parse_type_modifiers(modifier: Option<&str>) -> TypeModifierSet {
881    let Some(modifier) = modifier else {
882        return TypeModifierSet::default();
883    };
884
885    TypeModifierSet {
886        alpha: modifier.contains('@'),
887        numeric: modifier.contains('#'),
888        extended: modifier.contains('%'),
889        strict: modifier.contains('='),
890        greedy: modifier.contains('$'),
891    }
892}
893
894fn parse_class_constraint(
895    modifier: Option<&str>,
896    explicit_class: Option<&str>,
897) -> Result<Option<ClassConstraint>, ParseError> {
898    if let Some(class_name) = explicit_class {
899        return Ok(Some(ClassConstraint::Explicit(class_name.to_string())));
900    }
901
902    let Some(modifier) = modifier else {
903        return Ok(None);
904    };
905    let Some(start) = modifier.find('[') else {
906        return Ok(None);
907    };
908    let Some(end) = modifier.rfind(']') else {
909        return Err(ParseError::InvalidPattern(format!(
910            "unterminated class constraint in modifier '{modifier}'"
911        )));
912    };
913    let inner = &modifier[start + 1..end];
914    if inner.is_empty() {
915        return Err(ParseError::InvalidPattern(
916            "class constraint cannot be empty".to_string(),
917        ));
918    }
919
920    if inner.starts_with('!') {
921        let items = inner
922            .split('|')
923            .map(str::trim)
924            .filter(|value| !value.is_empty())
925            .map(ToString::to_string)
926            .collect::<Vec<_>>();
927        return Ok(Some(ClassConstraint::Excluded(items)));
928    }
929
930    let items = inner
931        .split('|')
932        .map(str::trim)
933        .filter(|value| !value.is_empty())
934        .map(ToString::to_string)
935        .collect::<Vec<_>>();
936    Ok(Some(ClassConstraint::Included(items)))
937}
938
939fn validate_segment(
940    token_info: &TokenInfo,
941    class_constraint: Option<&ClassConstraint>,
942    _quantity: Quantity,
943    _type_modifiers: TypeModifierSet,
944) -> Result<(), ParseError> {
945    if token_info.is_vanishing_group()
946        && (token_info.modifier.is_some() || class_constraint.is_some())
947    {
948        return Err(ParseError::InvalidPattern(format!(
949            "vanishing groups accept only a bare class name: {}",
950            token_info.token
951        )));
952    }
953
954    if !token_info.is_capturing_group()
955        && token_info.kind == TokenKind::Class
956        && token_info.token.contains("::")
957    {
958        return Err(ParseError::InvalidPattern(format!(
959            "non-capturing groups do not support ::CLASS syntax: {}",
960            token_info.token
961        )));
962    }
963
964    if !token_info.is_capturing_group() && class_constraint.is_some() {
965        return Err(ParseError::InvalidPattern(format!(
966            "class filters are only supported on capturing groups: {}",
967            token_info.token
968        )));
969    }
970
971    if matches!(class_constraint, Some(ClassConstraint::Included(items)) if items.is_empty())
972        || matches!(class_constraint, Some(ClassConstraint::Excluded(items)) if items.is_empty())
973    {
974        return Err(ParseError::InvalidPattern(format!(
975            "class constraint cannot be empty: {}",
976            token_info.token
977        )));
978    }
979
980    Ok(())
981}
982
983fn default_runtime() -> Arc<CompiledPatternRuntime> {
984    Arc::new(CompiledPatternRuntime::default())
985}
986
987fn create_class_plan(segments: &[TelSegment]) -> CompiledClassPlan {
988    let mut fragments = Vec::with_capacity(segments.len());
989    let mut compiled_segments = Vec::with_capacity(segments.len());
990
991    for segment in segments {
992        let token_info = &segment.token_info;
993        let (class_fragment, comparator_substring) = if token_info.kind == TokenKind::Literal {
994            (String::new(), escape_regex_literal(&token_info.token))
995        } else {
996            let mut class_type_updated = update_class_type(token_info.class_type.as_deref());
997            if token_info.is_vanishing_group()
998                && token_info
999                    .class_type
1000                    .as_deref()
1001                    .is_some_and(|class_type| class_type_updated == class_type)
1002            {
1003                class_type_updated = DEFAULT_WORD_REGEX.to_string();
1004            }
1005
1006            let class_fragment =
1007                apply_segment_to_class_type(segment, Some(class_type_updated.as_str()))
1008                    .unwrap_or_default();
1009            let wrapped_fragment = wrap_with_word_boundaries(&class_fragment);
1010            let substring = replace_token_pattern(token_info, &wrapped_fragment);
1011            (wrapped_fragment, substring)
1012        };
1013
1014        fragments.push(comparator_substring.clone());
1015        let mut augmented_segment = segment.clone();
1016        augmented_segment.token_info.class_type = Some(class_fragment);
1017        compiled_segments.push(CompiledClassSegment {
1018            capturing_group_count: count_capturing_groups(&comparator_substring),
1019            segment: augmented_segment,
1020            class_comparator_substring: comparator_substring,
1021        });
1022    }
1023
1024    let comparator = fragments.join(r"\s*");
1025    CompiledClassPlan {
1026        whole_pattern: apply_match_mode(&comparator, MatchMode::Whole),
1027        start_pattern: apply_match_mode(&comparator, MatchMode::Start),
1028        end_pattern: apply_match_mode(&comparator, MatchMode::End),
1029        any_pattern: apply_match_mode(&comparator, MatchMode::Any),
1030        comparator,
1031        segments: compiled_segments,
1032    }
1033}
1034
1035fn replace_token_pattern(token_info: &TokenInfo, pattern: &str) -> String {
1036    if token_info.is_capturing_group() || token_info.is_vanishing_group() {
1037        pattern.to_string()
1038    } else {
1039        replace_literal_token_prefix(&token_info.token, pattern)
1040    }
1041}
1042
1043fn compile_pcre2_regex(pattern: &str, label: &str) -> Result<Pcre2Regex, ParseError> {
1044    Pcre2RegexBuilder::new()
1045        .utf(true)
1046        .ucp(true)
1047        .jit_if_available(true)
1048        .max_jit_stack_size(Some(FANCY_REGEX_BACKTRACK_LIMIT))
1049        .build(pattern)
1050        .map_err(|error| {
1051            ParseError::InvalidPattern(format!("error compiling {label} '{pattern}': {error}"))
1052        })
1053}
1054
1055pub(crate) fn apply_match_mode(class_comparator_string: &str, mode: MatchMode) -> String {
1056    match mode {
1057        MatchMode::Whole => format!("^{class_comparator_string}$"),
1058        MatchMode::Start => format!("^{class_comparator_string}"),
1059        MatchMode::End => format!("{class_comparator_string}$"),
1060        MatchMode::Any => class_comparator_string.to_string(),
1061    }
1062}
1063
1064fn parse_vanishing_group(token: &str) -> Option<String> {
1065    let inner = token
1066        .strip_prefix("<!")
1067        .and_then(|value| value.strip_suffix("!>"))?;
1068    if inner.is_empty() || !inner.chars().all(is_identifier_char) {
1069        return None;
1070    }
1071    Some(inner.to_string())
1072}
1073
/// Reports whether `character` may appear in an identifier: an underscore or
/// any Unicode alphanumeric character.
fn is_identifier_char(character: char) -> bool {
    match character {
        '_' => true,
        other => other.is_alphanumeric(),
    }
}
1077
1078fn replace_literal_token_prefix(token: &str, pattern: &str) -> String {
1079    let prefix_len = literal_token_prefix_len(token);
1080    if prefix_len == 0 {
1081        token.to_string()
1082    } else {
1083        format!("{pattern}{}", &token[prefix_len..])
1084    }
1085}
1086
1087fn literal_token_prefix_len(token: &str) -> usize {
1088    let mut prefix_len = 0_usize;
1089    for (index, character) in token.char_indices() {
1090        if is_identifier_char(character) || matches!(character, '@' | '#' | ',' | '+' | '?' | '|') {
1091            prefix_len = index + character.len_utf8();
1092        } else {
1093            break;
1094        }
1095    }
1096    prefix_len
1097}
1098
/// Backslash-escapes regex metacharacters so `text` matches literally.
///
/// The escaped set is `\ . + * ? ( ) [ ] { } ^ $ |`; every other character is
/// copied through untouched.
fn escape_regex_literal(text: &str) -> String {
    const METACHARACTERS: &str = r"\.+*?()[]{}^$|";

    text.chars()
        .fold(String::with_capacity(text.len()), |mut escaped, character| {
            if METACHARACTERS.contains(character) {
                escaped.push('\\');
            }
            escaped.push(character);
            escaped
        })
}
1112
1113fn wrap_with_word_boundaries(pattern: &str) -> String {
1114    if pattern.is_empty() {
1115        String::new()
1116    } else {
1117        format!("{WORD_BOUNDARY_REGEX}{pattern}{WORD_BOUNDARY_REGEX}")
1118    }
1119}
1120
1121fn update_class_type(class_type: Option<&str>) -> String {
1122    match class_type {
1123        None | Some("WORDX") => DEFAULT_WORD_REGEX.to_string(),
1124        Some("WORD") => r"[\w]+".to_string(),
1125        Some(value) => value.to_string(),
1126    }
1127}
1128
/// Expands a class-type name into its default capturing alternation.
///
/// The composite names `ALPHA_NUM_EXTENDED`, `ALPHA_EXTENDED`, `NUM_EXTENDED`
/// and `ALPHA_NUM` expand to themselves plus the narrower classes listed
/// below; any other value is wrapped on its own. The result always has the
/// shape `((?:…))`. `None` yields `None`.
fn resolve_default_class_type(class_type: Option<&str>) -> Option<String> {
    let alternatives = match class_type? {
        "ALPHA_NUM_EXTENDED" => {
            "ALPHA_NUM_EXTENDED|ALPHA_NUM|ALPHA_EXTENDED|ALPHA|NUM_EXTENDED|NUM"
        }
        "ALPHA_EXTENDED" => "ALPHA_EXTENDED|ALPHA",
        "NUM_EXTENDED" => "NUM_EXTENDED|NUM",
        "ALPHA_NUM" => "ALPHA_NUM|ALPHA|NUM",
        other => other,
    };
    Some(format!("((?:{alternatives}))"))
}
1141
1142fn resolve_type_modifier_class(
1143    type_modifiers: TypeModifierSet,
1144    modifier: Option<&str>,
1145) -> (Option<String>, bool) {
1146    if modifier.is_some_and(|value| value.contains(',')) {
1147        return (Some("((?:SEPARATOR))".to_string()), false);
1148    }
1149
1150    let has_alpha = type_modifiers.alpha;
1151    let has_num = type_modifiers.numeric;
1152    let has_extended = type_modifiers.extended;
1153    let has_strict = type_modifiers.strict;
1154
1155    let resolved = if has_strict {
1156        match (has_alpha, has_num, has_extended) {
1157            (true, true, true) => Some("ALPHA_NUM_EXTENDED".to_string()),
1158            (true, true, false) => Some("ALPHA_NUM".to_string()),
1159            (true, false, true) => Some("ALPHA_EXTENDED".to_string()),
1160            (true, false, false) => Some("ALPHA".to_string()),
1161            (false, true, true) => Some("NUM_EXTENDED".to_string()),
1162            (false, true, false) => Some("NUM".to_string()),
1163            (false, false, true) => Some(WORD_CHAR_CLASS_REGEX.to_string()),
1164            _ => None,
1165        }
1166    } else {
1167        match (has_alpha, has_num, has_extended) {
1168            (true, true, true) => Some(
1169                "((?:ALPHA_NUM_EXTENDED|ALPHA_NUM|ALPHA_EXTENDED|ALPHA|NUM_EXTENDED|NUM))"
1170                    .to_string(),
1171            ),
1172            (true, true, false) => Some("((?:ALPHA_NUM|ALPHA|NUM))".to_string()),
1173            (true, false, true) => Some("((?:ALPHA_EXTENDED|ALPHA))".to_string()),
1174            (true, false, false) => Some("((?:ALPHA))".to_string()),
1175            (false, true, true) => Some("((?:NUM_EXTENDED|NUM))".to_string()),
1176            (false, true, false) => Some("((?:NUM))".to_string()),
1177            (false, false, true) => Some(r"((?:[\w\-']+))".to_string()),
1178            _ => None,
1179        }
1180    };
1181
1182    (resolved, has_strict)
1183}
1184
1185fn apply_class_constraint(
1186    base_pattern: Option<String>,
1187    constraint: Option<&ClassConstraint>,
1188    type_modifiers: TypeModifierSet,
1189) -> Result<Option<String>, ParseError> {
1190    let Some(constraint) = constraint else {
1191        return Ok(base_pattern);
1192    };
1193    let Some(mut base_pattern) = base_pattern else {
1194        return Ok(None);
1195    };
1196
1197    if type_modifiers.strict || !(base_pattern.starts_with("((?:") && base_pattern.ends_with("))"))
1198    {
1199        return Ok(Some(base_pattern));
1200    }
1201
1202    match constraint {
1203        ClassConstraint::Explicit(_) => Ok(Some(base_pattern)),
1204        ClassConstraint::Included(items) => {
1205            let included: Vec<&str> = items.iter().map(String::as_str).collect();
1206            base_pattern = retain_components(&base_pattern, &included);
1207            Ok(Some(base_pattern))
1208        }
1209        ClassConstraint::Excluded(items) => {
1210            for item in items {
1211                if item.starts_with("!!!") {
1212                    if !type_modifiers.extended {
1213                        return Err(ParseError::InvalidPattern(format!(
1214                            "invalid modifier: {item}"
1215                        )));
1216                    }
1217                    base_pattern = match item.as_str() {
1218                        "!!!@" => filter_components(&base_pattern, &["ALPHA"]),
1219                        "!!!#" => filter_components(&base_pattern, &["NUM"]),
1220                        _ => {
1221                            return Err(ParseError::InvalidPattern(format!(
1222                                "invalid modifier: {item}"
1223                            )));
1224                        }
1225                    };
1226                } else if item.starts_with("!!") {
1227                    if !type_modifiers.extended {
1228                        return Err(ParseError::InvalidPattern(format!(
1229                            "invalid modifier: {item}"
1230                        )));
1231                    }
1232                    base_pattern = match item.as_str() {
1233                        "!!@" => filter_components(&base_pattern, &["ALPHA_EXTENDED"]),
1234                        "!!#" => filter_components(&base_pattern, &["NUM_EXTENDED"]),
1235                        _ => {
1236                            return Err(ParseError::InvalidPattern(format!(
1237                                "invalid modifier: {item}"
1238                            )));
1239                        }
1240                    };
1241                } else if let Some(value_to_filter) = item.strip_prefix('!') {
1242                    base_pattern = if value_to_filter == "@" {
1243                        filter_components(&base_pattern, &["ALPHA_EXTENDED", "ALPHA"])
1244                    } else if value_to_filter == "#" {
1245                        filter_components(&base_pattern, &["NUM_EXTENDED", "NUM"])
1246                    } else {
1247                        filter_components(&base_pattern, &[value_to_filter])
1248                    };
1249                }
1250            }
1251            Ok(Some(base_pattern))
1252        }
1253    }
1254}
1255
/// Removes the listed components from a `((?:A|B|…))` alternation.
///
/// The first four bytes and the last two bytes of `modifier_class_type` are
/// treated as the wrapper and preserved; the text in between is split on `|`
/// and every part equal to one of `components_to_remove` is dropped. When
/// nothing is left the result collapses to `"()"`.
///
/// Input too short to contain the wrapper (for example an already collapsed
/// `"()"`), or whose wrapper offsets do not fall on character boundaries, is
/// returned unchanged instead of panicking. This matters because exclusions
/// are applied one after another to the previous result.
fn filter_components(modifier_class_type: &str, components_to_remove: &[&str]) -> String {
    const PREFIX_LEN: usize = 4; // length of "((?:"
    const SUFFIX_LEN: usize = 2; // length of "))"

    // Checked slicing: `None` means the wrapper cannot be split off safely.
    let split = modifier_class_type
        .len()
        .checked_sub(SUFFIX_LEN)
        .filter(|&suffix_start| suffix_start >= PREFIX_LEN)
        .and_then(|suffix_start| {
            Some((
                modifier_class_type.get(..PREFIX_LEN)?,
                modifier_class_type.get(PREFIX_LEN..suffix_start)?,
                modifier_class_type.get(suffix_start..)?,
            ))
        });
    let Some((prefix, inner, suffix)) = split else {
        return modifier_class_type.to_string();
    };

    let kept: Vec<&str> = inner
        .split('|')
        .filter(|part| !components_to_remove.contains(part))
        .collect();
    if kept.is_empty() {
        "()".to_string()
    } else {
        format!("{prefix}{}{suffix}", kept.join("|"))
    }
}
1270
/// Keeps only the listed components of a `((?:A|B|…))` alternation.
///
/// The first four bytes and the last two bytes of `modifier_class_type` are
/// treated as the wrapper and preserved; the text in between is split on `|`
/// and only parts equal to one of `components_to_keep` survive. When nothing
/// survives the result collapses to `"()"`.
///
/// Input too short to contain the wrapper (for example an already collapsed
/// `"()"`), or whose wrapper offsets do not fall on character boundaries, is
/// returned unchanged instead of panicking.
fn retain_components(modifier_class_type: &str, components_to_keep: &[&str]) -> String {
    const PREFIX_LEN: usize = 4; // length of "((?:"
    const SUFFIX_LEN: usize = 2; // length of "))"

    // Checked slicing: `None` means the wrapper cannot be split off safely.
    let split = modifier_class_type
        .len()
        .checked_sub(SUFFIX_LEN)
        .filter(|&suffix_start| suffix_start >= PREFIX_LEN)
        .and_then(|suffix_start| {
            Some((
                modifier_class_type.get(..PREFIX_LEN)?,
                modifier_class_type.get(PREFIX_LEN..suffix_start)?,
                modifier_class_type.get(suffix_start..)?,
            ))
        });
    let Some((prefix, inner, suffix)) = split else {
        return modifier_class_type.to_string();
    };

    let kept: Vec<&str> = inner
        .split('|')
        .filter(|part| components_to_keep.contains(part))
        .collect();
    if kept.is_empty() {
        "()".to_string()
    } else {
        format!("{prefix}{}{suffix}", kept.join("|"))
    }
}
1285
/// Builds the repeating expression used for multi-group tokens.
///
/// `object` is repeated one or more times, interleaved with whitespace:
/// `(?:object|\s)+`, made lazy (`+?`) unless `is_greedy_matching` is set and
/// wrapped in a capturing group when `is_capturing_group` is set. For
/// non-strict classes one layer of enclosing parentheses is first peeled off
/// `object`.
fn apply_multigroup_class(
    object: &str,
    is_capturing_group: bool,
    is_strict_class: bool,
    is_greedy_matching: bool,
) -> String {
    let unwrapped = if is_strict_class {
        None
    } else {
        object
            .strip_prefix('(')
            .and_then(|rest| rest.strip_suffix(')'))
    };
    let inner = unwrapped.unwrap_or(object);
    let quantifier = if is_greedy_matching { "+" } else { "+?" };

    let repeated = format!(r"(?:{inner}|\s){quantifier}");
    if is_capturing_group {
        format!("({repeated})")
    } else {
        repeated
    }
}
1308
1309fn apply_suffix_modifiers(
1310    base_pattern: Option<String>,
1311    segment: &TelSegment,
1312    class_type: Option<&str>,
1313    resolved_class: Option<&str>,
1314) -> Option<String> {
1315    let is_multi_group = segment.token_info.flags.multi_group;
1316    let is_optional = segment.token_info.flags.optional;
1317    let mut result = if is_multi_group {
1318        let source = class_type.or(resolved_class)?;
1319        Some(apply_multigroup_class(
1320            source,
1321            true,
1322            segment.type_modifiers.strict,
1323            segment.type_modifiers.greedy,
1324        ))
1325    } else {
1326        base_pattern
1327    };
1328    if is_optional {
1329        result = result.map(|pattern| {
1330            if pattern.ends_with(')') {
1331                format!("{pattern}?")
1332            } else {
1333                format!("({pattern})?")
1334            }
1335        });
1336    }
1337    result
1338}
1339
1340pub(crate) fn apply_segment_to_class_type(
1341    segment: &TelSegment,
1342    class_type: Option<&str>,
1343) -> Option<String> {
1344    let modifier = segment.token_info.modifier.as_deref();
1345    let flags = segment.token_info.flags;
1346    let has_type_signal = segment.type_modifiers.alpha
1347        || segment.type_modifiers.numeric
1348        || segment.type_modifiers.extended
1349        || modifier.is_some_and(|value| value.contains(','));
1350
1351    if !has_type_signal
1352        && !segment.type_modifiers.strict
1353        && !segment.type_modifiers.greedy
1354        && !flags.multi_group
1355        && !flags.optional
1356    {
1357        return resolve_default_class_type(class_type);
1358    }
1359
1360    if modifier.is_none() {
1361        return resolve_default_class_type(class_type);
1362    }
1363
1364    let (modifier_class_type, _) = resolve_type_modifier_class(
1365        segment.type_modifiers,
1366        segment.token_info.modifier.as_deref(),
1367    );
1368    let modifier_class_type = apply_class_constraint(
1369        modifier_class_type,
1370        segment.class_constraint.as_ref(),
1371        segment.type_modifiers,
1372    )
1373    .ok()?;
1374
1375    let mut class_type = class_type.map(ToOwned::to_owned);
1376    if let Some(ref modifier_class_type) = modifier_class_type
1377        && class_type
1378            .as_deref()
1379            .is_some_and(|value| matches!(value, r"[\w]+" | r"[\w\-']+"))
1380        && modifier.is_some_and(|value| value.chars().any(|c| matches!(c, '@' | '#' | '%' | ',')))
1381    {
1382        class_type = Some(modifier_class_type.clone());
1383    }
1384
1385    let base_pattern = class_type.clone().or_else(|| modifier_class_type.clone());
1386    apply_suffix_modifiers(
1387        base_pattern,
1388        segment,
1389        class_type.as_deref(),
1390        modifier_class_type.as_deref(),
1391    )
1392}
1393
/// Counts the capturing groups opened by `pattern`.
///
/// The scan understands enough regex syntax to avoid the common miscounts:
///
/// * a backslash escapes the following character, so `\(` is not a group;
/// * parentheses inside a character class (`[(]`, `[^)(]`, including a
///   leading literal `]` and POSIX classes such as `[[:alpha:]]`) are literal;
/// * `(?:…)`, lookarounds (`(?=`, `(?!`, `(?<=`, `(?<!`) and other `(?…`
///   constructs do not capture, and neither do `(*VERB)` items;
/// * named groups `(?<name>…)`, `(?P<name>…)` and `(?'name'…)` do capture.
///
/// `\Q…\E` quoting and branch-reset groups (`(?|…)`) are not modelled.
pub(crate) fn count_capturing_groups(pattern: &str) -> usize {
    // Every character of interest is ASCII, so scanning bytes is safe: UTF-8
    // continuation bytes can never be mistaken for a metacharacter.
    let bytes = pattern.as_bytes();
    let mut count = 0_usize;
    let mut in_class = false;
    let mut index = 0_usize;

    while let Some(&byte) = bytes.get(index) {
        match byte {
            b'\\' => {
                // Skip the escaped character, inside or outside a class.
                index += 2;
                continue;
            }
            b'[' if !in_class => {
                in_class = true;
                index += 1;
                if bytes.get(index) == Some(&b'^') {
                    index += 1;
                }
                // A `]` directly after `[` or `[^` is a literal member.
                if bytes.get(index) == Some(&b']') {
                    index += 1;
                }
                continue;
            }
            b'[' if bytes.get(index + 1) == Some(&b':') => {
                // Possible POSIX class inside a character class: step over
                // `[:name:]` so its `]` does not end the enclosing class.
                let name_len = bytes[index + 2..]
                    .iter()
                    .take_while(|candidate| {
                        candidate.is_ascii_alphabetic() || **candidate == b'^'
                    })
                    .count();
                if bytes[index + 2 + name_len..].starts_with(b":]") {
                    index += name_len + 4;
                    continue;
                }
            }
            b']' if in_class => in_class = false,
            b'(' if !in_class && opens_capturing_group(&bytes[index + 1..]) => count += 1,
            _ => {}
        }
        index += 1;
    }

    count
}

/// Decides whether a `(` followed by `after_paren` starts a capturing group.
fn opens_capturing_group(after_paren: &[u8]) -> bool {
    match after_paren {
        // `(*VERB)` control items are not groups.
        [b'*', ..] => false,
        // Named groups: `(?P<name>` and `(?'name'`.
        [b'?', b'P', b'<', ..] | [b'?', b'\'', ..] => true,
        // `(?<name>` captures; `(?<=` and `(?<!` are lookbehinds.
        [b'?', b'<', next, ..] => !matches!(*next, b'=' | b'!'),
        // Every other `(?…` construct is non-capturing.
        [b'?', ..] => false,
        _ => true,
    }
}
1423
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenisation keeps `{{…}}` literal blocks and modifier-bearing groups
    /// intact, and unescapes `\(` / `\)` into standalone tokens.
    #[test]
    fn test_split_parse_tokens_preserves_literal_blocks_and_modifiers() {
        let literal_block_tokens = split_parse_tokens("{{Unit}} <<UNIT#?>>");
        assert_eq!(literal_block_tokens, vec!["{{Unit}}", " ", "<<UNIT#?>>"]);

        let escaped_paren_tokens = split_parse_tokens(r"ALPHA \(<<TITLE>>\) ALPHA");
        assert_eq!(
            escaped_paren_tokens,
            vec!["ALPHA", " ", "(", "<<TITLE>>", ")", " ", "ALPHA"]
        );
    }

    /// Compiling a three-token pattern records the variable name, type
    /// modifiers, quantity and explicit class of each segment.
    #[test]
    fn test_compile_pattern_preserves_segment_semantics() {
        let compiled = CompiledPattern::compile("<<CIVIC#>> <<STREET@+>> <<TYPE::STREETTYPE>>")
            .expect("pattern compiles");
        let segments = &compiled.parsed.segments;

        assert_eq!(compiled.token_info.len(), 3);
        assert_eq!(segments.len(), 3);
        assert_eq!(segments[0].token_info.var_name.as_deref(), Some("CIVIC"));
        assert!(segments[1].type_modifiers.alpha);
        assert_eq!(segments[1].quantity, Quantity::OneOrMore);

        let expected_constraint = Some(ClassConstraint::Explicit("STREETTYPE".to_string()));
        assert_eq!(segments[2].class_constraint, expected_constraint);
    }

    /// Bracketed class filters end up in the AST as exclusion and inclusion
    /// constraints respectively.
    #[test]
    fn test_compile_pattern_tracks_class_filters_in_ast() {
        let compiled =
            CompiledPattern::compile("<<CIVIC@#%[!@]>> <<STREET[ALPHA|ALPHA_EXTENDED]>>")
                .expect("pattern compiles");
        let segments = &compiled.parsed.segments;

        let expected_excluded = Some(ClassConstraint::Excluded(vec!["!@".to_string()]));
        assert_eq!(segments[0].class_constraint, expected_excluded);

        let expected_included = Some(ClassConstraint::Included(vec![
            "ALPHA".to_string(),
            "ALPHA_EXTENDED".to_string(),
        ]));
        assert_eq!(segments[1].class_constraint, expected_included);
    }

    /// A trailing `$` on a bare class is still accepted: the segment is
    /// marked greedy while its quantity stays `Required`.
    #[test]
    fn test_compile_pattern_allows_legacy_greedy_without_multi_group() {
        let compiled = CompiledPattern::compile("CITY$").expect("legacy pattern should compile");
        let first_segment = &compiled.parsed.segments[0];

        assert!(first_segment.type_modifiers.greedy);
        assert_eq!(first_segment.quantity, Quantity::Required);
    }
}
tokmat/tel.rs

tokmat/
tel.rs