perl_regex/
lib.rs

1//! Perl regex validation and analysis
2//!
3//! This module provides tools to validate Perl regular expressions
4//! and detect potential security or performance issues like catastrophic backtracking.
5
6use thiserror::Error;
7
8/// Error type for Perl regex validation failures.
9#[derive(Error, Debug, Clone, PartialEq)]
10pub enum RegexError {
11    /// Syntax error at a specific byte offset in the regex pattern.
12    #[error("{message} at offset {offset}")]
13    Syntax {
14        /// Human-readable description of the syntax issue.
15        message: String,
16        /// Byte offset where the error was detected.
17        offset: usize,
18    },
19}
20
21impl RegexError {
22    /// Create a new syntax error with a message and byte offset.
23    pub fn syntax(message: impl Into<String>, offset: usize) -> Self {
24        RegexError::Syntax { message: message.into(), offset }
25    }
26}
27
/// Validator for Perl regular expressions to prevent security and performance issues.
#[derive(Debug, Clone)]
pub struct RegexValidator {
    /// Maximum nesting depth allowed for lookbehind and branch reset groups.
    max_nesting: usize,
    /// Maximum number of `\p{...}` / `\P{...}` Unicode property escapes per pattern.
    max_unicode_properties: usize,
}
33
34impl Default for RegexValidator {
35    fn default() -> Self {
36        Self::new()
37    }
38}
39
40impl RegexValidator {
41    /// Create a new validator with default safety limits
42    pub fn new() -> Self {
43        Self {
44            // Default limits from issue #461
45            max_nesting: 10,
46            // Limit from issue #460
47            max_unicode_properties: 50,
48        }
49    }
50
51    /// Validate a regex pattern for potential performance or security risks
52    pub fn validate(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
53        self.check_complexity(pattern, start_pos)
54    }
55
56    /// Check if the pattern contains embedded code constructs (?{...}) or (??{...})
57    pub fn detects_code_execution(&self, pattern: &str) -> bool {
58        let bytes = pattern.as_bytes();
59        let mut i = 0;
60        let len = bytes.len();
61        while i < len {
62            let ch = bytes[i];
63            if ch == b'\\' {
64                i += 2; // skip escaped
65                continue;
66            }
67            if ch == b'[' {
68                // Skip character class content so literals like [(?{] are not
69                // misclassified as embedded code execution.
70                i += 1;
71                while i < len {
72                    let class_ch = bytes[i];
73                    if class_ch == b'\\' {
74                        i += 2; // skip escaped char inside class
75                    } else if class_ch == b']' {
76                        i += 1;
77                        break;
78                    } else {
79                        i += 1;
80                    }
81                }
82                continue;
83            }
84            if ch == b'(' {
85                if i + 1 < len && bytes[i + 1] == b'?' {
86                    i += 2; // consume '(' and '?'
87                    // Check for { or ?{
88                    if i < len {
89                        if bytes[i] == b'{' {
90                            return true; // (?{
91                        } else if bytes[i] == b'?' {
92                            if i + 1 < len && bytes[i + 1] == b'{' {
93                                return true; // (??{
94                            }
95                        }
96                    }
97                    continue;
98                }
99            }
100            i += 1;
101        }
102        false
103    }
104
105    /// Check for nested quantifiers that can cause catastrophic backtracking
106    /// e.g. (a+)+, (a*)*, (a?)*
107    pub fn detect_nested_quantifiers(&self, pattern: &str) -> bool {
108        // This is a heuristic check for nested quantifiers
109        // It looks for a quantifier character following a group that ends with a quantifier
110        // e.g. ")+" in "...)+"
111        // Real implementation would need a full regex parser, but this heuristic
112        // covers common cases like (a+)+
113
114        let bytes = pattern.as_bytes();
115        let mut i = 0;
116        let len = bytes.len();
117        let mut group_stack = Vec::new();
118
119        // Track the last significant character index and its type
120        // Type: 0=other, 1=quantifier, 2=group_end
121        let mut last_type = 0;
122
123        while i < len {
124            let ch = bytes[i];
125            match ch {
126                b'\\' => {
127                    i += 2; // skip escaped
128                    last_type = 0;
129                    continue;
130                }
131                b'(' => {
132                    // Check if non-capturing or other special group
133                    if i + 1 < len && bytes[i + 1] == b'?' {
134                        i += 2; // consume '(' and '?'
135                        // Skip group-type specifier so it doesn't reach the
136                        // quantifier match arm (mirrors check_complexity logic)
137                        if i < len
138                            && matches!(
139                                bytes[i],
140                                b':' | b'=' | b'!' | b'<' | b'>' | b'|' | b'P' | b'#'
141                            )
142                        {
143                            i += 1;
144                        }
145                    } else {
146                        i += 1;
147                    }
148                    group_stack.push(false); // false = no quantifier inside yet
149                    last_type = 0;
150                    continue;
151                }
152                b')' => {
153                    if let Some(has_quantifier) = group_stack.pop() {
154                        if has_quantifier {
155                            last_type = 2; // group end with internal quantifier
156                        } else {
157                            last_type = 0;
158                        }
159                    }
160                }
161                b'+' | b'*' | b'?' | b'{' => {
162                    // If we just closed a group that had a quantifier inside,
163                    // and now we see another quantifier, that's a nested quantifier!
164                    if last_type == 2 {
165                        // Check if it's really a quantifier or literal {
166                        if ch == b'{' {
167                            // Only count as quantifier if it looks like {n} or {n,m}.
168                            let mut peek_i = i + 1;
169                            if Self::is_brace_quantifier(bytes, &mut peek_i) {
170                                return true;
171                            } else {
172                                // Important fix: If it's not a brace quantifier, do NOT
173                                // advance i using peek_i. It's just a literal '{'
174                                last_type = 0;
175                                i += 1;
176                                continue;
177                            }
178                        } else {
179                            return true;
180                        }
181                    }
182
183                    // Mark current group as having a quantifier
184                    if let Some(last) = group_stack.last_mut() {
185                        *last = true;
186                    }
187                    last_type = 1;
188                }
189                _ => {
190                    last_type = 0;
191                }
192            }
193            i += 1;
194        }
195        false
196    }
197
198    fn is_brace_quantifier(bytes: &[u8], i: &mut usize) -> bool {
199        // Require at least one digit after '{'
200        let mut has_digit = false;
201        let mut has_comma = false;
202        let len = bytes.len();
203
204        while *i < len {
205            let ch = bytes[*i];
206            *i += 1;
207            if ch.is_ascii_digit() {
208                has_digit = true;
209            } else if ch == b',' && !has_comma {
210                has_comma = true;
211            } else if ch == b'}' && has_digit {
212                return true;
213            } else {
214                break;
215            }
216        }
217
218        false // Should have returned true at '}' if valid
219    }
220
221    fn check_complexity(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
222        // NOTE: Nested quantifier detection (detect_nested_quantifiers) is intentionally
223        // NOT called here. The heuristic produces too many false positives on valid Perl
224        // patterns such as (?:/\.)+, (\w+)*, (?:pattern)+. Callers that want an advisory
225        // check can invoke detect_nested_quantifiers() directly and surface the result
226        // as a non-fatal diagnostic.
227
228        let bytes = pattern.as_bytes();
229        let mut i = 0;
230        let len = bytes.len();
231
232        // Stack stores the type of the current group
233        let mut stack: Vec<GroupType> = Vec::new();
234        let mut unicode_property_count = 0;
235
236        while i < len {
237            let ch = bytes[i];
238            match ch {
239                b'\\' => {
240                    // Check for escaped character
241                    if i + 1 < len {
242                        let next_char = bytes[i + 1];
243                        match next_char {
244                            b'p' | b'P' => {
245                                // Unicode property start \p or \P
246                                // We consume the 'p'/'P'
247                                i += 2;
248
249                                // Check if it's followed by {
250                                if i < len && bytes[i] == b'{' {
251                                    unicode_property_count += 1;
252                                    if unicode_property_count > self.max_unicode_properties {
253                                        return Err(RegexError::syntax(
254                                            "Too many Unicode properties in regex (max 50)",
255                                            start_pos + i - 2, // approximate original idx
256                                        ));
257                                    }
258                                }
259                                continue;
260                            }
261                            _ => {
262                                // Just skip other escaped chars
263                                i += 2;
264                                continue;
265                            }
266                        }
267                    }
268                }
269                b'[' => {
270                    // Need to skip character classes
271                    i += 1;
272                    while i < len {
273                        if bytes[i] == b'\\' {
274                            i += 2;
275                        } else if bytes[i] == b']' {
276                            break;
277                        } else {
278                            i += 1;
279                        }
280                    }
281                }
282                b'(' => {
283                    let mut group_type = GroupType::Normal;
284
285                    // Check for extension syntax (?...)
286                    if i + 1 < len && bytes[i + 1] == b'?' {
287                        i += 2; // consume '(' and '?'
288
289                        // Check for < (lookbehind or named capture)
290                        if i < len && bytes[i] == b'<' {
291                            i += 1; // consume <
292
293                            // Check for = or ! (lookbehind)
294                            if i < len && (bytes[i] == b'=' || bytes[i] == b'!') {
295                                i += 1; // consume = or !
296                                group_type = GroupType::Lookbehind;
297                            }
298                            // Otherwise it's likely a named capture (?<name>...) or condition (?<...)
299                            // which we treat as a normal group
300                        } else if i < len && bytes[i] == b'|' {
301                            i += 1; // consume |
302                            group_type = GroupType::BranchReset { branch_count: 1 };
303                        }
304                    } else {
305                        i += 1;
306                    }
307
308                    match group_type {
309                        GroupType::Lookbehind => {
310                            // Calculate current lookbehind depth
311                            let lookbehind_depth =
312                                stack.iter().filter(|g| matches!(g, GroupType::Lookbehind)).count();
313                            if lookbehind_depth >= self.max_nesting {
314                                return Err(RegexError::syntax(
315                                    "Regex lookbehind nesting too deep",
316                                    start_pos + i - 1, // rough idx
317                                ));
318                            }
319                        }
320                        GroupType::BranchReset { .. } => {
321                            // Calculate current branch reset nesting
322                            let reset_depth = stack
323                                .iter()
324                                .filter(|g| matches!(g, GroupType::BranchReset { .. }))
325                                .count();
326                            if reset_depth >= self.max_nesting {
327                                // Use same nesting limit for now
328                                return Err(RegexError::syntax(
329                                    "Regex branch reset nesting too deep",
330                                    start_pos + i - 1,
331                                ));
332                            }
333                        }
334                        _ => {}
335                    }
336                    stack.push(group_type);
337                    continue;
338                }
339                b'|' => {
340                    // Check if we are in a branch reset group
341                    if let Some(GroupType::BranchReset { branch_count }) = stack.last_mut() {
342                        *branch_count += 1;
343                        if *branch_count > 50 {
344                            // Max 50 branches
345                            return Err(RegexError::syntax(
346                                "Too many branches in branch reset group (max 50)",
347                                start_pos + i,
348                            ));
349                        }
350                    }
351                }
352                b')' => {
353                    // Pop group from stack
354                    stack.pop();
355                }
356                _ => {}
357            }
358            i += 1;
359        }
360
361        Ok(())
362    }
363}
364
/// Kind of group currently open while scanning a pattern in `check_complexity`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum GroupType {
    /// Any group that is neither a lookbehind nor a branch reset.
    Normal,
    /// Lookbehind assertion: `(?<=...)` or `(?<!...)`.
    Lookbehind,
    /// Branch reset group `(?|...)`.
    BranchReset {
        /// Number of alternatives seen so far in this group (starts at 1).
        branch_count: usize,
    },
}
370
/// A named capture group extracted from a regex pattern.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CaptureGroup {
    /// The capture group name from `(?<name>...)`.
    pub name: String,
    /// One-based capture index (counting all capturing groups left to right).
    pub index: usize,
    /// The sub-pattern inside the capture group.
    pub pattern: String,
}
382
/// Analysis utilities for Perl regex patterns: capture extraction and hover text.
///
/// This is a stateless unit type; all functionality is exposed through
/// associated functions.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct RegexAnalyzer;
385
386impl RegexAnalyzer {
387    /// Extract all named capture groups from a Perl regex pattern.
388    ///
389    /// Scans the pattern for `(?<name>...)` groups and returns them in left-to-right
390    /// order. Non-capturing groups (`(?:...)`), lookaheads, and lookbehinds do not
391    /// increment the capture index. Escaped parentheses (`\(`) are skipped.
392    ///
393    /// # Example
394    /// ```
395    /// use perl_regex::RegexAnalyzer;
396    /// let caps = RegexAnalyzer::extract_named_captures("(?<year>\\d{4})-(?<month>\\d{2})");
397    /// assert_eq!(caps.len(), 2);
398    /// assert_eq!(caps[0].name, "year");
399    /// assert_eq!(caps[0].index, 1);
400    /// ```
401    pub fn extract_named_captures(pattern: &str) -> Vec<CaptureGroup> {
402        let mut result = Vec::new();
403        let mut capture_index = 0usize;
404        let bytes = pattern.as_bytes();
405        let len = bytes.len();
406        let mut i = 0;
407
408        while i < len {
409            // Skip escaped characters.
410            if bytes[i] == b'\\' {
411                i += 2;
412                continue;
413            }
414
415            // Skip character classes [...] entirely.
416            if bytes[i] == b'[' {
417                i += 1;
418                while i < len {
419                    if bytes[i] == b'\\' {
420                        i += 2;
421                    } else if bytes[i] == b']' {
422                        i += 1;
423                        break;
424                    } else {
425                        i += 1;
426                    }
427                }
428                continue;
429            }
430
431            if bytes[i] == b'(' {
432                i += 1;
433
434                // Determine the group kind.
435                if i < len && bytes[i] == b'?' {
436                    i += 1; // consume '?'
437
438                    if i < len && bytes[i] == b'<' {
439                        i += 1; // consume '<'
440
441                        // Lookbehind: (?<= or (?<!  — not a capture.
442                        if i < len && (bytes[i] == b'=' || bytes[i] == b'!') {
443                            i += 1;
444                            continue;
445                        }
446
447                        if let Some((name, next_pos)) =
448                            parse_named_capture_name_from(bytes, i, b'>')
449                        {
450                            capture_index += 1;
451                            i = next_pos;
452
453                            // Collect the sub-pattern up to the matching ')'.
454                            let pattern_start = i;
455                            let mut depth = 1usize;
456                            while i < len && depth > 0 {
457                                if bytes[i] == b'\\' {
458                                    i += 2;
459                                    continue;
460                                }
461                                if bytes[i] == b'[' {
462                                    i += 1;
463                                    while i < len {
464                                        if bytes[i] == b'\\' {
465                                            i += 2;
466                                        } else if bytes[i] == b']' {
467                                            i += 1;
468                                            break;
469                                        } else {
470                                            i += 1;
471                                        }
472                                    }
473                                    continue;
474                                }
475                                if bytes[i] == b'(' {
476                                    depth += 1;
477                                } else if bytes[i] == b')' {
478                                    depth -= 1;
479                                }
480                                i += 1;
481                            }
482                            // The ')' was consumed above; sub-pattern ends before it.
483                            let sub: String = if i > 0 && pattern_start < i - 1 {
484                                // Since we parsed byte by byte matching ASCII mostly,
485                                // the slice boundaries should be valid UTF-8.
486                                // If not, String::from_utf8_lossy covers it safely.
487                                String::from_utf8_lossy(&bytes[pattern_start..i - 1]).into_owned()
488                            } else {
489                                String::new()
490                            };
491
492                            result.push(CaptureGroup { name, index: capture_index, pattern: sub });
493                            continue;
494                        }
495                    } else if i < len && bytes[i] == b'\'' {
496                        if let Some((name, next_pos)) =
497                            parse_named_capture_name(bytes, i, b'\'', b'\'')
498                        {
499                            capture_index += 1;
500                            i = next_pos;
501
502                            // Collect the sub-pattern up to the matching ')'.
503                            let pattern_start = i;
504                            let mut depth = 1usize;
505                            while i < len && depth > 0 {
506                                if bytes[i] == b'\\' {
507                                    i += 2;
508                                    continue;
509                                }
510                                if bytes[i] == b'[' {
511                                    i += 1;
512                                    while i < len {
513                                        if bytes[i] == b'\\' {
514                                            i += 2;
515                                        } else if bytes[i] == b']' {
516                                            i += 1;
517                                            break;
518                                        } else {
519                                            i += 1;
520                                        }
521                                    }
522                                    continue;
523                                }
524                                if bytes[i] == b'(' {
525                                    depth += 1;
526                                } else if bytes[i] == b')' {
527                                    depth -= 1;
528                                }
529                                i += 1;
530                            }
531                            // The ')' was consumed above; sub-pattern ends before it.
532                            let sub: String = if i > 0 && pattern_start < i - 1 {
533                                String::from_utf8_lossy(&bytes[pattern_start..i - 1]).into_owned()
534                            } else {
535                                String::new()
536                            };
537
538                            result.push(CaptureGroup { name, index: capture_index, pattern: sub });
539                            continue;
540                        }
541                    } else if i < len
542                        && matches!(bytes[i], b':' | b'=' | b'!' | b'>' | b'|' | b'P' | b'#')
543                    {
544                        // Non-capturing group: (?:...), (?=...), (?!...), (?|...), etc.
545                        // Does not increment capture_index; just move on (fall through to
546                        // normal scanning — the loop will handle nested parens naturally).
547                        continue;
548                    }
549                    // Any other (?...) — treat as non-capturing for index purposes.
550                    continue;
551                }
552
553                // Plain capturing group `(...)`.
554                capture_index += 1;
555                continue;
556            }
557
558            i += 1;
559        }
560
561        result
562    }
563
564    /// Generate hover text for a Perl regex pattern and its modifiers.
565    ///
566    /// Summarises the named capture groups and explains the meaning of each
567    /// modifier flag (`i`, `m`, `s`, `x`, `g`, `a`, `d`, `l`, `u`, `n`,
568    /// `p`, `r`, `c`, `o`, `e`). Repeated modifiers are deduplicated.
569    /// Unknown modifier flags are collected and appended as
570    /// `Unknown modifiers: \`…\`` at the end of the hover text.
571    ///
572    /// # Example
573    /// ```
574    /// use perl_regex::RegexAnalyzer;
575    /// let text = RegexAnalyzer::hover_text_for_regex("(?<id>\\d+)", "i");
576    /// assert!(text.contains("id"));
577    /// assert!(text.contains("case"));
578    /// ```
579    pub fn hover_text_for_regex(pattern: &str, modifiers: &str) -> String {
580        let mut parts: Vec<String> = Vec::new();
581
582        if !pattern.is_empty() {
583            parts.push(format!("Regex: `{pattern}`"));
584        }
585
586        // Named captures section.
587        let captures = Self::extract_named_captures(pattern);
588        if !captures.is_empty() {
589            parts.push("Named captures:".to_string());
590            for cap in &captures {
591                parts.push(format!(
592                    "  ${{{name}}} (capture {index}): `{pat}`",
593                    name = cap.name,
594                    index = cap.index,
595                    pat = cap.pattern,
596                ));
597            }
598        }
599
600        // Modifier explanations.
601        let mut seen_modifiers: Vec<char> = Vec::new();
602        let mut modifier_notes: Vec<&str> = Vec::new();
603        let mut unknown_modifiers: Vec<char> = Vec::new();
604        for modifier in modifiers.chars() {
605            if seen_modifiers.contains(&modifier) {
606                continue;
607            }
608            seen_modifiers.push(modifier);
609            match describe_modifier(modifier) {
610                Some(description) => modifier_notes.push(description),
611                None => {
612                    unknown_modifiers.push(modifier);
613                }
614            }
615        }
616
617        if !modifier_notes.is_empty() {
618            parts.push("Modifiers:".to_string());
619            for note in modifier_notes {
620                parts.push(format!("  {note}"));
621            }
622        }
623
624        if !unknown_modifiers.is_empty() {
625            let unknown: String = unknown_modifiers.into_iter().collect();
626            parts.push(format!("Unknown modifiers: `{unknown}`"));
627        }
628
629        parts.join("\n")
630    }
631}
632
/// Return a short human-readable explanation of a regex modifier flag,
/// or `None` when the flag is not one of the recognised modifiers.
fn describe_modifier(modifier: char) -> Option<&'static str> {
    let description = match modifier {
        'i' => "case-insensitive matching",
        'm' => "multiline mode: ^ and $ match line boundaries",
        's' => "single-line mode: dot matches newline",
        'x' => "extended mode: whitespace and comments allowed",
        'g' => "global: match all occurrences",
        'a' => "ASCII-safe character classes",
        'd' => "native platform character set semantics",
        'l' => "locale-dependent character semantics",
        'u' => "Unicode character semantics",
        'n' => "non-capturing by default for unnamed groups",
        'p' => "preserve string for ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH}",
        'r' => "non-destructive substitution result",
        'c' => "keep current match position for /g scans",
        'o' => "compile pattern only once",
        'e' => "evaluate replacement as code in substitutions",
        _ => return None,
    };
    Some(description)
}
653
/// Parse a delimited capture name such as `'name'` whose opening delimiter is
/// at `pos`.
///
/// Returns the name and the index just past the closing delimiter, or `None`
/// when `pos` is out of range, the byte at `pos` is not `open_delim`, the name
/// is empty, or no `close_delim` follows.
fn parse_named_capture_name(
    bytes: &[u8],
    pos: usize,
    open_delim: u8,
    close_delim: u8,
) -> Option<(String, usize)> {
    let (&first, tail) = bytes.get(pos..)?.split_first()?;
    if first != open_delim {
        return None;
    }

    let name_len = tail.iter().position(|&b| b == close_delim)?;
    if name_len == 0 {
        return None;
    }

    let name = String::from_utf8_lossy(&tail[..name_len]).into_owned();
    // `pos + 1` is where the name starts; skip the name and the closing delimiter.
    Some((name, pos + 1 + name_len + 1))
}
677
/// Parse a capture name that begins at `start` and runs up to `close_delim`.
///
/// Returns the name and the index just past the closing delimiter, or `None`
/// when `start` is out of range, the name is empty, or no `close_delim` follows.
fn parse_named_capture_name_from(
    bytes: &[u8],
    start: usize,
    close_delim: u8,
) -> Option<(String, usize)> {
    let tail = bytes.get(start..)?;
    match tail.iter().position(|&b| b == close_delim) {
        Some(name_len) if name_len > 0 => {
            let name = String::from_utf8_lossy(&tail[..name_len]).into_owned();
            Some((name, start + name_len + 1))
        }
        // Empty name, or the closing delimiter never appears.
        _ => None,
    }
}
699
700#[cfg(test)]
701mod tests {
702    use super::*;
703
704    // --- RegexError ---
705
706    #[test]
707    fn regex_error_syntax_stores_message_and_offset() {
708        let err = RegexError::syntax("unexpected char", 7);
709        match &err {
710            RegexError::Syntax { message, offset } => {
711                assert_eq!(message, "unexpected char");
712                assert_eq!(*offset, 7);
713            }
714        }
715        assert!(err.to_string().contains("7"));
716        assert!(err.to_string().contains("unexpected char"));
717    }
718
719    #[test]
720    fn regex_error_implements_clone_and_partialeq() {
721        let e1 = RegexError::syntax("msg", 3);
722        let e2 = e1.clone();
723        assert_eq!(e1, e2);
724    }
725
726    // --- RegexValidator::validate (valid patterns) ---
727
728    #[test]
729    fn validate_simple_pattern_ok() {
730        let v = RegexValidator::new();
731        assert!(v.validate("hello", 0).is_ok());
732        assert!(v.validate("", 0).is_ok());
733        assert!(v.validate("(a|b)+", 0).is_ok());
734    }
735
736    #[test]
737    fn validate_unicode_property_within_limit_ok() {
738        let v = RegexValidator::new();
739        // 50 unicode properties is the limit
740        let pattern = r"\p{L}".repeat(50);
741        assert!(v.validate(&pattern, 0).is_ok());
742    }
743
744    #[test]
745    fn validate_too_many_unicode_properties_errors() {
746        let v = RegexValidator::new();
747        let pattern = r"\p{L}".repeat(51);
748        let err = v.validate(&pattern, 0).unwrap_err();
749        assert!(err.to_string().contains("Unicode"));
750    }
751
752    #[test]
753    fn validate_unicode_property_offset_propagated() {
754        let v = RegexValidator::new();
755        let prefix = "x";
756        let pattern = format!("{}{}", prefix, r"\p{L}".repeat(51));
757        let err = v.validate(&pattern, 10).unwrap_err();
758        // The reported offset should be >= 10 (start_pos)
759        match err {
760            RegexError::Syntax { offset, .. } => assert!(offset >= 10),
761        }
762    }
763
764    #[test]
765    fn validate_lookbehind_within_limit_ok() {
766        let v = RegexValidator::new();
767        // 10 is the limit; 9 nested lookbehinds should be fine
768        let mut pattern = String::from("foo");
769        for _ in 0..9 {
770            pattern = format!("(?<={})", pattern);
771        }
772        assert!(v.validate(&pattern, 0).is_ok());
773    }
774
775    #[test]
776    fn validate_lookbehind_nesting_too_deep_errors() {
777        let v = RegexValidator::new();
778        // Build 11 nested lookbehinds to exceed the depth limit of 10
779        let mut pattern = String::from("a");
780        for _ in 0..11 {
781            pattern = format!("(?<={})", pattern);
782        }
783        let err = v.validate(&pattern, 0).unwrap_err();
784        assert!(err.to_string().contains("lookbehind") || err.to_string().contains("nesting"));
785    }
786
787    #[test]
788    fn validate_branch_reset_nesting_too_deep_errors() {
789        let v = RegexValidator::new();
790        let mut pattern = String::from("a");
791        for _ in 0..11 {
792            pattern = format!("(?|{})", pattern);
793        }
794        let err = v.validate(&pattern, 0).unwrap_err();
795        assert!(err.to_string().contains("branch reset") || err.to_string().contains("nesting"));
796    }
797
798    #[test]
799    fn validate_too_many_branches_in_reset_group_errors() {
800        let v = RegexValidator::new();
801        // 51 alternatives in one (?| ... ) group exceeds max 50 branches
802        let alts = (0u32..51).map(|i| format!("a{i}")).collect::<Vec<_>>().join("|");
803        let pattern = format!("(?|{alts})");
804        let err = v.validate(&pattern, 0).unwrap_err();
805        assert!(err.to_string().contains("branch") || err.to_string().contains("50"));
806    }
807
808    #[test]
809    fn validate_character_class_skipped() {
810        // `[(?{]` should not trigger embedded code detection in validate()
811        let v = RegexValidator::new();
812        assert!(v.validate("[(?{]", 0).is_ok());
813    }
814
815    // --- RegexValidator::detects_code_execution ---
816
817    #[test]
818    fn detects_code_execution_with_code_block() {
819        let v = RegexValidator::new();
820        assert!(v.detects_code_execution("(?{ print 'hi' })"));
821    }
822
823    #[test]
824    fn detects_code_execution_with_deferred_code_block() {
825        let v = RegexValidator::new();
826        assert!(v.detects_code_execution("(??{ some_code() })"));
827    }
828
829    #[test]
830    fn detects_code_execution_false_for_non_capturing() {
831        let v = RegexValidator::new();
832        assert!(!v.detects_code_execution("(?:foo)"));
833        assert!(!v.detects_code_execution("(?=ahead)"));
834        assert!(!v.detects_code_execution("(?!not)"));
835    }
836
837    #[test]
838    fn detects_code_execution_escaped_paren_not_detected() {
839        let v = RegexValidator::new();
840        assert!(!v.detects_code_execution(r"\(?{"));
841    }
842
843    #[test]
844    fn detects_code_execution_in_char_class_not_detected() {
845        let v = RegexValidator::new();
846        assert!(!v.detects_code_execution("[(?{]"));
847    }
848
849    #[test]
850    fn detects_code_execution_empty_pattern() {
851        let v = RegexValidator::new();
852        assert!(!v.detects_code_execution(""));
853    }
854
855    // --- RegexValidator::detect_nested_quantifiers ---
856
857    #[test]
858    fn detect_nested_quantifiers_finds_plus_plus() {
859        let v = RegexValidator::new();
860        assert!(v.detect_nested_quantifiers("(a+)+"));
861    }
862
863    #[test]
864    fn detect_nested_quantifiers_finds_star_star() {
865        let v = RegexValidator::new();
866        assert!(v.detect_nested_quantifiers("(a*)*"));
867    }
868
869    #[test]
870    fn detect_nested_quantifiers_finds_brace_quantifier() {
871        let v = RegexValidator::new();
872        assert!(v.detect_nested_quantifiers("(a+){2,5}"));
873    }
874
875    #[test]
876    fn detect_nested_quantifiers_safe_patterns() {
877        let v = RegexValidator::new();
878        assert!(!v.detect_nested_quantifiers("(abc)+")); // no inner quantifier
879        assert!(!v.detect_nested_quantifiers("[a-z]+")); // character class, not group
880        assert!(!v.detect_nested_quantifiers("a+b+")); // quantifiers outside groups
881    }
882
883    // --- RegexValidator::Default ---
884
885    #[test]
886    fn default_is_same_as_new() {
887        let v: RegexValidator = Default::default();
888        assert!(v.validate("simple", 0).is_ok());
889    }
890
891    // --- RegexAnalyzer::extract_named_captures ---
892
893    #[test]
894    fn extract_named_captures_angle_bracket_syntax() {
895        let caps = RegexAnalyzer::extract_named_captures(r"(?<year>\d{4})-(?<month>\d{2})");
896        assert_eq!(caps.len(), 2);
897        assert_eq!(caps[0].name, "year");
898        assert_eq!(caps[0].index, 1);
899        assert_eq!(caps[1].name, "month");
900        assert_eq!(caps[1].index, 2);
901    }
902
903    #[test]
904    fn extract_named_captures_single_quote_syntax() {
905        let caps = RegexAnalyzer::extract_named_captures(r"(?'name'\w+)");
906        assert_eq!(caps.len(), 1);
907        assert_eq!(caps[0].name, "name");
908        assert_eq!(caps[0].index, 1);
909    }
910
911    #[test]
912    fn extract_named_captures_no_captures() {
913        let caps = RegexAnalyzer::extract_named_captures(r"\d+\.\d+");
914        assert!(caps.is_empty());
915    }
916
917    #[test]
918    fn extract_named_captures_non_capturing_group_not_counted() {
919        let caps = RegexAnalyzer::extract_named_captures(r"(?:foo)(?<bar>baz)");
920        assert_eq!(caps.len(), 1);
921        assert_eq!(caps[0].name, "bar");
922        assert_eq!(caps[0].index, 1); // plain capturing groups before it still count
923    }
924
925    #[test]
926    fn extract_named_captures_lookbehind_not_counted() {
927        // (?<= ...) is lookbehind, not a named capture
928        let caps = RegexAnalyzer::extract_named_captures(r"(?<=foo)(?<word>\w+)");
929        assert_eq!(caps.len(), 1);
930        assert_eq!(caps[0].name, "word");
931    }
932
933    #[test]
934    fn extract_named_captures_escaped_paren_skipped() {
935        let caps = RegexAnalyzer::extract_named_captures(r"\((?<x>\d)\)");
936        assert_eq!(caps.len(), 1);
937        assert_eq!(caps[0].name, "x");
938    }
939
940    #[test]
941    fn extract_named_captures_stores_subpattern() {
942        let caps = RegexAnalyzer::extract_named_captures(r"(?<id>\d+)");
943        assert_eq!(caps.len(), 1);
944        assert_eq!(caps[0].pattern, r"\d+");
945    }
946
947    // --- RegexAnalyzer::hover_text_for_regex ---
948
949    #[test]
950    fn hover_text_includes_pattern_and_captures() {
951        let text = RegexAnalyzer::hover_text_for_regex(r"(?<id>\d+)", "i");
952        assert!(text.contains("id"));
953        assert!(text.contains("case"));
954    }
955
956    #[test]
957    fn hover_text_modifier_explanations() {
958        let text = RegexAnalyzer::hover_text_for_regex("foo", "imsx");
959        assert!(text.contains("case-insensitive"));
960        assert!(text.contains("multiline"));
961        assert!(text.contains("single-line"));
962        assert!(text.contains("extended"));
963    }
964
965    #[test]
966    fn hover_text_global_modifier() {
967        let text = RegexAnalyzer::hover_text_for_regex("foo", "g");
968        assert!(text.contains("global"));
969    }
970
971    #[test]
972    fn hover_text_no_modifiers() {
973        let text = RegexAnalyzer::hover_text_for_regex("hello", "");
974        assert!(text.contains("hello"));
975        assert!(!text.contains("Modifiers"));
976    }
977
978    #[test]
979    fn hover_text_empty_pattern() {
980        let text = RegexAnalyzer::hover_text_for_regex("", "");
981        assert!(text.is_empty());
982    }
983
984    #[test]
985    fn hover_text_unknown_modifier_ignored() {
986        let text = RegexAnalyzer::hover_text_for_regex("x", "z");
987        // z is not a known modifier, so no modifier section
988        assert!(!text.contains("Modifiers"));
989    }
990}
perl_regex/lib.rs

perl_regex/
lib.rs