Skip to main content

perl_regex/
lib.rs

1//! Perl regex validation and analysis
2//!
3//! This module provides tools to validate Perl regular expressions
4//! and detect potential security or performance issues like catastrophic backtracking.
5
6use thiserror::Error;
7
8/// Error type for Perl regex validation failures.
9#[derive(Error, Debug, Clone, PartialEq)]
10pub enum RegexError {
11    /// Syntax error at a specific byte offset in the regex pattern.
12    #[error("{message} at offset {offset}")]
13    Syntax {
14        /// Human-readable description of the syntax issue.
15        message: String,
16        /// Byte offset where the error was detected.
17        offset: usize,
18    },
19}
20
21impl RegexError {
22    /// Create a new syntax error with a message and byte offset.
23    pub fn syntax(message: impl Into<String>, offset: usize) -> Self {
24        RegexError::Syntax { message: message.into(), offset }
25    }
26}
27
28/// Validator for Perl regular expressions to prevent security and performance issues
29pub struct RegexValidator {
30    max_nesting: usize,
31    max_unicode_properties: usize,
32}
33
34impl RegexValidator {
35    /// Create a new validator with default safety limits
36    pub fn new() -> Self {
37        Self {
38            // Default limits from issue #461
39            max_nesting: 10,
40            // Limit from issue #460
41            max_unicode_properties: 50,
42        }
43    }
44
45    /// Validate a regex pattern for potential performance or security risks
46    pub fn validate(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
47        self.check_complexity(pattern, start_pos)
48    }
49
50    /// Check if the pattern contains embedded code constructs (?{...}) or (??{...})
51    pub fn detects_code_execution(&self, pattern: &str) -> bool {
52        let mut chars = pattern.char_indices().peekable();
53        while let Some((_, ch)) = chars.next() {
54            if ch == '\\' {
55                chars.next(); // skip escaped
56                continue;
57            }
58            if ch == '(' {
59                if let Some((_, '?')) = chars.peek() {
60                    chars.next(); // consume ?
61                    // Check for { or ?{
62                    if let Some((_, next)) = chars.peek() {
63                        if *next == '{' {
64                            return true; // (?{
65                        } else if *next == '?' {
66                            chars.next(); // consume second ?
67                            if let Some((_, '{')) = chars.peek() {
68                                return true; // (??{
69                            }
70                        }
71                    }
72                }
73            }
74        }
75        false
76    }
77
78    /// Check for nested quantifiers that can cause catastrophic backtracking
79    /// e.g. (a+)+, (a*)*, (a?)*
80    pub fn detect_nested_quantifiers(&self, pattern: &str) -> bool {
81        // This is a heuristic check for nested quantifiers
82        // It looks for a quantifier character following a group that ends with a quantifier
83        // e.g. ")+" in "...)+"
84        // Real implementation would need a full regex parser, but this heuristic
85        // covers common cases like (a+)+
86
87        let mut chars = pattern.char_indices().peekable();
88        let mut group_stack = Vec::new();
89
90        // Track the last significant character index and its type
91        // Type: 0=other, 1=quantifier, 2=group_end
92        let mut last_type = 0;
93
94        while let Some((_, ch)) = chars.next() {
95            match ch {
96                '\\' => {
97                    chars.next(); // skip escaped
98                    last_type = 0;
99                }
100                '(' => {
101                    // Check if non-capturing or other special group
102                    if let Some((_, '?')) = chars.peek() {
103                        chars.next(); // consume '?'
104                        // Skip group-type specifier so it doesn't reach the
105                        // quantifier match arm (mirrors check_complexity logic)
106                        if matches!(
107                            chars.peek(),
108                            Some((_, ':' | '=' | '!' | '<' | '>' | '|' | 'P' | '#'))
109                        ) {
110                            chars.next();
111                        }
112                    }
113                    group_stack.push(false); // false = no quantifier inside yet
114                    last_type = 0;
115                }
116                ')' => {
117                    if let Some(has_quantifier) = group_stack.pop() {
118                        if has_quantifier {
119                            last_type = 2; // group end with internal quantifier
120                        } else {
121                            last_type = 0;
122                        }
123                    }
124                }
125                '+' | '*' | '?' | '{' => {
126                    // If we just closed a group that had a quantifier inside,
127                    // and now we see another quantifier, that's a nested quantifier!
128                    if last_type == 2 {
129                        // Check if it's really a quantifier or literal {
130                        if ch == '{' {
131                            // Only count as quantifier if it looks like {n} or {n,m}
132                            // peek ahead... (simplified for now)
133                            return true; // Assume { is quantifier for safety heuristic
134                        } else {
135                            return true;
136                        }
137                    }
138
139                    // Mark current group as having a quantifier
140                    if let Some(last) = group_stack.last_mut() {
141                        *last = true;
142                    }
143                    last_type = 1;
144                }
145                _ => {
146                    last_type = 0;
147                }
148            }
149        }
150        false
151    }
152
153    fn check_complexity(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
154        // NOTE: Nested quantifier detection (detect_nested_quantifiers) is intentionally
155        // NOT called here. The heuristic produces too many false positives on valid Perl
156        // patterns such as (?:/\.)+, (\w+)*, (?:pattern)+. Callers that want an advisory
157        // check can invoke detect_nested_quantifiers() directly and surface the result
158        // as a non-fatal diagnostic.
159
160        let mut chars = pattern.char_indices().peekable();
161        // Stack stores the type of the current group
162        let mut stack: Vec<GroupType> = Vec::new();
163        let mut unicode_property_count = 0;
164
165        while let Some((idx, ch)) = chars.next() {
166            match ch {
167                '\\' => {
168                    // Check for escaped character
169                    if let Some((_, next_char)) = chars.peek() {
170                        match next_char {
171                            'p' | 'P' => {
172                                // Unicode property start \p or \P
173                                // We consume the 'p'/'P'
174                                chars.next();
175
176                                // Check if it's followed by {
177                                if let Some((_, '{')) = chars.peek() {
178                                    unicode_property_count += 1;
179                                    if unicode_property_count > self.max_unicode_properties {
180                                        return Err(RegexError::syntax(
181                                            "Too many Unicode properties in regex (max 50)",
182                                            start_pos + idx,
183                                        ));
184                                    }
185                                }
186                            }
187                            _ => {
188                                // Just skip other escaped chars
189                                chars.next();
190                            }
191                        }
192                    }
193                }
194                '(' => {
195                    let mut group_type = GroupType::Normal;
196
197                    // Check for extension syntax (?...)
198                    if let Some((_, '?')) = chars.peek() {
199                        chars.next(); // consume ?
200
201                        // Check for < (lookbehind or named capture)
202                        if let Some((_, '<')) = chars.peek() {
203                            chars.next(); // consume <
204
205                            // Check for = or ! (lookbehind)
206                            if matches!(chars.peek(), Some((_, '=')) | Some((_, '!'))) {
207                                chars.next(); // consume = or !
208                                group_type = GroupType::Lookbehind;
209                            }
210                            // Otherwise it's likely a named capture (?<name>...) or condition (?<...)
211                            // which we treat as a normal group
212                        } else if let Some((_, '|')) = chars.peek() {
213                            chars.next(); // consume |
214                            group_type = GroupType::BranchReset { branch_count: 1 };
215                        }
216                    }
217
218                    match group_type {
219                        GroupType::Lookbehind => {
220                            // Calculate current lookbehind depth
221                            let lookbehind_depth =
222                                stack.iter().filter(|g| matches!(g, GroupType::Lookbehind)).count();
223                            if lookbehind_depth >= self.max_nesting {
224                                return Err(RegexError::syntax(
225                                    "Regex lookbehind nesting too deep",
226                                    start_pos + idx,
227                                ));
228                            }
229                        }
230                        GroupType::BranchReset { .. } => {
231                            // Calculate current branch reset nesting
232                            let reset_depth = stack
233                                .iter()
234                                .filter(|g| matches!(g, GroupType::BranchReset { .. }))
235                                .count();
236                            if reset_depth >= self.max_nesting {
237                                // Use same nesting limit for now
238                                return Err(RegexError::syntax(
239                                    "Regex branch reset nesting too deep",
240                                    start_pos + idx,
241                                ));
242                            }
243                        }
244                        _ => {}
245                    }
246                    stack.push(group_type);
247                }
248                '|' => {
249                    // Check if we are in a branch reset group
250                    if let Some(GroupType::BranchReset { branch_count }) = stack.last_mut() {
251                        *branch_count += 1;
252                        if *branch_count > 50 {
253                            // Max 50 branches
254                            return Err(RegexError::syntax(
255                                "Too many branches in branch reset group (max 50)",
256                                start_pos + idx,
257                            ));
258                        }
259                    }
260                }
261                ')' => {
262                    stack.pop();
263                }
264                '[' => {
265                    // Skip character class [ ... ]
266                    // Need to handle escaping inside []
267                    while let Some((_, c)) = chars.next() {
268                        if c == '\\' {
269                            chars.next();
270                        } else if c == ']' {
271                            break;
272                        }
273                    }
274                }
275                _ => {}
276            }
277        }
278
279        Ok(())
280    }
281}
282
283enum GroupType {
284    Normal,
285    Lookbehind,
286    BranchReset { branch_count: usize },
287}
288
289impl Default for RegexValidator {
290    fn default() -> Self {
291        Self::new()
292    }
293}
294
295/// A named capture group extracted from a Perl regex pattern.
296///
297/// Named captures use the `(?<name>...)` syntax introduced in Perl 5.10.
298/// Captured text is accessible via `$+{name}` or `$1`, `$2`, ... by index.
299#[derive(Debug, Clone)]
300pub struct CaptureGroup {
301    /// The capture group name from `(?<name>...)`.
302    pub name: String,
303    /// One-based capture index (counting all capturing groups left to right).
304    pub index: usize,
305    /// The sub-pattern inside the capture group.
306    pub pattern: String,
307}
308
309/// Analysis utilities for Perl regex patterns: capture extraction and hover text.
310pub struct RegexAnalyzer;
311
312impl RegexAnalyzer {
313    /// Extract all named capture groups from a Perl regex pattern.
314    ///
315    /// Scans the pattern for `(?<name>...)` groups and returns them in left-to-right
316    /// order. Non-capturing groups (`(?:...)`), lookaheads, and lookbehinds do not
317    /// increment the capture index. Escaped parentheses (`\(`) are skipped.
318    ///
319    /// # Example
320    /// ```
321    /// use perl_regex::RegexAnalyzer;
322    /// let caps = RegexAnalyzer::extract_named_captures("(?<year>\\d{4})-(?<month>\\d{2})");
323    /// assert_eq!(caps.len(), 2);
324    /// assert_eq!(caps[0].name, "year");
325    /// assert_eq!(caps[0].index, 1);
326    /// ```
327    pub fn extract_named_captures(pattern: &str) -> Vec<CaptureGroup> {
328        let mut result = Vec::new();
329        let mut capture_index = 0usize;
330        let chars: Vec<char> = pattern.chars().collect();
331        let len = chars.len();
332        let mut i = 0;
333
334        while i < len {
335            // Skip escaped characters.
336            if chars[i] == '\\' {
337                i += 2;
338                continue;
339            }
340
341            // Skip character classes [...] entirely.
342            if chars[i] == '[' {
343                i += 1;
344                while i < len {
345                    if chars[i] == '\\' {
346                        i += 2;
347                    } else if chars[i] == ']' {
348                        i += 1;
349                        break;
350                    } else {
351                        i += 1;
352                    }
353                }
354                continue;
355            }
356
357            if chars[i] == '(' {
358                i += 1;
359
360                // Determine the group kind.
361                if i < len && chars[i] == '?' {
362                    i += 1; // consume '?'
363
364                    if i < len && chars[i] == '<' {
365                        i += 1; // consume '<'
366
367                        // Lookbehind: (?<= or (?<!  — not a capture.
368                        if i < len && (chars[i] == '=' || chars[i] == '!') {
369                            i += 1;
370                            continue;
371                        }
372
373                        // Named capture (?<name>...) — collect the name.
374                        capture_index += 1;
375                        let name_start = i;
376                        while i < len && chars[i] != '>' {
377                            i += 1;
378                        }
379                        let name: String = chars[name_start..i].iter().collect();
380                        if i < len {
381                            i += 1; // consume '>'
382                        }
383
384                        // Collect the sub-pattern up to the matching ')'.
385                        let pattern_start = i;
386                        let mut depth = 1usize;
387                        while i < len && depth > 0 {
388                            if chars[i] == '\\' {
389                                i += 2;
390                                continue;
391                            }
392                            if chars[i] == '(' {
393                                depth += 1;
394                            } else if chars[i] == ')' {
395                                depth -= 1;
396                            }
397                            i += 1;
398                        }
399                        // The ')' was consumed above; sub-pattern ends before it.
400                        let sub: String = if i > 0 && pattern_start < i - 1 {
401                            chars[pattern_start..i - 1].iter().collect()
402                        } else {
403                            String::new()
404                        };
405
406                        result.push(CaptureGroup { name, index: capture_index, pattern: sub });
407                        continue;
408                    } else if i < len && matches!(chars[i], ':' | '=' | '!' | '>' | '|' | 'P' | '#')
409                    {
410                        // Non-capturing group: (?:...), (?=...), (?!...), (?|...), etc.
411                        // Does not increment capture_index; just move on (fall through to
412                        // normal scanning — the loop will handle nested parens naturally).
413                        continue;
414                    }
415                    // Any other (?...) — treat as non-capturing for index purposes.
416                    continue;
417                }
418
419                // Plain capturing group `(...)`.
420                capture_index += 1;
421                continue;
422            }
423
424            i += 1;
425        }
426
427        result
428    }
429
430    /// Generate hover text for a Perl regex pattern and its modifiers.
431    ///
432    /// Summarises the named capture groups and explains the meaning of each
433    /// modifier flag (`i`, `m`, `s`, `x`, `g`).
434    ///
435    /// # Example
436    /// ```
437    /// use perl_regex::RegexAnalyzer;
438    /// let text = RegexAnalyzer::hover_text_for_regex("(?<id>\\d+)", "i");
439    /// assert!(text.contains("id"));
440    /// assert!(text.contains("case"));
441    /// ```
442    pub fn hover_text_for_regex(pattern: &str, modifiers: &str) -> String {
443        let mut parts: Vec<String> = Vec::new();
444
445        if !pattern.is_empty() {
446            parts.push(format!("Regex: `{pattern}`"));
447        }
448
449        // Named captures section.
450        let captures = Self::extract_named_captures(pattern);
451        if !captures.is_empty() {
452            parts.push("Named captures:".to_string());
453            for cap in &captures {
454                parts.push(format!(
455                    "  ${{{name}}} (capture {index}): `{pat}`",
456                    name = cap.name,
457                    index = cap.index,
458                    pat = cap.pattern,
459                ));
460            }
461        }
462
463        // Modifier explanations.
464        let modifier_notes: Vec<&str> = modifiers
465            .chars()
466            .filter_map(|m| match m {
467                'i' => Some("case-insensitive matching"),
468                'm' => Some("multiline mode: ^ and $ match line boundaries"),
469                's' => Some("single-line mode: dot matches newline"),
470                'x' => Some("extended mode: whitespace and comments allowed"),
471                'g' => Some("global: match all occurrences"),
472                _ => None,
473            })
474            .collect();
475
476        if !modifier_notes.is_empty() {
477            parts.push("Modifiers:".to_string());
478            for note in modifier_notes {
479                parts.push(format!("  {note}"));
480            }
481        }
482
483        parts.join("\n")
484    }
485}