Skip to main content

perl_regex/
lib.rs

1//! Perl regex validation and analysis
2//!
3//! This module provides tools to validate Perl regular expressions
4//! and detect potential security or performance issues like catastrophic backtracking.
5
6use thiserror::Error;
7
8/// Error type for Perl regex validation failures.
9#[derive(Error, Debug, Clone, PartialEq)]
10pub enum RegexError {
11    /// Syntax error at a specific byte offset in the regex pattern.
12    #[error("{message} at offset {offset}")]
13    Syntax {
14        /// Human-readable description of the syntax issue.
15        message: String,
16        /// Byte offset where the error was detected.
17        offset: usize,
18    },
19}
20
21impl RegexError {
22    /// Create a new syntax error with a message and byte offset.
23    pub fn syntax(message: impl Into<String>, offset: usize) -> Self {
24        RegexError::Syntax { message: message.into(), offset }
25    }
26}
27
28/// Validator for Perl regular expressions to prevent security and performance issues
29pub struct RegexValidator {
30    max_nesting: usize,
31    max_unicode_properties: usize,
32}
33
34impl RegexValidator {
35    /// Create a new validator with default safety limits
36    pub fn new() -> Self {
37        Self {
38            // Default limits from issue #461
39            max_nesting: 10,
40            // Limit from issue #460
41            max_unicode_properties: 50,
42        }
43    }
44
45    /// Validate a regex pattern for potential performance or security risks
46    pub fn validate(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
47        self.check_complexity(pattern, start_pos)
48    }
49
50    /// Check if the pattern contains embedded code constructs (?{...}) or (??{...})
51    pub fn detects_code_execution(&self, pattern: &str) -> bool {
52        let mut chars = pattern.char_indices().peekable();
53        while let Some((_, ch)) = chars.next() {
54            if ch == '\\' {
55                chars.next(); // skip escaped
56                continue;
57            }
58            if ch == '(' {
59                if let Some((_, '?')) = chars.peek() {
60                    chars.next(); // consume ?
61                    // Check for { or ?{
62                    if let Some((_, next)) = chars.peek() {
63                        if *next == '{' {
64                            return true; // (?{
65                        } else if *next == '?' {
66                            chars.next(); // consume second ?
67                            if let Some((_, '{')) = chars.peek() {
68                                return true; // (??{
69                            }
70                        }
71                    }
72                }
73            }
74        }
75        false
76    }
77
78    /// Check for nested quantifiers that can cause catastrophic backtracking
79    /// e.g. (a+)+, (a*)*, (a?)*
80    pub fn detect_nested_quantifiers(&self, pattern: &str) -> bool {
81        // This is a heuristic check for nested quantifiers
82        // It looks for a quantifier character following a group that ends with a quantifier
83        // e.g. ")+" in "...)+"
84        // Real implementation would need a full regex parser, but this heuristic
85        // covers common cases like (a+)+
86
87        let mut chars = pattern.char_indices().peekable();
88        let mut group_stack = Vec::new();
89
90        // Track the last significant character index and its type
91        // Type: 0=other, 1=quantifier, 2=group_end
92        let mut last_type = 0;
93
94        while let Some((_, ch)) = chars.next() {
95            match ch {
96                '\\' => {
97                    chars.next(); // skip escaped
98                    last_type = 0;
99                }
100                '(' => {
101                    // Check if non-capturing or other special group
102                    if let Some((_, '?')) = chars.peek() {
103                        chars.next(); // consume '?'
104                        // Skip group-type specifier so it doesn't reach the
105                        // quantifier match arm (mirrors check_complexity logic)
106                        if matches!(
107                            chars.peek(),
108                            Some((_, ':' | '=' | '!' | '<' | '>' | '|' | 'P' | '#'))
109                        ) {
110                            chars.next();
111                        }
112                    }
113                    group_stack.push(false); // false = no quantifier inside yet
114                    last_type = 0;
115                }
116                ')' => {
117                    if let Some(has_quantifier) = group_stack.pop() {
118                        if has_quantifier {
119                            last_type = 2; // group end with internal quantifier
120                        } else {
121                            last_type = 0;
122                        }
123                    }
124                }
125                '+' | '*' | '?' | '{' => {
126                    // If we just closed a group that had a quantifier inside,
127                    // and now we see another quantifier, that's a nested quantifier!
128                    if last_type == 2 {
129                        // Check if it's really a quantifier or literal {
130                        if ch == '{' {
131                            // Only count as quantifier if it looks like {n} or {n,m}
132                            // peek ahead... (simplified for now)
133                            return true; // Assume { is quantifier for safety heuristic
134                        } else {
135                            return true;
136                        }
137                    }
138
139                    // Mark current group as having a quantifier
140                    if let Some(last) = group_stack.last_mut() {
141                        *last = true;
142                    }
143                    last_type = 1;
144                }
145                _ => {
146                    last_type = 0;
147                }
148            }
149        }
150        false
151    }
152
153    fn check_complexity(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
154        // NOTE: Nested quantifier detection (detect_nested_quantifiers) is intentionally
155        // NOT called here. The heuristic produces too many false positives on valid Perl
156        // patterns such as (?:/\.)+, (\w+)*, (?:pattern)+. Callers that want an advisory
157        // check can invoke detect_nested_quantifiers() directly and surface the result
158        // as a non-fatal diagnostic.
159
160        let mut chars = pattern.char_indices().peekable();
161        // Stack stores the type of the current group
162        let mut stack: Vec<GroupType> = Vec::new();
163        let mut unicode_property_count = 0;
164
165        while let Some((idx, ch)) = chars.next() {
166            match ch {
167                '\\' => {
168                    // Check for escaped character
169                    if let Some((_, next_char)) = chars.peek() {
170                        match next_char {
171                            'p' | 'P' => {
172                                // Unicode property start \p or \P
173                                // We consume the 'p'/'P'
174                                chars.next();
175
176                                // Check if it's followed by {
177                                if let Some((_, '{')) = chars.peek() {
178                                    unicode_property_count += 1;
179                                    if unicode_property_count > self.max_unicode_properties {
180                                        return Err(RegexError::syntax(
181                                            "Too many Unicode properties in regex (max 50)",
182                                            start_pos + idx,
183                                        ));
184                                    }
185                                }
186                            }
187                            _ => {
188                                // Just skip other escaped chars
189                                chars.next();
190                            }
191                        }
192                    }
193                }
194                '(' => {
195                    let mut group_type = GroupType::Normal;
196
197                    // Check for extension syntax (?...)
198                    if let Some((_, '?')) = chars.peek() {
199                        chars.next(); // consume ?
200
201                        // Check for < (lookbehind or named capture)
202                        if let Some((_, '<')) = chars.peek() {
203                            chars.next(); // consume <
204
205                            // Check for = or ! (lookbehind)
206                            if matches!(chars.peek(), Some((_, '=')) | Some((_, '!'))) {
207                                chars.next(); // consume = or !
208                                group_type = GroupType::Lookbehind;
209                            }
210                            // Otherwise it's likely a named capture (?<name>...) or condition (?<...)
211                            // which we treat as a normal group
212                        } else if let Some((_, '|')) = chars.peek() {
213                            chars.next(); // consume |
214                            group_type = GroupType::BranchReset { branch_count: 1 };
215                        }
216                    }
217
218                    match group_type {
219                        GroupType::Lookbehind => {
220                            // Calculate current lookbehind depth
221                            let lookbehind_depth =
222                                stack.iter().filter(|g| matches!(g, GroupType::Lookbehind)).count();
223                            if lookbehind_depth >= self.max_nesting {
224                                return Err(RegexError::syntax(
225                                    "Regex lookbehind nesting too deep",
226                                    start_pos + idx,
227                                ));
228                            }
229                        }
230                        GroupType::BranchReset { .. } => {
231                            // Calculate current branch reset nesting
232                            let reset_depth = stack
233                                .iter()
234                                .filter(|g| matches!(g, GroupType::BranchReset { .. }))
235                                .count();
236                            if reset_depth >= self.max_nesting {
237                                // Use same nesting limit for now
238                                return Err(RegexError::syntax(
239                                    "Regex branch reset nesting too deep",
240                                    start_pos + idx,
241                                ));
242                            }
243                        }
244                        _ => {}
245                    }
246                    stack.push(group_type);
247                }
248                '|' => {
249                    // Check if we are in a branch reset group
250                    if let Some(GroupType::BranchReset { branch_count }) = stack.last_mut() {
251                        *branch_count += 1;
252                        if *branch_count > 50 {
253                            // Max 50 branches
254                            return Err(RegexError::syntax(
255                                "Too many branches in branch reset group (max 50)",
256                                start_pos + idx,
257                            ));
258                        }
259                    }
260                }
261                ')' => {
262                    stack.pop();
263                }
264                '[' => {
265                    // Skip character class [ ... ]
266                    // Need to handle escaping inside []
267                    while let Some((_, c)) = chars.next() {
268                        if c == '\\' {
269                            chars.next();
270                        } else if c == ']' {
271                            break;
272                        }
273                    }
274                }
275                _ => {}
276            }
277        }
278
279        Ok(())
280    }
281}
282
283enum GroupType {
284    Normal,
285    Lookbehind,
286    BranchReset { branch_count: usize },
287}
288
289impl Default for RegexValidator {
290    fn default() -> Self {
291        Self::new()
292    }
293}