perl_regex/lib.rs
1//! Perl regex validation and analysis
2//!
3//! This module provides tools to validate Perl regular expressions
4//! and detect potential security or performance issues like catastrophic backtracking.
5
6use thiserror::Error;
7
8/// Error type for Perl regex validation failures.
9#[derive(Error, Debug, Clone, PartialEq)]
10pub enum RegexError {
11 /// Syntax error at a specific byte offset in the regex pattern.
12 #[error("{message} at offset {offset}")]
13 Syntax {
14 /// Human-readable description of the syntax issue.
15 message: String,
16 /// Byte offset where the error was detected.
17 offset: usize,
18 },
19}
20
21impl RegexError {
22 /// Create a new syntax error with a message and byte offset.
23 pub fn syntax(message: impl Into<String>, offset: usize) -> Self {
24 RegexError::Syntax { message: message.into(), offset }
25 }
26}
27
28/// Validator for Perl regular expressions to prevent security and performance issues
29pub struct RegexValidator {
30 max_nesting: usize,
31 max_unicode_properties: usize,
32}
33
34impl RegexValidator {
35 /// Create a new validator with default safety limits
36 pub fn new() -> Self {
37 Self {
38 // Default limits from issue #461
39 max_nesting: 10,
40 // Limit from issue #460
41 max_unicode_properties: 50,
42 }
43 }
44
45 /// Validate a regex pattern for potential performance or security risks
46 pub fn validate(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
47 self.check_complexity(pattern, start_pos)
48 }
49
50 /// Check if the pattern contains embedded code constructs (?{...}) or (??{...})
51 pub fn detects_code_execution(&self, pattern: &str) -> bool {
52 let mut chars = pattern.char_indices().peekable();
53 while let Some((_, ch)) = chars.next() {
54 if ch == '\\' {
55 chars.next(); // skip escaped
56 continue;
57 }
58 if ch == '(' {
59 if let Some((_, '?')) = chars.peek() {
60 chars.next(); // consume ?
61 // Check for { or ?{
62 if let Some((_, next)) = chars.peek() {
63 if *next == '{' {
64 return true; // (?{
65 } else if *next == '?' {
66 chars.next(); // consume second ?
67 if let Some((_, '{')) = chars.peek() {
68 return true; // (??{
69 }
70 }
71 }
72 }
73 }
74 }
75 false
76 }
77
78 /// Check for nested quantifiers that can cause catastrophic backtracking
79 /// e.g. (a+)+, (a*)*, (a?)*
80 pub fn detect_nested_quantifiers(&self, pattern: &str) -> bool {
81 // This is a heuristic check for nested quantifiers
82 // It looks for a quantifier character following a group that ends with a quantifier
83 // e.g. ")+" in "...)+"
84 // Real implementation would need a full regex parser, but this heuristic
85 // covers common cases like (a+)+
86
87 let mut chars = pattern.char_indices().peekable();
88 let mut group_stack = Vec::new();
89
90 // Track the last significant character index and its type
91 // Type: 0=other, 1=quantifier, 2=group_end
92 let mut last_type = 0;
93
94 while let Some((_, ch)) = chars.next() {
95 match ch {
96 '\\' => {
97 chars.next(); // skip escaped
98 last_type = 0;
99 }
100 '(' => {
101 // Check if non-capturing or other special group
102 if let Some((_, '?')) = chars.peek() {
103 chars.next(); // consume '?'
104 // Skip group-type specifier so it doesn't reach the
105 // quantifier match arm (mirrors check_complexity logic)
106 if matches!(
107 chars.peek(),
108 Some((_, ':' | '=' | '!' | '<' | '>' | '|' | 'P' | '#'))
109 ) {
110 chars.next();
111 }
112 }
113 group_stack.push(false); // false = no quantifier inside yet
114 last_type = 0;
115 }
116 ')' => {
117 if let Some(has_quantifier) = group_stack.pop() {
118 if has_quantifier {
119 last_type = 2; // group end with internal quantifier
120 } else {
121 last_type = 0;
122 }
123 }
124 }
125 '+' | '*' | '?' | '{' => {
126 // If we just closed a group that had a quantifier inside,
127 // and now we see another quantifier, that's a nested quantifier!
128 if last_type == 2 {
129 // Check if it's really a quantifier or literal {
130 if ch == '{' {
131 // Only count as quantifier if it looks like {n} or {n,m}
132 // peek ahead... (simplified for now)
133 return true; // Assume { is quantifier for safety heuristic
134 } else {
135 return true;
136 }
137 }
138
139 // Mark current group as having a quantifier
140 if let Some(last) = group_stack.last_mut() {
141 *last = true;
142 }
143 last_type = 1;
144 }
145 _ => {
146 last_type = 0;
147 }
148 }
149 }
150 false
151 }
152
153 fn check_complexity(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
154 // NOTE: Nested quantifier detection (detect_nested_quantifiers) is intentionally
155 // NOT called here. The heuristic produces too many false positives on valid Perl
156 // patterns such as (?:/\.)+, (\w+)*, (?:pattern)+. Callers that want an advisory
157 // check can invoke detect_nested_quantifiers() directly and surface the result
158 // as a non-fatal diagnostic.
159
160 let mut chars = pattern.char_indices().peekable();
161 // Stack stores the type of the current group
162 let mut stack: Vec<GroupType> = Vec::new();
163 let mut unicode_property_count = 0;
164
165 while let Some((idx, ch)) = chars.next() {
166 match ch {
167 '\\' => {
168 // Check for escaped character
169 if let Some((_, next_char)) = chars.peek() {
170 match next_char {
171 'p' | 'P' => {
172 // Unicode property start \p or \P
173 // We consume the 'p'/'P'
174 chars.next();
175
176 // Check if it's followed by {
177 if let Some((_, '{')) = chars.peek() {
178 unicode_property_count += 1;
179 if unicode_property_count > self.max_unicode_properties {
180 return Err(RegexError::syntax(
181 "Too many Unicode properties in regex (max 50)",
182 start_pos + idx,
183 ));
184 }
185 }
186 }
187 _ => {
188 // Just skip other escaped chars
189 chars.next();
190 }
191 }
192 }
193 }
194 '(' => {
195 let mut group_type = GroupType::Normal;
196
197 // Check for extension syntax (?...)
198 if let Some((_, '?')) = chars.peek() {
199 chars.next(); // consume ?
200
201 // Check for < (lookbehind or named capture)
202 if let Some((_, '<')) = chars.peek() {
203 chars.next(); // consume <
204
205 // Check for = or ! (lookbehind)
206 if matches!(chars.peek(), Some((_, '=')) | Some((_, '!'))) {
207 chars.next(); // consume = or !
208 group_type = GroupType::Lookbehind;
209 }
210 // Otherwise it's likely a named capture (?<name>...) or condition (?<...)
211 // which we treat as a normal group
212 } else if let Some((_, '|')) = chars.peek() {
213 chars.next(); // consume |
214 group_type = GroupType::BranchReset { branch_count: 1 };
215 }
216 }
217
218 match group_type {
219 GroupType::Lookbehind => {
220 // Calculate current lookbehind depth
221 let lookbehind_depth =
222 stack.iter().filter(|g| matches!(g, GroupType::Lookbehind)).count();
223 if lookbehind_depth >= self.max_nesting {
224 return Err(RegexError::syntax(
225 "Regex lookbehind nesting too deep",
226 start_pos + idx,
227 ));
228 }
229 }
230 GroupType::BranchReset { .. } => {
231 // Calculate current branch reset nesting
232 let reset_depth = stack
233 .iter()
234 .filter(|g| matches!(g, GroupType::BranchReset { .. }))
235 .count();
236 if reset_depth >= self.max_nesting {
237 // Use same nesting limit for now
238 return Err(RegexError::syntax(
239 "Regex branch reset nesting too deep",
240 start_pos + idx,
241 ));
242 }
243 }
244 _ => {}
245 }
246 stack.push(group_type);
247 }
248 '|' => {
249 // Check if we are in a branch reset group
250 if let Some(GroupType::BranchReset { branch_count }) = stack.last_mut() {
251 *branch_count += 1;
252 if *branch_count > 50 {
253 // Max 50 branches
254 return Err(RegexError::syntax(
255 "Too many branches in branch reset group (max 50)",
256 start_pos + idx,
257 ));
258 }
259 }
260 }
261 ')' => {
262 stack.pop();
263 }
264 '[' => {
265 // Skip character class [ ... ]
266 // Need to handle escaping inside []
267 while let Some((_, c)) = chars.next() {
268 if c == '\\' {
269 chars.next();
270 } else if c == ']' {
271 break;
272 }
273 }
274 }
275 _ => {}
276 }
277 }
278
279 Ok(())
280 }
281}
282
283enum GroupType {
284 Normal,
285 Lookbehind,
286 BranchReset { branch_count: usize },
287}
288
289impl Default for RegexValidator {
290 fn default() -> Self {
291 Self::new()
292 }
293}