perl_regex/lib.rs
1//! Perl regex validation and analysis
2//!
3//! This module provides tools to validate Perl regular expressions
4//! and detect potential security or performance issues like catastrophic backtracking.
5
6use thiserror::Error;
7
8/// Error type for Perl regex validation failures.
9#[derive(Error, Debug, Clone, PartialEq)]
10pub enum RegexError {
11 /// Syntax error at a specific byte offset in the regex pattern.
12 #[error("{message} at offset {offset}")]
13 Syntax {
14 /// Human-readable description of the syntax issue.
15 message: String,
16 /// Byte offset where the error was detected.
17 offset: usize,
18 },
19}
20
21impl RegexError {
22 /// Create a new syntax error with a message and byte offset.
23 pub fn syntax(message: impl Into<String>, offset: usize) -> Self {
24 RegexError::Syntax { message: message.into(), offset }
25 }
26}
27
28/// Validator for Perl regular expressions to prevent security and performance issues
29pub struct RegexValidator {
30 max_nesting: usize,
31 max_unicode_properties: usize,
32}
33
34impl RegexValidator {
35 /// Create a new validator with default safety limits
36 pub fn new() -> Self {
37 Self {
38 // Default limits from issue #461
39 max_nesting: 10,
40 // Limit from issue #460
41 max_unicode_properties: 50,
42 }
43 }
44
45 /// Validate a regex pattern for potential performance or security risks
46 pub fn validate(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
47 self.check_complexity(pattern, start_pos)
48 }
49
50 /// Check if the pattern contains embedded code constructs (?{...}) or (??{...})
51 pub fn detects_code_execution(&self, pattern: &str) -> bool {
52 let mut chars = pattern.char_indices().peekable();
53 while let Some((_, ch)) = chars.next() {
54 if ch == '\\' {
55 chars.next(); // skip escaped
56 continue;
57 }
58 if ch == '(' {
59 if let Some((_, '?')) = chars.peek() {
60 chars.next(); // consume ?
61 // Check for { or ?{
62 if let Some((_, next)) = chars.peek() {
63 if *next == '{' {
64 return true; // (?{
65 } else if *next == '?' {
66 chars.next(); // consume second ?
67 if let Some((_, '{')) = chars.peek() {
68 return true; // (??{
69 }
70 }
71 }
72 }
73 }
74 }
75 false
76 }
77
78 /// Check for nested quantifiers that can cause catastrophic backtracking
79 /// e.g. (a+)+, (a*)*, (a?)*
80 pub fn detect_nested_quantifiers(&self, pattern: &str) -> bool {
81 // This is a heuristic check for nested quantifiers
82 // It looks for a quantifier character following a group that ends with a quantifier
83 // e.g. ")+" in "...)+"
84 // Real implementation would need a full regex parser, but this heuristic
85 // covers common cases like (a+)+
86
87 let mut chars = pattern.char_indices().peekable();
88 let mut group_stack = Vec::new();
89
90 // Track the last significant character index and its type
91 // Type: 0=other, 1=quantifier, 2=group_end
92 let mut last_type = 0;
93
94 while let Some((_, ch)) = chars.next() {
95 match ch {
96 '\\' => {
97 chars.next(); // skip escaped
98 last_type = 0;
99 }
100 '(' => {
101 // Check if non-capturing or other special group
102 if let Some((_, '?')) = chars.peek() {
103 chars.next(); // consume '?'
104 // Skip group-type specifier so it doesn't reach the
105 // quantifier match arm (mirrors check_complexity logic)
106 if matches!(
107 chars.peek(),
108 Some((_, ':' | '=' | '!' | '<' | '>' | '|' | 'P' | '#'))
109 ) {
110 chars.next();
111 }
112 }
113 group_stack.push(false); // false = no quantifier inside yet
114 last_type = 0;
115 }
116 ')' => {
117 if let Some(has_quantifier) = group_stack.pop() {
118 if has_quantifier {
119 last_type = 2; // group end with internal quantifier
120 } else {
121 last_type = 0;
122 }
123 }
124 }
125 '+' | '*' | '?' | '{' => {
126 // If we just closed a group that had a quantifier inside,
127 // and now we see another quantifier, that's a nested quantifier!
128 if last_type == 2 {
129 // Check if it's really a quantifier or literal {
130 if ch == '{' {
131 // Only count as quantifier if it looks like {n} or {n,m}
132 // peek ahead... (simplified for now)
133 return true; // Assume { is quantifier for safety heuristic
134 } else {
135 return true;
136 }
137 }
138
139 // Mark current group as having a quantifier
140 if let Some(last) = group_stack.last_mut() {
141 *last = true;
142 }
143 last_type = 1;
144 }
145 _ => {
146 last_type = 0;
147 }
148 }
149 }
150 false
151 }
152
153 fn check_complexity(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
154 // NOTE: Nested quantifier detection (detect_nested_quantifiers) is intentionally
155 // NOT called here. The heuristic produces too many false positives on valid Perl
156 // patterns such as (?:/\.)+, (\w+)*, (?:pattern)+. Callers that want an advisory
157 // check can invoke detect_nested_quantifiers() directly and surface the result
158 // as a non-fatal diagnostic.
159
160 let mut chars = pattern.char_indices().peekable();
161 // Stack stores the type of the current group
162 let mut stack: Vec<GroupType> = Vec::new();
163 let mut unicode_property_count = 0;
164
165 while let Some((idx, ch)) = chars.next() {
166 match ch {
167 '\\' => {
168 // Check for escaped character
169 if let Some((_, next_char)) = chars.peek() {
170 match next_char {
171 'p' | 'P' => {
172 // Unicode property start \p or \P
173 // We consume the 'p'/'P'
174 chars.next();
175
176 // Check if it's followed by {
177 if let Some((_, '{')) = chars.peek() {
178 unicode_property_count += 1;
179 if unicode_property_count > self.max_unicode_properties {
180 return Err(RegexError::syntax(
181 "Too many Unicode properties in regex (max 50)",
182 start_pos + idx,
183 ));
184 }
185 }
186 }
187 _ => {
188 // Just skip other escaped chars
189 chars.next();
190 }
191 }
192 }
193 }
194 '(' => {
195 let mut group_type = GroupType::Normal;
196
197 // Check for extension syntax (?...)
198 if let Some((_, '?')) = chars.peek() {
199 chars.next(); // consume ?
200
201 // Check for < (lookbehind or named capture)
202 if let Some((_, '<')) = chars.peek() {
203 chars.next(); // consume <
204
205 // Check for = or ! (lookbehind)
206 if matches!(chars.peek(), Some((_, '=')) | Some((_, '!'))) {
207 chars.next(); // consume = or !
208 group_type = GroupType::Lookbehind;
209 }
210 // Otherwise it's likely a named capture (?<name>...) or condition (?<...)
211 // which we treat as a normal group
212 } else if let Some((_, '|')) = chars.peek() {
213 chars.next(); // consume |
214 group_type = GroupType::BranchReset { branch_count: 1 };
215 }
216 }
217
218 match group_type {
219 GroupType::Lookbehind => {
220 // Calculate current lookbehind depth
221 let lookbehind_depth =
222 stack.iter().filter(|g| matches!(g, GroupType::Lookbehind)).count();
223 if lookbehind_depth >= self.max_nesting {
224 return Err(RegexError::syntax(
225 "Regex lookbehind nesting too deep",
226 start_pos + idx,
227 ));
228 }
229 }
230 GroupType::BranchReset { .. } => {
231 // Calculate current branch reset nesting
232 let reset_depth = stack
233 .iter()
234 .filter(|g| matches!(g, GroupType::BranchReset { .. }))
235 .count();
236 if reset_depth >= self.max_nesting {
237 // Use same nesting limit for now
238 return Err(RegexError::syntax(
239 "Regex branch reset nesting too deep",
240 start_pos + idx,
241 ));
242 }
243 }
244 _ => {}
245 }
246 stack.push(group_type);
247 }
248 '|' => {
249 // Check if we are in a branch reset group
250 if let Some(GroupType::BranchReset { branch_count }) = stack.last_mut() {
251 *branch_count += 1;
252 if *branch_count > 50 {
253 // Max 50 branches
254 return Err(RegexError::syntax(
255 "Too many branches in branch reset group (max 50)",
256 start_pos + idx,
257 ));
258 }
259 }
260 }
261 ')' => {
262 stack.pop();
263 }
264 '[' => {
265 // Skip character class [ ... ]
266 // Need to handle escaping inside []
267 while let Some((_, c)) = chars.next() {
268 if c == '\\' {
269 chars.next();
270 } else if c == ']' {
271 break;
272 }
273 }
274 }
275 _ => {}
276 }
277 }
278
279 Ok(())
280 }
281}
282
283enum GroupType {
284 Normal,
285 Lookbehind,
286 BranchReset { branch_count: usize },
287}
288
289impl Default for RegexValidator {
290 fn default() -> Self {
291 Self::new()
292 }
293}
294
295/// A named capture group extracted from a Perl regex pattern.
296///
297/// Named captures use the `(?<name>...)` syntax introduced in Perl 5.10.
298/// Captured text is accessible via `$+{name}` or `$1`, `$2`, ... by index.
299#[derive(Debug, Clone)]
300pub struct CaptureGroup {
301 /// The capture group name from `(?<name>...)`.
302 pub name: String,
303 /// One-based capture index (counting all capturing groups left to right).
304 pub index: usize,
305 /// The sub-pattern inside the capture group.
306 pub pattern: String,
307}
308
309/// Analysis utilities for Perl regex patterns: capture extraction and hover text.
310pub struct RegexAnalyzer;
311
312impl RegexAnalyzer {
313 /// Extract all named capture groups from a Perl regex pattern.
314 ///
315 /// Scans the pattern for `(?<name>...)` groups and returns them in left-to-right
316 /// order. Non-capturing groups (`(?:...)`), lookaheads, and lookbehinds do not
317 /// increment the capture index. Escaped parentheses (`\(`) are skipped.
318 ///
319 /// # Example
320 /// ```
321 /// use perl_regex::RegexAnalyzer;
322 /// let caps = RegexAnalyzer::extract_named_captures("(?<year>\\d{4})-(?<month>\\d{2})");
323 /// assert_eq!(caps.len(), 2);
324 /// assert_eq!(caps[0].name, "year");
325 /// assert_eq!(caps[0].index, 1);
326 /// ```
327 pub fn extract_named_captures(pattern: &str) -> Vec<CaptureGroup> {
328 let mut result = Vec::new();
329 let mut capture_index = 0usize;
330 let chars: Vec<char> = pattern.chars().collect();
331 let len = chars.len();
332 let mut i = 0;
333
334 while i < len {
335 // Skip escaped characters.
336 if chars[i] == '\\' {
337 i += 2;
338 continue;
339 }
340
341 // Skip character classes [...] entirely.
342 if chars[i] == '[' {
343 i += 1;
344 while i < len {
345 if chars[i] == '\\' {
346 i += 2;
347 } else if chars[i] == ']' {
348 i += 1;
349 break;
350 } else {
351 i += 1;
352 }
353 }
354 continue;
355 }
356
357 if chars[i] == '(' {
358 i += 1;
359
360 // Determine the group kind.
361 if i < len && chars[i] == '?' {
362 i += 1; // consume '?'
363
364 if i < len && chars[i] == '<' {
365 i += 1; // consume '<'
366
367 // Lookbehind: (?<= or (?<! — not a capture.
368 if i < len && (chars[i] == '=' || chars[i] == '!') {
369 i += 1;
370 continue;
371 }
372
373 // Named capture (?<name>...) — collect the name.
374 capture_index += 1;
375 let name_start = i;
376 while i < len && chars[i] != '>' {
377 i += 1;
378 }
379 let name: String = chars[name_start..i].iter().collect();
380 if i < len {
381 i += 1; // consume '>'
382 }
383
384 // Collect the sub-pattern up to the matching ')'.
385 let pattern_start = i;
386 let mut depth = 1usize;
387 while i < len && depth > 0 {
388 if chars[i] == '\\' {
389 i += 2;
390 continue;
391 }
392 if chars[i] == '(' {
393 depth += 1;
394 } else if chars[i] == ')' {
395 depth -= 1;
396 }
397 i += 1;
398 }
399 // The ')' was consumed above; sub-pattern ends before it.
400 let sub: String = if i > 0 && pattern_start < i - 1 {
401 chars[pattern_start..i - 1].iter().collect()
402 } else {
403 String::new()
404 };
405
406 result.push(CaptureGroup { name, index: capture_index, pattern: sub });
407 continue;
408 } else if i < len && matches!(chars[i], ':' | '=' | '!' | '>' | '|' | 'P' | '#')
409 {
410 // Non-capturing group: (?:...), (?=...), (?!...), (?|...), etc.
411 // Does not increment capture_index; just move on (fall through to
412 // normal scanning — the loop will handle nested parens naturally).
413 continue;
414 }
415 // Any other (?...) — treat as non-capturing for index purposes.
416 continue;
417 }
418
419 // Plain capturing group `(...)`.
420 capture_index += 1;
421 continue;
422 }
423
424 i += 1;
425 }
426
427 result
428 }
429
430 /// Generate hover text for a Perl regex pattern and its modifiers.
431 ///
432 /// Summarises the named capture groups and explains the meaning of each
433 /// modifier flag (`i`, `m`, `s`, `x`, `g`).
434 ///
435 /// # Example
436 /// ```
437 /// use perl_regex::RegexAnalyzer;
438 /// let text = RegexAnalyzer::hover_text_for_regex("(?<id>\\d+)", "i");
439 /// assert!(text.contains("id"));
440 /// assert!(text.contains("case"));
441 /// ```
442 pub fn hover_text_for_regex(pattern: &str, modifiers: &str) -> String {
443 let mut parts: Vec<String> = Vec::new();
444
445 if !pattern.is_empty() {
446 parts.push(format!("Regex: `{pattern}`"));
447 }
448
449 // Named captures section.
450 let captures = Self::extract_named_captures(pattern);
451 if !captures.is_empty() {
452 parts.push("Named captures:".to_string());
453 for cap in &captures {
454 parts.push(format!(
455 " ${{{name}}} (capture {index}): `{pat}`",
456 name = cap.name,
457 index = cap.index,
458 pat = cap.pattern,
459 ));
460 }
461 }
462
463 // Modifier explanations.
464 let modifier_notes: Vec<&str> = modifiers
465 .chars()
466 .filter_map(|m| match m {
467 'i' => Some("case-insensitive matching"),
468 'm' => Some("multiline mode: ^ and $ match line boundaries"),
469 's' => Some("single-line mode: dot matches newline"),
470 'x' => Some("extended mode: whitespace and comments allowed"),
471 'g' => Some("global: match all occurrences"),
472 _ => None,
473 })
474 .collect();
475
476 if !modifier_notes.is_empty() {
477 parts.push("Modifiers:".to_string());
478 for note in modifier_notes {
479 parts.push(format!(" {note}"));
480 }
481 }
482
483 parts.join("\n")
484 }
485}