1use thiserror::Error;
7
8#[derive(Error, Debug, Clone, PartialEq)]
10pub enum RegexError {
11 #[error("{message} at offset {offset}")]
13 Syntax {
14 message: String,
16 offset: usize,
18 },
19}
20
21impl RegexError {
22 pub fn syntax(message: impl Into<String>, offset: usize) -> Self {
24 RegexError::Syntax { message: message.into(), offset }
25 }
26}
27
/// Static checker that enforces complexity limits on a regex pattern and offers
/// heuristics for spotting risky constructs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RegexValidator {
    /// Maximum nesting depth accepted for lookbehind groups and, separately, for
    /// branch-reset groups.
    max_nesting: usize,
    /// Maximum number of `\p{...}` / `\P{...}` escapes accepted in one pattern.
    max_unicode_properties: usize,
}
33
34impl Default for RegexValidator {
35 fn default() -> Self {
36 Self::new()
37 }
38}
39
40impl RegexValidator {
41 pub fn new() -> Self {
43 Self {
44 max_nesting: 10,
46 max_unicode_properties: 50,
48 }
49 }
50
51 pub fn validate(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
53 self.check_complexity(pattern, start_pos)
54 }
55
56 pub fn detects_code_execution(&self, pattern: &str) -> bool {
58 let bytes = pattern.as_bytes();
59 let mut i = 0;
60 let len = bytes.len();
61 while i < len {
62 let ch = bytes[i];
63 if ch == b'\\' {
64 i += 2; continue;
66 }
67 if ch == b'[' {
68 i += 1;
71 while i < len {
72 let class_ch = bytes[i];
73 if class_ch == b'\\' {
74 i += 2; } else if class_ch == b']' {
76 i += 1;
77 break;
78 } else {
79 i += 1;
80 }
81 }
82 continue;
83 }
84 if ch == b'(' {
85 if i + 1 < len && bytes[i + 1] == b'?' {
86 i += 2; if i < len {
89 if bytes[i] == b'{' {
90 return true; } else if bytes[i] == b'?' {
92 if i + 1 < len && bytes[i + 1] == b'{' {
93 return true; }
95 }
96 }
97 continue;
98 }
99 }
100 i += 1;
101 }
102 false
103 }
104
105 pub fn detect_nested_quantifiers(&self, pattern: &str) -> bool {
108 let bytes = pattern.as_bytes();
115 let mut i = 0;
116 let len = bytes.len();
117 let mut group_stack = Vec::new();
118
119 let mut last_type = 0;
122
123 while i < len {
124 let ch = bytes[i];
125 match ch {
126 b'\\' => {
127 i += 2; last_type = 0;
129 continue;
130 }
131 b'(' => {
132 if i + 1 < len && bytes[i + 1] == b'?' {
134 i += 2; if i < len
138 && matches!(
139 bytes[i],
140 b':' | b'=' | b'!' | b'<' | b'>' | b'|' | b'P' | b'#'
141 )
142 {
143 i += 1;
144 }
145 } else {
146 i += 1;
147 }
148 group_stack.push(false); last_type = 0;
150 continue;
151 }
152 b')' => {
153 if let Some(has_quantifier) = group_stack.pop() {
154 if has_quantifier {
155 last_type = 2; } else {
157 last_type = 0;
158 }
159 }
160 }
161 b'+' | b'*' | b'?' | b'{' => {
162 if last_type == 2 {
165 if ch == b'{' {
167 let mut peek_i = i + 1;
169 if Self::is_brace_quantifier(bytes, &mut peek_i) {
170 return true;
171 } else {
172 last_type = 0;
175 i += 1;
176 continue;
177 }
178 } else {
179 return true;
180 }
181 }
182
183 if let Some(last) = group_stack.last_mut() {
185 *last = true;
186 }
187 last_type = 1;
188 }
189 _ => {
190 last_type = 0;
191 }
192 }
193 i += 1;
194 }
195 false
196 }
197
198 fn is_brace_quantifier(bytes: &[u8], i: &mut usize) -> bool {
199 let mut has_digit = false;
201 let mut has_comma = false;
202 let len = bytes.len();
203
204 while *i < len {
205 let ch = bytes[*i];
206 *i += 1;
207 if ch.is_ascii_digit() {
208 has_digit = true;
209 } else if ch == b',' && !has_comma {
210 has_comma = true;
211 } else if ch == b'}' && has_digit {
212 return true;
213 } else {
214 break;
215 }
216 }
217
218 false }
220
221 fn check_complexity(&self, pattern: &str, start_pos: usize) -> Result<(), RegexError> {
222 let bytes = pattern.as_bytes();
229 let mut i = 0;
230 let len = bytes.len();
231
232 let mut stack: Vec<GroupType> = Vec::new();
234 let mut unicode_property_count = 0;
235
236 while i < len {
237 let ch = bytes[i];
238 match ch {
239 b'\\' => {
240 if i + 1 < len {
242 let next_char = bytes[i + 1];
243 match next_char {
244 b'p' | b'P' => {
245 i += 2;
248
249 if i < len && bytes[i] == b'{' {
251 unicode_property_count += 1;
252 if unicode_property_count > self.max_unicode_properties {
253 return Err(RegexError::syntax(
254 "Too many Unicode properties in regex (max 50)",
255 start_pos + i - 2, ));
257 }
258 }
259 continue;
260 }
261 _ => {
262 i += 2;
264 continue;
265 }
266 }
267 }
268 }
269 b'[' => {
270 i += 1;
272 while i < len {
273 if bytes[i] == b'\\' {
274 i += 2;
275 } else if bytes[i] == b']' {
276 break;
277 } else {
278 i += 1;
279 }
280 }
281 }
282 b'(' => {
283 let mut group_type = GroupType::Normal;
284
285 if i + 1 < len && bytes[i + 1] == b'?' {
287 i += 2; if i < len && bytes[i] == b'<' {
291 i += 1; if i < len && (bytes[i] == b'=' || bytes[i] == b'!') {
295 i += 1; group_type = GroupType::Lookbehind;
297 }
298 } else if i < len && bytes[i] == b'|' {
301 i += 1; group_type = GroupType::BranchReset { branch_count: 1 };
303 }
304 } else {
305 i += 1;
306 }
307
308 match group_type {
309 GroupType::Lookbehind => {
310 let lookbehind_depth =
312 stack.iter().filter(|g| matches!(g, GroupType::Lookbehind)).count();
313 if lookbehind_depth >= self.max_nesting {
314 return Err(RegexError::syntax(
315 "Regex lookbehind nesting too deep",
316 start_pos + i - 1, ));
318 }
319 }
320 GroupType::BranchReset { .. } => {
321 let reset_depth = stack
323 .iter()
324 .filter(|g| matches!(g, GroupType::BranchReset { .. }))
325 .count();
326 if reset_depth >= self.max_nesting {
327 return Err(RegexError::syntax(
329 "Regex branch reset nesting too deep",
330 start_pos + i - 1,
331 ));
332 }
333 }
334 _ => {}
335 }
336 stack.push(group_type);
337 continue;
338 }
339 b'|' => {
340 if let Some(GroupType::BranchReset { branch_count }) = stack.last_mut() {
342 *branch_count += 1;
343 if *branch_count > 50 {
344 return Err(RegexError::syntax(
346 "Too many branches in branch reset group (max 50)",
347 start_pos + i,
348 ));
349 }
350 }
351 }
352 b')' => {
353 stack.pop();
355 }
356 _ => {}
357 }
358 i += 1;
359 }
360
361 Ok(())
362 }
363}
364
/// Kind of an open group tracked while checking pattern complexity.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum GroupType {
    /// Any group that is neither a lookbehind nor a branch-reset group.
    Normal,
    /// A lookbehind group, `(?<=...)` or `(?<!...)`.
    Lookbehind,
    /// A branch-reset group, `(?|...)`, with the number of alternatives seen so far.
    BranchReset { branch_count: usize },
}
370
/// A named capture group found in a regex pattern.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CaptureGroup {
    /// Name written between the delimiters, e.g. `year` in `(?<year>...)`.
    pub name: String,
    /// Capture number of the group; numbering starts at 1.
    pub index: usize,
    /// Source text of the group's body, without the surrounding parentheses and name.
    pub pattern: String,
}
382
383pub struct RegexAnalyzer;
385
386impl RegexAnalyzer {
387 pub fn extract_named_captures(pattern: &str) -> Vec<CaptureGroup> {
402 let mut result = Vec::new();
403 let mut capture_index = 0usize;
404 let bytes = pattern.as_bytes();
405 let len = bytes.len();
406 let mut i = 0;
407
408 while i < len {
409 if bytes[i] == b'\\' {
411 i += 2;
412 continue;
413 }
414
415 if bytes[i] == b'[' {
417 i += 1;
418 while i < len {
419 if bytes[i] == b'\\' {
420 i += 2;
421 } else if bytes[i] == b']' {
422 i += 1;
423 break;
424 } else {
425 i += 1;
426 }
427 }
428 continue;
429 }
430
431 if bytes[i] == b'(' {
432 i += 1;
433
434 if i < len && bytes[i] == b'?' {
436 i += 1; if i < len && bytes[i] == b'<' {
439 i += 1; if i < len && (bytes[i] == b'=' || bytes[i] == b'!') {
443 i += 1;
444 continue;
445 }
446
447 if let Some((name, next_pos)) =
448 parse_named_capture_name_from(bytes, i, b'>')
449 {
450 capture_index += 1;
451 i = next_pos;
452
453 let pattern_start = i;
455 let mut depth = 1usize;
456 while i < len && depth > 0 {
457 if bytes[i] == b'\\' {
458 i += 2;
459 continue;
460 }
461 if bytes[i] == b'[' {
462 i += 1;
463 while i < len {
464 if bytes[i] == b'\\' {
465 i += 2;
466 } else if bytes[i] == b']' {
467 i += 1;
468 break;
469 } else {
470 i += 1;
471 }
472 }
473 continue;
474 }
475 if bytes[i] == b'(' {
476 depth += 1;
477 } else if bytes[i] == b')' {
478 depth -= 1;
479 }
480 i += 1;
481 }
482 let sub: String = if i > 0 && pattern_start < i - 1 {
484 String::from_utf8_lossy(&bytes[pattern_start..i - 1]).into_owned()
488 } else {
489 String::new()
490 };
491
492 result.push(CaptureGroup { name, index: capture_index, pattern: sub });
493 continue;
494 }
495 } else if i < len && bytes[i] == b'\'' {
496 if let Some((name, next_pos)) =
497 parse_named_capture_name(bytes, i, b'\'', b'\'')
498 {
499 capture_index += 1;
500 i = next_pos;
501
502 let pattern_start = i;
504 let mut depth = 1usize;
505 while i < len && depth > 0 {
506 if bytes[i] == b'\\' {
507 i += 2;
508 continue;
509 }
510 if bytes[i] == b'[' {
511 i += 1;
512 while i < len {
513 if bytes[i] == b'\\' {
514 i += 2;
515 } else if bytes[i] == b']' {
516 i += 1;
517 break;
518 } else {
519 i += 1;
520 }
521 }
522 continue;
523 }
524 if bytes[i] == b'(' {
525 depth += 1;
526 } else if bytes[i] == b')' {
527 depth -= 1;
528 }
529 i += 1;
530 }
531 let sub: String = if i > 0 && pattern_start < i - 1 {
533 String::from_utf8_lossy(&bytes[pattern_start..i - 1]).into_owned()
534 } else {
535 String::new()
536 };
537
538 result.push(CaptureGroup { name, index: capture_index, pattern: sub });
539 continue;
540 }
541 } else if i < len
542 && matches!(bytes[i], b':' | b'=' | b'!' | b'>' | b'|' | b'P' | b'#')
543 {
544 continue;
548 }
549 continue;
551 }
552
553 capture_index += 1;
555 continue;
556 }
557
558 i += 1;
559 }
560
561 result
562 }
563
564 pub fn hover_text_for_regex(pattern: &str, modifiers: &str) -> String {
580 let mut parts: Vec<String> = Vec::new();
581
582 if !pattern.is_empty() {
583 parts.push(format!("Regex: `{pattern}`"));
584 }
585
586 let captures = Self::extract_named_captures(pattern);
588 if !captures.is_empty() {
589 parts.push("Named captures:".to_string());
590 for cap in &captures {
591 parts.push(format!(
592 " ${{{name}}} (capture {index}): `{pat}`",
593 name = cap.name,
594 index = cap.index,
595 pat = cap.pattern,
596 ));
597 }
598 }
599
600 let mut seen_modifiers: Vec<char> = Vec::new();
602 let mut modifier_notes: Vec<&str> = Vec::new();
603 let mut unknown_modifiers: Vec<char> = Vec::new();
604 for modifier in modifiers.chars() {
605 if seen_modifiers.contains(&modifier) {
606 continue;
607 }
608 seen_modifiers.push(modifier);
609 match describe_modifier(modifier) {
610 Some(description) => modifier_notes.push(description),
611 None => {
612 unknown_modifiers.push(modifier);
613 }
614 }
615 }
616
617 if !modifier_notes.is_empty() {
618 parts.push("Modifiers:".to_string());
619 for note in modifier_notes {
620 parts.push(format!(" {note}"));
621 }
622 }
623
624 if !unknown_modifiers.is_empty() {
625 let unknown: String = unknown_modifiers.into_iter().collect();
626 parts.push(format!("Unknown modifiers: `{unknown}`"));
627 }
628
629 parts.join("\n")
630 }
631}
632
/// Returns the human-readable description of a regex modifier character, or `None`
/// when the character is not a recognised modifier.
fn describe_modifier(modifier: char) -> Option<&'static str> {
    const DESCRIPTIONS: &[(char, &str)] = &[
        ('i', "case-insensitive matching"),
        ('m', "multiline mode: ^ and $ match line boundaries"),
        ('s', "single-line mode: dot matches newline"),
        ('x', "extended mode: whitespace and comments allowed"),
        ('g', "global: match all occurrences"),
        ('a', "ASCII-safe character classes"),
        ('d', "native platform character set semantics"),
        ('l', "locale-dependent character semantics"),
        ('u', "Unicode character semantics"),
        ('n', "non-capturing by default for unnamed groups"),
        ('p', "preserve string for ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH}"),
        ('r', "non-destructive substitution result"),
        ('c', "keep current match position for /g scans"),
        ('o', "compile pattern only once"),
        ('e', "evaluate replacement as code in substitutions"),
    ];

    DESCRIPTIONS
        .iter()
        .find(|(flag, _)| *flag == modifier)
        .map(|&(_, text)| text)
}
653
/// Parses a delimited capture name whose opening delimiter sits at `pos`.
///
/// Returns the name together with the index just past the closing delimiter, or
/// `None` when `pos` is out of range, the byte at `pos` is not `open_delim`, the name
/// is empty, or no `close_delim` follows.
fn parse_named_capture_name(
    bytes: &[u8],
    pos: usize,
    open_delim: u8,
    close_delim: u8,
) -> Option<(String, usize)> {
    if bytes.get(pos).copied() != Some(open_delim) {
        return None;
    }

    let name_start = pos + 1;
    let name_len = bytes[name_start..]
        .iter()
        .position(|&byte| byte == close_delim)?;
    if name_len == 0 {
        return None;
    }

    let name_end = name_start + name_len;
    let name = String::from_utf8_lossy(&bytes[name_start..name_end]).into_owned();
    Some((name, name_end + 1))
}
677
/// Parses a capture name that begins at `start` and runs up to `close_delim`.
///
/// Returns the name together with the index just past the closing delimiter, or
/// `None` when `start` is out of range, the name is empty, or no `close_delim`
/// follows.
fn parse_named_capture_name_from(
    bytes: &[u8],
    start: usize,
    close_delim: u8,
) -> Option<(String, usize)> {
    let rest = bytes.get(start..)?;
    match rest.iter().position(|&byte| byte == close_delim) {
        // Either no closing delimiter at all, or it comes first (empty name).
        None | Some(0) => None,
        Some(name_len) => {
            let name = String::from_utf8_lossy(&rest[..name_len]).into_owned();
            Some((name, start + name_len + 1))
        }
    }
}
699
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `core` in `depth` layers of `open ... )`, e.g. `nest("(?<=", "a", 2)`
    /// yields `(?<=(?<=a))`.
    fn nest(open: &str, core: &str, depth: usize) -> String {
        format!("{}{}{}", open.repeat(depth), core, ")".repeat(depth))
    }

    // ---- RegexError ----

    #[test]
    fn regex_error_syntax_stores_message_and_offset() {
        let err = RegexError::syntax("unexpected char", 7);
        assert_eq!(
            err,
            RegexError::Syntax {
                message: "unexpected char".to_string(),
                offset: 7,
            }
        );
        assert_eq!(err.to_string(), "unexpected char at offset 7");
    }

    #[test]
    fn regex_error_implements_clone_and_partialeq() {
        let original = RegexError::syntax("msg", 3);
        let copy = original.clone();
        assert_eq!(original, copy);
        assert_ne!(original, RegexError::syntax("msg", 4));
    }

    // ---- RegexValidator::validate ----

    #[test]
    fn validate_simple_pattern_ok() {
        let v = RegexValidator::new();
        assert!(v.validate("hello", 0).is_ok());
        assert!(v.validate("", 0).is_ok());
        assert!(v.validate("(a|b)+", 0).is_ok());
    }

    #[test]
    fn validate_unicode_property_within_limit_ok() {
        let v = RegexValidator::new();
        // Exactly at the limit of 50 properties.
        let pattern = r"\p{L}".repeat(50);
        assert!(v.validate(&pattern, 0).is_ok());
    }

    #[test]
    fn validate_too_many_unicode_properties_errors() {
        let v = RegexValidator::new();
        let pattern = r"\p{L}".repeat(51);
        let err = v.validate(&pattern, 0).unwrap_err();
        assert!(err
            .to_string()
            .starts_with("Too many Unicode properties in regex (max 50)"));
    }

    #[test]
    fn validate_unicode_property_offset_propagated() {
        let v = RegexValidator::new();
        let pattern = format!("x{}", r"\p{L}".repeat(51));
        let err = v.validate(&pattern, 10).unwrap_err();
        match err {
            // The 51st property starts at byte 1 + 50 * 5 = 251; start_pos adds 10.
            RegexError::Syntax { offset, .. } => assert_eq!(offset, 261),
        }
    }

    #[test]
    fn validate_lookbehind_within_limit_ok() {
        let v = RegexValidator::new();
        assert!(v.validate(&nest("(?<=", "foo", 9), 0).is_ok());
        // Ten nested lookbehinds is the deepest accepted nesting.
        assert!(v.validate(&nest("(?<=", "foo", 10), 0).is_ok());
    }

    #[test]
    fn validate_lookbehind_nesting_too_deep_errors() {
        let v = RegexValidator::new();
        let err = v.validate(&nest("(?<=", "a", 11), 0).unwrap_err();
        assert!(err.to_string().contains("lookbehind nesting too deep"));
    }

    #[test]
    fn validate_branch_reset_nesting_too_deep_errors() {
        let v = RegexValidator::new();
        assert!(v.validate(&nest("(?|", "a", 10), 0).is_ok());
        let err = v.validate(&nest("(?|", "a", 11), 0).unwrap_err();
        assert!(err.to_string().contains("branch reset nesting too deep"));
    }

    #[test]
    fn validate_too_many_branches_in_reset_group_errors() {
        let v = RegexValidator::new();
        let alternatives = |count: u32| {
            let alts = (0..count)
                .map(|i| format!("a{i}"))
                .collect::<Vec<_>>()
                .join("|");
            format!("(?|{alts})")
        };

        assert!(v.validate(&alternatives(50), 0).is_ok());
        let err = v.validate(&alternatives(51), 0).unwrap_err();
        assert!(err
            .to_string()
            .starts_with("Too many branches in branch reset group (max 50)"));
    }

    #[test]
    fn validate_character_class_skipped() {
        let v = RegexValidator::new();
        // Group-like text inside a character class must not be parsed as a group.
        assert!(v.validate("[(?{]", 0).is_ok());
    }

    // ---- RegexValidator::detects_code_execution ----

    #[test]
    fn detects_code_execution_with_code_block() {
        let v = RegexValidator::new();
        assert!(v.detects_code_execution("(?{ print 'hi' })"));
        assert!(v.detects_code_execution("abc(?{ 1 })def"));
    }

    #[test]
    fn detects_code_execution_with_deferred_code_block() {
        let v = RegexValidator::new();
        assert!(v.detects_code_execution("(??{ some_code() })"));
    }

    #[test]
    fn detects_code_execution_false_for_non_capturing() {
        let v = RegexValidator::new();
        assert!(!v.detects_code_execution("(?:foo)"));
        assert!(!v.detects_code_execution("(?=ahead)"));
        assert!(!v.detects_code_execution("(?!not)"));
    }

    #[test]
    fn detects_code_execution_escaped_paren_not_detected() {
        let v = RegexValidator::new();
        assert!(!v.detects_code_execution(r"\(?{"));
    }

    #[test]
    fn detects_code_execution_in_char_class_not_detected() {
        let v = RegexValidator::new();
        assert!(!v.detects_code_execution("[(?{]"));
    }

    #[test]
    fn detects_code_execution_empty_pattern() {
        let v = RegexValidator::new();
        assert!(!v.detects_code_execution(""));
        // A truncated group marker must not panic or match.
        assert!(!v.detects_code_execution("(?"));
    }

    // ---- RegexValidator::detect_nested_quantifiers ----

    #[test]
    fn detect_nested_quantifiers_finds_plus_plus() {
        let v = RegexValidator::new();
        assert!(v.detect_nested_quantifiers("(a+)+"));
    }

    #[test]
    fn detect_nested_quantifiers_finds_star_star() {
        let v = RegexValidator::new();
        assert!(v.detect_nested_quantifiers("(a*)*"));
    }

    #[test]
    fn detect_nested_quantifiers_finds_brace_quantifier() {
        let v = RegexValidator::new();
        assert!(v.detect_nested_quantifiers("(a+){2,5}"));
    }

    #[test]
    fn detect_nested_quantifiers_safe_patterns() {
        let v = RegexValidator::new();
        assert!(!v.detect_nested_quantifiers("(abc)+"));
        assert!(!v.detect_nested_quantifiers("[a-z]+"));
        assert!(!v.detect_nested_quantifiers("a+b+"));
        assert!(!v.detect_nested_quantifiers(""));
    }

    // ---- Default ----

    #[test]
    fn default_is_same_as_new() {
        let from_default: RegexValidator = Default::default();
        let from_new = RegexValidator::new();
        assert!(from_default.validate("simple", 0).is_ok());
        // Both must enforce the same Unicode property limit.
        let over_limit = r"\p{L}".repeat(51);
        assert_eq!(
            from_default.validate(&over_limit, 0),
            from_new.validate(&over_limit, 0)
        );
    }

    // ---- RegexAnalyzer::extract_named_captures ----

    #[test]
    fn extract_named_captures_angle_bracket_syntax() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?<year>\d{4})-(?<month>\d{2})");
        assert_eq!(caps.len(), 2);
        assert_eq!(caps[0].name, "year");
        assert_eq!(caps[0].index, 1);
        assert_eq!(caps[0].pattern, r"\d{4}");
        assert_eq!(caps[1].name, "month");
        assert_eq!(caps[1].index, 2);
        assert_eq!(caps[1].pattern, r"\d{2}");
    }

    #[test]
    fn extract_named_captures_single_quote_syntax() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?'name'\w+)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "name");
        assert_eq!(caps[0].index, 1);
        assert_eq!(caps[0].pattern, r"\w+");
    }

    #[test]
    fn extract_named_captures_no_captures() {
        assert!(RegexAnalyzer::extract_named_captures(r"\d+\.\d+").is_empty());
        assert!(RegexAnalyzer::extract_named_captures("").is_empty());
    }

    #[test]
    fn extract_named_captures_non_capturing_group_not_counted() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?:foo)(?<bar>baz)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "bar");
        // The non-capturing group must not consume a capture number.
        assert_eq!(caps[0].index, 1);
    }

    #[test]
    fn extract_named_captures_unnamed_group_is_counted() {
        let caps = RegexAnalyzer::extract_named_captures(r"(foo)(?<bar>baz)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "bar");
        assert_eq!(caps[0].index, 2);
    }

    #[test]
    fn extract_named_captures_lookbehind_not_counted() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?<=foo)(?<word>\w+)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "word");
        assert_eq!(caps[0].index, 1);
    }

    #[test]
    fn extract_named_captures_escaped_paren_skipped() {
        let caps = RegexAnalyzer::extract_named_captures(r"\((?<x>\d)\)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "x");
        assert_eq!(caps[0].index, 1);
    }

    #[test]
    fn extract_named_captures_stores_subpattern() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?<id>\d+)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].pattern, r"\d+");
    }

    // ---- RegexAnalyzer::hover_text_for_regex ----

    #[test]
    fn hover_text_includes_pattern_and_captures() {
        let text = RegexAnalyzer::hover_text_for_regex(r"(?<id>\d+)", "i");
        assert!(text.contains(r"Regex: `(?<id>\d+)`"));
        assert!(text.contains("Named captures:"));
        assert!(text.contains(r"${id} (capture 1): `\d+`"));
        assert!(text.contains("case-insensitive matching"));
    }

    #[test]
    fn hover_text_modifier_explanations() {
        let text = RegexAnalyzer::hover_text_for_regex("foo", "imsx");
        assert!(text.contains("Modifiers:"));
        assert!(text.contains("case-insensitive"));
        assert!(text.contains("multiline"));
        assert!(text.contains("single-line"));
        assert!(text.contains("extended"));
    }

    #[test]
    fn hover_text_global_modifier() {
        let text = RegexAnalyzer::hover_text_for_regex("foo", "g");
        assert!(text.contains("global"));
    }

    #[test]
    fn hover_text_repeated_modifier_reported_once() {
        let text = RegexAnalyzer::hover_text_for_regex("foo", "ii");
        assert_eq!(text.matches("case-insensitive matching").count(), 1);
    }

    #[test]
    fn hover_text_no_modifiers() {
        let text = RegexAnalyzer::hover_text_for_regex("hello", "");
        assert_eq!(text, "Regex: `hello`");
    }

    #[test]
    fn hover_text_empty_pattern() {
        let text = RegexAnalyzer::hover_text_for_regex("", "");
        assert!(text.is_empty());
    }

    #[test]
    fn hover_text_unknown_modifier_reported_separately() {
        let text = RegexAnalyzer::hover_text_for_regex("x", "z");
        // Unknown modifiers get their own line and never a "Modifiers:" section.
        assert!(!text.contains("Modifiers:"));
        assert!(text.contains("Unknown modifiers: `z`"));
    }
}