Skip to main content

perl_regex/
lib.rs

1//! Perl regex validation and analysis.
2
3pub mod analyzer;
4pub mod error;
5pub mod prelude;
6pub mod validator;
7
8mod syntax;
9
10pub use analyzer::{CaptureGroup, RegexAnalyzer};
11pub use error::RegexError;
12pub use validator::RegexValidator;
13
#[cfg(test)]
mod tests {
    // Crate-level tests that exercise the public re-exports (`RegexError`,
    // `RegexValidator`, `RegexAnalyzer`) through the crate root.

    use super::*;
    use crate::validator::RegexValidationConfig;

    /// Wraps `core` in `depth` nested groups that each start with `open` and
    /// end with `)`, e.g. `nested("(?|", "a", 2)` yields `(?|(?|a))`.
    fn nested(open: &str, core: &str, depth: usize) -> String {
        format!("{}{}{}", open.repeat(depth), core, ")".repeat(depth))
    }

    // --- RegexError ---

    /// `RegexError::syntax` keeps the message and offset, and both show up in
    /// the `Display` output.
    #[test]
    fn regex_error_syntax_stores_message_and_offset() {
        let err = RegexError::syntax("unexpected char", 7);
        match &err {
            RegexError::Syntax { message, offset } => {
                assert_eq!(message, "unexpected char");
                assert_eq!(*offset, 7);
            }
        }
        let rendered = err.to_string();
        assert!(rendered.contains("7"), "offset missing from: {rendered:?}");
        assert!(rendered.contains("unexpected char"), "message missing from: {rendered:?}");
    }

    /// A cloned error compares equal to the original.
    #[test]
    fn regex_error_implements_clone_and_partialeq() {
        let original = RegexError::syntax("msg", 3);
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    // --- RegexValidator::validate (valid patterns) ---

    /// Plain, empty, and simple grouped patterns all validate.
    #[test]
    fn validate_simple_pattern_ok() {
        let v = RegexValidator::new();
        for pattern in ["hello", "", "(a|b)+"] {
            assert!(v.validate(pattern, 0).is_ok(), "pattern should validate: {pattern:?}");
        }
    }

    /// Exactly 50 Unicode properties (the default limit) is still accepted.
    #[test]
    fn validate_unicode_property_within_limit_ok() {
        let v = RegexValidator::new();
        let pattern = r"\p{L}".repeat(50);
        assert!(v.validate(&pattern, 0).is_ok());
    }

    /// One Unicode property past the default limit is rejected.
    #[test]
    fn validate_too_many_unicode_properties_errors() {
        let v = RegexValidator::new();
        let pattern = r"\p{L}".repeat(51);
        let rendered = v.validate(&pattern, 0).unwrap_err().to_string();
        assert!(rendered.contains("Unicode"), "unexpected error text: {rendered:?}");
    }

    /// The error text reflects the configured Unicode property limit rather
    /// than the default one.
    #[test]
    fn validate_unicode_property_error_reports_configured_limit() {
        let v = RegexValidator::with_config(RegexValidationConfig {
            max_nesting: 10,
            max_unicode_properties: 1,
            max_branch_reset_branches: 50,
        });
        // An unexpected `Ok` collapses to an empty message; the assertion
        // message below makes that case visible in the failure output.
        let message = v
            .validate(r"\p{L}\p{N}", 0)
            .err()
            .map(|err| err.to_string())
            .unwrap_or_default();
        assert!(
            message.contains("max 1"),
            "expected an error mentioning `max 1`, got: {message:?}"
        );
    }

    /// The reported offset is never smaller than the `start_pos` that was
    /// passed to `validate`.
    #[test]
    fn validate_unicode_property_offset_propagated() {
        let v = RegexValidator::new();
        let pattern = format!("x{}", r"\p{L}".repeat(51));
        let err = v.validate(&pattern, 10).unwrap_err();
        match err {
            RegexError::Syntax { offset, .. } => {
                assert!(offset >= 10, "offset {offset} is below start_pos 10");
            }
        }
    }

    /// Nine nested lookbehinds stay under the nesting limit of 10.
    #[test]
    fn validate_lookbehind_within_limit_ok() {
        let v = RegexValidator::new();
        let pattern = nested("(?<=", "foo", 9);
        assert!(v.validate(&pattern, 0).is_ok());
    }

    /// Eleven nested lookbehinds exceed the nesting limit of 10.
    #[test]
    fn validate_lookbehind_nesting_too_deep_errors() {
        let v = RegexValidator::new();
        let pattern = nested("(?<=", "a", 11);
        let rendered = v.validate(&pattern, 0).unwrap_err().to_string();
        assert!(
            rendered.contains("lookbehind") || rendered.contains("nesting"),
            "unexpected error text: {rendered:?}"
        );
    }

    /// Eleven nested branch-reset groups are rejected.
    #[test]
    fn validate_branch_reset_nesting_too_deep_errors() {
        let v = RegexValidator::new();
        let pattern = nested("(?|", "a", 11);
        let rendered = v.validate(&pattern, 0).unwrap_err().to_string();
        assert!(
            rendered.contains("branch reset") || rendered.contains("nesting"),
            "unexpected error text: {rendered:?}"
        );
    }

    /// 51 alternatives in a single `(?| ... )` group exceed the default
    /// maximum of 50 branches.
    #[test]
    fn validate_too_many_branches_in_reset_group_errors() {
        let v = RegexValidator::new();
        let branches: Vec<String> = (0u32..51).map(|i| format!("a{i}")).collect();
        let pattern = format!("(?|{})", branches.join("|"));
        let rendered = v.validate(&pattern, 0).unwrap_err().to_string();
        assert!(
            rendered.contains("branch") || rendered.contains("50"),
            "unexpected error text: {rendered:?}"
        );
    }

    /// The error text reflects the configured branch limit rather than the
    /// default one.
    #[test]
    fn validate_branch_reset_error_reports_configured_limit() {
        let v = RegexValidator::with_config(RegexValidationConfig {
            max_nesting: 10,
            max_unicode_properties: 50,
            max_branch_reset_branches: 2,
        });
        // See the Unicode variant above: an unexpected `Ok` becomes "".
        let message = v
            .validate("(?|a|b|c)", 0)
            .err()
            .map(|err| err.to_string())
            .unwrap_or_default();
        assert!(
            message.contains("max 2"),
            "expected an error mentioning `max 2`, got: {message:?}"
        );
    }

    /// `[(?{]` should not trigger embedded code detection in `validate()`.
    #[test]
    fn validate_character_class_skipped() {
        let v = RegexValidator::new();
        assert!(v.validate("[(?{]", 0).is_ok());
    }

    // --- RegexValidator::detects_code_execution ---

    /// A `(?{ ... })` code block is reported.
    #[test]
    fn detects_code_execution_with_code_block() {
        let v = RegexValidator::new();
        assert!(v.detects_code_execution("(?{ print 'hi' })"));
    }

    /// A deferred `(??{ ... })` code block is reported.
    #[test]
    fn detects_code_execution_with_deferred_code_block() {
        let v = RegexValidator::new();
        assert!(v.detects_code_execution("(??{ some_code() })"));
    }

    /// Non-capturing groups and lookaheads are not code blocks.
    #[test]
    fn detects_code_execution_false_for_non_capturing() {
        let v = RegexValidator::new();
        for pattern in ["(?:foo)", "(?=ahead)", "(?!not)"] {
            assert!(!v.detects_code_execution(pattern), "false positive for: {pattern:?}");
        }
    }

    /// An escaped opening parenthesis does not start a code block.
    #[test]
    fn detects_code_execution_escaped_paren_not_detected() {
        let v = RegexValidator::new();
        assert!(!v.detects_code_execution(r"\(?{"));
    }

    /// The code-block opener inside a character class is literal text.
    #[test]
    fn detects_code_execution_in_char_class_not_detected() {
        let v = RegexValidator::new();
        assert!(!v.detects_code_execution("[(?{]"));
    }

    /// The empty pattern contains no code block.
    #[test]
    fn detects_code_execution_empty_pattern() {
        let v = RegexValidator::new();
        assert!(!v.detects_code_execution(""));
    }

    // --- RegexValidator::detect_nested_quantifiers ---

    /// `+` applied to a group that already contains `+` is reported.
    #[test]
    fn detect_nested_quantifiers_finds_plus_plus() {
        let v = RegexValidator::new();
        assert!(v.detect_nested_quantifiers("(a+)+"));
    }

    /// `*` applied to a group that already contains `*` is reported.
    #[test]
    fn detect_nested_quantifiers_finds_star_star() {
        let v = RegexValidator::new();
        assert!(v.detect_nested_quantifiers("(a*)*"));
    }

    /// A brace quantifier applied to a quantified group is reported.
    #[test]
    fn detect_nested_quantifiers_finds_brace_quantifier() {
        let v = RegexValidator::new();
        assert!(v.detect_nested_quantifiers("(a+){2,5}"));
    }

    /// Patterns without a quantifier nested inside a quantified group are
    /// not reported.
    #[test]
    fn detect_nested_quantifiers_safe_patterns() {
        let v = RegexValidator::new();
        let safe = [
            "(abc)+", // no inner quantifier
            "[a-z]+", // character class, not group
            "a+b+",   // quantifiers outside groups
        ];
        for pattern in safe {
            assert!(!v.detect_nested_quantifiers(pattern), "false positive for: {pattern:?}");
        }
    }

    // --- RegexValidator::Default ---

    /// A `Default`-constructed validator is usable for validation.
    #[test]
    fn default_is_same_as_new() {
        let v: RegexValidator = Default::default();
        assert!(v.validate("simple", 0).is_ok());
    }

    // --- RegexAnalyzer::extract_named_captures ---

    /// `(?<name>...)` captures are returned in order with 1-based indices.
    #[test]
    fn extract_named_captures_angle_bracket_syntax() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?<year>\d{4})-(?<month>\d{2})");
        assert_eq!(caps.len(), 2);
        assert_eq!(caps[0].name, "year");
        assert_eq!(caps[0].index, 1);
        assert_eq!(caps[1].name, "month");
        assert_eq!(caps[1].index, 2);
    }

    /// The `(?'name'...)` spelling is recognised as well.
    #[test]
    fn extract_named_captures_single_quote_syntax() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?'name'\w+)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "name");
        assert_eq!(caps[0].index, 1);
    }

    /// A pattern without groups yields no captures.
    #[test]
    fn extract_named_captures_no_captures() {
        let caps = RegexAnalyzer::extract_named_captures(r"\d+\.\d+");
        assert!(caps.is_empty());
    }

    /// A preceding non-capturing group is neither returned nor numbered.
    #[test]
    fn extract_named_captures_non_capturing_group_not_counted() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?:foo)(?<bar>baz)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "bar");
        // `(?:foo)` does not consume a capture index, so `bar` is group 1.
        assert_eq!(caps[0].index, 1);
    }

    /// `(?<= ...)` is a lookbehind, not a named capture.
    #[test]
    fn extract_named_captures_lookbehind_not_counted() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?<=foo)(?<word>\w+)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "word");
    }

    /// Escaped parentheses around a named group do not confuse extraction.
    #[test]
    fn extract_named_captures_escaped_paren_skipped() {
        let caps = RegexAnalyzer::extract_named_captures(r"\((?<x>\d)\)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].name, "x");
    }

    /// The capture records the sub-pattern between the name and the closing
    /// parenthesis.
    #[test]
    fn extract_named_captures_stores_subpattern() {
        let caps = RegexAnalyzer::extract_named_captures(r"(?<id>\d+)");
        assert_eq!(caps.len(), 1);
        assert_eq!(caps[0].pattern, r"\d+");
    }

    // --- RegexAnalyzer::hover_text_for_regex ---

    /// Hover text mentions the capture name and explains the `i` modifier.
    #[test]
    fn hover_text_includes_pattern_and_captures() {
        let text = RegexAnalyzer::hover_text_for_regex(r"(?<id>\d+)", "i");
        assert!(text.contains("id"), "capture name missing from: {text:?}");
        assert!(text.contains("case"), "modifier explanation missing from: {text:?}");
    }

    /// Each of `i`, `m`, `s`, `x` gets its own explanation.
    #[test]
    fn hover_text_modifier_explanations() {
        let text = RegexAnalyzer::hover_text_for_regex("foo", "imsx");
        for expected in ["case-insensitive", "multiline", "single-line", "extended"] {
            assert!(text.contains(expected), "{expected:?} missing from: {text:?}");
        }
    }

    /// The `g` modifier is explained as global.
    #[test]
    fn hover_text_global_modifier() {
        let text = RegexAnalyzer::hover_text_for_regex("foo", "g");
        assert!(text.contains("global"), "unexpected hover text: {text:?}");
    }

    /// Without modifiers the pattern is shown and no modifier section appears.
    #[test]
    fn hover_text_no_modifiers() {
        let text = RegexAnalyzer::hover_text_for_regex("hello", "");
        assert!(text.contains("hello"), "pattern missing from: {text:?}");
        assert!(!text.contains("Modifiers"), "unexpected modifier section in: {text:?}");
    }

    /// An empty pattern with no modifiers produces no hover text at all.
    #[test]
    fn hover_text_empty_pattern() {
        let text = RegexAnalyzer::hover_text_for_regex("", "");
        assert!(text.is_empty(), "expected empty hover text, got: {text:?}");
    }

    /// `z` is not a known modifier, so no modifier section is rendered.
    #[test]
    fn hover_text_unknown_modifier_ignored() {
        let text = RegexAnalyzer::hover_text_for_regex("x", "z");
        assert!(!text.contains("Modifiers"), "unexpected modifier section in: {text:?}");
    }
}