1pub mod fancy;
2#[cfg(feature = "pcre2-engine")]
3pub mod pcre2;
4#[cfg(feature = "pcre2-engine")]
5pub mod pcre2_debug;
6pub mod rust_regex;
7
8use serde::Serialize;
9use std::fmt;
10
/// Identifies one of the regex backends this module can construct.
///
/// The `Pcre2` variant only exists when the `pcre2-engine` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EngineKind {
    /// The backend provided by the `rust_regex` submodule.
    RustRegex,
    /// The backend provided by the `fancy` submodule.
    FancyRegex,
    /// The backend provided by the `pcre2` submodule.
    #[cfg(feature = "pcre2-engine")]
    Pcre2,
}
18
19impl EngineKind {
20 pub fn all() -> Vec<EngineKind> {
21 vec![
22 EngineKind::RustRegex,
23 EngineKind::FancyRegex,
24 #[cfg(feature = "pcre2-engine")]
25 EngineKind::Pcre2,
26 ]
27 }
28
29 pub fn next(self) -> EngineKind {
30 match self {
31 EngineKind::RustRegex => EngineKind::FancyRegex,
32 #[cfg(feature = "pcre2-engine")]
33 EngineKind::FancyRegex => EngineKind::Pcre2,
34 #[cfg(not(feature = "pcre2-engine"))]
35 EngineKind::FancyRegex => EngineKind::RustRegex,
36 #[cfg(feature = "pcre2-engine")]
37 EngineKind::Pcre2 => EngineKind::RustRegex,
38 }
39 }
40}
41
42impl fmt::Display for EngineKind {
43 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
44 match self {
45 EngineKind::RustRegex => write!(f, "Rust regex"),
46 EngineKind::FancyRegex => write!(f, "fancy-regex"),
47 #[cfg(feature = "pcre2-engine")]
48 EngineKind::Pcre2 => write!(f, "PCRE2"),
49 }
50 }
51}
52
/// Boolean matching options shared by all engines.
///
/// Each field corresponds to one inline flag letter, as emitted by
/// `EngineFlags::to_inline_prefix`.
#[derive(Debug, Clone, Copy)]
pub struct EngineFlags {
    /// Inline flag `i`.
    pub case_insensitive: bool,
    /// Inline flag `m`.
    pub multi_line: bool,
    /// Inline flag `s`.
    pub dot_matches_newline: bool,
    /// Inline flag `u`.
    pub unicode: bool,
    /// Inline flag `x`.
    pub extended: bool,
}
61
62impl Default for EngineFlags {
63 fn default() -> Self {
70 Self {
71 case_insensitive: false,
72 multi_line: false,
73 dot_matches_newline: false,
74 unicode: true,
75 extended: false,
76 }
77 }
78}
79
80impl EngineFlags {
81 #[allow(clippy::wrong_self_convention)] pub fn to_inline_prefix(&self) -> String {
86 let mut s = String::new();
87 if self.case_insensitive {
88 s.push('i');
89 }
90 if self.multi_line {
91 s.push('m');
92 }
93 if self.dot_matches_newline {
94 s.push('s');
95 }
96 if self.unicode {
97 s.push('u');
98 }
99 if self.extended {
100 s.push('x');
101 }
102 s
103 }
104
105 #[allow(clippy::wrong_self_convention)] fn to_regex_inline_prefix(&self) -> String {
114 let mut enable = String::new();
115 if self.case_insensitive {
116 enable.push('i');
117 }
118 if self.multi_line {
119 enable.push('m');
120 }
121 if self.dot_matches_newline {
122 enable.push('s');
123 }
124 if self.extended {
125 enable.push('x');
126 }
127 let disable_unicode = !self.unicode;
128 match (enable.is_empty(), disable_unicode) {
129 (true, false) => String::new(),
130 (false, false) => enable,
131 (true, true) => "-u".to_string(),
132 (false, true) => format!("{enable}-u"),
133 }
134 }
135
136 pub fn wrap_pattern(&self, pattern: &str) -> String {
137 let prefix = self.to_regex_inline_prefix();
138 if prefix.is_empty() {
139 pattern.to_string()
140 } else {
141 format!("(?{prefix}){pattern}")
142 }
143 }
144
145 pub fn toggle_case_insensitive(&mut self) {
146 self.case_insensitive = !self.case_insensitive;
147 }
148 pub fn toggle_multi_line(&mut self) {
149 self.multi_line = !self.multi_line;
150 }
151 pub fn toggle_dot_matches_newline(&mut self) {
152 self.dot_matches_newline = !self.dot_matches_newline;
153 }
154 pub fn toggle_unicode(&mut self) {
155 self.unicode = !self.unicode;
156 }
157 pub fn toggle_extended(&mut self) {
158 self.extended = !self.extended;
159 }
160}
161
162#[derive(Debug, Clone, Serialize)]
163pub struct Match {
164 #[serde(rename = "match")]
165 pub text: String,
166 pub start: usize,
167 pub end: usize,
168 #[serde(rename = "groups")]
169 pub captures: Vec<CaptureGroup>,
170}
171
172#[derive(Debug, Clone, Serialize)]
173pub struct CaptureGroup {
174 #[serde(rename = "group")]
175 pub index: usize,
176 #[serde(skip_serializing_if = "Option::is_none")]
177 pub name: Option<String>,
178 #[serde(rename = "value")]
179 pub text: String,
180 pub start: usize,
181 pub end: usize,
182}
183
/// Errors reported by a regex engine, each carrying a human-readable message.
#[derive(Debug)]
pub enum EngineError {
    /// Displayed with the prefix `Compile error: `.
    CompileError(String),
    /// Displayed with the prefix `Match error: `.
    MatchError(String),
}
189
190impl fmt::Display for EngineError {
191 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
192 match self {
193 EngineError::CompileError(msg) => write!(f, "Compile error: {msg}"),
194 EngineError::MatchError(msg) => write!(f, "Match error: {msg}"),
195 }
196 }
197}
198
199impl std::error::Error for EngineError {}
200
201pub type EngineResult<T> = Result<T, EngineError>;
202
203pub trait RegexEngine: Send + Sync {
204 fn kind(&self) -> EngineKind;
205 fn compile(&self, pattern: &str, flags: &EngineFlags) -> EngineResult<Box<dyn CompiledRegex>>;
206}
207
208pub trait CompiledRegex: Send + Sync {
209 fn find_matches(&self, text: &str) -> EngineResult<Vec<Match>>;
210}
211
212pub fn create_engine(kind: EngineKind) -> Box<dyn RegexEngine> {
213 match kind {
214 EngineKind::RustRegex => Box::new(rust_regex::RustRegexEngine),
215 EngineKind::FancyRegex => Box::new(fancy::FancyRegexEngine),
216 #[cfg(feature = "pcre2-engine")]
217 EngineKind::Pcre2 => Box::new(pcre2::Pcre2Engine),
218 }
219}
220
221fn engine_level(kind: EngineKind) -> u8 {
223 match kind {
224 EngineKind::RustRegex => 0,
225 EngineKind::FancyRegex => 1,
226 #[cfg(feature = "pcre2-engine")]
227 EngineKind::Pcre2 => 2,
228 }
229}
230
231pub fn detect_minimum_engine(pattern: &str) -> EngineKind {
233 #[cfg(feature = "pcre2-engine")]
234 {
235 if needs_pcre2(pattern) {
236 return EngineKind::Pcre2;
237 }
238 }
239
240 if needs_fancy(pattern) {
241 return EngineKind::FancyRegex;
242 }
243
244 EngineKind::RustRegex
245}
246
247pub fn is_engine_upgrade(current: EngineKind, suggested: EngineKind) -> bool {
249 engine_level(suggested) > engine_level(current)
250}
251
252fn needs_fancy(pattern: &str) -> bool {
253 if pattern.contains("(?=")
254 || pattern.contains("(?!")
255 || pattern.contains("(?<=")
256 || pattern.contains("(?<!")
257 {
258 return true;
259 }
260 has_backreference(pattern)
261}
262
/// Returns `true` when `pattern` contains a backreference.
///
/// Two forms are recognised:
/// - numbered: a backslash followed by a digit `1`-`9` (`\0` is not a backreference);
/// - named: `\k<name>`, detected by the `\k<` prefix.
///
/// Escape sequences are consumed two bytes at a time, so an escaped backslash followed by a
/// digit or `k<` (e.g. `\\1`) is not mistaken for a backreference. Character classes are not
/// parsed.
fn has_backreference(pattern: &str) -> bool {
    let bytes = pattern.as_bytes();
    let mut cursor = 0;

    while let Some(offset) = bytes[cursor..].iter().position(|&b| b == b'\\') {
        // Everything after the backslash that starts this escape sequence.
        let escaped = &bytes[cursor + offset + 1..];
        match escaped {
            [b'1'..=b'9', ..] | [b'k', b'<', ..] => return true,
            // A trailing lone backslash: nothing left to inspect.
            [] => return false,
            // Any other escape: skip the backslash and the escaped byte together.
            _ => cursor += offset + 2,
        }
    }
    false
}
281
/// Returns `true` when `pattern` appears to use a construct that only the PCRE2 engine
/// handles.
///
/// Detected by substring search: recursion `(?R)`, the backtracking verbs `(*SKIP)`,
/// `(*FAIL)`, `(*PRUNE)` and `(*COMMIT)`, match-start reset `\K`, conditionals `(?(`, plus
/// numbered subroutine calls (see `has_subroutine_call`).
#[cfg(feature = "pcre2-engine")]
fn needs_pcre2(pattern: &str) -> bool {
    const PCRE2_ONLY_TOKENS: [&str; 7] = [
        "(?R)",
        "(*SKIP)",
        "(*FAIL)",
        "(*PRUNE)",
        "(*COMMIT)",
        "\\K",
        "(?(",
    ];

    PCRE2_ONLY_TOKENS
        .iter()
        .any(|token| pattern.contains(*token))
        || has_subroutine_call(pattern)
}
296
/// Returns `true` when `pattern` contains `(?` immediately followed by an ASCII digit,
/// i.e. the start of a numbered subroutine call such as `(?1)`.
#[cfg(feature = "pcre2-engine")]
fn has_subroutine_call(pattern: &str) -> bool {
    pattern
        .as_bytes()
        .windows(3)
        .any(|window| window[0] == b'(' && window[1] == b'?' && window[2].is_ascii_digit())
}
310
/// A contiguous slice of a [`ReplaceResult`]'s `output`, tagged by where it came from.
#[derive(Debug, Clone)]
pub struct ReplaceSegment {
    /// Byte offset in the output where the segment begins.
    pub start: usize,
    /// Byte offset in the output just past the end of the segment.
    pub end: usize,
    /// `true` for text produced by the replacement template, `false` for text copied from
    /// the input unchanged.
    pub is_replacement: bool,
}
319
320#[derive(Debug, Clone)]
321pub struct ReplaceResult {
322 pub output: String,
323 pub segments: Vec<ReplaceSegment>,
324}
325
326fn expand_replacement(template: &str, m: &Match) -> String {
331 let mut result = String::new();
332 let mut chars = template.char_indices().peekable();
333
334 while let Some((_i, c)) = chars.next() {
335 if c == '$' {
336 match chars.peek() {
337 None => {
338 result.push('$');
339 }
340 Some(&(_, '$')) => {
341 chars.next();
342 result.push('$');
343 }
344 Some(&(_, '&')) => {
345 chars.next();
346 result.push_str(&m.text);
347 }
348 Some(&(_, '{')) => {
349 chars.next(); let brace_start = chars.peek().map(|&(idx, _)| idx).unwrap_or(template.len());
351 if let Some(close) = template[brace_start..].find('}') {
352 let ref_name = &template[brace_start..brace_start + close];
353 if let Some(text) = lookup_capture(m, ref_name) {
354 result.push_str(text);
355 }
356 let end_byte = brace_start + close + 1;
358 while chars.peek().is_some_and(|&(idx, _)| idx < end_byte) {
359 chars.next();
360 }
361 } else {
362 result.push('$');
363 result.push('{');
364 }
365 }
366 Some(&(_, next_c)) if next_c.is_ascii_digit() => {
367 let (_, d1) = chars.next().expect("peeked value must exist");
368 let mut num_str = String::from(d1);
369 if let Some(&(_, d2)) = chars.peek() {
371 if d2.is_ascii_digit() {
372 chars.next();
373 num_str.push(d2);
374 }
375 }
376 let idx: usize = num_str.parse().unwrap_or(0);
377 if idx == 0 {
378 result.push_str(&m.text);
379 } else if let Some(cap) = m.captures.iter().find(|c| c.index == idx) {
380 result.push_str(&cap.text);
381 }
382 }
383 Some(_) => {
384 result.push('$');
385 }
386 }
387 } else {
388 result.push(c);
389 }
390 }
391
392 result
393}
394
395pub fn lookup_capture<'a>(m: &'a Match, key: &str) -> Option<&'a str> {
397 if let Ok(idx) = key.parse::<usize>() {
399 if idx == 0 {
400 return Some(&m.text);
401 }
402 return m
403 .captures
404 .iter()
405 .find(|c| c.index == idx)
406 .map(|c| c.text.as_str());
407 }
408 m.captures
410 .iter()
411 .find(|c| c.name.as_deref() == Some(key))
412 .map(|c| c.text.as_str())
413}
414
415pub fn replace_all(text: &str, matches: &[Match], template: &str) -> ReplaceResult {
417 let mut output = String::new();
418 let mut segments = Vec::new();
419 let mut pos = 0;
420
421 for m in matches {
422 if m.start > pos {
424 let seg_start = output.len();
425 output.push_str(&text[pos..m.start]);
426 segments.push(ReplaceSegment {
427 start: seg_start,
428 end: output.len(),
429 is_replacement: false,
430 });
431 }
432 let expanded = expand_replacement(template, m);
434 if !expanded.is_empty() {
435 let seg_start = output.len();
436 output.push_str(&expanded);
437 segments.push(ReplaceSegment {
438 start: seg_start,
439 end: output.len(),
440 is_replacement: true,
441 });
442 }
443 pos = m.end;
444 }
445
446 if pos < text.len() {
448 let seg_start = output.len();
449 output.push_str(&text[pos..]);
450 segments.push(ReplaceSegment {
451 start: seg_start,
452 end: output.len(),
453 is_replacement: false,
454 });
455 }
456
457 ReplaceResult { output, segments }
458}
459
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Match` fixture from its span, text and capture groups.
    fn make_match(start: usize, end: usize, text: &str, captures: Vec<CaptureGroup>) -> Match {
        Match {
            text: text.to_owned(),
            start,
            end,
            captures,
        }
    }

    /// Builds a `CaptureGroup` fixture.
    fn make_cap(
        index: usize,
        name: Option<&str>,
        start: usize,
        end: usize,
        text: &str,
    ) -> CaptureGroup {
        CaptureGroup {
            index,
            name: name.map(str::to_owned),
            text: text.to_owned(),
            start,
            end,
        }
    }

    /// Asserts that every pattern in `patterns` is classified as `expected`.
    fn assert_detects(expected: EngineKind, patterns: &[&str]) {
        for pattern in patterns {
            assert_eq!(
                detect_minimum_engine(pattern),
                expected,
                "pattern: {}",
                pattern
            );
        }
    }

    /// Flags with `case_insensitive` and `unicode` set as given, everything else default.
    fn flags_with(case_insensitive: bool, unicode: bool) -> EngineFlags {
        EngineFlags {
            case_insensitive,
            unicode,
            ..EngineFlags::default()
        }
    }

    #[test]
    fn test_replace_all_basic() {
        let groups = vec![
            make_cap(1, None, 0, 4, "user"),
            make_cap(2, None, 5, 12, "example"),
        ];
        let matches = vec![make_match(0, 12, "user@example", groups)];
        let result = replace_all("user@example", &matches, "$2=$1");
        assert_eq!(result.output, "example=user");
    }

    #[test]
    fn test_replace_all_no_matches() {
        let result = replace_all("hello world", &[], "replacement");
        assert_eq!(result.output, "hello world");
        assert_eq!(result.segments.len(), 1);
        assert!(!result.segments[0].is_replacement);
    }

    #[test]
    fn test_replace_all_empty_template() {
        let matches = vec![
            make_match(4, 7, "123", vec![]),
            make_match(12, 15, "456", vec![]),
        ];
        let result = replace_all("abc 123 def 456 ghi", &matches, "");
        assert_eq!(result.output, "abc  def  ghi");
    }

    #[test]
    fn test_replace_all_literal_dollar() {
        let matches = vec![make_match(0, 3, "foo", vec![])];
        let result = replace_all("foo", &matches, "$$bar");
        assert_eq!(result.output, "$bar");
    }

    #[test]
    fn test_replace_all_named_groups() {
        let groups = vec![
            make_cap(1, Some("y"), 0, 4, "2024"),
            make_cap(2, Some("m"), 5, 7, "01"),
        ];
        let matches = vec![make_match(0, 7, "2024-01", groups)];
        let result = replace_all("2024-01", &matches, "${m}/${y}");
        assert_eq!(result.output, "01/2024");
    }

    #[test]
    fn test_expand_replacement_whole_match() {
        let m = make_match(0, 5, "hello", vec![]);
        let cases = [("$0", "hello"), ("$&", "hello"), ("[$0]", "[hello]")];
        for &(template, expected) in cases.iter() {
            assert_eq!(expand_replacement(template, &m), expected);
        }
    }

    #[test]
    fn test_expand_replacement_non_ascii() {
        let m = make_match(0, 5, "hello", vec![]);
        let cases = [
            ("café $0", "café hello"),
            ("→$0←", "→hello←"),
            ("日本語", "日本語"),
            ("über $& cool", "über hello cool"),
        ];
        for &(template, expected) in cases.iter() {
            assert_eq!(expand_replacement(template, &m), expected);
        }
    }

    #[test]
    fn test_replace_segments_tracking() {
        let matches = vec![make_match(6, 9, "123", vec![])];
        let result = replace_all("hello 123 world", &matches, "NUM");
        assert_eq!(result.output, "hello NUM world");

        let expected = [("hello ", false), ("NUM", true), (" world", false)];
        assert_eq!(result.segments.len(), 3);
        for (segment, &(slice, is_replacement)) in result.segments.iter().zip(expected.iter()) {
            assert_eq!(segment.is_replacement, is_replacement);
            assert_eq!(&result.output[segment.start..segment.end], slice);
        }
    }

    #[test]
    fn test_detect_simple_pattern_uses_rust_regex() {
        assert_detects(
            EngineKind::RustRegex,
            &[r"\d+", r"[a-z]+", r"foo|bar", r"^\w+$"],
        );
    }

    #[test]
    fn test_detect_lookahead_needs_fancy() {
        assert_detects(EngineKind::FancyRegex, &[r"foo(?=bar)", r"foo(?!bar)"]);
    }

    #[test]
    fn test_detect_lookbehind_needs_fancy() {
        assert_detects(EngineKind::FancyRegex, &[r"(?<=foo)bar", r"(?<!foo)bar"]);
    }

    #[test]
    fn test_detect_backreference_needs_fancy() {
        assert_detects(EngineKind::FancyRegex, &[r"(\w+)\s+\1", r"(a)(b)\2"]);
    }

    #[test]
    fn test_detect_non_backreference_escapes_stay_rust() {
        assert_detects(
            EngineKind::RustRegex,
            &[
                r"\d",
                r"\w\s\b",
                r"\0",
                r"\n\r\t",
                r"\x41",
                r"\u0041",
                r"\p{L}",
                r"\P{L}",
                r"\B",
            ],
        );
    }

    #[test]
    fn test_has_backreference() {
        for pattern in [r"(\w+)\1", r"\1", r"(a)(b)(c)\3"].iter() {
            assert!(has_backreference(pattern));
        }
        for pattern in [r"\d+", r"\0", r"plain text", r"\w\s\b\B\n\r\t"].iter() {
            assert!(!has_backreference(pattern));
        }
    }

    #[test]
    fn test_detect_empty_pattern() {
        assert_detects(EngineKind::RustRegex, &[""]);
    }

    #[test]
    fn test_is_engine_upgrade() {
        let (rust, fancy) = (EngineKind::RustRegex, EngineKind::FancyRegex);
        assert!(is_engine_upgrade(rust, fancy));
        assert!(!is_engine_upgrade(fancy, rust));
        assert!(!is_engine_upgrade(fancy, fancy));
    }

    #[test]
    fn wrap_pattern_omits_prefix_when_flags_are_defaults() {
        assert_eq!(EngineFlags::default().wrap_pattern("abc"), "abc");
    }

    #[test]
    fn wrap_pattern_emits_minus_u_when_unicode_disabled() {
        assert_eq!(flags_with(false, false).wrap_pattern("abc"), "(?-u)abc");
    }

    #[test]
    fn wrap_pattern_combines_enable_and_disable_unicode() {
        assert_eq!(flags_with(true, false).wrap_pattern("abc"), "(?i-u)abc");
    }

    #[test]
    fn wrap_pattern_does_not_emit_u_when_unicode_on() {
        assert_eq!(flags_with(true, true).wrap_pattern("abc"), "(?i)abc");
    }

    #[test]
    fn to_inline_prefix_still_emits_positive_u_for_php() {
        assert_eq!(flags_with(true, true).to_inline_prefix(), "iu");
    }

    #[cfg(feature = "pcre2-engine")]
    mod pcre2_detection_tests {
        use super::*;

        #[test]
        fn test_detect_recursion_needs_pcre2() {
            assert_detects(EngineKind::Pcre2, &[r"(?R)"]);
        }

        #[test]
        fn test_detect_backtracking_verbs_need_pcre2() {
            assert_detects(
                EngineKind::Pcre2,
                &[r"(*SKIP)(*FAIL)", r"(*PRUNE)", r"(*COMMIT)"],
            );
        }

        #[test]
        fn test_detect_reset_match_start_needs_pcre2() {
            assert_detects(EngineKind::Pcre2, &[r"foo\Kbar"]);
        }

        #[test]
        fn test_detect_conditional_needs_pcre2() {
            assert_detects(EngineKind::Pcre2, &[r"(?(1)yes|no)"]);
        }

        #[test]
        fn test_detect_subroutine_call_needs_pcre2() {
            assert_detects(EngineKind::Pcre2, &[r"(\d+)(?1)"]);
        }

        #[test]
        fn test_is_engine_upgrade_pcre2() {
            let (rust, fancy, pcre2) = (
                EngineKind::RustRegex,
                EngineKind::FancyRegex,
                EngineKind::Pcre2,
            );
            assert!(is_engine_upgrade(rust, pcre2));
            assert!(is_engine_upgrade(fancy, pcre2));
            assert!(!is_engine_upgrade(pcre2, fancy));
            assert!(!is_engine_upgrade(pcre2, rust));
        }
    }
}