1pub mod fancy;
2#[cfg(feature = "pcre2-engine")]
3pub mod pcre2;
4#[cfg(feature = "pcre2-engine")]
5pub mod pcre2_debug;
6pub mod rust_regex;
7
8use serde::Serialize;
9use std::fmt;
10
/// Identifies one of the regex backends this module can construct.
///
/// The `Pcre2` variant only exists when the `pcre2-engine` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EngineKind {
    /// The backend implemented in the `rust_regex` submodule.
    RustRegex,
    /// The backend implemented in the `fancy` submodule.
    FancyRegex,
    /// The backend implemented in the `pcre2` submodule.
    #[cfg(feature = "pcre2-engine")]
    Pcre2,
}

impl EngineKind {
    /// Returns every engine kind compiled into this build, in declaration order.
    pub fn all() -> Vec<Self> {
        vec![
            Self::RustRegex,
            Self::FancyRegex,
            #[cfg(feature = "pcre2-engine")]
            Self::Pcre2,
        ]
    }

    /// Returns the kind that follows `self`, wrapping around after the last one.
    ///
    /// Without the `pcre2-engine` feature the cycle is
    /// `RustRegex -> FancyRegex -> RustRegex`; with it, `Pcre2` sits between
    /// `FancyRegex` and the wrap-around.
    pub const fn next(self) -> Self {
        match self {
            Self::RustRegex => Self::FancyRegex,
            #[cfg(not(feature = "pcre2-engine"))]
            Self::FancyRegex => Self::RustRegex,
            #[cfg(feature = "pcre2-engine")]
            Self::FancyRegex => Self::Pcre2,
            #[cfg(feature = "pcre2-engine")]
            Self::Pcre2 => Self::RustRegex,
        }
    }
}

impl fmt::Display for EngineKind {
    /// Writes the human-readable engine name.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::RustRegex => "Rust regex",
            Self::FancyRegex => "fancy-regex",
            #[cfg(feature = "pcre2-engine")]
            Self::Pcre2 => "PCRE2",
        };
        f.write_str(label)
    }
}
53
/// The set of matching options that can be switched on or off for a pattern.
#[derive(Debug, Clone, Copy)]
pub struct EngineFlags {
    /// Rendered as the inline flag letter `i`.
    pub case_insensitive: bool,
    /// Rendered as the inline flag letter `m`.
    pub multi_line: bool,
    /// Rendered as the inline flag letter `s`.
    pub dot_matches_newline: bool,
    /// Rendered as `u` by `to_inline_prefix`; when `false`, `wrap_pattern` emits `-u`.
    pub unicode: bool,
    /// Rendered as the inline flag letter `x`.
    pub extended: bool,
}

impl Default for EngineFlags {
    /// Returns flags with only `unicode` enabled.
    fn default() -> Self {
        Self {
            unicode: true,
            case_insensitive: false,
            multi_line: false,
            dot_matches_newline: false,
            extended: false,
        }
    }
}

impl EngineFlags {
    /// Returns the letters of all enabled flags, always in the order `i`, `m`, `s`, `u`, `x`.
    ///
    /// Unlike [`EngineFlags::wrap_pattern`], an enabled `unicode` flag is emitted as a
    /// positive `u` here (a test name in this file suggests this form is meant for PHP
    /// output).
    // The type is `Copy`, so clippy would prefer `self` by value for a `to_*` method.
    #[allow(clippy::wrong_self_convention)]
    pub fn to_inline_prefix(&self) -> String {
        [
            (self.case_insensitive, 'i'),
            (self.multi_line, 'm'),
            (self.dot_matches_newline, 's'),
            (self.unicode, 'u'),
            (self.extended, 'x'),
        ]
        .iter()
        .filter(|&&(enabled, _)| enabled)
        .map(|&(_, letter)| letter)
        .collect()
    }

    /// Builds the flag text placed inside `(?...)` by [`EngineFlags::wrap_pattern`].
    ///
    /// Enabled `i`/`m`/`s`/`x` letters come first; `-u` is appended only when
    /// `unicode` is off. An enabled `unicode` flag contributes nothing.
    // The type is `Copy`, so clippy would prefer `self` by value for a `to_*` method.
    #[allow(clippy::wrong_self_convention)]
    fn to_regex_inline_prefix(&self) -> String {
        let mut prefix: String = [
            (self.case_insensitive, 'i'),
            (self.multi_line, 'm'),
            (self.dot_matches_newline, 's'),
            (self.extended, 'x'),
        ]
        .iter()
        .filter(|&&(enabled, _)| enabled)
        .map(|&(_, letter)| letter)
        .collect();

        if !self.unicode {
            prefix.push_str("-u");
        }
        prefix
    }

    /// Returns `pattern` prefixed with an inline `(?flags)` group.
    ///
    /// When no flag text is needed (the default flags), `pattern` is returned unchanged.
    pub fn wrap_pattern(&self, pattern: &str) -> String {
        let prefix = self.to_regex_inline_prefix();
        if !prefix.is_empty() {
            return format!("(?{prefix}){pattern}");
        }
        pattern.to_string()
    }

    /// Flips the `case_insensitive` flag.
    pub fn toggle_case_insensitive(&mut self) {
        self.case_insensitive ^= true;
    }

    /// Flips the `multi_line` flag.
    pub fn toggle_multi_line(&mut self) {
        self.multi_line ^= true;
    }

    /// Flips the `dot_matches_newline` flag.
    pub fn toggle_dot_matches_newline(&mut self) {
        self.dot_matches_newline ^= true;
    }

    /// Flips the `unicode` flag.
    pub fn toggle_unicode(&mut self) {
        self.unicode ^= true;
    }

    /// Flips the `extended` flag.
    pub fn toggle_extended(&mut self) {
        self.extended ^= true;
    }
}
162
/// One match found in a searched text, together with its capture groups.
///
/// Serialized with the field names `match`, `start`, `end` and `groups`.
#[derive(Debug, Clone, Serialize)]
pub struct Match {
    /// The matched text (serialized as `match`).
    #[serde(rename = "match")]
    pub text: String,
    /// Byte offset in the searched text where the match begins
    /// (`replace_all` slices the input string with it).
    pub start: usize,
    /// Byte offset in the searched text just past the match
    /// (`replace_all` resumes copying the input from here).
    pub end: usize,
    /// Capture groups belonging to this match (serialized as `groups`).
    #[serde(rename = "groups")]
    pub captures: Vec<CaptureGroup>,
}
172
/// A single capture group of a [`Match`].
///
/// Serialized with the field names `group`, `name` (omitted when absent),
/// `value`, `start` and `end`.
#[derive(Debug, Clone, Serialize)]
pub struct CaptureGroup {
    /// Group number used by `$N` / `${N}` replacement references
    /// (serialized as `group`). Number `0` is never looked up here: replacement
    /// code treats it as the whole match.
    #[serde(rename = "group")]
    pub index: usize,
    /// Group name used by `${name}` replacement references, if the group is named.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// The text captured by the group (serialized as `value`).
    #[serde(rename = "value")]
    pub text: String,
    /// Start position of the captured text.
    // NOTE(review): presumably a byte offset into the searched text, like
    // `Match::start` — confirm against the engine implementations.
    pub start: usize,
    /// End position of the captured text.
    // NOTE(review): presumably an exclusive byte offset, like `Match::end` — confirm.
    pub end: usize,
}
184
/// Errors reported by a regex engine.
#[derive(Debug)]
pub enum EngineError {
    /// A compilation failure; displayed with the prefix `Compile error: `.
    CompileError(String),
    /// A matching failure; displayed with the prefix `Match error: `.
    MatchError(String),
}

impl fmt::Display for EngineError {
    /// Writes the error category followed by the wrapped message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            EngineError::CompileError(ref msg) => write!(f, "Compile error: {msg}"),
            EngineError::MatchError(ref msg) => write!(f, "Match error: {msg}"),
        }
    }
}

impl std::error::Error for EngineError {}

/// Shorthand for results whose error type is [`EngineError`].
pub type EngineResult<T> = Result<T, EngineError>;
203
/// A regex backend that can turn a pattern into a [`CompiledRegex`].
///
/// Implementors must be `Send + Sync`.
pub trait RegexEngine: Send + Sync {
    /// Returns the [`EngineKind`] that identifies this backend.
    fn kind(&self) -> EngineKind;
    /// Compiles `pattern` using `flags` and returns the compiled regex as a trait object.
    // NOTE(review): failures are presumably reported as `EngineError::CompileError`
    // — confirm against the implementations in the engine submodules.
    fn compile(&self, pattern: &str, flags: &EngineFlags) -> EngineResult<Box<dyn CompiledRegex>>;
}
208
/// A compiled pattern that can be run against input text.
///
/// Implementors must be `Send + Sync`.
pub trait CompiledRegex: Send + Sync {
    /// Searches `text` and returns the matches found, or an [`EngineError`].
    // NOTE(review): `replace_all` treats its match slice as ascending and
    // non-overlapping; presumably implementations return matches in that order — confirm.
    fn find_matches(&self, text: &str) -> EngineResult<Vec<Match>>;
}
212
213pub fn create_engine(kind: EngineKind) -> Box<dyn RegexEngine> {
214 match kind {
215 EngineKind::RustRegex => Box::new(rust_regex::RustRegexEngine),
216 EngineKind::FancyRegex => Box::new(fancy::FancyRegexEngine),
217 #[cfg(feature = "pcre2-engine")]
218 EngineKind::Pcre2 => Box::new(pcre2::Pcre2Engine),
219 }
220}
221
222const fn engine_level(kind: EngineKind) -> u8 {
224 match kind {
225 EngineKind::RustRegex => 0,
226 EngineKind::FancyRegex => 1,
227 #[cfg(feature = "pcre2-engine")]
228 EngineKind::Pcre2 => 2,
229 }
230}
231
232pub fn detect_minimum_engine(pattern: &str) -> EngineKind {
234 #[cfg(feature = "pcre2-engine")]
235 {
236 if needs_pcre2(pattern) {
237 return EngineKind::Pcre2;
238 }
239 }
240
241 if needs_fancy(pattern) {
242 return EngineKind::FancyRegex;
243 }
244
245 EngineKind::RustRegex
246}
247
248pub const fn is_engine_upgrade(current: EngineKind, suggested: EngineKind) -> bool {
250 engine_level(suggested) > engine_level(current)
251}
252
253fn needs_fancy(pattern: &str) -> bool {
254 if pattern.contains("(?=")
255 || pattern.contains("(?!")
256 || pattern.contains("(?<=")
257 || pattern.contains("(?<!")
258 {
259 return true;
260 }
261 has_backreference(pattern)
262}
263
/// Reports whether `pattern` contains a backreference.
///
/// Two forms are recognised:
/// * numbered — a backslash followed by a non-zero digit (`\1` … `\9`);
/// * named — `\k<name>` (detected by the `\k<` opener).
///
/// `\0` is not treated as a backreference. Escapes are consumed two bytes at
/// a time, so an escaped backslash followed by a digit or `k` (`\\1`, `\\k<`)
/// is not mistaken for a backreference.
fn has_backreference(pattern: &str) -> bool {
    // Scanning bytes is safe: `\`, digits, `k` and `<` are ASCII and can never
    // appear inside a multi-byte UTF-8 sequence.
    let bytes = pattern.as_bytes();
    let mut i = 0;
    while i + 1 < bytes.len() {
        if bytes[i] != b'\\' {
            i += 1;
            continue;
        }
        match bytes[i + 1] {
            b'1'..=b'9' => return true,
            b'k' if bytes.get(i + 2) == Some(&b'<') => return true,
            // Any other escape: skip the backslash and the escaped byte together
            // so the escaped byte is never re-read as the start of an escape.
            _ => i += 2,
        }
    }
    false
}
282
/// Reports whether `pattern` contains syntax that is only handled by the PCRE2 engine.
///
/// Looks for `(?R)`, the verbs `(*SKIP)`, `(*FAIL)`, `(*PRUNE)` and `(*COMMIT)`,
/// `\K`, the conditional opener `(?(`, and finally numbered group calls as
/// detected by `has_subroutine_call`. The check is a plain substring search
/// and does not account for escaping.
#[cfg(feature = "pcre2-engine")]
fn needs_pcre2(pattern: &str) -> bool {
    const PCRE2_ONLY_MARKERS: [&str; 7] = [
        "(?R)",
        "(*SKIP)",
        "(*FAIL)",
        "(*PRUNE)",
        "(*COMMIT)",
        "\\K",
        "(?(",
    ];

    PCRE2_ONLY_MARKERS
        .iter()
        .any(|&marker| pattern.contains(marker))
        || has_subroutine_call(pattern)
}
297
/// Reports whether `pattern` contains `(?` immediately followed by an ASCII digit,
/// the opening of a numbered group call such as `(?1)`.
#[cfg(feature = "pcre2-engine")]
fn has_subroutine_call(pattern: &str) -> bool {
    // Every 3-byte window is inspected; patterns shorter than 3 bytes have none.
    pattern
        .as_bytes()
        .windows(3)
        .any(|window| window[0] == b'(' && window[1] == b'?' && window[2].is_ascii_digit())
}
311
/// A contiguous slice of a replacement output, tagged by where it came from.
#[derive(Debug, Clone)]
pub struct ReplaceSegment {
    /// Byte offset in the output string where the segment begins
    /// (`replace_all` records the output length before appending).
    pub start: usize,
    /// Byte offset in the output string just past the segment
    /// (`replace_all` records the output length after appending).
    pub end: usize,
    /// `true` for text produced from the replacement template, `false` for
    /// text copied unchanged from the input.
    pub is_replacement: bool,
}
320
/// The outcome of `replace_all`: the rewritten text and a map of its parts.
#[derive(Debug, Clone)]
pub struct ReplaceResult {
    /// The full text after all replacements have been applied.
    pub output: String,
    /// The segments of `output` in order, each marked as copied or replaced.
    /// Empty pieces are not recorded.
    pub segments: Vec<ReplaceSegment>,
}
326
327fn expand_replacement(template: &str, m: &Match) -> String {
332 let mut result = String::new();
333 let mut chars = template.char_indices().peekable();
334
335 while let Some((_i, c)) = chars.next() {
336 if c == '$' {
337 match chars.peek() {
338 None => {
339 result.push('$');
340 }
341 Some(&(_, '$')) => {
342 chars.next();
343 result.push('$');
344 }
345 Some(&(_, '&')) => {
346 chars.next();
347 result.push_str(&m.text);
348 }
349 Some(&(_, '{')) => {
350 chars.next(); let brace_start = chars.peek().map_or(template.len(), |&(idx, _)| idx);
352 if let Some(close) = template[brace_start..].find('}') {
353 let ref_name = &template[brace_start..brace_start + close];
354 if let Some(text) = lookup_capture(m, ref_name) {
355 result.push_str(text);
356 }
357 let end_byte = brace_start + close + 1;
359 while chars.peek().is_some_and(|&(idx, _)| idx < end_byte) {
360 chars.next();
361 }
362 } else {
363 result.push('$');
364 result.push('{');
365 }
366 }
367 Some(&(_, next_c)) if next_c.is_ascii_digit() => {
368 let (_, d1) = chars.next().expect("peeked value must exist");
369 let mut num_str = String::from(d1);
370 if let Some(&(_, d2)) = chars.peek() {
372 if d2.is_ascii_digit() {
373 chars.next();
374 num_str.push(d2);
375 }
376 }
377 let idx: usize = num_str.parse().unwrap_or(0);
378 if idx == 0 {
379 result.push_str(&m.text);
380 } else if let Some(cap) = m.captures.iter().find(|c| c.index == idx) {
381 result.push_str(&cap.text);
382 }
383 }
384 Some(_) => {
385 result.push('$');
386 }
387 }
388 } else {
389 result.push(c);
390 }
391 }
392
393 result
394}
395
396pub fn lookup_capture<'a>(m: &'a Match, key: &str) -> Option<&'a str> {
398 if let Ok(idx) = key.parse::<usize>() {
400 if idx == 0 {
401 return Some(&m.text);
402 }
403 return m
404 .captures
405 .iter()
406 .find(|c| c.index == idx)
407 .map(|c| c.text.as_str());
408 }
409 m.captures
411 .iter()
412 .find(|c| c.name.as_deref() == Some(key))
413 .map(|c| c.text.as_str())
414}
415
416pub fn replace_all(text: &str, matches: &[Match], template: &str) -> ReplaceResult {
418 let mut output = String::new();
419 let mut segments = Vec::new();
420 let mut pos = 0;
421
422 for m in matches {
423 if m.start > pos {
425 let seg_start = output.len();
426 output.push_str(&text[pos..m.start]);
427 segments.push(ReplaceSegment {
428 start: seg_start,
429 end: output.len(),
430 is_replacement: false,
431 });
432 }
433 let expanded = expand_replacement(template, m);
435 if !expanded.is_empty() {
436 let seg_start = output.len();
437 output.push_str(&expanded);
438 segments.push(ReplaceSegment {
439 start: seg_start,
440 end: output.len(),
441 is_replacement: true,
442 });
443 }
444 pos = m.end;
445 }
446
447 if pos < text.len() {
449 let seg_start = output.len();
450 output.push_str(&text[pos..]);
451 segments.push(ReplaceSegment {
452 start: seg_start,
453 end: output.len(),
454 is_replacement: false,
455 });
456 }
457
458 ReplaceResult { output, segments }
459}
460
/// Unit tests for replacement expansion, engine detection and flag prefixes.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Match` from its parts, copying `text` into an owned string.
    fn make_match(start: usize, end: usize, text: &str, captures: Vec<CaptureGroup>) -> Match {
        Match {
            start,
            end,
            text: text.to_string(),
            captures,
        }
    }

    /// Builds a `CaptureGroup` from its parts, copying `name` and `text` into owned strings.
    fn make_cap(
        index: usize,
        name: Option<&str>,
        start: usize,
        end: usize,
        text: &str,
    ) -> CaptureGroup {
        CaptureGroup {
            index,
            name: name.map(std::string::ToString::to_string),
            start,
            end,
            text: text.to_string(),
        }
    }

    /// Numbered references `$1` / `$2` are substituted with the group texts.
    #[test]
    fn test_replace_all_basic() {
        let matches = vec![make_match(
            0,
            12,
            "user@example",
            vec![
                make_cap(1, None, 0, 4, "user"),
                make_cap(2, None, 5, 12, "example"),
            ],
        )];
        let result = replace_all("user@example", &matches, "$2=$1");
        assert_eq!(result.output, "example=user");
    }

    /// With no matches the input is returned as a single unchanged segment.
    #[test]
    fn test_replace_all_no_matches() {
        let result = replace_all("hello world", &[], "replacement");
        assert_eq!(result.output, "hello world");
        assert_eq!(result.segments.len(), 1);
        assert!(!result.segments[0].is_replacement);
    }

    /// An empty template deletes each match.
    #[test]
    fn test_replace_all_empty_template() {
        let matches = vec![
            make_match(4, 7, "123", vec![]),
            make_match(12, 15, "456", vec![]),
        ];
        let result = replace_all("abc 123 def 456 ghi", &matches, "");
        assert_eq!(result.output, "abc  def  ghi");
    }

    /// `$$` in the template produces a single literal dollar sign.
    #[test]
    fn test_replace_all_literal_dollar() {
        let matches = vec![make_match(0, 3, "foo", vec![])];
        let result = replace_all("foo", &matches, "$$bar");
        assert_eq!(result.output, "$bar");
    }

    /// `${name}` references are resolved through the capture group names.
    #[test]
    fn test_replace_all_named_groups() {
        let matches = vec![make_match(
            0,
            7,
            "2024-01",
            vec![
                make_cap(1, Some("y"), 0, 4, "2024"),
                make_cap(2, Some("m"), 5, 7, "01"),
            ],
        )];
        let result = replace_all("2024-01", &matches, "${m}/${y}");
        assert_eq!(result.output, "01/2024");
    }

    /// Both `$0` and `$&` expand to the whole matched text.
    #[test]
    fn test_expand_replacement_whole_match() {
        let m = make_match(0, 5, "hello", vec![]);
        assert_eq!(expand_replacement("$0", &m), "hello");
        assert_eq!(expand_replacement("$&", &m), "hello");
        assert_eq!(expand_replacement("[$0]", &m), "[hello]");
    }

    /// Multi-byte characters in the template are copied through intact.
    #[test]
    fn test_expand_replacement_non_ascii() {
        let m = make_match(0, 5, "hello", vec![]);
        assert_eq!(expand_replacement("café $0", &m), "café hello");
        assert_eq!(expand_replacement("→$0←", &m), "→hello←");
        assert_eq!(expand_replacement("日本語", &m), "日本語");
        assert_eq!(expand_replacement("über $& cool", &m), "über hello cool");
    }

    /// Segments cover the output in order and index into it by byte range.
    #[test]
    fn test_replace_segments_tracking() {
        let matches = vec![make_match(6, 9, "123", vec![])];
        let result = replace_all("hello 123 world", &matches, "NUM");
        assert_eq!(result.output, "hello NUM world");
        assert_eq!(result.segments.len(), 3);
        // Leading unchanged text.
        assert!(!result.segments[0].is_replacement);
        assert_eq!(
            &result.output[result.segments[0].start..result.segments[0].end],
            "hello "
        );
        // The replacement itself.
        assert!(result.segments[1].is_replacement);
        assert_eq!(
            &result.output[result.segments[1].start..result.segments[1].end],
            "NUM"
        );
        // Trailing unchanged text.
        assert!(!result.segments[2].is_replacement);
        assert_eq!(
            &result.output[result.segments[2].start..result.segments[2].end],
            " world"
        );
    }

    /// Patterns without look-around or backreferences stay on the Rust regex engine.
    #[test]
    fn test_detect_simple_pattern_uses_rust_regex() {
        assert_eq!(detect_minimum_engine(r"\d+"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"[a-z]+"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"foo|bar"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"^\w+$"), EngineKind::RustRegex);
    }

    /// Positive and negative look-ahead select fancy-regex.
    #[test]
    fn test_detect_lookahead_needs_fancy() {
        assert_eq!(detect_minimum_engine(r"foo(?=bar)"), EngineKind::FancyRegex);
        assert_eq!(detect_minimum_engine(r"foo(?!bar)"), EngineKind::FancyRegex);
    }

    /// Positive and negative look-behind select fancy-regex.
    #[test]
    fn test_detect_lookbehind_needs_fancy() {
        assert_eq!(
            detect_minimum_engine(r"(?<=foo)bar"),
            EngineKind::FancyRegex,
        );
        assert_eq!(
            detect_minimum_engine(r"(?<!foo)bar"),
            EngineKind::FancyRegex,
        );
    }

    /// Numbered backreferences select fancy-regex.
    #[test]
    fn test_detect_backreference_needs_fancy() {
        assert_eq!(detect_minimum_engine(r"(\w+)\s+\1"), EngineKind::FancyRegex,);
        assert_eq!(detect_minimum_engine(r"(a)(b)\2"), EngineKind::FancyRegex);
    }

    /// Ordinary escapes (including `\0`) are not mistaken for backreferences.
    #[test]
    fn test_detect_non_backreference_escapes_stay_rust() {
        assert_eq!(detect_minimum_engine(r"\d"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\w\s\b"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\0"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\n\r\t"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\x41"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\u0041"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\p{L}"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\P{L}"), EngineKind::RustRegex);
        assert_eq!(detect_minimum_engine(r"\B"), EngineKind::RustRegex);
    }

    /// Direct checks of the backreference scanner.
    #[test]
    fn test_has_backreference() {
        assert!(has_backreference(r"(\w+)\1"));
        assert!(has_backreference(r"\1"));
        assert!(has_backreference(r"(a)(b)(c)\3"));
        assert!(!has_backreference(r"\d+"));
        assert!(!has_backreference(r"\0"));
        assert!(!has_backreference(r"plain text"));
        assert!(!has_backreference(r"\w\s\b\B\n\r\t"));
    }

    /// The empty pattern is assigned to the Rust regex engine.
    #[test]
    fn test_detect_empty_pattern() {
        assert_eq!(detect_minimum_engine(""), EngineKind::RustRegex);
    }

    /// Only a strictly higher-ranked engine counts as an upgrade.
    #[test]
    fn test_is_engine_upgrade() {
        assert!(is_engine_upgrade(
            EngineKind::RustRegex,
            EngineKind::FancyRegex
        ));
        assert!(!is_engine_upgrade(
            EngineKind::FancyRegex,
            EngineKind::RustRegex
        ));
        assert!(!is_engine_upgrade(
            EngineKind::FancyRegex,
            EngineKind::FancyRegex,
        ));
    }

    /// Default flags leave the pattern untouched.
    #[test]
    fn wrap_pattern_omits_prefix_when_flags_are_defaults() {
        let flags = EngineFlags::default();
        assert_eq!(flags.wrap_pattern("abc"), "abc");
    }

    /// Turning unicode off alone yields a `(?-u)` prefix.
    #[test]
    fn wrap_pattern_emits_minus_u_when_unicode_disabled() {
        let flags = EngineFlags {
            unicode: false,
            ..EngineFlags::default()
        };
        assert_eq!(flags.wrap_pattern("abc"), "(?-u)abc");
    }

    /// Enabled letters precede the `-u` suffix in a combined prefix.
    #[test]
    fn wrap_pattern_combines_enable_and_disable_unicode() {
        let flags = EngineFlags {
            case_insensitive: true,
            unicode: false,
            ..EngineFlags::default()
        };
        assert_eq!(flags.wrap_pattern("abc"), "(?i-u)abc");
    }

    /// An enabled unicode flag adds nothing to the wrapped pattern.
    #[test]
    fn wrap_pattern_does_not_emit_u_when_unicode_on() {
        let flags = EngineFlags {
            case_insensitive: true,
            unicode: true,
            ..EngineFlags::default()
        };
        assert_eq!(flags.wrap_pattern("abc"), "(?i)abc");
    }

    /// `to_inline_prefix` keeps emitting a positive `u` when unicode is on.
    #[test]
    fn to_inline_prefix_still_emits_positive_u_for_php() {
        let flags = EngineFlags {
            case_insensitive: true,
            unicode: true,
            ..EngineFlags::default()
        };
        assert_eq!(flags.to_inline_prefix(), "iu");
    }

    /// Detection tests that only make sense when the PCRE2 engine is compiled in.
    #[cfg(feature = "pcre2-engine")]
    mod pcre2_detection_tests {
        use super::*;

        /// Whole-pattern recursion `(?R)` selects PCRE2.
        #[test]
        fn test_detect_recursion_needs_pcre2() {
            assert_eq!(detect_minimum_engine(r"(?R)"), EngineKind::Pcre2);
        }

        /// Backtracking control verbs select PCRE2.
        #[test]
        fn test_detect_backtracking_verbs_need_pcre2() {
            assert_eq!(detect_minimum_engine(r"(*SKIP)(*FAIL)"), EngineKind::Pcre2);
            assert_eq!(detect_minimum_engine(r"(*PRUNE)"), EngineKind::Pcre2);
            assert_eq!(detect_minimum_engine(r"(*COMMIT)"), EngineKind::Pcre2);
        }

        /// `\K` selects PCRE2.
        #[test]
        fn test_detect_reset_match_start_needs_pcre2() {
            assert_eq!(detect_minimum_engine(r"foo\Kbar"), EngineKind::Pcre2);
        }

        /// Conditional groups `(?(…)…)` select PCRE2.
        #[test]
        fn test_detect_conditional_needs_pcre2() {
            assert_eq!(detect_minimum_engine(r"(?(1)yes|no)"), EngineKind::Pcre2,);
        }

        /// Numbered group calls such as `(?1)` select PCRE2.
        #[test]
        fn test_detect_subroutine_call_needs_pcre2() {
            assert_eq!(detect_minimum_engine(r"(\d+)(?1)"), EngineKind::Pcre2);
        }

        /// PCRE2 ranks above both other engines.
        #[test]
        fn test_is_engine_upgrade_pcre2() {
            assert!(is_engine_upgrade(EngineKind::RustRegex, EngineKind::Pcre2));
            assert!(is_engine_upgrade(EngineKind::FancyRegex, EngineKind::Pcre2));
            assert!(!is_engine_upgrade(
                EngineKind::Pcre2,
                EngineKind::FancyRegex
            ));
            assert!(!is_engine_upgrade(EngineKind::Pcre2, EngineKind::RustRegex));
        }
    }
}