Skip to main content

ripsed_core/
matcher.rs

1use crate::error::RipsedError;
2use crate::operation::Op;
3use regex::Regex;
4
5/// Abstraction over literal and regex matching.
6#[derive(Debug)]
7pub enum Matcher {
8    Literal {
9        pattern: String,
10    },
11    /// A regex matcher — used for both explicit `--regex` patterns and as the
12    /// implementation backing case-insensitive literal matching (via
13    /// `regex::escape` + `(?i)`), which avoids byte-offset mismatches from
14    /// `str::to_lowercase()` on multi-byte Unicode characters.
15    Regex(Regex),
16}
17
18impl Matcher {
19    /// Create a new matcher from an operation.
20    pub fn new(op: &Op) -> Result<Self, RipsedError> {
21        let pattern = op.find_pattern();
22        let is_regex = op.is_regex();
23        let case_insensitive = op.is_case_insensitive();
24
25        if is_regex || case_insensitive {
26            // For case-insensitive literals, escape the pattern and delegate to
27            // the regex engine which handles Unicode casing correctly.
28            let re_src = if is_regex {
29                pattern.to_string()
30            } else {
31                regex::escape(pattern)
32            };
33            let re_pattern = if case_insensitive {
34                format!("(?i){re_src}")
35            } else {
36                re_src
37            };
38            Regex::new(&re_pattern).map(Matcher::Regex).map_err(|e| {
39                let mut err = RipsedError::invalid_regex(0, pattern, &e.to_string());
40                err.operation_index = None;
41                err
42            })
43        } else {
44            Ok(Matcher::Literal {
45                pattern: pattern.to_string(),
46            })
47        }
48    }
49
50    /// Check if the given text matches.
51    pub fn is_match(&self, text: &str) -> bool {
52        match self {
53            Matcher::Literal { pattern, .. } => text.contains(pattern.as_str()),
54            Matcher::Regex(re) => re.is_match(text),
55        }
56    }
57
58    /// Replace all matches in the given text. Returns None if no matches.
59    pub fn replace(&self, text: &str, replacement: &str) -> Option<String> {
60        match self {
61            Matcher::Literal { pattern, .. } => {
62                if text.contains(pattern.as_str()) {
63                    Some(text.replace(pattern.as_str(), replacement))
64                } else {
65                    None
66                }
67            }
68            Matcher::Regex(re) => {
69                if re.is_match(text) {
70                    Some(re.replace_all(text, replacement).into_owned())
71                } else {
72                    None
73                }
74            }
75        }
76    }
77}
78
79#[cfg(test)]
80mod tests {
81    use super::*;
82
83    #[test]
84    fn test_literal_match() {
85        let op = Op::Replace {
86            find: "hello".to_string(),
87            replace: "hi".to_string(),
88            regex: false,
89            case_insensitive: false,
90        };
91        let m = Matcher::new(&op).unwrap();
92        assert!(m.is_match("say hello world"));
93        assert!(!m.is_match("say Hi world"));
94    }
95
96    #[test]
97    fn test_literal_case_insensitive() {
98        let op = Op::Replace {
99            find: "hello".to_string(),
100            replace: "hi".to_string(),
101            regex: false,
102            case_insensitive: true,
103        };
104        let m = Matcher::new(&op).unwrap();
105        assert!(m.is_match("say HELLO world"));
106        assert!(m.is_match("say Hello world"));
107    }
108
109    #[test]
110    fn test_regex_match() {
111        let op = Op::Replace {
112            find: r"fn\s+(\w+)".to_string(),
113            replace: "fn new_$1".to_string(),
114            regex: true,
115            case_insensitive: false,
116        };
117        let m = Matcher::new(&op).unwrap();
118        assert!(m.is_match("fn old_func() {"));
119        assert!(!m.is_match("let x = 5;"));
120    }
121
122    #[test]
123    fn test_regex_replace_with_captures() {
124        let op = Op::Replace {
125            find: r"fn\s+old_(\w+)".to_string(),
126            replace: "fn new_$1".to_string(),
127            regex: true,
128            case_insensitive: false,
129        };
130        let m = Matcher::new(&op).unwrap();
131        let result = m.replace("fn old_function() {", "fn new_$1");
132        assert_eq!(result, Some("fn new_function() {".to_string()));
133    }
134
135    #[test]
136    fn test_invalid_regex() {
137        let op = Op::Replace {
138            find: "fn (foo".to_string(),
139            replace: "bar".to_string(),
140            regex: true,
141            case_insensitive: false,
142        };
143        let err = Matcher::new(&op).unwrap_err();
144        assert_eq!(err.code, crate::error::ErrorCode::InvalidRegex);
145    }
146
147    // ---------------------------------------------------------------
148    // Empty pattern behavior
149    // ---------------------------------------------------------------
150
151    #[test]
152    fn test_empty_pattern_literal_matches_everything() {
153        let op = Op::Replace {
154            find: "".to_string(),
155            replace: "x".to_string(),
156            regex: false,
157            case_insensitive: false,
158        };
159        let m = Matcher::new(&op).unwrap();
160        // An empty string is contained in every string
161        assert!(m.is_match("anything"));
162        assert!(m.is_match(""));
163    }
164
165    #[test]
166    fn test_empty_pattern_literal_replace() {
167        let op = Op::Replace {
168            find: "".to_string(),
169            replace: "x".to_string(),
170            regex: false,
171            case_insensitive: false,
172        };
173        let m = Matcher::new(&op).unwrap();
174        // Rust's str::replace("", "x") inserts "x" between every char and at start/end
175        let result = m.replace("ab", "x");
176        assert_eq!(result, Some("xaxbx".to_string()));
177    }
178
179    #[test]
180    fn test_empty_pattern_regex_matches_everything() {
181        let op = Op::Replace {
182            find: "".to_string(),
183            replace: "x".to_string(),
184            regex: true,
185            case_insensitive: false,
186        };
187        let m = Matcher::new(&op).unwrap();
188        assert!(m.is_match("anything"));
189        assert!(m.is_match(""));
190    }
191
192    // ---------------------------------------------------------------
193    // Pattern that matches entire line
194    // ---------------------------------------------------------------
195
196    #[test]
197    fn test_pattern_matches_entire_line_literal() {
198        let op = Op::Replace {
199            find: "hello world".to_string(),
200            replace: "goodbye".to_string(),
201            regex: false,
202            case_insensitive: false,
203        };
204        let m = Matcher::new(&op).unwrap();
205        let result = m.replace("hello world", "goodbye");
206        assert_eq!(result, Some("goodbye".to_string()));
207    }
208
209    #[test]
210    fn test_pattern_matches_entire_line_regex() {
211        let op = Op::Replace {
212            find: r"^.*$".to_string(),
213            replace: "replaced".to_string(),
214            regex: true,
215            case_insensitive: false,
216        };
217        let m = Matcher::new(&op).unwrap();
218        let result = m.replace("anything here", "replaced");
219        assert_eq!(result, Some("replaced".to_string()));
220    }
221
222    #[test]
223    fn test_regex_anchored_full_line() {
224        let op = Op::Replace {
225            find: r"^fn main\(\)$".to_string(),
226            replace: "fn start()".to_string(),
227            regex: true,
228            case_insensitive: false,
229        };
230        let m = Matcher::new(&op).unwrap();
231        assert!(m.is_match("fn main()"));
232        assert!(!m.is_match("  fn main()")); // leading whitespace
233        assert!(!m.is_match("fn main() {")); // trailing content
234    }
235
236    // ---------------------------------------------------------------
237    // Case-insensitive with unicode (Turkish I problem, etc.)
238    // ---------------------------------------------------------------
239
240    #[test]
241    fn test_case_insensitive_ascii() {
242        let op = Op::Replace {
243            find: "Hello".to_string(),
244            replace: "hi".to_string(),
245            regex: false,
246            case_insensitive: true,
247        };
248        let m = Matcher::new(&op).unwrap();
249        assert!(m.is_match("HELLO"));
250        assert!(m.is_match("hello"));
251        assert!(m.is_match("HeLLo"));
252        let result = m.replace("say HELLO there", "hi");
253        assert_eq!(result, Some("say hi there".to_string()));
254    }
255
256    #[test]
257    fn test_case_insensitive_german_eszett() {
258        // German sharp-s: lowercase to_lowercase() of "SS" is "ss",
259        // and to_lowercase() of "\u{00DF}" (sharp-s) is "\u{00DF}"
260        // This tests that the engine handles non-trivial unicode casing
261        let op = Op::Replace {
262            find: "stra\u{00DF}e".to_string(), // "strasse" with sharp-s
263            replace: "street".to_string(),
264            regex: false,
265            case_insensitive: true,
266        };
267        let m = Matcher::new(&op).unwrap();
268        assert!(m.is_match("STRA\u{00DF}E"));
269    }
270
271    #[test]
272    fn test_case_insensitive_turkish_i_lowercase() {
273        // Turkish dotted I: \u{0130} (capital I with dot above)
274        // This is a known edge case. We test that the matcher doesn't panic
275        // and behaves consistently with Unicode simple case folding.
276        let op = Op::Replace {
277            find: "i".to_string(),
278            replace: "x".to_string(),
279            regex: false,
280            case_insensitive: true,
281        };
282        let m = Matcher::new(&op).unwrap();
283        // Standard ASCII: "I" simple-folds to "i", so this matches
284        assert!(m.is_match("I"));
285        // \u{0130} (İ) has no simple case fold to "i" in Unicode — the full
286        // fold is "i\u{0307}" but the regex engine only uses simple folds.
287        // This correctly does NOT match, avoiding false positives from the
288        // old to_lowercase()-based byte-offset approach.
289        assert!(!m.is_match("\u{0130}"));
290    }
291
292    // ---------------------------------------------------------------
293    // Regex special characters in literal mode
294    // ---------------------------------------------------------------
295
296    #[test]
297    fn test_literal_mode_regex_metacharacters() {
298        // All these are regex metacharacters but should be treated literally
299        let patterns = vec![
300            (".", "dot"),
301            ("*", "star"),
302            ("+", "plus"),
303            ("?", "question"),
304            ("(", "paren"),
305            ("[", "bracket"),
306            ("{", "brace"),
307            ("^", "caret"),
308            ("$", "dollar"),
309            ("|", "pipe"),
310            ("\\", "backslash"),
311        ];
312        for (pat, name) in patterns {
313            let op = Op::Replace {
314                find: pat.to_string(),
315                replace: "X".to_string(),
316                regex: false,
317                case_insensitive: false,
318            };
319            let m = Matcher::new(&op).unwrap();
320            let text = format!("before {pat} after");
321            assert!(
322                m.is_match(&text),
323                "Literal mode should match '{name}' ({pat}) as a literal character"
324            );
325            let result = m.replace(&text, "X");
326            assert_eq!(
327                result,
328                Some("before X after".to_string()),
329                "Literal mode should replace '{name}' ({pat}) as a literal"
330            );
331        }
332    }
333
334    // ---------------------------------------------------------------
335    // Multiple matches on same line
336    // ---------------------------------------------------------------
337
338    #[test]
339    fn test_multiple_matches_same_line() {
340        let op = Op::Replace {
341            find: "ab".to_string(),
342            replace: "X".to_string(),
343            regex: false,
344            case_insensitive: false,
345        };
346        let m = Matcher::new(&op).unwrap();
347        let result = m.replace("ab cd ab ef ab", "X");
348        assert_eq!(result, Some("X cd X ef X".to_string()));
349    }
350
351    #[test]
352    fn test_replace_with_empty_string() {
353        let op = Op::Replace {
354            find: "remove".to_string(),
355            replace: "".to_string(),
356            regex: false,
357            case_insensitive: false,
358        };
359        let m = Matcher::new(&op).unwrap();
360        let result = m.replace("please remove this", "");
361        assert_eq!(result, Some("please  this".to_string()));
362    }
363
364    #[test]
365    fn test_no_match_returns_none() {
366        let op = Op::Replace {
367            find: "xyz".to_string(),
368            replace: "abc".to_string(),
369            regex: false,
370            case_insensitive: false,
371        };
372        let m = Matcher::new(&op).unwrap();
373        assert!(m.replace("nothing here", "abc").is_none());
374    }
375
376    // ---------------------------------------------------------------
377    // Pathological / adversarial pattern tests
378    //
379    // These lock in behavior for patterns that look like they ought to
380    // break something: regex metacharacters misused in literal mode,
381    // empty inputs, patterns with backreference-like replacement strings,
382    // and regex that would blow up a backtracking engine.
383    // ---------------------------------------------------------------
384
385    /// A literal pattern of `$1` (which would be a capture backreference in
386    /// a regex replacement context) must match the literal two-character
387    /// sequence in text and be replaceable without invoking capture-group
388    /// semantics. Regression guard against anyone accidentally swapping
389    /// `str::replace` for `Regex::replace_all` in the literal path.
390    #[test]
391    fn test_literal_dollar_one_pattern() {
392        let op = Op::Replace {
393            find: "$1".to_string(),
394            replace: "REPLACED".to_string(),
395            regex: false,
396            case_insensitive: false,
397        };
398        let m = Matcher::new(&op).unwrap();
399        assert!(m.is_match("value is $1 here"));
400        let result = m.replace("value is $1 here", "REPLACED");
401        assert_eq!(result, Some("value is REPLACED here".to_string()));
402    }
403
404    /// A regex pattern whose replacement string contains `$0`, `$1`, etc.
405    /// should be interpreted as a capture-backreference in regex mode.
406    /// This is intended behavior; locking it in so nobody accidentally
407    /// escapes it.
408    #[test]
409    fn test_regex_backreferences_work_in_replace() {
410        let op = Op::Replace {
411            find: r"hello (\w+)".to_string(),
412            replace: "greetings, $1!".to_string(),
413            regex: true,
414            case_insensitive: false,
415        };
416        let m = Matcher::new(&op).unwrap();
417        let result = m.replace("hello world", "greetings, $1!");
418        assert_eq!(result, Some("greetings, world!".to_string()));
419    }
420
421    /// **Adversarial**: the classic "catastrophic backtracking" pattern
422    /// `(a+)+$` on a long non-matching input is O(2^n) in a naive NFA.
423    /// The `regex` crate uses a DFA/bounded-time engine so this should
424    /// complete effectively instantly. Lock in that we've picked a safe
425    /// engine — switching to a backtracking regex crate would hang here.
426    #[test]
427    fn test_regex_no_catastrophic_backtracking() {
428        let op = Op::Replace {
429            find: r"(a+)+$".to_string(),
430            replace: "X".to_string(),
431            regex: true,
432            case_insensitive: false,
433        };
434        let m = Matcher::new(&op).unwrap();
435        // 30 'a's followed by 'b' — classic ReDoS trigger for backtracking engines.
436        let mut input = "a".repeat(30);
437        input.push('b');
438        let start = std::time::Instant::now();
439        let result = m.is_match(&input);
440        let elapsed = start.elapsed();
441        assert!(!result, "pattern should not match 'aaaa...b'");
442        // Generous bound — should actually complete in microseconds.
443        assert!(
444            elapsed < std::time::Duration::from_millis(500),
445            "regex took too long ({elapsed:?}) — possible ReDoS"
446        );
447    }
448
449    /// **Adversarial**: the replacement string is NUL-separated or contains
450    /// control characters. Must pass through unchanged (no shell-like
451    /// interpretation).
452    #[test]
453    fn test_replacement_with_control_chars() {
454        let op = Op::Replace {
455            find: "placeholder".to_string(),
456            replace: "\x07bell\x1bescape\x00nul".to_string(),
457            regex: false,
458            case_insensitive: false,
459        };
460        let m = Matcher::new(&op).unwrap();
461        let result = m.replace("use placeholder here", "\x07bell\x1bescape\x00nul");
462        assert_eq!(
463            result,
464            Some("use \x07bell\x1bescape\x00nul here".to_string())
465        );
466    }
467
468    /// **Adversarial**: a regex that is a valid-but-empty-matching pattern
469    /// (like `(?:)`) produces an empty match at every position. This is a
470    /// weird edge case that can blow up naive replace loops. Lock in that
471    /// we produce *some* deterministic output without panicking.
472    #[test]
473    fn test_empty_regex_match_does_not_panic() {
474        let op = Op::Replace {
475            find: r"(?:)".to_string(),
476            replace: "X".to_string(),
477            regex: true,
478            case_insensitive: false,
479        };
480        let m = Matcher::new(&op).unwrap();
481        // Must not panic — actual content of the result is implementation-defined.
482        let _ = m.replace("abc", "X");
483    }
484}
485
486// ---------------------------------------------------------------
487// Property-based tests (proptest)
488// ---------------------------------------------------------------
489#[cfg(test)]
490mod proptests {
491    use super::*;
492    use proptest::prelude::*;
493
494    proptest! {
495        /// Invariant: in literal mode, `Matcher::is_match(text)` ⟺
496        /// `text.contains(pattern)`. This guards against a future optimization
497        /// accidentally changing the semantics of literal matching.
498        #[test]
499        fn prop_literal_matches_iff_contains(
500            pattern in "[a-zA-Z0-9 ]{1,10}",
501            text in "[a-zA-Z0-9 ]{0,60}",
502        ) {
503            let op = Op::Replace {
504                find: pattern.clone(),
505                replace: "".into(),
506                regex: false,
507                case_insensitive: false,
508            };
509            let m = Matcher::new(&op).unwrap();
510            prop_assert_eq!(m.is_match(&text), text.contains(&pattern));
511        }
512
513        /// Invariant: `replace(text, pat)` returns `None` iff `is_match(text)`
514        /// is `false`. A mismatch here means we'd record a spurious "change"
515        /// with no actual edit.
516        #[test]
517        fn prop_replace_none_iff_not_match(
518            pattern in "[a-zA-Z0-9]{1,6}",
519            text in "[a-zA-Z0-9]{0,40}",
520            replacement in "[a-zA-Z0-9]{0,6}",
521        ) {
522            let op = Op::Replace {
523                find: pattern.clone(),
524                replace: replacement.clone(),
525                regex: false,
526                case_insensitive: false,
527            };
528            let m = Matcher::new(&op).unwrap();
529            let is_match = m.is_match(&text);
530            let replaced = m.replace(&text, &replacement);
531            prop_assert_eq!(replaced.is_some(), is_match);
532        }
533
534        /// Invariant: replacing pattern with itself is a no-op on content
535        /// (the returned String equals the input). This is a fixed-point
536        /// test that catches mis-implementations of the literal replace path.
537        #[test]
538        fn prop_replace_with_self_is_identity(
539            pattern in "[a-zA-Z0-9]{1,6}",
540            text in "[a-zA-Z0-9 ]{0,50}",
541        ) {
542            let op = Op::Replace {
543                find: pattern.clone(),
544                replace: pattern.clone(),
545                regex: false,
546                case_insensitive: false,
547            };
548            let m = Matcher::new(&op).unwrap();
549            if let Some(replaced) = m.replace(&text, &pattern) {
550                prop_assert_eq!(replaced, text);
551            }
552        }
553
554        /// Invariant: case-insensitive literal matching is symmetric —
555        /// `Matcher(p, ci=true).is_match(t)` equals
556        /// `Matcher(t.to_lowercase(), ci=false).is_match(p.to_lowercase())`
557        /// for ASCII patterns. (Restricts to ASCII because Unicode case folding
558        /// is famously asymmetric; our ASCII invariant is what callers rely on.)
559        #[test]
560        fn prop_case_insensitive_ascii_symmetric(
561            pattern in "[a-zA-Z]{1,6}",
562            text in "[a-zA-Z]{0,30}",
563        ) {
564            let op = Op::Replace {
565                find: pattern.clone(),
566                replace: String::new(),
567                regex: false,
568                case_insensitive: true,
569            };
570            let m = Matcher::new(&op).unwrap();
571            let matches = m.is_match(&text);
572            prop_assert_eq!(
573                matches,
574                text.to_ascii_lowercase().contains(&pattern.to_ascii_lowercase())
575            );
576        }
577    }
578}