Skip to main content

codex_patcher/
fuzzy.rs

1//! Fuzzy matching for patch text search
2//!
3//! Provides fallback matching when exact text search fails, using
4//! normalized Levenshtein distance on a sliding window of lines.
5
6use strsim::normalized_levenshtein;
7
8/// Result of a fuzzy match operation
9#[derive(Debug, Clone)]
10pub struct FuzzyMatch {
11    /// Byte offset where the match starts
12    pub start: usize,
13    /// Byte offset where the match ends
14    pub end: usize,
15    /// The actual text that was matched
16    pub matched_text: String,
17    /// Similarity score (0.0 to 1.0, higher is better)
18    pub score: f64,
19}
20
21/// Find the best fuzzy match for `needle` in `haystack`.
22///
23/// Uses a sliding window of exactly `needle.lines().count()` lines.
24/// Falls back gracefully: returns `None` when no window exceeds `threshold`.
25///
26/// # Arguments
27/// * `needle` - The text pattern to search for
28/// * `haystack` - The content to search within
29/// * `threshold` - Minimum similarity score (0.0-1.0) to consider a match
30pub fn find_best_match(needle: &str, haystack: &str, threshold: f64) -> Option<FuzzyMatch> {
31    let window_size = needle.lines().count();
32    let (lines, offsets) = build_haystack_info(haystack);
33    slide_window(needle, haystack, &lines, &offsets, threshold, window_size)
34}
35
36/// Like `find_best_match` but tries window sizes `needle_lines..=needle_lines+max_expansion`.
37///
38/// Returns the highest-scoring match across all window sizes above `threshold`.
39/// This handles cases where N lines were inserted inside the needle's span between
40/// version bumps: a window larger than the original needle can still score highly
41/// because the inserted lines are counted only once in the edit distance.
42///
43/// When `max_expansion == 0`, behaviour is identical to [`find_best_match`].
44///
45/// # Arguments
46/// * `needle` - The text pattern to search for
47/// * `haystack` - The content to search within
48/// * `threshold` - Minimum similarity score (0.0-1.0) to consider a match
49/// * `max_expansion` - Maximum extra lines to extend the window (capped at 200 by schema validation)
50pub fn find_best_match_elastic(
51    needle: &str,
52    haystack: &str,
53    threshold: f64,
54    max_expansion: usize,
55) -> Option<FuzzyMatch> {
56    let base_lines = needle.lines().count();
57    // Pre-compute once; all expansion iterations reuse the same slices.
58    let (lines, offsets) = build_haystack_info(haystack);
59
60    let mut best: Option<FuzzyMatch> = None;
61    for expansion in 0..=max_expansion {
62        let candidate = slide_window(
63            needle,
64            haystack,
65            &lines,
66            &offsets,
67            threshold,
68            base_lines + expansion,
69        );
70        if let Some(c) = candidate {
71            if best.as_ref().is_none_or(|b| c.score > b.score) {
72                best = Some(c);
73            }
74        }
75    }
76    best
77}
78
79// ── Internal helpers ──────────────────────────────────────────────────────────
80
81/// Pre-compute the line slice vector and byte-offset table for `haystack`.
82///
83/// `line_offsets[i]` is the byte index of the start of line `i`.
84/// The vector has one extra sentinel at the end pointing just past the last
85/// character, which lets the window-end calculation avoid a bounds check.
86fn build_haystack_info(haystack: &str) -> (Vec<&str>, Vec<usize>) {
87    let lines: Vec<&str> = haystack.lines().collect();
88    // split('\n') rather than lines() so we always produce the +1 sentinel entry
89    // even when the file has a trailing newline.
90    let offsets: Vec<usize> = std::iter::once(0)
91        .chain(haystack.split('\n').scan(0usize, |acc, segment| {
92            *acc += segment.len() + 1; // +1 for the '\n' character
93            Some(*acc)
94        }))
95        .collect();
96    (lines, offsets)
97}
98
99/// Slide a window of exactly `window_size` lines over `haystack_lines`,
100/// scoring each window against `needle` with normalized Levenshtein distance.
101/// Returns the best-scoring window that is at or above `threshold`.
102fn slide_window(
103    needle: &str,
104    haystack: &str,
105    haystack_lines: &[&str],
106    line_offsets: &[usize],
107    threshold: f64,
108    window_size: usize,
109) -> Option<FuzzyMatch> {
110    if window_size == 0 || haystack_lines.len() < window_size {
111        return None;
112    }
113
114    let mut best: Option<FuzzyMatch> = None;
115
116    for i in 0..=haystack_lines.len().saturating_sub(window_size) {
117        let window = haystack_lines[i..i + window_size].join("\n");
118        let score = normalized_levenshtein(needle, &window);
119
120        if score >= threshold && best.as_ref().is_none_or(|b| score > b.score) {
121            let start = line_offsets[i];
122            // End byte: start of the line *after* our window, minus the preceding '\n'.
123            // The sentinel entry in `line_offsets` guarantees this index always exists.
124            let end = if i + window_size < line_offsets.len() {
125                line_offsets[i + window_size].saturating_sub(1)
126            } else {
127                haystack.len()
128            };
129
130            best = Some(FuzzyMatch {
131                start,
132                end,
133                matched_text: window,
134                score,
135            });
136        }
137    }
138
139    best
140}
141
142// ── Tests ─────────────────────────────────────────────────────────────────────
143
144#[cfg(test)]
145mod tests {
146    use super::*;
147
148    // ── find_best_match ───────────────────────────────────────────────────────
149
150    #[test]
151    fn exact_match_scores_one() {
152        let src = "fn main() {\n    println!(\"hello\");\n}";
153        let m = find_best_match(src, src, 0.9).expect("should find exact match");
154        assert!((m.score - 1.0).abs() < 1e-9);
155    }
156
157    #[test]
158    fn similar_match_above_threshold() {
159        let haystack = "fn main() {\n    println!(\"hello world\");\n}";
160        let needle = "fn main() {\n    println!(\"hello\");\n}";
161        let m = find_best_match(needle, haystack, 0.8).expect("should find similar match");
162        assert!(m.score > 0.8);
163    }
164
165    #[test]
166    fn dissimilar_content_below_threshold() {
167        let haystack = "completely different content";
168        let needle = "fn main() {}";
169        assert!(find_best_match(needle, haystack, 0.8).is_none());
170    }
171
172    #[test]
173    fn selects_best_match_among_candidates() {
174        let haystack = "fn foo() {}\nfn bar() {\n    x\n}\nfn baz() {\n    y\n}";
175        let needle = "fn bar() {\n    x\n}";
176        let m = find_best_match(needle, haystack, 0.9).expect("should match");
177        assert!(m.matched_text.contains("bar"));
178    }
179
180    #[test]
181    fn byte_positions_are_correct() {
182        let haystack = "line1\nline2\nline3";
183        let needle = "line2";
184        let m = find_best_match(needle, haystack, 0.9).expect("should match");
185        assert_eq!(m.start, 6, "\"line1\\n\" is 6 bytes");
186        assert_eq!(&haystack[m.start..m.end], "line2");
187    }
188
189    #[test]
190    fn trailing_newline_does_not_corrupt_offsets() {
191        let haystack = "alpha\nbeta\n";
192        let needle = "beta";
193        let m = find_best_match(needle, haystack, 0.9).expect("should match");
194        assert_eq!(&haystack[m.start..m.end], "beta");
195    }
196
197    // ── find_best_match_elastic ───────────────────────────────────────────────
198
199    #[test]
200    fn elastic_zero_expansion_matches_fixed_window() {
201        let needle = "fn foo() {}";
202        let haystack = "fn foo() {}\nfn bar() {}";
203        let fixed = find_best_match(needle, haystack, 0.9);
204        let elastic = find_best_match_elastic(needle, haystack, 0.9, 0);
205        match (fixed, elastic) {
206            (None, None) => {}
207            (Some(a), Some(b)) => {
208                assert_eq!(a.start, b.start);
209                assert_eq!(a.end, b.end);
210                assert!((a.score - b.score).abs() < 1e-9);
211            }
212            (a, b) => panic!("results differ: fixed={a:?} elastic={b:?}"),
213        }
214    }
215
216    #[test]
217    fn elastic_finds_needle_with_inserted_lines() {
218        // When one line is inserted between two anchor lines, no fixed-size window
219        // contains both anchors.  An elastic window of size needle_lines+1 does,
220        // and scores higher than any fixed window.
221        //
222        // Verified scores (strsim::normalized_levenshtein):
223        //   fixed w2[0..2] = 0.458  ("TOP_ANCHOR\nINSERTED_LINE")
224        //   fixed w2[1..3] = 0.519  ("INSERTED_LINE\nBOTTOM_ANCHOR")  ← best fixed
225        //   elastic w3[0..3] = 0.632 ("TOP_ANCHOR\nINSERTED_LINE\nBOTTOM_ANCHOR")
226        //
227        // At threshold=0.55: fixed → None, elastic → Some.
228        let needle = "TOP_ANCHOR\nBOTTOM_ANCHOR";
229        let haystack = "TOP_ANCHOR\nINSERTED_LINE\nBOTTOM_ANCHOR\nextra";
230
231        assert!(
232            find_best_match(needle, haystack, 0.55).is_none(),
233            "fixed window (best score 0.519) should miss at threshold 0.55"
234        );
235
236        let m = find_best_match_elastic(needle, haystack, 0.55, 1)
237            .expect("elastic (best score 0.632) should find match at threshold 0.55");
238        assert!(
239            m.matched_text.contains("TOP_ANCHOR"),
240            "match must contain opening anchor"
241        );
242        assert!(
243            m.matched_text.contains("BOTTOM_ANCHOR"),
244            "match must contain closing anchor"
245        );
246    }
247
248    #[test]
249    fn elastic_picks_highest_score_across_expansions() {
250        // A perfect-match window exists at expansion=0; larger windows should
251        // score lower and not displace it.
252        let needle = "fn exact() {}";
253        let haystack = "preamble\nfn exact() {}\npostamble\nmore stuff\n";
254        let m = find_best_match_elastic(needle, haystack, 0.9, 5).expect("should find exact match");
255        assert!((m.score - 1.0).abs() < 1e-9, "perfect match should win");
256        assert_eq!(&haystack[m.start..m.end], "fn exact() {}");
257    }
258
259    #[test]
260    fn elastic_returns_none_when_nothing_exceeds_threshold() {
261        let needle = "fn absolutely_nothing_like_this() {}";
262        let haystack = "let a = 1;\nlet b = 2;\nlet c = 3;\n";
263        assert!(find_best_match_elastic(needle, haystack, 0.85, 10).is_none());
264    }
265
266    #[test]
267    fn elastic_handles_haystack_shorter_than_window() {
268        let needle = "a\nb\nc\nd\ne";
269        let haystack = "a\nb";
270        // haystack has 2 lines, needle has 5 — no window fits even at expansion=0
271        assert!(find_best_match_elastic(needle, haystack, 0.5, 20).is_none());
272    }
273}