Skip to main content

aft/patch/
matcher.rs

1//! Patch-specific line-sequence matcher ported from the TypeScript apply_patch engine.
2//!
3//! This module intentionally does not reuse `fuzzy_match`: edit matching works in byte
4//! ranges, while apply_patch needs line indexes, EOF anchoring, and unique-only reflow.
5
6use std::collections::HashSet;
7
8/// Allow candidate reflow windows to differ by up to eight non-whitespace characters before exact normalized comparison.
9pub const REFLOW_NON_WS_TOLERANCE: usize = 8;
10
11#[derive(Debug, Clone, Copy, PartialEq, Eq)]
12pub enum MatchTier {
13    Exact,
14    Rstrip,
15    Trim,
16    Indent,
17    Unicode,
18    Reflow,
19}
20
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22pub struct SequenceMatch {
23    pub found: usize,
24    pub tier: MatchTier,
25    pub line_count: usize,
26}
27
28/// Convert smart quotes, dash variants, ellipsis, and NBSP to their ASCII forms; mirrors `patch-parser.ts:207-214`.
29pub fn normalize_unicode(input: &str) -> String {
30    let mut normalized = String::with_capacity(input.len());
31    for ch in input.chars() {
32        match ch {
33            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => normalized.push('\''),
34            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => normalized.push('"'),
35            '\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}' => {
36                normalized.push('-');
37            }
38            '\u{2026}' => normalized.push_str("..."),
39            '\u{00A0}' => normalized.push(' '),
40            _ => normalized.push(ch),
41        }
42    }
43    normalized
44}
45
46/// Replace a leading run of tabs and spaces with the same number of plain spaces; mirrors `patch-parser.ts:227-229`.
47pub fn normalize_indent(input: &str) -> String {
48    let mut leading_chars = 0;
49    let mut leading_bytes = 0;
50
51    for ch in input.chars() {
52        if ch != '\t' && ch != ' ' {
53            break;
54        }
55        leading_chars += 1;
56        leading_bytes += ch.len_utf8();
57    }
58
59    if leading_chars == 0 {
60        return input.to_owned();
61    }
62
63    let mut normalized = String::with_capacity(input.len());
64    normalized.push_str(&" ".repeat(leading_chars));
65    normalized.push_str(&input[leading_bytes..]);
66    normalized
67}
68
69/// Collapse every Unicode whitespace run to one space and trim the ends; mirrors `patch-parser.ts:233-235`.
70pub fn normalize_reflow_whitespace(input: &str) -> String {
71    let mut collapsed = String::with_capacity(input.len());
72    let mut in_whitespace = false;
73
74    for ch in input.chars() {
75        if ch.is_whitespace() {
76            if !in_whitespace {
77                collapsed.push(' ');
78                in_whitespace = true;
79            }
80        } else {
81            collapsed.push(ch);
82            in_whitespace = false;
83        }
84    }
85
86    collapsed.trim().to_owned()
87}
88
89/// Remove every Unicode whitespace character; mirrors `patch-parser.ts:237-239`.
90pub fn strip_reflow_whitespace(input: &str) -> String {
91    input.chars().filter(|ch| !ch.is_whitespace()).collect()
92}
93
94/// Return true when a line has any non-whitespace content; mirrors `patch-parser.ts:241-243`.
95pub fn has_reflow_content(input: &str) -> bool {
96    input.chars().any(|ch| !ch.is_whitespace())
97}
98
99fn matches_at<F>(lines: &[&str], pattern: &[&str], start: usize, compare: &F) -> bool
100where
101    F: Fn(&str, &str) -> bool,
102{
103    pattern
104        .iter()
105        .enumerate()
106        .all(|(offset, expected)| compare(lines[start + offset], expected))
107}
108
109/// Search for a full pattern with a caller-supplied comparator, optionally anchored at EOF; mirrors `patch-parser.ts:247-281`.
110pub fn try_match<F>(
111    lines: &[&str],
112    pattern: &[&str],
113    start_index: usize,
114    compare: F,
115    eof: bool,
116) -> Option<usize>
117where
118    F: Fn(&str, &str) -> bool,
119{
120    if pattern.is_empty() || pattern.len() > lines.len() {
121        return None;
122    }
123
124    if eof {
125        let from_end = lines.len() - pattern.len();
126        if from_end >= start_index && matches_at(lines, pattern, from_end, &compare) {
127            return Some(from_end);
128        }
129        return None;
130    }
131
132    let last_start = lines.len() - pattern.len();
133    if start_index > last_start {
134        return None;
135    }
136
137    (start_index..=last_start).find(|&start| matches_at(lines, pattern, start, &compare))
138}
139
140fn non_whitespace_unit_count(input: &str) -> usize {
141    // TypeScript uses UTF-16 code units for `.length`; Rust has no direct equivalent on `str`.
142    // The length check only bounds candidate windows before exact string equality, so counting
143    // Unicode scalar values keeps non-ASCII text from being over-weighted by UTF-8 byte length.
144    strip_reflow_whitespace(input).chars().count()
145}
146
147/// Find one unique whitespace-reflowed window, returning `(found_line, line_count)`; mirrors `patch-parser.ts:310-351`.
148pub fn find_reflow_match(
149    lines: &[&str],
150    pattern: &[&str],
151    start_index: usize,
152) -> Option<(usize, usize)> {
153    let needle_text = pattern.join("\n");
154    let normalized_needle = normalize_reflow_whitespace(&needle_text);
155    let needle_non_whitespace = strip_reflow_whitespace(&needle_text);
156    if normalized_needle.is_empty() || needle_non_whitespace.is_empty() {
157        return None;
158    }
159
160    let needle_non_whitespace_len = needle_non_whitespace.chars().count();
161    let min_non_whitespace = needle_non_whitespace_len.saturating_sub(REFLOW_NON_WS_TOLERANCE);
162    let max_non_whitespace = needle_non_whitespace_len + REFLOW_NON_WS_TOLERANCE;
163    let mut matches = Vec::new();
164    let mut seen = HashSet::new();
165
166    for start in start_index..lines.len() {
167        if !has_reflow_content(lines[start]) {
168            continue;
169        }
170
171        let mut window_non_whitespace_len = 0;
172        for end in (start + 1)..=lines.len() {
173            let line = lines[end - 1];
174            window_non_whitespace_len += non_whitespace_unit_count(line);
175
176            if window_non_whitespace_len > max_non_whitespace {
177                break;
178            }
179            if window_non_whitespace_len < min_non_whitespace {
180                continue;
181            }
182            if !has_reflow_content(line) {
183                continue;
184            }
185
186            let window_text = lines[start..end].join("\n");
187            let window_non_whitespace = strip_reflow_whitespace(&window_text);
188            if window_non_whitespace != needle_non_whitespace {
189                continue;
190            }
191            if normalize_reflow_whitespace(&window_text) != normalized_needle {
192                continue;
193            }
194
195            if seen.insert((start, end)) {
196                matches.push((start, end - start));
197            }
198        }
199    }
200
201    if matches.len() == 1 {
202        Some(matches[0])
203    } else {
204        None
205    }
206}
207
208/// Run the first-hit-wins Exact/Rstrip/Trim/Indent/Unicode/Reflow ladder; mirrors `patch-parser.ts:353-399`.
209pub fn seek_sequence_tiered(
210    lines: &[&str],
211    pattern: &[&str],
212    start_index: usize,
213    eof: bool,
214) -> Option<SequenceMatch> {
215    if pattern.is_empty() {
216        return None;
217    }
218
219    if let Some(found) = try_match(lines, pattern, start_index, |a, b| a == b, eof) {
220        return Some(SequenceMatch {
221            found,
222            tier: MatchTier::Exact,
223            line_count: pattern.len(),
224        });
225    }
226
227    if let Some(found) = try_match(
228        lines,
229        pattern,
230        start_index,
231        |a, b| a.trim_end() == b.trim_end(),
232        eof,
233    ) {
234        return Some(SequenceMatch {
235            found,
236            tier: MatchTier::Rstrip,
237            line_count: pattern.len(),
238        });
239    }
240
241    if let Some(found) = try_match(
242        lines,
243        pattern,
244        start_index,
245        |a, b| a.trim() == b.trim(),
246        eof,
247    ) {
248        return Some(SequenceMatch {
249            found,
250            tier: MatchTier::Trim,
251            line_count: pattern.len(),
252        });
253    }
254
255    if let Some(found) = try_match(
256        lines,
257        pattern,
258        start_index,
259        |a, b| normalize_indent(a).trim_end() == normalize_indent(b).trim_end(),
260        eof,
261    ) {
262        return Some(SequenceMatch {
263            found,
264            tier: MatchTier::Indent,
265            line_count: pattern.len(),
266        });
267    }
268
269    if let Some(found) = try_match(
270        lines,
271        pattern,
272        start_index,
273        |a, b| normalize_unicode(a.trim()) == normalize_unicode(b.trim()),
274        eof,
275    ) {
276        return Some(SequenceMatch {
277            found,
278            tier: MatchTier::Unicode,
279            line_count: pattern.len(),
280        });
281    }
282
283    if eof {
284        return None;
285    }
286
287    find_reflow_match(lines, pattern, start_index).map(|(found, line_count)| SequenceMatch {
288        found,
289        tier: MatchTier::Reflow,
290        line_count,
291    })
292}
293
294#[cfg(test)]
295mod tests {
296    use super::*;
297
298    fn assert_match(
299        actual: Option<SequenceMatch>,
300        found: usize,
301        tier: MatchTier,
302        line_count: usize,
303    ) {
304        assert_eq!(
305            actual,
306            Some(SequenceMatch {
307                found,
308                tier,
309                line_count,
310            })
311        );
312    }
313
314    #[test]
315    fn normalization_helpers_match_patch_parser_sources() {
316        assert_eq!(
317            normalize_unicode("‘’‚‛“”„‟‐‑‒–—―…\u{00A0}"),
318            "''''\"\"\"\"------... "
319        );
320        assert_eq!(normalize_indent("\t  alpha\t beta  "), "   alpha\t beta  ");
321        assert_eq!(normalize_indent(""), "");
322        assert_eq!(
323            normalize_reflow_whitespace(" \talpha\n\u{00A0} beta  "),
324            "alpha beta"
325        );
326        assert_eq!(
327            strip_reflow_whitespace(" \talpha\n\u{00A0} beta  "),
328            "alphabeta"
329        );
330        assert!(has_reflow_content("\u{00A0}x"));
331        assert!(!has_reflow_content(" \t\n"));
332    }
333
334    #[test]
335    fn exact_tier_wins_without_upgrading_to_later_tiers() {
336        assert_match(
337            seek_sequence_tiered(&["alpha", "beta"], &["beta"], 0, false),
338            1,
339            MatchTier::Exact,
340            1,
341        );
342    }
343
344    #[test]
345    fn rstrip_tier_wins_before_trim() {
346        assert_match(
347            seek_sequence_tiered(&["alpha   "], &["alpha"], 0, false),
348            0,
349            MatchTier::Rstrip,
350            1,
351        );
352    }
353
354    #[test]
355    fn trim_tier_wins_before_indent_and_unicode() {
356        assert_match(
357            seek_sequence_tiered(&["  alpha  "], &["alpha"], 0, false),
358            0,
359            MatchTier::Trim,
360            1,
361        );
362    }
363
364    #[test]
365    fn indent_normalization_matches_tab_space_drift_but_trim_shadows_the_tier() {
366        assert_eq!(normalize_indent("\treturn 42;"), " return 42;");
367        assert_eq!(normalize_indent(" return 42;"), " return 42;");
368        assert_eq!(
369            try_match(
370                &["\treturn 42;"],
371                &[" return 42;"],
372                0,
373                |a, b| normalize_indent(a).trim_end() == normalize_indent(b).trim_end(),
374                false,
375            ),
376            Some(0)
377        );
378        // Expect Trim, not Indent, for tab-vs-space input: a leading tab-vs-space drift
379        // is already accepted by the earlier trim tier, so the nominal indent tier is shadowed.
380        assert_match(
381            seek_sequence_tiered(&["\treturn 42;"], &["    return 42;"], 0, false),
382            0,
383            MatchTier::Trim,
384            1,
385        );
386    }
387
388    #[test]
389    fn unicode_tier_normalizes_smart_punctuation_after_stricter_tiers_fail() {
390        assert_match(
391            seek_sequence_tiered(
392                &["const label = “alpha”—beta…;"],
393                &["const label = \"alpha\"-beta...;"],
394                0,
395                false,
396            ),
397            0,
398            MatchTier::Unicode,
399            1,
400        );
401    }
402
403    #[test]
404    fn reflow_tier_matches_one_line_hunk_against_three_line_formatter_split() {
405        let lines = [
406            "function demo() {",
407            "  const value = alpha +",
408            "    beta +",
409            "    gamma;",
410            "  return value;",
411            "}",
412        ];
413        let pattern = ["  const value = alpha + beta + gamma;"];
414
415        assert_match(
416            seek_sequence_tiered(&lines, &pattern, 0, false),
417            1,
418            MatchTier::Reflow,
419            3,
420        );
421    }
422
423    #[test]
424    fn rejects_ambiguous_reflow_matches_instead_of_choosing_a_window() {
425        // Reject a reflow match when the pattern could match more than one distinct window.
426        let lines = [
427            "const value = alpha +",
428            "  beta +",
429            "  gamma;",
430            "",
431            "const value = alpha +",
432            "  beta +",
433            "  gamma;",
434        ];
435        let pattern = ["const value = alpha + beta + gamma;"];
436
437        assert_eq!(find_reflow_match(&lines, &pattern, 0), None);
438        assert_eq!(seek_sequence_tiered(&lines, &pattern, 0, false), None);
439    }
440
441    #[test]
442    fn uses_line_contiguous_match_before_considering_reflow_candidate() {
443        // A line-contiguous match wins before any reflow candidate is considered.
444        let lines = [
445            "const value = alpha +",
446            "  beta +",
447            "  gamma;",
448            "const value = alpha + beta + gamma;",
449        ];
450        let pattern = ["const value = alpha + beta + gamma;"];
451
452        assert_match(
453            seek_sequence_tiered(&lines, &pattern, 0, false),
454            3,
455            MatchTier::Exact,
456            1,
457        );
458    }
459
460    #[test]
461    fn eof_hunk_only_matches_the_tail_and_never_forward_scans() {
462        // EOF-anchored hunks match only the tail and never forward-scan.
463        let pattern = ["marker", "old"];
464
465        assert_match(
466            seek_sequence_tiered(
467                &["header", "marker", "old", "middle", "marker", "old"],
468                &pattern,
469                0,
470                true,
471            ),
472            4,
473            MatchTier::Exact,
474            2,
475        );
476        assert_eq!(
477            seek_sequence_tiered(
478                &["header", "marker", "old", "middle", "marker", "changed"],
479                &pattern,
480                0,
481                true,
482            ),
483            None
484        );
485    }
486
487    #[test]
488    fn eof_hunk_skips_reflow_even_when_the_tail_would_reflow_match() {
489        let lines = ["header", "const value = alpha +", "  beta +", "  gamma;"];
490        let pattern = ["const value = alpha + beta + gamma;"];
491
492        assert_eq!(find_reflow_match(&lines, &pattern, 0), Some((1, 3)));
493        assert_eq!(seek_sequence_tiered(&lines, &pattern, 0, true), None);
494    }
495
496    #[test]
497    fn try_match_honors_start_index_for_forward_scans_and_eof_anchor() {
498        assert_eq!(
499            try_match(&["a", "b", "a", "b"], &["a", "b"], 1, |a, b| a == b, false),
500            Some(2)
501        );
502        assert_eq!(
503            try_match(&["a", "b", "a", "b"], &["a", "b"], 3, |a, b| a == b, false),
504            None
505        );
506        assert_eq!(
507            try_match(&["a", "b", "a", "b"], &["a", "b"], 3, |a, b| a == b, true),
508            None
509        );
510    }
511}