Skip to main content

harness_write/
matching.rs

1use crate::constants::{
2    CONTEXT_LINES, DEFAULT_FUZZY_LENGTH_TOLERANCE, DEFAULT_FUZZY_THRESHOLD, DEFAULT_FUZZY_TOP_K,
3};
4use crate::levenshtein::similarity;
5use crate::types::{FuzzyCandidate, MatchLocation};
6
7/// 1-based line of a byte offset in LF-normalized text.
8pub fn line_of_offset(text: &str, offset: usize) -> usize {
9    if offset == 0 {
10        return 1;
11    }
12    let mut line = 1usize;
13    let limit = offset.min(text.len());
14    for b in text.as_bytes()[..limit].iter() {
15        if *b == b'\n' {
16            line += 1;
17        }
18    }
19    line
20}
21
22fn split_lines(text: &str) -> Vec<&str> {
23    if text.is_empty() {
24        return Vec::new();
25    }
26    text.split('\n').collect()
27}
28
29fn context_around(
30    file_lines: &[&str],
31    first_line_1based: usize,
32    window_line_count: usize,
33) -> (Vec<String>, Vec<String>) {
34    let first_idx = first_line_1based.saturating_sub(1);
35    let last_idx = first_idx + window_line_count.saturating_sub(1);
36    let before_start = first_idx.saturating_sub(CONTEXT_LINES);
37    let before_end = first_idx;
38    let after_start = last_idx + 1;
39    let after_end = (last_idx + 1 + CONTEXT_LINES).min(file_lines.len());
40    let before = file_lines
41        .get(before_start..before_end)
42        .unwrap_or(&[])
43        .iter()
44        .map(|s| s.to_string())
45        .collect();
46    let after = file_lines
47        .get(after_start..after_end)
48        .unwrap_or(&[])
49        .iter()
50        .map(|s| s.to_string())
51        .collect();
52    (before, after)
53}
54
55/// Find all exact occurrences of `needle` in `haystack`. Returns byte offsets.
56pub fn find_all_occurrences(haystack: &str, needle: &str) -> Vec<usize> {
57    if needle.is_empty() {
58        return Vec::new();
59    }
60    let mut out = Vec::new();
61    let mut from = 0usize;
62    let hb = haystack.as_bytes();
63    let nb = needle.as_bytes();
64    while from + nb.len() <= hb.len() {
65        let slice = &hb[from..];
66        match find_subslice(slice, nb) {
67            Some(rel) => {
68                let abs = from + rel;
69                out.push(abs);
70                from = abs + nb.len();
71            }
72            None => break,
73        }
74    }
75    out
76}
77
78fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
79    if needle.is_empty() {
80        return Some(0);
81    }
82    if needle.len() > haystack.len() {
83        return None;
84    }
85    (0..=haystack.len() - needle.len()).find(|&i| &haystack[i..i + needle.len()] == needle)
86}
87
88pub fn build_match_locations(
89    file: &str,
90    needle: &str,
91    offsets: &[usize],
92) -> Vec<MatchLocation> {
93    let file_lines = split_lines(file);
94    let needle_line_count = split_lines(needle).len().max(1);
95    offsets
96        .iter()
97        .map(|&off| {
98            let first_line = line_of_offset(file, off);
99            let (before, after) = context_around(&file_lines, first_line, needle_line_count);
100            MatchLocation {
101                line: first_line,
102                preview: needle.to_string(),
103                context_before: before,
104                context_after: after,
105            }
106        })
107        .collect()
108}
109
110pub struct FuzzyOpts {
111    pub top_k: usize,
112    pub threshold: f64,
113    pub length_tolerance: f64,
114}
115
116impl Default for FuzzyOpts {
117    fn default() -> Self {
118        Self {
119            top_k: DEFAULT_FUZZY_TOP_K,
120            threshold: DEFAULT_FUZZY_THRESHOLD,
121            length_tolerance: DEFAULT_FUZZY_LENGTH_TOLERANCE,
122        }
123    }
124}
125
126pub fn find_fuzzy_candidates(file: &str, needle: &str, opts: FuzzyOpts) -> Vec<FuzzyCandidate> {
127    if file.is_empty() || needle.is_empty() {
128        return Vec::new();
129    }
130    let file_lines = split_lines(file);
131    let needle_lines = split_lines(needle);
132    let window_line_count = needle_lines.len().max(1);
133    if file_lines.len() < window_line_count {
134        return Vec::new();
135    }
136
137    let mut cands: Vec<(usize, f64, String)> = Vec::new();
138    for i in 0..=file_lines.len().saturating_sub(window_line_count) {
139        let window = file_lines[i..i + window_line_count].join("\n");
140        let max_len = window.len().max(needle.len());
141        if max_len > 0 {
142            let delta =
143                (window.len() as f64 - needle.len() as f64).abs() / max_len as f64;
144            if delta > opts.length_tolerance {
145                continue;
146            }
147        }
148        let score = similarity(&window, needle);
149        if score < opts.threshold {
150            continue;
151        }
152        if (score - 1.0).abs() < f64::EPSILON {
153            continue;
154        }
155        cands.push((i + 1, score, window));
156    }
157
158    cands.sort_by(|a, b| {
159        b.1.partial_cmp(&a.1)
160            .unwrap_or(std::cmp::Ordering::Equal)
161            .then_with(|| a.0.cmp(&b.0))
162    });
163
164    cands
165        .into_iter()
166        .take(opts.top_k)
167        .map(|(line, score, window_text)| {
168            let (before, after) = context_around(&file_lines, line, window_line_count);
169            FuzzyCandidate {
170                line,
171                score: (score * 100.0).round() / 100.0,
172                preview: window_text,
173                context_before: before,
174                context_after: after,
175            }
176        })
177        .collect()
178}
179
180/// Boundary-collision check: flag lines where `needle` sits adjacent to
181/// identifier characters — caller may be stomping on a longer identifier.
182pub fn substring_boundary_collisions(file: &str, needle: &str, offsets: &[usize]) -> Vec<usize> {
183    let mut flagged: Vec<usize> = Vec::new();
184    let fb = file.as_bytes();
185    let nb_len = needle.as_bytes().len();
186    for &off in offsets {
187        let before = if off > 0 { Some(fb[off - 1]) } else { None };
188        let after = fb.get(off + nb_len).copied();
189        if is_ident(before) || is_ident(after) {
190            flagged.push(line_of_offset(file, off));
191        }
192    }
193    flagged
194}
195
196fn is_ident(b: Option<u8>) -> bool {
197    match b {
198        Some(c) => matches!(c, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_'),
199        None => false,
200    }
201}