Skip to main content

tycode_core/file/
find.rs

1/// The result of a "find in file" operation that finds the closest match to a
2/// given vec of lines in a test.
3///
4/// When searching text, AIs (even opus!) tend to fail to make tool calls for
5/// file edits correctly (either by not understanding the tool, or getting
6/// spacing or something wrong in the search block). When this happens we return
7/// a helpful error "did you mean?" style error message to help guide them
8/// towards a valid tool call. Perhaps we could just accept the file if its
9/// "close enough".
10///
11/// We previously gave models back the incorrect text they gave us as the error
12/// message. Someone on reddit gave a good example of why this was dumb agent
13/// behavior:
14/// "DO NOT SAY APPLE. DEFINITELY DO NOT SAY APPLE. WHATEVER YOU SAY IN RESPONSE
15/// TO MY NEXT REQUEST DO NOT SAY APPLE. Ok, name a fruit."
16#[derive(Debug, Clone)]
17pub struct MatchResult {
18    pub matched_lines: Vec<String>,
19    pub start_index: usize,
20    /// Similarity score (0.0 = no match, 1.0 = perfect match)
21    pub similarity: f64,
22}
23
24impl MatchResult {
25    /// Returns correction feedback if the match is not exact
26    /// Returns None for perfect matches (similarity = 1.0)
27    pub fn get_correction_feedback(&self) -> Option<String> {
28        if self.similarity >= 1.0 {
29            return None;
30        }
31
32        let mut feedback = String::new();
33        feedback.push_str(&format!(
34            "Found closest match with {:.1}% similarity at line {}\n\n",
35            self.similarity * 100.0,
36            self.start_index + 1
37        ));
38        feedback.push_str("Closest match:\n");
39
40        for line in &self.matched_lines {
41            feedback.push_str(&format!("{line}\n"));
42        }
43
44        Some(feedback)
45    }
46}
47
48/// Find the closest matching section in source lines for the given search lines
49pub fn find_closest_match(source: Vec<String>, search: Vec<String>) -> Option<MatchResult> {
50    if search.is_empty() || source.is_empty() {
51        return None;
52    }
53
54    if search.len() > source.len() {
55        return None;
56    }
57
58    let mut best_match: Option<(usize, f64, Vec<String>)> = None;
59
60    // Slide window through source
61    for i in 0..=source.len() - search.len() {
62        let window: Vec<String> = source[i..i + search.len()].to_vec();
63        let similarity = calculate_similarity(&window, &search);
64
65        match &best_match {
66            None => best_match = Some((i, similarity, window)),
67            Some((_, best_sim, _)) if similarity > *best_sim => {
68                best_match = Some((i, similarity, window));
69            }
70            _ => {}
71        }
72    }
73
74    best_match.map(|(start_index, similarity, matched_lines)| MatchResult {
75        matched_lines,
76        start_index,
77        similarity,
78    })
79}
80
81fn calculate_similarity(window: &[String], search: &[String]) -> f64 {
82    let mut total_similarity = 0.0;
83
84    for (window_line, search_line) in window.iter().zip(search.iter()) {
85        let line_similarity = calculate_line_similarity(window_line, search_line);
86        total_similarity += line_similarity;
87    }
88
89    total_similarity / search.len() as f64
90}
91
92fn calculate_line_similarity(s1: &str, s2: &str) -> f64 {
93    if s1 == s2 {
94        return 1.0;
95    }
96
97    let distance = levenshtein_distance(s1, s2);
98    let max_len = s1.len().max(s2.len());
99
100    if max_len == 0 {
101        return 1.0;
102    }
103
104    1.0 - (distance as f64 / max_len as f64)
105}
106
107fn levenshtein_distance(s1: &str, s2: &str) -> usize {
108    let len1 = s1.len();
109    let len2 = s2.len();
110
111    if len1 == 0 {
112        return len2;
113    }
114    if len2 == 0 {
115        return len1;
116    }
117
118    let mut prev_row: Vec<usize> = (0..=len2).collect();
119    let mut curr_row = vec![0; len2 + 1];
120
121    for (i, c1) in s1.chars().enumerate() {
122        curr_row[0] = i + 1;
123
124        for (j, c2) in s2.chars().enumerate() {
125            let cost = if c1 == c2 { 0 } else { 1 };
126            curr_row[j + 1] = (prev_row[j + 1] + 1) // deletion
127                .min(curr_row[j] + 1) // insertion
128                .min(prev_row[j] + cost); // substitution
129        }
130
131        std::mem::swap(&mut prev_row, &mut curr_row);
132    }
133
134    prev_row[len2]
135}
136
137#[cfg(test)]
138mod tests {
139    use super::*;
140
141    #[test]
142    fn test_exact_match() {
143        let source = vec![
144            "line 1".to_string(),
145            "line 2".to_string(),
146            "line 3".to_string(),
147        ];
148        let search = vec!["line 2".to_string()];
149
150        let result = find_closest_match(source, search).unwrap();
151        assert_eq!(result.start_index, 1);
152        assert_eq!(result.similarity, 1.0);
153        assert_eq!(result.matched_lines, vec!["line 2".to_string()]);
154    }
155
156    #[test]
157    fn test_fuzzy_match() {
158        let source = vec![
159            "if ft.is_dir() {".to_string(),
160            " return true;".to_string(),
161            "}".to_string(),
162        ];
163        let search = vec![
164            "if ft.is_dir() {".to_string(),
165            " return true".to_string(), // Missing semicolon
166        ];
167
168        let result = find_closest_match(source, search).unwrap();
169        assert_eq!(result.start_index, 0);
170        assert!(result.similarity > 0.9);
171        assert_eq!(result.matched_lines[0], "if ft.is_dir() {");
172    }
173
174    #[test]
175    fn test_multiline_match() {
176        let source = vec![
177            "None => return false,".to_string(),
178            "Some(ft) => ft,".to_string(),
179            "};".to_string(),
180            "if ft.is_dir() {".to_string(),
181            "return true;".to_string(),
182            "}".to_string(),
183        ];
184        let search = vec![
185            "None => return false,".to_string(),
186            "Some(ft) => ft,".to_string(),
187            "};".to_string(),
188            "if ft.is_dir() {".to_string(),
189            "return true".to_string(),
190        ];
191
192        let result = find_closest_match(source.clone(), search).unwrap();
193        assert_eq!(result.start_index, 0);
194        assert!(result.similarity > 0.95);
195    }
196
197    #[test]
198    fn test_performance_full_file() {
199        let full_file = r#"/!
200Defines a builder for haystacks.
201A "haystack" represents something we want to search. It encapsulates the logic
202for whether a haystack ought to be searched or not, separate from the standard
203ignore rules and other filtering logic.
204Effectively, a haystack wraps a directory entry and adds some light application
205level logic around it.
206/
207use std::path::Path;
208/// A builder for constructing things to search over.
209#[derive(Clone, Debug)]
210pub(crate) struct HaystackBuilder {
211 strip_dot_prefix: bool,
212}
213impl HaystackBuilder {
214 /// Return a new haystack builder with a default configuration.
215 pub(crate) fn new() -> HaystackBuilder {
216 HaystackBuilder { strip_dot_prefix: false }
217 }
218 /// Create a new haystack from a possibly missing directory entry.
219 ///
220 /// If the directory entry isn't present, then the corresponding error is
221 /// logged if messages have been configured. Otherwise, if the directory
222 /// entry is deemed searchable, then it is returned as a haystack.
223 pub(crate) fn build_from_result(
224 &self,
225 result: Result<ignore::DirEntry, ignore::Error>,
226 ) -> Option<Haystack> {
227 match result {
228 Ok(dent) => self.build(dent),
229 Err(err) => {
230 err_message!("{err}");
231 None
232 }
233 }
234 }
235 /// Create a new haystack using this builder's configuration.
236 ///
237 /// If a directory entry could not be created or should otherwise not be
238 /// searched, then this returns None after emitting any relevant log
239 /// messages.
240 fn build(&self, dent: ignore::DirEntry) -> Option<Haystack> {
241 let hay = Haystack { dent, strip_dot_prefix: self.strip_dot_prefix };
242 if let Some(err) = hay.dent.error() {
243 ignore_message!("{err}");
244 }
245 // If this entry was explicitly provided by an end user, then we always
246 // want to search it.
247 if hay.is_explicit() {
248 return Some(hay);
249 }
250 // At this point, we only want to search something if it's explicitly a
251 // file. This omits symlinks. (If ripgrep was configured to follow
252 // symlinks, then they have already been followed by the directory
253 // traversal.)
254 if hay.is_file() {
255 return Some(hay);
256 }
257 // We got nothing. Emit a debug message, but only if this isn't a
258 // directory. Otherwise, emitting messages for directories is just
259 // noisy.
260 if !hay.is_dir() {
261 log::debug!(
262 "ignoring {}: failed to pass haystack filter: \
263 file type: {:?}, metadata: {:?}",
264 hay.dent.path().display(),
265 hay.dent.file_type(),
266 hay.dent.metadata()
267 );
268 }
269 None
270 }
271 /// When enabled, if the haystack's file path starts with ./ then it is
272 /// stripped.
273 ///
274 /// This is useful when implicitly searching the current working directory.
275 pub(crate) fn strip_dot_prefix(
276 &mut self,
277 yes: bool,
278 ) -> &mut HaystackBuilder {
279 self.strip_dot_prefix = yes;
280 self
281 }
282}
283/// A haystack is a thing we want to search.
284///
285/// Generally, a haystack is either a file or stdin.
286#[derive(Clone, Debug)]
287pub(crate) struct Haystack {
288 dent: ignore::DirEntry,
289 strip_dot_prefix: bool,
290}
291impl Haystack {
292 /// Return the file path corresponding to this haystack.
293 ///
294 /// If this haystack corresponds to stdin, then a special <stdin> path
295 /// is returned instead.
296 pub(crate) fn path(&self) -> &Path {
297 if self.strip_dot_prefix && self.dent.path().starts_with("./") {
298 self.dent.path().strip_prefix("./").unwrap()
299 } else {
300 self.dent.path()
301 }
302 }
303 /// Returns true if and only if this entry corresponds to stdin.
304 pub(crate) fn is_stdin(&self) -> bool {
305 self.dent.is_stdin()
306 }
307 /// Returns true if and only if this entry corresponds to a haystack to
308 /// search that was explicitly supplied by an end user.
309 ///
310 /// Generally, this corresponds to either stdin or an explicit file path
311 /// argument. e.g., in rg foo some-file ./some-dir/, some-file is
312 /// an explicit haystack, but, e.g., ./some-dir/some-other-file is not.
313 ///
314 /// However, note that ripgrep does not see through shell globbing. e.g.,
315 /// in rg foo ./some-dir/*, ./some-dir/some-other-file will be treated
316 /// as an explicit haystack.
317 pub(crate) fn is_explicit(&self) -> bool {
318 // stdin is obvious. When an entry has a depth of 0, that means it
319 // was explicitly provided to our directory iterator, which means it
320 // was in turn explicitly provided by the end user. The !is_dir check
321 // means that we want to search files even if their symlinks, again,
322 // because they were explicitly provided. (And we never want to try
323 // to search a directory.)
324 self.is_stdin() || (self.dent.depth() == 0 && !self.is_dir())
325 }
326 /// Returns true if and only if this haystack points to a directory after
327 /// following symbolic links.
328 fn is_dir(&self) -> bool {
329 let ft = match self.dent.file_type() {
330 None => return false,
331 Some(ft) => ft,
332 };
333 if ft.is_dir() {
334 return true;
335 }
336 // If this is a symlink, then we want to follow it to determine
337 // whether it's a directory or not.
338 self.dent.path_is_symlink() && self.dent.path().is_dir()
339 }
340 /// Returns true if and only if this haystack points to a file.
341 fn is_file(&self) -> bool {
342 self.dent.file_type().map_or(false, |ft| ft.is_file())
343 }
344}"#;
345
346        let source: Vec<String> = full_file.lines().map(|s| s.to_string()).collect();
347        let search = vec![
348            " None => return false,".to_string(),
349            " Some(ft) => ft,".to_string(),
350            " };".to_string(),
351            " if ft.is_dir() {".to_string(),
352            " return true;".to_string(),
353        ];
354
355        use std::time::Instant;
356        let start = Instant::now();
357        let result = find_closest_match(source.clone(), search).unwrap();
358        let duration = start.elapsed();
359
360        println!("Search took: {:?} for {} lines", duration, source.len());
361        assert_eq!(result.start_index, 131);
362        assert!(result.similarity > 0.95);
363        assert_eq!(result.matched_lines[4], " return true;");
364    }
365
366    #[test]
367    fn test_correction_feedback() {
368        let source = vec![
369            "let ft = match self.dent.file_type() {".to_string(),
370            " None => return false,".to_string(),
371            " Some(ft) => ft,".to_string(),
372            "};".to_string(),
373            "if ft.is_dir() {".to_string(),
374            " return true;".to_string(),
375            "}".to_string(),
376        ];
377
378        // Search with missing semicolon and wrong indentation
379        let search = vec![
380            "None => return false,".to_string(),
381            " Some(ft) => ft,".to_string(),
382            " };".to_string(),
383            " if ft.is_dir() {".to_string(),
384            " return true".to_string(),
385        ];
386
387        let result = find_closest_match(source, search).unwrap();
388        let feedback = result.get_correction_feedback();
389
390        // Should return Some feedback for imperfect match
391        assert!(feedback.is_some());
392
393        let feedback_text = feedback.unwrap();
394        assert!(feedback_text.contains("None => return false,"));
395        println!("Correction feedback:\n{feedback_text}");
396    }
397
398    #[test]
399    fn test_no_feedback_for_perfect_match() {
400        let source = vec![
401            "line 1".to_string(),
402            "line 2".to_string(),
403            "line 3".to_string(),
404        ];
405        let search = vec!["line 2".to_string()];
406
407        let result = find_closest_match(source, search).unwrap();
408        let feedback = result.get_correction_feedback();
409
410        // Should return None for perfect match
411        assert!(feedback.is_none());
412    }
413}