1#[derive(Debug, Clone)]
17pub struct MatchResult {
18 pub matched_lines: Vec<String>,
19 pub start_index: usize,
20 pub similarity: f64,
22}
23
24impl MatchResult {
25 pub fn get_correction_feedback(&self) -> Option<String> {
28 if self.similarity >= 1.0 {
29 return None;
30 }
31
32 let mut feedback = String::new();
33 feedback.push_str(&format!(
34 "Found closest match with {:.1}% similarity at line {}\n\n",
35 self.similarity * 100.0,
36 self.start_index + 1
37 ));
38 feedback.push_str("Closest match:\n");
39
40 for line in &self.matched_lines {
41 feedback.push_str(&format!("{line}\n"));
42 }
43
44 Some(feedback)
45 }
46}
47
48pub fn find_closest_match(source: Vec<String>, search: Vec<String>) -> Option<MatchResult> {
50 if search.is_empty() || source.is_empty() {
51 return None;
52 }
53
54 if search.len() > source.len() {
55 return None;
56 }
57
58 let mut best_match: Option<(usize, f64, Vec<String>)> = None;
59
60 for i in 0..=source.len() - search.len() {
62 let window: Vec<String> = source[i..i + search.len()].to_vec();
63 let similarity = calculate_similarity(&window, &search);
64
65 match &best_match {
66 None => best_match = Some((i, similarity, window)),
67 Some((_, best_sim, _)) if similarity > *best_sim => {
68 best_match = Some((i, similarity, window));
69 }
70 _ => {}
71 }
72 }
73
74 best_match.map(|(start_index, similarity, matched_lines)| MatchResult {
75 matched_lines,
76 start_index,
77 similarity,
78 })
79}
80
81fn calculate_similarity(window: &[String], search: &[String]) -> f64 {
82 let mut total_similarity = 0.0;
83
84 for (window_line, search_line) in window.iter().zip(search.iter()) {
85 let line_similarity = calculate_line_similarity(window_line, search_line);
86 total_similarity += line_similarity;
87 }
88
89 total_similarity / search.len() as f64
90}
91
92fn calculate_line_similarity(s1: &str, s2: &str) -> f64 {
93 if s1 == s2 {
94 return 1.0;
95 }
96
97 let distance = levenshtein_distance(s1, s2);
98 let max_len = s1.len().max(s2.len());
99
100 if max_len == 0 {
101 return 1.0;
102 }
103
104 1.0 - (distance as f64 / max_len as f64)
105}
106
107fn levenshtein_distance(s1: &str, s2: &str) -> usize {
108 let len1 = s1.len();
109 let len2 = s2.len();
110
111 if len1 == 0 {
112 return len2;
113 }
114 if len2 == 0 {
115 return len1;
116 }
117
118 let mut prev_row: Vec<usize> = (0..=len2).collect();
119 let mut curr_row = vec![0; len2 + 1];
120
121 for (i, c1) in s1.chars().enumerate() {
122 curr_row[0] = i + 1;
123
124 for (j, c2) in s2.chars().enumerate() {
125 let cost = if c1 == c2 { 0 } else { 1 };
126 curr_row[j + 1] = (prev_row[j + 1] + 1) .min(curr_row[j] + 1) .min(prev_row[j] + cost); }
130
131 std::mem::swap(&mut prev_row, &mut curr_row);
132 }
133
134 prev_row[len2]
135}
136
137#[cfg(test)]
138mod tests {
139 use super::*;
140
141 #[test]
142 fn test_exact_match() {
143 let source = vec![
144 "line 1".to_string(),
145 "line 2".to_string(),
146 "line 3".to_string(),
147 ];
148 let search = vec!["line 2".to_string()];
149
150 let result = find_closest_match(source, search).unwrap();
151 assert_eq!(result.start_index, 1);
152 assert_eq!(result.similarity, 1.0);
153 assert_eq!(result.matched_lines, vec!["line 2".to_string()]);
154 }
155
156 #[test]
157 fn test_fuzzy_match() {
158 let source = vec![
159 "if ft.is_dir() {".to_string(),
160 " return true;".to_string(),
161 "}".to_string(),
162 ];
163 let search = vec![
164 "if ft.is_dir() {".to_string(),
165 " return true".to_string(), ];
167
168 let result = find_closest_match(source, search).unwrap();
169 assert_eq!(result.start_index, 0);
170 assert!(result.similarity > 0.9);
171 assert_eq!(result.matched_lines[0], "if ft.is_dir() {");
172 }
173
174 #[test]
175 fn test_multiline_match() {
176 let source = vec![
177 "None => return false,".to_string(),
178 "Some(ft) => ft,".to_string(),
179 "};".to_string(),
180 "if ft.is_dir() {".to_string(),
181 "return true;".to_string(),
182 "}".to_string(),
183 ];
184 let search = vec![
185 "None => return false,".to_string(),
186 "Some(ft) => ft,".to_string(),
187 "};".to_string(),
188 "if ft.is_dir() {".to_string(),
189 "return true".to_string(),
190 ];
191
192 let result = find_closest_match(source.clone(), search).unwrap();
193 assert_eq!(result.start_index, 0);
194 assert!(result.similarity > 0.95);
195 }
196
197 #[test]
198 fn test_performance_full_file() {
199 let full_file = r#"/!
200Defines a builder for haystacks.
201A "haystack" represents something we want to search. It encapsulates the logic
202for whether a haystack ought to be searched or not, separate from the standard
203ignore rules and other filtering logic.
204Effectively, a haystack wraps a directory entry and adds some light application
205level logic around it.
206/
207use std::path::Path;
208/// A builder for constructing things to search over.
209#[derive(Clone, Debug)]
210pub(crate) struct HaystackBuilder {
211 strip_dot_prefix: bool,
212}
213impl HaystackBuilder {
214 /// Return a new haystack builder with a default configuration.
215 pub(crate) fn new() -> HaystackBuilder {
216 HaystackBuilder { strip_dot_prefix: false }
217 }
218 /// Create a new haystack from a possibly missing directory entry.
219 ///
220 /// If the directory entry isn't present, then the corresponding error is
221 /// logged if messages have been configured. Otherwise, if the directory
222 /// entry is deemed searchable, then it is returned as a haystack.
223 pub(crate) fn build_from_result(
224 &self,
225 result: Result<ignore::DirEntry, ignore::Error>,
226 ) -> Option<Haystack> {
227 match result {
228 Ok(dent) => self.build(dent),
229 Err(err) => {
230 err_message!("{err}");
231 None
232 }
233 }
234 }
235 /// Create a new haystack using this builder's configuration.
236 ///
237 /// If a directory entry could not be created or should otherwise not be
238 /// searched, then this returns None after emitting any relevant log
239 /// messages.
240 fn build(&self, dent: ignore::DirEntry) -> Option<Haystack> {
241 let hay = Haystack { dent, strip_dot_prefix: self.strip_dot_prefix };
242 if let Some(err) = hay.dent.error() {
243 ignore_message!("{err}");
244 }
245 // If this entry was explicitly provided by an end user, then we always
246 // want to search it.
247 if hay.is_explicit() {
248 return Some(hay);
249 }
250 // At this point, we only want to search something if it's explicitly a
251 // file. This omits symlinks. (If ripgrep was configured to follow
252 // symlinks, then they have already been followed by the directory
253 // traversal.)
254 if hay.is_file() {
255 return Some(hay);
256 }
257 // We got nothing. Emit a debug message, but only if this isn't a
258 // directory. Otherwise, emitting messages for directories is just
259 // noisy.
260 if !hay.is_dir() {
261 log::debug!(
262 "ignoring {}: failed to pass haystack filter: \
263 file type: {:?}, metadata: {:?}",
264 hay.dent.path().display(),
265 hay.dent.file_type(),
266 hay.dent.metadata()
267 );
268 }
269 None
270 }
271 /// When enabled, if the haystack's file path starts with ./ then it is
272 /// stripped.
273 ///
274 /// This is useful when implicitly searching the current working directory.
275 pub(crate) fn strip_dot_prefix(
276 &mut self,
277 yes: bool,
278 ) -> &mut HaystackBuilder {
279 self.strip_dot_prefix = yes;
280 self
281 }
282}
283/// A haystack is a thing we want to search.
284///
285/// Generally, a haystack is either a file or stdin.
286#[derive(Clone, Debug)]
287pub(crate) struct Haystack {
288 dent: ignore::DirEntry,
289 strip_dot_prefix: bool,
290}
291impl Haystack {
292 /// Return the file path corresponding to this haystack.
293 ///
294 /// If this haystack corresponds to stdin, then a special <stdin> path
295 /// is returned instead.
296 pub(crate) fn path(&self) -> &Path {
297 if self.strip_dot_prefix && self.dent.path().starts_with("./") {
298 self.dent.path().strip_prefix("./").unwrap()
299 } else {
300 self.dent.path()
301 }
302 }
303 /// Returns true if and only if this entry corresponds to stdin.
304 pub(crate) fn is_stdin(&self) -> bool {
305 self.dent.is_stdin()
306 }
307 /// Returns true if and only if this entry corresponds to a haystack to
308 /// search that was explicitly supplied by an end user.
309 ///
310 /// Generally, this corresponds to either stdin or an explicit file path
311 /// argument. e.g., in rg foo some-file ./some-dir/, some-file is
312 /// an explicit haystack, but, e.g., ./some-dir/some-other-file is not.
313 ///
314 /// However, note that ripgrep does not see through shell globbing. e.g.,
315 /// in rg foo ./some-dir/*, ./some-dir/some-other-file will be treated
316 /// as an explicit haystack.
317 pub(crate) fn is_explicit(&self) -> bool {
318 // stdin is obvious. When an entry has a depth of 0, that means it
319 // was explicitly provided to our directory iterator, which means it
320 // was in turn explicitly provided by the end user. The !is_dir check
321 // means that we want to search files even if their symlinks, again,
322 // because they were explicitly provided. (And we never want to try
323 // to search a directory.)
324 self.is_stdin() || (self.dent.depth() == 0 && !self.is_dir())
325 }
326 /// Returns true if and only if this haystack points to a directory after
327 /// following symbolic links.
328 fn is_dir(&self) -> bool {
329 let ft = match self.dent.file_type() {
330 None => return false,
331 Some(ft) => ft,
332 };
333 if ft.is_dir() {
334 return true;
335 }
336 // If this is a symlink, then we want to follow it to determine
337 // whether it's a directory or not.
338 self.dent.path_is_symlink() && self.dent.path().is_dir()
339 }
340 /// Returns true if and only if this haystack points to a file.
341 fn is_file(&self) -> bool {
342 self.dent.file_type().map_or(false, |ft| ft.is_file())
343 }
344}"#;
345
346 let source: Vec<String> = full_file.lines().map(|s| s.to_string()).collect();
347 let search = vec![
348 " None => return false,".to_string(),
349 " Some(ft) => ft,".to_string(),
350 " };".to_string(),
351 " if ft.is_dir() {".to_string(),
352 " return true;".to_string(),
353 ];
354
355 use std::time::Instant;
356 let start = Instant::now();
357 let result = find_closest_match(source.clone(), search).unwrap();
358 let duration = start.elapsed();
359
360 println!("Search took: {:?} for {} lines", duration, source.len());
361 assert_eq!(result.start_index, 131);
362 assert!(result.similarity > 0.95);
363 assert_eq!(result.matched_lines[4], " return true;");
364 }
365
366 #[test]
367 fn test_correction_feedback() {
368 let source = vec![
369 "let ft = match self.dent.file_type() {".to_string(),
370 " None => return false,".to_string(),
371 " Some(ft) => ft,".to_string(),
372 "};".to_string(),
373 "if ft.is_dir() {".to_string(),
374 " return true;".to_string(),
375 "}".to_string(),
376 ];
377
378 let search = vec![
380 "None => return false,".to_string(),
381 " Some(ft) => ft,".to_string(),
382 " };".to_string(),
383 " if ft.is_dir() {".to_string(),
384 " return true".to_string(),
385 ];
386
387 let result = find_closest_match(source, search).unwrap();
388 let feedback = result.get_correction_feedback();
389
390 assert!(feedback.is_some());
392
393 let feedback_text = feedback.unwrap();
394 assert!(feedback_text.contains("None => return false,"));
395 println!("Correction feedback:\n{feedback_text}");
396 }
397
398 #[test]
399 fn test_no_feedback_for_perfect_match() {
400 let source = vec![
401 "line 1".to_string(),
402 "line 2".to_string(),
403 "line 3".to_string(),
404 ];
405 let search = vec!["line 2".to_string()];
406
407 let result = find_closest_match(source, search).unwrap();
408 let feedback = result.get_correction_feedback();
409
410 assert!(feedback.is_none());
412 }
413}