Skip to main content

vtcode_file_search/
lib.rs

1//! Fast fuzzy file search library for VT Code.
2//!
3//! Uses the `ignore` crate (same as ripgrep) for parallel directory traversal
4//! and `nucleo-matcher` for fuzzy matching.
5//!
6//! # Example
7//!
8//! ```ignore
9//! use std::num::NonZero;
10//! use std::path::Path;
11//! use std::sync::Arc;
12//! use std::sync::atomic::AtomicBool;
13//! use vtcode_file_search::run;
14//!
15//! let results = run(
16//!     "main",
17//!     NonZero::new(100).unwrap(),
18//!     Path::new("."),
19//!     vec![],
20//!     NonZero::new(4).unwrap(),
21//!     Arc::new(AtomicBool::new(false)),
22//!     false,
23//!     true,
24//! )?;
25//!
26//! for m in results.matches {
27//!     println!("{}: {}", m.path, m.score);
28//! }
29//! # Ok::<(), anyhow::Error>(())
30//! ```
31
32use parking_lot::Mutex;
33use serde::{Deserialize, Serialize};
34use std::cmp::Reverse;
35use std::collections::BinaryHeap;
36use std::num::NonZero;
37use std::sync::Arc;
38use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
39
40/// A single file match result.
41///
42/// Fields:
43/// - `score`: Relevance score from fuzzy matching (higher is better)
44/// - `path`: Path relative to the search directory
45/// - `match_type`: Whether the match is a file or directory
46/// - `indices`: Optional character positions for highlighting matched characters
47#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
48#[serde(rename_all = "lowercase")]
49pub enum MatchType {
50    File,
51    Directory,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct FileMatch {
56    pub score: u32,
57    pub path: String,
58    pub match_type: MatchType,
59    #[serde(skip_serializing_if = "Option::is_none")]
60    pub indices: Option<Vec<u32>>,
61}
62
63/// Complete search results with total match count.
64#[derive(Debug)]
65pub struct FileSearchResults {
66    pub matches: Vec<FileMatch>,
67    pub total_match_count: usize,
68}
69
70/// Configuration for file search operations.
71pub struct FileSearchConfig {
72    pub pattern_text: String,
73    pub limit: NonZero<usize>,
74    pub search_directory: std::path::PathBuf,
75    pub exclude: Vec<String>,
76    pub threads: NonZero<usize>,
77    pub cancel_flag: Arc<AtomicBool>,
78    pub compute_indices: bool,
79    pub respect_gitignore: bool,
80}
81
82pub use vtcode_commons::paths::file_name_from_path;
83
84/// Best matches list per worker thread (lock-free collection).
85///
86/// Each worker thread gets its own instance to avoid locking during
87/// directory traversal. Results are merged at the end.
88struct BestMatchesList {
89    matches: BinaryHeap<Reverse<(u32, String, MatchType)>>,
90    limit: usize,
91    matcher: nucleo_matcher::Matcher,
92    haystack_buf: Vec<char>,
93    pattern_buf: Vec<char>,
94    pattern_text: String,
95}
96
97impl BestMatchesList {
98    fn new(limit: usize, pattern_text: &str) -> Self {
99        Self {
100            matches: BinaryHeap::new(),
101            limit,
102            matcher: nucleo_matcher::Matcher::new(nucleo_matcher::Config::DEFAULT),
103            haystack_buf: Vec::with_capacity(256),
104            pattern_buf: Vec::with_capacity(pattern_text.len()),
105            pattern_text: pattern_text.to_string(),
106        }
107    }
108
109    /// Record a matching path while preserving the worker-local top-K heap.
110    ///
111    /// Returns true when the path matches the search pattern, even if it
112    /// does not survive the top-K cutoff.
113    fn record_match(&mut self, path: &str, match_type: MatchType) -> bool {
114        let haystack = nucleo_matcher::Utf32Str::new(path, &mut self.haystack_buf);
115        let needle = nucleo_matcher::Utf32Str::new(&self.pattern_text, &mut self.pattern_buf);
116        let Some(score) = self.matcher.fuzzy_match(haystack, needle) else {
117            return false;
118        };
119
120        push_top_match(
121            &mut self.matches,
122            self.limit,
123            score as u32,
124            path.to_string(),
125            match_type,
126        );
127        true
128    }
129}
130
131fn push_top_match(
132    matches: &mut BinaryHeap<Reverse<(u32, String, MatchType)>>,
133    limit: usize,
134    score: u32,
135    path: String,
136    match_type: MatchType,
137) -> bool {
138    if matches.len() < limit {
139        matches.push(Reverse((score, path, match_type)));
140        return true;
141    }
142
143    let Some(min_score) = matches.peek().map(|entry| entry.0.0) else {
144        return false;
145    };
146
147    if score <= min_score {
148        return false;
149    }
150
151    matches.pop();
152    matches.push(Reverse((score, path, match_type)));
153    true
154}
155
156/// Run fuzzy file search with parallel traversal.
157///
158/// # Arguments
159///
160/// * `config` - File search configuration containing all search parameters
161///
162/// # Returns
163///
164/// FileSearchResults containing matched files and total match count.
165pub fn run(config: FileSearchConfig) -> anyhow::Result<FileSearchResults> {
166    let limit = config.limit.get();
167    let search_directory = &config.search_directory;
168    let exclude = &config.exclude;
169    let threads = config.threads.get();
170    let cancel_flag = &config.cancel_flag;
171    let compute_indices = config.compute_indices;
172    let respect_gitignore = config.respect_gitignore;
173    // Store pattern text for cloning across threads
174    // (Pattern is parsed per-match to work with Utf32Str)
175
176    // Build the directory walker
177    let mut walk_builder = ignore::WalkBuilder::new(search_directory);
178    walk_builder
179        .threads(threads)
180        .hidden(false)
181        .follow_links(true)
182        .require_git(false);
183
184    if !respect_gitignore {
185        walk_builder
186            .git_ignore(false)
187            .git_global(false)
188            .git_exclude(false)
189            .ignore(false)
190            .parents(false);
191    }
192
193    // Add exclusion patterns
194    if !exclude.is_empty() {
195        let mut override_builder = ignore::overrides::OverrideBuilder::new(search_directory);
196        for exclude_pattern in exclude {
197            let pattern = format!("!{}", exclude_pattern);
198            override_builder.add(&pattern)?;
199        }
200        let override_matcher = override_builder.build()?;
201        walk_builder.overrides(override_matcher);
202    }
203
204    let walker = walk_builder.build_parallel();
205
206    // Create per-worker result collection using Arc + Mutex for thread safety
207    let best_matchers_per_worker: Vec<Arc<Mutex<BestMatchesList>>> = (0..threads)
208        .map(|_| {
209            Arc::new(Mutex::new(BestMatchesList::new(
210                limit,
211                &config.pattern_text,
212            )))
213        })
214        .collect();
215
216    let index_counter = AtomicUsize::new(0);
217    let total_match_count = Arc::new(AtomicUsize::new(0));
218
219    // Run parallel traversal
220    walker.run(|| {
221        let worker_id =
222            index_counter.fetch_add(1, Ordering::Relaxed) % best_matchers_per_worker.len();
223        let best_list = best_matchers_per_worker[worker_id].clone();
224        let cancel_flag_clone = cancel_flag.clone();
225        let total_match_count_clone = total_match_count.clone();
226
227        Box::new(move |result| {
228            // Check cancellation flag periodically
229            if cancel_flag_clone.load(Ordering::Relaxed) {
230                return ignore::WalkState::Quit;
231            }
232
233            let entry = match result {
234                Ok(e) => e,
235                Err(_) => return ignore::WalkState::Continue,
236            };
237
238            let path = match entry.path().to_str() {
239                Some(p) => p,
240                None => return ignore::WalkState::Continue,
241            };
242            let match_type = if entry.path().is_dir() {
243                MatchType::Directory
244            } else {
245                MatchType::File
246            };
247
248            // Try to add to results
249            {
250                let mut list = best_list.lock();
251                if list.record_match(path, match_type) {
252                    total_match_count_clone.fetch_add(1, Ordering::Relaxed);
253                }
254            }
255
256            ignore::WalkState::Continue
257        })
258    });
259
260    // Merge worker-local top-K heaps into one final top-K heap.
261    let mut merged_matches = BinaryHeap::with_capacity(limit);
262    for arc in best_matchers_per_worker {
263        let mut list = arc.lock();
264        for Reverse((score, path, match_type)) in std::mem::take(&mut list.matches).into_vec() {
265            push_top_match(&mut merged_matches, limit, score, path, match_type);
266        }
267    }
268
269    // Build final results
270    let matches = merged_matches
271        .into_sorted_vec()
272        .into_iter()
273        .map(|Reverse((score, path, match_type))| FileMatch {
274            score,
275            path,
276            match_type,
277            indices: if compute_indices {
278                Some(Vec::new())
279            } else {
280                None
281            },
282        })
283        .collect();
284
285    Ok(FileSearchResults {
286        matches,
287        total_match_count: total_match_count.load(Ordering::Relaxed),
288    })
289}
290
291#[cfg(test)]
292mod tests {
293    use super::*;
294    use std::fs;
295    use tempfile::TempDir;
296
297    #[test]
298    fn test_file_name_from_path() {
299        assert_eq!(file_name_from_path("src/main.rs"), "main.rs");
300        assert_eq!(file_name_from_path("Cargo.toml"), "Cargo.toml");
301        assert_eq!(file_name_from_path("/absolute/path/file.txt"), "file.txt");
302        assert_eq!(file_name_from_path("file.txt"), "file.txt");
303        assert_eq!(file_name_from_path(""), "");
304    }
305
306    #[test]
307    fn test_run_search() -> anyhow::Result<()> {
308        let temp = TempDir::new()?;
309        fs::write(temp.path().join("hello.rs"), "fn main() {}")?;
310        fs::write(temp.path().join("world.txt"), "world")?;
311
312        let results = run(FileSearchConfig {
313            pattern_text: "hello".to_string(),
314            limit: NonZero::new(10).unwrap(),
315            search_directory: temp.path().to_path_buf(),
316            exclude: vec![],
317            threads: NonZero::new(1).unwrap(),
318            cancel_flag: Arc::new(AtomicBool::new(false)),
319            compute_indices: false,
320            respect_gitignore: false,
321        })?;
322
323        assert_eq!(results.matches.len(), 1);
324        assert!(results.matches[0].path.contains("hello"));
325        assert_eq!(results.matches[0].match_type, MatchType::File);
326
327        Ok(())
328    }
329
330    #[test]
331    fn test_multiple_matches() -> anyhow::Result<()> {
332        let temp = TempDir::new()?;
333        fs::write(temp.path().join("test1.rs"), "")?;
334        fs::write(temp.path().join("test2.rs"), "")?;
335        fs::write(temp.path().join("test3.rs"), "")?;
336        fs::write(temp.path().join("other.txt"), "")?;
337
338        let results = run(FileSearchConfig {
339            pattern_text: "test".to_string(),
340            limit: NonZero::new(10).unwrap(),
341            search_directory: temp.path().to_path_buf(),
342            exclude: vec![],
343            threads: NonZero::new(2).unwrap(),
344            cancel_flag: Arc::new(AtomicBool::new(false)),
345            compute_indices: false,
346            respect_gitignore: false,
347        })?;
348
349        assert_eq!(results.matches.len(), 3);
350        assert!(results.matches.iter().all(|m| m.path.contains("test")));
351        assert!(
352            results
353                .matches
354                .iter()
355                .all(|m| matches!(m.match_type, MatchType::File))
356        );
357
358        Ok(())
359    }
360
361    #[test]
362    fn test_limit_is_respected_across_workers() -> anyhow::Result<()> {
363        let temp = TempDir::new()?;
364        for name in ["alpha.rs", "alphabet.rs", "alphanumeric.rs", "alpaca.rs"] {
365            fs::write(temp.path().join(name), "")?;
366        }
367
368        let results = run(FileSearchConfig {
369            pattern_text: "alpha".to_string(),
370            limit: NonZero::new(2).unwrap(),
371            search_directory: temp.path().to_path_buf(),
372            exclude: vec![],
373            threads: NonZero::new(4).unwrap(),
374            cancel_flag: Arc::new(AtomicBool::new(false)),
375            compute_indices: false,
376            respect_gitignore: false,
377        })?;
378
379        assert_eq!(results.matches.len(), 2);
380        assert!(
381            results
382                .matches
383                .windows(2)
384                .all(|window| window[0].score >= window[1].score)
385        );
386
387        Ok(())
388    }
389
390    #[test]
391    fn test_exclusion_patterns() -> anyhow::Result<()> {
392        let temp = TempDir::new()?;
393        fs::write(temp.path().join("keep.rs"), "")?;
394        fs::create_dir(temp.path().join("target"))?;
395        fs::write(temp.path().join("target/ignore.rs"), "")?;
396
397        let results = run(FileSearchConfig {
398            pattern_text: "rs".to_string(),
399            limit: NonZero::new(10).unwrap(),
400            search_directory: temp.path().to_path_buf(),
401            exclude: vec!["target/**".to_string()],
402            threads: NonZero::new(2).unwrap(),
403            cancel_flag: Arc::new(AtomicBool::new(false)),
404            compute_indices: false,
405            respect_gitignore: false,
406        })?;
407
408        assert_eq!(results.matches.len(), 1);
409        assert!(results.matches[0].path.contains("keep.rs"));
410        assert_eq!(results.matches[0].match_type, MatchType::File);
411
412        Ok(())
413    }
414
415    #[test]
416    fn test_cancellation() -> anyhow::Result<()> {
417        let temp = TempDir::new()?;
418        for i in 0..10 {
419            fs::write(temp.path().join(format!("file{}.rs", i)), "")?;
420        }
421
422        let cancel_flag = Arc::new(AtomicBool::new(true));
423        let results = run(FileSearchConfig {
424            pattern_text: "file".to_string(),
425            limit: NonZero::new(10).unwrap(),
426            search_directory: temp.path().to_path_buf(),
427            exclude: vec![],
428            threads: NonZero::new(1).unwrap(),
429            cancel_flag,
430            compute_indices: false,
431            respect_gitignore: false,
432        })?;
433
434        // Should return early due to cancellation
435        assert!(results.matches.is_empty());
436
437        Ok(())
438    }
439
440    #[test]
441    fn test_directory_matches_are_returned() -> anyhow::Result<()> {
442        let temp = TempDir::new()?;
443        fs::create_dir_all(temp.path().join("docs/guides"))?;
444        fs::write(temp.path().join("docs/guides/intro.md"), "intro")?;
445        fs::write(temp.path().join("docs/readme.md"), "readme")?;
446
447        let results = run(FileSearchConfig {
448            pattern_text: "guides".to_string(),
449            limit: NonZero::new(10).unwrap(),
450            search_directory: temp.path().to_path_buf(),
451            exclude: vec![],
452            threads: NonZero::new(2).unwrap(),
453            cancel_flag: Arc::new(AtomicBool::new(false)),
454            compute_indices: false,
455            respect_gitignore: false,
456        })?;
457
458        assert!(results.matches.iter().any(
459            |m| m.path.ends_with("docs/guides") && matches!(m.match_type, MatchType::Directory)
460        ));
461
462        Ok(())
463    }
464}