Skip to main content

vtcode_indexer/
file_search.rs

1//! Fast fuzzy file search library for VT Code.
2//!
3//! Uses the `ignore` crate (same as ripgrep) for parallel directory traversal
4//! and `nucleo-matcher` for fuzzy matching.
5//!
6//! # Example
7//!
8//! ```ignore
9//! use std::num::NonZero;
10//! use std::path::Path;
11//! use std::sync::Arc;
12//! use std::sync::atomic::AtomicBool;
13//! use vtcode_indexer::file_search::run;
14//!
15//! let results = run(
16//!     "main",
17//!     NonZero::new(100).unwrap(),
18//!     Path::new("."),
19//!     vec![],
20//!     NonZero::new(4).unwrap(),
21//!     Arc::new(AtomicBool::new(false)),
22//!     false,
23//!     true,
24//! )?;
25//!
26//! for m in results.matches {
27//!     println!("{}: {}", m.path, m.score);
28//! }
29//! # Ok::<(), anyhow::Error>(())
30//! ```
31
32use parking_lot::Mutex;
33use serde::{Deserialize, Serialize};
34use std::cmp::Reverse;
35use std::collections::BinaryHeap;
36use std::num::NonZero;
37use std::path::Path;
38use std::sync::Arc;
39use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
40use tokio::sync::RwLock;
41
42/// Pre-computed file index for instant queries.
43///
44/// This index is built in the background and cached to avoid
45/// repeated directory traversals on every search.
46pub struct FileIndex {
47    /// All file paths in the workspace
48    files: Vec<String>,
49    /// All directory paths in the workspace
50    directories: Vec<String>,
51    /// When this index was last built
52    last_built: std::time::Instant,
53}
54
55/// Build a parallel walker with the given configuration.
56fn build_parallel_walker(
57    search_directory: &Path,
58    exclude: &[String],
59    threads: usize,
60    respect_gitignore: bool,
61) -> anyhow::Result<ignore::WalkParallel> {
62    let mut walk_builder = ignore::WalkBuilder::new(search_directory);
63    vtcode_commons::walk::apply_defaults(&mut walk_builder);
64
65    // File-search-specific overrides
66    walk_builder.threads(threads);
67    walk_builder.follow_links(true); // Search follows symlinks
68    walk_builder.require_git(false); // Search works outside git repos
69
70    if !respect_gitignore {
71        walk_builder
72            .git_ignore(false)
73            .git_global(false)
74            .git_exclude(false)
75            .ignore(false)
76            .parents(false);
77    }
78
79    if !exclude.is_empty() {
80        let mut override_builder = ignore::overrides::OverrideBuilder::new(search_directory);
81        for exclude_pattern in exclude {
82            let pattern = format!("!{}", exclude_pattern);
83            override_builder.add(&pattern)?;
84        }
85        walk_builder.overrides(override_builder.build()?);
86    }
87
88    Ok(walk_builder.build_parallel())
89}
90
91impl FileIndex {
92    /// Build a file index by traversing the directory tree.
93    /// This is expensive but only done once.
94    fn build_from_directory(
95        search_directory: &Path,
96        exclude: &[String],
97        respect_gitignore: bool,
98        threads: usize,
99    ) -> anyhow::Result<Self> {
100        let walker = build_parallel_walker(search_directory, exclude, threads, respect_gitignore)?;
101
102        // Collect all files and directories
103        let files_arc = Arc::new(Mutex::new(Vec::new()));
104        let dirs_arc = Arc::new(Mutex::new(Vec::new()));
105
106        walker.run(|| {
107            let files_clone = files_arc.clone();
108            let dirs_clone = dirs_arc.clone();
109            let search_dir = search_directory.to_path_buf();
110
111            Box::new(move |result| {
112                let entry = match result {
113                    Ok(e) => e,
114                    Err(_) => return ignore::WalkState::Continue,
115                };
116
117                // Make path relative to search directory
118                if let Some(rel_path) = entry
119                    .path()
120                    .strip_prefix(&search_dir)
121                    .ok()
122                    .and_then(|p| p.to_str())
123                    && !rel_path.is_empty()
124                {
125                    if entry.path().is_dir() {
126                        dirs_clone.lock().push(rel_path.to_string());
127                    } else {
128                        files_clone.lock().push(rel_path.to_string());
129                    }
130                }
131
132                ignore::WalkState::Continue
133            })
134        });
135
136        let files = Arc::try_unwrap(files_arc)
137            .map_err(|arc| {
138                anyhow::anyhow!(
139                    "failed to unwrap files arc, {} references remain",
140                    Arc::strong_count(&arc)
141                )
142            })?
143            .into_inner();
144        let directories = Arc::try_unwrap(dirs_arc)
145            .map_err(|arc| {
146                anyhow::anyhow!(
147                    "failed to unwrap dirs arc, {} references remain",
148                    Arc::strong_count(&arc)
149                )
150            })?
151            .into_inner();
152
153        Ok(Self {
154            files,
155            directories,
156            last_built: std::time::Instant::now(),
157        })
158    }
159
160    /// Query the index for matching paths.
161    /// Much faster than re-traversing the filesystem.
162    fn query(
163        &self,
164        pattern_text: &str,
165        limit: usize,
166        match_type_filter: Option<MatchType>,
167    ) -> Vec<(u32, String, MatchType)> {
168        let mut results = BinaryHeap::with_capacity(limit);
169
170        // Normalize pattern to lowercase to work around a nucleo-matcher bug:
171        // its prefilter only does case-insensitive search for lowercase needle
172        // chars, not uppercase. See https://github.com/openai/codex/pull/15772.
173        let pattern_storage = if pattern_text.is_ascii() {
174            PatternStorage::Ascii(pattern_text.to_ascii_lowercase().into_bytes())
175        } else {
176            PatternStorage::Unicode(pattern_text.to_lowercase().chars().collect())
177        };
178
179        // Reuse single matcher across all queries (mem-reuse-collections)
180        let mut matcher = nucleo_matcher::Matcher::new(nucleo_matcher::Config::DEFAULT);
181        let mut haystack_buf = Vec::with_capacity(256);
182
183        // Iterate over files
184        if match_type_filter.is_none_or(|t| t == MatchType::File) {
185            for path in &self.files {
186                if let Some(score) =
187                    self.score_path(path, &pattern_storage, &mut matcher, &mut haystack_buf)
188                {
189                    push_top_match(&mut results, limit, score, path.clone(), MatchType::File);
190                }
191            }
192        }
193
194        // Iterate over directories
195        if match_type_filter.is_none_or(|t| t == MatchType::Directory) {
196            for path in &self.directories {
197                if let Some(score) =
198                    self.score_path(path, &pattern_storage, &mut matcher, &mut haystack_buf)
199                {
200                    push_top_match(
201                        &mut results,
202                        limit,
203                        score,
204                        path.clone(),
205                        MatchType::Directory,
206                    );
207                }
208            }
209        }
210
211        results
212            .into_sorted_vec()
213            .into_iter()
214            .map(|Reverse(item)| item)
215            .collect()
216    }
217
218    fn score_path(
219        &self,
220        path: &str,
221        pattern: &PatternStorage,
222        matcher: &mut nucleo_matcher::Matcher,
223        haystack_buf: &mut Vec<char>,
224    ) -> Option<u32> {
225        let haystack = nucleo_matcher::Utf32Str::new(path, haystack_buf);
226
227        let needle = match pattern {
228            PatternStorage::Ascii(bytes) => nucleo_matcher::Utf32Str::Ascii(bytes),
229            PatternStorage::Unicode(chars) => nucleo_matcher::Utf32Str::Unicode(chars),
230        };
231
232        matcher.fuzzy_match(haystack, needle).map(|s| s as u32)
233    }
234}
235
236/// A cached file index that can be shared across searches.
237pub struct FileIndexCache {
238    cache: Arc<RwLock<Option<Arc<FileIndex>>>>,
239    search_directory: std::path::PathBuf,
240    exclude: Vec<String>,
241    respect_gitignore: bool,
242    threads: usize,
243}
244
245impl FileIndexCache {
246    pub fn new(
247        search_directory: std::path::PathBuf,
248        exclude: impl IntoIterator<Item = String>,
249        respect_gitignore: bool,
250        threads: usize,
251    ) -> Self {
252        Self {
253            cache: Arc::new(RwLock::new(None)),
254            search_directory,
255            exclude: exclude.into_iter().collect(),
256            respect_gitignore,
257            threads,
258        }
259    }
260
261    /// Get or build the file index.
262    pub async fn get_or_build(&self) -> anyhow::Result<Arc<FileIndex>> {
263        // Check if we have a cached index
264        {
265            let guard = self.cache.read().await;
266            if let Some(index) = guard.as_ref() {
267                // Check if index is stale (older than 5 minutes)
268                if index.last_built.elapsed() < std::time::Duration::from_secs(300) {
269                    return Ok(Arc::clone(index));
270                }
271            }
272        }
273
274        // Build a new index
275        let index = Arc::new(FileIndex::build_from_directory(
276            &self.search_directory,
277            &self.exclude,
278            self.respect_gitignore,
279            self.threads,
280        )?);
281
282        // Cache and return
283        {
284            let mut guard = self.cache.write().await;
285            *guard = Some(Arc::clone(&index));
286        }
287        Ok(index)
288    }
289
290    /// Force refresh the index in the background.
291    /// Returns the old index immediately while rebuilding happens asynchronously.
292    pub fn refresh_background(&self) -> Option<Arc<FileIndex>> {
293        // Build new index asynchronously
294        let search_directory = self.search_directory.clone();
295        let exclude = self.exclude.clone();
296        let respect_gitignore = self.respect_gitignore;
297        let threads = self.threads;
298        let cache = self.cache.clone();
299
300        tokio::spawn(async move {
301            match FileIndex::build_from_directory(
302                &search_directory,
303                &exclude,
304                respect_gitignore,
305                threads,
306            ) {
307                Ok(new_index) => {
308                    let mut guard = cache.write().await;
309                    *guard = Some(Arc::new(new_index));
310                }
311                Err(e) => {
312                    tracing::error!("failed to rebuild file index: {e}");
313                }
314            }
315        });
316
317        // Return old index if available
318        let guard = self.cache.blocking_read();
319        guard.as_ref().map(Arc::clone)
320    }
321
322    /// Incrementally update the index when a file change is detected.
323    /// This is faster than a full rebuild for single file changes.
324    pub fn update_file(&self, path: &str, is_added: bool) {
325        let mut guard = self.cache.blocking_write();
326        let Some(existing) = guard.take() else { return };
327
328        let mut index = Arc::try_unwrap(existing).unwrap_or_else(|arc| (*arc).clone());
329        if is_added {
330            if Path::new(path).is_dir() {
331                index.directories.push(path.to_string());
332            } else {
333                index.files.push(path.to_string());
334            }
335        } else {
336            index.files.retain(|p| p != path);
337            index.directories.retain(|p| p != path);
338        }
339        index.last_built = std::time::Instant::now();
340        *guard = Some(Arc::new(index));
341    }
342
343    /// Get the age of the current index.
344    pub async fn index_age(&self) -> Option<std::time::Duration> {
345        let guard = self.cache.read().await;
346        guard.as_ref().map(|idx| idx.last_built.elapsed())
347    }
348}
349
350// Make FileIndex cloneable
351impl Clone for FileIndex {
352    fn clone(&self) -> Self {
353        Self {
354            files: self.files.clone(),
355            directories: self.directories.clone(),
356            last_built: self.last_built,
357        }
358    }
359}
360
361/// A single file match result.
362///
363/// Fields:
364/// - `score`: Relevance score from fuzzy matching (higher is better)
365/// - `path`: Path relative to the search directory
366/// - `match_type`: Whether the match is a file or directory
367/// - `indices`: Optional character positions for highlighting matched characters
368#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
369#[serde(rename_all = "lowercase")]
370pub enum MatchType {
371    File,
372    Directory,
373}
374
375#[derive(Debug, Clone, Serialize, Deserialize)]
376pub struct FileMatch {
377    pub score: u32,
378    pub path: String,
379    pub match_type: MatchType,
380    #[serde(skip_serializing_if = "Option::is_none")]
381    pub indices: Option<Vec<u32>>,
382}
383
384/// Complete search results with total match count.
385#[derive(Debug)]
386pub struct FileSearchResults {
387    pub matches: Vec<FileMatch>,
388    pub total_match_count: usize,
389}
390
391/// Configuration for file search operations.
392pub struct FileSearchConfig {
393    pub pattern_text: String,
394    pub limit: NonZero<usize>,
395    pub search_directory: std::path::PathBuf,
396    pub exclude: Vec<String>,
397    pub threads: NonZero<usize>,
398    pub cancel_flag: Arc<AtomicBool>,
399    pub compute_indices: bool,
400    pub respect_gitignore: bool,
401}
402
403pub use vtcode_commons::paths::file_name_from_path;
404
405/// Best matches list per worker thread (lock-free collection).
406///
407/// Each worker thread gets its own instance to avoid locking during
408/// directory traversal. Results are merged at the end.
409struct BestMatchesList {
410    matches: BinaryHeap<Reverse<(u32, String, MatchType)>>,
411    limit: usize,
412    matcher: nucleo_matcher::Matcher,
413    haystack_buf: Vec<char>,
414    /// Pre-computed pattern - avoids per-match UTF-32 conversion
415    pattern: PatternStorage,
416}
417
418/// Stores a pattern in the optimal form for Utf32Str creation.
419enum PatternStorage {
420    /// ASCII pattern - can be used directly with Utf32Str::Ascii
421    Ascii(Vec<u8>),
422    /// Unicode pattern - stored as chars for Utf32Str::Unicode
423    Unicode(Vec<char>),
424}
425
426impl BestMatchesList {
427    fn new(limit: usize, pattern_text: &str) -> Self {
428        // Normalize pattern to lowercase to work around a nucleo-matcher bug:
429        // its prefilter only does case-insensitive search for lowercase needle
430        // chars, not uppercase. See https://github.com/openai/codex/pull/15772.
431        let pattern = if pattern_text.is_ascii() {
432            PatternStorage::Ascii(pattern_text.to_ascii_lowercase().into_bytes())
433        } else {
434            PatternStorage::Unicode(pattern_text.to_lowercase().chars().collect())
435        };
436
437        Self {
438            matches: BinaryHeap::new(),
439            limit,
440            matcher: nucleo_matcher::Matcher::new(nucleo_matcher::Config::DEFAULT),
441            haystack_buf: Vec::with_capacity(256),
442            pattern,
443        }
444    }
445
446    /// Record a matching path while preserving the worker-local top-K heap.
447    ///
448    /// Returns true when the path matches the search pattern, even if it
449    /// does not survive the top-K cutoff.
450    fn record_match(&mut self, path: &str, match_type: MatchType) -> bool {
451        // Use pre-computed pattern directly - zero allocation per match
452        let haystack = nucleo_matcher::Utf32Str::new(path, &mut self.haystack_buf);
453        let needle = match &self.pattern {
454            PatternStorage::Ascii(bytes) => nucleo_matcher::Utf32Str::Ascii(bytes),
455            PatternStorage::Unicode(chars) => nucleo_matcher::Utf32Str::Unicode(chars),
456        };
457        let Some(score) = self.matcher.fuzzy_match(haystack, needle) else {
458            return false;
459        };
460
461        push_top_match(
462            &mut self.matches,
463            self.limit,
464            score as u32,
465            path.to_string(),
466            match_type,
467        );
468        true
469    }
470}
471
472fn push_top_match(
473    matches: &mut BinaryHeap<Reverse<(u32, String, MatchType)>>,
474    limit: usize,
475    score: u32,
476    path: String,
477    match_type: MatchType,
478) -> bool {
479    if matches.len() < limit {
480        matches.push(Reverse((score, path, match_type)));
481        return true;
482    }
483
484    let Some(min_score) = matches.peek().map(|entry| entry.0.0) else {
485        return false;
486    };
487
488    if score <= min_score {
489        return false;
490    }
491
492    matches.pop();
493    matches.push(Reverse((score, path, match_type)));
494    true
495}
496
497/// Run fuzzy file search using a pre-computed file index.
498///
499/// This is much faster than `run()` for repeated queries on the same
500/// directory because it avoids re-traversing the filesystem.
501///
502/// # Arguments
503///
504/// * `config` - File search configuration
505/// * `index_cache` - Shared cache for the pre-computed file index
506///
507/// # Returns
508///
509/// FileSearchResults containing matched files and total match count.
510pub async fn run_with_index(
511    config: FileSearchConfig,
512    index_cache: &FileIndexCache,
513) -> anyhow::Result<FileSearchResults> {
514    let limit = config.limit.get();
515    let cancel_flag = &config.cancel_flag;
516    let compute_indices = config.compute_indices;
517
518    // Get or build the file index
519    let index = index_cache.get_or_build().await?;
520
521    // Check cancellation
522    if cancel_flag.load(Ordering::Relaxed) {
523        return Ok(FileSearchResults {
524            matches: Vec::new(),
525            total_match_count: 0,
526        });
527    }
528
529    // Query the index
530    let matched_paths = index.query(&config.pattern_text, limit, None);
531    let total_match_count = matched_paths.len();
532
533    // Build final results
534    let matches = matched_paths
535        .into_iter()
536        .map(|(score, path, match_type)| FileMatch {
537            score,
538            path,
539            match_type,
540            indices: if compute_indices {
541                Some(Vec::new())
542            } else {
543                None
544            },
545        })
546        .collect();
547
548    Ok(FileSearchResults {
549        matches,
550        total_match_count,
551    })
552}
553
554/// Run fuzzy file search with parallel traversal.
555///
556/// # Arguments
557///
558/// * `config` - File search configuration containing all search parameters
559///
560/// # Returns
561///
562/// FileSearchResults containing matched files and total match count.
563pub fn run(config: FileSearchConfig) -> anyhow::Result<FileSearchResults> {
564    let limit = config.limit.get();
565    let search_directory = &config.search_directory;
566    let exclude = &config.exclude;
567    let threads = config.threads.get();
568    let cancel_flag = &config.cancel_flag;
569    let compute_indices = config.compute_indices;
570    let respect_gitignore = config.respect_gitignore;
571
572    let walker = build_parallel_walker(search_directory, exclude, threads, respect_gitignore)?;
573
574    // Create per-worker result collection using Arc + Mutex for thread safety.
575    // Each worker gets exactly one instance - no sharing between workers.
576    let best_matchers_per_worker: Vec<Arc<Mutex<BestMatchesList>>> = (0..threads)
577        .map(|_| {
578            Arc::new(Mutex::new(BestMatchesList::new(
579                limit,
580                &config.pattern_text,
581            )))
582        })
583        .collect();
584
585    let total_match_count = Arc::new(AtomicUsize::new(0));
586
587    // Run parallel traversal - the closure is called once per worker thread.
588    // We use a local counter to assign each worker a unique index.
589    let worker_counter = AtomicUsize::new(0);
590    let worker_count = best_matchers_per_worker.len();
591    walker.run(|| {
592        let worker_id = worker_counter.fetch_add(1, Ordering::Relaxed) % worker_count;
593        let best_list = best_matchers_per_worker[worker_id].clone();
594        let cancel_flag_clone = cancel_flag.clone();
595        let total_match_count_clone = total_match_count.clone();
596
597        Box::new(move |result| {
598            // Check cancellation flag periodically
599            if cancel_flag_clone.load(Ordering::Relaxed) {
600                return ignore::WalkState::Quit;
601            }
602
603            let entry = match result {
604                Ok(e) => e,
605                Err(_) => return ignore::WalkState::Continue,
606            };
607
608            // Make path relative to search directory
609            let relative_path = entry
610                .path()
611                .strip_prefix(search_directory)
612                .ok()
613                .and_then(|p| p.to_str());
614
615            let path_to_match = match relative_path {
616                Some(p) if !p.is_empty() => p,
617                _ => return ignore::WalkState::Continue, // Skip root and non-relative paths
618            };
619
620            let match_type = if entry.path().is_dir() {
621                MatchType::Directory
622            } else {
623                MatchType::File
624            };
625
626            // Try to add to results - no contention with other workers
627            {
628                let mut list = best_list.lock();
629                if list.record_match(path_to_match, match_type) {
630                    total_match_count_clone.fetch_add(1, Ordering::Relaxed);
631                }
632            }
633
634            ignore::WalkState::Continue
635        })
636    });
637
638    // Merge worker-local top-K heaps into one final top-K heap.
639    let mut merged_matches = BinaryHeap::with_capacity(limit);
640    for arc in best_matchers_per_worker {
641        let mut list = arc.lock();
642        for Reverse((score, path, match_type)) in std::mem::take(&mut list.matches).into_vec() {
643            push_top_match(&mut merged_matches, limit, score, path, match_type);
644        }
645    }
646
647    // Build final results
648    let matches = merged_matches
649        .into_sorted_vec()
650        .into_iter()
651        .map(|Reverse((score, path, match_type))| FileMatch {
652            score,
653            path,
654            match_type,
655            indices: if compute_indices {
656                Some(Vec::new())
657            } else {
658                None
659            },
660        })
661        .collect();
662
663    Ok(FileSearchResults {
664        matches,
665        total_match_count: total_match_count.load(Ordering::Relaxed),
666    })
667}