Skip to main content

oo_ide/
file_index.rs

1//! Background file indexer and fuzzy search engine for the file selector.
2//!
3//! Key design points:
4//!
5//! * [`FileIndex`] is built once at startup by [`spawn_indexer`] in a
6//!   `tokio::task::spawn_blocking` thread.
7//! * The result is stored in a [`SharedFileIndex`] which is an
8//!   `Arc<ArcSwap<Option<FileIndex>>>`.  Reads are **lock-free** (`ArcSwap::load`);
9//!   writes are atomic pointer swaps.
10//! * [`spawn_watcher`] keeps the index live: it watches the project root with
11//!   `notify-debouncer-mini`, and rebuilds the full index (in a background
12//!   blocking task) whenever files change.  A 200 ms burst-coalescing sleep
13//!   prevents rebuild storms during e.g. `git checkout`.
14//! * [`NucleoSearch`] wraps `nucleo_matcher` for fast fuzzy matching.
15//!   [`NucleoSearch::search_top`] uses a bounded min-heap (size = `max`) so
16//!   only the top-K entries are ever kept in memory, avoiding a full sort of
17//!   potentially millions of candidates.
18//! * [`FileIndex`] carries a **trigram prefilter**: for queries of ≥ 3 bytes,
19//!   only files whose path contains the first trigram of the (lowercased) query
20//!   are passed to the matcher, reducing matcher calls by 10–50×.
21
22use std::cmp::Reverse;
23use std::collections::BinaryHeap;
24use std::collections::HashMap;
25use std::path::{Path, PathBuf};
26use std::sync::Arc;
27
28use arc_swap::ArcSwap;
29use ignore::WalkBuilder;
30use ignore::gitignore::GitignoreBuilder;
31
32use nucleo_matcher::pattern::{CaseMatching, Normalization, Pattern};
33use nucleo_matcher::{Config, Matcher, Utf32String};
34use notify::Watcher;
35
36
37// ── Data structures ───────────────────────────────────────────────────────────
38
39pub struct FileEntry {
40    /// Relative path from the project root.
41    pub path: PathBuf,
42    /// Pre-computed UTF-32 representation for nucleo — avoids per-query allocs.
43    utf32: Utf32String,
44}
45
46pub struct FileIndex {
47    files: Vec<FileEntry>,
48    /// Maps each 3-byte lowercase trigram to the indices of files whose
49    /// lowercased path contains that trigram.  Used as a fast prefilter.
50    trigrams: HashMap<[u8; 3], Vec<usize>>,
51}
52
53impl std::fmt::Debug for FileIndex {
54    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55        write!(f, "FileIndex({} files)", self.files.len())
56    }
57}
58
59impl FileIndex {
60    /// Walk `root` using `ignore` (respects `.gitignore`) and build the index.
61    ///
62    /// `max_depth` can be provided for shallow indexing (None = full).
63    pub fn build_with_max_depth(root: &Path, max_depth: Option<usize>) -> Self {
64        let mut files = Vec::with_capacity(4096);
65        let mut trigrams: HashMap<[u8; 3], Vec<usize>> = HashMap::new();
66
67        let mut builder = WalkBuilder::new(root);
68        builder
69            .hidden(false) // show dotfiles (.gitignore, .env, …)
70            .git_ignore(true) // respect .gitignore — handles target/, node_modules/, …
71            .git_exclude(true)
72            .parents(true);
73
74        if let Some(d) = max_depth {
75            builder.max_depth(Some(d));
76        }
77
78        for result in builder.build() {
79            let Ok(entry) = result else { continue };
80            let Some(ft) = entry.file_type() else {
81                continue;
82            };
83            if !ft.is_file() {
84                continue;
85            }
86
87            let path = entry.path();
88            let rel = path.strip_prefix(root).unwrap_or(path);
89
90            // Belt-and-suspenders: skip common build dirs even when they're not
91            // in .gitignore (e.g. freshly-cloned repos without Cargo.lock).
92            if rel.components().any(|c| {
93                c.as_os_str()
94                    .to_str()
95                    .map(|s| matches!(s, "target" | "node_modules" | "__pycache__"))
96                    .unwrap_or(false)
97            }) {
98                continue;
99            }
100
101            let s = rel.to_string_lossy().replace('\\', "/");
102            let idx = files.len();
103            index_trigrams(s.as_bytes(), idx, &mut trigrams);
104            files.push(FileEntry {
105                path: rel.to_path_buf(),
106                utf32: Utf32String::from(s.as_str()),
107            });
108        }
109
110        FileIndex { files, trigrams }
111    }
112
113    /// Convenience wrapper for the former behaviour: full walk (no depth limit).
114    pub fn build(root: &Path) -> Self {
115        Self::build_with_max_depth(root, None)
116    }
117
118    /// Build from an already-known list of relative paths — used in tests and
119    /// benchmarks where no real filesystem walk is needed.
120    #[allow(dead_code)]
121    pub fn from_paths(paths: Vec<PathBuf>) -> Self {
122        let mut files = Vec::with_capacity(paths.len());
123        let mut trigrams: HashMap<[u8; 3], Vec<usize>> = HashMap::new();
124
125        for path in paths {
126            let s = path.to_string_lossy().replace('\\', "/");
127            let idx = files.len();
128            index_trigrams(s.as_bytes(), idx, &mut trigrams);
129            files.push(FileEntry {
130                utf32: Utf32String::from(s.as_str()),
131                path,
132            });
133        }
134
135        FileIndex { files, trigrams }
136    }
137
138    #[allow(dead_code)]
139    pub fn files(&self) -> &[FileEntry] {
140        &self.files
141    }
142
143    /// Return the indices (into `self.files`) of files whose lowercased path
144    /// contains the first trigram (first 3 ASCII bytes, lowercased) of `query`.
145    ///
146    /// Returns `None` when the query is shorter than 3 bytes — callers should
147    /// fall back to scanning all files in that case.
148    fn trigram_candidate_indices(&self, query: &str) -> Option<Vec<usize>> {
149        // Strip spaces before computing trigrams: spaces can't appear in
150        // indexed paths, so a trigram like ['i', ' ', 'v'] (from "multi v")
151        // would never match anything and incorrectly empty the candidate set.
152        let q: Vec<u8> = query.bytes().filter(|&b| b != b' ').collect();
153        if q.len() < 3 {
154            return None;
155        }
156
157        let first = [
158            q[0].to_ascii_lowercase(),
159            q[1].to_ascii_lowercase(),
160            q[2].to_ascii_lowercase(),
161        ];
162
163        let a = self.trigrams.get(&first)?;
164
165        // If query is short we just use the first trigram.
166        if q.len() < 6 {
167            return Some(a.clone());
168        }
169
170        let last = [
171            q[q.len() - 3].to_ascii_lowercase(),
172            q[q.len() - 2].to_ascii_lowercase(),
173            q[q.len() - 1].to_ascii_lowercase(),
174        ];
175
176        let b = match self.trigrams.get(&last) {
177            Some(v) => v,
178            None => return Some(vec![]),
179        };
180
181        // Intersect the two candidate lists
182        let mut out = Vec::with_capacity(a.len().min(b.len()));
183        let set: std::collections::HashSet<_> = b.iter().copied().collect();
184
185        for &idx in a {
186            if set.contains(&idx) {
187                out.push(idx);
188            }
189        }
190
191        Some(out)
192    }
193}
194
195/// Insert all trigrams from `bytes` (lowercased) into `map`, pointing to `idx`.
196fn index_trigrams(bytes: &[u8], idx: usize, map: &mut HashMap<[u8; 3], Vec<usize>>) {
197    use std::collections::HashSet;
198    let mut seen = HashSet::new();
199    for tri in bytes.windows(3) {
200        let key = [
201            tri[0].to_ascii_lowercase(),
202            tri[1].to_ascii_lowercase(),
203            tri[2].to_ascii_lowercase(),
204        ];
205        if seen.insert(key) {
206            map.entry(key).or_default().push(idx);
207        }
208    }
209}
210
211// ── Shared index type ─────────────────────────────────────────────────────────
212
213/// Lock-free shared index.  `None` while the initial walk is still in progress.
214///
215/// Use `index.load()` for reads (returns a `Guard` — no lock taken).
216/// Use `index.store(Arc::new(Some(new_idx)))` for writes (atomic swap).
217pub type SharedFileIndex = Arc<ArcSwap<Option<FileIndex>>>;
218
219// ── Initial indexer ───────────────────────────────────────────────────────────
220
221/// Spawn a background blocking task that builds the index for `root`, then
222/// atomically stores it.  Returns immediately.
223pub fn spawn_indexer(root: PathBuf) -> SharedFileIndex {
224    let shared: SharedFileIndex = Arc::new(ArcSwap::from_pointee(None));
225    let out = shared.clone();
226    tokio::task::spawn_blocking(move || {
227        let idx = FileIndex::build(&root);
228        log::info!("file index built ({} files)", idx.files.len());
229        out.store(Arc::new(Some(idx)));
230    });
231    shared
232}
233
234fn should_trigger_notify_event(kind: &notify::EventKind) -> bool {
235    match kind {
236        notify::EventKind::Create(_) => true,
237        notify::EventKind::Remove(_) => true,
238        notify::EventKind::Modify(mod_kind) => {
239            // Only trigger on name changes (renames) — ignore data/metadata writes.
240            matches!(mod_kind, notify::event::ModifyKind::Name(_))
241        }
242        _ => false,
243    }
244}
245
246// ── Filesystem watcher ────────────────────────────────────────────────────────
247
248/// Determine whether a single path from a notify::Event should be treated as
249/// relevant (i.e., not ignored) according to the repository's .gitignore
250/// semantics. This uses `ignore::WalkBuilder` configured the same way as the
251/// indexer to ensure consistent behavior with FileIndex::build.
252fn is_path_relevant(root: &Path, path: &Path) -> bool {
253    // Only consider paths under the repo root; treat external paths as
254    // relevant to be conservative.
255    if let Ok(_rel) = path.strip_prefix(root) {
256        // Build a Gitignore matcher by discovering all .gitignore files in
257        // the repository. This is more expensive than a single WalkBuilder
258        // check but ensures nested .gitignore files are respected exactly
259        // the same way the indexer would.
260        let mut gbuilder = GitignoreBuilder::new(root);
261
262        let mut walker = WalkBuilder::new(root);
263        walker.hidden(false).git_ignore(false).git_exclude(false);
264
265        for result in walker.build() {
266            if let Ok(entry) = result
267                && let Some(name_os) = entry.path().file_name()
268                    && let Some(name) = name_os.to_str()
269                        && name == ".gitignore" {
270                            let _ = gbuilder.add(entry.path());
271                        }
272        }
273
274        // Also include .git/info/exclude when present
275        let git_info_exclude = root.join(".git").join("info").join("exclude");
276        if git_info_exclude.is_file() {
277            let _ = gbuilder.add(git_info_exclude);
278        }
279
280        let gi = match gbuilder.build() {
281            Ok(g) => g,
282            Err(_) => ignore::gitignore::Gitignore::empty(),
283        };
284
285        let is_dir = match path.metadata() {
286            Ok(md) => md.is_dir(),
287            Err(_) => path.extension().is_none(),
288        };
289
290        !gi.matched(path, is_dir).is_ignore()
291    } else {
292        true
293    }
294}
295
296/// Returns true if any path in the event is relevant (not ignored).
297fn is_event_relevant(root: &Path, event: &notify::Event) -> bool {
298    event.paths.iter().any(|p| is_path_relevant(root, p))
299}
300
301/// Watch `root` for file-system changes and rebuild the index automatically.
302///
303/// Uses `notify-debouncer-mini` (500 ms window) to collapse burst events from
304/// the OS layer.  The async loop adds a further 200 ms coalescing sleep so that
305/// rapid cascades (e.g. `git checkout`) are collapsed into a single rebuild.
306/// Each rebuild is run in a `spawn_blocking` task; a new trigger aborts any
307/// in-progress rebuild so only the latest one runs.
308pub fn spawn_watcher(root: PathBuf, index: SharedFileIndex) {
309    let (trigger_tx, mut trigger_rx) = tokio::sync::mpsc::unbounded_channel::<()>();
310
311    // Dedicated OS thread owns the raw notify watcher. `trigger_tx` is
312    // cloned into the callback so the async task can coalesce triggers. Keep
313    // the thread alive by blocking on a local receiver instead of repeatedly
314    // parking the thread.
315    {
316        let root = root.clone();
317        let trigger_tx_clone = trigger_tx.clone();
318
319        std::thread::spawn(move || {
320            // Create a raw notify watcher with a callback that forwards only
321            // structural events for non-ignored paths. Sending into the async
322            // channel is cheap; the async task implements the debounce/batching
323            // semantics.
324            let watcher_root = root.clone();
325            let mut watcher = match notify::RecommendedWatcher::new(
326                move |res: Result<notify::Event, notify::Error>| {
327                    match res {
328                        Ok(event) => {
329                            // Use WalkBuilder to determine whether this event
330                            // contains any paths that would be visible to the
331                            // indexer (i.e., not ignored by .gitignore semantics).
332                            if !is_event_relevant(&watcher_root, &event) {
333                                return;
334                            }
335
336                            if should_trigger_notify_event(&event.kind) {
337                                // best-effort send; ignore errors (receiver closed)
338                                let _ = trigger_tx_clone.send(());
339                            }
340                        }
341                        Err(e) => log::warn!("file watcher error: {e}"),
342                    }
343                },
344                notify::Config::default(),
345            ) {
346                Ok(w) => w,
347                Err(e) => {
348                    log::warn!("file watcher: failed to create watcher: {e}");
349                    return;
350                }
351            };
352
353            if let Err(e) = watcher.watch(&root, notify::RecursiveMode::Recursive) {
354                log::warn!("file watcher: failed to watch {root:?}: {e}");
355                return;
356            }
357
358            // Block the thread indefinitely. Keeping the watcher value in
359            // scope keeps the underlying watcher active.
360            let (_tx_keepalive, rx_keepalive) = std::sync::mpsc::channel::<()>();
361            let _ = rx_keepalive.recv();
362        });
363    }
364
365    // Async task coalesces triggers and schedules index rebuilds. This
366    // implements the "Option B" debounce strategy: drain any already queued
367    // triggers, wait briefly for the filesystem to settle, drain again, then
368    // perform a single rebuild for the burst.
369    tokio::spawn(async move {
370        let mut rebuild: Option<tokio::task::JoinHandle<()>> = None;
371        while trigger_rx.recv().await.is_some() {
372            // Drain any triggers that were queued before we woke up.
373            while trigger_rx.try_recv().is_ok() {}
374
375            // Wait briefly so related events (rename/create/remove etc.) can
376            // arrive — this collapses bursts into a single rebuild.
377            tokio::time::sleep(std::time::Duration::from_millis(200)).await;
378
379            // Drain anything that arrived while we were sleeping.
380            while trigger_rx.try_recv().is_ok() {}
381
382            // Abort any in-progress rebuild and request a fresh one for the
383            // current filesystem snapshot.
384            if let Some(h) = rebuild.take() {
385                h.abort();
386            }
387            let idx = index.clone();
388            let r = root.clone();
389            rebuild = Some(tokio::task::spawn_blocking(move || {
390                idx.store(Arc::new(Some(FileIndex::build(&r))));
391            }));
392        }
393    });
394}
395
396// ── Fuzzy search ─────────────────────────────────────────────────────────────
397
398/// Stateful fuzzy search engine backed by `nucleo_matcher`.
399///
400/// Create one instance per search task; the `Matcher`'s internal scratch
401/// buffer is reused across calls to `search_top` within the same task.
402pub struct NucleoSearch {
403    matcher: Matcher,
404}
405
406impl std::fmt::Debug for NucleoSearch {
407    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
408        f.write_str("NucleoSearch")
409    }
410}
411
412impl Default for NucleoSearch {
413    fn default() -> Self {
414        Self::new()
415    }
416}
417
418impl NucleoSearch {
419    pub fn new() -> Self {
420        Self {
421            matcher: Matcher::new(Config::DEFAULT),
422        }
423    }
424
425    /// Fuzzy-search `index` for `query`, returning at most `max` entries
426    /// sorted best-first.
427    ///
428    /// Uses a trigram prefilter to reduce the candidate set, then maintains a
429    /// bounded min-heap of size `max` so the full candidate list is never
430    /// sorted — only the final top-K are extracted and sorted once.
431    pub fn search_top<'a>(
432        &mut self,
433        index: &'a FileIndex,
434        query: &str,
435        max: usize,
436    ) -> Vec<&'a FileEntry> {
437        if query.is_empty() {
438            return index.files.iter().take(max).collect();
439        }
440
441        let pattern = Pattern::parse(query, CaseMatching::Ignore, Normalization::Smart);
442
443        let mut heap: BinaryHeap<(Reverse<u32>, usize)> = BinaryHeap::with_capacity(max);
444
445        // Helper closure to score a candidate file index.
446        let mut process_candidate = |fi: usize| {
447            let entry = &index.files[fi];
448
449            let Some(score) = pattern.score(entry.utf32.slice(..), &mut self.matcher) else {
450                return;
451            };
452
453            if heap.len() < max {
454                heap.push((Reverse(score), fi));
455            } else if let Some(&(Reverse(min_score), _)) = heap.peek()
456                && score > min_score {
457                    heap.pop();
458                    heap.push((Reverse(score), fi));
459                }
460        };
461
462        // Use trigram prefilter when possible
463        if let Some(indices) = index.trigram_candidate_indices(query) {
464            for fi in indices {
465                process_candidate(fi);
466            }
467        } else {
468            for fi in 0..index.files.len() {
469                process_candidate(fi);
470            }
471        }
472
473        // Extract and sort results
474        let mut results: Vec<(u32, usize)> = heap
475            .into_iter()
476            .map(|(Reverse(score), idx)| (score, idx))
477            .collect();
478
479        results.sort_unstable_by(|a, b| b.0.cmp(&a.0));
480
481        results
482            .into_iter()
483            .map(|(_, idx)| &index.files[idx])
484            .collect()
485    }
486
487    /// Convenience wrapper returning all results.
488    #[allow(dead_code)]
489    pub fn search<'a>(&mut self, index: &'a FileIndex, query: &str) -> Vec<&'a FileEntry> {
490        self.search_top(index, query, usize::MAX)
491    }
492}
493
494#[cfg(test)]
495mod tests {
496    use super::*;
497    use notify::event::{CreateKind, DataChange, ModifyKind, RenameMode};
498    use notify::Event;
499    use std::path::PathBuf;
500    use tempfile::tempdir;
501    use std::fs::{create_dir_all, write};
502
503    #[test]
504    fn should_trigger_on_create() {
505        let ev = Event { kind: notify::EventKind::Create(CreateKind::File), paths: vec![PathBuf::from("a")], attrs: Default::default() };
506        assert!(should_trigger_notify_event(&ev.kind));
507    }
508
509    #[test]
510    fn should_ignore_modify_data() {
511        let ev = Event { kind: notify::EventKind::Modify(ModifyKind::Data(DataChange::Content)), paths: vec![PathBuf::from("a")], attrs: Default::default() };
512        assert!(!should_trigger_notify_event(&ev.kind));
513    }
514
515    #[test]
516    fn should_trigger_rename() {
517        let ev = Event { kind: notify::EventKind::Modify(ModifyKind::Name(RenameMode::Both)), paths: vec![PathBuf::from("a")], attrs: Default::default() };
518        assert!(should_trigger_notify_event(&ev.kind));
519    }
520
521    #[test]
522    fn shallow_build_limits_depth() {
523        let td = tempdir().unwrap();
524        let root = td.path();
525        create_dir_all(root.join("a/b")).unwrap();
526        write(root.join("file1.txt"), b"").unwrap();
527        write(root.join("a").join("file2.txt"), b"").unwrap();
528        write(root.join("a").join("b").join("file3.txt"), b"").unwrap();
529
530        let idx_full = FileIndex::build_with_max_depth(root, None);
531        let paths_full: Vec<String> = idx_full.files.iter().map(|e| e.path.to_string_lossy().replace('\\', "/").to_string()).collect();
532        assert!(paths_full.iter().any(|p| p == "file1.txt"));
533        assert!(paths_full.iter().any(|p| p == "a/file2.txt"));
534        assert!(paths_full.iter().any(|p| p == "a/b/file3.txt"));
535
536        let idx_shallow = FileIndex::build_with_max_depth(root, Some(1));
537        let paths_shallow: Vec<String> = idx_shallow.files.iter().map(|e| e.path.to_string_lossy().replace('\\', "/").to_string()).collect();
538        assert!(paths_shallow.iter().any(|p| p == "file1.txt"));
539        assert!(!paths_shallow.iter().any(|p| p == "a/file2.txt"));
540        assert!(!paths_shallow.iter().any(|p| p == "a/b/file3.txt"));
541    }
542
543    #[test]
544    fn walkbuilder_respects_gitignore() {
545        let td = tempdir().unwrap();
546        let root = td.path();
547        // create .gitignore listing ignored.txt
548        write(root.join(".gitignore"), b"ignored.txt\n").unwrap();
549        write(root.join("ignored.txt"), b"").unwrap();
550        write(root.join("not_ignored.txt"), b"").unwrap();
551
552        // event for ignored file should be filtered out
553        let ev_ignored = Event { kind: notify::EventKind::Create(CreateKind::File), paths: vec![root.join("ignored.txt")], attrs: Default::default() };
554        assert!(!is_event_relevant(root, &ev_ignored));
555
556        // event for not ignored file should be relevant
557        let ev_not = Event { kind: notify::EventKind::Create(CreateKind::File), paths: vec![root.join("not_ignored.txt")], attrs: Default::default() };
558        assert!(is_event_relevant(root, &ev_not));
559    }
560}