Skip to main content

fff_search/
file_picker.rs

1//! Core file picker: filesystem indexing, background watching, and fuzzy search.
2//!
3//! [`FilePicker`] is the central component of fff-search. It:
4//!
5//! 1. **Indexes** a directory tree in a background thread, collecting every
6//!    non-ignored file into a path-sorted `Vec<FileItem>`.
7//! 2. **Watches** the filesystem via the `notify` crate, applying
8//!    create/modify/delete events to the index in real time.
//! 3. **Owns files**: stores the indexed file items and serves as the entry
//!    point for fuzzy search and live grep.
11//!
12//! # Lifecycle
13//!
14//! ```text
15//!   new_with_shared_state()
16//!     │
17//!     ├─> background scan thread ──> populates SharedPicker
18//!     └─> file-system watcher    ──> live updates SharedPicker
19//!
20//!   search()         <── borrows &self, delegates to fuzzy_search
21//!   grep()           <── static, borrows &[FileItem] (live content search)
22//!   trigger_rescan() <── synchronous re-index
23//!   cancel()         <── shuts down background work
24//! ```
25//!
26//! # Thread Safety
27//!
//! `FilePicker` itself is **not** `Sync`;
//! all concurrent access goes through [`SharedPicker`](crate::SharedPicker).
30//! The background scanner and watcher acquire write locks only when mutating
31//! the file index, so read-heavy search workloads rarely contend.
32
33use crate::FFFStringStorage;
34use crate::background_watcher::BackgroundWatcher;
35use crate::bigram_filter::{BigramFilter, BigramIndexBuilder, BigramOverlay};
36use crate::error::Error;
37use crate::frecency::FrecencyTracker;
38use crate::git::GitStatusCache;
39use crate::grep::{GrepResult, GrepSearchOptions, grep_search, multi_grep_search};
40use crate::ignore::non_git_repo_overrides;
41use crate::query_tracker::QueryTracker;
42use crate::score::fuzzy_match_and_score_files;
43use crate::shared::{SharedFrecency, SharedPicker};
44use crate::simd_path::ArenaPtr;
45use crate::types::{
46    ContentCacheBudget, DirItem, DirSearchResult, FileItem, MixedItemRef, MixedSearchResult,
47    PaginationArgs, Score, ScoringContext, SearchResult,
48};
49use fff_query_parser::FFFQuery;
50use git2::{Repository, Status, StatusOptions};
51use rayon::prelude::*;
52use std::fmt::Debug;
53use std::path::{Path, PathBuf};
54use std::sync::{
55    Arc, LazyLock,
56    atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering},
57};
58use std::time::SystemTime;
59use tracing::{Level, debug, error, info, warn};
60
61/// Dedicated thread pool for background work (scan, warmup, bigram build).
62/// Uses fewer threads than the global rayon pool so Neovim's event loop
63/// and search queries can still get CPU time.
64static BACKGROUND_THREAD_POOL: LazyLock<rayon::ThreadPool> = LazyLock::new(|| {
65    let total = std::thread::available_parallelism()
66        .map(|p| p.get())
67        .unwrap_or(4);
68    let bg_threads = total.saturating_sub(2).max(1);
69    info!(
70        "Background pool: {} threads (system has {})",
71        bg_threads, total
72    );
73    rayon::ThreadPoolBuilder::new()
74        .num_threads(bg_threads)
75        .thread_name(|i| format!("fff-bg-{i}"))
76        .build()
77        .expect("failed to create background rayon pool")
78});
79
/// Operating mode of the picker; selected via [`FilePickerOptions::mode`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum FFFMode {
    /// Editor-embedded mode (the default).
    #[default]
    Neovim,
    /// AI/agent-driven mode.
    Ai,
}

impl FFFMode {
    /// Returns `true` when the picker runs in [`FFFMode::Ai`].
    pub fn is_ai(self) -> bool {
        matches!(self, FFFMode::Ai)
    }
}
92
/// Configuration for a single fuzzy search invocation.
///
/// Passed to [`FilePicker::search`] to control threading, pagination,
/// and scoring behavior.
#[derive(Debug, Clone, Copy, Default)]
pub struct FuzzySearchOptions<'a> {
    /// Worker-thread cap; `0` means "use all available parallelism".
    pub max_threads: usize,
    /// Path of the file currently open in the editor, if any (fed to scoring).
    pub current_file: Option<&'a str>,
    /// Project root used for combo-boost query-history lookups.
    pub project_path: Option<&'a Path>,
    /// Score multiplier applied to the last file selected for the same query.
    pub combo_boost_score_multiplier: i32,
    /// Minimum repeat count before the combo boost kicks in.
    pub min_combo_count: u32,
    /// Pagination window for the returned results.
    pub pagination: PaginationArgs,
}
106
/// The complete file index: file/dir tables, path arenas, and bigram
/// structures. Replaced wholesale on a full rescan.
#[derive(Debug, Clone)]
struct FileSync {
    /// Workdir of the git repository containing the base path, if one was
    /// detected (exposed via [`FilePicker::git_root`], cloned into the watcher).
    git_workdir: Option<PathBuf>,
    /// Base files sorted by (parent_dir, filename). Used for binary search and
    /// bigram index. All paths are backed by `chunked_paths` arena.
    /// Deletions use tombstones (`is_deleted = true`) to keep bigram indices stable.
    files: Vec<FileItem>,
    /// Number of base files (from the last full reindex). Overflow files
    /// live at `files[base_count..]`, each with its own `ChunkedPathStore`
    /// kept alive in `overflow_stores`.
    base_count: usize,
    /// Sorted directory table. Each entry is a unique parent directory of at
    /// least one file in `files`. Sorted by absolute path for O(log n) lookup.
    /// Built during `walk_filesystem` and used for directory picker mode,
    /// per-directory stats, and as a fast replacement for `extract_watch_dirs`.
    dirs: Vec<DirItem>,
    /// Shared builder for overflow file paths. Each overflow file's ChunkedString
    /// uses `arena_override` pointing into this builder's arena. The builder
    /// grows incrementally — no per-file store allocation. Dropped on rescan.
    overflow_builder: Option<crate::simd_path::ChunkedPathStoreBuilder>,
    /// Compressed bigram inverted index built during the post-scan phase.
    /// Lives here so that replacing `FileSync` on rescan automatically drops
    /// the stale index (bigram file indices are positions in `files`).
    bigram_index: Option<Arc<BigramFilter>>,
    /// Overlay tracking file mutations since the bigram index was built.
    bigram_overlay: Option<Arc<parking_lot::RwLock<BigramOverlay>>>,
    /// Chunk-level deduped path store for zero-copy SIMD matching.
    /// Each file's relative path is pre-chunked into 16-byte aligned blocks
    /// with content-based deduplication across files.
    chunked_paths: Option<crate::simd_path::ChunkedPathStore>,
}
138
impl FileSync {
    /// Empty index; every field is populated later by a full scan.
    fn new() -> Self {
        Self {
            files: Vec::new(),
            base_count: 0,
            dirs: Vec::new(),
            overflow_builder: None,
            git_workdir: None,
            bigram_index: None,
            bigram_overlay: None,
            chunked_paths: None,
        }
    }

    /// Arena for base files (from the last full scan).
    /// Null before the first scan completes (no `chunked_paths` yet).
    #[inline]
    fn arena_base_ptr(&self) -> ArenaPtr {
        self.chunked_paths
            .as_ref()
            .map(|s| s.as_arena_ptr())
            .unwrap_or(ArenaPtr::null())
    }

    /// Arena for overflow files (added after the last full scan).
    /// Falls back to the base arena when no overflow builder exists yet.
    #[inline]
    fn overflow_arena_ptr(&self) -> ArenaPtr {
        self.overflow_builder
            .as_ref()
            .map(|b| b.as_arena_ptr())
            .unwrap_or(self.arena_base_ptr())
    }

    /// Resolve the correct arena for a given file (base vs overflow).
    #[inline]
    fn arena_for_file(&self, file: &FileItem) -> ArenaPtr {
        if file.is_overflow() {
            self.overflow_arena_ptr()
        } else {
            self.arena_base_ptr()
        }
    }

    /// Get all files (base + overflow). The base portion `[..base_count]` is
    /// sorted by path; the overflow tail is unsorted.
    #[inline]
    fn files(&self) -> &[FileItem] {
        &self.files
    }

    /// Get the overflow portion (files added since last full reindex).
    #[inline]
    fn overflow_files(&self) -> &[FileItem] {
        &self.files[self.base_count..]
    }

    /// Get mutable file at index. Indexes the whole vec, so overflow
    /// indices work too, although callers primarily use it for base files.
    #[inline]
    fn get_file_mut(&mut self, index: usize) -> Option<&mut FileItem> {
        self.files.get_mut(index)
    }

    /// Find file index by path using binary search on the sorted base portion.
    /// `path` must be an absolute path under `base_path`.
    ///
    /// NOTE(review): `Err(0)` doubles as "not under base / dir unknown" and
    /// as insertion position 0 — `insert_file_sorted` will insert at the
    /// front in that case. Confirm that is the intended behavior.
    #[inline]
    fn find_file_index(&self, path: &Path, base_path: &Path) -> Result<usize, usize> {
        let arena = self.arena_base_ptr();

        // Strip base_path prefix to get the relative path.
        let rel_path = match path.strip_prefix(base_path) {
            Ok(r) => r.to_string_lossy(),
            Err(_) => return Err(0),
        };

        // Split into directory (with trailing '/') and filename.
        let parent_end = rel_path
            .rfind(std::path::is_separator)
            .map(|i| i + 1)
            .unwrap_or(0);
        let dir_rel = &rel_path[..parent_end];
        let filename = &rel_path[parent_end..];

        // Binary search dirs to find the parent directory index.
        // Dir items store the relative path including trailing '/' (e.g. "src/components/").
        let mut dir_buf = [0u8; crate::simd_path::PATH_BUF_SIZE];
        let dir_idx = match self
            .dirs
            .binary_search_by(|d| d.read_relative_path(arena, &mut dir_buf).cmp(dir_rel))
        {
            Ok(idx) => idx as u32,
            Err(_) => return Err(0), // directory not found
        };

        // Binary search files by (parent_dir, filename) — same order as the sort
        // applied during the full scan, so Ok(i)/Err(i) are both meaningful.
        self.files[..self.base_count].binary_search_by(|f| {
            f.parent_dir_index().cmp(&dir_idx).then_with(|| {
                let fname = f.file_name(arena);
                fname.as_str().cmp(filename)
            })
        })
    }

    /// Find a file in the overflow portion by relative path (linear scan).
    /// Returns the absolute index into `files` (i.e. `base_count + position`).
    fn find_overflow_index(&self, rel_path: &str) -> Option<usize> {
        let overflow_arena = self.overflow_arena_ptr();
        self.files[self.base_count..]
            .iter()
            .position(|f| f.relative_path_eq(overflow_arena, rel_path))
            .map(|pos| self.base_count + pos)
    }

    /// Insert a file at position. Plain `Vec::insert` — there is no secondary
    /// lookup structure to keep in sync.
    fn insert_file(&mut self, position: usize, file: FileItem) {
        self.files.insert(position, file);
    }

    /// Retain only files for which `predicate(file, owning_arena)` is true.
    /// Updates `base_count` and returns the number of files removed.
    ///
    /// The predicate runs TWICE over base files (once to count survivors,
    /// once inside `retain`), so it must be deterministic and free of
    /// one-shot side effects.
    fn retain_files_with_arena<F>(&mut self, mut predicate: F) -> usize
    where
        F: FnMut(&FileItem, ArenaPtr) -> bool,
    {
        let base_arena = self.arena_base_ptr();
        let overflow_arena = self.overflow_arena_ptr();

        let base_count = self.base_count;
        let initial_len = self.files.len();
        // Count surviving base files first; `retain` preserves order, so the
        // survivors of the base portion stay contiguous at the front.
        let base_retained = self.files[..base_count]
            .iter()
            .filter(|f| predicate(f, base_arena))
            .count();

        self.files.retain(|f| {
            predicate(
                f,
                if f.is_overflow() {
                    overflow_arena
                } else {
                    base_arena
                },
            )
        });

        self.base_count = base_retained;
        initial_len - self.files.len()
    }

    /// Insert a file in sorted order (by path).
    /// Returns true if inserted, false if file already exists.
    fn insert_file_sorted(&mut self, file: FileItem, base_path: &Path) -> bool {
        let arena = self.arena_base_ptr();
        let abs_path = file.absolute_path(arena, base_path);
        match self.find_file_index(&abs_path, base_path) {
            Ok(_) => false, // File already exists
            Err(position) => {
                self.insert_file(position, file);
                true
            }
        }
    }
}
298
299impl FileItem {
300    pub fn new(path: PathBuf, base_path: &Path, git_status: Option<Status>) -> (Self, String) {
301        let metadata = std::fs::metadata(&path).ok();
302        Self::new_with_metadata(path, base_path, git_status, metadata.as_ref())
303    }
304
305    /// Create a FileItem using pre-fetched metadata to avoid a redundant stat syscall.
306    /// Returns `(FileItem, relative_path)`. The FileItem's `path` field is
307    /// empty; callers must populate it via `set_path` or `build_chunked_path_store_and_assign`.
308    fn new_with_metadata(
309        path: PathBuf,
310        base_path: &Path,
311        git_status: Option<Status>,
312        metadata: Option<&std::fs::Metadata>,
313    ) -> (Self, String) {
314        let path_buf = pathdiff::diff_paths(&path, base_path).unwrap_or_else(|| path.clone());
315        let relative_path = path_buf.to_string_lossy().into_owned();
316
317        let (size, modified) = match metadata {
318            Some(metadata) => {
319                let size = metadata.len();
320                let modified = metadata
321                    .modified()
322                    .ok()
323                    .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
324                    .map_or(0, |d| d.as_secs());
325
326                (size, modified)
327            }
328            None => (0, 0),
329        };
330
331        let is_binary = is_known_binary_extension(&path);
332
333        let filename_start = relative_path
334            .rfind(std::path::is_separator)
335            .map(|i| i + 1)
336            .unwrap_or(0) as u16;
337
338        let item = Self::new_raw(filename_start, size, modified, git_status, is_binary);
339        (item, relative_path)
340    }
341
342    /// Create a FileItem with an empty ChunkedString from a path on disk.
343    ///
344    /// Returns `(file_item, relative_path_string)`. The relative path must be
345    /// kept alongside the FileItem until `build_chunked_path_store_and_assign`
346    /// populates each item's `path` field from the shared arena.
347    pub fn new_from_walk(
348        path: &Path,
349        base_path: &Path,
350        git_status: Option<Status>,
351        metadata: Option<&std::fs::Metadata>,
352    ) -> (Self, String) {
353        let (size, modified) = match metadata {
354            Some(metadata) => {
355                let size = metadata.len();
356                let modified = metadata
357                    .modified()
358                    .ok()
359                    .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
360                    .map_or(0, |d| d.as_secs());
361                (size, modified)
362            }
363            None => (0, 0),
364        };
365
366        let is_binary = is_known_binary_extension(path);
367
368        let rel = pathdiff::diff_paths(path, base_path).unwrap_or_else(|| path.to_path_buf());
369        let rel_str = rel.to_string_lossy().into_owned();
370        let fname_offset = rel_str
371            .rfind(std::path::is_separator)
372            .map(|i| i + 1)
373            .unwrap_or(0) as u16;
374
375        let item = Self::new_raw(fname_offset, size, modified, git_status, is_binary);
376        (item, rel_str)
377    }
378
379    pub(crate) fn update_frecency_scores(
380        &mut self,
381        tracker: &FrecencyTracker,
382        arena: ArenaPtr,
383        base_path: &Path,
384        mode: FFFMode,
385    ) -> Result<(), Error> {
386        let mut abs_buf = [0u8; crate::simd_path::PATH_BUF_SIZE];
387        let abs = self.write_absolute_path(arena, base_path, &mut abs_buf);
388        self.access_frecency_score = tracker.get_access_score(abs, mode) as i16;
389        self.modification_frecency_score =
390            tracker.get_modification_score(self.modified, self.git_status, mode) as i16;
391
392        Ok(())
393    }
394}
395
/// Options for creating a [`FilePicker`].
pub struct FilePickerOptions {
    /// Directory to index. Must exist and must not be the filesystem root
    /// (both are validated in [`FilePicker::new`]).
    pub base_path: String,
    /// Pre-populate mmap caches for top-frecency files after the initial scan.
    pub enable_mmap_cache: bool,
    /// Build content index after the initial scan for faster content-aware filtering.
    pub enable_content_indexing: bool,
    /// Mode of the picker; impacts how file-watcher events are handled and
    /// the scoring logic.
    pub mode: FFFMode,
    /// Explicit cache budget. When `None`, the budget is auto-computed from
    /// the repo size after the initial scan completes.
    pub cache_budget: Option<ContentCacheBudget>,
    /// When `false`, `new_with_shared_state` skips the background file watcher.
    pub watch: bool,
}
411
412impl Default for FilePickerOptions {
413    fn default() -> Self {
414        Self {
415            base_path: ".".into(),
416            enable_mmap_cache: false,
417            enable_content_indexing: false,
418            mode: FFFMode::default(),
419            cache_budget: None,
420            watch: true,
421        }
422    }
423}
424
pub struct FilePicker {
    /// Picker mode; affects watcher event handling and scoring.
    pub mode: FFFMode,
    /// Root directory being indexed (validated to exist and not be `/`).
    pub base_path: PathBuf,
    /// True while a full scan (background or synchronous) is in flight.
    pub is_scanning: Arc<AtomicBool>,
    /// File/dir index plus path arenas and bigram structures.
    sync_data: FileSync,
    /// Content-cache budget; auto-computed after a scan unless explicit.
    cache_budget: Arc<ContentCacheBudget>,
    /// True when the caller supplied `FilePickerOptions::cache_budget`;
    /// prevents `collect_files` from replacing the budget after a scan.
    has_explicit_cache_budget: bool,
    /// Set once the background file-system watcher has been spawned.
    watcher_ready: Arc<AtomicBool>,
    /// Running count of files discovered by the current scan.
    scanned_files_count: Arc<AtomicUsize>,
    /// Live watcher handle; `None` until `spawn_background_watcher` runs.
    background_watcher: Option<BackgroundWatcher>,
    /// See [`FilePickerOptions::enable_mmap_cache`].
    enable_mmap_cache: bool,
    /// See [`FilePickerOptions::enable_content_indexing`].
    enable_content_indexing: bool,
    /// Whether the background watcher should be spawned at all.
    watch: bool,
    /// Cooperative shutdown flag for background work.
    cancelled: Arc<AtomicBool>,
    // Soft lock preventing a rescan from being triggered while bigram
    // indexing is in progress. This keeps the unsafe tricks that rely on the
    // immutability of the files vec safe: the vec cannot be dropped or
    // replaced before the indexing is finished.
    //
    // In addition, a rescan triggered before indexing completes is likely
    // spurious — fff dogfooding its own index (UI preview rendering or
    // simply walking the directory) — which we don't want to honor anyway.
    post_scan_busy: Arc<AtomicBool>,
}
449
450impl std::fmt::Debug for FilePicker {
451    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
452        f.debug_struct("FilePicker")
453            .field("base_path", &self.base_path)
454            .field("sync_data", &self.sync_data)
455            .field("is_scanning", &self.is_scanning.load(Ordering::Relaxed))
456            .field(
457                "scanned_files_count",
458                &self.scanned_files_count.load(Ordering::Relaxed),
459            )
460            .finish_non_exhaustive()
461    }
462}
463
/// String-storage adapter so search code can resolve `FileItem` paths
/// against the picker's arenas without owning the picker.
impl FFFStringStorage for &FilePicker {
    /// Arena backing `file`'s path (base vs overflow, resolved per file).
    #[inline]
    fn arena_for(&self, file: &FileItem) -> crate::simd_path::ArenaPtr {
        self.sync_data.arena_for_file(file)
    }

    /// Arena for files from the last full scan.
    #[inline]
    fn base_arena(&self) -> crate::simd_path::ArenaPtr {
        self.sync_data.arena_base_ptr()
    }

    /// Arena for files added since the last full scan.
    #[inline]
    fn overflow_arena(&self) -> crate::simd_path::ArenaPtr {
        self.sync_data.overflow_arena_ptr()
    }
}
480
481impl FilePicker {
    /// Root directory this picker indexes (validated in [`FilePicker::new`]).
    pub fn base_path(&self) -> &Path {
        &self.base_path
    }
485
486    /// Convert an absolute path to a relative path string (relative to base_path).
487    /// Returns None if the path doesn't start with base_path.
488    fn to_relative_path<'a>(&self, path: &'a Path) -> Option<&'a str> {
489        path.strip_prefix(&self.base_path)
490            .ok()
491            .and_then(|p| p.to_str())
492    }
493
    /// Whether mmap-cache warmup was requested ([`FilePickerOptions::enable_mmap_cache`]).
    pub fn need_enable_mmap_cache(&self) -> bool {
        self.enable_mmap_cache
    }

    /// Whether post-scan content indexing was requested.
    pub fn need_enable_content_indexing(&self) -> bool {
        self.enable_content_indexing
    }

    /// Whether the background file-system watcher should be spawned.
    pub fn need_watch(&self) -> bool {
        self.watch
    }

    /// The mode this picker was created with.
    pub fn mode(&self) -> FFFMode {
        self.mode
    }

    /// Current content-cache budget (explicit, or auto-computed post-scan).
    pub fn cache_budget(&self) -> &ContentCacheBudget {
        &self.cache_budget
    }
513
    /// Compressed bigram index built after the last full scan, if any.
    pub fn bigram_index(&self) -> Option<&BigramFilter> {
        self.sync_data.bigram_index.as_deref()
    }

    /// Overlay tracking mutations applied since the bigram index was built.
    pub fn bigram_overlay(&self) -> Option<&parking_lot::RwLock<BigramOverlay>> {
        self.sync_data.bigram_overlay.as_deref()
    }

    /// Mutable access to the file at `index` in the full file vec.
    pub fn get_file_mut(&mut self, index: usize) -> Option<&mut FileItem> {
        self.sync_data.get_file_mut(index)
    }

    /// Install a freshly built bigram index and its mutation overlay.
    pub fn set_bigram_index(&mut self, index: BigramFilter, overlay: BigramOverlay) {
        self.sync_data.bigram_index = Some(Arc::new(index));
        self.sync_data.bigram_overlay = Some(Arc::new(parking_lot::RwLock::new(overlay)));
    }

    /// Workdir of the enclosing git repository, if one was detected.
    pub fn git_root(&self) -> Option<&Path> {
        self.sync_data.git_workdir.as_deref()
    }
534
    /// Get all indexed files sorted by path.
    /// Note: Files are stored sorted by PATH for efficient insert/remove.
    /// For frecency-sorted results, use search() which sorts matched results.
    pub fn get_files(&self) -> &[FileItem] {
        self.sync_data.files()
    }

    /// Files added since the last full reindex (the unsorted overflow tail).
    pub fn get_overflow_files(&self) -> &[FileItem] {
        self.sync_data.overflow_files()
    }

    /// Get the directory table (sorted by path).
    pub fn get_dirs(&self) -> &[DirItem] {
        &self.sync_data.dirs
    }

    /// Actual heap bytes used: (chunked_path_store, 0, 0).
    /// The second element is 0 because leaked overflow stores aren't tracked.
    pub fn arena_bytes(&self) -> (usize, usize, usize) {
        let chunked = self
            .sync_data
            .chunked_paths
            .as_ref()
            .map_or(0, |s| s.heap_bytes());
        (chunked, 0, 0)
    }
561
    /// Extracts all unique ancestor directories from the indexed file list.
    /// Uses the pre-built directory table when available (O(d) where d = unique dirs),
    /// falling back to the old traversal for overflow files.
    #[tracing::instrument(level = "debug", skip(self))]
    pub fn extract_watch_dirs(&self) -> Vec<PathBuf> {
        let dir_table = &self.sync_data.dirs;

        if !dir_table.is_empty() {
            // Fast path: just collect PathBufs from the dir table.
            // The dir table already contains all unique parent directories.
            // We also need ancestor directories (parents of parents) for the
            // watcher to work. Walk up from each dir to the base.
            let base = self.base_path.as_path();
            let arena = self.arena_base_ptr();
            let mut all_dirs = Vec::with_capacity(dir_table.len() * 2);
            let mut seen = std::collections::HashSet::with_capacity(dir_table.len() * 2);

            for dir_item in dir_table {
                // Walk from this directory up to (but excluding) the base.
                let mut current = dir_item.absolute_path(arena, base);
                while current.as_path() != base {
                    if !seen.insert(current.clone()) {
                        break; // already visited this and all its ancestors
                    }
                    all_dirs.push(current.clone());
                    // pop() returns false once we hit the filesystem root.
                    if !current.pop() {
                        break;
                    }
                }
            }

            return all_dirs;
        }

        // Fallback: old traversal for cases where dir table is empty.
        // `current` acts as a rolling prefix: for each file's parent we rewind
        // to the common ancestor, then push only the not-yet-seen components.
        // NOTE(review): the overflow tail is unsorted, so duplicates can be
        // pushed here — confirm callers tolerate duplicate watch dirs.
        let files = self.sync_data.files();
        let base = self.base_path.as_path();
        let arena = self.arena_base_ptr();
        let mut dirs = Vec::with_capacity(files.len() / 4);
        let mut current = self.base_path.clone();

        for file in files {
            let abs = file.absolute_path(arena, base);
            let Some(parent) = abs.parent() else {
                continue;
            };
            // Same directory as the previous file — nothing new to record.
            if parent == current.as_path() {
                continue;
            }

            // Rewind `current` until it is a prefix of `parent` (or the base).
            while current.as_path() != base && !parent.starts_with(&current) {
                current.pop();
            }

            let Ok(remainder) = parent.strip_prefix(&current) else {
                continue;
            };
            // Push each missing component, recording every new ancestor dir.
            for component in remainder.components() {
                current.push(component);
                dirs.push(current.clone());
            }
        }

        dirs
    }
626
627    /// Create a new FilePicker from options.
628    /// Always prefer new_with_shared_state for the consumer application, use this only if you know
629    /// what you are doing. This won't spawn the backgraound watcher and won't walk the file tree.
630    pub fn new(options: FilePickerOptions) -> Result<Self, Error> {
631        let path = PathBuf::from(&options.base_path);
632        if !path.exists() {
633            error!("Base path does not exist: {}", options.base_path);
634            return Err(Error::InvalidPath(path));
635        }
636        if path.parent().is_none() {
637            error!("Refusing to index filesystem root: {}", path.display());
638            return Err(Error::FilesystemRoot(path));
639        }
640
641        let has_explicit_budget = options.cache_budget.is_some();
642        let initial_budget = options.cache_budget.unwrap_or_default();
643
644        Ok(FilePicker {
645            background_watcher: None,
646            base_path: path,
647            cache_budget: Arc::new(initial_budget),
648            cancelled: Arc::new(AtomicBool::new(false)),
649            has_explicit_cache_budget: has_explicit_budget,
650            is_scanning: Arc::new(AtomicBool::new(false)),
651            mode: options.mode,
652            post_scan_busy: Arc::new(AtomicBool::new(false)),
653            scanned_files_count: Arc::new(AtomicUsize::new(0)),
654            sync_data: FileSync::new(),
655            enable_mmap_cache: options.enable_mmap_cache,
656            enable_content_indexing: options.enable_content_indexing,
657            watch: options.watch,
658            watcher_ready: Arc::new(AtomicBool::new(false)),
659        })
660    }
661
    /// Create a picker, place it into the shared handle, and spawn background
    /// indexing + file-system watcher. This is the default entry point.
    pub fn new_with_shared_state(
        shared_picker: SharedPicker,
        shared_frecency: SharedFrecency,
        options: FilePickerOptions,
    ) -> Result<(), Error> {
        // Validates the path and builds an empty (un-scanned) picker.
        let picker = Self::new(options)?;

        info!(
            "Spawning background threads: base_path={}, warmup={}, content_indexing={}, mode={:?}",
            picker.base_path.display(),
            picker.enable_mmap_cache,
            picker.enable_content_indexing,
            picker.mode,
        );

        // Snapshot config flags before `picker` is moved into the shared handle.
        let warmup = picker.enable_mmap_cache;
        let content_indexing = picker.enable_content_indexing;
        let watch = picker.watch;
        let mode = picker.mode;

        // Mark scanning *before* publishing the picker so readers never
        // observe a freshly published picker that claims the scan is done.
        picker.is_scanning.store(true, Ordering::Release);

        // Clone the shared atomics/paths the background threads will drive.
        let scan_signal = Arc::clone(&picker.is_scanning);
        let watcher_ready = Arc::clone(&picker.watcher_ready);
        let synced_files_count = Arc::clone(&picker.scanned_files_count);
        let cancelled = Arc::clone(&picker.cancelled);
        let post_scan_busy = Arc::clone(&picker.post_scan_busy);
        let path = picker.base_path.clone();

        {
            // Scoped write: release the lock before spawning, since the
            // background work receives the same shared handle.
            let mut guard = shared_picker.write()?;
            *guard = Some(picker);
        }

        spawn_scan_and_watcher(
            path,
            scan_signal,
            watcher_ready,
            synced_files_count,
            warmup,
            content_indexing,
            watch,
            mode,
            shared_picker,
            shared_frecency,
            cancelled,
            post_scan_busy,
        );

        Ok(())
    }
715
    /// Synchronous filesystem scan — populates `self` with indexed files.
    ///
    /// Use this when you need direct access to the picker without shared state:
    /// ```ignore
    /// let mut picker = FilePicker::new(options)?;
    /// picker.collect_files()?;
    /// // picker.get_files() is now populated
    /// ```
    pub fn collect_files(&mut self) -> Result<(), Error> {
        // NOTE(review): `new_with_shared_state` stores this flag with
        // `Ordering::Release`, while this synchronous path uses `Relaxed` —
        // confirm no cross-thread reader depends on the stronger ordering.
        self.is_scanning.store(true, Ordering::Relaxed);
        self.scanned_files_count.store(0, Ordering::Relaxed);

        // Synchronous path has no shared frecency; pass an empty default.
        let empty_frecency = SharedFrecency::default();
        let walk = walk_filesystem(
            &self.base_path,
            &self.scanned_files_count,
            &empty_frecency,
            self.mode,
        )?;

        // Replace the whole index; old arenas/bigram structures drop here.
        self.sync_data = walk.sync;

        // Recalculate cache budget based on actual file count (unless
        // the caller provided an explicit budget via FilePickerOptions).
        if !self.has_explicit_cache_budget {
            let file_count = self.sync_data.files().len();
            self.cache_budget = Arc::new(ContentCacheBudget::new_for_repo(file_count));
        } else {
            self.cache_budget.reset();
        }

        // Apply git status synchronously.
        if let Ok(Some(git_cache)) = walk.git_handle.join() {
            let arena = self.arena_base_ptr();
            for file in self.sync_data.files.iter_mut() {
                file.git_status =
                    git_cache.lookup_status(&file.absolute_path(arena, &self.base_path));
            }
        }

        self.is_scanning.store(false, Ordering::Relaxed);
        Ok(())
    }
759
760    /// Start the background file-system watcher.
761    ///
762    /// The picker must already be placed into `shared_picker` (the watcher
763    /// needs the shared handle to apply live updates). Call after
764    /// [`collect_files`](Self::collect_files) or after an initial scan.
765    pub fn spawn_background_watcher(
766        &mut self,
767        shared_picker: &SharedPicker,
768        shared_frecency: &SharedFrecency,
769    ) -> Result<(), Error> {
770        let git_workdir = self.sync_data.git_workdir.clone();
771        let watch_dirs = self.extract_watch_dirs();
772        let watcher = BackgroundWatcher::new(
773            self.base_path.clone(),
774            git_workdir,
775            shared_picker.clone(),
776            shared_frecency.clone(),
777            self.mode,
778            watch_dirs,
779        )?;
780        self.background_watcher = Some(watcher);
781        self.watcher_ready.store(true, Ordering::Release);
782        Ok(())
783    }
784
    /// Perform fuzzy search on files with a pre-parsed query.
    ///
    /// The query should be parsed using [`FFFQuery`]::parse() before calling
    /// this function. If a [`QueryTracker`] is provided (together with
    /// `options.project_path`), the search will automatically look up the
    /// last selected file for this query and apply combo-boost scoring.
    ///
    pub fn fuzzy_search<'q>(
        &self,
        query: &'q FFFQuery<'q>,
        query_tracker: Option<&QueryTracker>,
        options: FuzzySearchOptions<'q>,
    ) -> SearchResult<'_> {
        let files = self.get_files();
        // `max_threads == 0` means auto: use every available core, falling
        // back to 4 if the parallelism query fails.
        let max_threads = if options.max_threads == 0 {
            std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(4)
        } else {
            options.max_threads
        };

        debug!(
            raw_query = ?query.raw_query,
            pagination = ?options.pagination,
            ?max_threads,
            current_file = ?options.current_file,
            "Fuzzy search",
        );

        let total_files = files.len();
        let location = query.location;

        // Get effective query for max_typos calculation (without location suffix)
        let effective_query = match &query.fuzzy_query {
            fff_query_parser::FuzzyQuery::Text(t) => *t,
            fff_query_parser::FuzzyQuery::Parts(parts) if !parts.is_empty() => parts[0],
            _ => query.raw_query.trim(),
        };

        // Typo budget: one typo per four characters, clamped to [2, 6] —
        // small queries with a large number of results can match absolutely everything
        let max_typos = (effective_query.len() as u16 / 4).clamp(2, 6);
        // Look up the last file selected for this query (combo-boost scoring).
        // Requires both a tracker and a project path; lookup errors are
        // treated as "no previous selection".
        let last_same_query_entry =
            query_tracker
                .zip(options.project_path)
                .and_then(|(tracker, project_path)| {
                    tracker
                        .get_last_query_entry(
                            query.raw_query,
                            project_path,
                            options.min_combo_count,
                        )
                        .ok()
                        .flatten()
                });

        let context = ScoringContext {
            query,
            max_typos,
            max_threads,
            project_path: options.project_path,
            current_file: options.current_file,
            last_same_query_match: last_same_query_entry,
            combo_boost_score_multiplier: options.combo_boost_score_multiplier,
            min_combo_count: options.min_combo_count,
            pagination: options.pagination,
        };

        let time = std::time::Instant::now();

        // Overflow files (added after the initial scan) live in a separate
        // arena; fall back to the base arena when no overflow builder exists.
        let base_arena = self.sync_data.arena_base_ptr();
        let overflow_arena = self
            .sync_data
            .overflow_builder
            .as_ref()
            .map(|b| b.as_arena_ptr())
            .unwrap_or(base_arena);

        let (items, scores, total_matched) = fuzzy_match_and_score_files(
            files,
            &context,
            self.sync_data.base_count,
            base_arena,
            overflow_arena,
        );

        info!(
            ?query,
            completed_in = ?time.elapsed(),
            total_matched,
            returned_count = items.len(),
            pagination = ?options.pagination,
            "Fuzzy search completed",
        );

        SearchResult {
            items,
            scores,
            total_matched,
            total_files,
            location,
        }
    }
889
890    /// Perform fuzzy search on indexed directories.
891    ///
892    /// Returns directories ranked by fuzzy match quality + frecency.
893    pub fn fuzzy_search_directories<'q>(
894        &self,
895        query: &'q FFFQuery<'q>,
896        options: FuzzySearchOptions<'q>,
897    ) -> DirSearchResult<'_> {
898        let dirs = self.get_dirs();
899        let max_threads = if options.max_threads == 0 {
900            std::thread::available_parallelism()
901                .map(|n| n.get())
902                .unwrap_or(4)
903        } else {
904            options.max_threads
905        };
906
907        let total_dirs = dirs.len();
908
909        let effective_query = match &query.fuzzy_query {
910            fff_query_parser::FuzzyQuery::Text(t) => *t,
911            fff_query_parser::FuzzyQuery::Parts(parts) if !parts.is_empty() => parts[0],
912            _ => query.raw_query.trim(),
913        };
914
915        let max_typos = (effective_query.len() as u16 / 4).clamp(2, 6);
916
917        let context = ScoringContext {
918            query,
919            max_typos,
920            max_threads,
921            project_path: options.project_path,
922            current_file: options.current_file,
923            last_same_query_match: None,
924            combo_boost_score_multiplier: 0,
925            min_combo_count: 0,
926            pagination: options.pagination,
927        };
928
929        let arena = self.sync_data.arena_base_ptr();
930        let time = std::time::Instant::now();
931
932        let (items, scores, total_matched) =
933            crate::score::fuzzy_match_and_score_dirs(dirs, &context, arena);
934
935        info!(
936            ?query,
937            completed_in = ?time.elapsed(),
938            total_matched,
939            returned_count = items.len(),
940            "Directory search completed",
941        );
942
943        DirSearchResult {
944            items,
945            scores,
946            total_matched,
947            total_dirs,
948        }
949    }
950
    /// Perform a mixed fuzzy search across both files and directories.
    ///
    /// Returns a single flat list where files and directories are interleaved
    /// by total score in descending order.
    ///
    /// If the raw query ends with a path separator (`/`), only directories
    /// are searched — files are skipped entirely. The caller should parse the
    /// query with `DirSearchConfig` so that trailing `/` is kept as fuzzy
    /// text instead of becoming a `PathSegment` constraint.
    pub fn fuzzy_search_mixed<'q>(
        &self,
        query: &'q FFFQuery<'q>,
        query_tracker: Option<&QueryTracker>,
        options: FuzzySearchOptions<'q>,
    ) -> MixedSearchResult<'_> {
        let location = query.location;
        let page_offset = options.pagination.offset;
        // A limit of 0 means "unset"; default to 100 results per page.
        let page_limit = if options.pagination.limit > 0 {
            options.pagination.limit
        } else {
            100
        };

        // Trailing separator (native or '/') switches to directory-only mode.
        let dirs_only =
            query.raw_query.ends_with(std::path::MAIN_SEPARATOR) || query.raw_query.ends_with('/');

        // Run file search and dir search with no pagination (we merge then paginate).
        // 2x (offset + limit) gives each source enough headroom so the merged
        // page is representative; saturating ops guard against overflow.
        let internal_limit = page_offset.saturating_add(page_limit).saturating_mul(2);

        let dir_options = FuzzySearchOptions {
            pagination: PaginationArgs {
                offset: 0,
                limit: internal_limit,
            },
            ..options
        };
        let dir_results = self.fuzzy_search_directories(query, dir_options);

        if dirs_only {
            // Directory-only path: skip the file search entirely and paginate
            // the directory results directly.
            let total_matched = dir_results.total_matched;
            let total_dirs = dir_results.total_dirs;

            let mut merged: Vec<(MixedItemRef<'_>, Score)> =
                Vec::with_capacity(dir_results.items.len());
            for (dir, score) in dir_results.items.into_iter().zip(dir_results.scores) {
                merged.push((MixedItemRef::Dir(dir), score));
            }

            // Requested page starts beyond the result set: empty page, but
            // totals are still reported.
            if page_offset >= merged.len() {
                return MixedSearchResult {
                    items: vec![],
                    scores: vec![],
                    total_matched,
                    total_files: self.sync_data.files().len(),
                    total_dirs,
                    location,
                };
            }

            let end = (page_offset + page_limit).min(merged.len());
            let page = merged.drain(page_offset..end);
            let (items, scores): (Vec<_>, Vec<_>) = page.unzip();

            return MixedSearchResult {
                items,
                scores,
                total_matched,
                total_files: self.sync_data.files().len(),
                total_dirs,
                location,
            };
        }

        let file_options = FuzzySearchOptions {
            pagination: PaginationArgs {
                offset: 0,
                limit: internal_limit,
            },
            ..options
        };
        let file_results = self.fuzzy_search(query, query_tracker, file_options);

        // Merge by score descending.
        let total_matched = file_results.total_matched + dir_results.total_matched;
        let total_files = file_results.total_files;
        let total_dirs = dir_results.total_dirs;

        let mut merged: Vec<(MixedItemRef<'_>, Score)> =
            Vec::with_capacity(file_results.items.len() + dir_results.items.len());

        for (file, score) in file_results.items.into_iter().zip(file_results.scores) {
            merged.push((MixedItemRef::File(file), score));
        }
        for (dir, score) in dir_results.items.into_iter().zip(dir_results.scores) {
            merged.push((MixedItemRef::Dir(dir), score));
        }

        // Sort merged results by total score descending.
        merged.sort_unstable_by_key(|b| std::cmp::Reverse(b.1.total));

        // Paginate.
        if page_offset >= merged.len() {
            return MixedSearchResult {
                items: vec![],
                scores: vec![],
                total_matched,
                total_files,
                total_dirs,
                location,
            };
        }

        let end = (page_offset + page_limit).min(merged.len());
        let page = merged.drain(page_offset..end);
        let (items, scores): (Vec<_>, Vec<_>) = page.unzip();

        MixedSearchResult {
            items,
            scores,
            total_matched,
            total_files,
            total_dirs,
            location,
        }
    }
1076
1077    /// Perform a live grep search across indexed files.
1078    ///
1079    /// If `options.abort_signal` is set it overrides the picker's internal
1080    /// cancellation flag, giving the caller full control over when to stop.
1081    pub fn grep(&self, query: &FFFQuery<'_>, options: &GrepSearchOptions) -> GrepResult<'_> {
1082        let overlay_guard = self.sync_data.bigram_overlay.as_ref().map(|o| o.read());
1083        let arena = self.arena_base_ptr();
1084        let overflow_arena = self.sync_data.overflow_arena_ptr();
1085        let cancel = options.abort_signal.as_deref().unwrap_or(&self.cancelled);
1086
1087        grep_search(
1088            self.get_files(),
1089            query,
1090            options,
1091            self.cache_budget(),
1092            self.sync_data.bigram_index.as_deref(),
1093            overlay_guard.as_deref(),
1094            cancel,
1095            &self.base_path,
1096            arena,
1097            overflow_arena,
1098        )
1099    }
1100
1101    /// Multi-pattern grep search across indexed files.
1102    pub fn multi_grep(
1103        &self,
1104        patterns: &[&str],
1105        constraints: &[fff_query_parser::Constraint<'_>],
1106        options: &GrepSearchOptions,
1107    ) -> GrepResult<'_> {
1108        let overlay_guard = self.sync_data.bigram_overlay.as_ref().map(|o| o.read());
1109        let arena = self.arena_base_ptr();
1110        let overflow_arena = self.sync_data.overflow_arena_ptr();
1111        let cancel = options.abort_signal.as_deref().unwrap_or(&self.cancelled);
1112
1113        multi_grep_search(
1114            self.get_files(),
1115            patterns,
1116            constraints,
1117            options,
1118            self.cache_budget(),
1119            self.sync_data.bigram_index.as_deref(),
1120            overlay_guard.as_deref(),
1121            cancel,
1122            &self.base_path,
1123            arena,
1124            overflow_arena,
1125        )
1126    }
1127
1128    /// Like [`grep`](Self::grep) but ignores the bigram overlay.
1129    pub fn grep_without_overlay(
1130        &self,
1131        query: &FFFQuery<'_>,
1132        options: &GrepSearchOptions,
1133    ) -> GrepResult<'_> {
1134        let arena = self.arena_base_ptr();
1135        let overflow_arena = self.sync_data.overflow_arena_ptr();
1136        let cancel = options.abort_signal.as_deref().unwrap_or(&self.cancelled);
1137
1138        grep_search(
1139            self.get_files(),
1140            query,
1141            options,
1142            self.cache_budget(),
1143            self.sync_data.bigram_index.as_deref(),
1144            None,
1145            cancel,
1146            &self.base_path,
1147            arena,
1148            overflow_arena,
1149        )
1150    }
1151
1152    // Returns an ongoing or finisshed scan progress
1153    pub fn get_scan_progress(&self) -> ScanProgress {
1154        let scanned_count = self.scanned_files_count.load(Ordering::Relaxed);
1155        let is_scanning = self.is_scanning.load(Ordering::Relaxed);
1156        ScanProgress {
1157            scanned_files_count: scanned_count,
1158            is_scanning,
1159            is_watcher_ready: self.watcher_ready.load(Ordering::Relaxed),
1160            is_warmup_complete: self.sync_data.bigram_index.is_some(),
1161        }
1162    }
1163
1164    /// Update git statuses for files, using the provided shared frecency tracker.
1165    pub fn update_git_statuses(
1166        &mut self,
1167        status_cache: GitStatusCache,
1168        shared_frecency: &SharedFrecency,
1169    ) -> Result<(), Error> {
1170        debug!(
1171            statuses_count = status_cache.statuses_len(),
1172            "Updating git status",
1173        );
1174
1175        let mode = self.mode;
1176        let bp = self.base_path.clone();
1177        let arena = self.arena_base_ptr();
1178        let frecency = shared_frecency.read()?;
1179        status_cache
1180            .into_iter()
1181            .try_for_each(|(path, status)| -> Result<(), Error> {
1182                if let Some(file) = self.get_mut_file_by_path(&path) {
1183                    file.git_status = Some(status);
1184                    if let Some(ref f) = *frecency {
1185                        file.update_frecency_scores(f, arena, &bp, mode)?;
1186                    }
1187                    // Update parent dir frecency inline.
1188                    let score = file.access_frecency_score as i32;
1189                    let dir_idx = file.parent_dir_index() as usize;
1190                    if let Some(dir) = self.sync_data.dirs.get_mut(dir_idx) {
1191                        dir.update_frecency_if_larger(score);
1192                    }
1193                } else {
1194                    error!(?path, "Couldn't update the git status for path");
1195                }
1196                Ok(())
1197            })?;
1198
1199        Ok(())
1200    }
1201
1202    pub fn update_single_file_frecency(
1203        &mut self,
1204        file_path: impl AsRef<Path>,
1205        frecency_tracker: &FrecencyTracker,
1206    ) -> Result<(), Error> {
1207        let path = file_path.as_ref();
1208        let arena = self.arena_base_ptr();
1209        let rel = self.to_relative_path(path).unwrap_or("");
1210        let index = self
1211            .sync_data
1212            .find_file_index(path, &self.base_path)
1213            .ok()
1214            .or_else(|| self.sync_data.find_overflow_index(rel));
1215        if let Some(index) = index
1216            && let Some(file) = self.sync_data.get_file_mut(index)
1217        {
1218            file.update_frecency_scores(frecency_tracker, arena, &self.base_path, self.mode)?;
1219
1220            // Update parent dir frecency inline (only if larger).
1221            let score = file.access_frecency_score as i32;
1222            let dir_idx = file.parent_dir_index() as usize;
1223            if let Some(dir) = self.sync_data.dirs.get_mut(dir_idx) {
1224                dir.update_frecency_if_larger(score);
1225            }
1226        }
1227
1228        Ok(())
1229    }
1230
1231    pub fn get_file_by_path(&self, path: impl AsRef<Path>) -> Option<&FileItem> {
1232        self.sync_data
1233            .find_file_index(path.as_ref(), &self.base_path)
1234            .ok()
1235            .and_then(|index| self.sync_data.files().get(index))
1236    }
1237
1238    pub fn get_mut_file_by_path(&mut self, path: impl AsRef<Path>) -> Option<&mut FileItem> {
1239        let path = path.as_ref();
1240        let rel = self.to_relative_path(path).unwrap_or("");
1241        let index = self
1242            .sync_data
1243            .find_file_index(path, &self.base_path)
1244            .ok()
1245            .or_else(|| self.sync_data.find_overflow_index(rel));
1246        index.and_then(|i| self.sync_data.get_file_mut(i))
1247    }
1248
1249    /// Add a file to the picker's files in sorted order (used by background watcher)
1250    pub fn add_file_sorted(&mut self, file: FileItem) -> Option<&FileItem> {
1251        let arena = self.arena_base_ptr();
1252        let path = file.absolute_path(arena, &self.base_path);
1253
1254        if self.sync_data.insert_file_sorted(file, &self.base_path) {
1255            // File was inserted, look it up
1256            self.sync_data
1257                .find_file_index(&path, &self.base_path)
1258                .ok()
1259                .and_then(|idx| self.sync_data.get_file_mut(idx))
1260                .map(|file_mut| &*file_mut) // Convert &mut to &
1261        } else {
1262            // File already exists
1263            warn!(
1264                "Trying to insert a file that already exists: {}",
1265                path.display()
1266            );
1267            self.sync_data
1268                .find_file_index(&path, &self.base_path)
1269                .ok()
1270                .and_then(|idx| self.sync_data.get_file_mut(idx))
1271                .map(|file_mut| &*file_mut) // Convert &mut to &
1272        }
1273    }
1274
    /// Handle a filesystem create-or-modify event for `path`.
    ///
    /// Three cases, checked in order:
    /// 1. Path is in the base index: resurrect it if tombstoned, refresh its
    ///    mtime (invalidating the mmap cache on change), and push the new
    ///    content into the bigram overlay.
    /// 2. Path is in the overflow set: refresh mtime only (overflow files are
    ///    not covered by the bigram overlay).
    /// 3. Otherwise: append as a new overflow file, keeping base indices
    ///    stable for the bigram index.
    #[tracing::instrument(skip(self), name = "timing_update", level = Level::DEBUG)]
    pub fn on_create_or_modify(&mut self, path: impl AsRef<Path> + Debug) -> Option<&FileItem> {
        let path = path.as_ref();
        // Clone the overlay handle before taking mutable borrows below.
        let overlay = self.sync_data.bigram_overlay.as_ref().map(Arc::clone);

        if let Ok(pos) = self.sync_data.find_file_index(path, &self.base_path) {
            let file = self.sync_data.get_file_mut(pos)?;

            if file.is_deleted() {
                // Resurrect tombstoned file.
                file.set_deleted(false);
                debug!(
                    "on_create_or_modify: resurrected tombstoned file at index {}",
                    pos
                );
            }

            debug!(
                "on_create_or_modify: file EXISTS at index {}, updating metadata",
                pos
            );

            // Best-effort mtime read; metadata failures are logged and
            // treated as "no new timestamp".
            let modified = match std::fs::metadata(path) {
                Ok(metadata) => metadata
                    .modified()
                    .ok()
                    .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok()),
                Err(e) => {
                    error!("Failed to get metadata for {}: {}", path.display(), e);
                    None
                }
            };

            if let Some(modified) = modified {
                let modified = modified.as_secs();
                // Only advance mtime; stale events never move it backwards.
                if file.modified < modified {
                    file.modified = modified;
                    file.invalidate_mmap(&self.cache_budget);
                }
            }

            // Update the bigram overlay for this modified file.
            if let Some(ref overlay) = overlay
                && let Ok(content) = std::fs::read(path)
            {
                overlay.write().modify_file(pos, &content);
            }

            return Some(&*file);
        }

        // Check overflow for existing added files.
        let rel_path = self.to_relative_path(path).unwrap_or("");
        if let Some(abs_idx) = self.sync_data.find_overflow_index(rel_path) {
            let file = self.sync_data.get_file_mut(abs_idx)?;
            let modified = std::fs::metadata(path)
                .ok()
                .and_then(|m| m.modified().ok())
                .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok());
            if let Some(modified) = modified {
                let modified = modified.as_secs();
                if file.modified < modified {
                    file.modified = modified;
                    file.invalidate_mmap(&self.cache_budget);
                }
            }
            return Some(&*file);
        }

        // New file — append to overflow (preserves base indices for bigram).
        debug!(
            "on_create_or_modify: file NEW, appending to overflow (base: {}, overflow: {})",
            self.sync_data.base_count,
            self.sync_data.overflow_files().len(),
        );

        let (mut file_item, rel_path) = FileItem::new(path.to_path_buf(), &self.base_path, None);

        // Lazily create the shared overflow builder on first use.
        let builder = self
            .sync_data
            .overflow_builder
            .get_or_insert_with(|| crate::simd_path::ChunkedPathStoreBuilder::new(64));

        // Intern the relative path in the overflow arena and mark the item so
        // later lookups know which arena to resolve it against.
        let cs = builder.add_file_immediate(&rel_path, file_item.path.filename_offset);
        file_item.set_path(cs);
        file_item.set_overflow(true);
        self.sync_data.files.push(file_item);
        self.sync_data.files.last()
    }
1365
1366    /// Tombstone a file instead of removing it, keeping base indices stable.
1367    pub fn remove_file_by_path(&mut self, path: impl AsRef<Path>) -> bool {
1368        let path = path.as_ref();
1369        match self.sync_data.find_file_index(path, &self.base_path) {
1370            Ok(index) => {
1371                let file = &mut self.sync_data.files[index];
1372                file.set_deleted(true);
1373                file.invalidate_mmap(&self.cache_budget);
1374                if let Some(ref overlay) = self.sync_data.bigram_overlay {
1375                    overlay.write().delete_file(index);
1376                }
1377                true
1378            }
1379            Err(_) => {
1380                // Check overflow for added files — these can be removed directly
1381                // since they aren't in the base bigram index.
1382                let rel = self.to_relative_path(path).unwrap_or("");
1383                if let Some(abs_pos) = self.sync_data.find_overflow_index(rel) {
1384                    self.sync_data.files.remove(abs_pos);
1385                    true
1386                } else {
1387                    false
1388                }
1389            }
1390        }
1391    }
1392
1393    // TODO make this O(n)
1394    pub fn remove_all_files_in_dir(&mut self, dir: impl AsRef<Path>) -> usize {
1395        let dir_path = dir.as_ref();
1396        let relative_dir = self.to_relative_path(dir_path).unwrap_or("").to_string();
1397
1398        let dir_prefix = if relative_dir.is_empty() {
1399            String::new()
1400        } else {
1401            format!("{}{}", relative_dir, std::path::MAIN_SEPARATOR)
1402        };
1403
1404        self.sync_data.retain_files_with_arena(|file, arena| {
1405            !file.relative_path_starts_with(arena, &dir_prefix)
1406        })
1407    }
1408
    /// Use this to prevent any substantial background threads from acquiring the locks
    ///
    /// Sets the shared cancellation flag with release ordering so background
    /// workers loading it with acquire ordering observe the store.
    pub fn cancel(&self) {
        self.cancelled.store(true, Ordering::Release);
    }
1413
    /// Stop and drop the background filesystem watcher, if one is running.
    pub fn stop_background_monitor(&mut self) {
        // `take()` removes our handle, so a second call is a harmless no-op.
        if let Some(mut watcher) = self.background_watcher.take() {
            watcher.stop();
        }
    }
1419
    /// Pointer to the base string arena used to resolve interned file paths;
    /// delegates to the sync data.
    #[inline]
    pub(crate) fn arena_base_ptr(&self) -> ArenaPtr {
        self.sync_data.arena_base_ptr()
    }
1424
    /// Spawn a background thread to rebuild the bigram index after rescan.
    ///
    /// Returns `false` (and spawns nothing) when the picker is cancelled,
    /// `true` once the worker thread has been started. While the worker runs,
    /// `post_scan_busy` is held `true`, which makes [`Self::trigger_rescan`]
    /// skip — the worker holds a raw slice into the current files Vec, so
    /// `sync_data` must not be replaced underneath it.
    pub(crate) fn spawn_post_rescan_rebuild(&self, shared_picker: SharedPicker) -> bool {
        if self.cancelled.load(Ordering::Relaxed) {
            return false;
        }

        // Clone the flags/config the worker needs so the thread is 'static.
        let post_scan_busy = Arc::clone(&self.post_scan_busy);
        let cancelled = Arc::clone(&self.cancelled);
        let auto_budget = !self.has_explicit_cache_budget;
        let do_warmup = self.enable_mmap_cache;
        let do_content_indexing = self.enable_content_indexing;

        // Must be set before the thread starts so trigger_rescan can't slip in.
        post_scan_busy.store(true, Ordering::Release);

        std::thread::spawn(move || {
            let phase_start = std::time::Instant::now();

            // Scale cache budget if not explicitly configured.
            if auto_budget
                && !cancelled.load(Ordering::Acquire)
                && let Ok(mut guard) = shared_picker.write()
                && let Some(ref mut picker) = *guard
                && !picker.has_explicit_cache_budget
            {
                let file_count = picker.sync_data.files().len();
                picker.cache_budget = Arc::new(ContentCacheBudget::new_for_repo(file_count));
            }

            // Take a snapshot of files + budget while holding a brief read lock.
            // SAFETY: post_scan_busy blocks trigger_rescan from replacing
            // sync_data, so the Vec backing this slice stays alive.
            let files_snapshot = if !cancelled.load(Ordering::Acquire) {
                shared_picker.read().ok().and_then(|guard| {
                    guard.as_ref().map(|picker| {
                        let files = picker.sync_data.files();
                        let ptr = files.as_ptr();
                        let len = files.len();
                        let base_count = picker.sync_data.base_count;
                        let budget = Arc::clone(&picker.cache_budget);
                        let static_files: &[FileItem] =
                            unsafe { std::slice::from_raw_parts(ptr, len) };
                        (
                            static_files,
                            base_count,
                            budget,
                            picker.base_path().to_path_buf(),
                            picker.arena_base_ptr(),
                        )
                    })
                })
            } else {
                None
            };

            if let Some((files, base_count, budget, bp, arena)) = files_snapshot {
                // Warmup mmap caches.
                if do_warmup && !cancelled.load(Ordering::Acquire) {
                    let t = std::time::Instant::now();
                    warmup_mmaps(files, &budget, &bp, arena);
                    info!(
                        "Rescan warmup completed in {:.2}s (cached {} files, {} bytes)",
                        t.elapsed().as_secs_f64(),
                        budget.cached_count.load(Ordering::Relaxed),
                        budget.cached_bytes.load(Ordering::Relaxed),
                    );
                }

                // Build bigram index (lock-free).
                if do_content_indexing && !cancelled.load(Ordering::Acquire) {
                    let t = std::time::Instant::now();
                    // Index ONLY base files — overflow files are searched
                    // unconditionally by the grep overflow loop, so
                    // `BigramFilter::file_count` must equal
                    // `BigramOverlay::base_file_count` for the candidate
                    // bitset to never carry overflow-range bits.
                    let base_files = &files[..base_count.min(files.len())];
                    info!(
                        "Rescan: starting bigram index build for {} files...",
                        base_files.len()
                    );
                    let (index, content_binary) =
                        build_bigram_index(base_files, &budget, &bp, arena);
                    info!(
                        "Rescan: bigram index ready in {:.2}s",
                        t.elapsed().as_secs_f64()
                    );

                    // Brief write lock to store the index.
                    if let Ok(mut guard) = shared_picker.write()
                        && let Some(ref mut picker) = *guard
                    {
                        // Flag files the index build detected as binary so
                        // later greps skip their content.
                        for &idx in &content_binary {
                            if let Some(file) = picker.sync_data.get_file_mut(idx) {
                                file.set_binary(true);
                            }
                        }

                        // Use the same `base_count` the filter was built with
                        // so `file_count == base_file_count` is guaranteed.
                        picker.sync_data.bigram_index = Some(Arc::new(index));
                        picker.sync_data.bigram_overlay = Some(Arc::new(parking_lot::RwLock::new(
                            BigramOverlay::new(base_count),
                        )));
                    }
                }
            }

            // Release the rescan guard only after the raw slice is dead.
            post_scan_busy.store(false, Ordering::Release);
            info!(
                "Rescan post-scan phase total: {:.2}s (warmup={}, content_indexing={})",
                phase_start.elapsed().as_secs_f64(),
                do_warmup,
                do_content_indexing,
            );
        });

        true
    }
1543
    /// Synchronously re-index the filesystem, replacing the current file set.
    ///
    /// Skipped (returning `Ok`) when a scan is already running or when the
    /// post-scan bigram build still holds a raw pointer into the current
    /// files Vec. Git status and frecency are re-applied in parallel after a
    /// successful walk; warmup and bigram rebuild are deferred to the
    /// post-rescan thread.
    pub fn trigger_rescan(&mut self, shared_frecency: &SharedFrecency) -> Result<(), Error> {
        if self.is_scanning.load(Ordering::Relaxed) {
            debug!("Scan already in progress, skipping trigger_rescan");
            return Ok(());
        }

        // The post-scan warmup + bigram phase holds a raw pointer into the
        // current files Vec. Replacing sync_data now would free that memory.
        // Skip — the background watcher will retry on the next event.
        if self.post_scan_busy.load(Ordering::Acquire) {
            debug!("Post-scan bigram build in progress, skipping rescan");
            return Ok(());
        }

        self.is_scanning.store(true, Ordering::Relaxed);
        self.scanned_files_count.store(0, Ordering::Relaxed);

        let walk_result = walk_filesystem(
            &self.base_path,
            &self.scanned_files_count,
            shared_frecency,
            self.mode,
        );

        match walk_result {
            Ok(walk) => {
                info!(
                    "Filesystem rescan completed: found {} files",
                    walk.sync.files.len()
                );

                // Swap in the fresh index and drop all cached file content.
                self.sync_data = walk.sync;
                self.cache_budget.reset();

                // Apply git status synchronously for rescan (typically fast).
                if let Ok(Some(git_cache)) = walk.git_handle.join() {
                    let frecency = shared_frecency.read().ok();
                    let frecency_ref = frecency.as_ref().and_then(|f| f.as_ref());
                    let mode = self.mode;
                    let bp = &self.base_path;
                    let arena = self.arena_base_ptr();

                    // Reset dir frecency before recomputation.
                    // NOTE(review): called through a shared reference, so
                    // DirItem frecency presumably uses interior mutability —
                    // confirm it is safe under the parallel loop below.
                    for dir in self.sync_data.dirs.iter() {
                        dir.reset_frecency();
                    }

                    let files = &mut self.sync_data.files;
                    let dirs = &self.sync_data.dirs;
                    BACKGROUND_THREAD_POOL.install(|| {
                        files.par_iter_mut().for_each(|file| {
                            file.git_status =
                                git_cache.lookup_status(&file.absolute_path(arena, bp));
                            if let Some(frecency) = frecency_ref {
                                let _ = file.update_frecency_scores(frecency, arena, bp, mode);
                            }
                            // Propagate positive file scores to parent dirs.
                            let score = file.access_frecency_score as i32;
                            if score > 0 {
                                let dir_idx = file.parent_dir_index() as usize;
                                if let Some(dir) = dirs.get(dir_idx) {
                                    dir.update_frecency_if_larger(score);
                                }
                            }
                        });
                    });
                }

                // Warmup is deferred to the post-rescan bigram rebuild thread
                // (spawned by trigger_full_rescan) which does warmup + bigram
                // in one pass, matching the initial scan's post-scan phase.
            }
            Err(error) => error!(?error, "Failed to scan file system"),
        }

        self.is_scanning.store(false, Ordering::Relaxed);
        Ok(())
    }
1621
    /// Lock-free check for whether a scan is currently running.
    ///
    /// Cheaper than [`Self::get_scan_progress`] when only the boolean is
    /// needed: this reads a single shared atomic instead of taking a lock.
    pub fn is_scan_active(&self) -> bool {
        self.is_scanning.load(Ordering::Relaxed)
    }
1626
    /// Return a clone of the scanning flag (`Arc<AtomicBool>`) so callers
    /// can poll scan state without holding a lock on the picker.
    ///
    /// The flag is `true` while a scan is in progress; see
    /// [`Self::is_scan_active`] for a one-shot read.
    pub fn scan_signal(&self) -> Arc<AtomicBool> {
        Arc::clone(&self.is_scanning)
    }
1632
    /// Return a clone of the watcher-ready flag (`Arc<AtomicBool>`) so
    /// callers can poll it without holding a lock on the picker.
    ///
    /// The flag is set `true` once watcher setup has been resolved —
    /// including on scan failure and cancellation paths.
    pub fn watcher_signal(&self) -> Arc<AtomicBool> {
        Arc::clone(&self.watcher_ready)
    }
1638}
1639
/// A point-in-time snapshot of the file-scanning progress.
///
/// Returned by [`FilePicker::get_scan_progress`]. Useful for displaying
/// a progress indicator while the initial scan is running.
#[derive(Debug, Clone)]
pub struct ScanProgress {
    // Number of files discovered so far by the parallel walker.
    pub scanned_files_count: usize,
    // `true` while the initial walk (or a rescan) is still running.
    pub is_scanning: bool,
    // `true` once background watcher setup has been resolved (set even on
    // scan failure so waiters never hang).
    pub is_watcher_ready: bool,
    // presumably set after the post-scan warmup/bigram phase finishes —
    // its setter is outside this file section; confirm in get_scan_progress.
    pub is_warmup_complete: bool,
}
1651
/// Spawn the detached thread that drives the full indexing lifecycle:
/// initial walk → publish files → git status + frecency → watcher install →
/// optional post-scan warmup / bigram-index phase.
///
/// Flag protocol (as used below):
/// - `scan_signal` is already `true` when called; it is cleared as soon as
///   files are searchable, or when the walk fails/cancels.
/// - `watcher_ready` is stored `true` on every exit path once watcher setup
///   is resolved, including failure and cancellation.
/// - `cancelled` is re-checked between phases so a replaced picker abandons
///   remaining work.
/// - `post_scan_busy` brackets the warmup/bigram phase to keep rescans from
///   replacing the file Vec that the phase borrows via raw pointer.
#[allow(clippy::too_many_arguments)]
fn spawn_scan_and_watcher(
    base_path: PathBuf,
    scan_signal: Arc<AtomicBool>,
    watcher_ready: Arc<AtomicBool>,
    synced_files_count: Arc<AtomicUsize>,
    enable_mmap_cache: bool,
    enable_content_indexing: bool,
    watch: bool,
    mode: FFFMode,
    shared_picker: SharedPicker,
    shared_frecency: SharedFrecency,
    cancelled: Arc<AtomicBool>,
    post_scan_busy: Arc<AtomicBool>,
) {
    std::thread::spawn(move || {
        // scan_signal is already `true` (set by the caller before spawning)
        // so waiters see "scanning" even before this thread is scheduled.
        info!("Starting initial file scan");

        let git_workdir;

        match walk_filesystem(&base_path, &synced_files_count, &shared_frecency, mode) {
            Ok(walk) => {
                if cancelled.load(Ordering::Acquire) {
                    info!("Walk completed but picker was replaced, discarding results");
                    scan_signal.store(false, Ordering::Relaxed);
                    return;
                }

                info!(
                    "Initial filesystem walk completed: found {} files",
                    walk.sync.files.len()
                );

                git_workdir = walk.sync.git_workdir.clone();
                let git_handle = walk.git_handle;

                // Write files immediately — they are now searchable even
                // before git status or warmup completes.
                let write_result = shared_picker.write().ok().map(|mut guard| {
                    if let Some(ref mut picker) = *guard {
                        picker.sync_data = walk.sync;
                        picker.cache_budget.reset();
                    }
                });

                if write_result.is_none() {
                    error!("Failed to write scan results into picker");
                }

                // Signal scan complete — files are searchable.
                scan_signal.store(false, Ordering::Relaxed);
                info!("Files indexed and searchable");

                // Git status runs on its own thread; join + apply it now
                // unless the picker was replaced in the meantime.
                if !cancelled.load(Ordering::Acquire) {
                    apply_git_status_and_frecency(
                        &shared_picker,
                        &shared_frecency,
                        git_handle,
                        mode,
                    );
                }
            }
            Err(e) => {
                error!("Initial scan failed: {:?}", e);
                scan_signal.store(false, Ordering::Relaxed);
                // Resolve the watcher phase too so waiters don't hang.
                watcher_ready.store(true, Ordering::Release);
                return;
            }
        }

        if watch && !cancelled.load(Ordering::Acquire) {
            let watch_dirs = shared_picker
                .read()
                .ok()
                .and_then(|guard| guard.as_ref().map(|picker| picker.extract_watch_dirs()))
                .unwrap_or_default();

            match BackgroundWatcher::new(
                base_path.clone(),
                git_workdir,
                shared_picker.clone(),
                shared_frecency.clone(),
                mode,
                watch_dirs,
            ) {
                Ok(watcher) => {
                    info!("Background file watcher initialized successfully");

                    // Cancellation may have raced watcher construction;
                    // drop it instead of installing an orphan.
                    if cancelled.load(Ordering::Acquire) {
                        info!("Picker was replaced, dropping orphaned watcher");
                        drop(watcher);
                        watcher_ready.store(true, Ordering::Release);
                        return;
                    }

                    let write_result = shared_picker.write().ok().map(|mut guard| {
                        if let Some(ref mut picker) = *guard {
                            picker.background_watcher = Some(watcher);
                        }
                    });

                    if write_result.is_none() {
                        error!("Failed to store background watcher in picker");
                    }
                }
                Err(e) => {
                    error!("Failed to initialize background file watcher: {:?}", e);
                }
            }
        }

        watcher_ready.store(true, Ordering::Release);

        let need_post_scan =
            (enable_mmap_cache || enable_content_indexing) && !cancelled.load(Ordering::Acquire);

        if need_post_scan {
            post_scan_busy.store(true, Ordering::Release);
            let phase_start = std::time::Instant::now();

            // Scale cache limits based on repo size (skip if caller provided an explicit budget).
            if let Ok(mut guard) = shared_picker.write()
                && let Some(ref mut picker) = *guard
                && !picker.has_explicit_cache_budget
            {
                let file_count = picker.sync_data.files().len();
                picker.cache_budget = Arc::new(ContentCacheBudget::new_for_repo(file_count));
                info!(
                    "Cache budget configured for {} files: max_files={}, max_bytes={}",
                    file_count, picker.cache_budget.max_files, picker.cache_budget.max_bytes,
                );
            }

            // SAFETY: The file index Vec is not resized between the initial scan
            // completing and the warmup + bigram phase finishing because
            // `post_scan_busy` prevents concurrent rescans from replacing
            // sync_data while we hold the raw pointer.
            let files_snapshot: Option<(&[FileItem], usize, Arc<ContentCacheBudget>, ArenaPtr)> =
                if !cancelled.load(Ordering::Acquire) {
                    let guard = shared_picker.read().ok();
                    guard.and_then(|guard| {
                        guard.as_ref().map(|picker| {
                            let files = picker.sync_data.files();
                            let ptr = files.as_ptr();
                            let len = files.len();
                            let base_count = picker.sync_data.base_count;
                            let budget = Arc::clone(&picker.cache_budget);
                            let arena = picker.arena_base_ptr();
                            // SAFETY: post_scan_busy flag blocks trigger_rescan and
                            // background watcher rescans from replacing sync_data,
                            // so the Vec backing this slice stays alive.
                            let static_files: &[FileItem] =
                                unsafe { std::slice::from_raw_parts(ptr, len) };
                            (static_files, base_count, budget, arena)
                        })
                    })
                } else {
                    None
                };

            // NOTE(review): both phases below rely solely on the
            // `post_scan_busy` soft lock — not on compiler-checked borrows —
            // to keep the snapshot slice alive while search stays responsive
            // on huge trees (e.g. ~10M files). Any code path that replaces
            // sync_data without honoring the flag would make this unsound.
            if let Some((files, base_count, budget, arena)) = files_snapshot {
                if enable_mmap_cache && !cancelled.load(Ordering::Acquire) {
                    let warmup_start = std::time::Instant::now();
                    warmup_mmaps(files, &budget, &base_path, arena);
                    info!(
                        "Warmup completed in {:.2}s (cached {} files, {} bytes)",
                        warmup_start.elapsed().as_secs_f64(),
                        budget.cached_count.load(Ordering::Relaxed),
                        budget.cached_bytes.load(Ordering::Relaxed),
                    );
                }

                if enable_content_indexing && !cancelled.load(Ordering::Acquire) {
                    // Index ONLY base files. Any overflow files present in
                    // the snapshot (from watcher events that landed before
                    // this snapshot was taken) are intentionally excluded:
                    // grep handles them via the unconditional overflow-
                    // append loop, and the filter's `file_count` must match
                    // the overlay's `base_file_count` so the candidate
                    // bitset can't carry bits for overflow-range indices.
                    let base_files = &files[..base_count.min(files.len())];
                    let (index, content_binary) =
                        build_bigram_index(base_files, &budget, &base_path, arena);

                    if let Ok(mut guard) = shared_picker.write()
                        && let Some(ref mut picker) = *guard
                    {
                        // Flip the binary flag on files whose *content* (not
                        // extension) revealed them as binary.
                        for &idx in &content_binary {
                            if let Some(file) = picker.sync_data.get_file_mut(idx) {
                                file.set_binary(true);
                            }
                        }

                        picker.sync_data.bigram_index = Some(Arc::new(index));
                        picker.sync_data.bigram_overlay = Some(Arc::new(parking_lot::RwLock::new(
                            BigramOverlay::new(base_count),
                        )));
                    }
                }
            }

            post_scan_busy.store(false, Ordering::Release);

            info!(
                "Post-scan phase total: {:.2}s (warmup={}, content_indexing={})",
                phase_start.elapsed().as_secs_f64(),
                enable_mmap_cache,
                enable_content_indexing,
            );
        }

        // the debouncer keeps running in its own thread
    });
}
1871
1872/// Pre-populate mmap caches for the most valuable files so the first grep
1873/// search doesn't pay the mmap creation + page fault cost.
1874///
1875/// All files are collected once, then an O(n) `select_nth_unstable_by`
1876/// partitions the top [`MAX_CACHED_CONTENT_FILES`] highest-frecency eligible
1877/// files to the front (binary / empty files are pushed to the end by the
1878/// comparator). The selected prefix is warmed in parallel via rayon.
1879///
1880/// Files beyond the budget are still available via temporary mmaps on first
1881/// grep access, so correctness is unaffected.
1882#[tracing::instrument(skip(files), name = "warmup_mmaps", level = Level::DEBUG)]
1883pub(crate) fn warmup_mmaps(
1884    files: &[FileItem],
1885    budget: &ContentCacheBudget,
1886    base_path: &Path,
1887    arena: ArenaPtr,
1888) {
1889    let max_files = budget.max_files;
1890    let max_bytes = budget.max_bytes;
1891    let max_file_size = budget.max_file_size;
1892
1893    // Single collect — no pre-filter. The comparator in select_nth pushes
1894    // ineligible files (binary, empty) to the tail automatically.
1895    let mut all: Vec<&FileItem> = files.iter().collect();
1896
1897    // O(n) partial sort: top max_files eligible-by-frecency files land in
1898    // all[..max_files]. Ineligible files compare as "lowest priority" so
1899    // they naturally sink past the partition boundary.
1900    if all.len() > max_files {
1901        all.select_nth_unstable_by(max_files, |a, b| {
1902            let a_ok = !a.is_binary() && a.size > 0;
1903            let b_ok = !b.is_binary() && b.size > 0;
1904            match (a_ok, b_ok) {
1905                (true, false) => std::cmp::Ordering::Less,
1906                (false, true) => std::cmp::Ordering::Greater,
1907                (false, false) => std::cmp::Ordering::Equal,
1908                (true, true) => b.total_frecency_score().cmp(&a.total_frecency_score()),
1909            }
1910        });
1911    }
1912
1913    let to_warm = &all[..all.len().min(max_files)];
1914
1915    let warmed_bytes = AtomicU64::new(0);
1916    let budget_exhausted = AtomicBool::new(false);
1917
1918    BACKGROUND_THREAD_POOL.install(|| {
1919        to_warm.par_iter().for_each(|file| {
1920            if budget_exhausted.load(Ordering::Relaxed) {
1921                return;
1922            }
1923
1924            if file.is_binary() || file.size == 0 || file.size > max_file_size {
1925                return;
1926            }
1927
1928            // Byte budget.
1929            let prev_bytes = warmed_bytes.fetch_add(file.size, Ordering::Relaxed);
1930            if prev_bytes + file.size > max_bytes {
1931                budget_exhausted.store(true, Ordering::Relaxed);
1932                return;
1933            }
1934
1935            if let Some(content) = file.get_content(arena, base_path, budget) {
1936                let _ = std::hint::black_box(content.first());
1937            }
1938        });
1939    });
1940}
1941
/// Max bytes of file content scanned for bigram indexing. After this many
/// bytes the ~4900 possible printable-ASCII bigrams are effectively saturated,
/// so reading further adds no new information to the index.
///
/// Used by [`build_bigram_index`] to cap the slice fed to the builders.
pub const BIGRAM_CONTENT_CAP: usize = 64 * 1024;
1946
1947#[tracing::instrument(skip_all, name = "Building Bigram Index", level = Level::DEBUG)]
1948pub(crate) fn build_bigram_index(
1949    files: &[FileItem],
1950    budget: &ContentCacheBudget,
1951    base_path: &Path,
1952    arena: ArenaPtr,
1953) -> (BigramFilter, Vec<usize>) {
1954    let start = std::time::Instant::now();
1955    info!("Building bigram index for {} files...", files.len());
1956    let builder = BigramIndexBuilder::new(files.len());
1957    let skip_builder = BigramIndexBuilder::new(files.len());
1958    let max_file_size = budget.max_file_size;
1959
1960    // Collect indices of files that passed the extension heuristic but are
1961    // actually binary (contain NUL bytes). These are marked `is_binary = true`
1962    // on the real file list after the build, so grep never has to re-check.
1963    let content_binary: std::sync::Mutex<Vec<usize>> = std::sync::Mutex::new(Vec::new());
1964
1965    BACKGROUND_THREAD_POOL.install(|| {
1966        files.par_iter().enumerate().for_each(|(i, file)| {
1967            if file.is_binary() || file.size == 0 || file.size > max_file_size {
1968                return;
1969            }
1970            // Use cached content if available (no extra memory).
1971            // For uncached files, read from disk — heap memory is freed on drop.
1972            let data: Option<&[u8]>;
1973            let owned;
1974            if let Some(cached) = file.get_content(arena, base_path, budget) {
1975                if detect_binary_content(cached) {
1976                    content_binary.lock().unwrap().push(i);
1977                    return;
1978                }
1979                data = Some(cached);
1980                owned = None;
1981            } else if let Ok(read_data) = std::fs::read(file.absolute_path(arena, base_path)) {
1982                if detect_binary_content(&read_data) {
1983                    content_binary.lock().unwrap().push(i);
1984                    return;
1985                }
1986                data = None;
1987                owned = Some(read_data);
1988            } else {
1989                return;
1990            }
1991
1992            let content = data.unwrap_or_else(|| owned.as_ref().unwrap());
1993            let capped = &content[..content.len().min(BIGRAM_CONTENT_CAP)];
1994            builder.add_file_content(&skip_builder, i, capped);
1995        });
1996    });
1997
1998    let cols = builder.columns_used();
1999    let mut index = builder.compress(None);
2000
2001    // Skip bigrams are supplementary — the consecutive index does the heavy
2002    // lifting. Rare skip columns (< 12% of files) add virtually no filtering
2003    // on either homogeneous (kernel) or polyglot (monorepo) codebases, but
2004    // cost ~25-30% of total index memory. Using a higher sparse cutoff for
2005    // the skip index drops these dead-weight columns with negligible loss.
2006    let skip_index = skip_builder.compress(Some(12));
2007    index.set_skip_index(skip_index);
2008
2009    // The builders' flat buffers were freed by compress() above (single
2010    // deallocation each). Hint the allocator to return pages from other
2011    // per-thread allocations (file reads, sort buffers) during the build.
2012    hint_allocator_collect();
2013
2014    info!(
2015        "Bigram index built in {:.2}s — {} dense columns for {} files",
2016        start.elapsed().as_secs_f64(),
2017        cols,
2018        files.len(),
2019    );
2020
2021    let binary_indices = content_binary.into_inner().unwrap();
2022    if !binary_indices.is_empty() {
2023        info!(
2024            "Bigram build detected {} content-binary files (not caught by extension)",
2025            binary_indices.len(),
2026        );
2027    }
2028
2029    (index, binary_indices)
2030}
2031
/// Result of the fast walk phase — files are searchable immediately,
/// git status arrives later via the join handle.
struct WalkResult {
    // Freshly built file/dir index; git status not yet applied.
    sync: FileSync,
    // Handle to the detached `git status` thread. Yields `None` when no
    // status cache could be produced (presumably a non-git directory —
    // confirm in GitStatusCache::read_git_status).
    git_handle: std::thread::JoinHandle<Option<GitStatusCache>>,
}
2038
/// Returns files immediately (searchable) and a handle to the in-progress
/// git status computation. This avoids blocking on `git status` which can
/// take 10+ seconds on very large repos (e.g. chromium).
///
/// Pipeline: discover git root → spawn detached `git status` thread →
/// parallel ignore-aware walk (bumping `synced_files_count` per file) →
/// sort by relative path → build chunked path arena + dir table in one
/// linear pass → apply access-based frecency → re-sort by
/// (parent_dir, filename) for binary-search lookups.
fn walk_filesystem(
    base_path: &Path,
    synced_files_count: &Arc<AtomicUsize>,
    shared_frecency: &SharedFrecency,
    mode: FFFMode,
) -> Result<WalkResult, Error> {
    use ignore::WalkBuilder;

    let scan_start = std::time::Instant::now();
    info!("SCAN: Starting filesystem walk and git status (async)");

    // Discover git root (fast — just walks up looking for .git/)
    let git_workdir = Repository::discover(base_path)
        .ok()
        .and_then(|repo| repo.workdir().map(Path::to_path_buf));

    if let Some(ref git_dir) = git_workdir {
        debug!("Git repository found at: {}", git_dir.display());
    } else {
        debug!("No git repository found for path: {}", base_path.display());
    }

    // Spawn git status on a detached thread — we won't wait for it here.
    let git_workdir_for_status = git_workdir.clone();
    let git_handle = std::thread::spawn(move || {
        GitStatusCache::read_git_status(
            git_workdir_for_status.as_deref(),
            StatusOptions::new()
                .include_untracked(true)
                .recurse_untracked_dirs(true)
                .exclude_submodules(true),
        )
    });

    // Walk files (the fast part, typically 2-3s even on huge repos).
    let is_git_repo = git_workdir.is_some();
    let bg_threads = BACKGROUND_THREAD_POOL.current_num_threads();
    let mut walk_builder = WalkBuilder::new(base_path);
    walk_builder
        // Skip hidden files when NOT in a git repo — a very important guard
        // for the user opening ~/ or another huge non-git root dir.
        .hidden(!is_git_repo)
        .git_ignore(true)
        .git_exclude(true)
        .git_global(true)
        .ignore(true)
        .follow_links(false)
        .threads(bg_threads);

    if !is_git_repo && let Some(overrides) = non_git_repo_overrides(base_path) {
        walk_builder.overrides(overrides);
    }

    let walker = walk_builder.build_parallel();

    let walker_start = std::time::Instant::now();
    debug!("SCAN: Starting file walker");

    // Walk: collect (FileItem, rel_path) pairs. Keep the walk fast —
    // no chunking, no HashMap, just Vec::push under the Mutex.
    let pairs = parking_lot::Mutex::new(Vec::<(FileItem, String)>::new());

    walker.run(|| {
        let pairs = &pairs;
        let counter = Arc::clone(synced_files_count);
        let base_path = base_path.to_path_buf();

        Box::new(move |result| {
            let Ok(entry) = result else {
                return ignore::WalkState::Continue;
            };

            if entry.file_type().is_some_and(|ft| ft.is_file()) {
                let path = entry.path();

                // Never index contents of .git/ metadata directories.
                if is_git_file(path) {
                    return ignore::WalkState::Continue;
                }

                // Outside a git repo there's no .gitignore to trim binaries,
                // so fall back to the extension heuristic.
                if !is_git_repo && is_known_binary_extension(path) {
                    return ignore::WalkState::Continue;
                }

                let metadata = entry.metadata().ok();
                let (file_item, rel_path) =
                    FileItem::new_from_walk(path, &base_path, None, metadata.as_ref());

                pairs.lock().push((file_item, rel_path));
                counter.fetch_add(1, Ordering::Relaxed);
            }
            ignore::WalkState::Continue
        })
    });

    let mut pairs = pairs.into_inner();

    info!(
        "SCAN: File walking completed in {:?} for {} files",
        walker_start.elapsed(),
        pairs.len(),
    );

    // Sort by full relative path. This groups files by directory naturally,
    // so dir extraction becomes a simple linear scan — no HashMap.
    BACKGROUND_THREAD_POOL.install(|| {
        pairs.par_sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
    });

    // Build ChunkedPathStore + extract dirs + assign parent_dir in one pass.
    // Files are sorted by relative path, so dir changes happen in order.
    // add_file_immediate returns a ChunkedString with null arena_base;
    // we fixup arena_base after the arena is frozen.
    let mut files: Vec<FileItem> = Vec::with_capacity(pairs.len());
    let mut dirs: Vec<DirItem> = Vec::new();
    let mut builder = crate::simd_path::ChunkedPathStoreBuilder::new(pairs.len());
    // `None` is the sentinel: it can never match any real dir_part
    // (including ""), so the very first file always creates its dir entry.
    let mut prev_dir: Option<String> = None;
    let mut current_dir_idx: u32 = 0;

    for (mut file, rel) in pairs {
        let fname_offset = file.path.filename_offset as usize;
        let dir_part = &rel[..fname_offset];

        if prev_dir.as_deref() != Some(dir_part) {
            let dir_cs = builder.add_dir_immediate(dir_part);
            // Compute last-segment offset: for "src/components/" -> 4 (points to "components/")
            let last_seg = if dir_part.is_empty() {
                0
            } else {
                let trimmed = dir_part.trim_end_matches(std::path::is_separator);
                // NOTE(review): truncating cast — assumes no dir prefix
                // exceeds u16::MAX bytes, which real paths never do.
                trimmed
                    .rfind(std::path::is_separator)
                    .map(|i| i + 1)
                    .unwrap_or(0) as u16
            };
            dirs.push(DirItem::new(dir_cs, last_seg));
            current_dir_idx = (dirs.len() - 1) as u32;
            prev_dir = Some(dir_part.to_string());
        }

        let cs = builder.add_file_immediate(&rel, file.path.filename_offset);
        file.set_path(cs);
        file.set_parent_dir(current_dir_idx);
        files.push(file);
    }
    let chunked_paths = builder.finish();
    let arena = chunked_paths.as_arena_ptr();

    // Apply frecency scores (access-based only — git status not yet available).
    // DirItem.max_access_frecency is AtomicI32, so parallel threads write directly.
    let frecency = shared_frecency
        .read()
        .map_err(|_| Error::AcquireFrecencyLock)?;
    if let Some(frecency) = frecency.as_ref() {
        let dirs_ref = &dirs;
        BACKGROUND_THREAD_POOL.install(|| {
            files.par_iter_mut().for_each(|file| {
                let _ = file.update_frecency_scores(frecency, arena, base_path, mode);
                let score = file.access_frecency_score as i32;
                if score > 0 {
                    let dir_idx = file.parent_dir_index() as usize;
                    if let Some(dir) = dirs_ref.get(dir_idx) {
                        dir.update_frecency_if_larger(score);
                    }
                }
            });
        });
    }
    drop(frecency);

    // Re-sort by (parent_dir, filename) for binary search in find_file_index.
    BACKGROUND_THREAD_POOL.install(|| {
        files.par_sort_unstable_by(|a, b| {
            a.parent_dir_index()
                .cmp(&b.parent_dir_index())
                .then_with(|| a.file_name(arena).cmp(&b.file_name(arena)))
        });
    });

    // Ask the allocator to return freed pages to the OS.
    hint_allocator_collect();

    let file_item_size = std::mem::size_of::<FileItem>();
    let files_vec_bytes = files.len() * file_item_size;
    let dir_table_bytes = dirs.len() * std::mem::size_of::<DirItem>()
        + dirs
            .iter()
            .map(|d| d.relative_path(arena).len())
            .sum::<usize>();

    let total_time = scan_start.elapsed();
    info!(
        "SCAN: Walk completed in {:?} ({} files, {} dirs, \
         chunked_store={:.2}MB, files_vec={:.2}MB, dirs={:.2}MB, FileItem={}B)",
        total_time,
        files.len(),
        dirs.len(),
        chunked_paths.heap_bytes() as f64 / 1_048_576.0,
        files_vec_bytes as f64 / 1_048_576.0,
        dir_table_bytes as f64 / 1_048_576.0,
        file_item_size,
    );

    let base_count = files.len();

    Ok(WalkResult {
        sync: FileSync {
            files,
            base_count,
            dirs,
            overflow_builder: None,
            git_workdir,
            bigram_index: None,
            bigram_overlay: None,
            chunked_paths: Some(chunked_paths),
        },
        git_handle,
    })
}
2261
/// Join the detached `git status` thread and fold its result into the shared
/// picker: per-file git status, refreshed frecency scores, and recomputed
/// per-dir max frecency.
///
/// No-op when the status thread panicked or produced no cache (presumably
/// a non-git directory — confirm in `GitStatusCache::read_git_status`).
fn apply_git_status_and_frecency(
    shared_picker: &SharedPicker,
    shared_frecency: &SharedFrecency,
    git_handle: std::thread::JoinHandle<Option<GitStatusCache>>,
    mode: FFFMode,
) {
    let join_start = std::time::Instant::now();
    let git_cache = match git_handle.join() {
        Ok(cache) => cache,
        Err(_) => {
            error!("Git status thread panicked");
            return;
        }
    };
    info!("SCAN: Git status ready in {:?}", join_start.elapsed());

    let Some(git_cache) = git_cache else { return };

    if let Ok(mut guard) = shared_picker.write()
        && let Some(ref mut picker) = *guard
    {
        let frecency = shared_frecency.read().ok();
        let frecency_ref = frecency.as_ref().and_then(|f| f.as_ref());

        // Capture base_path/arena first; files (mut) and dirs (shared)
        // below are disjoint fields of sync_data, so the borrows coexist.
        let bp = &picker.base_path;
        let arena = picker.arena_base_ptr();

        // Reset dir frecency before recomputation.
        for dir in picker.sync_data.dirs.iter() {
            dir.reset_frecency();
        }

        let files = &mut picker.sync_data.files;
        let dirs = &picker.sync_data.dirs;

        BACKGROUND_THREAD_POOL.install(|| {
            files.par_iter_mut().for_each(|file| {
                // Stack buffer: materialize the absolute path without a
                // per-file heap allocation.
                let mut buf = [0u8; crate::simd_path::PATH_BUF_SIZE];
                let absolute_path = file.write_absolute_path(arena, bp, &mut buf);

                file.git_status = git_cache.lookup_status(absolute_path);
                if let Some(frecency) = frecency_ref {
                    let _ = file.update_frecency_scores(frecency, arena, bp, mode);
                }

                // Propagate this file's access frecency to its parent dir;
                // the dir-side update is atomic, so par_iter writes are fine.
                let score = file.access_frecency_score as i32;
                if score > 0 {
                    let dir_idx = file.parent_dir_index() as usize;
                    if let Some(dir) = dirs.get(dir_idx) {
                        dir.update_frecency_if_larger(score);
                    }
                }
            });
        });

        info!(
            "SCAN: Applied git status to {} files ({} dirty)",
            picker.sync_data.files.len(),
            git_cache.statuses_len(),
        );
    }
}
2325
/// Returns `true` when the path string contains a `.git` component
/// delimited by the platform's native separator (i.e. the entry lives
/// inside git metadata). Non-UTF-8 paths return `false`.
#[inline]
fn is_git_file(path: &Path) -> bool {
    let needle = if cfg!(target_family = "windows") {
        "\\.git\\"
    } else {
        "/.git/"
    };
    path.to_str().is_some_and(|p| p.contains(needle))
}
2336
/// Fast extension-based binary detection. Avoids opening files during scan.
/// Covers the vast majority of binary files in typical repositories.
///
/// Matching is ASCII case-insensitive (`IMG.PNG`, `Movie.MP4` are caught);
/// case is normalized without allocating for the common all-lowercase case
/// since this runs once per walked file.
#[inline]
fn is_known_binary_extension(path: &Path) -> bool {
    let Some(raw) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    // Allocate a lowercased copy only when an uppercase byte is present.
    let lowered;
    let ext = if raw.bytes().any(|b| b.is_ascii_uppercase()) {
        lowered = raw.to_ascii_lowercase();
        lowered.as_str()
    } else {
        raw
    };
    matches!(
        ext,
        // Images
        "png" | "jpg" | "jpeg" | "gif" | "bmp" | "ico" | "webp" | "tiff" | "tif" | "avif" |
        "heic" | "psd" | "icns" | "cur" | "raw" | "cr2" | "nef" | "dng" |
        // Video/Audio
        "mp4" | "avi" | "mov" | "wmv" | "mkv" | "mp3" | "wav" | "flac" | "ogg" | "m4a" |
        "aac" | "webm" | "flv" | "mpg" | "mpeg" | "wma" | "opus" |
        // Compressed/Archives
        "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "zst" | "lz4" | "lzma" |
        "cab" | "cpio" |
        // Packages/Installers
        "deb" | "rpm" | "apk" | "dmg" | "msi" | "iso" | "nupkg" | "whl" | "egg" |
        "snap" | "appimage" | "flatpak" |
        // Executables/Libraries
        "exe" | "dll" | "so" | "dylib" | "o" | "a" | "lib" | "bin" | "elf" |
        // Documents
        "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" |
        // Databases
        "db" | "sqlite" | "sqlite3" | "mdb" |
        // Fonts
        "ttf" | "otf" | "woff" | "woff2" | "eot" |
        // Compiled/Runtime
        "class" | "pyc" | "pyo" | "wasm" | "dex" | "jar" | "war" |
        // ML/Data Science
        "npy" | "npz" | "pkl" | "pickle" | "h5" | "hdf5" | "pt" | "pth" | "onnx" |
        "safetensors" | "tfrecord" |
        // 3D/Game
        "glb" | "fbx" | "blend" |
        // Data/serialized
        "parquet" | "arrow" | "pb" |
        // IDE/OS metadata (lowercased: was "DS_Store", which could only
        // match the literal mixed-case extension; now matches any casing)
        "ds_store" | "suo"
    )
}
2379
/// Detect binary content by scanning for NUL bytes within the first 512 bytes.
/// Invoked lazily when file content is first loaded, never during the initial
/// scan. Returns `false` for empty input and for NULs beyond the 512-byte
/// window (they are simply not inspected).
#[inline]
pub(crate) fn detect_binary_content(content: &[u8]) -> bool {
    // A NUL byte in the prefix is a cheap, reliable heuristic for non-text data.
    content.iter().take(512).any(|&byte| byte == 0)
}
2387
/// Ask the global allocator to return freed pages to the OS.
/// Enabled via the `mimalloc-collect` feature (set by fff-nvim).
/// No-op when the feature is off (tests, system allocator).
///
/// Best-effort hint only: callers do not observe a result, and skipping the
/// collect affects memory footprint, not correctness.
fn hint_allocator_collect() {
    #[cfg(feature = "mimalloc-collect")]
    {
        // Collect BACKGROUND_THREAD_POOL workers — that's where the bigram
        // builder allocated memory. `rayon::broadcast` would target the global
        // pool, which is the wrong set of threads.
        // SAFETY: plain FFI call with no pointer arguments; mi_collect(force)
        // operates on the calling thread's mimalloc heap. Presumably safe to
        // invoke concurrently from each pool worker — confirm against the
        // mimalloc documentation if this is ever revisited.
        BACKGROUND_THREAD_POOL.broadcast(|_| unsafe { libmimalloc_sys::mi_collect(true) });

        // Main thread too.
        // SAFETY: same contract as above, issued from the main thread.
        unsafe { libmimalloc_sys::mi_collect(true) };
    }
}