Skip to main content

ffs_search/
file_picker.rs

1//! Core file picker: filesystem indexing, background watching, and fuzzy search.
2//!
3//! [`FilePicker`] is the central component of ffs-search. It:
4//!
5//! 1. **Indexes** a directory tree in a background thread, collecting every
6//!    non-ignored file into a path-sorted `Vec<FileItem>`.
7//! 2. **Watches** the filesystem via the `notify` crate, applying
8//!    create/modify/delete events to the index in real time.
//! 3. **Owns files**: holds the indexed file values and serves as the entry
//!    point for fuzzy search and live grep.
11//!
12//! # Lifecycle
13//!
14//! ```text
15//!   new_with_shared_state()
16//!     │
17//!     ├─> background scan thread ──> populates SharedPicker
18//!     └─> file-system watcher    ──> live updates SharedPicker
19//!
20//!   search()         <── borrows &self, delegates to fuzzy_search
21//!   grep()           <── static, borrows &[FileItem] (live content search)
22//!   trigger_rescan() <── synchronous re-index
23//!   cancel()         <── shuts down background work
24//! ```
25//!
26//! # Thread Safety
27//!
//! `FilePicker` itself is **not** `Sync`;
//! all concurrent access goes through [`SharedPicker`](crate::SharedPicker).
30//! The background scanner and watcher acquire write locks only when mutating
31//! the file index, so read-heavy search workloads rarely contend.
32
33use crate::FfsStringStorage;
34use crate::background_watcher::{BackgroundWatcher, is_git_file};
35use crate::bigram_filter::{BigramFilter, BigramOverlay};
36use crate::error::Error;
37use crate::frecency::FrecencyTracker;
38use crate::git::GitStatusCache;
39use crate::grep::{GrepResult, GrepSearchOptions, grep_search, multi_grep_search};
40use crate::ignore::non_git_repo_overrides;
41use crate::query_tracker::QueryTracker;
42use crate::scan::{ScanConfig, ScanJob, ScanSignals};
43use crate::score::fuzzy_match_and_score_files;
44use crate::shared::{SharedFilePicker, SharedFrecency};
45use crate::simd_path::{ArenaPtr, PATH_BUF_SIZE};
46use crate::stable_vec::StableVec;
47use crate::types::{
48    ContentCacheBudget, DirItem, DirSearchResult, FileItem, MixedItemRef, MixedSearchResult,
49    PaginationArgs, Score, ScoringContext, SearchResult,
50};
51use ffs_query_parser::FfsQuery;
52use git2::{Repository, Status};
53use rayon::prelude::*;
54use std::fmt::Debug;
55use std::ops::ControlFlow;
56use std::path::{Path, PathBuf};
57use std::sync::{
58    Arc, LazyLock,
59    atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering},
60};
61use std::thread::JoinHandle;
62use std::time::SystemTime;
63use tracing::{Level, debug, error, info, warn};
64
/// Upper bound on overflow files before the watcher triggers a full rescan.
///
/// `walk_filesystem` reserves this much extra capacity up front so the file
/// vector never reallocates while raw pointers are held during post-scan.
pub(crate) const MAX_OVERFLOW_FILES: usize = 1024;
69
70/// Dedicated thread pool for background work (scan, warmup, bigram build).
71/// Uses fewer threads than the global rayon pool so Neovim's event loop
72/// and search queries can still get CPU time.
73pub(crate) static BACKGROUND_THREAD_POOL: LazyLock<rayon::ThreadPool> = LazyLock::new(|| {
74    let total = std::thread::available_parallelism()
75        .map(|p| p.get())
76        .unwrap_or(4);
77    let bg_threads = total.saturating_sub(2).max(1);
78    info!(
79        "Background pool: {} threads (system has {})",
80        bg_threads, total
81    );
82    rayon::ThreadPoolBuilder::new()
83        .num_threads(bg_threads)
84        .thread_name(|i| format!("ffs-bg-{i}"))
85        .start_handler(|_| {
86            // Pin workers to the USER_INITIATED QoS class on macOS so the
87            // scheduler keeps them on P-cores. Without this the kernel is
88            // free to drift them to E-cores, which are ~2× slower for the
89            // bigram scan and per-file syscalls.
90            #[cfg(target_os = "macos")]
91            unsafe {
92                let _ = libc::pthread_set_qos_class_self_np(
93                    libc::qos_class_t::QOS_CLASS_USER_INITIATED,
94                    0,
95                );
96            }
97        })
98        .build()
99        .expect("failed to create background rayon pool")
100});
101
/// Operating mode of the picker.
///
/// The mode influences how file watcher events are handled and how results
/// are scored. Defaults to [`FfsMode::Neovim`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum FfsMode {
    /// Editor integration mode (the default).
    #[default]
    Neovim,
    /// AI mode.
    Ai,
}

impl FfsMode {
    /// Returns `true` when the mode is [`FfsMode::Ai`].
    pub fn is_ai(self) -> bool {
        matches!(self, FfsMode::Ai)
    }
}
114
115/// Configuration for a single fuzzy search invocation.
116///
117/// Passed to [`FilePicker::search`] to control threading, pagination,
118/// and scoring behavior.
119#[derive(Debug, Clone, Copy, Default)]
120pub struct FuzzySearchOptions<'a> {
121    pub max_threads: usize,
122    pub current_file: Option<&'a str>,
123    pub project_path: Option<&'a Path>,
124    pub combo_boost_score_multiplier: i32,
125    pub min_combo_count: u32,
126    pub pagination: PaginationArgs,
127}
128
129#[derive(Debug, Clone)]
130pub(crate) struct FileSync {
131    pub(crate) git_workdir: Option<PathBuf>,
132    /// Base files laid out in two partitions, each internally sorted by
133    /// (parent_dir, filename):
134    ///   `files[..indexable_count]` - indexable
135    ///   `files[indexable_count..base_count]` - original-unindexable
136    ///   `files[base_count..]`— overflow (created on demand)
137    files: StableVec<FileItem>,
138    indexable_count: usize,
139    base_count: usize,
140    /// Sorted directory table. Each entry is a unique parent directory of at
141    /// least one file in `files`. Sorted by absolute path for O(log n) lookup.
142    dirs: Vec<DirItem>,
143    /// Shared builder for overflow file paths. Each overflow file's ChunkedString
144    /// uses `arena_override` pointing into this builder's arena.
145    overflow_builder: Option<crate::simd_path::ChunkedPathStoreBuilder>,
146    /// Compressed bigram inverted index built during the post-scan phase.
147    /// Lives here so that replacing `FileSync` on rescan automatically drops
148    /// the stale index (bigram file indices are positions in `files`).
149    bigram_index: Option<Arc<BigramFilter>>,
150    /// Overlay tracking file mutations since the bigram index was built.
151    bigram_overlay: Option<Arc<parking_lot::RwLock<BigramOverlay>>>,
152    /// Chunk-level deduped path store for zero-copy SIMD matching.
153    /// Each file's relative path is pre-chunked into 16-byte aligned blocks
154    /// with content-based deduplication across files.
155    chunked_paths: Option<crate::simd_path::ChunkedPathStore>,
156}
157
158impl FileSync {
159    fn new() -> Self {
160        Self {
161            files: StableVec::from_vec_with_reserve(Vec::new(), MAX_OVERFLOW_FILES),
162            indexable_count: 0,
163            base_count: 0,
164            dirs: Vec::new(),
165            overflow_builder: None,
166            git_workdir: None,
167            bigram_index: None,
168            bigram_overlay: None,
169            chunked_paths: None,
170        }
171    }
172
173    /// Arena for base files (from the last full scan).
174    #[inline]
175    fn arena_base_ptr(&self) -> ArenaPtr {
176        self.chunked_paths
177            .as_ref()
178            .map(|s| s.as_arena_ptr())
179            .unwrap_or(ArenaPtr::null())
180    }
181
182    /// Arena for overflow files (added after the last full scan).
183    #[inline]
184    fn overflow_arena_ptr(&self) -> ArenaPtr {
185        self.overflow_builder
186            .as_ref()
187            .map(|b| b.as_arena_ptr())
188            .unwrap_or(self.arena_base_ptr())
189    }
190
191    /// Resolve the correct arena for a given file (base vs overflow).
192    #[inline]
193    fn arena_for_file(&self, file: &FileItem) -> ArenaPtr {
194        if file.is_overflow() {
195            self.overflow_arena_ptr()
196        } else {
197            self.arena_base_ptr()
198        }
199    }
200
201    /// Get all files (base + overflow). The base portion `[..base_count]` is
202    /// sorted by path; the overflow tail is unsorted.
203    #[inline]
204    fn files(&self) -> &[FileItem] {
205        &self.files
206    }
207
208    /// Get the overflow portion (files added since last full reindex).
209    #[inline]
210    fn overflow_files(&self) -> &[FileItem] {
211        &self.files[self.base_count..]
212    }
213
214    /// Get mutable file at index (works for base files only).
215    #[inline]
216    fn get_file_mut(&mut self, index: usize) -> Option<&mut FileItem> {
217        self.files.get_mut(index)
218    }
219
220    /// Find file index by path using binary search on the sorted base portion.
221    /// `path` must be an absolute path under `base_path`.
222    #[inline]
223    fn find_file_index(&self, path: &Path, base_path: &Path) -> Result<usize, usize> {
224        let arena = self.arena_base_ptr();
225
226        // Strip base_path prefix to get the relative path. On Windows this
227        // can fail for 8.3 short names or a different casing; fall back to
228        // canonicalize-then-strip so watcher events still land on the right
229        // `FileItem`.
230        let rel_path_owned: String = match path.strip_prefix(base_path) {
231            Ok(r) => normalize_relative_path(&r.to_string_lossy()).into_owned(),
232            Err(_) => {
233                #[cfg(windows)]
234                {
235                    canonical_relative_path(path, base_path).ok_or(0usize)?
236                }
237                #[cfg(not(windows))]
238                {
239                    return Err(0);
240                }
241            }
242        };
243        let rel_path: &str = &rel_path_owned;
244
245        // Split into directory (with trailing '/') and filename.
246        let parent_end = rel_path
247            .rfind(std::path::is_separator)
248            .map(|i| i + 1)
249            .unwrap_or(0);
250        let dir_rel = &rel_path[..parent_end];
251        let filename = &rel_path[parent_end..];
252
253        // Binary search dirs to find the parent directory index.
254        // Dir items store the relative path including trailing '/' (e.g. "src/components/").
255        let mut dir_buf = [0u8; crate::simd_path::PATH_BUF_SIZE];
256        let dir_idx = match self
257            .dirs
258            .binary_search_by(|d| d.read_relative_path(arena, &mut dir_buf).cmp(dir_rel))
259        {
260            Ok(idx) => idx as u32,
261            Err(_) => return Err(0), // directory not found
262        };
263
264        // Binary search files by (parent_dir, filename). Base files live in
265        // two internally-sorted partitions — indexable first, then
266        // unindexable — so we try each half in turn. Two O(log n) searches
267        // with short-circuit on the first hit.
268        let cmp_key = |f: &FileItem| {
269            f.parent_dir_index().cmp(&dir_idx).then_with(|| {
270                let fname = f.file_name(arena);
271                fname.as_str().cmp(filename)
272            })
273        };
274
275        if self.indexable_count > 0
276            && let Ok(pos) = self.files[..self.indexable_count].binary_search_by(cmp_key)
277        {
278            return Ok(pos);
279        }
280
281        if self.indexable_count < self.base_count
282            && let Ok(rel_pos) =
283                self.files[self.indexable_count..self.base_count].binary_search_by(cmp_key)
284        {
285            return Ok(self.indexable_count + rel_pos);
286        }
287
288        Err(0)
289    }
290
291    /// Find a file in the overflow portion by relative path (linear scan).
292    /// Returns the absolute index into `files` (i.e. `base_count + position`).
293    fn find_overflow_index(&self, relative_path: &str) -> Option<usize> {
294        let overflow_arena = self.overflow_arena_ptr();
295        self.files[self.base_count..]
296            .iter()
297            .position(|f| f.relative_path_eq(overflow_arena, relative_path))
298            .map(|pos| self.base_count + pos)
299    }
300
301    fn retain_files_with_arena<F>(&mut self, mut predicate: F) -> usize
302    where
303        F: FnMut(&FileItem, ArenaPtr) -> bool,
304    {
305        let base_arena = self.arena_base_ptr();
306        let overflow_arena = self.overflow_arena_ptr();
307
308        let indexable_count = self.indexable_count;
309        let base_count = self.base_count;
310        let initial_len = self.files.len();
311
312        let indexable_retained = self.files[..indexable_count]
313            .iter()
314            .filter(|f| predicate(f, base_arena))
315            .count();
316        let base_retained = self.files[indexable_count..base_count]
317            .iter()
318            .filter(|f| predicate(f, base_arena))
319            .count()
320            + indexable_retained;
321
322        self.files.retain(|f| {
323            predicate(
324                f,
325                if f.is_overflow() {
326                    overflow_arena
327                } else {
328                    base_arena
329                },
330            )
331        });
332
333        self.indexable_count = indexable_retained;
334        self.base_count = base_retained;
335        initial_len - self.files.len()
336    }
337}
338
339impl FileItem {
340    pub fn new(path: PathBuf, base_path: &Path, git_status: Option<Status>) -> (Self, String) {
341        let metadata = std::fs::metadata(&path).ok();
342        Self::new_with_metadata(path, base_path, git_status, metadata.as_ref())
343    }
344
345    /// Create a FileItem using pre-fetched metadata to avoid a redundant stat syscall.
346    /// Returns `(FileItem, relative_path)`. The FileItem's `path` field is
347    /// empty; callers must populate it via `set_path` or `build_chunked_path_store_and_assign`.
348    fn new_with_metadata(
349        path: PathBuf,
350        base_path: &Path,
351        git_status: Option<Status>,
352        metadata: Option<&std::fs::Metadata>,
353    ) -> (Self, String) {
354        let path_buf = pathdiff::diff_paths(&path, base_path).unwrap_or_else(|| path.clone());
355        let relative_path = path_buf.to_string_lossy().into_owned();
356
357        let (size, modified) = match metadata {
358            Some(metadata) => {
359                let size = metadata.len();
360                let modified = metadata
361                    .modified()
362                    .ok()
363                    .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
364                    .map_or(0, |d| d.as_secs());
365
366                (size, modified)
367            }
368            None => (0, 0),
369        };
370
371        let is_binary = is_known_binary_extension(&path);
372
373        let filename_start = relative_path
374            .rfind(std::path::is_separator)
375            .map(|i| i + 1)
376            .unwrap_or(0) as u16;
377
378        let item = Self::new_raw(filename_start, size, modified, git_status, is_binary);
379        (item, relative_path)
380    }
381
382    /// Create a FileItem with an empty ChunkedString from a path on disk.
383    ///
384    /// Returns `(file_item, relative_path_string)`. The relative path must be
385    /// kept alongside the FileItem until `build_chunked_path_store_and_assign`
386    /// populates each item's `path` field from the shared arena.
387    pub fn new_from_walk(
388        path: &Path,
389        base_path: &Path,
390        git_status: Option<Status>,
391        metadata: Option<&std::fs::Metadata>,
392    ) -> (Self, String) {
393        let (size, modified) = match metadata {
394            Some(metadata) => {
395                let size = metadata.len();
396                let modified = metadata
397                    .modified()
398                    .ok()
399                    .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
400                    .map_or(0, |d| d.as_secs());
401                (size, modified)
402            }
403            None => (0, 0),
404        };
405
406        let is_binary = is_known_binary_extension(path);
407
408        let rel = pathdiff::diff_paths(path, base_path).unwrap_or_else(|| path.to_path_buf());
409        let rel_str = rel.to_string_lossy().into_owned();
410        let fname_offset = rel_str
411            .rfind(std::path::is_separator)
412            .map(|i| i + 1)
413            .unwrap_or(0) as u16;
414
415        let item = Self::new_raw(fname_offset, size, modified, git_status, is_binary);
416        (item, rel_str)
417    }
418
419    pub(crate) fn update_frecency_scores(
420        &mut self,
421        tracker: &FrecencyTracker,
422        arena: ArenaPtr,
423        base_path: &Path,
424        mode: FfsMode,
425    ) -> Result<(), Error> {
426        let mut abs_buf = [0u8; crate::simd_path::PATH_BUF_SIZE];
427        let abs = self.write_absolute_path(arena, base_path, &mut abs_buf);
428        self.access_frecency_score = tracker.get_access_score(abs, mode) as i16;
429        self.modification_frecency_score =
430            tracker.get_modification_score(self.modified, self.git_status, mode) as i16;
431
432        Ok(())
433    }
434}
435
436/// Options for creating a [`FilePicker`].
437pub struct FilePickerOptions {
438    pub base_path: String,
439    /// Pre-populate mmap caches for top-frecency files after the initial scan.
440    pub enable_mmap_cache: bool,
441    /// Build content index after the initial scan for faster content-aware filtering.
442    pub enable_content_indexing: bool,
443    /// Mode of the picker impact the way file watcher events are handled and the scoring logic
444    pub mode: FfsMode,
445    /// Explicit cache budget. When `None`, the budget is auto-computed from
446    /// the repo size after the initial scan completes.
447    pub cache_budget: Option<ContentCacheBudget>,
448    /// When `false`, `new_with_shared_state` skips the background file watcher.
449    pub watch: bool,
450}
451
452impl Default for FilePickerOptions {
453    fn default() -> Self {
454        Self {
455            base_path: ".".into(),
456            enable_mmap_cache: false,
457            enable_content_indexing: false,
458            mode: FfsMode::default(),
459            cache_budget: None,
460            watch: true,
461        }
462    }
463}
464
465pub struct FilePicker {
466    pub mode: FfsMode,
467    pub base_path: PathBuf,
468    sync_data: FileSync,
469    pub(crate) signals: ScanSignals,
470    pub(crate) background_watcher: Option<BackgroundWatcher>,
471    cache_budget: Arc<ContentCacheBudget>,
472    has_explicit_cache_budget: bool,
473    scanned_files_count: Arc<AtomicUsize>,
474    enable_mmap_cache: bool,
475    enable_content_indexing: bool,
476    watch: bool,
477}
478
479impl std::fmt::Debug for FilePicker {
480    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
481        f.debug_struct("FilePicker")
482            .field("base_path", &self.base_path)
483            .field("sync_data", &self.sync_data)
484            .field(
485                "is_scanning",
486                &self.signals.scanning.load(Ordering::Relaxed),
487            )
488            .field(
489                "scanned_files_count",
490                &self.scanned_files_count.load(Ordering::Relaxed),
491            )
492            .finish_non_exhaustive()
493    }
494}
495
496impl FfsStringStorage for &FilePicker {
497    #[inline]
498    fn arena_for(&self, file: &FileItem) -> crate::simd_path::ArenaPtr {
499        self.sync_data.arena_for_file(file)
500    }
501
502    #[inline]
503    fn base_arena(&self) -> crate::simd_path::ArenaPtr {
504        self.sync_data.arena_base_ptr()
505    }
506
507    #[inline]
508    fn overflow_arena(&self) -> crate::simd_path::ArenaPtr {
509        self.sync_data.overflow_arena_ptr()
510    }
511}
512
513impl FilePicker {
514    pub fn base_path(&self) -> &Path {
515        &self.base_path
516    }
517
518    pub fn has_mmap_cache(&self) -> bool {
519        self.enable_mmap_cache
520    }
521
522    pub fn has_content_indexing(&self) -> bool {
523        self.enable_content_indexing
524    }
525
526    pub fn has_watcher(&self) -> bool {
527        self.watch
528    }
529
530    pub fn mode(&self) -> FfsMode {
531        self.mode
532    }
533
534    pub fn cache_budget(&self) -> &ContentCacheBudget {
535        &self.cache_budget
536    }
537
538    pub fn bigram_index(&self) -> Option<&BigramFilter> {
539        self.sync_data.bigram_index.as_deref()
540    }
541
542    pub fn bigram_overlay(&self) -> Option<&parking_lot::RwLock<BigramOverlay>> {
543        self.sync_data.bigram_overlay.as_deref()
544    }
545
546    pub fn get_file_mut(&mut self, index: usize) -> Option<&mut FileItem> {
547        self.sync_data.get_file_mut(index)
548    }
549
550    /// Absolute path to the repository root if the indexed tree lives
551    /// inside a git working directory. `None` for non-git bases.
552    pub fn git_root(&self) -> Option<&Path> {
553        self.sync_data.git_workdir.as_deref()
554    }
555
556    pub fn has_explicit_cache_budget(&self) -> bool {
557        self.has_explicit_cache_budget
558    }
559
560    pub fn set_cache_budget(&mut self, budget: ContentCacheBudget) {
561        self.cache_budget = Arc::new(budget);
562    }
563
564    /// Get all indexed files sorted by path.
565    /// Note: Files are stored sorted by PATH for efficient insert/remove.
566    /// For frecency-sorted results, use search() which sorts matched results.
567    pub fn get_files(&self) -> &[FileItem] {
568        self.sync_data.files()
569    }
570
571    pub fn get_overflow_files(&self) -> &[FileItem] {
572        self.sync_data.overflow_files()
573    }
574
575    /// Get the directory table (sorted by path).
576    pub fn get_dirs(&self) -> &[DirItem] {
577        &self.sync_data.dirs
578    }
579
580    /// Actual heap bytes used: (chunked_path_store, 0, 0).
581    /// The second element is 0 because leaked overflow stores aren't tracked.
582    pub fn arena_bytes(&self) -> (usize, usize, usize) {
583        let chunked = self
584            .sync_data
585            .chunked_paths
586            .as_ref()
587            .map_or(0, |s| s.heap_bytes());
588
589        (chunked, 0, 0)
590    }
591
592    #[tracing::instrument(level = "debug", skip_all)]
593    pub(crate) fn for_each_dir(&self, mut f: impl FnMut(&Path) -> ControlFlow<()>) {
594        let dir_table = &self.sync_data.dirs;
595        let base = self.base_path.as_path();
596
597        if !dir_table.is_empty() {
598            let arena = self.arena_base_ptr();
599            let mut path_buf = PathBuf::with_capacity(crate::simd_path::PATH_BUF_SIZE);
600            let mut prev_relative_path = String::new();
601
602            let mut scratch_buf = [0u8; crate::simd_path::PATH_BUF_SIZE];
603            for dir_item in dir_table {
604                let full_relative_path = dir_item.read_relative_path(arena, &mut scratch_buf);
605                let relative_path = full_relative_path.trim_end_matches(std::path::is_separator);
606
607                if relative_path.is_empty() {
608                    // Files directly under base_path
609                    prev_relative_path.clear();
610                    continue;
611                }
612
613                let mut i = common_dir_prefix_len(&prev_relative_path, relative_path);
614                // If we stopped on a separator, skip it — we want to start
615                // emitting at the first unseen segment, not re-emit the
616                // already-emitted prefix path.
617                if i < relative_path.len()
618                    && std::path::is_separator(relative_path.as_bytes()[i] as char)
619                {
620                    i += 1;
621                }
622
623                // Walk the suffix of `relative_path` one segment at a time, emitting
624                // each previously unseen ancestor up to and including `relative_path`.
625                while i < relative_path.len() {
626                    let next_sep = relative_path[i..]
627                        .find(std::path::is_separator)
628                        .map(|off| i + off)
629                        .unwrap_or(relative_path.len());
630                    let ancestor_rel = &relative_path[..next_sep];
631
632                    path_buf.clear();
633                    path_buf.push(base);
634                    path_buf.push(ancestor_rel);
635
636                    // we can't really emit iterator here unfortunately
637                    if matches!(f(path_buf.as_path()), ControlFlow::Break(())) {
638                        return;
639                    }
640
641                    i = next_sep + 1;
642                }
643
644                prev_relative_path.clear();
645                prev_relative_path.push_str(relative_path);
646            }
647            return;
648        }
649
650        // fallback that should never be happening, but it is possible to get the file
651        // path from the absolute path using components api as well:
652        let files = self.sync_data.files();
653        let arena = self.arena_base_ptr();
654        let mut current = self.base_path.clone();
655        let mut path_buf = [0u8; PATH_BUF_SIZE];
656
657        for file in files {
658            let abs = file.write_absolute_path(arena, base, &mut path_buf);
659            let Some(parent) = abs.parent() else {
660                continue;
661            };
662            if parent == current.as_path() {
663                continue;
664            }
665
666            while current.as_path() != base && !parent.starts_with(&current) {
667                current.pop();
668            }
669
670            let Ok(remainder) = parent.strip_prefix(&current) else {
671                continue;
672            };
673            for component in remainder.components() {
674                current.push(component);
675                if matches!(f(current.as_path()), ControlFlow::Break(())) {
676                    return;
677                }
678            }
679        }
680    }
681
682    /// Create a new FilePicker from options.
683    /// Always prefer new_with_shared_state for the consumer application, use this only if you know
684    /// what you are doing. This won't spawn the backgraound watcher and won't walk the file tree.
685    pub fn new(options: FilePickerOptions) -> Result<Self, Error> {
686        let path = PathBuf::from(&options.base_path);
687        if !path.exists() {
688            error!("Base path does not exist: {}", options.base_path);
689            return Err(Error::InvalidPath(path));
690        }
691        if path.parent().is_none() {
692            error!("Refusing to index filesystem root: {}", path.display());
693            return Err(Error::FilesystemRoot(path));
694        }
695
696        // Windows-only: canonicalize with so the base path does NOT
697        // have the `\\?\` UNC prefix that `std::fs::canonicalize` adds.
698        // libgit2's `repo.workdir()`
699        #[cfg(windows)]
700        let path = crate::path_utils::canonicalize(&path).unwrap_or(path);
701
702        let has_explicit_budget = options.cache_budget.is_some();
703        let initial_budget = options.cache_budget.unwrap_or_default();
704
705        Ok(FilePicker {
706            background_watcher: None,
707            base_path: path,
708            cache_budget: Arc::new(initial_budget),
709            has_explicit_cache_budget: has_explicit_budget,
710            signals: crate::scan::ScanSignals::default(),
711            mode: options.mode,
712            scanned_files_count: Arc::new(AtomicUsize::new(0)),
713            sync_data: FileSync::new(),
714            enable_mmap_cache: options.enable_mmap_cache,
715            enable_content_indexing: options.enable_content_indexing,
716            watch: options.watch,
717        })
718    }
719
720    /// Create a picker, place it into the shared handle, and spawn background
721    /// indexing + file-system watcher. This is the default entry point.
722    pub fn new_with_shared_state(
723        shared_picker: SharedFilePicker,
724        shared_frecency: SharedFrecency,
725        options: FilePickerOptions,
726    ) -> Result<(), Error> {
727        let picker = Self::new(options)?;
728
729        info!(
730            "Spawning background threads: base_path={}, warmup={}, content_indexing={}, mode={:?}",
731            picker.base_path.display(),
732            picker.enable_mmap_cache,
733            picker.enable_content_indexing,
734            picker.mode,
735        );
736
737        let warmup = picker.enable_mmap_cache;
738        let content_indexing = picker.enable_content_indexing;
739        let watch = picker.watch;
740        let mode = picker.mode;
741
742        let signals = picker.scan_signals();
743        let scanned_files_counter = picker.scanned_files_counter();
744        let path = picker.base_path.clone();
745
746        {
747            let mut guard = shared_picker.write()?;
748            *guard = Some(picker);
749        }
750
751        // `ScanJob::spawn` flips `scanning=true` synchronously before handing
752        // off to the worker thread, so callers that invoke `wait_for_scan`
753        // immediately after `new_with_shared_state` are guaranteed to see
754        // the scan in progress.
755        ScanJob::new_initial(
756            shared_picker,
757            shared_frecency,
758            path,
759            mode,
760            signals,
761            scanned_files_counter,
762            ScanConfig {
763                warmup,
764                content_indexing,
765                watch,
766                auto_cache_budget: true,
767                install_watcher: true,
768            },
769        )
770        .spawn();
771
772        Ok(())
773    }
774
775    /// Synchronous filesystem scan — populates `self` with indexed files.
776    ///
777    /// Use this when you need direct access to the picker without shared state:
778    /// ```ignore
779    /// let mut picker = FilePicker::new(options)?;
780    /// picker.collect_files()?;
781    /// // picker.get_files() is now populated
782    /// ```
783    pub fn collect_files(&mut self) -> Result<(), Error> {
784        self.signals.scanning.store(true, Ordering::Relaxed);
785        self.scanned_files_count.store(0, Ordering::Relaxed);
786
787        let git_workdir = FileSync::discover_git_workdir(&self.base_path);
788        let git_handle = git_workdir.clone().map(FileSync::spawn_git_status);
789
790        let empty_frecency = SharedFrecency::default();
791        let sync = FileSync::walk_filesystem(
792            &self.base_path,
793            git_workdir,
794            &self.scanned_files_count,
795            &empty_frecency,
796            self.mode,
797        )?;
798
799        self.sync_data = sync;
800
801        // Recalculate cache budget based on actual file count (unless
802        // the caller provided an explicit budget via FilePickerOptions).
803        if !self.has_explicit_cache_budget {
804            let file_count = self.sync_data.files().len();
805            self.cache_budget = Arc::new(ContentCacheBudget::new_for_repo(file_count));
806        } else {
807            self.cache_budget.reset();
808        }
809
810        // Apply git status synchronously.
811        if let Some(handle) = git_handle
812            && let Ok(Some(git_cache)) = handle.join()
813        {
814            let arena = self.arena_base_ptr();
815            for file in self.sync_data.files.iter_mut() {
816                file.git_status =
817                    git_cache.lookup_status(&file.absolute_path(arena, &self.base_path));
818            }
819        }
820
821        self.signals.scanning.store(false, Ordering::Relaxed);
822        Ok(())
823    }
824
825    /// Start the background file-system watcher.
826    ///
827    /// The picker must already be placed into `shared_picker` (the watcher
828    /// needs the shared handle to apply live updates). Call after
829    /// [`collect_files`](Self::collect_files) or after an initial scan.
830    pub fn spawn_background_watcher(
831        &mut self,
832        shared_picker: &SharedFilePicker,
833        shared_frecency: &SharedFrecency,
834    ) -> Result<(), Error> {
835        let git_workdir = self.sync_data.git_workdir.clone();
836        let watcher = BackgroundWatcher::new(
837            self.base_path.clone(),
838            git_workdir,
839            shared_picker.clone(),
840            shared_frecency.clone(),
841            self.mode,
842        )?;
843        self.background_watcher = Some(watcher);
844
845        // On macOS, FSEventStreamCreate schedules the stream on the runloop but
846        // it needs time to actually start delivering events. Without this delay,
847        // events created immediately after wait_for_watcher returns can be missed.
848        #[cfg(target_os = "macos")]
849        std::thread::sleep(std::time::Duration::from_millis(500));
850
851        self.signals.watcher_ready.store(true, Ordering::Release);
852        Ok(())
853    }
854
855    /// Perform fuzzy search on files with a pre-parsed query.
856    ///
857    /// The query should be parsed using [`FfsQuery`]::parse() before calling
858    /// this function. If a [`QueryTracker`] is provided, the search will
859    /// automatically look up the last selected file for this query and boost it
860    pub fn fuzzy_search<'q>(
861        &self,
862        query: &'q FfsQuery<'q>,
863        query_tracker: Option<&QueryTracker>,
864        options: FuzzySearchOptions<'q>,
865    ) -> SearchResult<'_> {
866        let files = self.get_files();
867        let max_threads = if options.max_threads == 0 {
868            std::thread::available_parallelism()
869                .map(|n| n.get())
870                .unwrap_or(4)
871        } else {
872            options.max_threads
873        };
874
875        debug!(
876            raw_query = ?query.raw_query,
877            pagination = ?options.pagination,
878            ?max_threads,
879            current_file = ?options.current_file,
880            "Fuzzy search",
881        );
882
883        let total_files = files.len();
884        let location = query.location;
885
886        // Get effective query for max_typos calculation (without location suffix)
887        let effective_query = match &query.fuzzy_query {
888            ffs_query_parser::FuzzyQuery::Text(t) => *t,
889            ffs_query_parser::FuzzyQuery::Parts(parts) if !parts.is_empty() => parts[0],
890            _ => query.raw_query.trim(),
891        };
892
893        // small queries with a large number of results can match absolutely everything
894        let max_typos = (effective_query.len() as u16 / 4).clamp(2, 6);
895        // Look up the last file selected for this query (combo-boost scoring)
896        let last_same_query_entry =
897            query_tracker
898                .zip(options.project_path)
899                .and_then(|(tracker, project_path)| {
900                    tracker
901                        .get_last_query_entry(
902                            query.raw_query,
903                            project_path,
904                            options.min_combo_count,
905                        )
906                        .ok()
907                        .flatten()
908                });
909
910        let context = ScoringContext {
911            query,
912            max_typos,
913            max_threads,
914            project_path: options.project_path,
915            current_file: options.current_file,
916            last_same_query_match: last_same_query_entry,
917            combo_boost_score_multiplier: options.combo_boost_score_multiplier,
918            min_combo_count: options.min_combo_count,
919            pagination: options.pagination,
920        };
921
922        let time = std::time::Instant::now();
923
924        let base_arena = self.sync_data.arena_base_ptr();
925        let overflow_arena = self
926            .sync_data
927            .overflow_builder
928            .as_ref()
929            .map(|b| b.as_arena_ptr())
930            .unwrap_or(base_arena);
931
932        let (items, scores, total_matched) = fuzzy_match_and_score_files(
933            files,
934            &context,
935            self.sync_data.base_count,
936            base_arena,
937            overflow_arena,
938        );
939
940        info!(
941            ?query,
942            completed_in = ?time.elapsed(),
943            total_matched,
944            returned_count = items.len(),
945            pagination = ?options.pagination,
946            "Fuzzy search completed",
947        );
948
949        SearchResult {
950            items,
951            scores,
952            total_matched,
953            total_files,
954            location,
955        }
956    }
957
958    /// Perform fuzzy search on indexed directories.
959    ///
960    /// Returns directories ranked by fuzzy match quality + frecency.
961    pub fn fuzzy_search_directories<'q>(
962        &self,
963        query: &'q FfsQuery<'q>,
964        options: FuzzySearchOptions<'q>,
965    ) -> DirSearchResult<'_> {
966        let dirs = self.get_dirs();
967        let max_threads = if options.max_threads == 0 {
968            std::thread::available_parallelism()
969                .map(|n| n.get())
970                .unwrap_or(4)
971        } else {
972            options.max_threads
973        };
974
975        let total_dirs = dirs.len();
976
977        let effective_query = match &query.fuzzy_query {
978            ffs_query_parser::FuzzyQuery::Text(t) => *t,
979            ffs_query_parser::FuzzyQuery::Parts(parts) if !parts.is_empty() => parts[0],
980            _ => query.raw_query.trim(),
981        };
982
983        let max_typos = (effective_query.len() as u16 / 4).clamp(2, 6);
984
985        let context = ScoringContext {
986            query,
987            max_typos,
988            max_threads,
989            project_path: options.project_path,
990            current_file: options.current_file,
991            last_same_query_match: None,
992            combo_boost_score_multiplier: 0,
993            min_combo_count: 0,
994            pagination: options.pagination,
995        };
996
997        let arena = self.sync_data.arena_base_ptr();
998        let time = std::time::Instant::now();
999
1000        let (items, scores, total_matched) =
1001            crate::score::fuzzy_match_and_score_dirs(dirs, &context, arena);
1002
1003        info!(
1004            ?query,
1005            completed_in = ?time.elapsed(),
1006            total_matched,
1007            returned_count = items.len(),
1008            "Directory search completed",
1009        );
1010
1011        DirSearchResult {
1012            items,
1013            scores,
1014            total_matched,
1015            total_dirs,
1016        }
1017    }
1018
1019    /// Perform a mixed fuzzy search across both files and directories.
1020    ///
1021    /// Returns a single flat list where files and directories are interleaved
1022    /// by total score in descending order.
1023    ///
1024    /// If the raw query ends with a path separator (`/`), only directories
1025    /// are searched — files are skipped entirely. The caller should parse the
1026    /// query with `DirSearchConfig` so that trailing `/` is kept as fuzzy
1027    /// text instead of becoming a `PathSegment` constraint.
1028    pub fn fuzzy_search_mixed<'q>(
1029        &self,
1030        query: &'q FfsQuery<'q>,
1031        query_tracker: Option<&QueryTracker>,
1032        options: FuzzySearchOptions<'q>,
1033    ) -> MixedSearchResult<'_> {
1034        let location = query.location;
1035        let page_offset = options.pagination.offset;
1036        let page_limit = if options.pagination.limit > 0 {
1037            options.pagination.limit
1038        } else {
1039            100
1040        };
1041
1042        let dirs_only =
1043            query.raw_query.ends_with(std::path::MAIN_SEPARATOR) || query.raw_query.ends_with('/');
1044
1045        // Run file search and dir search with no pagination (we merge then paginate).
1046        let internal_limit = page_offset.saturating_add(page_limit).saturating_mul(2);
1047
1048        let dir_options = FuzzySearchOptions {
1049            pagination: PaginationArgs {
1050                offset: 0,
1051                limit: internal_limit,
1052            },
1053            ..options
1054        };
1055        let dir_results = self.fuzzy_search_directories(query, dir_options);
1056
1057        if dirs_only {
1058            let total_matched = dir_results.total_matched;
1059            let total_dirs = dir_results.total_dirs;
1060
1061            let mut merged: Vec<(MixedItemRef<'_>, Score)> =
1062                Vec::with_capacity(dir_results.items.len());
1063            for (dir, score) in dir_results.items.into_iter().zip(dir_results.scores) {
1064                merged.push((MixedItemRef::Dir(dir), score));
1065            }
1066
1067            if page_offset >= merged.len() {
1068                return MixedSearchResult {
1069                    items: vec![],
1070                    scores: vec![],
1071                    total_matched,
1072                    total_files: self.sync_data.files().len(),
1073                    total_dirs,
1074                    location,
1075                };
1076            }
1077
1078            let end = (page_offset + page_limit).min(merged.len());
1079            let page = merged.drain(page_offset..end);
1080            let (items, scores): (Vec<_>, Vec<_>) = page.unzip();
1081
1082            return MixedSearchResult {
1083                items,
1084                scores,
1085                total_matched,
1086                total_files: self.sync_data.files().len(),
1087                total_dirs,
1088                location,
1089            };
1090        }
1091
1092        let file_options = FuzzySearchOptions {
1093            pagination: PaginationArgs {
1094                offset: 0,
1095                limit: internal_limit,
1096            },
1097            ..options
1098        };
1099        let file_results = self.fuzzy_search(query, query_tracker, file_options);
1100
1101        // Merge by score descending.
1102        let total_matched = file_results.total_matched + dir_results.total_matched;
1103        let total_files = file_results.total_files;
1104        let total_dirs = dir_results.total_dirs;
1105
1106        let mut merged: Vec<(MixedItemRef<'_>, Score)> =
1107            Vec::with_capacity(file_results.items.len() + dir_results.items.len());
1108
1109        for (file, score) in file_results.items.into_iter().zip(file_results.scores) {
1110            merged.push((MixedItemRef::File(file), score));
1111        }
1112        for (dir, score) in dir_results.items.into_iter().zip(dir_results.scores) {
1113            merged.push((MixedItemRef::Dir(dir), score));
1114        }
1115
1116        // Sort merged results by total score descending.
1117        merged.sort_unstable_by_key(|b| std::cmp::Reverse(b.1.total));
1118
1119        // Paginate.
1120        if page_offset >= merged.len() {
1121            return MixedSearchResult {
1122                items: vec![],
1123                scores: vec![],
1124                total_matched,
1125                total_files,
1126                total_dirs,
1127                location,
1128            };
1129        }
1130
1131        let end = (page_offset + page_limit).min(merged.len());
1132        let page = merged.drain(page_offset..end);
1133        let (items, scores): (Vec<_>, Vec<_>) = page.unzip();
1134
1135        MixedSearchResult {
1136            items,
1137            scores,
1138            total_matched,
1139            total_files,
1140            total_dirs,
1141            location,
1142        }
1143    }
1144
1145    /// Perform a live grep search across indexed files.
1146    ///
1147    /// If `options.abort_signal` is set it overrides the picker's internal
1148    /// cancellation flag, giving the caller full control over when to stop.
1149    pub fn grep(&self, query: &FfsQuery<'_>, options: &GrepSearchOptions) -> GrepResult<'_> {
1150        let overlay_guard = self.sync_data.bigram_overlay.as_ref().map(|o| o.read());
1151        let arena = self.arena_base_ptr();
1152        let overflow_arena = self.sync_data.overflow_arena_ptr();
1153        let cancel = options
1154            .abort_signal
1155            .as_deref()
1156            .unwrap_or(&self.signals.cancelled);
1157
1158        grep_search(
1159            self.get_files(),
1160            query,
1161            options,
1162            self.cache_budget(),
1163            self.sync_data.bigram_index.as_deref(),
1164            overlay_guard.as_deref(),
1165            cancel,
1166            &self.base_path,
1167            arena,
1168            overflow_arena,
1169        )
1170    }
1171
1172    /// Multi-pattern grep search across indexed files.
1173    pub fn multi_grep(
1174        &self,
1175        patterns: &[&str],
1176        constraints: &[ffs_query_parser::Constraint<'_>],
1177        options: &GrepSearchOptions,
1178    ) -> GrepResult<'_> {
1179        let overlay_guard = self.sync_data.bigram_overlay.as_ref().map(|o| o.read());
1180        let arena = self.arena_base_ptr();
1181        let overflow_arena = self.sync_data.overflow_arena_ptr();
1182        let cancel = options
1183            .abort_signal
1184            .as_deref()
1185            .unwrap_or(&self.signals.cancelled);
1186
1187        multi_grep_search(
1188            self.get_files(),
1189            patterns,
1190            constraints,
1191            options,
1192            self.cache_budget(),
1193            self.sync_data.bigram_index.as_deref(),
1194            overlay_guard.as_deref(),
1195            cancel,
1196            &self.base_path,
1197            arena,
1198            overflow_arena,
1199        )
1200    }
1201
1202    #[doc(hidden)]
1203    pub fn grep_original(
1204        &self,
1205        query: &FfsQuery<'_>,
1206        options: &GrepSearchOptions,
1207    ) -> GrepResult<'_> {
1208        let arena = self.arena_base_ptr();
1209        let overflow_arena = self.sync_data.overflow_arena_ptr();
1210        let cancel = options
1211            .abort_signal
1212            .as_deref()
1213            .unwrap_or(&self.signals.cancelled);
1214
1215        grep_search(
1216            self.get_files(),
1217            query,
1218            options,
1219            self.cache_budget(),
1220            self.sync_data.bigram_index.as_deref(),
1221            None,
1222            cancel,
1223            &self.base_path,
1224            arena,
1225            overflow_arena,
1226        )
1227    }
1228
1229    // Returns an ongoing or finisshed scan progress
1230    pub fn get_scan_progress(&self) -> ScanProgress {
1231        let scanned_count = self.scanned_files_count.load(Ordering::Relaxed);
1232        let is_scanning = self.signals.scanning.load(Ordering::Relaxed);
1233        ScanProgress {
1234            scanned_files_count: scanned_count,
1235            is_scanning,
1236            is_watcher_ready: self.signals.watcher_ready.load(Ordering::Relaxed),
1237            is_warmup_complete: self.sync_data.bigram_index.is_some(),
1238        }
1239    }
1240
1241    pub(crate) fn set_bigram_index(&mut self, index: BigramFilter, overlay: BigramOverlay) {
1242        self.sync_data.bigram_index = Some(Arc::new(index));
1243        self.sync_data.bigram_overlay = Some(Arc::new(parking_lot::RwLock::new(overlay)));
1244    }
1245
1246    pub(crate) fn scan_signals(&self) -> crate::scan::ScanSignals {
1247        self.signals.clone()
1248    }
1249
1250    pub(crate) fn scanned_files_counter(&self) -> Arc<AtomicUsize> {
1251        Arc::clone(&self.scanned_files_count)
1252    }
1253
1254    /// Capture raw pointers to the picker's internal arrays for off-lock use.
1255    ///
1256    /// Sets `post_scan_indexing_active` and returns a snapshot that clears it
1257    /// on drop. This is the ONLY approved way to escape the lock for
1258    /// long-running parallel work (git status, warmup, bigram).
1259    ///
1260    /// Returns `None` if `post_scan_indexing_active` is already set — this
1261    /// means another post-scan is in flight and we must not create a second
1262    /// set of dangling pointers.
1263    ///
1264    /// # Safety
1265    /// 1. `walk_filesystem` reserved `MAX_OVERFLOW_FILES` capacity on the
1266    ///    files Vec at creation — watcher pushes cannot reallocate it.
1267    /// 2. `post_scan_indexing_active` is set — prevents `commit_new_sync`
1268    ///    from replacing the Vec (ScanJob::new checks this flag).
1269    /// 3. Only `[..base_count]` is accessed — base files use the immutable
1270    ///    base arena. Overflow files use a different arena.
1271    pub(crate) unsafe fn post_scan_snapshot(&self) -> Option<PostScanUnsafeSnapshot> {
1272        if self
1273            .signals
1274            .post_scan_indexing_active
1275            .load(Ordering::Acquire)
1276        {
1277            tracing::error!(
1278                "Can not acquire post scan unsafe snapshot, someone already acquired it"
1279            );
1280            return None;
1281        }
1282
1283        self.signals
1284            .post_scan_indexing_active
1285            .store(true, Ordering::Release);
1286
1287        let files = &self.sync_data.files;
1288        let dirs = &self.sync_data.dirs;
1289        Some(PostScanUnsafeSnapshot {
1290            files: files.as_ptr() as *mut FileItem,
1291            base_count: self.sync_data.base_count,
1292            indexable_count: self.sync_data.indexable_count,
1293            dirs: dirs.as_ptr(),
1294            dirs_len: dirs.len(),
1295            arena: self.sync_data.arena_base_ptr(),
1296            budget: &*self.cache_budget as *const _,
1297            base_path: self.base_path.clone(),
1298            post_scan_flag: Arc::clone(&self.signals.post_scan_indexing_active),
1299        })
1300    }
1301
1302    pub(crate) fn commit_new_sync(&mut self, sync: FileSync) {
1303        self.sync_data = sync;
1304        self.cache_budget.reset();
1305    }
1306
1307    #[inline]
1308    pub(crate) fn arena_base_ptr(&self) -> ArenaPtr {
1309        self.sync_data.arena_base_ptr()
1310    }
1311
1312    /// Update git statuses for files, using the provided shared frecency tracker.
1313    pub(crate) fn update_git_statuses(
1314        &mut self,
1315        status_cache: GitStatusCache,
1316        shared_frecency: &SharedFrecency,
1317    ) -> Result<(), Error> {
1318        debug!(
1319            statuses_count = status_cache.statuses_len(),
1320            "Updating git status",
1321        );
1322
1323        let mode = self.mode;
1324        let bp = self.base_path.clone();
1325        let arena = self.arena_base_ptr();
1326        let frecency = shared_frecency.read()?;
1327        status_cache
1328            .into_iter()
1329            .try_for_each(|(path, status)| -> Result<(), Error> {
1330                if let Some(file) = self.get_mut_file_by_path(&path) {
1331                    file.git_status = Some(status);
1332                    if let Some(ref f) = *frecency {
1333                        file.update_frecency_scores(f, arena, &bp, mode)?;
1334                    }
1335                    // Update parent dir frecency inline.
1336                    let score = file.access_frecency_score as i32;
1337                    let dir_idx = file.parent_dir_index() as usize;
1338                    if let Some(dir) = self.sync_data.dirs.get_mut(dir_idx) {
1339                        dir.update_frecency_if_larger(score);
1340                    }
1341                } else {
1342                    // Expected on sparse checkouts: git reports a status for
1343                    // a path that isn't materialized on disk and therefore
1344                    // isn't in the file index. Don't spam the log (#404).
1345                    debug!(?path, "Git status for path not in index, skipping");
1346                }
1347                Ok(())
1348            })?;
1349
1350        Ok(())
1351    }
1352
1353    pub fn update_single_file_frecency(
1354        &mut self,
1355        file_path: impl AsRef<Path>,
1356        frecency_tracker: &FrecencyTracker,
1357    ) -> Result<(), Error> {
1358        let path = file_path.as_ref();
1359        let arena = self.arena_base_ptr();
1360        let rel = self.to_relative_path(path);
1361        let rel_ref: &str = rel.as_deref().unwrap_or("");
1362        let index = self
1363            .sync_data
1364            .find_file_index(path, &self.base_path)
1365            .ok()
1366            .or_else(|| self.sync_data.find_overflow_index(rel_ref));
1367        if let Some(index) = index
1368            && let Some(file) = self.sync_data.get_file_mut(index)
1369        {
1370            file.update_frecency_scores(frecency_tracker, arena, &self.base_path, self.mode)?;
1371
1372            // Update parent dir frecency inline (only if larger).
1373            let score = file.access_frecency_score as i32;
1374            let dir_idx = file.parent_dir_index() as usize;
1375            if let Some(dir) = self.sync_data.dirs.get_mut(dir_idx) {
1376                dir.update_frecency_if_larger(score);
1377            }
1378        }
1379
1380        Ok(())
1381    }
1382
1383    pub fn get_file_by_path(&self, path: impl AsRef<Path>) -> Option<&FileItem> {
1384        self.sync_data
1385            .find_file_index(path.as_ref(), &self.base_path)
1386            .ok()
1387            .and_then(|index| self.sync_data.files().get(index))
1388    }
1389
1390    pub fn get_mut_file_by_path(&mut self, path: impl AsRef<Path>) -> Option<&mut FileItem> {
1391        let path = path.as_ref();
1392        let rel = self.to_relative_path(path);
1393        let rel_ref: &str = rel.as_deref().unwrap_or("");
1394        let index = self
1395            .sync_data
1396            .find_file_index(path, &self.base_path)
1397            .ok()
1398            .or_else(|| self.sync_data.find_overflow_index(rel_ref));
1399        index.and_then(|i| self.sync_data.get_file_mut(i))
1400    }
1401
1402    /// Handle the event of certain file being modified or adds a neww file if it is not added
1403    /// If this function returns `None` it means that picker is in the invalid state, or the capacity
1404    /// of index is exhausted and a new rescan needs to be triggered.
1405    #[tracing::instrument(skip(self),level = Level::DEBUG)]
1406    pub fn handle_create_or_modify(&mut self, path: impl AsRef<Path> + Debug) -> Option<&FileItem> {
1407        let path = path.as_ref();
1408
1409        if let Ok(idx) = self.sync_data.find_file_index(path, &self.base_path) {
1410            return self.handle_file_modify(path, FileSlot::Base(idx));
1411        }
1412
1413        let relative_path = self.to_relative_path(path)?;
1414        if let Some(idx) = self.sync_data.find_overflow_index(&relative_path) {
1415            return self.handle_file_modify(path, FileSlot::Overflow(idx));
1416        }
1417
1418        self.add_new_file(path)
1419    }
1420
1421    #[tracing::instrument(skip_all, fields(path = ?path), level = Level::DEBUG)]
1422    fn handle_file_modify(&mut self, path: &Path, slot: FileSlot) -> Option<&FileItem> {
1423        let overlay = self.sync_data.bigram_overlay.as_ref().map(Arc::clone);
1424        let pos = slot.index();
1425        let file = self.sync_data.get_file_mut(pos)?;
1426
1427        let metadata = std::fs::metadata(path)
1428            .inspect_err(|e| {
1429                tracing::error!(
1430                    ?e,
1431                    "File market for modification doesn't exists or not accessible"
1432                )
1433            })
1434            .ok()?; // if we can't read metadata this file either doesn't exists or not accessible
1435
1436        let size = metadata.len();
1437        let modified_time = metadata
1438            .modified()
1439            .ok()
1440            .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
1441            .map(|d| d.as_secs());
1442
1443        if file.is_deleted() {
1444            file.set_deleted(false);
1445        }
1446
1447        file.update_metadata(&self.cache_budget, modified_time, Some(size));
1448
1449        // only base-region entries participate in the bigram overlay
1450        if matches!(slot, FileSlot::Base(_))
1451            && let Some(ref overlay) = overlay
1452        {
1453            let in_indexable = {
1454                let guard = overlay.read();
1455                pos < guard.base_file_count()
1456            };
1457
1458            if in_indexable && let Ok(content) = std::fs::read(path) {
1459                overlay.write().modify_file(pos, &content);
1460            }
1461        }
1462
1463        Some(&*self.sync_data.get_file_mut(pos)?)
1464    }
1465
1466    /// Adds a new file to picker, if the file can not be added returns `None`
1467    /// which indicates that it's time to trigger a new sync
1468    #[tracing::instrument(skip(self))]
1469    pub fn add_new_file(&mut self, path: &Path) -> Option<&FileItem> {
1470        // On Windows `pathdiff::diff_paths` is byte-wise, so a short-name
1471        // input never shares a prefix with the canonicalized base_path and
1472        // the resulting relative path becomes absolute. Canonicalize first.
1473        #[cfg(windows)]
1474        let canonical_buf: Option<PathBuf> = if path.starts_with(&self.base_path) {
1475            None
1476        } else if let Ok(c) = crate::path_utils::canonicalize(path) {
1477            Some(c)
1478        } else {
1479            let parent = path.parent()?;
1480            let file_name = path.file_name()?;
1481            let mut p = crate::path_utils::canonicalize(parent).ok()?;
1482            p.push(file_name);
1483            Some(p)
1484        };
1485        #[cfg(windows)]
1486        let path_for_index: &Path = canonical_buf.as_deref().unwrap_or(path);
1487        #[cfg(not(windows))]
1488        let path_for_index: &Path = path;
1489
1490        let (mut file_item, rel_path) =
1491            FileItem::new(path_for_index.to_path_buf(), &self.base_path, None);
1492
1493        // Lazily create the shared overflow builder if not exists yet
1494        let builder = self
1495            .sync_data
1496            .overflow_builder
1497            .get_or_insert_with(|| crate::simd_path::ChunkedPathStoreBuilder::new(64));
1498
1499        let chunked_path = builder.add_file_immediate(&rel_path, file_item.path.filename_offset);
1500        file_item.set_path(chunked_path);
1501        file_item.set_overflow(true);
1502
1503        if !self.sync_data.files.push(file_item) {
1504            return None;
1505        }
1506
1507        self.sync_data.files.last()
1508    }
1509
1510    /// Tombstone a file instead of removing it, keeping base indices stable.
1511    pub fn remove_file_by_path(&mut self, path: impl AsRef<Path>) -> bool {
1512        let path = path.as_ref();
1513        match self.sync_data.find_file_index(path, &self.base_path) {
1514            Ok(index) => {
1515                let file = &mut self.sync_data.files[index];
1516                file.set_deleted(true);
1517                // Clear any cached git status — the tombstone no longer
1518                // corresponds to a real worktree file, so any previously
1519                // cached status (e.g. `WT_MODIFIED` from before the
1520                // delete) is actively misleading. All user-facing search
1521                // paths filter `is_deleted()` so this is invisible today,
1522                // but keeping the invariant "tombstone ⇒ git_status=None"
1523                // means a new reader that forgets the filter can't leak
1524                // stale data.
1525                file.git_status = None;
1526                file.invalidate_mmap(&self.cache_budget);
1527                if let Some(ref overlay) = self.sync_data.bigram_overlay {
1528                    overlay.write().delete_file(index);
1529                }
1530                true
1531            }
1532            Err(_) => {
1533                // Check overflow for added files — these can be removed directly
1534                // since they aren't in the base bigram index.
1535                let rel = self.to_relative_path(path);
1536                let rel_ref: &str = rel.as_deref().unwrap_or("");
1537                if let Some(abs_pos) = self.sync_data.find_overflow_index(rel_ref) {
1538                    self.sync_data.files.remove(abs_pos);
1539                    true
1540                } else {
1541                    false
1542                }
1543            }
1544        }
1545    }
1546
1547    // TODO make this O(n)
1548    pub fn remove_all_files_in_dir(&mut self, dir: impl AsRef<Path>) -> usize {
1549        let dir_path = dir.as_ref();
1550        let relative_dir = self
1551            .to_relative_path(dir_path)
1552            .map(|c| c.into_owned())
1553            .unwrap_or_default();
1554
1555        let dir_prefix = if relative_dir.is_empty() {
1556            String::new()
1557        } else {
1558            format!("{}{}", relative_dir, std::path::MAIN_SEPARATOR)
1559        };
1560
1561        self.sync_data.retain_files_with_arena(|file, arena| {
1562            !file.relative_path_starts_with(arena, &dir_prefix)
1563        })
1564    }
1565
1566    /// Use this to prevent any substantial background threads from acquiring the locks
1567    pub fn cancel(&self) {
1568        self.signals.cancelled.store(true, Ordering::Release);
1569    }
1570
1571    /// Stop the background filesystem watcher. Non-blocking.
1572    pub fn stop_background_monitor(&mut self) {
1573        if let Some(mut watcher) = self.background_watcher.take() {
1574            watcher.stop();
1575        }
1576    }
1577
1578    /// Quick way to check if scan is going without acquiring a lock for [Self::get_scan_progress]
1579    pub fn is_scan_active(&self) -> bool {
1580        self.signals.scanning.load(Ordering::Relaxed)
1581    }
1582
1583    /// Return a clone of the watcher-ready flag so callers can poll it without
1584    /// holding a lock on the picker.
1585    pub fn watcher_signal(&self) -> Arc<AtomicBool> {
1586        Arc::clone(&self.signals.watcher_ready)
1587    }
1588
1589    /// Convert an absolute path to a relative path string (relative to base_path).
1590    /// Returns None if the path doesn't start with base_path.
1591    ///
1592    /// On Windows the picker canonicalizes its base via `dunce`, so caller
1593    /// paths that still carry 8.3 short names or a different casing would
1594    /// fail a naive prefix check. Fall back to canonicalizing (or, when the
1595    /// file was just deleted, canonicalizing its parent) before stripping.
1596    fn to_relative_path<'a>(&self, path: &'a Path) -> Option<std::borrow::Cow<'a, str>> {
1597        if let Ok(stripped) = path.strip_prefix(&self.base_path)
1598            && let Some(s) = stripped.to_str()
1599        {
1600            return Some(normalize_relative_path(s));
1601        }
1602
1603        #[cfg(windows)]
1604        {
1605            let rel = canonical_relative_path(path, &self.base_path)?;
1606            return Some(std::borrow::Cow::Owned(rel));
1607        }
1608
1609        #[cfg(not(windows))]
1610        None
1611    }
1612}
1613
/// Rewrite `/` separators to `\` so byte-wise comparisons against
/// arena-stored paths are stable on Windows.
///
/// `pathdiff::diff_paths` (used during scan) builds a `PathBuf` whose
/// segments are joined with `MAIN_SEPARATOR` (`\\`), but lookups go through
/// `Path::strip_prefix` which returns a sub-slice of the original input —
/// so callers that constructed the path via `base.join("a/b/c")` end up
/// with `/` in the lookup key and never match the stored `a\\b\\c`.
///
/// Input without any `/` is returned borrowed, without allocating.
#[cfg(windows)]
#[inline]
fn normalize_relative_path(s: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    if !s.contains('/') {
        return Cow::Borrowed(s);
    }
    Cow::Owned(s.replace('/', "\\"))
}
1631
1632#[cfg(not(windows))]
1633#[inline]
1634fn normalize_relative_path(s: &str) -> std::borrow::Cow<'_, str> {
1635    std::borrow::Cow::Borrowed(s)
1636}
1637
/// Resolve a possibly-short-name Windows path against the picker's canonical
/// base and return the part below `base`.
///
/// Used by the Windows-only fallbacks in `to_relative_path` and
/// `find_file_index` so events still match tombstoned entries.
///
/// Two attempts are made:
/// 1. canonicalize `path` itself and strip `base`;
/// 2. if that yields nothing (a deleted file cannot be canonicalized),
///    canonicalize the parent, strip `base`, and re-attach the file name.
///
/// Returns `None` when neither attempt lands under `base` or the result is
/// not valid UTF-8.
#[cfg(windows)]
fn canonical_relative_path(path: &Path, base: &Path) -> Option<String> {
    let direct = crate::path_utils::canonicalize(path)
        .ok()
        .and_then(|resolved| Some(resolved.strip_prefix(base).ok()?.to_str()?.to_owned()));

    direct.or_else(|| {
        let parent = path.parent()?;
        let file_name = path.file_name()?;
        let resolved_parent = crate::path_utils::canonicalize(parent).ok()?;
        let mut rel = resolved_parent.strip_prefix(base).ok()?.to_path_buf();
        rel.push(file_name);
        rel.to_str().map(str::to_owned)
    })
}
1660
/// Location of a file entry: either in the base section or in the overflow
/// section, each carrying the position within its own section.
#[derive(Debug, Clone, Copy)]
enum FileSlot {
    Base(usize),
    Overflow(usize),
}

impl FileSlot {
    /// The wrapped position, regardless of which section it refers to.
    fn index(self) -> usize {
        // Irrefutable or-pattern: adding a variant turns this into a compile
        // error rather than a silent fallthrough.
        let (FileSlot::Base(position) | FileSlot::Overflow(position)) = self;
        position
    }
}
1674
/// Raw pointers captured from the picker for off-lock parallel work.
/// Created by [`FilePicker::post_scan_snapshot`]. See its safety docs.
///
/// Clears `post_scan_indexing_active` on drop — the flag is set by
/// `post_scan_snapshot` and MUST only be cleared by dropping this struct.
pub(crate) struct PostScanUnsafeSnapshot {
    /// Mutable raw pointer into the picker's file storage.
    /// NOTE(review): presumably the start of the `FileItem` buffer — confirm
    /// against `post_scan_snapshot`.
    pub files: *mut FileItem,
    /// NOTE(review): presumably the number of base (non-overflow) files
    /// reachable through `files` — confirm against `post_scan_snapshot`.
    pub base_count: usize,
    /// NOTE(review): presumably the length of the indexable prefix of the
    /// base files — confirm against `post_scan_snapshot`.
    pub indexable_count: usize,
    /// Read-only raw pointer to the directory table; `dirs_len` is its length.
    pub dirs: *const crate::types::DirItem,
    pub dirs_len: usize,
    /// Handle to the path arena used to resolve stored paths.
    pub arena: ArenaPtr,
    /// Read-only raw pointer to the content-cache budget.
    pub budget: *const crate::types::ContentCacheBudget,
    /// Owned copy of the picker's base path.
    pub base_path: PathBuf,
    /// Shared handle to the flag. It is kept inside the snapshot so that the
    /// `Drop` impl resets it to `false` automatically when the snapshot (and
    /// with it the raw pointers above) goes away.
    post_scan_flag: Arc<AtomicBool>,
}

impl Drop for PostScanUnsafeSnapshot {
    /// Resets the shared flag to `false` with a `Release` store.
    fn drop(&mut self) {
        self.post_scan_flag.store(false, Ordering::Release);
    }
}

// SAFETY: the pointers are derived from Vec/Arc storage that outlives
// any use of this struct (guaranteed by reserve + post_scan_indexing_active).
// Raw pointers are neither `Send` nor `Sync` by default, hence the manual
// impls below.
unsafe impl Send for PostScanUnsafeSnapshot {}
unsafe impl Sync for PostScanUnsafeSnapshot {}
1704
/// A point-in-time snapshot of the file-scanning progress.
///
/// Returned by [`FilePicker::get_scan_progress`]. Useful for displaying
/// a progress indicator while the initial scan is running.
///
/// The values are plain copies; they do not update after the snapshot is
/// taken.
#[derive(Debug, Clone)]
pub struct ScanProgress {
    /// Number of files counted by the scan at snapshot time.
    /// NOTE(review): presumably the walker's running counter — confirm
    /// against `get_scan_progress`.
    pub scanned_files_count: usize,
    /// Whether a scan was in progress at snapshot time.
    pub is_scanning: bool,
    /// Whether the filesystem watcher had signalled readiness at snapshot time.
    pub is_watcher_ready: bool,
    /// Whether warmup had finished at snapshot time.
    /// NOTE(review): presumably refers to the post-scan cache warmup —
    /// confirm against `get_scan_progress`.
    pub is_warmup_complete: bool,
}
1716
1717/// Pre-populate mmap caches for the most valuable files so the first grep
1718/// search doesn't pay the mmap creation + page fault cost.
1719///
1720/// All files are collected once, then an O(n) `select_nth_unstable_by`
1721/// partitions the top [`MAX_CACHED_CONTENT_FILES`] highest-frecency eligible
1722/// files to the front (binary / empty files are pushed to the end by the
1723/// comparator). The selected prefix is warmed in parallel via rayon.
1724///
1725/// Files beyond the budget are still available via temporary mmaps on first
1726/// grep access, so correctness is unaffected.
1727#[tracing::instrument(skip(files), name = "warmup_mmaps", level = Level::DEBUG)]
1728pub(crate) fn warmup_mmaps(
1729    files: &[FileItem],
1730    budget: &ContentCacheBudget,
1731    base_path: &Path,
1732    arena: ArenaPtr,
1733) {
1734    let max_files = budget.max_files;
1735    let max_bytes = budget.max_bytes;
1736    let max_file_size = budget.max_file_size;
1737
1738    // Single collect — no pre-filter. The comparator in select_nth pushes
1739    // ineligible files (binary, empty) to the tail automatically.
1740    let mut all: Vec<&FileItem> = files.iter().collect();
1741
1742    // O(n) partial sort: top max_files eligible-by-frecency files land in
1743    // all[..max_files]. Ineligible files compare as "lowest priority" so
1744    // they naturally sink past the partition boundary.
1745    if all.len() > max_files {
1746        all.select_nth_unstable_by(max_files, |a, b| {
1747            let a_ok = !a.is_binary() && a.size > 0;
1748            let b_ok = !b.is_binary() && b.size > 0;
1749            match (a_ok, b_ok) {
1750                (true, false) => std::cmp::Ordering::Less,
1751                (false, true) => std::cmp::Ordering::Greater,
1752                (false, false) => std::cmp::Ordering::Equal,
1753                (true, true) => b.total_frecency_score().cmp(&a.total_frecency_score()),
1754            }
1755        });
1756    }
1757
1758    let to_warm = &all[..all.len().min(max_files)];
1759
1760    let warmed_bytes = AtomicU64::new(0);
1761    let budget_exhausted = AtomicBool::new(false);
1762
1763    BACKGROUND_THREAD_POOL.install(|| {
1764        to_warm.par_iter().for_each(|file| {
1765            if budget_exhausted.load(Ordering::Relaxed) {
1766                return;
1767            }
1768
1769            if file.is_binary() || file.size == 0 || file.size > max_file_size {
1770                return;
1771            }
1772
1773            // Byte budget.
1774            let prev_bytes = warmed_bytes.fetch_add(file.size, Ordering::Relaxed);
1775            if prev_bytes + file.size > max_bytes {
1776                budget_exhausted.store(true, Ordering::Relaxed);
1777                return;
1778            }
1779
1780            if let Some(content) = file.get_content(arena, base_path, budget) {
1781                let _ = std::hint::black_box(content.first());
1782            }
1783        });
1784    });
1785}
1786
1787impl FileSync {
1788    pub(crate) fn discover_git_workdir(base_path: &Path) -> Option<PathBuf> {
1789        let git_workdir = Repository::discover(base_path)
1790            .ok()
1791            .and_then(|repo| repo.workdir().map(Path::to_path_buf))
1792            .map(crate::path_utils::normalize);
1793
1794        match &git_workdir {
1795            Some(workdir) => debug!("Git repository found at: {}", workdir.display()),
1796            None => warn!("No git repository found for path: {}", base_path.display()),
1797        }
1798
1799        git_workdir
1800    }
1801
1802    pub(crate) fn spawn_git_status(git_workdir: PathBuf) -> JoinHandle<Option<GitStatusCache>> {
1803        std::thread::spawn(move || {
1804            GitStatusCache::read_git_status(
1805                Some(git_workdir.as_path()),
1806                &mut crate::git::default_status_options(),
1807            )
1808        })
1809    }
1810
    /// Walks `base_path` and builds a searchable [`FileSync`] from every file
    /// that survives the ignore rules.
    ///
    /// Git status is NOT computed here: the returned `FileSync` only records
    /// `git_workdir`. This keeps the scan from blocking on `git status`, which
    /// can take 10+ seconds on very large repos (e.g. chromium).
    ///
    /// Steps, in order:
    /// 1. Parallel walk collecting `(FileItem, rel_path)` pairs; each accepted
    ///    file also bumps `synced_files_count`.
    /// 2. Sort the pairs by `(dir_part, filename)` and build the chunked path
    ///    store plus the directory table in one linear pass.
    /// 3. Apply access-based frecency scores when a frecency tracker is
    ///    present, propagating the per-directory maximum.
    /// 4. Re-sort the files so bigram-indexable ones come first and record
    ///    how many there are (`indexable_count`).
    ///
    /// # Errors
    ///
    /// Returns [`Error::AcquireFrecencyLock`] when the read lock on
    /// `shared_frecency` cannot be acquired.
    pub(crate) fn walk_filesystem(
        base_path: &Path,
        git_workdir: Option<PathBuf>,
        synced_files_count: &Arc<AtomicUsize>,
        shared_frecency: &SharedFrecency,
        mode: FfsMode,
    ) -> Result<FileSync, Error> {
        use ignore::WalkBuilder;

        let scan_start = std::time::Instant::now();
        info!("SCAN: Starting filesystem walk and git status (async)");

        // Walk files (the fast part, typically 2-3s even on huge repos).
        let is_git_repo = git_workdir.is_some();
        let bg_threads = BACKGROUND_THREAD_POOL.current_num_threads();

        let mut walk_builder = WalkBuilder::new(base_path);
        walk_builder
            // Important guard for a user opening ~/ or another non-git root
            // dir: outside a git repo, hidden entries are skipped.
            .hidden(!is_git_repo)
            .git_ignore(true)
            .git_exclude(true)
            .git_global(true)
            .ignore(true)
            .follow_links(false)
            .threads(bg_threads);

        // Extra overrides apply only outside a git repo, and only when the
        // helper produces some.
        if !is_git_repo && let Some(overrides) = non_git_repo_overrides(base_path) {
            walk_builder.overrides(overrides);
        }

        let walker = walk_builder.build_parallel();
        let walker_start = std::time::Instant::now();
        debug!("SCAN: Starting file walker");

        // Walk: collect (FileItem, rel_path) pairs. Keep the walk fast —
        // no chunking, no HashMap, just Vec::push under the Mutex.
        let pairs = parking_lot::Mutex::new(Vec::<(FileItem, String)>::new());

        // The outer closure runs once per walker thread and builds that
        // thread's visitor; the boxed inner closure runs once per entry.
        walker.run(|| {
            let pairs = &pairs;
            let counter = Arc::clone(synced_files_count);
            let base_path = base_path.to_path_buf();

            Box::new(move |result| {
                // Entries that failed to read are skipped, not fatal.
                let Ok(entry) = result else {
                    return ignore::WalkState::Continue;
                };

                if entry.file_type().is_some_and(|ft| ft.is_file()) {
                    let path = entry.path();

                    // Ignore walkers sometimes surface files inside `.git/`
                    // when the base is itself a git repo — skip them.
                    if is_git_file(path) {
                        return ignore::WalkState::Continue;
                    }

                    // Outside a git repo, files with a known binary extension
                    // are not indexed at all.
                    if !is_git_repo && is_known_binary_extension(path) {
                        return ignore::WalkState::Continue;
                    }

                    // Metadata is best-effort: a failed lookup is passed on as `None`.
                    let metadata = entry.metadata().ok();
                    let (file_item, rel_path) =
                        FileItem::new_from_walk(path, &base_path, None, metadata.as_ref());

                    pairs.lock().push((file_item, rel_path));
                    counter.fetch_add(1, Ordering::Relaxed);
                }
                ignore::WalkState::Continue
            })
        });

        let mut pairs = pairs.into_inner();
        info!(
            "SCAN: File walking completed in {:?} for {} files",
            walker_start.elapsed(),
            pairs.len(),
        );

        // Sort by (dir_part, filename). This groups files by their directory
        // into contiguous runs so the linear dir-extraction pass below can
        // dedupe by comparing only against the previous dir.
        BACKGROUND_THREAD_POOL.install(|| {
            pairs.par_sort_unstable_by(|(a, path_a), (b, path_b)| {
                // `split_at` panics on a non-boundary index; this relies on
                // `filename_offset` always being at a character boundary.
                let (a_dir, a_file) = path_a.split_at(a.path.filename_offset as usize);
                let (b_dir, b_file) = path_b.split_at(b.path.filename_offset as usize);
                a_dir.cmp(b_dir).then_with(|| a_file.cmp(b_file))
            });
        });

        let mut builder = crate::simd_path::ChunkedPathStoreBuilder::new(pairs.len());
        let dirs = populates_dirs_files_chunked_storage(&mut pairs, &mut builder);

        // The relative-path strings are dropped here; only the file items survive.
        let mut files: Vec<FileItem> = pairs.into_iter().map(|(file, _)| file).collect();
        let chunked_paths = builder.finish();
        let arena = chunked_paths.as_arena_ptr();

        // Apply frecency scores (access-based only — git status not yet available).
        // DirItem.max_access_frecency is AtomicI32, so parallel threads write directly.
        let frecency = shared_frecency
            .read()
            .map_err(|_| Error::AcquireFrecencyLock)?;

        if let Some(frecency) = frecency.as_ref() {
            let dirs_ref = &dirs;
            BACKGROUND_THREAD_POOL.install(|| {
                files.par_iter_mut().for_each(|file| {
                    // Per-file scoring failures are deliberately ignored.
                    let _ = file.update_frecency_scores(frecency, arena, base_path, mode);
                    let score = file.access_frecency_score as i32;
                    if score > 0 {
                        let dir_idx = file.parent_dir_index() as usize;
                        if let Some(dir) = dirs_ref.get(dir_idx) {
                            dir.update_frecency_if_larger(score);
                        }
                    }
                });
            });
        }
        // Release the frecency read guard before the remaining work.
        drop(frecency);

        // Re-sort by (indexable-first, parent_dir, filename). Indexable base
        // files come first so the bigram builder can size its column bitsets to
        // just the indexable subset. Within each partition files stay sorted by
        // (parent_dir, filename) — `find_file_index` does two binary searches
        // (one per partition) to preserve O(log n) lookups.
        //
        // "Indexable" = can possibly contribute bigrams: not binary-by-extension,
        // non-zero size, not larger than the bigram/mmap cap. The cap matches
        // `ContentCacheBudget::max_file_size` default (10 MB) — any file above
        // that is skipped by `build_bigram_index` anyway.
        const BIGRAM_ELIGIBLE_MAX_SIZE: u64 = 10 * 1024 * 1024;
        let is_indexable =
            |f: &FileItem| !f.is_binary() && f.size > 0 && f.size <= BIGRAM_ELIGIBLE_MAX_SIZE;
        BACKGROUND_THREAD_POOL.install(|| {
            files.par_sort_unstable_by(|a, b| {
                // `false < true`, so comparing the negated predicate puts
                // indexable files (`!true == false`) first.
                (!is_indexable(a))
                    .cmp(&!is_indexable(b))
                    .then_with(|| a.parent_dir_index().cmp(&b.parent_dir_index()))
                    .then_with(|| a.file_name(arena).cmp(&b.file_name(arena)))
            });
        });
        // Valid because the sort above placed every indexable file before
        // every non-indexable one.
        let indexable_count = files.partition_point(is_indexable);

        // Ask the allocator to return freed pages to the OS.
        hint_allocator_collect();

        // Memory accounting below is used only for the log line.
        let file_item_size = std::mem::size_of::<FileItem>();
        let files_vec_bytes = files.len() * file_item_size;
        let dir_table_bytes = dirs.len() * std::mem::size_of::<DirItem>()
            + dirs
                .iter()
                .map(|d| d.relative_path(arena).len())
                .sum::<usize>();

        let total_time = scan_start.elapsed();
        info!(
            "SCAN: Walk completed in {:?} ({} files, {} dirs, \
         chunked_store={:.2}MB, files_vec={:.2}MB, dirs={:.2}MB, FileItem={}B)",
            total_time,
            files.len(),
            dirs.len(),
            chunked_paths.heap_bytes() as f64 / 1_048_576.0,
            files_vec_bytes as f64 / 1_048_576.0,
            dir_table_bytes as f64 / 1_048_576.0,
            file_item_size,
        );

        // Every file found by this walk counts as a base file.
        let base_count = files.len();

        Ok(FileSync {
            files: StableVec::from_vec_with_reserve(files, MAX_OVERFLOW_FILES),
            indexable_count,
            base_count,
            dirs,
            overflow_builder: None,
            git_workdir,
            bigram_index: None,
            bigram_overlay: None,
            chunked_paths: Some(chunked_paths),
        })
    }
1998}
1999
2000/// This does both thing (yes sorry all the OOP morons)
2001/// in one go: populates files chunked storage and creates new directories
2002fn populates_dirs_files_chunked_storage<'a>(
2003    pairs: &'a mut [(FileItem, String)],
2004    builder: &mut crate::simd_path::ChunkedPathStoreBuilder,
2005) -> Vec<DirItem> {
2006    let mut dirs: Vec<DirItem> = Vec::new();
2007
2008    let mut prev_dir: &'a str = "";
2009    let mut prev_dir_valid = false;
2010    let mut current_dir_idx: u32 = 0;
2011
2012    for (file, rel) in pairs.iter_mut() {
2013        let rel: &'a str = rel;
2014        let dir_part: &'a str = &rel[..file.path.filename_offset as usize];
2015
2016        if !prev_dir_valid || prev_dir != dir_part {
2017            let dir_string = builder.add_dir_immediate(dir_part);
2018
2019            // Compute last-segment offset: for "src/components/" -> 4 (points to "components/")
2020            let last_seg = if dir_part.is_empty() {
2021                0
2022            } else {
2023                let trimmed = dir_part.trim_end_matches(std::path::is_separator);
2024                trimmed
2025                    .rfind(std::path::is_separator)
2026                    .map(|i| i + 1)
2027                    .unwrap_or(0) as u16
2028            };
2029
2030            dirs.push(DirItem::new(dir_string, last_seg));
2031            current_dir_idx = (dirs.len() - 1) as u32;
2032
2033            prev_dir = dir_part;
2034            prev_dir_valid = true;
2035        }
2036
2037        let cs = builder.add_file_immediate(rel, file.path.filename_offset);
2038
2039        file.set_path(cs);
2040        file.set_parent_dir(current_dir_idx);
2041    }
2042
2043    dirs
2044}
2045
/// Fast extension-based binary detection. Avoids opening files during scan.
/// Covers the vast majority of binary files in typical repositories.
///
/// Matching is ASCII case-insensitive, so `photo.JPG` and `Report.Pdf` are
/// recognized just like their lowercase forms. The lowercase copy is only
/// allocated when the extension actually contains an uppercase letter.
///
/// `Path::extension` returns `None` for dotfiles such as `.DS_Store`, so that
/// file name is checked explicitly instead of through the extension table.
///
/// Returns `false` for paths without an extension (other than `.DS_Store`)
/// and for extensions that are not valid UTF-8.
#[inline]
fn is_known_binary_extension(path: &Path) -> bool {
    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        // No usable extension: the only extension-less name treated as
        // binary is the macOS Finder metadata file.
        return path.file_name() == Some(std::ffi::OsStr::new(".DS_Store"));
    };

    // Normalize case without allocating on the common all-lowercase path.
    let lowered;
    let ext = if ext.bytes().any(|b| b.is_ascii_uppercase()) {
        lowered = ext.to_ascii_lowercase();
        lowered.as_str()
    } else {
        ext
    };

    matches!(
        ext,
        // Images
        "png" | "jpg" | "jpeg" | "gif" | "bmp" | "ico" | "webp" | "tiff" | "tif" | "avif" |
        "heic" | "psd" | "icns" | "cur" | "raw" | "cr2" | "nef" | "dng" |
        // Video/Audio
        "mp4" | "avi" | "mov" | "wmv" | "mkv" | "mp3" | "wav" | "flac" | "ogg" | "m4a" |
        "aac" | "webm" | "flv" | "mpg" | "mpeg" | "wma" | "opus" | "pcm" | "reapeaks" |
        // Compressed/Archives
        "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "zst" | "lz4" | "lzma" |
        "cab" | "cpio" | "jsonlz4" |
        // Packages/Installers
        "deb" | "rpm" | "apk" | "dmg" | "msi" | "iso" | "nupkg" | "whl" | "egg" |
        "snap" | "appimage" | "flatpak" | "crx" | "pak" |
        // Executables/Libraries
        "exe" | "dll" | "so" | "dylib" | "o" | "a" | "lib" | "bin" | "elf" |
        // Documents
        "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" |
        // Databases
        "db" | "sqlite" | "sqlite3" | "mdb" |
        // SQLite / LevelDB auxiliary files
        "sqlite-wal" | "sqlite-shm" | "sqlite3-wal" | "sqlite3-shm" |
        "db-wal" | "db-shm" | "ldb" |
        // Fonts
        "ttf" | "otf" | "woff" | "woff2" | "eot" |
        // Compiled/Runtime
        "class" | "pyc" | "pyo" | "wasm" | "dex" | "jar" | "war" |
        // OCaml / Swift / Objective-C build artefacts
        "cmi" | "cmt" | "cmti" | "cmx" | "cof" | "cot" | "cop" | "nib" |
        "swiftdeps" | "swiftdeps~" | "swiftdoc" | "swiftmodule" | "swiftsourceinfo" |
        // ML/Data Science
        "npy" | "npz" | "pkl" | "pickle" | "h5" | "hdf5" | "pt" | "pth" | "onnx" |
        "safetensors" | "tfrecord" |
        // 3D/Game
        "glb" | "fbx" | "blend" | "blp" | "tga" |
        // Game engines / Unity-Unreal side-files
        "meta" | "dat" | "tfx" | "dia" | "journal" | "toc" | "thm" | "pfl" |
        "shadow" | "scan" | "flm" | "bcmap" | "userinfo" |
        // Data/serialized
        "parquet" | "arrow" | "pb" |
        // IDE/OS metadata (compared after lowercasing, hence the spelling)
        "ds_store" | "suo"
    )
}
2098
/// Treat content as binary when a NUL byte occurs within its first 512 bytes
/// (or within the whole buffer when it is shorter than that).
/// Called lazily when file content is first loaded, not during initial scan.
#[inline]
pub(crate) fn detect_binary_content(content: &[u8]) -> bool {
    // `get(..512)` is `None` for buffers shorter than 512 bytes; inspect the
    // whole buffer in that case.
    let head = content.get(..512).unwrap_or(content);
    head.contains(&0)
}
2106
/// Byte length of the longest directory prefix shared by two relative dir
/// paths (given without a trailing separator).
///
/// The result counts bytes up to and including the last separator both
/// paths have in common. When the shorter path is matched in full and the
/// longer one continues with a separator, the whole shorter path is the
/// shared directory and its length is returned instead.
///
/// Examples:
///   `"src/components"` vs `"src/routes"`   → 4  (`"src/"` emitted once)
///   `"lib/deep/nested"` vs `"lib/deep"`   → 8  (`"lib/deep"` is a prefix)
///   `"lib/deep"` vs `"lib/deeper"`        → 4  (only `"lib/"` is shared)
///   `"lib"` vs `"src"`                    → 0
///
/// Used when enumerating watch directories, to avoid re-emitting ancestors
/// that were already yielded for the previous (sorted) sibling.
fn common_dir_prefix_len(a: &str, b: &str) -> usize {
    let (shorter, longer) = if a.len() > b.len() {
        (b.as_bytes(), a.as_bytes())
    } else {
        (a.as_bytes(), b.as_bytes())
    };

    let mut matched = 0;
    let mut after_last_sep = 0;
    for (&left, &right) in shorter.iter().zip(longer) {
        if left != right {
            break;
        }
        matched += 1;
        if std::path::is_separator(left as char) {
            after_last_sep = matched;
        }
    }

    // The shorter path matched in full: it is a shared directory only when
    // the longer path continues with a separator right after it.
    let shorter_fully_matched = matched > 0 && matched == shorter.len();
    let continues_with_sep = longer
        .get(matched)
        .is_some_and(|&next| std::path::is_separator(next as char));
    if shorter_fully_matched && continues_with_sep {
        return matched;
    }

    after_last_sep
}
2142
/// Hint to the global allocator that freed pages can go back to the OS.
/// Enabled via the `mimalloc-collect` feature (set by ffs-nvim).
/// No-op when the feature is off (tests, system allocator).
pub(crate) fn hint_allocator_collect() {
    #[cfg(feature = "mimalloc-collect")]
    {
        // NOTE(review): FFI call without a documented SAFETY justification in
        // the original — presumably `mi_collect` has no preconditions beyond
        // mimalloc being linked; confirm.
        let collect_here = || unsafe { libmimalloc_sys::mi_collect(true) };

        // Run on every BACKGROUND_THREAD_POOL worker first — that's where the
        // bigram builder allocated memory. `rayon::broadcast` would target
        // the global pool, which is the wrong set of threads.
        BACKGROUND_THREAD_POOL.broadcast(|_| collect_here());

        // Then on the calling thread as well.
        collect_here();
    }
}
2158
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory enumeration (`for_each_dir`) must yield every ancestor
    /// directory below `base_path`, not just the immediate parents of indexed
    /// files. Intermediate dirs that contain only subdirectories (no direct
    /// files) are NOT in `sync_data.dirs` — yet they must still be emitted so
    /// Create events on new subdirectories below them fire.
    ///
    /// Regression guard for any refactor that replaces the ancestor walk with
    /// a direct `sync_data.dirs` iteration.
    #[test]
    fn extract_watch_dirs_includes_pure_ancestor_dirs() {
        let tmp = tempfile::tempdir().unwrap();
        // On Windows the picker canonicalizes base_path with dunce; do the
        // same here so stored dir paths line up with assertions built from
        // `base.join(..)` (which could otherwise carry an 8.3 short name).
        let canonical_base = crate::path_utils::canonicalize(tmp.path()).unwrap();
        let base = canonical_base.as_path();

        // Fixture tree:
        //   base/src/components/button.txt    (src/components has a file)
        //   base/src/routes/home.txt          (src/routes has a file)
        //   base/lib/deep/nested/util.txt     (lib and lib/deep have no files)
        //
        // `sync_data.dirs` will only contain src/components/, src/routes/ and
        // lib/deep/nested/. The watcher additionally needs the pure ancestors
        // src/, lib/ and lib/deep/ — otherwise new siblings such as
        // `src/NewDir/x.txt` are missed.
        let fixture_files = [
            "src/components/button.txt",
            "src/routes/home.txt",
            "lib/deep/nested/util.txt",
        ];
        for rel in fixture_files {
            let file_path = base.join(rel);
            let parent = file_path.parent().unwrap();
            std::fs::create_dir_all(parent).unwrap();
            std::fs::write(&file_path, b"x").unwrap();
        }

        let options = FilePickerOptions {
            base_path: base.to_str().unwrap().into(),
            watch: false,
            ..Default::default()
        };
        let mut picker = FilePicker::new(options).unwrap();
        picker.collect_files().unwrap();

        let mut watch_dirs: Vec<PathBuf> = Vec::new();
        picker.for_each_dir(|dir| {
            watch_dirs.push(dir.to_path_buf());
            std::ops::ControlFlow::Continue(())
        });
        let watch_set: std::collections::HashSet<PathBuf> = watch_dirs.iter().cloned().collect();

        // Immediate parents (present in sync_data.dirs) must be emitted.
        let immediate_parents = ["src/components", "src/routes", "lib/deep/nested"];
        for rel in immediate_parents {
            let expected = base.join(rel);
            assert!(
                watch_set.contains(&expected),
                "expected immediate parent {rel} in watch dirs, got {watch_set:?}",
            );
        }

        // Pure-ancestor dirs (absent from sync_data.dirs) must be emitted too.
        let pure_ancestors = ["src", "lib", "lib/deep"];
        for rel in pure_ancestors {
            let expected = base.join(rel);
            assert!(
                watch_set.contains(&expected),
                "expected pure-ancestor {rel} in watch dirs, got {watch_set:?}",
            );
        }

        // Streaming dedup must not emit the same dir twice.
        assert_eq!(
            watch_dirs.len(),
            watch_set.len(),
            "duplicate watch dir emitted: {watch_dirs:?}",
        );

        // The base path itself is not part of the result — the walk stops at
        // `current == base`. The outer `debouncer.watch(base_path, ...)` call
        // in create_debouncer covers it separately.
        assert!(
            !watch_set.contains(base),
            "base path must not be in watch dirs (covered by the top-level watch call)",
        );
    }

    #[test]
    fn common_dir_prefix_len_cases() {
        // (a, b, expected shared-prefix length)
        let cases: [(&str, &str, usize); 9] = [
            ("", "", 0),
            ("", "src", 0),
            ("lib", "src", 0),
            ("src/components", "src/routes", 4),
            ("lib/deep/nested", "lib/deep", 8),
            ("lib/deep", "lib/deep/nested", 8),
            ("lib/deep", "lib/deeper", 4),
            ("src", "src", 0),
            // "src" is emitted-as-dir; "src/x" extends it — full "src" is shared.
            ("src", "src/x", 3),
        ];

        for (a, b, expected) in cases {
            assert_eq!(common_dir_prefix_len(a, b), expected);
        }
    }
}