Skip to main content

aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, BinaryHeap, HashMap, HashSet};
2use std::fs::{self, File, OpenOptions};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, SeekFrom, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const SPILL_MAGIC: &[u8; 8] = b"AFTSPI01";
27const FILE_TRIGRAM_COUNT_MAGIC: &[u8; 8] = b"AFTFTC01";
28const INDEX_VERSION: u32 = 4;
29const PREVIEW_BYTES: usize = 8 * 1024;
30const SPIMI_SOFT_LIMIT_BYTES: usize = 128 * 1024 * 1024;
31const SPIMI_HARD_LIMIT_BYTES: usize = 256 * 1024 * 1024;
32const SPILL_RECORD_ESTIMATED_BYTES: usize = 16;
33const DELTA_COMPACT_SOFT_FILES: usize = 1_000;
34const DELTA_COMPACT_HARD_FILES: usize = 5_000;
35const DELTA_COMPACT_SOFT_BYTES: usize = 32 * 1024 * 1024;
36const DELTA_COMPACT_HARD_BYTES: usize = 128 * 1024 * 1024;
37const EOF_SENTINEL: u8 = 0;
38const MAX_ENTRIES: usize = 10_000_000;
39const MIN_FILE_ENTRY_BYTES: usize = 57;
40const LOOKUP_ENTRY_BYTES: usize = 16;
41const POSTING_BYTES: usize = 6;
42static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
43
44pub struct CacheLock {
45    _guard: fs_lock::LockGuard,
46}
47
48impl CacheLock {
49    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
50        fs::create_dir_all(cache_dir)?;
51        let path = cache_dir.join("cache.lock");
52        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
53            .lock()
54            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
55        fs_lock::try_acquire(&path, Duration::from_secs(2))
56            .map(|guard| Self { _guard: guard })
57            .map_err(|error| match error {
58                fs_lock::AcquireError::Timeout => {
59                    std::io::Error::other("timed out acquiring search cache lock")
60                }
61                fs_lock::AcquireError::Io(error) => error,
62            })
63    }
64}
65
66#[derive(Clone, Debug)]
67pub struct SearchIndex {
68    base: Option<Arc<BasePostings>>,
69    delta_postings: HashMap<u32, Vec<Posting>>,
70    delta_file_trigrams: HashMap<u32, Vec<u32>>,
71    pub files: Arc<Vec<FileEntry>>,
72    pub path_to_id: Arc<HashMap<PathBuf, u32>>,
73    pub ready: bool,
74    project_root: PathBuf,
75    git_head: Option<String>,
76    max_file_size: u64,
77    ignore_rules_fingerprint: String,
78    pub file_trigram_count: Arc<Vec<u32>>,
79    unindexed_files: Arc<HashSet<u32>>,
80    superseded: HashSet<u32>,
81    base_file_count: u32,
82    delta_packed_bytes: usize,
83    compaction_state: Arc<Mutex<CompactionState>>,
84}
85
86#[derive(Clone, Debug)]
87struct BasePostings {
88    file: Arc<File>,
89    postings_blob_start: u64,
90    postings_blob_len: u64,
91    lookup: Arc<Vec<LookupEntry>>,
92}
93
94#[derive(Clone, Copy, Debug, PartialEq, Eq)]
95struct LookupEntry {
96    trigram: u32,
97    offset: u64,
98    count: u32,
99}
100
101#[derive(Clone, Debug, Default)]
102struct CompactionState {
103    running: bool,
104    requested_again: bool,
105    buffered_paths: Vec<PathBuf>,
106}
107
108#[derive(Clone, Debug)]
109pub struct SearchIndexSnapshot {
110    base: Option<Arc<BasePostings>>,
111    delta_postings: Arc<HashMap<u32, Vec<Posting>>>,
112    files: Arc<Vec<FileEntry>>,
113    path_to_id: Arc<HashMap<PathBuf, u32>>,
114    ready: bool,
115    project_root: PathBuf,
116    file_trigram_count: Arc<Vec<u32>>,
117    unindexed_files: Arc<HashSet<u32>>,
118    superseded: Arc<HashSet<u32>>,
119}
120
121#[derive(Clone, Debug, Default)]
122pub struct LexicalRankResult {
123    pub files: Vec<(PathBuf, f32)>,
124    pub engine_capped: bool,
125}
126
127impl SearchIndex {
128    /// Number of indexed files.
129    pub fn file_count(&self) -> usize {
130        self.files.len()
131    }
132
133    /// Number of unique trigrams in the combined base index and delta postings.
134    pub fn trigram_count(&self) -> usize {
135        self.snapshot().trigram_count()
136    }
137
138    /// Returns an immutable snapshot for queries. Callers must obtain the
139    /// snapshot while holding the RwLock that protects the SearchIndex, then
140    /// drop the guard before running expensive operations such as grep, glob, or
141    /// lexical ranking.
142    pub fn snapshot(&self) -> SearchIndexSnapshot {
143        SearchIndexSnapshot {
144            base: self.base.clone(),
145            delta_postings: Arc::new(self.delta_postings.clone()),
146            files: Arc::clone(&self.files),
147            path_to_id: Arc::clone(&self.path_to_id),
148            ready: self.ready,
149            project_root: self.project_root.clone(),
150            file_trigram_count: Arc::clone(&self.file_trigram_count),
151            unindexed_files: Arc::clone(&self.unindexed_files),
152            superseded: Arc::new(self.superseded.clone()),
153        }
154    }
155
156    /// Compute distinct query trigrams from literal tokens.
157    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
158        query_trigrams_from_tokens(tokens)
159    }
160
161    /// Score-rank file candidates by lexical relevance to query trigrams.
162    pub fn lexical_rank(
163        &self,
164        query_trigrams: &[u32],
165        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
166        max_files: usize,
167    ) -> Vec<(PathBuf, f32)> {
168        self.snapshot()
169            .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
170            .files
171    }
172
173    /// Score-rank file candidates and report whether the pre-filter step that
174    /// collects candidates reached its internal size limit before ranking.
175    pub fn lexical_rank_with_stats(
176        &self,
177        query_trigrams: &[u32],
178        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
179        max_files: usize,
180    ) -> LexicalRankResult {
181        self.snapshot()
182            .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
183    }
184}
185
186impl SearchIndexSnapshot {
187    /// Number of unique trigrams in the combined base index and delta postings.
188    pub fn trigram_count(&self) -> usize {
189        let base_count = self.base.as_ref().map_or(0, |base| base.lookup.len());
190        let Some(base) = &self.base else {
191            return self.delta_postings.len();
192        };
193        base_count
194            + self
195                .delta_postings
196                .keys()
197                .filter(|trigram| base.lookup_entry(**trigram).is_none())
198                .count()
199    }
200
201    /// Score-rank file candidates and report whether the pre-filter step that
202    /// collects candidates reached its internal size limit before ranking.
203    pub fn lexical_rank_with_stats(
204        &self,
205        query_trigrams: &[u32],
206        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
207        max_files: usize,
208    ) -> LexicalRankResult {
209        if query_trigrams.is_empty() || max_files == 0 {
210            return LexicalRankResult::default();
211        }
212
213        let mut non_zero: Vec<(u32, usize)> = query_trigrams
214            .iter()
215            .filter_map(|trigram| {
216                let posting_count = self.posting_count(*trigram);
217                (posting_count > 0).then_some((*trigram, posting_count))
218            })
219            .collect();
220        if non_zero.is_empty() {
221            return LexicalRankResult::default();
222        }
223
224        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
225        let selected_count = non_zero.len().min(3);
226        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
227
228        let mut candidate_ids = BTreeSet::new();
229        for (trigram, _) in non_zero.iter().take(selected_count) {
230            for file_id in self.postings_for_trigram(*trigram, None) {
231                candidate_ids.insert(file_id);
232            }
233        }
234        let pre_filter_candidate_count = candidate_ids.len();
235        let engine_capped = pre_filter_candidate_count > candidate_cap;
236        let filtered_candidates = candidate_ids
237            .into_iter()
238            .filter_map(|file_id| {
239                self.files
240                    .get(file_id as usize)
241                    .map(|entry| (file_id, entry))
242            })
243            .filter(|(_, entry)| {
244                if let Some(filter) = candidate_filter {
245                    filter(&entry.path)
246                } else {
247                    true
248                }
249            })
250            .collect::<Vec<_>>();
251
252        let mut ranked = Vec::new();
253        for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
254            let score = lexical_score_snapshot(self, query_trigrams, file_id);
255            if score > 0.0 {
256                ranked.push((entry.path.clone(), score));
257            }
258        }
259
260        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
261        ranked.truncate(max_files);
262        LexicalRankResult {
263            files: ranked,
264            engine_capped,
265        }
266    }
267}
268
269#[derive(Clone, Debug, PartialEq, Eq)]
270pub struct Posting {
271    pub file_id: u32,
272    pub next_mask: u8,
273    pub loc_mask: u8,
274}
275
276#[derive(Clone, Debug)]
277pub struct FileEntry {
278    pub path: PathBuf,
279    pub size: u64,
280    pub modified: SystemTime,
281    pub content_hash: blake3::Hash,
282}
283
284#[derive(Clone, Debug, PartialEq, Eq)]
285pub struct GrepMatch {
286    pub file: PathBuf,
287    pub line: u32,
288    pub column: u32,
289    pub line_text: String,
290    pub match_text: String,
291}
292
293#[derive(Clone, Debug)]
294pub struct GrepResult {
295    pub matches: Vec<GrepMatch>,
296    pub total_matches: usize,
297    pub files_searched: usize,
298    pub files_with_matches: usize,
299    pub index_status: IndexStatus,
300    pub truncated: bool,
301    pub fully_degraded: bool,
302    pub engine_capped: bool,
303    /// True when a fallback directory walk stopped early due to file-count or time budget.
304    pub walk_truncated: bool,
305}
306
307#[derive(Clone, Copy, Debug, PartialEq, Eq)]
308pub enum IndexStatus {
309    Ready,
310    Building,
311    Fallback,
312    Disabled,
313}
314
315impl IndexStatus {
316    pub fn as_str(&self) -> &'static str {
317        match self {
318            IndexStatus::Ready => "Ready",
319            IndexStatus::Building => "Building",
320            IndexStatus::Fallback => "Fallback",
321            IndexStatus::Disabled => "Disabled",
322        }
323    }
324}
325
326#[derive(Clone, Debug, Default)]
327pub struct RegexQuery {
328    pub and_trigrams: Vec<u32>,
329    pub or_groups: Vec<Vec<u32>>,
330    pub(crate) and_filters: HashMap<u32, PostingFilter>,
331    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
332}
333
334#[derive(Clone, Copy, Debug, Default)]
335pub(crate) struct PostingFilter {
336    next_mask: u8,
337    loc_mask: u8,
338}
339
340#[derive(Clone, Copy)]
341struct SearchFileMetadata {
342    size: u64,
343    modified: SystemTime,
344}
345
346struct PreparedIndexedFile {
347    metadata: SearchFileMetadata,
348    content_hash: blake3::Hash,
349    trigram_map: BTreeMap<u32, PostingFilter>,
350}
351
352enum PreparedSearchPath {
353    Indexed(PreparedIndexedFile),
354    Unindexed(SearchFileMetadata),
355    Skipped,
356}
357
358#[derive(Clone, Debug, Default)]
359struct QueryBuild {
360    and_runs: Vec<Vec<u8>>,
361    or_groups: Vec<Vec<Vec<u8>>>,
362}
363
364#[derive(Clone, Debug, Default)]
365pub(crate) struct PathFilters {
366    includes: Option<GlobSet>,
367    excludes: Option<GlobSet>,
368}
369
370#[derive(Clone, Debug)]
371pub(crate) struct SearchScope {
372    pub root: PathBuf,
373    pub use_index: bool,
374}
375
376#[derive(Clone, Debug)]
377struct SharedGrepMatch {
378    file: Arc<PathBuf>,
379    line: u32,
380    column: u32,
381    line_text: String,
382    match_text: String,
383}
384
385#[derive(Clone, Debug)]
386enum SearchMatcher {
387    Literal(LiteralSearch),
388    Regex(Regex),
389}
390
391impl SearchIndex {
392    pub fn new() -> Self {
393        SearchIndex {
394            base: None,
395            delta_postings: HashMap::new(),
396            delta_file_trigrams: HashMap::new(),
397            files: Arc::new(Vec::new()),
398            path_to_id: Arc::new(HashMap::new()),
399            ready: false,
400            project_root: PathBuf::new(),
401            git_head: None,
402            max_file_size: DEFAULT_MAX_FILE_SIZE,
403            ignore_rules_fingerprint: String::new(),
404            file_trigram_count: Arc::new(Vec::new()),
405            unindexed_files: Arc::new(HashSet::new()),
406            superseded: HashSet::new(),
407            base_file_count: 0,
408            delta_packed_bytes: 0,
409            compaction_state: Arc::new(Mutex::new(CompactionState::default())),
410        }
411    }
412
413    pub fn build(root: &Path) -> Self {
414        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
415    }
416
417    pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
418        let cache_dir = transient_search_cache_dir(root);
419        Self::build_with_limit_to_cache_dir(root, max_file_size, &cache_dir)
420    }
421
422    pub fn build_with_limit_to_cache_dir(
423        root: &Path,
424        max_file_size: u64,
425        cache_dir: &Path,
426    ) -> Self {
427        let started = std::time::Instant::now();
428        match build_streaming_index(root, max_file_size, cache_dir) {
429            Ok((mut index, indexed)) => {
430                index.ready = true;
431                crate::slog_info!(
432                    "search index cold streaming build: {} files, {} trigrams, {} ms (pool={})",
433                    indexed,
434                    index.trigram_count(),
435                    started.elapsed().as_millis(),
436                    search_index_build_pool_size()
437                );
438                index
439            }
440            Err(error) => {
441                log::warn!(
442                    "search index: streaming build failed ({}); falling back to bounded in-memory delta",
443                    error
444                );
445                let mut index = SearchIndex {
446                    project_root: fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
447                    max_file_size,
448                    ignore_rules_fingerprint: ignore_rules_fingerprint(
449                        &fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
450                    ),
451                    ..SearchIndex::new()
452                };
453                let filters = PathFilters::default();
454                let paths: Vec<PathBuf> = walk_project_files(&index.project_root, &filters);
455                let indexed = index.ingest_paths_parallel(&paths);
456                index.git_head = current_git_head(&index.project_root);
457                index.ready = true;
458                crate::slog_info!(
459                    "search index fallback build: {} files, {} trigrams, {} ms (pool={})",
460                    indexed,
461                    index.trigram_count(),
462                    started.elapsed().as_millis(),
463                    search_index_build_pool_size()
464                );
465                index
466            }
467        }
468    }
469
470    /// Serial cold build for tests and parity checks against [`build_with_limit`].
471    #[cfg(test)]
472    pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
473        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
474        let mut index = SearchIndex {
475            project_root: project_root.clone(),
476            max_file_size,
477            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
478            ..SearchIndex::new()
479        };
480        let filters = PathFilters::default();
481        for path in walk_project_files(&project_root, &filters) {
482            index.update_file(&path);
483        }
484        index.git_head = current_git_head(&project_root);
485        index.ready = true;
486        index
487    }
488
489    fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
490        let max_file_size = self.max_file_size;
491        let pool_size = search_index_build_pool_size();
492        let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
493        let pool = match rayon::ThreadPoolBuilder::new()
494            .num_threads(pool_size)
495            .thread_name(|index| format!("aft-search-build-{index}"))
496            .stack_size(8 * 1024 * 1024)
497            .build()
498        {
499            Ok(pool) => Some(pool),
500            Err(error) => {
501                log::warn!(
502                    "search index: bounded build pool unavailable ({error}); using global pool"
503                );
504                None
505            }
506        };
507
508        let mut indexed = 0usize;
509        for chunk in paths.chunks(chunk_size) {
510            let prepare_chunk = || -> Vec<PreparedSearchPath> {
511                chunk
512                    .par_iter()
513                    .map(|path| prepare_search_path(path, max_file_size))
514                    .collect()
515            };
516            let prepared = match &pool {
517                Some(pool) => pool.install(prepare_chunk),
518                None => prepare_chunk(),
519            };
520
521            for (path, prepared) in chunk.iter().zip(prepared) {
522                let inserted = match prepared {
523                    PreparedSearchPath::Indexed(file) => self.index_prepared_new_file(path, file),
524                    PreparedSearchPath::Unindexed(metadata) => {
525                        self.track_unindexed_file_with_metadata(path, metadata)
526                    }
527                    PreparedSearchPath::Skipped => false,
528                };
529                if inserted {
530                    indexed += 1;
531                }
532            }
533        }
534
535        indexed
536    }
537
538    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
539        self.remove_file(path);
540        let metadata = metadata_for_indexed_content(path, content.len() as u64);
541        self.index_file_with_metadata(path, content, metadata);
542    }
543
544    fn index_file_with_metadata(
545        &mut self,
546        path: &Path,
547        content: &[u8],
548        metadata: SearchFileMetadata,
549    ) -> bool {
550        self.index_prepared_new_file(
551            path,
552            PreparedIndexedFile {
553                metadata,
554                content_hash: cache_freshness::hash_bytes(content),
555                trigram_map: trigram_filter_map(content, true),
556            },
557        )
558    }
559
560    fn index_prepared_new_file(&mut self, path: &Path, file: PreparedIndexedFile) -> bool {
561        let file_id = match self.allocate_file_id_with_metadata(path, file.metadata) {
562            Some(file_id) => file_id,
563            None => return false,
564        };
565        if let Some(entry) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
566            entry.content_hash = file.content_hash;
567        }
568
569        let mut file_trigrams = Vec::with_capacity(file.trigram_map.len());
570        for (trigram, filter) in file.trigram_map {
571            let postings = self.delta_postings.entry(trigram).or_default();
572            postings.push(Posting {
573                file_id,
574                next_mask: filter.next_mask,
575                loc_mask: filter.loc_mask,
576            });
577            if postings.len() > 1
578                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
579            {
580                postings.sort_unstable_by_key(|p| p.file_id);
581            }
582            file_trigrams.push(trigram);
583        }
584
585        let trigram_count = file_trigrams.len() as u32;
586        self.delta_packed_bytes = self
587            .delta_packed_bytes
588            .saturating_add(file_trigrams.len().saturating_mul(POSTING_BYTES));
589        self.delta_file_trigrams.insert(file_id, file_trigrams);
590        ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
591        if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
592            *count = trigram_count;
593        }
594        Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
595        self.update_compaction_flags(Some(path));
596        true
597    }
598
599    pub fn remove_file(&mut self, path: &Path) {
600        let canonical_path = canonicalize_existing_or_deleted_path(path);
601        let file_id = {
602            let path_to_id = Arc::make_mut(&mut self.path_to_id);
603            if let Some(file_id) = path_to_id.remove(path) {
604                file_id
605            } else if canonical_path.as_path() != path {
606                let Some(file_id) = path_to_id.remove(&canonical_path) else {
607                    return;
608                };
609                file_id
610            } else {
611                return;
612            }
613        };
614
615        if file_id < self.base_file_count {
616            self.superseded.insert(file_id);
617        }
618
619        if let Some(trigrams) = self.delta_file_trigrams.remove(&file_id) {
620            self.delta_packed_bytes = self
621                .delta_packed_bytes
622                .saturating_sub(trigrams.len().saturating_mul(POSTING_BYTES));
623            for trigram in trigrams {
624                let should_remove = if let Some(postings) = self.delta_postings.get_mut(&trigram) {
625                    postings.retain(|posting| posting.file_id != file_id);
626                    postings.is_empty()
627                } else {
628                    false
629                };
630
631                if should_remove {
632                    self.delta_postings.remove(&trigram);
633                }
634            }
635        }
636
637        Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
638        if let Some(file) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
639            file.path = PathBuf::new();
640            file.size = 0;
641            file.modified = UNIX_EPOCH;
642            file.content_hash = cache_freshness::zero_hash();
643        }
644        if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
645            *count = 0;
646        }
647        self.update_compaction_flags(Some(path));
648    }
649
650    pub fn update_file(&mut self, path: &Path) {
651        self.remove_file(path);
652
653        let metadata = match fs::metadata(path) {
654            Ok(metadata) if metadata.is_file() => metadata,
655            _ => return,
656        };
657
658        let metadata = search_file_metadata(&metadata);
659
660        if is_binary_path(path, metadata.size) {
661            self.track_unindexed_file_with_metadata(path, metadata);
662            return;
663        }
664
665        if metadata.size > self.max_file_size {
666            self.track_unindexed_file_with_metadata(path, metadata);
667            return;
668        }
669
670        let content = match fs::read(path) {
671            Ok(content) => content,
672            Err(_) => return,
673        };
674
675        if is_binary_bytes(&content) {
676            self.track_unindexed_file_with_metadata(path, metadata);
677            return;
678        }
679
680        self.index_file_with_metadata(path, &content, metadata);
681    }
682
683    pub fn grep(
684        &self,
685        pattern: &str,
686        case_sensitive: bool,
687        include: &[String],
688        exclude: &[String],
689        search_root: &Path,
690        max_results: usize,
691    ) -> GrepResult {
692        self.snapshot().grep(
693            pattern,
694            case_sensitive,
695            include,
696            exclude,
697            search_root,
698            max_results,
699        )
700    }
701
702    pub fn search_grep(
703        &self,
704        pattern: &CompiledPattern,
705        include: &[String],
706        exclude: &[String],
707        search_root: &Path,
708        max_results: usize,
709    ) -> GrepResult {
710        self.snapshot()
711            .search_grep(pattern, include, exclude, search_root, max_results)
712    }
713
714    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
715        self.snapshot().glob(pattern, search_root)
716    }
717
718    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
719        self.snapshot().candidates(query)
720    }
721
722    pub fn write_to_disk(&mut self, cache_dir: &Path, git_head: Option<&str>) {
723        let Some(plan) = CacheWritePlan::from_index(self, git_head) else {
724            return;
725        };
726
727        let write_result = {
728            let mut sources = self.compaction_record_sources(Arc::clone(&plan.id_map));
729            write_cache_file_from_sources(cache_dir, &plan, &mut sources)
730        };
731
732        match write_result {
733            Ok(base) => {
734                self.base = Some(Arc::new(base));
735                self.delta_postings.clear();
736                self.delta_file_trigrams.clear();
737                self.superseded.clear();
738                self.delta_packed_bytes = 0;
739                self.base_file_count = u32::try_from(plan.files.len()).unwrap_or(u32::MAX);
740                self.files = Arc::new(plan.files);
741                self.path_to_id = Arc::new(plan.path_to_id);
742                self.unindexed_files = Arc::new(plan.unindexed_files);
743                self.file_trigram_count = Arc::new(plan.file_trigram_count);
744                self.git_head = plan.git_head.filter(|head| !head.is_empty());
745                self.ignore_rules_fingerprint = plan.ignore_fingerprint;
746            }
747            Err(error) => {
748                log::warn!("search index: failed to write disk cache: {}", error);
749            }
750        }
751    }
752
753    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
754        debug_assert!(current_canonical_root.is_absolute());
755        let cache_path = cache_dir.join("cache.bin");
756        let cache_file = open_cache_file_read(&cache_path).ok()?;
757        let file_len = cache_file.metadata().ok()?.len();
758        if file_len < 16 {
759            return None;
760        }
761
762        let mut reader = BufReader::new(cache_file.try_clone().ok()?);
763        if read_u32(&mut reader).ok()? != CACHE_MAGIC {
764            return None;
765        }
766        if read_u32(&mut reader).ok()? != INDEX_VERSION {
767            return None;
768        }
769        let postings_len_total = read_u64(&mut reader).ok()?;
770        let postings_section_start = reader.stream_position().ok()?;
771        let postings_section_end = postings_section_start.checked_add(postings_len_total)?;
772        if postings_len_total < 4 || postings_section_end > file_len {
773            return None;
774        }
775        let postings_body_end = postings_section_end.checked_sub(4)?;
776
777        let mut magic = [0u8; 8];
778        reader.read_exact(&mut magic).ok()?;
779        if &magic != INDEX_MAGIC {
780            return None;
781        }
782        if read_u32(&mut reader).ok()? != INDEX_VERSION {
783            return None;
784        }
785
786        let head_len = read_u32(&mut reader).ok()? as usize;
787        let root_len = read_u32(&mut reader).ok()? as usize;
788        let ignore_fingerprint_len = read_u32(&mut reader).ok()? as usize;
789        let max_file_size = read_u64(&mut reader).ok()?;
790        let file_count = read_u32(&mut reader).ok()? as usize;
791        if file_count > MAX_ENTRIES {
792            return None;
793        }
794
795        if !reader_has_remaining(&mut reader, postings_body_end, head_len).ok()? {
796            return None;
797        }
798        let mut head_bytes = vec![0u8; head_len];
799        reader.read_exact(&mut head_bytes).ok()?;
800        let git_head = String::from_utf8(head_bytes)
801            .ok()
802            .filter(|head| !head.is_empty());
803
804        if !reader_has_remaining(&mut reader, postings_body_end, root_len).ok()? {
805            return None;
806        }
807        let mut root_bytes = vec![0u8; root_len];
808        reader.read_exact(&mut root_bytes).ok()?;
809        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
810        let project_root = current_canonical_root.to_path_buf();
811
812        if !reader_has_remaining(&mut reader, postings_body_end, ignore_fingerprint_len).ok()? {
813            return None;
814        }
815        let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
816        reader.read_exact(&mut ignore_fingerprint_bytes).ok()?;
817        let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
818        let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
819        if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
820            return None;
821        }
822
823        let mut files = Vec::with_capacity(file_count);
824        let mut path_to_id = HashMap::new();
825        let mut unindexed_files = HashSet::new();
826
827        for file_id in 0..file_count {
828            if !reader_has_remaining(&mut reader, postings_body_end, MIN_FILE_ENTRY_BYTES).ok()? {
829                return None;
830            }
831            let mut unindexed = [0u8; 1];
832            reader.read_exact(&mut unindexed).ok()?;
833            let path_len = read_u32(&mut reader).ok()? as usize;
834            let size = read_u64(&mut reader).ok()?;
835            let secs = read_u64(&mut reader).ok()?;
836            let nanos = read_u32(&mut reader).ok()?;
837            let mut hash_bytes = [0u8; 32];
838            reader.read_exact(&mut hash_bytes).ok()?;
839            let content_hash = blake3::Hash::from_bytes(hash_bytes);
840            if nanos >= 1_000_000_000 {
841                return None;
842            }
843            if !reader_has_remaining(&mut reader, postings_body_end, path_len).ok()? {
844                return None;
845            }
846            let mut path_bytes = vec![0u8; path_len];
847            reader.read_exact(&mut path_bytes).ok()?;
848            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
849            let full_path = cached_path_under_root(&project_root, &relative_path)?;
850            let file_id_u32 = u32::try_from(file_id).ok()?;
851
852            files.push(FileEntry {
853                path: full_path.clone(),
854                size,
855                modified: UNIX_EPOCH + Duration::new(secs, nanos),
856                content_hash,
857            });
858            path_to_id.insert(full_path, file_id_u32);
859            if unindexed[0] == 1 {
860                unindexed_files.insert(file_id_u32);
861            }
862        }
863
864        if !reader_has_remaining(&mut reader, postings_body_end, 8).ok()? {
865            return None;
866        }
867        let postings_blob_len = read_u64(&mut reader).ok()?;
868        let postings_blob_start = reader.stream_position().ok()?;
869        let postings_blob_end = postings_blob_start.checked_add(postings_blob_len)?;
870        if postings_blob_end > postings_body_end || postings_blob_len % POSTING_BYTES as u64 != 0 {
871            return None;
872        }
873
874        let lookup_section_start = postings_section_end;
875        if lookup_section_start >= file_len {
876            return None;
877        }
878        let mut lookup_file = cache_file.try_clone().ok()?;
879        lookup_file
880            .seek(SeekFrom::Start(lookup_section_start))
881            .ok()?;
882        let mut lookup_bytes = Vec::new();
883        lookup_file.read_to_end(&mut lookup_bytes).ok()?;
884        if lookup_bytes.len() < 4 {
885            return None;
886        }
887        verify_crc32_bytes_slice(&lookup_bytes).ok()?;
888        let lookup_body_len = lookup_bytes.len().checked_sub(4)?;
889        let mut lookup_reader = BufReader::new(Cursor::new(&lookup_bytes));
890        let mut lookup_magic = [0u8; 8];
891        lookup_reader.read_exact(&mut lookup_magic).ok()?;
892        if &lookup_magic != LOOKUP_MAGIC {
893            return None;
894        }
895        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
896            return None;
897        }
898        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
899        if entry_count > MAX_ENTRIES {
900            return None;
901        }
902        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
903        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
904        if minimum_lookup_bytes > remaining_lookup {
905            return None;
906        }
907
908        let mut lookup = Vec::with_capacity(entry_count);
909        let mut previous_trigram = None;
910        for _ in 0..entry_count {
911            let trigram = read_u32(&mut lookup_reader).ok()?;
912            let offset = read_u64(&mut lookup_reader).ok()?;
913            let count = read_u32(&mut lookup_reader).ok()?;
914            if count as usize > MAX_ENTRIES {
915                return None;
916            }
917            if previous_trigram.is_some_and(|previous| previous >= trigram) {
918                return None;
919            }
920            previous_trigram = Some(trigram);
921            let bytes_len = (count as u64).checked_mul(POSTING_BYTES as u64)?;
922            let end = offset.checked_add(bytes_len)?;
923            if end > postings_blob_len {
924                return None;
925            }
926            lookup.push(LookupEntry {
927                trigram,
928                offset,
929                count,
930            });
931        }
932
933        let base = BasePostings {
934            file: Arc::new(cache_file),
935            postings_blob_start,
936            postings_blob_len,
937            lookup: Arc::new(lookup),
938        };
939
940        let (file_trigram_count, migrated_counts) = match read_file_trigram_count_extension(
941            &base,
942            postings_blob_end,
943            postings_body_end,
944            file_count,
945        ) {
946            Ok(Some(counts)) => (counts, false),
947            Ok(None) => (
948                compute_file_trigram_counts_from_base(&base, file_count).ok()?,
949                true,
950            ),
951            Err(_) => return None,
952        };
953
954        let mut index = SearchIndex {
955            base: Some(Arc::new(base)),
956            delta_postings: HashMap::new(),
957            delta_file_trigrams: HashMap::new(),
958            files: Arc::new(files),
959            path_to_id: Arc::new(path_to_id),
960            ready: false,
961            project_root,
962            git_head,
963            max_file_size,
964            ignore_rules_fingerprint: current_ignore_rules_fingerprint,
965            file_trigram_count: Arc::new(file_trigram_count),
966            unindexed_files: Arc::new(unindexed_files),
967            superseded: HashSet::new(),
968            base_file_count: u32::try_from(file_count).ok()?,
969            delta_packed_bytes: 0,
970            compaction_state: Arc::new(Mutex::new(CompactionState::default())),
971        };
972
973        if migrated_counts {
974            if let Ok(_lock) = CacheLock::acquire(cache_dir) {
975                let head = index.git_head.clone();
976                index.write_to_disk(cache_dir, head.as_deref());
977            }
978        }
979
980        Some(index)
981    }
982
983    pub fn stored_git_head(&self) -> Option<&str> {
984        self.git_head.as_deref()
985    }
986
987    pub(crate) fn set_ready(&mut self, ready: bool) {
988        self.ready = ready;
989    }
990
991    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
992        self.git_head = current_head;
993        verify_file_mtimes(self);
994        self.ready = true;
995    }
996
997    #[cfg(debug_assertions)]
998    #[doc(hidden)]
999    pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1000        self.verify_against_disk(current_head);
1001    }
1002
1003    pub(crate) fn rebuild_or_refresh(
1004        root: &Path,
1005        max_file_size: u64,
1006        current_head: Option<String>,
1007        baseline: Option<SearchIndex>,
1008        cache_dir: Option<&Path>,
1009    ) -> Self {
1010        if let Some(mut baseline) = baseline {
1011            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1012            baseline.max_file_size = max_file_size;
1013            let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1014            if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1015                return match cache_dir {
1016                    Some(cache_dir) => {
1017                        SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1018                    }
1019                    None => SearchIndex::build_with_limit(root, max_file_size),
1020                };
1021            }
1022            baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1023
1024            if baseline.git_head == current_head || current_head.is_none() {
1025                // HEAD matches, but files may have changed on disk since the index was
1026                // last written (e.g., uncommitted edits, stash pop, manual file changes
1027                // while OpenCode was closed). Verify mtimes and re-index stale files.
1028                // Non-git projects also use this per-file (path, mtime, size)
1029                // fingerprint so unchanged trees reuse the disk cache instead of
1030                // rebuilding every configure.
1031                baseline.git_head = current_head;
1032                verify_file_mtimes(&mut baseline);
1033                baseline.ready = true;
1034                return baseline;
1035            }
1036
1037            if let (Some(previous), Some(current)) =
1038                (baseline.git_head.clone(), current_head.clone())
1039            {
1040                let project_root = baseline.project_root.clone();
1041                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1042                    baseline.git_head = Some(current);
1043                    verify_file_mtimes(&mut baseline);
1044                    baseline.ready = true;
1045                    return baseline;
1046                }
1047            }
1048        }
1049
1050        match cache_dir {
1051            Some(cache_dir) => {
1052                SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1053            }
1054            None => SearchIndex::build_with_limit(root, max_file_size),
1055        }
1056    }
1057
1058    fn allocate_file_id_with_metadata(
1059        &mut self,
1060        path: &Path,
1061        metadata: SearchFileMetadata,
1062    ) -> Option<u32> {
1063        let file_id = u32::try_from(self.files.len()).ok()?;
1064        Arc::make_mut(&mut self.files).push(FileEntry {
1065            path: path.to_path_buf(),
1066            size: metadata.size,
1067            modified: metadata.modified,
1068            content_hash: cache_freshness::zero_hash(),
1069        });
1070        Arc::make_mut(&mut self.path_to_id).insert(path.to_path_buf(), file_id);
1071        ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
1072        Some(file_id)
1073    }
1074
1075    fn track_unindexed_file_with_metadata(
1076        &mut self,
1077        path: &Path,
1078        metadata: SearchFileMetadata,
1079    ) -> bool {
1080        let Some(file_id) = self.allocate_file_id_with_metadata(path, metadata) else {
1081            return false;
1082        };
1083        Arc::make_mut(&mut self.unindexed_files).insert(file_id);
1084        if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
1085            *count = 0;
1086        }
1087        true
1088    }
1089
1090    fn active_file_ids(&self) -> Vec<u32> {
1091        self.snapshot().active_file_ids()
1092    }
1093
1094    #[cfg(test)]
1095    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1096        self.snapshot().postings_for_trigram(trigram, filter)
1097    }
1098
1099    fn update_compaction_flags(&mut self, changed_path: Option<&Path>) {
1100        let delta_files = self.delta_file_trigrams.len();
1101        let hard = delta_files >= DELTA_COMPACT_HARD_FILES
1102            || self.delta_packed_bytes >= DELTA_COMPACT_HARD_BYTES;
1103        let soft = delta_files >= DELTA_COMPACT_SOFT_FILES
1104            || self.delta_packed_bytes >= DELTA_COMPACT_SOFT_BYTES;
1105        if let Ok(mut state) = self.compaction_state.lock() {
1106            if state.running {
1107                if let Some(path) = changed_path {
1108                    state.buffered_paths.push(path.to_path_buf());
1109                }
1110                if soft || hard {
1111                    state.requested_again = true;
1112                }
1113            } else if hard || (soft && !state.requested_again) {
1114                state.requested_again = true;
1115            }
1116        }
1117    }
1118
1119    fn compaction_record_sources(
1120        &self,
1121        id_map: Arc<HashMap<u32, u32>>,
1122    ) -> Vec<Box<dyn PostingRecordSource>> {
1123        let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
1124        if let Some(base) = self.base.clone() {
1125            sources.push(Box::new(BaseRecordSource::new(
1126                base,
1127                Arc::clone(&id_map),
1128                Arc::new(self.superseded.clone()),
1129            )));
1130        }
1131
1132        let mut delta_records = Vec::new();
1133        for (&trigram, postings) in &self.delta_postings {
1134            for posting in postings {
1135                let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
1136                    continue;
1137                };
1138                delta_records.push(SpillRecord {
1139                    trigram,
1140                    file_id: mapped_file_id,
1141                    next_mask: posting.next_mask,
1142                    loc_mask: posting.loc_mask,
1143                });
1144            }
1145        }
1146        if !delta_records.is_empty() {
1147            delta_records.sort_unstable_by_key(|record| (record.trigram, record.file_id));
1148            sources.push(Box::new(VecRecordSource::new(delta_records)));
1149        }
1150        sources
1151    }
1152}
1153
1154impl BasePostings {
1155    fn lookup_entry(&self, trigram: u32) -> Option<LookupEntry> {
1156        self.lookup
1157            .binary_search_by_key(&trigram, |entry| entry.trigram)
1158            .ok()
1159            .and_then(|index| self.lookup.get(index).copied())
1160    }
1161
1162    fn read_postings(&self, entry: LookupEntry) -> std::io::Result<Vec<Posting>> {
1163        let bytes_len = (entry.count as usize)
1164            .checked_mul(POSTING_BYTES)
1165            .ok_or_else(|| std::io::Error::other("posting list too large"))?;
1166        let offset = self
1167            .postings_blob_start
1168            .checked_add(entry.offset)
1169            .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1170        let end = entry
1171            .offset
1172            .checked_add(bytes_len as u64)
1173            .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1174        if end > self.postings_blob_len {
1175            return Err(std::io::Error::other("posting list exceeds blob"));
1176        }
1177        let mut bytes = vec![0u8; bytes_len];
1178        pread_exact(&self.file, offset, &mut bytes)?;
1179        let mut postings = Vec::with_capacity(entry.count as usize);
1180        for chunk in bytes.chunks_exact(POSTING_BYTES) {
1181            postings.push(Posting {
1182                file_id: u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]),
1183                next_mask: chunk[4],
1184                loc_mask: chunk[5],
1185            });
1186        }
1187        Ok(postings)
1188    }
1189}
1190
1191impl SearchIndexSnapshot {
1192    pub fn grep(
1193        &self,
1194        pattern: &str,
1195        case_sensitive: bool,
1196        include: &[String],
1197        exclude: &[String],
1198        search_root: &Path,
1199        max_results: usize,
1200    ) -> GrepResult {
1201        match pattern_compile::compile(
1202            pattern,
1203            CompileOpts {
1204                case_insensitive: !case_sensitive,
1205                ..CompileOpts::default()
1206            },
1207        ) {
1208            CompileResult::Ok(compiled) => {
1209                self.search_grep(&compiled, include, exclude, search_root, max_results)
1210            }
1211            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
1212                self.empty_grep_result()
1213            }
1214        }
1215    }
1216
1217    pub fn search_grep(
1218        &self,
1219        pattern: &CompiledPattern,
1220        include: &[String],
1221        exclude: &[String],
1222        search_root: &Path,
1223        max_results: usize,
1224    ) -> GrepResult {
1225        let matcher = match pattern {
1226            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
1227            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
1228        };
1229
1230        let filters = match build_path_filters(include, exclude) {
1231            Ok(filters) => filters,
1232            Err(_) => PathFilters::default(),
1233        };
1234        let search_root = canonicalize_or_normalize(search_root);
1235
1236        let raw_pattern = pattern.raw_pattern_for_trigrams();
1237        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
1238            RegexQuery::default()
1239        } else {
1240            decompose_regex(&raw_pattern)
1241        };
1242        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
1243        let candidate_ids = self.candidates(&query);
1244
1245        let candidate_files: Vec<&FileEntry> = candidate_ids
1246            .into_iter()
1247            .filter_map(|file_id| self.files.get(file_id as usize))
1248            .filter(|file| !file.path.as_os_str().is_empty())
1249            .filter(|file| is_within_search_root(&search_root, &file.path))
1250            .filter(|file| filters.matches(&self.project_root, &file.path))
1251            .collect();
1252
1253        let total_matches = AtomicUsize::new(0);
1254        let files_searched = AtomicUsize::new(0);
1255        let files_with_matches = AtomicUsize::new(0);
1256        let truncated = AtomicBool::new(false);
1257        let engine_capped = AtomicBool::new(false);
1258        let stop_after = max_results.saturating_mul(2);
1259        let stop_scan = Arc::new(AtomicBool::new(false));
1260
1261        let mut matches = if candidate_files.len() > 10 {
1262            candidate_files
1263                .par_iter()
1264                .map(|file| {
1265                    if grep_scan_should_stop(
1266                        Some(&stop_scan),
1267                        &truncated,
1268                        &total_matches,
1269                        stop_after,
1270                    ) {
1271                        engine_capped.store(true, Ordering::Relaxed);
1272                        return Vec::new();
1273                    }
1274                    search_candidate_file(
1275                        file,
1276                        &matcher,
1277                        max_results,
1278                        stop_after,
1279                        &total_matches,
1280                        &files_searched,
1281                        &files_with_matches,
1282                        &truncated,
1283                        &engine_capped,
1284                        Some(&stop_scan),
1285                    )
1286                })
1287                .reduce(Vec::new, |mut left, mut right| {
1288                    // When concatenating partial match lists from parallel file
1289                    // searches, simply append the chunks. The stop checks in
1290                    // each worker decide whether the result cap was reached.
1291                    left.append(&mut right);
1292                    left
1293                })
1294        } else {
1295            let mut matches = Vec::new();
1296            for file in candidate_files {
1297                matches.extend(search_candidate_file(
1298                    file,
1299                    &matcher,
1300                    max_results,
1301                    stop_after,
1302                    &total_matches,
1303                    &files_searched,
1304                    &files_with_matches,
1305                    &truncated,
1306                    &engine_capped,
1307                    None,
1308                ));
1309
1310                if should_stop_search(&truncated, &total_matches, stop_after) {
1311                    engine_capped.store(true, Ordering::Relaxed);
1312                    break;
1313                }
1314            }
1315            matches
1316        };
1317
1318        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
1319            self.path_to_id
1320                .get(path)
1321                .and_then(|file_id| self.files.get(*file_id as usize))
1322                .map(|file| file.modified)
1323        });
1324
1325        let matches = matches
1326            .into_iter()
1327            .map(|matched| GrepMatch {
1328                file: matched.file.as_ref().clone(),
1329                line: matched.line,
1330                column: matched.column,
1331                line_text: matched.line_text,
1332                match_text: matched.match_text,
1333            })
1334            .collect();
1335
1336        GrepResult {
1337            total_matches: total_matches.load(Ordering::Relaxed),
1338            matches,
1339            files_searched: files_searched.load(Ordering::Relaxed),
1340            files_with_matches: files_with_matches.load(Ordering::Relaxed),
1341            index_status: if self.ready {
1342                IndexStatus::Ready
1343            } else {
1344                IndexStatus::Building
1345            },
1346            truncated: truncated.load(Ordering::Relaxed),
1347            fully_degraded,
1348            engine_capped: engine_capped.load(Ordering::Relaxed),
1349            walk_truncated: false,
1350        }
1351    }
1352
1353    fn empty_grep_result(&self) -> GrepResult {
1354        GrepResult {
1355            matches: Vec::new(),
1356            total_matches: 0,
1357            files_searched: 0,
1358            files_with_matches: 0,
1359            index_status: if self.ready {
1360                IndexStatus::Ready
1361            } else {
1362                IndexStatus::Building
1363            },
1364            truncated: false,
1365            fully_degraded: false,
1366            engine_capped: false,
1367            walk_truncated: false,
1368        }
1369    }
1370
1371    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
1372        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
1373            Ok(filters) => filters,
1374            Err(_) => return Vec::new(),
1375        };
1376        let search_root = canonicalize_or_normalize(search_root);
1377        let mut entries = self
1378            .files
1379            .iter()
1380            .filter(|file| !file.path.as_os_str().is_empty())
1381            .filter(|file| is_within_search_root(&search_root, &file.path))
1382            .filter(|file| filters.matches(&self.project_root, &file.path))
1383            .map(|file| (file.path.clone(), file.modified))
1384            .collect::<Vec<_>>();
1385
1386        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
1387            right_mtime
1388                .cmp(left_mtime)
1389                .then_with(|| left_path.cmp(right_path))
1390        });
1391
1392        entries.into_iter().map(|(path, _)| path).collect()
1393    }
1394
1395    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
1396        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
1397            return self.active_file_ids();
1398        }
1399
1400        let mut and_trigrams = query.and_trigrams.clone();
1401        and_trigrams.sort_unstable_by_key(|trigram| self.posting_count(*trigram));
1402
1403        let mut current: Option<Vec<u32>> = None;
1404
1405        for trigram in and_trigrams {
1406            let filter = query.and_filters.get(&trigram).copied();
1407            let matches = self.postings_for_trigram(trigram, filter);
1408            current = Some(match current.take() {
1409                Some(existing) => intersect_sorted_ids(&existing, &matches),
1410                None => matches,
1411            });
1412
1413            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
1414                break;
1415            }
1416        }
1417
1418        let mut current = current.unwrap_or_else(|| self.active_file_ids());
1419
1420        for (index, group) in query.or_groups.iter().enumerate() {
1421            let mut group_matches = Vec::new();
1422            let filters = query.or_filters.get(index);
1423
1424            for trigram in group {
1425                let filter = filters.and_then(|filters| filters.get(trigram).copied());
1426                let matches = self.postings_for_trigram(*trigram, filter);
1427                if group_matches.is_empty() {
1428                    group_matches = matches;
1429                } else {
1430                    group_matches = union_sorted_ids(&group_matches, &matches);
1431                }
1432            }
1433
1434            current = intersect_sorted_ids(&current, &group_matches);
1435            if current.is_empty() {
1436                break;
1437            }
1438        }
1439
1440        let mut unindexed = self
1441            .unindexed_files
1442            .iter()
1443            .copied()
1444            .filter(|file_id| self.is_active_file(*file_id))
1445            .collect::<Vec<_>>();
1446        if !unindexed.is_empty() {
1447            unindexed.sort_unstable();
1448            current = union_sorted_ids(&current, &unindexed);
1449        }
1450
1451        current
1452    }
1453
1454    fn posting_count(&self, trigram: u32) -> usize {
1455        let base_count = self
1456            .base
1457            .as_ref()
1458            .and_then(|base| base.lookup_entry(trigram))
1459            .map_or(0usize, |entry| entry.count as usize);
1460        base_count.saturating_add(self.delta_postings.get(&trigram).map_or(0usize, Vec::len))
1461    }
1462
1463    fn active_file_ids(&self) -> Vec<u32> {
1464        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1465        ids.retain(|file_id| self.is_active_file(*file_id));
1466        ids.sort_unstable();
1467        ids
1468    }
1469
1470    fn is_active_file(&self, file_id: u32) -> bool {
1471        if self.superseded.contains(&file_id) {
1472            return false;
1473        }
1474        self.files
1475            .get(file_id as usize)
1476            .map(|file| !file.path.as_os_str().is_empty())
1477            .unwrap_or(false)
1478    }
1479
1480    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1481        let mut matches = Vec::new();
1482
1483        if let Some(base_entry) = self
1484            .base
1485            .as_ref()
1486            .and_then(|base| base.lookup_entry(trigram))
1487        {
1488            if let Some(base) = &self.base {
1489                if let Ok(postings) = base.read_postings(base_entry) {
1490                    matches.reserve(postings.len());
1491                    for posting in postings {
1492                        if self.superseded.contains(&posting.file_id) {
1493                            continue;
1494                        }
1495                        if !posting_matches_filter(&posting, filter) {
1496                            continue;
1497                        }
1498                        if self.is_active_file(posting.file_id) {
1499                            matches.push(posting.file_id);
1500                        }
1501                    }
1502                }
1503            }
1504        }
1505
1506        if let Some(postings) = self.delta_postings.get(&trigram) {
1507            matches.reserve(postings.len());
1508            for posting in postings {
1509                if !posting_matches_filter(posting, filter) {
1510                    continue;
1511                }
1512                if self.is_active_file(posting.file_id) {
1513                    matches.push(posting.file_id);
1514                }
1515            }
1516        }
1517
1518        if matches.len() > 1 {
1519            matches.sort_unstable();
1520            matches.dedup();
1521        }
1522        matches
1523    }
1524}
1525
1526fn posting_matches_filter(posting: &Posting, filter: Option<PostingFilter>) -> bool {
1527    if let Some(filter) = filter {
1528        // next_mask is a bloom filter: the character following this trigram in
1529        // the query must also appear after this trigram somewhere in the file.
1530        if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1531            return false;
1532        }
1533        // loc_mask is persisted for future adjacency checks. It is intentionally
1534        // not used as a single-trigram filter because query positions do not
1535        // correspond to file positions.
1536    }
1537    true
1538}
1539
1540fn search_candidate_file(
1541    file: &FileEntry,
1542    matcher: &SearchMatcher,
1543    max_results: usize,
1544    stop_after: usize,
1545    total_matches: &AtomicUsize,
1546    files_searched: &AtomicUsize,
1547    files_with_matches: &AtomicUsize,
1548    truncated: &AtomicBool,
1549    engine_capped: &AtomicBool,
1550    stop_scan: Option<&Arc<AtomicBool>>,
1551) -> Vec<SharedGrepMatch> {
1552    if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1553        engine_capped.store(true, Ordering::Relaxed);
1554        return Vec::new();
1555    }
1556
1557    let content = match read_indexed_file_bytes(&file.path) {
1558        Some(content) => content,
1559        None => return Vec::new(),
1560    };
1561    // Defense in depth: even though indexing tries to filter binaries via
1562    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1563    // query time. content_inspector is fast (~bytes-per-cycle on a small
1564    // preview) and this guarantees we never surface matches inside binary
1565    // files even if the indexer somehow let one through (e.g. file changed
1566    // between indexing and query).
1567    if is_binary_bytes(&content) {
1568        return Vec::new();
1569    }
1570    files_searched.fetch_add(1, Ordering::Relaxed);
1571
1572    let shared_path = Arc::new(file.path.clone());
1573    let mut matches = Vec::new();
1574    let mut line_starts = None;
1575    let mut seen_lines = HashSet::new();
1576    let mut matched_this_file = false;
1577
1578    match matcher {
1579        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1580            let needle = &literal.needle;
1581            let finder = memchr::memmem::Finder::new(needle);
1582            let mut start = 0;
1583
1584            while let Some(position) = finder.find(&content[start..]) {
1585                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1586                    engine_capped.store(true, Ordering::Relaxed);
1587                    break;
1588                }
1589
1590                let offset = start + position;
1591                start = offset + 1;
1592
1593                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1594                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1595                if !seen_lines.insert(line) {
1596                    continue;
1597                }
1598
1599                matched_this_file = true;
1600                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1601                if match_number > max_results {
1602                    truncated.store(true, Ordering::Relaxed);
1603                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1604                    break;
1605                }
1606
1607                let end = offset + needle.len();
1608                matches.push(SharedGrepMatch {
1609                    file: shared_path.clone(),
1610                    line,
1611                    column,
1612                    line_text,
1613                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1614                });
1615            }
1616        }
1617        SearchMatcher::Literal(literal) => {
1618            let needle = &literal.needle;
1619            let search_content = content.to_ascii_lowercase();
1620            let finder = memchr::memmem::Finder::new(needle);
1621            let mut start = 0;
1622
1623            while let Some(position) = finder.find(&search_content[start..]) {
1624                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1625                    engine_capped.store(true, Ordering::Relaxed);
1626                    break;
1627                }
1628
1629                let offset = start + position;
1630                start = offset + 1;
1631
1632                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1633                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1634                if !seen_lines.insert(line) {
1635                    continue;
1636                }
1637
1638                matched_this_file = true;
1639                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1640                if match_number > max_results {
1641                    truncated.store(true, Ordering::Relaxed);
1642                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1643                    break;
1644                }
1645
1646                let end = offset + needle.len();
1647                matches.push(SharedGrepMatch {
1648                    file: shared_path.clone(),
1649                    line,
1650                    column,
1651                    line_text,
1652                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1653                });
1654            }
1655        }
1656        SearchMatcher::Regex(regex) => {
1657            for matched in regex.find_iter(&content) {
1658                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1659                    engine_capped.store(true, Ordering::Relaxed);
1660                    break;
1661                }
1662
1663                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1664                let (line, column, line_text) =
1665                    line_details_bytes(&content, line_starts, matched.start());
1666                if !seen_lines.insert(line) {
1667                    continue;
1668                }
1669
1670                matched_this_file = true;
1671                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1672                if match_number > max_results {
1673                    truncated.store(true, Ordering::Relaxed);
1674                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1675                    break;
1676                }
1677
1678                matches.push(SharedGrepMatch {
1679                    file: shared_path.clone(),
1680                    line,
1681                    column,
1682                    line_text,
1683                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1684                });
1685            }
1686        }
1687    }
1688
1689    if matched_this_file {
1690        files_with_matches.fetch_add(1, Ordering::Relaxed);
1691    }
1692
1693    matches
1694}
1695
1696fn should_stop_search(
1697    truncated: &AtomicBool,
1698    total_matches: &AtomicUsize,
1699    stop_after: usize,
1700) -> bool {
1701    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1702}
1703
1704fn grep_scan_should_stop(
1705    stop_scan: Option<&Arc<AtomicBool>>,
1706    truncated: &AtomicBool,
1707    total_matches: &AtomicUsize,
1708    stop_after: usize,
1709) -> bool {
1710    stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1711        || should_stop_search(truncated, total_matches, stop_after)
1712}
1713
1714fn signal_grep_scan_cap(
1715    stop_scan: Option<&Arc<AtomicBool>>,
1716    total_matches: &AtomicUsize,
1717    stop_after: usize,
1718) {
1719    if let Some(flag) = stop_scan {
1720        if total_matches.load(Ordering::Relaxed) >= stop_after {
1721            flag.store(true, Ordering::Relaxed);
1722        }
1723    }
1724}
1725
1726fn search_file_metadata(metadata: &fs::Metadata) -> SearchFileMetadata {
1727    SearchFileMetadata {
1728        size: metadata.len(),
1729        modified: metadata.modified().unwrap_or(UNIX_EPOCH),
1730    }
1731}
1732
1733fn metadata_for_indexed_content(path: &Path, size_hint: u64) -> SearchFileMetadata {
1734    fs::metadata(path)
1735        .ok()
1736        .map(|metadata| search_file_metadata(&metadata))
1737        .unwrap_or(SearchFileMetadata {
1738            size: size_hint,
1739            modified: UNIX_EPOCH,
1740        })
1741}
1742
1743fn prepare_search_path(path: &Path, max_file_size: u64) -> PreparedSearchPath {
1744    let metadata = match fs::metadata(path) {
1745        Ok(metadata) if metadata.is_file() => search_file_metadata(&metadata),
1746        _ => return PreparedSearchPath::Skipped,
1747    };
1748
1749    if is_binary_path(path, metadata.size) || metadata.size > max_file_size {
1750        return PreparedSearchPath::Unindexed(metadata);
1751    }
1752
1753    let content = match fs::read(path) {
1754        Ok(content) => content,
1755        Err(_) => return PreparedSearchPath::Skipped,
1756    };
1757
1758    if is_binary_bytes(&content) {
1759        return PreparedSearchPath::Unindexed(metadata);
1760    }
1761
1762    PreparedSearchPath::Indexed(PreparedIndexedFile {
1763        metadata,
1764        content_hash: cache_freshness::hash_bytes(&content),
1765        trigram_map: trigram_filter_map(&content, true),
1766    })
1767}
1768
1769/// Returns the worker pool size for cold search-index builds: half of available
1770/// cores, capped at 8 to keep the same limit used by the callgraph store.
1771fn search_index_build_pool_size() -> usize {
1772    std::thread::available_parallelism()
1773        .map(|parallelism| parallelism.get())
1774        .unwrap_or(1)
1775        .div_ceil(2)
1776        .clamp(1, 8)
1777}
1778
1779#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1780struct SpillRecord {
1781    trigram: u32,
1782    file_id: u32,
1783    next_mask: u8,
1784    loc_mask: u8,
1785}
1786
1787struct CacheWritePlan {
1788    project_root: PathBuf,
1789    git_head: Option<String>,
1790    ignore_fingerprint: String,
1791    max_file_size: u64,
1792    files: Vec<FileEntry>,
1793    path_to_id: HashMap<PathBuf, u32>,
1794    unindexed_files: HashSet<u32>,
1795    file_trigram_count: Vec<u32>,
1796    id_map: Arc<HashMap<u32, u32>>,
1797}
1798
1799impl CacheWritePlan {
1800    fn from_index(index: &SearchIndex, git_head: Option<&str>) -> Option<Self> {
1801        let active_ids = index.active_file_ids();
1802        let mut id_map = HashMap::with_capacity(active_ids.len());
1803        for (new_id, old_id) in active_ids.iter().enumerate() {
1804            let new_id = u32::try_from(new_id).ok()?;
1805            id_map.insert(*old_id, new_id);
1806        }
1807
1808        let mut files = Vec::with_capacity(active_ids.len());
1809        let mut path_to_id = HashMap::with_capacity(active_ids.len());
1810        let mut unindexed_files = HashSet::new();
1811        let mut file_trigram_count = Vec::with_capacity(active_ids.len());
1812        for old_id in active_ids {
1813            let new_id = *id_map.get(&old_id)?;
1814            let file = index.files.get(old_id as usize)?.clone();
1815            if file.path.as_os_str().is_empty() {
1816                continue;
1817            }
1818            path_to_id.insert(file.path.clone(), new_id);
1819            if index.unindexed_files.contains(&old_id) {
1820                unindexed_files.insert(new_id);
1821            }
1822            file_trigram_count.push(
1823                index
1824                    .file_trigram_count
1825                    .get(old_id as usize)
1826                    .copied()
1827                    .unwrap_or(0),
1828            );
1829            files.push(file);
1830        }
1831
1832        Some(Self {
1833            project_root: index.project_root.clone(),
1834            git_head: git_head.map(ToOwned::to_owned),
1835            ignore_fingerprint: if index.ignore_rules_fingerprint.is_empty() {
1836                ignore_rules_fingerprint(&index.project_root)
1837            } else {
1838                index.ignore_rules_fingerprint.clone()
1839            },
1840            max_file_size: index.max_file_size,
1841            files,
1842            path_to_id,
1843            unindexed_files,
1844            file_trigram_count,
1845            id_map: Arc::new(id_map),
1846        })
1847    }
1848}
1849
1850trait PostingRecordSource {
1851    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>>;
1852}
1853
1854struct VecRecordSource {
1855    records: Vec<SpillRecord>,
1856    index: usize,
1857}
1858
1859impl VecRecordSource {
1860    fn new(records: Vec<SpillRecord>) -> Self {
1861        Self { records, index: 0 }
1862    }
1863}
1864
1865impl PostingRecordSource for VecRecordSource {
1866    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1867        let record = self.records.get(self.index).copied();
1868        if record.is_some() {
1869            self.index += 1;
1870        }
1871        Ok(record)
1872    }
1873}
1874
1875struct SpillSegmentSource {
1876    reader: BufReader<File>,
1877    remaining_records: u64,
1878    current_trigram: u32,
1879    remaining_in_group: u32,
1880}
1881
1882impl SpillSegmentSource {
1883    fn open(path: &Path) -> std::io::Result<Self> {
1884        let mut reader = BufReader::new(File::open(path)?);
1885        let mut magic = [0u8; 8];
1886        reader.read_exact(&mut magic)?;
1887        if &magic != SPILL_MAGIC {
1888            return Err(std::io::Error::other("invalid search spill magic"));
1889        }
1890        if read_u32(&mut reader)? != INDEX_VERSION {
1891            return Err(std::io::Error::other("invalid search spill version"));
1892        }
1893        let remaining_records = read_u64(&mut reader)?;
1894        Ok(Self {
1895            reader,
1896            remaining_records,
1897            current_trigram: 0,
1898            remaining_in_group: 0,
1899        })
1900    }
1901}
1902
1903impl PostingRecordSource for SpillSegmentSource {
1904    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1905        if self.remaining_records == 0 {
1906            return Ok(None);
1907        }
1908        if self.remaining_in_group == 0 {
1909            self.current_trigram = read_u32(&mut self.reader)?;
1910            self.remaining_in_group = read_u32(&mut self.reader)?;
1911            if self.remaining_in_group == 0 {
1912                return Err(std::io::Error::other("empty search spill group"));
1913            }
1914        }
1915        let mut file_id = [0u8; 4];
1916        self.reader.read_exact(&mut file_id)?;
1917        let mut masks = [0u8; 2];
1918        self.reader.read_exact(&mut masks)?;
1919        self.remaining_in_group -= 1;
1920        self.remaining_records -= 1;
1921        Ok(Some(SpillRecord {
1922            trigram: self.current_trigram,
1923            file_id: u32::from_le_bytes(file_id),
1924            next_mask: masks[0],
1925            loc_mask: masks[1],
1926        }))
1927    }
1928}
1929
1930struct BaseRecordSource {
1931    base: Arc<BasePostings>,
1932    id_map: Arc<HashMap<u32, u32>>,
1933    superseded: Arc<HashSet<u32>>,
1934    lookup_index: usize,
1935    current: Vec<SpillRecord>,
1936    current_index: usize,
1937}
1938
1939impl BaseRecordSource {
1940    fn new(
1941        base: Arc<BasePostings>,
1942        id_map: Arc<HashMap<u32, u32>>,
1943        superseded: Arc<HashSet<u32>>,
1944    ) -> Self {
1945        Self {
1946            base,
1947            id_map,
1948            superseded,
1949            lookup_index: 0,
1950            current: Vec::new(),
1951            current_index: 0,
1952        }
1953    }
1954
1955    fn load_next_group(&mut self) -> std::io::Result<bool> {
1956        while let Some(entry) = self.base.lookup.get(self.lookup_index).copied() {
1957            self.lookup_index += 1;
1958            let postings = self.base.read_postings(entry)?;
1959            self.current.clear();
1960            self.current_index = 0;
1961            for posting in postings {
1962                if self.superseded.contains(&posting.file_id) {
1963                    continue;
1964                }
1965                let Some(mapped_file_id) = self.id_map.get(&posting.file_id).copied() else {
1966                    continue;
1967                };
1968                self.current.push(SpillRecord {
1969                    trigram: entry.trigram,
1970                    file_id: mapped_file_id,
1971                    next_mask: posting.next_mask,
1972                    loc_mask: posting.loc_mask,
1973                });
1974            }
1975            if !self.current.is_empty() {
1976                return Ok(true);
1977            }
1978        }
1979        Ok(false)
1980    }
1981}
1982
1983impl PostingRecordSource for BaseRecordSource {
1984    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1985        if self.current_index >= self.current.len() && !self.load_next_group()? {
1986            return Ok(None);
1987        }
1988        let record = self.current[self.current_index];
1989        self.current_index += 1;
1990        Ok(Some(record))
1991    }
1992}
1993
1994#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1995struct HeapItem {
1996    record: SpillRecord,
1997    source_index: usize,
1998}
1999
2000impl Ord for HeapItem {
2001    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
2002        other
2003            .record
2004            .trigram
2005            .cmp(&self.record.trigram)
2006            .then_with(|| other.record.file_id.cmp(&self.record.file_id))
2007            .then_with(|| other.source_index.cmp(&self.source_index))
2008    }
2009}
2010
2011impl PartialOrd for HeapItem {
2012    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
2013        Some(self.cmp(other))
2014    }
2015}
2016
2017fn build_streaming_index(
2018    root: &Path,
2019    max_file_size: u64,
2020    cache_dir: &Path,
2021) -> std::io::Result<(SearchIndex, usize)> {
2022    fs::create_dir_all(cache_dir)?;
2023    sweep_stale_search_build_dirs(cache_dir);
2024    let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
2025    let ignore_fingerprint = ignore_rules_fingerprint(&project_root);
2026    let filters = PathFilters::default();
2027    let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
2028    let pool_size = search_index_build_pool_size();
2029    let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
2030    let pool = rayon::ThreadPoolBuilder::new()
2031        .num_threads(pool_size)
2032        .thread_name(|index| format!("aft-search-build-{index}"))
2033        .stack_size(8 * 1024 * 1024)
2034        .build()
2035        .ok();
2036
2037    let spill_dir = create_spill_dir(cache_dir)?;
2038    let mut spill_paths = Vec::new();
2039    let mut spill_seq = 0usize;
2040    let mut block: Vec<SpillRecord> = Vec::new();
2041    let mut files = Vec::new();
2042    let mut path_to_id = HashMap::new();
2043    let mut unindexed_files = HashSet::new();
2044    let mut file_trigram_count = Vec::new();
2045    let mut indexed = 0usize;
2046
2047    let build_result = (|| -> std::io::Result<BasePostings> {
2048        for chunk in paths.chunks(chunk_size) {
2049            let prepare_chunk = || -> Vec<PreparedSearchPath> {
2050                chunk
2051                    .par_iter()
2052                    .map(|path| prepare_search_path(path, max_file_size))
2053                    .collect()
2054            };
2055            let prepared = match &pool {
2056                Some(pool) => pool.install(prepare_chunk),
2057                None => prepare_chunk(),
2058            };
2059
2060            for (path, prepared) in chunk.iter().zip(prepared) {
2061                match prepared {
2062                    PreparedSearchPath::Indexed(file) => {
2063                        let file_id = u32::try_from(files.len())
2064                            .map_err(|_| std::io::Error::other("too many files to index"))?;
2065                        files.push(FileEntry {
2066                            path: path.clone(),
2067                            size: file.metadata.size,
2068                            modified: file.metadata.modified,
2069                            content_hash: file.content_hash,
2070                        });
2071                        path_to_id.insert(path.clone(), file_id);
2072                        file_trigram_count.push(file.trigram_map.len() as u32);
2073                        for (trigram, filter) in file.trigram_map {
2074                            block.push(SpillRecord {
2075                                trigram,
2076                                file_id,
2077                                next_mask: filter.next_mask,
2078                                loc_mask: filter.loc_mask,
2079                            });
2080                        }
2081                        indexed += 1;
2082                    }
2083                    PreparedSearchPath::Unindexed(metadata) => {
2084                        let file_id = u32::try_from(files.len())
2085                            .map_err(|_| std::io::Error::other("too many files to index"))?;
2086                        files.push(FileEntry {
2087                            path: path.clone(),
2088                            size: metadata.size,
2089                            modified: metadata.modified,
2090                            content_hash: cache_freshness::zero_hash(),
2091                        });
2092                        path_to_id.insert(path.clone(), file_id);
2093                        unindexed_files.insert(file_id);
2094                        file_trigram_count.push(0);
2095                        indexed += 1;
2096                    }
2097                    PreparedSearchPath::Skipped => {}
2098                }
2099
2100                let block_bytes = block.len().saturating_mul(SPILL_RECORD_ESTIMATED_BYTES);
2101                if block_bytes >= SPIMI_SOFT_LIMIT_BYTES || block_bytes >= SPIMI_HARD_LIMIT_BYTES {
2102                    let path = flush_spill_segment(&spill_dir, spill_seq, &mut block)?;
2103                    spill_paths.push(path);
2104                    spill_seq += 1;
2105                }
2106            }
2107        }
2108
2109        block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2110        let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
2111        for path in &spill_paths {
2112            sources.push(Box::new(SpillSegmentSource::open(path)?));
2113        }
2114        if !block.is_empty() {
2115            sources.push(Box::new(VecRecordSource::new(std::mem::take(&mut block))));
2116        }
2117
2118        let plan = CacheWritePlan {
2119            project_root: project_root.clone(),
2120            git_head: current_git_head(&project_root),
2121            ignore_fingerprint: ignore_fingerprint.clone(),
2122            max_file_size,
2123            files: files.clone(),
2124            path_to_id: path_to_id.clone(),
2125            unindexed_files: unindexed_files.clone(),
2126            file_trigram_count: file_trigram_count.clone(),
2127            id_map: Arc::new(
2128                (0..files.len())
2129                    .filter_map(|id| {
2130                        let id = u32::try_from(id).ok()?;
2131                        Some((id, id))
2132                    })
2133                    .collect(),
2134            ),
2135        };
2136        write_cache_file_from_sources(cache_dir, &plan, &mut sources)
2137    })();
2138
2139    let _ = fs::remove_dir_all(&spill_dir);
2140    let base = build_result?;
2141    let base_file_count =
2142        u32::try_from(files.len()).map_err(|_| std::io::Error::other("too many files to index"))?;
2143    let git_head = current_git_head(&project_root);
2144    let index = SearchIndex {
2145        base: Some(Arc::new(base)),
2146        delta_postings: HashMap::new(),
2147        delta_file_trigrams: HashMap::new(),
2148        files: Arc::new(files),
2149        path_to_id: Arc::new(path_to_id),
2150        ready: false,
2151        project_root,
2152        git_head,
2153        max_file_size,
2154        ignore_rules_fingerprint: ignore_fingerprint,
2155        file_trigram_count: Arc::new(file_trigram_count),
2156        unindexed_files: Arc::new(unindexed_files),
2157        superseded: HashSet::new(),
2158        base_file_count,
2159        delta_packed_bytes: 0,
2160        compaction_state: Arc::new(Mutex::new(CompactionState::default())),
2161    };
2162    Ok((index, indexed))
2163}
2164
2165fn write_cache_file_from_sources(
2166    cache_dir: &Path,
2167    plan: &CacheWritePlan,
2168    sources: &mut [Box<dyn PostingRecordSource>],
2169) -> std::io::Result<BasePostings> {
2170    fs::create_dir_all(cache_dir)?;
2171    sweep_stale_search_build_dirs(cache_dir);
2172    let cache_path = cache_dir.join("cache.bin");
2173    let tmp_cache = cache_dir.join(format!(
2174        "cache.bin.tmp.{}.{}",
2175        std::process::id(),
2176        SystemTime::now()
2177            .duration_since(UNIX_EPOCH)
2178            .unwrap_or(Duration::ZERO)
2179            .as_nanos()
2180    ));
2181
2182    let write_result = (|| -> std::io::Result<BasePostings> {
2183        let raw = OpenOptions::new()
2184            .write(true)
2185            .create_new(true)
2186            .open(&tmp_cache)?;
2187        let mut writer = BufWriter::new(raw);
2188        write_u32(&mut writer, CACHE_MAGIC)?;
2189        write_u32(&mut writer, INDEX_VERSION)?;
2190        let postings_len_patch = writer.stream_position()?;
2191        write_u64(&mut writer, 0)?;
2192
2193        let postings_section_start = writer.stream_position()?;
2194        let postings_header = build_postings_header_bytes(plan)?;
2195        writer.write_all(&postings_header)?;
2196        let postings_blob_len_patch = writer.stream_position()?;
2197        write_u64(&mut writer, 0)?;
2198        let postings_blob_start = writer.stream_position()?;
2199
2200        let (lookup_entries, postings_blob_len) = merge_sources_to_writer(sources, &mut writer)?;
2201        let extension = build_file_trigram_count_extension(&plan.file_trigram_count)?;
2202        writer.write_all(&extension)?;
2203        let postings_crc_end = writer.stream_position()?;
2204
2205        writer.flush()?;
2206        writer.seek(SeekFrom::Start(postings_blob_len_patch))?;
2207        write_u64(&mut writer, postings_blob_len)?;
2208        writer.flush()?;
2209
2210        let checksum = crc32_file_range(
2211            &tmp_cache,
2212            postings_section_start,
2213            postings_crc_end.saturating_sub(postings_section_start),
2214        )?;
2215        writer.seek(SeekFrom::Start(postings_crc_end))?;
2216        writer.write_all(&checksum.to_le_bytes())?;
2217        let postings_section_end = writer.stream_position()?;
2218        let postings_len_total = postings_section_end.saturating_sub(postings_section_start);
2219        writer.seek(SeekFrom::Start(postings_len_patch))?;
2220        write_u64(&mut writer, postings_len_total)?;
2221        writer.seek(SeekFrom::Start(postings_section_end))?;
2222
2223        let lookup_blob = build_lookup_section_bytes(&lookup_entries)?;
2224        writer.write_all(&lookup_blob)?;
2225        writer.flush()?;
2226        writer.get_ref().sync_all()?;
2227        drop(writer);
2228
2229        fs::rename(&tmp_cache, &cache_path)?;
2230        sync_parent_dir(&cache_path);
2231        let file = open_cache_file_read(&cache_path)?;
2232        Ok(BasePostings {
2233            file: Arc::new(file),
2234            postings_blob_start,
2235            postings_blob_len,
2236            lookup: Arc::new(lookup_entries),
2237        })
2238    })();
2239
2240    if write_result.is_err() {
2241        let _ = fs::remove_file(&tmp_cache);
2242    }
2243    write_result
2244}
2245
2246fn merge_sources_to_writer(
2247    sources: &mut [Box<dyn PostingRecordSource>],
2248    writer: &mut BufWriter<File>,
2249) -> std::io::Result<(Vec<LookupEntry>, u64)> {
2250    let mut heap = BinaryHeap::new();
2251    for (source_index, source) in sources.iter_mut().enumerate() {
2252        if let Some(record) = source.next_record()? {
2253            heap.push(HeapItem {
2254                record,
2255                source_index,
2256            });
2257        }
2258    }
2259
2260    let mut lookup_entries = Vec::new();
2261    let mut postings_blob_len = 0u64;
2262    let mut current_trigram: Option<u32> = None;
2263    let mut current_offset = 0u64;
2264    let mut current_count = 0u32;
2265
2266    while let Some(item) = heap.pop() {
2267        let record = item.record;
2268        if current_trigram != Some(record.trigram) {
2269            if let Some(trigram) = current_trigram {
2270                lookup_entries.push(LookupEntry {
2271                    trigram,
2272                    offset: current_offset,
2273                    count: current_count,
2274                });
2275            }
2276            current_trigram = Some(record.trigram);
2277            current_offset = postings_blob_len;
2278            current_count = 0;
2279        }
2280
2281        writer.write_all(&record.file_id.to_le_bytes())?;
2282        writer.write_all(&[record.next_mask, record.loc_mask])?;
2283        postings_blob_len = postings_blob_len
2284            .checked_add(POSTING_BYTES as u64)
2285            .ok_or_else(|| std::io::Error::other("postings blob too large"))?;
2286        current_count = current_count
2287            .checked_add(1)
2288            .ok_or_else(|| std::io::Error::other("posting list too large"))?;
2289
2290        if let Some(next) = sources[item.source_index].next_record()? {
2291            heap.push(HeapItem {
2292                record: next,
2293                source_index: item.source_index,
2294            });
2295        }
2296    }
2297
2298    if let Some(trigram) = current_trigram {
2299        lookup_entries.push(LookupEntry {
2300            trigram,
2301            offset: current_offset,
2302            count: current_count,
2303        });
2304    }
2305
2306    Ok((lookup_entries, postings_blob_len))
2307}
2308
2309fn build_postings_header_bytes(plan: &CacheWritePlan) -> std::io::Result<Vec<u8>> {
2310    let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2311    writer.write_all(INDEX_MAGIC)?;
2312    write_u32(&mut writer, INDEX_VERSION)?;
2313
2314    let head = plan.git_head.as_deref().unwrap_or_default();
2315    let root = plan.project_root.to_string_lossy();
2316    let head_len = u32::try_from(head.len())
2317        .map_err(|_| std::io::Error::other("git head too large to cache"))?;
2318    let root_len = u32::try_from(root.len())
2319        .map_err(|_| std::io::Error::other("project root too large to cache"))?;
2320    let ignore_fingerprint_len = u32::try_from(plan.ignore_fingerprint.len())
2321        .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
2322    let file_count = u32::try_from(plan.files.len())
2323        .map_err(|_| std::io::Error::other("too many files to cache"))?;
2324
2325    write_u32(&mut writer, head_len)?;
2326    write_u32(&mut writer, root_len)?;
2327    write_u32(&mut writer, ignore_fingerprint_len)?;
2328    write_u64(&mut writer, plan.max_file_size)?;
2329    write_u32(&mut writer, file_count)?;
2330    writer.write_all(head.as_bytes())?;
2331    writer.write_all(root.as_bytes())?;
2332    writer.write_all(plan.ignore_fingerprint.as_bytes())?;
2333
2334    for (file_id, file) in plan.files.iter().enumerate() {
2335        let file_id =
2336            u32::try_from(file_id).map_err(|_| std::io::Error::other("too many files to cache"))?;
2337        let path = cache_relative_path(&plan.project_root, &file.path)
2338            .or_else(|| {
2339                fs::canonicalize(&file.path)
2340                    .ok()
2341                    .and_then(|canonical| cache_relative_path(&plan.project_root, &canonical))
2342            })
2343            .ok_or_else(|| {
2344                std::io::Error::other(format!(
2345                    "refusing to cache path outside project root: {}",
2346                    file.path.display()
2347                ))
2348            })?;
2349        let path = path.to_string_lossy();
2350        let path_len = u32::try_from(path.len())
2351            .map_err(|_| std::io::Error::other("cached path too large"))?;
2352        let modified = file
2353            .modified
2354            .duration_since(UNIX_EPOCH)
2355            .unwrap_or(Duration::ZERO);
2356        let unindexed = if plan.unindexed_files.contains(&file_id) {
2357            1u8
2358        } else {
2359            0u8
2360        };
2361
2362        writer.write_all(&[unindexed])?;
2363        write_u32(&mut writer, path_len)?;
2364        write_u64(&mut writer, file.size)?;
2365        write_u64(&mut writer, modified.as_secs())?;
2366        write_u32(&mut writer, modified.subsec_nanos())?;
2367        writer.write_all(file.content_hash.as_bytes())?;
2368        writer.write_all(path.as_bytes())?;
2369    }
2370
2371    writer.flush()?;
2372    Ok(writer
2373        .into_inner()
2374        .map_err(|error| std::io::Error::other(error.to_string()))?
2375        .into_inner())
2376}
2377
2378fn build_lookup_section_bytes(lookup_entries: &[LookupEntry]) -> std::io::Result<Vec<u8>> {
2379    let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2380    let entry_count = u32::try_from(lookup_entries.len())
2381        .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
2382    writer.write_all(LOOKUP_MAGIC)?;
2383    write_u32(&mut writer, INDEX_VERSION)?;
2384    write_u32(&mut writer, entry_count)?;
2385    for entry in lookup_entries {
2386        write_u32(&mut writer, entry.trigram)?;
2387        write_u64(&mut writer, entry.offset)?;
2388        write_u32(&mut writer, entry.count)?;
2389    }
2390    writer.flush()?;
2391    let mut lookup_blob = writer
2392        .into_inner()
2393        .map_err(|error| std::io::Error::other(error.to_string()))?
2394        .into_inner();
2395    let checksum = crc32fast::hash(&lookup_blob);
2396    lookup_blob.extend_from_slice(&checksum.to_le_bytes());
2397    Ok(lookup_blob)
2398}
2399
2400fn build_file_trigram_count_extension(counts: &[u32]) -> std::io::Result<Vec<u8>> {
2401    let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2402    writer.write_all(FILE_TRIGRAM_COUNT_MAGIC)?;
2403    write_u32(&mut writer, INDEX_VERSION)?;
2404    write_u32(
2405        &mut writer,
2406        u32::try_from(counts.len())
2407            .map_err(|_| std::io::Error::other("too many file trigram counts"))?,
2408    )?;
2409    for count in counts {
2410        write_u32(&mut writer, *count)?;
2411    }
2412    writer.flush()?;
2413    Ok(writer
2414        .into_inner()
2415        .map_err(|error| std::io::Error::other(error.to_string()))?
2416        .into_inner())
2417}
2418
2419fn flush_spill_segment(
2420    spill_dir: &Path,
2421    seq: usize,
2422    block: &mut Vec<SpillRecord>,
2423) -> std::io::Result<PathBuf> {
2424    if block.is_empty() {
2425        return Err(std::io::Error::other(
2426            "refusing to write empty search spill",
2427        ));
2428    }
2429    block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2430    let path = spill_dir.join(format!("segment.{seq:06}.bin"));
2431    let mut writer = BufWriter::new(File::create(&path)?);
2432    writer.write_all(SPILL_MAGIC)?;
2433    write_u32(&mut writer, INDEX_VERSION)?;
2434    write_u64(
2435        &mut writer,
2436        u64::try_from(block.len()).map_err(|_| std::io::Error::other("search spill too large"))?,
2437    )?;
2438
2439    let mut index = 0usize;
2440    while index < block.len() {
2441        let trigram = block[index].trigram;
2442        let group_start = index;
2443        while index < block.len() && block[index].trigram == trigram {
2444            index += 1;
2445        }
2446        write_u32(&mut writer, trigram)?;
2447        write_u32(
2448            &mut writer,
2449            u32::try_from(index - group_start)
2450                .map_err(|_| std::io::Error::other("search spill group too large"))?,
2451        )?;
2452        for record in &block[group_start..index] {
2453            writer.write_all(&record.file_id.to_le_bytes())?;
2454            writer.write_all(&[record.next_mask, record.loc_mask])?;
2455        }
2456    }
2457    writer.flush()?;
2458    writer.get_ref().sync_all()?;
2459    block.clear();
2460    Ok(path)
2461}
2462
2463fn create_spill_dir(cache_dir: &Path) -> std::io::Result<PathBuf> {
2464    let dir = cache_dir.join(format!(
2465        "search-build.tmp.{}.{}",
2466        std::process::id(),
2467        SystemTime::now()
2468            .duration_since(UNIX_EPOCH)
2469            .unwrap_or(Duration::ZERO)
2470            .as_nanos()
2471    ));
2472    fs::create_dir_all(&dir)?;
2473    Ok(dir)
2474}
2475
2476fn sweep_stale_search_build_dirs(cache_dir: &Path) {
2477    let Ok(entries) = fs::read_dir(cache_dir) else {
2478        return;
2479    };
2480    for entry in entries.flatten() {
2481        let file_name = entry.file_name();
2482        if file_name.to_string_lossy().starts_with("search-build.tmp.") {
2483            let _ = fs::remove_dir_all(entry.path());
2484        }
2485    }
2486}
2487
2488fn transient_search_cache_dir(root: &Path) -> PathBuf {
2489    std::env::temp_dir().join(format!(
2490        "aft-search-cache.{}.{}.{}",
2491        artifact_cache_key(root),
2492        std::process::id(),
2493        SystemTime::now()
2494            .duration_since(UNIX_EPOCH)
2495            .unwrap_or(Duration::ZERO)
2496            .as_nanos()
2497    ))
2498}
2499
2500fn read_file_trigram_count_extension(
2501    base: &BasePostings,
2502    extension_start: u64,
2503    postings_body_end: u64,
2504    file_count: usize,
2505) -> std::io::Result<Option<Vec<u32>>> {
2506    if extension_start >= postings_body_end {
2507        return Ok(None);
2508    }
2509    let extension_len = postings_body_end - extension_start;
2510    if extension_len < 16 {
2511        return Ok(None);
2512    }
2513    let mut header = [0u8; 16];
2514    pread_exact(&base.file, extension_start, &mut header)?;
2515    if &header[..8] != FILE_TRIGRAM_COUNT_MAGIC {
2516        return Ok(None);
2517    }
2518    let version = u32::from_le_bytes([header[8], header[9], header[10], header[11]]);
2519    if version != INDEX_VERSION {
2520        return Err(std::io::Error::other("invalid file trigram count version"));
2521    }
2522    let count = u32::from_le_bytes([header[12], header[13], header[14], header[15]]) as usize;
2523    if count != file_count {
2524        return Err(std::io::Error::other("file trigram count length mismatch"));
2525    }
2526    let counts_len = count
2527        .checked_mul(4)
2528        .ok_or_else(|| std::io::Error::other("file trigram count extension too large"))?;
2529    if 16u64 + counts_len as u64 > extension_len {
2530        return Err(std::io::Error::other(
2531            "truncated file trigram count extension",
2532        ));
2533    }
2534    let mut bytes = vec![0u8; counts_len];
2535    pread_exact(&base.file, extension_start + 16, &mut bytes)?;
2536    let mut counts = Vec::with_capacity(count);
2537    for chunk in bytes.chunks_exact(4) {
2538        counts.push(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
2539    }
2540    Ok(Some(counts))
2541}
2542
2543fn compute_file_trigram_counts_from_base(
2544    base: &BasePostings,
2545    file_count: usize,
2546) -> std::io::Result<Vec<u32>> {
2547    let mut counts = vec![0u32; file_count];
2548    for entry in base.lookup.iter().copied() {
2549        for posting in base.read_postings(entry)? {
2550            let Some(count) = counts.get_mut(posting.file_id as usize) else {
2551                return Err(std::io::Error::other("posting references missing file"));
2552            };
2553            *count = count.saturating_add(1);
2554        }
2555    }
2556    Ok(counts)
2557}
2558
2559fn ensure_count_slot(counts: &mut Vec<u32>, file_id: u32) {
2560    let len = file_id as usize + 1;
2561    if counts.len() < len {
2562        counts.resize(len, 0);
2563    }
2564}
2565
2566fn reader_has_remaining<R: Seek>(
2567    reader: &mut R,
2568    absolute_end: u64,
2569    len: usize,
2570) -> std::io::Result<bool> {
2571    let position = reader.stream_position()?;
2572    Ok(position <= absolute_end && (len as u64) <= absolute_end - position)
2573}
2574
2575fn crc32_file_range(path: &Path, start: u64, len: u64) -> std::io::Result<u32> {
2576    let mut file = File::open(path)?;
2577    file.seek(SeekFrom::Start(start))?;
2578    let mut hasher = crc32fast::Hasher::new();
2579    let mut remaining = len;
2580    let mut buffer = vec![0u8; 1024 * 1024];
2581    while remaining > 0 {
2582        let read_len = buffer.len().min(remaining as usize);
2583        let bytes_read = file.read(&mut buffer[..read_len])?;
2584        if bytes_read == 0 {
2585            return Err(std::io::Error::new(
2586                std::io::ErrorKind::UnexpectedEof,
2587                "truncated cache while checksumming",
2588            ));
2589        }
2590        hasher.update(&buffer[..bytes_read]);
2591        remaining -= bytes_read as u64;
2592    }
2593    Ok(hasher.finalize())
2594}
2595
2596fn sync_parent_dir(path: &Path) {
2597    if let Some(parent) = path.parent() {
2598        if let Ok(dir) = File::open(parent) {
2599            let _ = dir.sync_all();
2600        }
2601    }
2602}
2603
2604fn open_cache_file_read(path: &Path) -> std::io::Result<File> {
2605    let mut options = OpenOptions::new();
2606    options.read(true);
2607    #[cfg(windows)]
2608    {
2609        use std::os::windows::fs::OpenOptionsExt;
2610        const FILE_SHARE_READ: u32 = 0x0000_0001;
2611        const FILE_SHARE_WRITE: u32 = 0x0000_0002;
2612        const FILE_SHARE_DELETE: u32 = 0x0000_0004;
2613        options.share_mode(FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE);
2614    }
2615    options.open(path)
2616}
2617
2618#[cfg(unix)]
2619fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
2620    use std::os::unix::fs::FileExt;
2621    while !buffer.is_empty() {
2622        let bytes_read = file.read_at(buffer, offset)?;
2623        if bytes_read == 0 {
2624            return Err(std::io::Error::new(
2625                std::io::ErrorKind::UnexpectedEof,
2626                "short pread from search cache",
2627            ));
2628        }
2629        offset += bytes_read as u64;
2630        let (_, rest) = buffer.split_at_mut(bytes_read);
2631        buffer = rest;
2632    }
2633    Ok(())
2634}
2635
2636#[cfg(windows)]
2637fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
2638    use std::os::windows::fs::FileExt;
2639    while !buffer.is_empty() {
2640        let bytes_read = file.seek_read(buffer, offset)?;
2641        if bytes_read == 0 {
2642            return Err(std::io::Error::new(
2643                std::io::ErrorKind::UnexpectedEof,
2644                "short pread from search cache",
2645            ));
2646        }
2647        offset += bytes_read as u64;
2648        let (_, rest) = buffer.split_at_mut(bytes_read);
2649        buffer = rest;
2650    }
2651    Ok(())
2652}
2653
2654fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
2655    let mut merged = Vec::with_capacity(left.len().min(right.len()));
2656    let mut left_index = 0;
2657    let mut right_index = 0;
2658
2659    while left_index < left.len() && right_index < right.len() {
2660        match left[left_index].cmp(&right[right_index]) {
2661            std::cmp::Ordering::Less => left_index += 1,
2662            std::cmp::Ordering::Greater => right_index += 1,
2663            std::cmp::Ordering::Equal => {
2664                merged.push(left[left_index]);
2665                left_index += 1;
2666                right_index += 1;
2667            }
2668        }
2669    }
2670
2671    merged
2672}
2673
2674fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
2675    let mut merged = Vec::with_capacity(left.len() + right.len());
2676    let mut left_index = 0;
2677    let mut right_index = 0;
2678
2679    while left_index < left.len() && right_index < right.len() {
2680        match left[left_index].cmp(&right[right_index]) {
2681            std::cmp::Ordering::Less => {
2682                merged.push(left[left_index]);
2683                left_index += 1;
2684            }
2685            std::cmp::Ordering::Greater => {
2686                merged.push(right[right_index]);
2687                right_index += 1;
2688            }
2689            std::cmp::Ordering::Equal => {
2690                merged.push(left[left_index]);
2691                left_index += 1;
2692                right_index += 1;
2693            }
2694        }
2695    }
2696
2697    merged.extend_from_slice(&left[left_index..]);
2698    merged.extend_from_slice(&right[right_index..]);
2699    merged
2700}
2701
2702pub fn decompose_regex(pattern: &str) -> RegexQuery {
2703    let hir = match regex_syntax::parse(pattern) {
2704        Ok(hir) => hir,
2705        Err(_) => return RegexQuery::default(),
2706    };
2707
2708    let build = build_query(&hir);
2709    build.into_query()
2710}
2711
2712pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
2713    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
2714}
2715
2716pub fn normalize_char(c: u8) -> u8 {
2717    c.to_ascii_lowercase()
2718}
2719
2720fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
2721    if content.len() < 3 {
2722        return;
2723    }
2724
2725    for start in 0..=content.len() - 3 {
2726        let trigram = pack_trigram(
2727            normalize_char(content[start]),
2728            normalize_char(content[start + 1]),
2729            normalize_char(content[start + 2]),
2730        );
2731        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
2732        visit(trigram, next_char, start);
2733    }
2734}
2735
2736pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
2737    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
2738    scan_trigrams(content, |trigram, next_char, position| {
2739        trigrams.push((trigram, next_char, position));
2740    });
2741    trigrams
2742}
2743
2744fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
2745    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2746    scan_trigrams(content, |trigram, next_char, position| {
2747        let entry = filters.entry(trigram).or_default();
2748        if include_eof_next_char || next_char != EOF_SENTINEL {
2749            entry.next_mask |= mask_for_next_char(next_char);
2750        }
2751        entry.loc_mask |= mask_for_position(position);
2752    });
2753    filters
2754}
2755
2756pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
2757    let mut seen = HashSet::new();
2758    let mut out = Vec::new();
2759    for token in tokens {
2760        scan_trigrams(token.as_bytes(), |trigram, _, _| {
2761            if seen.insert(trigram) {
2762                out.push(trigram);
2763            }
2764        });
2765    }
2766    out
2767}
2768
2769pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
2770    lexical_score_snapshot(&index.snapshot(), query_trigrams, file_id)
2771}
2772
2773fn lexical_score_snapshot(
2774    index: &SearchIndexSnapshot,
2775    query_trigrams: &[u32],
2776    file_id: u32,
2777) -> f32 {
2778    if query_trigrams.is_empty() {
2779        return 0.0;
2780    }
2781
2782    let mut hits = 0u32;
2783    for &trigram in query_trigrams {
2784        let postings = index.postings_for_trigram(trigram, None);
2785        if postings.binary_search(&file_id).is_ok() {
2786            hits += 1;
2787        }
2788    }
2789
2790    if hits == 0 {
2791        return 0.0;
2792    }
2793
2794    let file_trigram_count = index
2795        .file_trigram_count
2796        .get(file_id as usize)
2797        .copied()
2798        .unwrap_or(1)
2799        .max(1) as f32;
2800    (hits as f32) / (1.0 + file_trigram_count.ln())
2801}
2802
2803pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
2804    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
2805    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
2806        return PathBuf::from(override_dir)
2807            .join("index")
2808            .join(artifact_cache_key(project_root));
2809    }
2810    // Use configured storage dir (from plugin, XDG-compliant)
2811    if let Some(dir) = storage_dir {
2812        return dir.join("index").join(artifact_cache_key(project_root));
2813    }
2814    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
2815    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
2816    // If neither is set, fall back to a temp directory rather than `"."`
2817    // because the search-index code reads/writes absolute paths.
2818    let home = std::env::var_os("HOME")
2819        .or_else(|| std::env::var_os("USERPROFILE"))
2820        .map(PathBuf::from)
2821        .unwrap_or_else(std::env::temp_dir);
2822    home.join(".cache")
2823        .join("aft")
2824        .join("index")
2825        .join(artifact_cache_key(project_root))
2826}
2827
2828pub(crate) fn build_path_filters(
2829    include: &[String],
2830    exclude: &[String],
2831) -> Result<PathFilters, String> {
2832    Ok(PathFilters {
2833        includes: build_globset(include)?,
2834        excludes: build_globset(exclude)?,
2835    })
2836}
2837
2838pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
2839    walk_project_files_from(root, root, filters)
2840}
2841
2842pub fn walk_project_files_bounded_default(
2843    root: &Path,
2844    max_files: usize,
2845) -> Result<Vec<PathBuf>, usize> {
2846    walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
2847}
2848
2849pub(crate) fn walk_project_files_bounded_matching<F>(
2850    root: &Path,
2851    filters: &PathFilters,
2852    max_files: usize,
2853    matches_file: F,
2854) -> Result<Vec<PathBuf>, usize>
2855where
2856    F: Fn(&Path) -> bool,
2857{
2858    walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
2859}
2860
2861pub fn walk_project_files_bounded_default_matching<F>(
2862    root: &Path,
2863    max_files: usize,
2864    matches_file: F,
2865) -> Result<Vec<PathBuf>, usize>
2866where
2867    F: Fn(&Path) -> bool,
2868{
2869    walk_project_files_from_inner_matching(
2870        root,
2871        root,
2872        &PathFilters::default(),
2873        Some(max_files),
2874        matches_file,
2875        true,
2876    )
2877}
2878
2879pub(crate) fn walk_project_files_from(
2880    filter_root: &Path,
2881    search_root: &Path,
2882    filters: &PathFilters,
2883) -> Vec<PathBuf> {
2884    walk_project_files_from_inner(filter_root, search_root, filters, None, true)
2885        .expect("unbounded project walk cannot exceed a file limit")
2886}
2887
2888pub(crate) fn has_any_project_file_from(
2889    filter_root: &Path,
2890    search_root: &Path,
2891    filters: &PathFilters,
2892) -> bool {
2893    walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
2894}
2895
2896fn walk_project_files_from_inner(
2897    filter_root: &Path,
2898    search_root: &Path,
2899    filters: &PathFilters,
2900    max_files: Option<usize>,
2901    sort_by_mtime: bool,
2902) -> Result<Vec<PathBuf>, usize> {
2903    walk_project_files_from_inner_matching(
2904        filter_root,
2905        search_root,
2906        filters,
2907        max_files,
2908        |_| true,
2909        sort_by_mtime,
2910    )
2911}
2912
2913fn project_walk_builder(search_root: &Path) -> WalkBuilder {
2914    let mut builder = WalkBuilder::new(search_root);
2915    builder
2916        .hidden(false)
2917        .git_ignore(true)
2918        .git_global(true)
2919        .git_exclude(true)
2920        .add_custom_ignore_filename(".aftignore")
2921        .filter_entry(|entry| {
2922            let name = entry.file_name().to_string_lossy();
2923            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2924                return !matches!(
2925                    name.as_ref(),
2926                    "node_modules"
2927                        | "target"
2928                        | "venv"
2929                        | ".venv"
2930                        | ".git"
2931                        | "__pycache__"
2932                        | ".tox"
2933                        | "dist"
2934                        | "build"
2935                );
2936            }
2937            true
2938        });
2939    builder
2940}
2941
2942fn walk_project_files_from_inner_matching<F>(
2943    filter_root: &Path,
2944    search_root: &Path,
2945    filters: &PathFilters,
2946    max_files: Option<usize>,
2947    matches_file: F,
2948    sort_by_mtime: bool,
2949) -> Result<Vec<PathBuf>, usize>
2950where
2951    F: Fn(&Path) -> bool,
2952{
2953    let builder = project_walk_builder(search_root);
2954
2955    let mut files = Vec::new();
2956    for entry in builder.build().filter_map(|entry| entry.ok()) {
2957        if !entry
2958            .file_type()
2959            .map_or(false, |file_type| file_type.is_file())
2960        {
2961            continue;
2962        }
2963        let path = entry.into_path();
2964        if filters.matches(filter_root, &path) && matches_file(&path) {
2965            files.push(path);
2966            if max_files.is_some_and(|limit| files.len() > limit) {
2967                return Err(files.len());
2968            }
2969        }
2970    }
2971
2972    if sort_by_mtime {
2973        sort_paths_by_mtime_desc(&mut files);
2974    }
2975    Ok(files)
2976}
2977
2978pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
2979    let bytes = fs::read(path).ok()?;
2980    if is_binary_bytes(&bytes) {
2981        return None;
2982    }
2983    String::from_utf8(bytes).ok()
2984}
2985
2986fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
2987    fs::read(path).ok()
2988}
2989
2990pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
2991    path.strip_prefix(root)
2992        .map(PathBuf::from)
2993        .unwrap_or_else(|_| path.to_path_buf())
2994}
2995
2996pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
2997    let normalized_root = normalize_path(root);
2998    let normalized_path = normalize_path(path);
2999    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
3000    validate_cached_relative_path(relative)
3001}
3002
3003pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
3004    let relative = validate_cached_relative_path(relative_path)?;
3005    let normalized_root = normalize_path(root);
3006    let full_path = normalize_path(&normalized_root.join(relative));
3007
3008    match fs::canonicalize(&full_path) {
3009        Ok(canonical_path) => {
3010            if canonical_path.starts_with(&normalized_root) {
3011                return Some(full_path);
3012            }
3013
3014            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
3015            canonical_path
3016                .starts_with(&canonical_root)
3017                .then_some(full_path)
3018        }
3019        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
3020    }
3021}
3022
3023pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
3024    if path.is_absolute() {
3025        return None;
3026    }
3027
3028    let mut normalized = PathBuf::new();
3029    for component in path.components() {
3030        match component {
3031            Component::Normal(part) => normalized.push(part),
3032            Component::CurDir => {}
3033            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
3034        }
3035    }
3036    (!normalized.as_os_str().is_empty()).then_some(normalized)
3037}
3038
3039/// Sort paths newest-first by mtime, falling back to lexicographic order.
3040///
3041/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
3042/// `sort_by()` closure. That made the comparator non-deterministic — a
3043/// `stat()` syscall for the same path can return different values across
3044/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
3045/// concurrent file-watcher activity), and Rust's slice::sort panics at
3046/// runtime when it detects a non-total-order comparator. CI hit this on
3047/// a Pi e2e test where the bridge invalidated files in parallel with grep.
3048///
3049/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
3050/// from the map inside the closure. Pure function ⇒ guaranteed total order.
3051pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
3052    use std::collections::HashMap;
3053    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
3054    for path in paths.iter() {
3055        mtimes
3056            .entry(path.clone())
3057            .or_insert_with(|| path_modified_time(path));
3058    }
3059    paths.sort_by(|left, right| {
3060        let left_mtime = mtimes.get(left).and_then(|v| *v);
3061        let right_mtime = mtimes.get(right).and_then(|v| *v);
3062        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
3063    });
3064}
3065
3066/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
3067/// the sort. Same fix, applied to grep matches that share files.
3068pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
3069    use std::collections::HashMap;
3070    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
3071    for m in matches.iter() {
3072        mtimes.entry(m.file.clone()).or_insert_with(|| {
3073            let resolved = resolve_match_path(project_root, &m.file);
3074            path_modified_time(&resolved)
3075        });
3076    }
3077    matches.sort_by(|left, right| {
3078        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
3079        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
3080        right_mtime
3081            .cmp(&left_mtime)
3082            .then_with(|| left.file.cmp(&right.file))
3083            .then_with(|| left.line.cmp(&right.line))
3084            .then_with(|| left.column.cmp(&right.column))
3085    });
3086}
3087
3088/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
3089/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
3090/// table from the search index), but it can still return different values if
3091/// the file is modified mid-sort. Snapshot once.
3092fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
3093    matches: &mut [SharedGrepMatch],
3094    modified_for_path: F,
3095) where
3096    F: Fn(&Path) -> Option<SystemTime>,
3097{
3098    use std::collections::HashMap;
3099    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
3100    for m in matches.iter() {
3101        let path = m.file.as_path().to_path_buf();
3102        mtimes
3103            .entry(path.clone())
3104            .or_insert_with(|| modified_for_path(&path));
3105    }
3106    matches.sort_by(|left, right| {
3107        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
3108        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
3109        right_mtime
3110            .cmp(&left_mtime)
3111            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
3112            .then_with(|| left.line.cmp(&right.line))
3113            .then_with(|| left.column.cmp(&right.column))
3114    });
3115}
3116
3117pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
3118    let resolved_project_root = canonicalize_or_normalize(project_root);
3119    let root = match path {
3120        Some(path) => {
3121            let path = PathBuf::from(path);
3122            if path.is_absolute() {
3123                canonicalize_or_normalize(&path)
3124            } else {
3125                normalize_path(&resolved_project_root.join(path))
3126            }
3127        }
3128        None => resolved_project_root.clone(),
3129    };
3130
3131    let use_index = is_within_search_root(&resolved_project_root, &root);
3132    SearchScope { root, use_index }
3133}
3134
3135pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
3136    content_inspector::inspect(content).is_binary()
3137}
3138
3139pub(crate) fn current_git_head(root: &Path) -> Option<String> {
3140    run_git(root, &["rev-parse", "HEAD"])
3141}
3142
3143/// On-disk ARTIFACT cache key (search, semantic, symbol, callgraph, inspect).
3144///
3145/// For git repos this is the repository ROOT COMMIT — so a linked worktree
3146/// shares the main checkout's index (opened read-only), the deliberate
3147/// worktree-sharing mechanism. For non-git it is the canonical filesystem path.
3148///
3149/// This is the per-REPOSITORY identity. It is intentionally DISTINCT from
3150/// [`crate::path_identity::project_scope_key`] (the per-CHECKOUT identity used
3151/// for bash/compression/backup/checkpoint scoping). Its value is unchanged from
3152/// the historical `project_cache_key`, so existing on-disk caches are NOT
3153/// invalidated by the P0 identity split.
3154pub fn artifact_cache_key(project_root: &Path) -> String {
3155    use sha2::{Digest, Sha256};
3156
3157    let mut hasher = Sha256::new();
3158
3159    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
3160        // Git repo: root commit is the unique identity.
3161        // Same repo cloned anywhere produces the same key.
3162        hasher.update(root_commit.as_bytes());
3163    } else {
3164        // Non-git project: use the canonical filesystem path as identity.
3165        let canonical_root = canonicalize_or_normalize(project_root);
3166        hasher.update(canonical_root.to_string_lossy().as_bytes());
3167    }
3168
3169    let digest = format!("{:x}", hasher.finalize());
3170    digest[..16].to_string()
3171}
3172
3173/// Fingerprint corpus-shaping ignore rules that are not represented by git HEAD.
3174///
3175/// The search cache stores this value next to the file mtimes. If `.gitignore`,
3176/// `.aftignore`, or `.git/info/exclude` changes while AFT is not running, a
3177/// matching HEAD + matching file mtimes is not enough to safely reuse the old
3178/// cache: files that are now ignored may still be indexed. Hashing the ignore
3179/// files themselves makes cold-start cache reuse agree with the current walker.
3180pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
3181    use sha2::{Digest, Sha256};
3182
3183    let root = canonicalize_or_normalize(project_root);
3184    let mut files = Vec::new();
3185    collect_ignore_rule_files(&root, &mut files);
3186    if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
3187        if global_ignore.is_file() {
3188            files.push(global_ignore);
3189        }
3190    }
3191    let info_exclude = git_info_exclude_path(&root);
3192    if info_exclude.is_file() {
3193        files.push(info_exclude);
3194    }
3195    files.sort();
3196    files.dedup();
3197
3198    let mut hasher = Sha256::new();
3199    hasher.update(b"aft-ignore-rules-v1\0");
3200    for path in files {
3201        if let Some(relative) = cache_relative_path(&root, &path) {
3202            hasher.update(relative.to_string_lossy().as_bytes());
3203        } else {
3204            hasher.update(path.to_string_lossy().as_bytes());
3205        }
3206        hasher.update(b"\0");
3207        match fs::read(&path) {
3208            Ok(bytes) => hasher.update(&bytes),
3209            Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
3210        }
3211        hasher.update(b"\0");
3212    }
3213
3214    format!("{:x}", hasher.finalize())
3215}
3216
3217fn git_info_exclude_path(root: &Path) -> PathBuf {
3218    run_git(
3219        root,
3220        &["rev-parse", "--path-format=absolute", "--git-common-dir"],
3221    )
3222    .map(PathBuf::from)
3223    .unwrap_or_else(|| root.join(".git"))
3224    .join("info")
3225    .join("exclude")
3226}
3227
3228fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
3229    let mut builder = WalkBuilder::new(root);
3230    builder
3231        .hidden(false)
3232        .git_ignore(true)
3233        .git_global(true)
3234        .git_exclude(true)
3235        .add_custom_ignore_filename(".aftignore")
3236        .filter_entry(|entry| {
3237            let name = entry.file_name().to_string_lossy();
3238            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
3239                return !matches!(
3240                    name.as_ref(),
3241                    ".git"
3242                        | "node_modules"
3243                        | "target"
3244                        | "venv"
3245                        | ".venv"
3246                        | "__pycache__"
3247                        | ".tox"
3248                        | "dist"
3249                        | "build"
3250                );
3251            }
3252            true
3253        });
3254
3255    for entry in builder.build().filter_map(|entry| entry.ok()) {
3256        if !entry
3257            .file_type()
3258            .map_or(false, |file_type| file_type.is_file())
3259        {
3260            continue;
3261        }
3262        let file_name = entry.file_name();
3263        if file_name == ".gitignore" || file_name == ".aftignore" {
3264            files.push(entry.into_path());
3265        }
3266    }
3267}
3268
3269/// Count directories visited when discovering ignore rule files (for perf regression tests).
3270#[cfg(test)]
3271pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
3272    let mut dirs = 0usize;
3273    let mut builder = WalkBuilder::new(root);
3274    builder
3275        .hidden(false)
3276        .git_ignore(true)
3277        .git_global(true)
3278        .git_exclude(true)
3279        .add_custom_ignore_filename(".aftignore");
3280    for entry in builder.build().filter_map(|entry| entry.ok()) {
3281        if entry.file_type().map_or(false, |ft| ft.is_dir()) {
3282            dirs += 1;
3283        }
3284    }
3285    dirs
3286}
3287
3288/// Legacy stack-based discovery (pre ignore-walker fix); used only in perf tests.
3289#[cfg(test)]
3290pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
3291    let mut stack = vec![root.to_path_buf()];
3292    let mut dirs = 0usize;
3293    while let Some(dir) = stack.pop() {
3294        dirs += 1;
3295        let Ok(entries) = fs::read_dir(&dir) else {
3296            continue;
3297        };
3298        for entry in entries.flatten() {
3299            let path = entry.path();
3300            let file_name = entry.file_name();
3301            if file_name == ".gitignore" || file_name == ".aftignore" {
3302                continue;
3303            }
3304            let Ok(file_type) = entry.file_type() else {
3305                continue;
3306            };
3307            if !file_type.is_dir() || file_type.is_symlink() {
3308                continue;
3309            }
3310            if matches!(
3311                file_name.to_str().unwrap_or(""),
3312                ".git"
3313                    | "node_modules"
3314                    | "target"
3315                    | "venv"
3316                    | ".venv"
3317                    | "__pycache__"
3318                    | ".tox"
3319                    | "dist"
3320                    | "build"
3321            ) {
3322                continue;
3323            }
3324            stack.push(path);
3325        }
3326    }
3327    dirs
3328}
3329
3330impl PathFilters {
3331    pub(crate) fn matches(&self, root: &Path, path: &Path) -> bool {
3332        let relative = to_glob_path(&relative_to_root(root, path));
3333        if self
3334            .includes
3335            .as_ref()
3336            .is_some_and(|includes| !includes.is_match(&relative))
3337        {
3338            return false;
3339        }
3340        if self
3341            .excludes
3342            .as_ref()
3343            .is_some_and(|excludes| excludes.is_match(&relative))
3344        {
3345            return false;
3346        }
3347        true
3348    }
3349}
3350
3351fn canonicalize_or_normalize(path: &Path) -> PathBuf {
3352    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
3353}
3354
3355fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
3356    if path.is_absolute() {
3357        path.to_path_buf()
3358    } else {
3359        project_root.join(path)
3360    }
3361}
3362
3363fn path_modified_time(path: &Path) -> Option<SystemTime> {
3364    fs::metadata(path)
3365        .and_then(|metadata| metadata.modified())
3366        .ok()
3367}
3368
3369fn normalize_path(path: &Path) -> PathBuf {
3370    let mut result = PathBuf::new();
3371    for component in path.components() {
3372        match component {
3373            Component::ParentDir => {
3374                if !result.pop() {
3375                    result.push(component);
3376                }
3377            }
3378            Component::CurDir => {}
3379            _ => result.push(component),
3380        }
3381    }
3382    result
3383}
3384
3385fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
3386    if let Ok(canonical) = fs::canonicalize(path) {
3387        return canonical;
3388    }
3389
3390    let Some(parent) = path.parent() else {
3391        return path.to_path_buf();
3392    };
3393    let Some(file_name) = path.file_name() else {
3394        return path.to_path_buf();
3395    };
3396
3397    fs::canonicalize(parent)
3398        .map(|canonical_parent| canonical_parent.join(file_name))
3399        .unwrap_or_else(|_| path.to_path_buf())
3400}
3401
3402/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
3403/// since the index was last written. Also detect new files and deleted files.
3404fn verify_file_mtimes(index: &mut SearchIndex) {
3405    let filters = PathFilters::default();
3406    let current_files = walk_project_files(&index.project_root, &filters);
3407    let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
3408    let mut stale_paths = Vec::new();
3409    let mut removed_paths = Vec::new();
3410
3411    for entry in Arc::make_mut(&mut index.files).iter_mut() {
3412        if entry.path.as_os_str().is_empty() {
3413            continue; // tombstoned entry
3414        }
3415        if !current_file_set.contains(&entry.path) {
3416            removed_paths.push(entry.path.clone());
3417            continue;
3418        }
3419        let cached = FileFreshness {
3420            mtime: entry.modified,
3421            size: entry.size,
3422            content_hash: entry.content_hash,
3423        };
3424        match cache_freshness::verify_file_strict(&entry.path, &cached) {
3425            FreshnessVerdict::HotFresh => {}
3426            FreshnessVerdict::ContentFresh {
3427                new_mtime,
3428                new_size,
3429            } => {
3430                entry.modified = new_mtime;
3431                entry.size = new_size;
3432            }
3433            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
3434                stale_paths.push(entry.path.clone())
3435            }
3436        }
3437    }
3438
3439    for path in &removed_paths {
3440        index.remove_file(path);
3441    }
3442
3443    // Re-index stale files that are still in the current walk set. If an ignore
3444    // rule changed while AFT was down but the fingerprint missed it, this keeps
3445    // warm-cache verification from resurrecting now-ignored cached entries.
3446    for path in &stale_paths {
3447        if current_file_set.contains(path) {
3448            index.update_file(path);
3449        } else {
3450            index.remove_file(path);
3451        }
3452    }
3453
3454    // Detect new files not in the index
3455    for path in current_files {
3456        if !index.path_to_id.contains_key(&path) {
3457            index.update_file(&path);
3458        }
3459    }
3460
3461    if !stale_paths.is_empty() {
3462        crate::slog_info!(
3463            "search index: refreshed {} stale file(s) from disk cache",
3464            stale_paths.len()
3465        );
3466    }
3467}
3468
3469fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
3470    normalize_path(path).starts_with(normalize_path(search_root))
3471}
3472
3473impl QueryBuild {
3474    fn into_query(self) -> RegexQuery {
3475        let mut query = RegexQuery::default();
3476
3477        for run in self.and_runs {
3478            add_run_to_and_query(&mut query, &run);
3479        }
3480
3481        for group in self.or_groups {
3482            let mut trigrams = BTreeSet::new();
3483            let mut filters = HashMap::new();
3484            for run in group {
3485                for (trigram, filter) in trigram_filters(&run) {
3486                    trigrams.insert(trigram);
3487                    merge_filter(filters.entry(trigram).or_default(), filter);
3488                }
3489            }
3490            if !trigrams.is_empty() {
3491                query.or_groups.push(trigrams.into_iter().collect());
3492                query.or_filters.push(filters);
3493            }
3494        }
3495
3496        query
3497    }
3498}
3499
3500fn build_query(hir: &Hir) -> QueryBuild {
3501    match hir.kind() {
3502        HirKind::Literal(literal) => {
3503            if literal.0.len() >= 3 {
3504                QueryBuild {
3505                    and_runs: vec![literal.0.to_vec()],
3506                    or_groups: Vec::new(),
3507                }
3508            } else {
3509                QueryBuild::default()
3510            }
3511        }
3512        HirKind::Capture(capture) => build_query(&capture.sub),
3513        HirKind::Concat(parts) => {
3514            let mut build = QueryBuild::default();
3515            for part in parts {
3516                let part_build = build_query(part);
3517                build.and_runs.extend(part_build.and_runs);
3518                build.or_groups.extend(part_build.or_groups);
3519            }
3520            build
3521        }
3522        HirKind::Alternation(parts) => {
3523            let mut group = Vec::new();
3524            for part in parts {
3525                let Some(mut choices) = guaranteed_run_choices(part) else {
3526                    return QueryBuild::default();
3527                };
3528                group.append(&mut choices);
3529            }
3530            if group.is_empty() {
3531                QueryBuild::default()
3532            } else {
3533                QueryBuild {
3534                    and_runs: Vec::new(),
3535                    or_groups: vec![group],
3536                }
3537            }
3538        }
3539        HirKind::Repetition(repetition) => {
3540            if repetition.min == 0 {
3541                QueryBuild::default()
3542            } else {
3543                build_query(&repetition.sub)
3544            }
3545        }
3546        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
3547    }
3548}
3549
3550fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
3551    match hir.kind() {
3552        HirKind::Literal(literal) => {
3553            if literal.0.len() >= 3 {
3554                Some(vec![literal.0.to_vec()])
3555            } else {
3556                None
3557            }
3558        }
3559        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
3560        HirKind::Concat(parts) => {
3561            let mut runs = Vec::new();
3562            for part in parts {
3563                if let Some(mut part_runs) = guaranteed_run_choices(part) {
3564                    runs.append(&mut part_runs);
3565                }
3566            }
3567            if runs.is_empty() {
3568                None
3569            } else {
3570                Some(runs)
3571            }
3572        }
3573        HirKind::Alternation(parts) => {
3574            let mut runs = Vec::new();
3575            for part in parts {
3576                let Some(mut part_runs) = guaranteed_run_choices(part) else {
3577                    return None;
3578                };
3579                runs.append(&mut part_runs);
3580            }
3581            if runs.is_empty() {
3582                None
3583            } else {
3584                Some(runs)
3585            }
3586        }
3587        HirKind::Repetition(repetition) => {
3588            if repetition.min == 0 {
3589                None
3590            } else {
3591                guaranteed_run_choices(&repetition.sub)
3592            }
3593        }
3594        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
3595    }
3596}
3597
3598fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
3599    for (trigram, filter) in trigram_filters(run) {
3600        if !query.and_trigrams.contains(&trigram) {
3601            query.and_trigrams.push(trigram);
3602        }
3603        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
3604    }
3605}
3606
3607fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
3608    trigram_filter_map(run, false).into_iter().collect()
3609}
3610
3611fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
3612    target.next_mask |= filter.next_mask;
3613    target.loc_mask |= filter.loc_mask;
3614}
3615
3616fn mask_for_next_char(next_char: u8) -> u8 {
3617    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
3618    1u8 << bit
3619}
3620
3621fn mask_for_position(position: usize) -> u8 {
3622    1u8 << (position % 8)
3623}
3624
3625fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
3626    if patterns.is_empty() {
3627        return Ok(None);
3628    }
3629
3630    let mut builder = GlobSetBuilder::new();
3631    for pattern in patterns {
3632        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
3633        builder.add(glob);
3634    }
3635    builder.build().map(Some).map_err(|error| error.to_string())
3636}
3637
3638fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
3639    let mut buffer = [0u8; 4];
3640    reader.read_exact(&mut buffer)?;
3641    Ok(u32::from_le_bytes(buffer))
3642}
3643
3644fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
3645    let mut buffer = [0u8; 8];
3646    reader.read_exact(&mut buffer)?;
3647    Ok(u64::from_le_bytes(buffer))
3648}
3649
3650fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
3651    writer.write_all(&value.to_le_bytes())
3652}
3653
3654fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
3655    writer.write_all(&value.to_le_bytes())
3656}
3657
3658fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
3659    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
3660        return Err(std::io::Error::other("search index checksum missing"));
3661    };
3662    let expected = u32::from_le_bytes(*stored);
3663    let actual = crc32fast::hash(body);
3664    if actual != expected {
3665        return Err(std::io::Error::other("search index checksum mismatch"));
3666    }
3667    Ok(())
3668}
3669
3670fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
3671    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
3672    total_len.checked_sub(pos)
3673}
3674
3675fn run_git(root: &Path, args: &[&str]) -> Option<String> {
3676    let output = Command::new("git")
3677        .arg("-C")
3678        .arg(root)
3679        .args(args)
3680        .output()
3681        .ok()?;
3682    if !output.status.success() {
3683        return None;
3684    }
3685    let value = String::from_utf8(output.stdout).ok()?;
3686    let value = value.trim().to_string();
3687    if value.is_empty() {
3688        None
3689    } else {
3690        Some(value)
3691    }
3692}
3693
3694fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
3695    let diff_range = format!("{}..{}", from, to);
3696    let output = match Command::new("git")
3697        .arg("-C")
3698        .arg(root)
3699        .args(["diff", "--name-status", "-M", &diff_range])
3700        .output()
3701    {
3702        Ok(output) => output,
3703        Err(_) => return false,
3704    };
3705
3706    if !output.status.success() {
3707        return false;
3708    }
3709
3710    let Ok(diff) = String::from_utf8(output.stdout) else {
3711        return false;
3712    };
3713
3714    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
3715        let mut fields = line.split('\t');
3716        let Some(status) = fields.next() else {
3717            continue;
3718        };
3719
3720        if status.starts_with('R') {
3721            let Some(old_path) = fields
3722                .next()
3723                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3724            else {
3725                continue;
3726            };
3727            let Some(new_path) = fields
3728                .next()
3729                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3730            else {
3731                continue;
3732            };
3733            index.remove_file(&old_path);
3734            index.update_file(&new_path);
3735            continue;
3736        }
3737
3738        let Some(path) = fields
3739            .next()
3740            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3741        else {
3742            continue;
3743        };
3744        if status.starts_with('D') || !path.exists() {
3745            index.remove_file(&path);
3746        } else {
3747            index.update_file(&path);
3748        }
3749    }
3750
3751    true
3752}
3753
3754fn is_binary_path(path: &Path, size: u64) -> bool {
3755    if size == 0 {
3756        return false;
3757    }
3758
3759    let mut file = match File::open(path) {
3760        Ok(file) => file,
3761        Err(_) => return true,
3762    };
3763
3764    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
3765    match file.read(&mut preview) {
3766        Ok(read) => is_binary_bytes(&preview[..read]),
3767        Err(_) => true,
3768    }
3769}
3770
3771fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
3772    let mut starts = vec![0usize];
3773    for (index, byte) in content.iter().copied().enumerate() {
3774        if byte == b'\n' {
3775            starts.push(index + 1);
3776        }
3777    }
3778    starts
3779}
3780
3781fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
3782    let line_index = match line_starts.binary_search(&offset) {
3783        Ok(index) => index,
3784        Err(index) => index.saturating_sub(1),
3785    };
3786    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
3787    let line_end = content[line_start..]
3788        .iter()
3789        .position(|byte| *byte == b'\n')
3790        .map(|length| line_start + length)
3791        .unwrap_or(content.len());
3792    let mut line_slice = &content[line_start..line_end];
3793    if line_slice.ends_with(b"\r") {
3794        line_slice = &line_slice[..line_slice.len() - 1];
3795    }
3796    let line_text = String::from_utf8_lossy(line_slice).into_owned();
3797    let column = String::from_utf8_lossy(&content[line_start..offset])
3798        .chars()
3799        .count() as u32
3800        + 1;
3801    (line_index as u32 + 1, column, line_text)
3802}
3803
3804fn to_glob_path(path: &Path) -> String {
3805    path.to_string_lossy().replace('\\', "/")
3806}
3807
3808#[cfg(test)]
3809mod tests {
3810    use std::process::Command;
3811
3812    use super::*;
3813
3814    #[test]
3815    fn cached_path_under_root_allows_missing_lexical_child() {
3816        let dir = tempfile::tempdir().expect("create temp dir");
3817        let project = dir.path().join("project");
3818        fs::create_dir_all(&project).expect("create project dir");
3819        let root = fs::canonicalize(&project).expect("canonicalize project");
3820
3821        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
3822            .expect("missing child should fall back to lexical validation");
3823
3824        assert_eq!(path, root.join("future/file.rs"));
3825    }
3826
3827    #[cfg(unix)]
3828    #[test]
3829    fn cached_path_under_root_rejects_symlink_escape() {
3830        let dir = tempfile::tempdir().expect("create temp dir");
3831        let project = dir.path().join("project");
3832        let outside = dir.path().join("outside");
3833        fs::create_dir_all(&project).expect("create project dir");
3834        fs::create_dir_all(&outside).expect("create outside dir");
3835        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
3836        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
3837        let root = fs::canonicalize(&project).expect("canonicalize project");
3838
3839        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
3840    }
3841
3842    #[test]
3843    fn extract_trigrams_tracks_next_char_and_position() {
3844        let trigrams = extract_trigrams(b"Rust");
3845        assert_eq!(trigrams.len(), 2);
3846        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
3847        assert_eq!(
3848            trigrams[1],
3849            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
3850        );
3851    }
3852
3853    #[test]
3854    fn index_file_trigram_filters_match_legacy_extraction() {
3855        let dir = tempfile::tempdir().expect("create temp dir");
3856        let path = dir.path().join("sample.txt");
3857        let content = b"Rust rust RUST\nxy";
3858        fs::write(&path, content).expect("write sample");
3859
3860        let mut expected = BTreeMap::new();
3861        for (trigram, next_char, position) in extract_trigrams(content) {
3862            let entry: &mut PostingFilter = expected.entry(trigram).or_default();
3863            entry.next_mask |= mask_for_next_char(next_char);
3864            entry.loc_mask |= mask_for_position(position);
3865        }
3866
3867        let mut index = SearchIndex::new();
3868        index.project_root = dir.path().to_path_buf();
3869        index.index_file(&path, content);
3870
3871        let file_id = *index.path_to_id.get(&path).expect("file indexed");
3872        let file_trigrams = index
3873            .delta_file_trigrams
3874            .get(&file_id)
3875            .expect("delta file trigrams");
3876        assert_eq!(file_trigrams, &expected.keys().copied().collect::<Vec<_>>());
3877        for (trigram, filter) in expected {
3878            let postings = index
3879                .delta_postings
3880                .get(&trigram)
3881                .expect("delta posting list");
3882            assert_eq!(postings.len(), 1);
3883            assert_eq!(postings[0].file_id, file_id);
3884            assert_eq!(postings[0].next_mask, filter.next_mask);
3885            assert_eq!(postings[0].loc_mask, filter.loc_mask);
3886        }
3887    }
3888
3889    #[test]
3890    fn decompose_regex_extracts_literals_and_alternations() {
3891        let query = decompose_regex("abc(def|ghi)xyz");
3892        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
3893        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
3894        assert_eq!(query.or_groups.len(), 1);
3895        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
3896        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
3897    }
3898
3899    #[test]
3900    fn candidates_intersect_posting_lists() {
3901        let mut index = SearchIndex::new();
3902        let dir = tempfile::tempdir().expect("create temp dir");
3903        let alpha = dir.path().join("alpha.txt");
3904        let beta = dir.path().join("beta.txt");
3905        fs::write(&alpha, "abcdef").expect("write alpha");
3906        fs::write(&beta, "abcxyz").expect("write beta");
3907        index.project_root = dir.path().to_path_buf();
3908        index.index_file(&alpha, b"abcdef");
3909        index.index_file(&beta, b"abcxyz");
3910
3911        let query = RegexQuery {
3912            and_trigrams: vec![
3913                pack_trigram(b'a', b'b', b'c'),
3914                pack_trigram(b'd', b'e', b'f'),
3915            ],
3916            ..RegexQuery::default()
3917        };
3918
3919        let candidates = index.candidates(&query);
3920        assert_eq!(candidates.len(), 1);
3921        assert_eq!(index.files[candidates[0] as usize].path, alpha);
3922    }
3923
3924    #[test]
3925    fn candidates_apply_bloom_filters() {
3926        let mut index = SearchIndex::new();
3927        let dir = tempfile::tempdir().expect("create temp dir");
3928        let file = dir.path().join("sample.txt");
3929        fs::write(&file, "abcd efgh").expect("write sample");
3930        index.project_root = dir.path().to_path_buf();
3931        index.index_file(&file, b"abcd efgh");
3932
3933        let trigram = pack_trigram(b'a', b'b', b'c');
3934        let matching_filter = PostingFilter {
3935            next_mask: mask_for_next_char(b'd'),
3936            loc_mask: mask_for_position(0),
3937        };
3938        let non_matching_filter = PostingFilter {
3939            next_mask: mask_for_next_char(b'z'),
3940            loc_mask: mask_for_position(0),
3941        };
3942
3943        assert_eq!(
3944            index
3945                .postings_for_trigram(trigram, Some(matching_filter))
3946                .len(),
3947            1
3948        );
3949        assert!(index
3950            .postings_for_trigram(trigram, Some(non_matching_filter))
3951            .is_empty());
3952    }
3953
3954    #[test]
3955    fn base_delta_readd_masks_base_and_keeps_postings_sorted() {
3956        let dir = tempfile::tempdir().expect("create temp dir");
3957        let project = dir.path().join("project");
3958        fs::create_dir_all(&project).expect("create project dir");
3959        let a = project.join("a.txt");
3960        let b = project.join("b.txt");
3961        fs::write(&a, "abc old").expect("write a");
3962        fs::write(&b, "abc base").expect("write b");
3963
3964        let mut built = SearchIndex::build(&project);
3965        let cache_dir = dir.path().join("cache");
3966        built.write_to_disk(&cache_dir, None);
3967        let mut index = SearchIndex::read_from_disk(&cache_dir, &project).expect("load base");
3968        assert_eq!(index.base_file_count, 2);
3969
3970        let old_a_id = *index.path_to_id.get(&a).expect("original a id");
3971        let b_id = *index.path_to_id.get(&b).expect("original b id");
3972        index.remove_file(&a);
3973        index.index_file(&a, b"abc new");
3974        let new_id = *index.path_to_id.get(&a).expect("re-added file id");
3975        assert!(new_id >= index.base_file_count);
3976        let abc = pack_trigram(b'a', b'b', b'c');
3977        let ids = index.postings_for_trigram(abc, None);
3978        assert_eq!(ids, {
3979            let mut expected = vec![b_id, new_id];
3980            expected.sort_unstable();
3981            expected
3982        });
3983        assert!(!ids.contains(&old_a_id));
3984    }
3985
3986    #[test]
3987    fn write_to_disk_compacts_base_and_delta() {
3988        let dir = tempfile::tempdir().expect("create temp dir");
3989        let project = dir.path().join("project");
3990        fs::create_dir_all(&project).expect("create project dir");
3991        let file = project.join("src.txt");
3992        fs::write(&file, "abcdef").expect("write source");
3993        let mut index = SearchIndex::build(&project);
3994        let cache_dir = dir.path().join("cache");
3995        index.write_to_disk(&cache_dir, None);
3996        fs::write(&file, "abcxyz").expect("edit source");
3997        index.update_file(&file);
3998        assert!(!index.delta_postings.is_empty());
3999        index.write_to_disk(&cache_dir, None);
4000        assert!(index.delta_postings.is_empty());
4001        assert!(index.superseded.is_empty());
4002        assert_eq!(
4003            index.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4004            vec![0]
4005        );
4006        assert!(index
4007            .postings_for_trigram(pack_trigram(b'd', b'e', b'f'), None)
4008            .is_empty());
4009    }
4010
4011    #[test]
4012    fn legacy_cache_without_file_trigram_count_migrates_streaming_counts() {
4013        let dir = tempfile::tempdir().expect("create temp dir");
4014        let project = dir.path().join("project");
4015        fs::create_dir_all(&project).expect("create project dir");
4016        fs::write(project.join("src.txt"), "abcdef").expect("write source");
4017        let cache_dir = dir.path().join("cache");
4018        let mut index = SearchIndex::build(&project);
4019        index.write_to_disk(&cache_dir, None);
4020        let cache_path = cache_dir.join("cache.bin");
4021        strip_file_trigram_count_extension(&cache_path);
4022        assert!(!cache_has_file_trigram_count_extension(&cache_path));
4023
4024        let loaded = SearchIndex::read_from_disk(&cache_dir, &project).expect("load legacy cache");
4025        assert_eq!(loaded.file_trigram_count.as_ref(), &[4]);
4026        assert!(loaded.delta_postings.is_empty());
4027        assert!(cache_has_file_trigram_count_extension(&cache_path));
4028    }
4029
4030    #[test]
4031    fn compaction_flags_buffer_paths_while_running() {
4032        let dir = tempfile::tempdir().expect("create temp dir");
4033        let project = dir.path().join("project");
4034        fs::create_dir_all(&project).expect("create project dir");
4035        let file = project.join("src.txt");
4036        fs::write(&file, "abcdef").expect("write source");
4037        let mut index = SearchIndex::new();
4038        index.project_root = project.clone();
4039        {
4040            let mut state = index.compaction_state.lock().expect("compaction state");
4041            state.running = true;
4042        }
4043        index.update_file(&file);
4044        let state = index.compaction_state.lock().expect("compaction state");
4045        assert!(state.requested_again || !index.delta_postings.is_empty());
4046        assert!(state.buffered_paths.contains(&file));
4047    }
4048
4049    fn cache_has_file_trigram_count_extension(cache_path: &Path) -> bool {
4050        file_trigram_count_extension_range(cache_path).is_some()
4051    }
4052
4053    fn strip_file_trigram_count_extension(cache_path: &Path) {
4054        let mut bytes = fs::read(cache_path).expect("read cache");
4055        let (start, end) = file_trigram_count_extension_range_from_bytes(&bytes)
4056            .expect("file trigram count extension");
4057        bytes.drain(start..end);
4058        let postings_len_total = u64::from_le_bytes(bytes[8..16].try_into().unwrap())
4059            - u64::try_from(end - start).unwrap();
4060        bytes[8..16].copy_from_slice(&postings_len_total.to_le_bytes());
4061        let checksum_pos = 16 + usize::try_from(postings_len_total).unwrap() - 4;
4062        let checksum = crc32fast::hash(&bytes[16..checksum_pos]);
4063        bytes[checksum_pos..checksum_pos + 4].copy_from_slice(&checksum.to_le_bytes());
4064        fs::write(cache_path, bytes).expect("write legacy cache");
4065    }
4066
4067    fn file_trigram_count_extension_range(cache_path: &Path) -> Option<(usize, usize)> {
4068        let bytes = fs::read(cache_path).ok()?;
4069        file_trigram_count_extension_range_from_bytes(&bytes)
4070    }
4071
4072    fn file_trigram_count_extension_range_from_bytes(bytes: &[u8]) -> Option<(usize, usize)> {
4073        let postings_len_total = u64::from_le_bytes(bytes.get(8..16)?.try_into().ok()?) as usize;
4074        let postings_start = 16usize;
4075        let postings_end = postings_start.checked_add(postings_len_total)?;
4076        let postings_body_end = postings_end.checked_sub(4)?;
4077        let mut reader = Cursor::new(&bytes[postings_start..postings_body_end]);
4078        let mut magic = [0u8; 8];
4079        reader.read_exact(&mut magic).ok()?;
4080        if &magic != INDEX_MAGIC {
4081            return None;
4082        }
4083        read_u32(&mut reader).ok()?;
4084        let head_len = read_u32(&mut reader).ok()? as u64;
4085        let root_len = read_u32(&mut reader).ok()? as u64;
4086        let ignore_len = read_u32(&mut reader).ok()? as u64;
4087        read_u64(&mut reader).ok()?;
4088        let file_count = read_u32(&mut reader).ok()? as usize;
4089        let skip = head_len.checked_add(root_len)?.checked_add(ignore_len)?;
4090        reader.seek(SeekFrom::Current(skip as i64)).ok()?;
4091        for _ in 0..file_count {
4092            let mut unindexed = [0u8; 1];
4093            reader.read_exact(&mut unindexed).ok()?;
4094            let path_len = read_u32(&mut reader).ok()? as u64;
4095            read_u64(&mut reader).ok()?;
4096            read_u64(&mut reader).ok()?;
4097            read_u32(&mut reader).ok()?;
4098            let mut hash = [0u8; 32];
4099            reader.read_exact(&mut hash).ok()?;
4100            reader.seek(SeekFrom::Current(path_len as i64)).ok()?;
4101        }
4102        let postings_blob_len = read_u64(&mut reader).ok()? as usize;
4103        let extension_start = postings_start
4104            .checked_add(reader.position() as usize)?
4105            .checked_add(postings_blob_len)?;
4106        if extension_start + 16 > postings_body_end {
4107            return None;
4108        }
4109        if bytes.get(extension_start..extension_start + 8)? != FILE_TRIGRAM_COUNT_MAGIC {
4110            return None;
4111        }
4112        let count = u32::from_le_bytes(
4113            bytes[extension_start + 12..extension_start + 16]
4114                .try_into()
4115                .ok()?,
4116        ) as usize;
4117        let extension_end = extension_start
4118            .checked_add(16)?
4119            .checked_add(count.checked_mul(4)?)?;
4120        (extension_end <= postings_body_end).then_some((extension_start, extension_end))
4121    }
4122
4123    #[test]
4124    fn disk_round_trip_preserves_postings_and_files() {
4125        let dir = tempfile::tempdir().expect("create temp dir");
4126        let project = dir.path().join("project");
4127        fs::create_dir_all(&project).expect("create project dir");
4128        let file = project.join("src.txt");
4129        fs::write(&file, "abcdef").expect("write source");
4130
4131        let mut index = SearchIndex::build(&project);
4132        index.git_head = Some("deadbeef".to_string());
4133        let cache_dir = dir.path().join("cache");
4134        let head = index.git_head.clone();
4135        index.write_to_disk(&cache_dir, head.as_deref());
4136
4137        let loaded =
4138            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
4139        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
4140        assert_eq!(loaded.files.len(), 1);
4141        assert_eq!(
4142            relative_to_root(&loaded.project_root, &loaded.files[0].path),
4143            PathBuf::from("src.txt")
4144        );
4145        assert_eq!(loaded.trigram_count(), index.trigram_count());
4146        assert_eq!(
4147            loaded.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4148            vec![0]
4149        );
4150        assert_eq!(
4151            loaded.file_trigram_count.as_ref(),
4152            index.file_trigram_count.as_ref()
4153        );
4154    }
4155
4156    #[test]
4157    fn cache_path_helpers_reject_absolute_and_parent_paths() {
4158        let root = PathBuf::from("/tmp/aft-project");
4159
4160        assert_eq!(
4161            cache_relative_path(&root, &root.join("src/lib.rs")),
4162            Some(PathBuf::from("src/lib.rs"))
4163        );
4164        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
4165        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
4166        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
4167        assert_eq!(
4168            cached_path_under_root(&root, Path::new("src/./lib.rs")),
4169            Some(root.join("src/lib.rs"))
4170        );
4171    }
4172
4173    #[test]
4174    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
4175        let dir = tempfile::tempdir().expect("create temp dir");
4176        let project = dir.path().join("project");
4177        fs::create_dir_all(&project).expect("create project dir");
4178        let canonical_project = fs::canonicalize(&project).expect("canonical project");
4179        fs::write(project.join("old.txt"), "old token\n").expect("write old");
4180        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
4181
4182        Command::new("git")
4183            .arg("init")
4184            .arg(&project)
4185            .status()
4186            .expect("git init");
4187        for args in [
4188            ["config", "user.email", "aft@example.invalid"],
4189            ["config", "user.name", "AFT Test"],
4190        ] {
4191            Command::new("git")
4192                .arg("-C")
4193                .arg(&project)
4194                .args(args)
4195                .status()
4196                .expect("git config");
4197        }
4198        Command::new("git")
4199            .arg("-C")
4200            .arg(&project)
4201            .args(["add", "."])
4202            .status()
4203            .expect("git add initial");
4204        Command::new("git")
4205            .arg("-C")
4206            .arg(&project)
4207            .args(["commit", "-m", "initial"])
4208            .status()
4209            .expect("git commit initial");
4210        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
4211        let mut baseline = SearchIndex::build(&project);
4212        baseline.git_head = Some(previous.clone());
4213
4214        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
4215        Command::new("git")
4216            .arg("-C")
4217            .arg(&project)
4218            .args(["add", "-A"])
4219            .status()
4220            .expect("git add rename");
4221        Command::new("git")
4222            .arg("-C")
4223            .arg(&project)
4224            .args(["commit", "-m", "rename"])
4225            .status()
4226            .expect("git commit rename");
4227        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
4228
4229        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
4230        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
4231
4232        let refreshed = SearchIndex::rebuild_or_refresh(
4233            &project,
4234            DEFAULT_MAX_FILE_SIZE,
4235            Some(current),
4236            Some(baseline),
4237            None,
4238        );
4239
4240        assert!(!refreshed
4241            .path_to_id
4242            .contains_key(&canonical_project.join("old.txt")));
4243        assert!(refreshed
4244            .path_to_id
4245            .contains_key(&canonical_project.join("new.txt")));
4246        assert!(refreshed
4247            .path_to_id
4248            .contains_key(&canonical_project.join("untracked.txt")));
4249        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
4250        assert_eq!(matches.matches.len(), 1);
4251    }
4252
4253    #[test]
4254    fn read_from_disk_rejects_corrupt_lookup_checksum() {
4255        let dir = tempfile::tempdir().expect("create temp dir");
4256        let project = dir.path().join("project");
4257        fs::create_dir_all(&project).expect("create project dir");
4258        fs::write(project.join("src.txt"), "abcdef").expect("write source");
4259
4260        let mut index = SearchIndex::build(&project);
4261        let cache_dir = dir.path().join("cache");
4262        index.write_to_disk(&cache_dir, None);
4263
4264        let cache_path = cache_dir.join("cache.bin");
4265        let mut bytes = fs::read(&cache_path).expect("read cache");
4266        let last = bytes.len() - 1;
4267        bytes[last] ^= 0xff;
4268        fs::write(&cache_path, bytes).expect("write corrupted cache");
4269
4270        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
4271    }
4272
4273    #[test]
4274    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
4275        let dir = tempfile::tempdir().expect("create temp dir");
4276        let project = dir.path().join("project");
4277        fs::create_dir_all(&project).expect("create project dir");
4278        fs::write(project.join("src.txt"), "abcdef").expect("write source");
4279
4280        let mut index = SearchIndex::build(&project);
4281        let cache_dir = dir.path().join("cache");
4282        index.write_to_disk(&cache_dir, None);
4283
4284        assert!(cache_dir.join("cache.bin").is_file());
4285        assert!(fs::read_dir(&cache_dir)
4286            .expect("read cache dir")
4287            .all(|entry| !entry
4288                .expect("cache entry")
4289                .file_name()
4290                .to_string_lossy()
4291                .contains(".tmp.")));
4292    }
4293
4294    #[test]
4295    fn concurrent_search_index_writes_do_not_corrupt() {
4296        let dir = tempfile::tempdir().expect("create temp dir");
4297        let project = dir.path().join("project");
4298        fs::create_dir_all(&project).expect("create project dir");
4299        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
4300        let cache_dir = dir.path().join("cache");
4301
4302        let a_project = project.clone();
4303        let a_cache = cache_dir.clone();
4304        let a = std::thread::spawn(move || {
4305            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
4306            let mut index = SearchIndex::build(&a_project);
4307            index.write_to_disk(&a_cache, None);
4308        });
4309        let b_project = project.clone();
4310        let b_cache = cache_dir.clone();
4311        let b = std::thread::spawn(move || {
4312            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
4313            let mut index = SearchIndex::build(&b_project);
4314            index.write_to_disk(&b_cache, None);
4315        });
4316        a.join().expect("writer a");
4317        b.join().expect("writer b");
4318
4319        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
4320    }
4321
4322    #[test]
4323    fn search_index_atomic_rename_survives_partial_write() {
4324        let dir = tempfile::tempdir().expect("create temp dir");
4325        let cache_dir = dir.path().join("cache");
4326        fs::create_dir_all(&cache_dir).expect("create cache dir");
4327        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
4328
4329        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
4330    }
4331
4332    #[test]
4333    fn artifact_cache_key_shared_across_clones_of_same_repo() {
4334        let dir = tempfile::tempdir().expect("create temp dir");
4335        let source = dir.path().join("source");
4336        fs::create_dir_all(&source).expect("create source repo dir");
4337        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
4338
4339        assert!(Command::new("git")
4340            .current_dir(&source)
4341            .args(["init"])
4342            .status()
4343            .expect("init git repo")
4344            .success());
4345        assert!(Command::new("git")
4346            .current_dir(&source)
4347            .args(["add", "."])
4348            .status()
4349            .expect("git add")
4350            .success());
4351        assert!(Command::new("git")
4352            .current_dir(&source)
4353            .args([
4354                "-c",
4355                "user.name=AFT Tests",
4356                "-c",
4357                "user.email=aft-tests@example.com",
4358                "commit",
4359                "-m",
4360                "initial",
4361            ])
4362            .status()
4363            .expect("git commit")
4364            .success());
4365
4366        let clone = dir.path().join("clone");
4367        assert!(Command::new("git")
4368            .args(["clone", "--quiet"])
4369            .arg(&source)
4370            .arg(&clone)
4371            .status()
4372            .expect("git clone")
4373            .success());
4374
4375        let source_key = artifact_cache_key(&source);
4376        let clone_key = artifact_cache_key(&clone);
4377
4378        assert_eq!(source_key.len(), 16);
4379        assert_eq!(clone_key.len(), 16);
4380        // Same repo (same root commit) → same cache key regardless of clone path
4381        assert_eq!(source_key, clone_key);
4382    }
4383
4384    #[test]
4385    fn git_head_unchanged_picks_up_local_edits() {
4386        let dir = tempfile::tempdir().expect("create temp dir");
4387        let project = dir.path().join("repo");
4388        fs::create_dir_all(&project).expect("create repo dir");
4389        let file = project.join("tracked.txt");
4390        fs::write(&file, "oldtoken\n").expect("write file");
4391        assert!(Command::new("git")
4392            .current_dir(&project)
4393            .arg("init")
4394            .status()
4395            .unwrap()
4396            .success());
4397        assert!(Command::new("git")
4398            .current_dir(&project)
4399            .args(["add", "."])
4400            .status()
4401            .unwrap()
4402            .success());
4403        assert!(Command::new("git")
4404            .current_dir(&project)
4405            .args([
4406                "-c",
4407                "user.name=AFT Tests",
4408                "-c",
4409                "user.email=aft-tests@example.com",
4410                "commit",
4411                "-m",
4412                "initial"
4413            ])
4414            .status()
4415            .unwrap()
4416            .success());
4417        let head = current_git_head(&project);
4418        let mut baseline = SearchIndex::build(&project);
4419        baseline.git_head = head.clone();
4420        fs::write(&file, "newtoken\n").expect("edit tracked file");
4421
4422        let refreshed = SearchIndex::rebuild_or_refresh(
4423            &project,
4424            DEFAULT_MAX_FILE_SIZE,
4425            head,
4426            Some(baseline),
4427            None,
4428        );
4429        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
4430
4431        assert_eq!(result.total_matches, 1);
4432    }
4433
4434    #[test]
4435    fn non_git_project_reuses_cache_when_files_unchanged() {
4436        let dir = tempfile::tempdir().expect("create temp dir");
4437        let project = dir.path().join("project");
4438        fs::create_dir_all(&project).expect("create project dir");
4439        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
4440        let baseline = SearchIndex::build(&project);
4441        let baseline_file_count = baseline.file_count();
4442
4443        let refreshed = SearchIndex::rebuild_or_refresh(
4444            &project,
4445            DEFAULT_MAX_FILE_SIZE,
4446            None,
4447            Some(baseline),
4448            None,
4449        );
4450
4451        assert_eq!(refreshed.file_count(), baseline_file_count);
4452        assert_eq!(
4453            refreshed
4454                .grep("unchangedtoken", true, &[], &[], &project, 10)
4455                .total_matches,
4456            1
4457        );
4458    }
4459
4460    #[test]
4461    fn resolve_search_scope_disables_index_for_external_path() {
4462        let dir = tempfile::tempdir().expect("create temp dir");
4463        let project = dir.path().join("project");
4464        let outside = dir.path().join("outside");
4465        fs::create_dir_all(&project).expect("create project dir");
4466        fs::create_dir_all(&outside).expect("create outside dir");
4467
4468        let scope = resolve_search_scope(&project, outside.to_str());
4469
4470        assert_eq!(
4471            scope.root,
4472            fs::canonicalize(&outside).expect("canonicalize outside")
4473        );
4474        assert!(!scope.use_index);
4475    }
4476
4477    #[test]
4478    fn grep_filters_matches_to_search_root() {
4479        let dir = tempfile::tempdir().expect("create temp dir");
4480        let project = dir.path().join("project");
4481        let src = project.join("src");
4482        let docs = project.join("docs");
4483        fs::create_dir_all(&src).expect("create src dir");
4484        fs::create_dir_all(&docs).expect("create docs dir");
4485        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
4486        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
4487
4488        let index = SearchIndex::build(&project);
4489        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
4490
4491        assert_eq!(result.files_searched, 1);
4492        assert_eq!(result.files_with_matches, 1);
4493        assert_eq!(result.matches.len(), 1);
4494        // Index stores canonicalized paths; on macOS /var → /private/var
4495        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
4496        assert_eq!(result.matches[0].file, expected);
4497    }
4498
4499    #[test]
4500    fn grep_deduplicates_multiple_matches_on_same_line() {
4501        let dir = tempfile::tempdir().expect("create temp dir");
4502        let project = dir.path().join("project");
4503        let src = project.join("src");
4504        fs::create_dir_all(&src).expect("create src dir");
4505        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
4506
4507        let index = SearchIndex::build(&project);
4508        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
4509
4510        assert_eq!(result.total_matches, 1);
4511        assert_eq!(result.matches.len(), 1);
4512    }
4513
4514    #[test]
4515    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
4516        let dir = tempfile::tempdir().expect("create temp dir");
4517        let project = dir.path().join("project");
4518        fs::create_dir_all(&project).expect("create project dir");
4519        let file = project.join("unicode.txt");
4520        fs::write(&file, "äbc\n").expect("write unicode file");
4521
4522        let index = SearchIndex::build(&project);
4523        let result = index.grep("Äbc", false, &[], &[], &project, 10);
4524
4525        assert_eq!(result.total_matches, 1);
4526        assert_eq!(result.matches.len(), 1);
4527        assert_eq!(
4528            result.matches[0].file,
4529            fs::canonicalize(file).expect("canonicalize unicode file")
4530        );
4531    }
4532
4533    #[test]
4534    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
4535        let dir = tempfile::tempdir().expect("create temp dir");
4536        let project = dir.path().join("project");
4537        fs::create_dir_all(&project).expect("create project dir");
4538        let file = project.join("tokens.txt");
4539        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
4540        fs::write(&file, "alpha").expect("write original file");
4541        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
4542
4543        let baseline = SearchIndex::build(&project);
4544        fs::write(&file, "bravo").expect("write same-size edit");
4545        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
4546
4547        let refreshed = SearchIndex::rebuild_or_refresh(
4548            &project,
4549            DEFAULT_MAX_FILE_SIZE,
4550            None,
4551            Some(baseline),
4552            None,
4553        );
4554        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
4555        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
4556        let refreshed_id = *refreshed
4557            .path_to_id
4558            .get(&canonical_file)
4559            .expect("file remains indexed");
4560
4561        assert_eq!(result.total_matches, 1);
4562        assert!(refreshed
4563            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
4564            .contains(&refreshed_id));
4565        assert!(!refreshed
4566            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
4567            .contains(&refreshed_id));
4568    }
4569
4570    #[test]
4571    fn grep_reports_total_matches_before_truncation() {
4572        let dir = tempfile::tempdir().expect("create temp dir");
4573        let project = dir.path().join("project");
4574        let src = project.join("src");
4575        fs::create_dir_all(&src).expect("create src dir");
4576        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
4577
4578        let index = SearchIndex::build(&project);
4579        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
4580
4581        assert_eq!(result.total_matches, 2);
4582        assert_eq!(result.matches.len(), 1);
4583        assert!(result.truncated);
4584    }
4585
4586    #[test]
4587    fn glob_filters_results_to_search_root() {
4588        let dir = tempfile::tempdir().expect("create temp dir");
4589        let project = dir.path().join("project");
4590        let src = project.join("src");
4591        let scripts = project.join("scripts");
4592        fs::create_dir_all(&src).expect("create src dir");
4593        fs::create_dir_all(&scripts).expect("create scripts dir");
4594        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
4595        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
4596
4597        let index = SearchIndex::build(&project);
4598        let files = index.glob("**/*.rs", &src);
4599
4600        assert_eq!(
4601            files,
4602            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
4603        );
4604    }
4605
4606    #[test]
4607    fn glob_includes_hidden_and_binary_files() {
4608        let dir = tempfile::tempdir().expect("create temp dir");
4609        let project = dir.path().join("project");
4610        let hidden_dir = project.join(".hidden");
4611        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
4612        let hidden_file = hidden_dir.join("data.bin");
4613        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
4614
4615        let index = SearchIndex::build(&project);
4616        let files = index.glob("**/*.bin", &project);
4617
4618        assert_eq!(
4619            files,
4620            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
4621        );
4622    }
4623
4624    #[test]
4625    fn read_from_disk_rejects_invalid_nanos() {
4626        let dir = tempfile::tempdir().expect("create temp dir");
4627        let cache_dir = dir.path().join("cache");
4628        fs::create_dir_all(&cache_dir).expect("create cache dir");
4629
4630        let mut postings = Vec::new();
4631        postings.extend_from_slice(INDEX_MAGIC);
4632        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4633        postings.extend_from_slice(&0u32.to_le_bytes());
4634        postings.extend_from_slice(&1u32.to_le_bytes());
4635        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
4636        postings.extend_from_slice(&1u32.to_le_bytes());
4637        postings.extend_from_slice(b"/");
4638        postings.push(0u8);
4639        postings.extend_from_slice(&1u32.to_le_bytes());
4640        postings.extend_from_slice(&0u64.to_le_bytes());
4641        postings.extend_from_slice(&0u64.to_le_bytes());
4642        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
4643        postings.extend_from_slice(b"a");
4644        postings.extend_from_slice(&0u64.to_le_bytes());
4645
4646        let mut lookup = Vec::new();
4647        lookup.extend_from_slice(LOOKUP_MAGIC);
4648        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4649        lookup.extend_from_slice(&0u32.to_le_bytes());
4650
4651        let postings_checksum = crc32fast::hash(&postings);
4652        postings.extend_from_slice(&postings_checksum.to_le_bytes());
4653        let lookup_checksum = crc32fast::hash(&lookup);
4654        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
4655        let mut cache = Vec::new();
4656        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
4657        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4658        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
4659        cache.extend_from_slice(&postings);
4660        cache.extend_from_slice(&lookup);
4661        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
4662
4663        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
4664    }
4665
4666    #[test]
4667    fn parallel_cold_build_matches_serial_index() {
4668        let dir = tempfile::tempdir().expect("create temp dir");
4669        let project = dir.path().join("project");
4670        for index in 0..80 {
4671            let sub = project.join(format!("pkg_{index:03}"));
4672            fs::create_dir_all(&sub).expect("create subdir");
4673            fs::write(
4674                sub.join("lib.rs"),
4675                format!(
4676                    "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
4677                ),
4678            )
4679            .expect("write lib");
4680        }
4681
4682        let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
4683        let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);
4684
4685        assert_eq!(serial.file_count(), parallel.file_count());
4686        assert_eq!(serial.trigram_count(), parallel.trigram_count());
4687        assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
4688        assert_eq!(
4689            serial.file_trigram_count.as_ref(),
4690            parallel.file_trigram_count.as_ref()
4691        );
4692        for (path, id) in serial.path_to_id.iter() {
4693            assert_eq!(parallel.path_to_id.get(path), Some(id));
4694        }
4695        for (serial_file, parallel_file) in serial.files.iter().zip(parallel.files.iter()) {
4696            assert_eq!(serial_file.path, parallel_file.path);
4697            assert_eq!(serial_file.size, parallel_file.size);
4698            assert_eq!(serial_file.modified, parallel_file.modified);
4699            assert_eq!(serial_file.content_hash, parallel_file.content_hash);
4700        }
4701
4702        let serial_grep = serial.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
4703        let parallel_grep = parallel.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
4704        assert_eq!(serial_grep.matches, parallel_grep.matches);
4705        assert_eq!(serial_grep.total_matches, parallel_grep.total_matches);
4706        assert_eq!(serial_grep.files_searched, parallel_grep.files_searched);
4707        assert_eq!(
4708            serial_grep.files_with_matches,
4709            parallel_grep.files_with_matches
4710        );
4711    }
4712
4713    #[test]
4714    fn ignore_rule_discovery_respects_gitignore() {
4715        let dir = tempfile::tempdir().expect("create temp dir");
4716        let project = dir.path().join("project");
4717        fs::create_dir_all(project.join("src")).expect("mkdir src");
4718        fs::write(project.join("src/.gitignore"), "data/\n").expect("write gitignore");
4719        let data = project.join("src/data");
4720        fs::create_dir_all(&data).expect("mkdir data");
4721        for index in 0..200 {
4722            fs::create_dir_all(data.join(format!("d{index}"))).expect("mkdir nested");
4723            fs::write(data.join(format!("d{index}/f.rs")), "fn ignored() {}\n")
4724                .expect("write ignored file");
4725        }
4726
4727        Command::new("git")
4728            .arg("init")
4729            .arg(&project)
4730            .status()
4731            .expect("git init");
4732        for args in [
4733            ["config", "user.email", "aft@example.invalid"],
4734            ["config", "user.name", "AFT Test"],
4735        ] {
4736            Command::new("git")
4737                .arg("-C")
4738                .arg(&project)
4739                .args(args)
4740                .status()
4741                .expect("git config");
4742        }
4743        Command::new("git")
4744            .arg("-C")
4745            .arg(&project)
4746            .args(["add", "."])
4747            .status()
4748            .expect("git add");
4749        Command::new("git")
4750            .arg("-C")
4751            .arg(&project)
4752            .args(["commit", "-m", "initial"])
4753            .status()
4754            .expect("git commit");
4755
4756        let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project);
4757        let walker_dirs = count_ignore_rule_discovery_dirs(&project);
4758        assert!(
4759            legacy_dirs > walker_dirs,
4760            "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
4761        );
4762        assert!(
4763            walker_dirs < 50,
4764            "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
4765        );
4766    }
4767
4768    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
4769    /// changed between cmp() calls.
4770    ///
4771    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
4772    /// which does a `stat()` syscall. If the file was deleted, modified, or
4773    /// touched mid-sort, the comparator returned different values for the
4774    /// same input pair on different invocations. Rust's slice::sort detects
4775    /// this and panics with "user-provided comparison function does not
4776    /// correctly implement a total order".
4777    ///
4778    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
4779    /// bridge invalidated files in parallel with grep's sort path. This
4780    /// test simulates the worst case: most paths don't exist (Err from
4781    /// fs::metadata) and sort still completes successfully.
4782    #[test]
4783    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
4784        // Mix of existing and non-existing paths in deliberately
4785        // non-monotonic order — pre-fix, the sort would call stat() at
4786        // least N log N times and any flakiness would trigger the panic.
4787        let dir = tempfile::tempdir().expect("create tempdir");
4788        let mut paths: Vec<PathBuf> = Vec::new();
4789        for i in 0..30 {
4790            // Half exist, half don't.
4791            let path = if i % 2 == 0 {
4792                let p = dir.path().join(format!("real-{i}.rs"));
4793                fs::write(&p, format!("// {i}\n")).expect("write");
4794                p
4795            } else {
4796                dir.path().join(format!("missing-{i}.rs"))
4797            };
4798            paths.push(path);
4799        }
4800
4801        // Run the sort many times to maximise the chance of catching any
4802        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
4803        for _ in 0..50 {
4804            let mut copy = paths.clone();
4805            sort_paths_by_mtime_desc(&mut copy);
4806            assert_eq!(copy.len(), paths.len());
4807        }
4808    }
4809
4810    /// Regression: the indexed parallel search's reduce() combine closure must
4811    /// NOT set engine_capped. reduce runs on every partial-result merge in a
4812    /// multi-chunk parallel search (>10 candidate files), capped or not — an
4813    /// unconditional store there falsely reported every such grep as capped,
4814    /// lying to the agent that results were truncated.
4815    #[test]
4816    fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
4817        let dir = tempfile::tempdir().expect("create tempdir");
4818        // >10 files so the parallel (reduce) branch is taken, each with exactly
4819        // one match, and a generous cap so the search is NOT actually capped.
4820        for i in 0..40 {
4821            fs::write(
4822                dir.path().join(format!("file-{i}.rs")),
4823                format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n"),
4824            )
4825            .expect("write");
4826        }
4827        let index = SearchIndex::build_with_limit(dir.path(), DEFAULT_MAX_FILE_SIZE);
4828        let result = index.grep("needle_token", false, &[], &[], dir.path(), 1000);
4829        assert!(
4830            result.matches.len() >= 40,
4831            "expected a match per file, got {}",
4832            result.matches.len()
4833        );
4834        assert!(
4835            !result.engine_capped,
4836            "an uncapped grep over >10 files must not report engine_capped"
4837        );
4838        assert!(!result.truncated, "uncapped grep must not be truncated");
4839    }
4840
4841    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
4842    /// the same conditions as sort_paths_by_mtime_desc. See the
4843    /// sort_paths_... test above for the full rationale.
4844    #[test]
4845    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
4846        let dir = tempfile::tempdir().expect("create tempdir");
4847        let mut matches: Vec<GrepMatch> = Vec::new();
4848        for i in 0..30 {
4849            let file = if i % 2 == 0 {
4850                let p = dir.path().join(format!("real-{i}.rs"));
4851                fs::write(&p, format!("// {i}\n")).expect("write");
4852                p
4853            } else {
4854                dir.path().join(format!("missing-{i}.rs"))
4855            };
4856            matches.push(GrepMatch {
4857                file,
4858                line: u32::try_from(i).unwrap_or(0),
4859                column: 0,
4860                line_text: format!("match {i}"),
4861                match_text: format!("match {i}"),
4862            });
4863        }
4864
4865        for _ in 0..50 {
4866            let mut copy = matches.clone();
4867            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
4868            assert_eq!(copy.len(), matches.len());
4869        }
4870    }
4871}