aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, BinaryHeap, HashMap, HashSet};
2use std::fs::{self, File, OpenOptions};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, SeekFrom, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const SPILL_MAGIC: &[u8; 8] = b"AFTSPI01";
27const FILE_TRIGRAM_COUNT_MAGIC: &[u8; 8] = b"AFTFTC01";
28const INDEX_VERSION: u32 = 4;
29const PREVIEW_BYTES: usize = 8 * 1024;
30const SPIMI_SOFT_LIMIT_BYTES: usize = 128 * 1024 * 1024;
31const SPIMI_HARD_LIMIT_BYTES: usize = 256 * 1024 * 1024;
32const SPILL_RECORD_ESTIMATED_BYTES: usize = 16;
33const DELTA_COMPACT_SOFT_FILES: usize = 1_000;
34const DELTA_COMPACT_HARD_FILES: usize = 5_000;
35const DELTA_COMPACT_SOFT_BYTES: usize = 32 * 1024 * 1024;
36const DELTA_COMPACT_HARD_BYTES: usize = 128 * 1024 * 1024;
37const EOF_SENTINEL: u8 = 0;
38const MAX_ENTRIES: usize = 10_000_000;
39const MIN_FILE_ENTRY_BYTES: usize = 57;
40const LOOKUP_ENTRY_BYTES: usize = 16;
41const POSTING_BYTES: usize = 6;
42static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
43
44pub struct CacheLock {
45    _guard: fs_lock::LockGuard,
46}
47
48impl CacheLock {
49    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
50        fs::create_dir_all(cache_dir)?;
51        let path = cache_dir.join("cache.lock");
52        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
53            .lock()
54            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
55        fs_lock::try_acquire(&path, Duration::from_secs(2))
56            .map(|guard| Self { _guard: guard })
57            .map_err(|error| match error {
58                fs_lock::AcquireError::Timeout => {
59                    std::io::Error::other("timed out acquiring search cache lock")
60                }
61                fs_lock::AcquireError::Io(error) => error,
62            })
63    }
64}
65
66#[derive(Clone, Debug)]
67pub struct SearchIndex {
68    base: Option<Arc<BasePostings>>,
69    delta_postings: HashMap<u32, Vec<Posting>>,
70    delta_file_trigrams: HashMap<u32, Vec<u32>>,
71    pub files: Arc<Vec<FileEntry>>,
72    pub path_to_id: Arc<HashMap<PathBuf, u32>>,
73    pub ready: bool,
74    project_root: PathBuf,
75    git_head: Option<String>,
76    max_file_size: u64,
77    ignore_rules_fingerprint: String,
78    pub file_trigram_count: Arc<Vec<u32>>,
79    unindexed_files: Arc<HashSet<u32>>,
80    superseded: HashSet<u32>,
81    base_file_count: u32,
82    delta_packed_bytes: usize,
83    compaction_state: Arc<Mutex<CompactionState>>,
84}
85
86#[derive(Clone, Debug)]
87struct BasePostings {
88    file: Arc<File>,
89    postings_blob_start: u64,
90    postings_blob_len: u64,
91    lookup: Arc<Vec<LookupEntry>>,
92}
93
94#[derive(Clone, Copy, Debug, PartialEq, Eq)]
95struct LookupEntry {
96    trigram: u32,
97    offset: u64,
98    count: u32,
99}
100
101#[derive(Clone, Debug, Default)]
102struct CompactionState {
103    running: bool,
104    requested_again: bool,
105    buffered_paths: Vec<PathBuf>,
106}
107
108#[derive(Clone, Debug)]
109pub struct SearchIndexSnapshot {
110    base: Option<Arc<BasePostings>>,
111    delta_postings: Arc<HashMap<u32, Vec<Posting>>>,
112    files: Arc<Vec<FileEntry>>,
113    path_to_id: Arc<HashMap<PathBuf, u32>>,
114    ready: bool,
115    project_root: PathBuf,
116    file_trigram_count: Arc<Vec<u32>>,
117    unindexed_files: Arc<HashSet<u32>>,
118    superseded: Arc<HashSet<u32>>,
119}
120
121#[derive(Clone, Debug, Default)]
122pub struct LexicalRankResult {
123    pub files: Vec<(PathBuf, f32)>,
124    pub engine_capped: bool,
125}
126
127impl SearchIndex {
128    /// Number of indexed files.
129    pub fn file_count(&self) -> usize {
130        self.files.len()
131    }
132
133    /// Number of unique trigrams in the combined base index and delta postings.
134    pub fn trigram_count(&self) -> usize {
135        self.snapshot().trigram_count()
136    }
137
138    /// Returns an immutable snapshot for queries. Callers must obtain the
139    /// snapshot while holding the RwLock that protects the SearchIndex, then
140    /// drop the guard before running expensive operations such as grep, glob, or
141    /// lexical ranking.
142    pub fn snapshot(&self) -> SearchIndexSnapshot {
143        SearchIndexSnapshot {
144            base: self.base.clone(),
145            delta_postings: Arc::new(self.delta_postings.clone()),
146            files: Arc::clone(&self.files),
147            path_to_id: Arc::clone(&self.path_to_id),
148            ready: self.ready,
149            project_root: self.project_root.clone(),
150            file_trigram_count: Arc::clone(&self.file_trigram_count),
151            unindexed_files: Arc::clone(&self.unindexed_files),
152            superseded: Arc::new(self.superseded.clone()),
153        }
154    }
155
156    /// Compute distinct query trigrams from literal tokens.
157    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
158        query_trigrams_from_tokens(tokens)
159    }
160
161    /// Score-rank file candidates by lexical relevance to query trigrams.
162    pub fn lexical_rank(
163        &self,
164        query_trigrams: &[u32],
165        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
166        max_files: usize,
167    ) -> Vec<(PathBuf, f32)> {
168        self.snapshot()
169            .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
170            .files
171    }
172
173    /// Score-rank file candidates and report whether the pre-filter step that
174    /// collects candidates reached its internal size limit before ranking.
175    pub fn lexical_rank_with_stats(
176        &self,
177        query_trigrams: &[u32],
178        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
179        max_files: usize,
180    ) -> LexicalRankResult {
181        self.snapshot()
182            .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
183    }
184}
185
186impl SearchIndexSnapshot {
187    /// Number of unique trigrams in the combined base index and delta postings.
188    pub fn trigram_count(&self) -> usize {
189        let base_count = self.base.as_ref().map_or(0, |base| base.lookup.len());
190        let Some(base) = &self.base else {
191            return self.delta_postings.len();
192        };
193        base_count
194            + self
195                .delta_postings
196                .keys()
197                .filter(|trigram| base.lookup_entry(**trigram).is_none())
198                .count()
199    }
200
201    /// Score-rank file candidates and report whether the pre-filter step that
202    /// collects candidates reached its internal size limit before ranking.
203    pub fn lexical_rank_with_stats(
204        &self,
205        query_trigrams: &[u32],
206        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
207        max_files: usize,
208    ) -> LexicalRankResult {
209        if query_trigrams.is_empty() || max_files == 0 {
210            return LexicalRankResult::default();
211        }
212
213        let mut non_zero: Vec<(u32, usize)> = query_trigrams
214            .iter()
215            .filter_map(|trigram| {
216                let posting_count = self.posting_count(*trigram);
217                (posting_count > 0).then_some((*trigram, posting_count))
218            })
219            .collect();
220        if non_zero.is_empty() {
221            return LexicalRankResult::default();
222        }
223
224        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
225        let selected_count = non_zero.len().min(3);
226        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
227
228        let mut candidate_ids = BTreeSet::new();
229        for (trigram, _) in non_zero.iter().take(selected_count) {
230            for file_id in self.postings_for_trigram(*trigram, None) {
231                candidate_ids.insert(file_id);
232            }
233        }
234        let pre_filter_candidate_count = candidate_ids.len();
235        let engine_capped = pre_filter_candidate_count > candidate_cap;
236        let filtered_candidates = candidate_ids
237            .into_iter()
238            .filter_map(|file_id| {
239                self.files
240                    .get(file_id as usize)
241                    .map(|entry| (file_id, entry))
242            })
243            .filter(|(_, entry)| {
244                if let Some(filter) = candidate_filter {
245                    filter(&entry.path)
246                } else {
247                    true
248                }
249            })
250            .collect::<Vec<_>>();
251
252        let mut ranked = Vec::new();
253        for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
254            let score = lexical_score_snapshot(self, query_trigrams, file_id);
255            if score > 0.0 {
256                ranked.push((entry.path.clone(), score));
257            }
258        }
259
260        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
261        ranked.truncate(max_files);
262        LexicalRankResult {
263            files: ranked,
264            engine_capped,
265        }
266    }
267}
268
269#[derive(Clone, Debug, PartialEq, Eq)]
270pub struct Posting {
271    pub file_id: u32,
272    pub next_mask: u8,
273    pub loc_mask: u8,
274}
275
276#[derive(Clone, Debug)]
277pub struct FileEntry {
278    pub path: PathBuf,
279    pub size: u64,
280    pub modified: SystemTime,
281    pub content_hash: blake3::Hash,
282}
283
284#[derive(Clone, Debug, PartialEq, Eq)]
285pub struct GrepMatch {
286    pub file: PathBuf,
287    pub line: u32,
288    pub column: u32,
289    pub line_text: String,
290    pub match_text: String,
291}
292
293#[derive(Clone, Debug)]
294pub struct GrepResult {
295    pub matches: Vec<GrepMatch>,
296    pub total_matches: usize,
297    pub files_searched: usize,
298    pub files_with_matches: usize,
299    pub index_status: IndexStatus,
300    pub truncated: bool,
301    pub fully_degraded: bool,
302    pub engine_capped: bool,
303    /// True when a fallback directory walk stopped early due to file-count or time budget.
304    pub walk_truncated: bool,
305}
306
307#[derive(Clone, Copy, Debug, PartialEq, Eq)]
308pub enum IndexStatus {
309    Ready,
310    Building,
311    Fallback,
312    Disabled,
313}
314
315impl IndexStatus {
316    pub fn as_str(&self) -> &'static str {
317        match self {
318            IndexStatus::Ready => "Ready",
319            IndexStatus::Building => "Building",
320            IndexStatus::Fallback => "Fallback",
321            IndexStatus::Disabled => "Disabled",
322        }
323    }
324}
325
326#[derive(Clone, Debug, Default)]
327pub struct RegexQuery {
328    pub and_trigrams: Vec<u32>,
329    pub or_groups: Vec<Vec<u32>>,
330    pub(crate) and_filters: HashMap<u32, PostingFilter>,
331    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
332}
333
334#[derive(Clone, Copy, Debug, Default)]
335pub(crate) struct PostingFilter {
336    next_mask: u8,
337    loc_mask: u8,
338}
339
340#[derive(Clone, Copy)]
341struct SearchFileMetadata {
342    size: u64,
343    modified: SystemTime,
344}
345
346struct PreparedIndexedFile {
347    metadata: SearchFileMetadata,
348    content_hash: blake3::Hash,
349    trigram_map: BTreeMap<u32, PostingFilter>,
350}
351
352enum PreparedSearchPath {
353    Indexed(PreparedIndexedFile),
354    Unindexed(SearchFileMetadata),
355    Skipped,
356}
357
358#[derive(Clone, Debug, Default)]
359struct QueryBuild {
360    and_runs: Vec<Vec<u8>>,
361    or_groups: Vec<Vec<Vec<u8>>>,
362}
363
364#[derive(Clone, Debug, Default)]
365pub(crate) struct PathFilters {
366    includes: Option<GlobSet>,
367    excludes: Option<GlobSet>,
368}
369
370#[derive(Clone, Debug)]
371pub(crate) struct SearchScope {
372    pub root: PathBuf,
373    pub use_index: bool,
374}
375
376#[derive(Clone, Debug)]
377struct SharedGrepMatch {
378    file: Arc<PathBuf>,
379    line: u32,
380    column: u32,
381    line_text: String,
382    match_text: String,
383}
384
385#[derive(Clone, Debug)]
386enum SearchMatcher {
387    Literal(LiteralSearch),
388    Regex(Regex),
389}
390
391impl SearchIndex {
392    pub fn new() -> Self {
393        SearchIndex {
394            base: None,
395            delta_postings: HashMap::new(),
396            delta_file_trigrams: HashMap::new(),
397            files: Arc::new(Vec::new()),
398            path_to_id: Arc::new(HashMap::new()),
399            ready: false,
400            project_root: PathBuf::new(),
401            git_head: None,
402            max_file_size: DEFAULT_MAX_FILE_SIZE,
403            ignore_rules_fingerprint: String::new(),
404            file_trigram_count: Arc::new(Vec::new()),
405            unindexed_files: Arc::new(HashSet::new()),
406            superseded: HashSet::new(),
407            base_file_count: 0,
408            delta_packed_bytes: 0,
409            compaction_state: Arc::new(Mutex::new(CompactionState::default())),
410        }
411    }
412
413    pub fn build(root: &Path) -> Self {
414        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
415    }
416
417    pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
418        let cache_dir = transient_search_cache_dir(root);
419        Self::build_with_limit_to_cache_dir(root, max_file_size, &cache_dir)
420    }
421
422    pub fn build_with_limit_to_cache_dir(
423        root: &Path,
424        max_file_size: u64,
425        cache_dir: &Path,
426    ) -> Self {
427        let started = std::time::Instant::now();
428        match build_streaming_index(root, max_file_size, cache_dir) {
429            Ok((mut index, indexed)) => {
430                index.ready = true;
431                crate::slog_info!(
432                    "search index cold streaming build: {} files, {} trigrams, {} ms (pool={})",
433                    indexed,
434                    index.trigram_count(),
435                    started.elapsed().as_millis(),
436                    search_index_build_pool_size()
437                );
438                index
439            }
440            Err(error) => {
441                log::warn!(
442                    "search index: streaming build failed ({}); falling back to bounded in-memory delta",
443                    error
444                );
445                let mut index = SearchIndex {
446                    project_root: fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
447                    max_file_size,
448                    ignore_rules_fingerprint: ignore_rules_fingerprint(
449                        &fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
450                    ),
451                    ..SearchIndex::new()
452                };
453                let filters = PathFilters::default();
454                let paths: Vec<PathBuf> = walk_project_files(&index.project_root, &filters);
455                let indexed = index.ingest_paths_parallel(&paths);
456                index.git_head = current_git_head(&index.project_root);
457                index.ready = true;
458                crate::slog_info!(
459                    "search index fallback build: {} files, {} trigrams, {} ms (pool={})",
460                    indexed,
461                    index.trigram_count(),
462                    started.elapsed().as_millis(),
463                    search_index_build_pool_size()
464                );
465                index
466            }
467        }
468    }
469
470    /// Serial cold build for tests and parity checks against [`build_with_limit`].
471    #[cfg(test)]
472    pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
473        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
474        let mut index = SearchIndex {
475            project_root: project_root.clone(),
476            max_file_size,
477            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
478            ..SearchIndex::new()
479        };
480        let filters = PathFilters::default();
481        for path in walk_project_files(&project_root, &filters) {
482            index.update_file(&path);
483        }
484        index.git_head = current_git_head(&project_root);
485        index.ready = true;
486        index
487    }
488
489    fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
490        let max_file_size = self.max_file_size;
491        let pool_size = search_index_build_pool_size();
492        let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
493        let pool = match rayon::ThreadPoolBuilder::new()
494            .num_threads(pool_size)
495            .thread_name(|index| format!("aft-search-build-{index}"))
496            .stack_size(8 * 1024 * 1024)
497            .build()
498        {
499            Ok(pool) => Some(pool),
500            Err(error) => {
501                log::warn!(
502                    "search index: bounded build pool unavailable ({error}); using global pool"
503                );
504                None
505            }
506        };
507
508        let mut indexed = 0usize;
509        for chunk in paths.chunks(chunk_size) {
510            let prepare_chunk = || -> Vec<PreparedSearchPath> {
511                chunk
512                    .par_iter()
513                    .map(|path| prepare_search_path(path, max_file_size))
514                    .collect()
515            };
516            let prepared = match &pool {
517                Some(pool) => pool.install(prepare_chunk),
518                None => prepare_chunk(),
519            };
520
521            for (path, prepared) in chunk.iter().zip(prepared) {
522                let inserted = match prepared {
523                    PreparedSearchPath::Indexed(file) => self.index_prepared_new_file(path, file),
524                    PreparedSearchPath::Unindexed(metadata) => {
525                        self.track_unindexed_file_with_metadata(path, metadata)
526                    }
527                    PreparedSearchPath::Skipped => false,
528                };
529                if inserted {
530                    indexed += 1;
531                }
532            }
533        }
534
535        indexed
536    }
537
538    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
539        self.remove_file(path);
540        let metadata = metadata_for_indexed_content(path, content.len() as u64);
541        self.index_file_with_metadata(path, content, metadata);
542    }
543
544    fn index_file_with_metadata(
545        &mut self,
546        path: &Path,
547        content: &[u8],
548        metadata: SearchFileMetadata,
549    ) -> bool {
550        self.index_prepared_new_file(
551            path,
552            PreparedIndexedFile {
553                metadata,
554                content_hash: cache_freshness::hash_bytes(content),
555                trigram_map: trigram_filter_map(content, true),
556            },
557        )
558    }
559
560    fn index_prepared_new_file(&mut self, path: &Path, file: PreparedIndexedFile) -> bool {
561        let file_id = match self.allocate_file_id_with_metadata(path, file.metadata) {
562            Some(file_id) => file_id,
563            None => return false,
564        };
565        if let Some(entry) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
566            entry.content_hash = file.content_hash;
567        }
568
569        let mut file_trigrams = Vec::with_capacity(file.trigram_map.len());
570        for (trigram, filter) in file.trigram_map {
571            let postings = self.delta_postings.entry(trigram).or_default();
572            postings.push(Posting {
573                file_id,
574                next_mask: filter.next_mask,
575                loc_mask: filter.loc_mask,
576            });
577            if postings.len() > 1
578                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
579            {
580                postings.sort_unstable_by_key(|p| p.file_id);
581            }
582            file_trigrams.push(trigram);
583        }
584
585        let trigram_count = file_trigrams.len() as u32;
586        self.delta_packed_bytes = self
587            .delta_packed_bytes
588            .saturating_add(file_trigrams.len().saturating_mul(POSTING_BYTES));
589        self.delta_file_trigrams.insert(file_id, file_trigrams);
590        ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
591        if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
592            *count = trigram_count;
593        }
594        Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
595        self.update_compaction_flags(Some(path));
596        true
597    }
598
599    pub fn remove_file(&mut self, path: &Path) {
600        let canonical_path = canonicalize_existing_or_deleted_path(path);
601        let file_id = {
602            let path_to_id = Arc::make_mut(&mut self.path_to_id);
603            if let Some(file_id) = path_to_id.remove(path) {
604                file_id
605            } else if canonical_path.as_path() != path {
606                let Some(file_id) = path_to_id.remove(&canonical_path) else {
607                    return;
608                };
609                file_id
610            } else {
611                return;
612            }
613        };
614
615        if file_id < self.base_file_count {
616            self.superseded.insert(file_id);
617        }
618
619        if let Some(trigrams) = self.delta_file_trigrams.remove(&file_id) {
620            self.delta_packed_bytes = self
621                .delta_packed_bytes
622                .saturating_sub(trigrams.len().saturating_mul(POSTING_BYTES));
623            for trigram in trigrams {
624                let should_remove = if let Some(postings) = self.delta_postings.get_mut(&trigram) {
625                    postings.retain(|posting| posting.file_id != file_id);
626                    postings.is_empty()
627                } else {
628                    false
629                };
630
631                if should_remove {
632                    self.delta_postings.remove(&trigram);
633                }
634            }
635        }
636
637        Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
638        if let Some(file) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
639            file.path = PathBuf::new();
640            file.size = 0;
641            file.modified = UNIX_EPOCH;
642            file.content_hash = cache_freshness::zero_hash();
643        }
644        if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
645            *count = 0;
646        }
647        self.update_compaction_flags(Some(path));
648    }
649
650    pub fn update_file(&mut self, path: &Path) {
651        self.remove_file(path);
652
653        let metadata = match fs::metadata(path) {
654            Ok(metadata) if metadata.is_file() => metadata,
655            _ => return,
656        };
657
658        let metadata = search_file_metadata(&metadata);
659
660        if is_binary_path(path, metadata.size) {
661            self.track_unindexed_file_with_metadata(path, metadata);
662            return;
663        }
664
665        if metadata.size > self.max_file_size {
666            self.track_unindexed_file_with_metadata(path, metadata);
667            return;
668        }
669
670        let content = match fs::read(path) {
671            Ok(content) => content,
672            Err(_) => return,
673        };
674
675        if is_binary_bytes(&content) {
676            self.track_unindexed_file_with_metadata(path, metadata);
677            return;
678        }
679
680        self.index_file_with_metadata(path, &content, metadata);
681    }
682
683    pub fn grep(
684        &self,
685        pattern: &str,
686        case_sensitive: bool,
687        include: &[String],
688        exclude: &[String],
689        search_root: &Path,
690        max_results: usize,
691    ) -> GrepResult {
692        self.snapshot().grep(
693            pattern,
694            case_sensitive,
695            include,
696            exclude,
697            search_root,
698            max_results,
699        )
700    }
701
702    pub fn search_grep(
703        &self,
704        pattern: &CompiledPattern,
705        include: &[String],
706        exclude: &[String],
707        search_root: &Path,
708        max_results: usize,
709    ) -> GrepResult {
710        self.snapshot()
711            .search_grep(pattern, include, exclude, search_root, max_results)
712    }
713
714    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
715        self.snapshot().glob(pattern, search_root)
716    }
717
718    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
719        self.snapshot().candidates(query)
720    }
721
722    pub fn write_to_disk(&mut self, cache_dir: &Path, git_head: Option<&str>) {
723        let Some(plan) = CacheWritePlan::from_index(self, git_head) else {
724            return;
725        };
726
727        let write_result = {
728            let mut sources = self.compaction_record_sources(Arc::clone(&plan.id_map));
729            write_cache_file_from_sources(cache_dir, &plan, &mut sources)
730        };
731
732        match write_result {
733            Ok(base) => {
734                self.base = Some(Arc::new(base));
735                self.delta_postings.clear();
736                self.delta_file_trigrams.clear();
737                self.superseded.clear();
738                self.delta_packed_bytes = 0;
739                self.base_file_count = u32::try_from(plan.files.len()).unwrap_or(u32::MAX);
740                self.files = Arc::new(plan.files);
741                self.path_to_id = Arc::new(plan.path_to_id);
742                self.unindexed_files = Arc::new(plan.unindexed_files);
743                self.file_trigram_count = Arc::new(plan.file_trigram_count);
744                self.git_head = plan.git_head.filter(|head| !head.is_empty());
745                self.ignore_rules_fingerprint = plan.ignore_fingerprint;
746            }
747            Err(error) => {
748                log::warn!("search index: failed to write disk cache: {}", error);
749            }
750        }
751    }
752
753    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
754        debug_assert!(current_canonical_root.is_absolute());
755        let cache_path = cache_dir.join("cache.bin");
756        let cache_file = open_cache_file_read(&cache_path).ok()?;
757        let file_len = cache_file.metadata().ok()?.len();
758        if file_len < 16 {
759            return None;
760        }
761
762        let mut reader = BufReader::new(cache_file.try_clone().ok()?);
763        if read_u32(&mut reader).ok()? != CACHE_MAGIC {
764            return None;
765        }
766        if read_u32(&mut reader).ok()? != INDEX_VERSION {
767            return None;
768        }
769        let postings_len_total = read_u64(&mut reader).ok()?;
770        let postings_section_start = reader.stream_position().ok()?;
771        let postings_section_end = postings_section_start.checked_add(postings_len_total)?;
772        if postings_len_total < 4 || postings_section_end > file_len {
773            return None;
774        }
775        let postings_body_end = postings_section_end.checked_sub(4)?;
776
777        let mut magic = [0u8; 8];
778        reader.read_exact(&mut magic).ok()?;
779        if &magic != INDEX_MAGIC {
780            return None;
781        }
782        if read_u32(&mut reader).ok()? != INDEX_VERSION {
783            return None;
784        }
785
786        let head_len = read_u32(&mut reader).ok()? as usize;
787        let root_len = read_u32(&mut reader).ok()? as usize;
788        let ignore_fingerprint_len = read_u32(&mut reader).ok()? as usize;
789        let max_file_size = read_u64(&mut reader).ok()?;
790        let file_count = read_u32(&mut reader).ok()? as usize;
791        if file_count > MAX_ENTRIES {
792            return None;
793        }
794
795        if !reader_has_remaining(&mut reader, postings_body_end, head_len).ok()? {
796            return None;
797        }
798        let mut head_bytes = vec![0u8; head_len];
799        reader.read_exact(&mut head_bytes).ok()?;
800        let git_head = String::from_utf8(head_bytes)
801            .ok()
802            .filter(|head| !head.is_empty());
803
804        if !reader_has_remaining(&mut reader, postings_body_end, root_len).ok()? {
805            return None;
806        }
807        let mut root_bytes = vec![0u8; root_len];
808        reader.read_exact(&mut root_bytes).ok()?;
809        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
810        let project_root = current_canonical_root.to_path_buf();
811
812        if !reader_has_remaining(&mut reader, postings_body_end, ignore_fingerprint_len).ok()? {
813            return None;
814        }
815        let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
816        reader.read_exact(&mut ignore_fingerprint_bytes).ok()?;
817        let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
818        let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
819        if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
820            return None;
821        }
822
823        let mut files = Vec::with_capacity(file_count);
824        let mut path_to_id = HashMap::new();
825        let mut unindexed_files = HashSet::new();
826
827        for file_id in 0..file_count {
828            if !reader_has_remaining(&mut reader, postings_body_end, MIN_FILE_ENTRY_BYTES).ok()? {
829                return None;
830            }
831            let mut unindexed = [0u8; 1];
832            reader.read_exact(&mut unindexed).ok()?;
833            let path_len = read_u32(&mut reader).ok()? as usize;
834            let size = read_u64(&mut reader).ok()?;
835            let secs = read_u64(&mut reader).ok()?;
836            let nanos = read_u32(&mut reader).ok()?;
837            let mut hash_bytes = [0u8; 32];
838            reader.read_exact(&mut hash_bytes).ok()?;
839            let content_hash = blake3::Hash::from_bytes(hash_bytes);
840            if nanos >= 1_000_000_000 {
841                return None;
842            }
843            if !reader_has_remaining(&mut reader, postings_body_end, path_len).ok()? {
844                return None;
845            }
846            let mut path_bytes = vec![0u8; path_len];
847            reader.read_exact(&mut path_bytes).ok()?;
848            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
849            let full_path = cached_path_under_root(&project_root, &relative_path)?;
850            let file_id_u32 = u32::try_from(file_id).ok()?;
851
852            files.push(FileEntry {
853                path: full_path.clone(),
854                size,
855                modified: UNIX_EPOCH + Duration::new(secs, nanos),
856                content_hash,
857            });
858            path_to_id.insert(full_path, file_id_u32);
859            if unindexed[0] == 1 {
860                unindexed_files.insert(file_id_u32);
861            }
862        }
863
864        if !reader_has_remaining(&mut reader, postings_body_end, 8).ok()? {
865            return None;
866        }
867        let postings_blob_len = read_u64(&mut reader).ok()?;
868        let postings_blob_start = reader.stream_position().ok()?;
869        let postings_blob_end = postings_blob_start.checked_add(postings_blob_len)?;
870        if postings_blob_end > postings_body_end || postings_blob_len % POSTING_BYTES as u64 != 0 {
871            return None;
872        }
873
874        let lookup_section_start = postings_section_end;
875        if lookup_section_start >= file_len {
876            return None;
877        }
878        let mut lookup_file = cache_file.try_clone().ok()?;
879        lookup_file
880            .seek(SeekFrom::Start(lookup_section_start))
881            .ok()?;
882        let mut lookup_bytes = Vec::new();
883        lookup_file.read_to_end(&mut lookup_bytes).ok()?;
884        if lookup_bytes.len() < 4 {
885            return None;
886        }
887        verify_crc32_bytes_slice(&lookup_bytes).ok()?;
888        let lookup_body_len = lookup_bytes.len().checked_sub(4)?;
889        let mut lookup_reader = BufReader::new(Cursor::new(&lookup_bytes));
890        let mut lookup_magic = [0u8; 8];
891        lookup_reader.read_exact(&mut lookup_magic).ok()?;
892        if &lookup_magic != LOOKUP_MAGIC {
893            return None;
894        }
895        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
896            return None;
897        }
898        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
899        if entry_count > MAX_ENTRIES {
900            return None;
901        }
902        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
903        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
904        if minimum_lookup_bytes > remaining_lookup {
905            return None;
906        }
907
908        let mut lookup = Vec::with_capacity(entry_count);
909        let mut previous_trigram = None;
910        for _ in 0..entry_count {
911            let trigram = read_u32(&mut lookup_reader).ok()?;
912            let offset = read_u64(&mut lookup_reader).ok()?;
913            let count = read_u32(&mut lookup_reader).ok()?;
914            if count as usize > MAX_ENTRIES {
915                return None;
916            }
917            if previous_trigram.is_some_and(|previous| previous >= trigram) {
918                return None;
919            }
920            previous_trigram = Some(trigram);
921            let bytes_len = (count as u64).checked_mul(POSTING_BYTES as u64)?;
922            let end = offset.checked_add(bytes_len)?;
923            if end > postings_blob_len {
924                return None;
925            }
926            lookup.push(LookupEntry {
927                trigram,
928                offset,
929                count,
930            });
931        }
932
933        let base = BasePostings {
934            file: Arc::new(cache_file),
935            postings_blob_start,
936            postings_blob_len,
937            lookup: Arc::new(lookup),
938        };
939
940        let (file_trigram_count, migrated_counts) = match read_file_trigram_count_extension(
941            &base,
942            postings_blob_end,
943            postings_body_end,
944            file_count,
945        ) {
946            Ok(Some(counts)) => (counts, false),
947            Ok(None) => (
948                compute_file_trigram_counts_from_base(&base, file_count).ok()?,
949                true,
950            ),
951            Err(_) => return None,
952        };
953
954        let mut index = SearchIndex {
955            base: Some(Arc::new(base)),
956            delta_postings: HashMap::new(),
957            delta_file_trigrams: HashMap::new(),
958            files: Arc::new(files),
959            path_to_id: Arc::new(path_to_id),
960            ready: false,
961            project_root,
962            git_head,
963            max_file_size,
964            ignore_rules_fingerprint: current_ignore_rules_fingerprint,
965            file_trigram_count: Arc::new(file_trigram_count),
966            unindexed_files: Arc::new(unindexed_files),
967            superseded: HashSet::new(),
968            base_file_count: u32::try_from(file_count).ok()?,
969            delta_packed_bytes: 0,
970            compaction_state: Arc::new(Mutex::new(CompactionState::default())),
971        };
972
973        if migrated_counts {
974            if let Ok(_lock) = CacheLock::acquire(cache_dir) {
975                let head = index.git_head.clone();
976                index.write_to_disk(cache_dir, head.as_deref());
977            }
978        }
979
980        Some(index)
981    }
982
983    pub fn stored_git_head(&self) -> Option<&str> {
984        self.git_head.as_deref()
985    }
986
987    pub(crate) fn set_ready(&mut self, ready: bool) {
988        self.ready = ready;
989    }
990
991    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
992        self.git_head = current_head;
993        verify_file_mtimes(self);
994        self.ready = true;
995    }
996
997    #[cfg(debug_assertions)]
998    #[doc(hidden)]
999    pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1000        self.verify_against_disk(current_head);
1001    }
1002
1003    pub(crate) fn rebuild_or_refresh(
1004        root: &Path,
1005        max_file_size: u64,
1006        current_head: Option<String>,
1007        baseline: Option<SearchIndex>,
1008        cache_dir: Option<&Path>,
1009    ) -> Self {
1010        if let Some(mut baseline) = baseline {
1011            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1012            baseline.max_file_size = max_file_size;
1013            let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1014            if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1015                return match cache_dir {
1016                    Some(cache_dir) => {
1017                        SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1018                    }
1019                    None => SearchIndex::build_with_limit(root, max_file_size),
1020                };
1021            }
1022            baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1023
1024            if baseline.git_head == current_head || current_head.is_none() {
1025                // HEAD matches, but files may have changed on disk since the index was
1026                // last written (e.g., uncommitted edits, stash pop, manual file changes
1027                // while OpenCode was closed). Verify mtimes and re-index stale files.
1028                // Non-git projects also use this per-file (path, mtime, size)
1029                // fingerprint so unchanged trees reuse the disk cache instead of
1030                // rebuilding every configure.
1031                baseline.git_head = current_head;
1032                verify_file_mtimes(&mut baseline);
1033                baseline.ready = true;
1034                return baseline;
1035            }
1036
1037            if let (Some(previous), Some(current)) =
1038                (baseline.git_head.clone(), current_head.clone())
1039            {
1040                let project_root = baseline.project_root.clone();
1041                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1042                    baseline.git_head = Some(current);
1043                    verify_file_mtimes(&mut baseline);
1044                    baseline.ready = true;
1045                    return baseline;
1046                }
1047            }
1048        }
1049
1050        match cache_dir {
1051            Some(cache_dir) => {
1052                SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1053            }
1054            None => SearchIndex::build_with_limit(root, max_file_size),
1055        }
1056    }
1057
1058    fn allocate_file_id_with_metadata(
1059        &mut self,
1060        path: &Path,
1061        metadata: SearchFileMetadata,
1062    ) -> Option<u32> {
1063        let file_id = u32::try_from(self.files.len()).ok()?;
1064        Arc::make_mut(&mut self.files).push(FileEntry {
1065            path: path.to_path_buf(),
1066            size: metadata.size,
1067            modified: metadata.modified,
1068            content_hash: cache_freshness::zero_hash(),
1069        });
1070        Arc::make_mut(&mut self.path_to_id).insert(path.to_path_buf(), file_id);
1071        ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
1072        Some(file_id)
1073    }
1074
1075    fn track_unindexed_file_with_metadata(
1076        &mut self,
1077        path: &Path,
1078        metadata: SearchFileMetadata,
1079    ) -> bool {
1080        let Some(file_id) = self.allocate_file_id_with_metadata(path, metadata) else {
1081            return false;
1082        };
1083        Arc::make_mut(&mut self.unindexed_files).insert(file_id);
1084        if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
1085            *count = 0;
1086        }
1087        true
1088    }
1089
1090    fn active_file_ids(&self) -> Vec<u32> {
1091        self.snapshot().active_file_ids()
1092    }
1093
1094    #[cfg(test)]
1095    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1096        self.snapshot().postings_for_trigram(trigram, filter)
1097    }
1098
1099    fn update_compaction_flags(&mut self, changed_path: Option<&Path>) {
1100        let delta_files = self.delta_file_trigrams.len();
1101        let hard = delta_files >= DELTA_COMPACT_HARD_FILES
1102            || self.delta_packed_bytes >= DELTA_COMPACT_HARD_BYTES;
1103        let soft = delta_files >= DELTA_COMPACT_SOFT_FILES
1104            || self.delta_packed_bytes >= DELTA_COMPACT_SOFT_BYTES;
1105        if let Ok(mut state) = self.compaction_state.lock() {
1106            if state.running {
1107                if let Some(path) = changed_path {
1108                    state.buffered_paths.push(path.to_path_buf());
1109                }
1110                if soft || hard {
1111                    state.requested_again = true;
1112                }
1113            } else if hard || (soft && !state.requested_again) {
1114                state.requested_again = true;
1115            }
1116        }
1117    }
1118
1119    fn compaction_record_sources(
1120        &self,
1121        id_map: Arc<HashMap<u32, u32>>,
1122    ) -> Vec<Box<dyn PostingRecordSource>> {
1123        let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
1124        if let Some(base) = self.base.clone() {
1125            sources.push(Box::new(BaseRecordSource::new(
1126                base,
1127                Arc::clone(&id_map),
1128                Arc::new(self.superseded.clone()),
1129            )));
1130        }
1131
1132        let mut delta_records = Vec::new();
1133        for (&trigram, postings) in &self.delta_postings {
1134            for posting in postings {
1135                let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
1136                    continue;
1137                };
1138                delta_records.push(SpillRecord {
1139                    trigram,
1140                    file_id: mapped_file_id,
1141                    next_mask: posting.next_mask,
1142                    loc_mask: posting.loc_mask,
1143                });
1144            }
1145        }
1146        if !delta_records.is_empty() {
1147            delta_records.sort_unstable_by_key(|record| (record.trigram, record.file_id));
1148            sources.push(Box::new(VecRecordSource::new(delta_records)));
1149        }
1150        sources
1151    }
1152}
1153
1154impl BasePostings {
1155    fn lookup_entry(&self, trigram: u32) -> Option<LookupEntry> {
1156        self.lookup
1157            .binary_search_by_key(&trigram, |entry| entry.trigram)
1158            .ok()
1159            .and_then(|index| self.lookup.get(index).copied())
1160    }
1161
1162    fn read_postings(&self, entry: LookupEntry) -> std::io::Result<Vec<Posting>> {
1163        let bytes_len = (entry.count as usize)
1164            .checked_mul(POSTING_BYTES)
1165            .ok_or_else(|| std::io::Error::other("posting list too large"))?;
1166        let offset = self
1167            .postings_blob_start
1168            .checked_add(entry.offset)
1169            .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1170        let end = entry
1171            .offset
1172            .checked_add(bytes_len as u64)
1173            .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1174        if end > self.postings_blob_len {
1175            return Err(std::io::Error::other("posting list exceeds blob"));
1176        }
1177        let mut bytes = vec![0u8; bytes_len];
1178        pread_exact(&self.file, offset, &mut bytes)?;
1179        let mut postings = Vec::with_capacity(entry.count as usize);
1180        for chunk in bytes.chunks_exact(POSTING_BYTES) {
1181            postings.push(Posting {
1182                file_id: u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]),
1183                next_mask: chunk[4],
1184                loc_mask: chunk[5],
1185            });
1186        }
1187        Ok(postings)
1188    }
1189}
1190
1191impl SearchIndexSnapshot {
1192    pub fn grep(
1193        &self,
1194        pattern: &str,
1195        case_sensitive: bool,
1196        include: &[String],
1197        exclude: &[String],
1198        search_root: &Path,
1199        max_results: usize,
1200    ) -> GrepResult {
1201        match pattern_compile::compile(
1202            pattern,
1203            CompileOpts {
1204                case_insensitive: !case_sensitive,
1205                ..CompileOpts::default()
1206            },
1207        ) {
1208            CompileResult::Ok(compiled) => {
1209                self.search_grep(&compiled, include, exclude, search_root, max_results)
1210            }
1211            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
1212                self.empty_grep_result()
1213            }
1214        }
1215    }
1216
1217    pub fn search_grep(
1218        &self,
1219        pattern: &CompiledPattern,
1220        include: &[String],
1221        exclude: &[String],
1222        search_root: &Path,
1223        max_results: usize,
1224    ) -> GrepResult {
1225        let matcher = match pattern {
1226            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
1227            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
1228        };
1229
1230        let filters = match build_path_filters(include, exclude) {
1231            Ok(filters) => filters,
1232            Err(_) => PathFilters::default(),
1233        };
1234        let search_root = canonicalize_or_normalize(search_root);
1235
1236        let raw_pattern = pattern.raw_pattern_for_trigrams();
1237        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
1238            RegexQuery::default()
1239        } else {
1240            decompose_regex(&raw_pattern)
1241        };
1242        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
1243        let candidate_ids = self.candidates(&query);
1244
1245        let candidate_files: Vec<&FileEntry> = candidate_ids
1246            .into_iter()
1247            .filter_map(|file_id| self.files.get(file_id as usize))
1248            .filter(|file| !file.path.as_os_str().is_empty())
1249            .filter(|file| is_within_search_root(&search_root, &file.path))
1250            .filter(|file| filters.matches(&self.project_root, &file.path))
1251            .collect();
1252
1253        let total_matches = AtomicUsize::new(0);
1254        let files_searched = AtomicUsize::new(0);
1255        let files_with_matches = AtomicUsize::new(0);
1256        let truncated = AtomicBool::new(false);
1257        let engine_capped = AtomicBool::new(false);
1258        let stop_after = max_results.saturating_mul(2);
1259        let stop_scan = Arc::new(AtomicBool::new(false));
1260
1261        let mut matches = if candidate_files.len() > 10 {
1262            candidate_files
1263                .par_iter()
1264                .map(|file| {
1265                    if grep_scan_should_stop(
1266                        Some(&stop_scan),
1267                        &truncated,
1268                        &total_matches,
1269                        stop_after,
1270                    ) {
1271                        engine_capped.store(true, Ordering::Relaxed);
1272                        return Vec::new();
1273                    }
1274                    search_candidate_file(
1275                        file,
1276                        &matcher,
1277                        max_results,
1278                        stop_after,
1279                        &total_matches,
1280                        &files_searched,
1281                        &files_with_matches,
1282                        &truncated,
1283                        &engine_capped,
1284                        Some(&stop_scan),
1285                    )
1286                })
1287                .reduce(Vec::new, |mut left, mut right| {
1288                    // When concatenating partial match lists from parallel file
1289                    // searches, simply append the chunks. The stop checks in
1290                    // each worker decide whether the result cap was reached.
1291                    left.append(&mut right);
1292                    left
1293                })
1294        } else {
1295            let mut matches = Vec::new();
1296            for file in candidate_files {
1297                matches.extend(search_candidate_file(
1298                    file,
1299                    &matcher,
1300                    max_results,
1301                    stop_after,
1302                    &total_matches,
1303                    &files_searched,
1304                    &files_with_matches,
1305                    &truncated,
1306                    &engine_capped,
1307                    None,
1308                ));
1309
1310                if should_stop_search(&truncated, &total_matches, stop_after) {
1311                    engine_capped.store(true, Ordering::Relaxed);
1312                    break;
1313                }
1314            }
1315            matches
1316        };
1317
1318        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, &self.project_root, |path| {
1319            self.path_to_id
1320                .get(path)
1321                .and_then(|file_id| self.files.get(*file_id as usize))
1322                .map(|file| file.modified)
1323        });
1324
1325        let matches = matches
1326            .into_iter()
1327            .map(|matched| GrepMatch {
1328                file: matched.file.as_ref().clone(),
1329                line: matched.line,
1330                column: matched.column,
1331                line_text: matched.line_text,
1332                match_text: matched.match_text,
1333            })
1334            .collect();
1335
1336        GrepResult {
1337            total_matches: total_matches.load(Ordering::Relaxed),
1338            matches,
1339            files_searched: files_searched.load(Ordering::Relaxed),
1340            files_with_matches: files_with_matches.load(Ordering::Relaxed),
1341            index_status: if self.ready {
1342                IndexStatus::Ready
1343            } else {
1344                IndexStatus::Building
1345            },
1346            truncated: truncated.load(Ordering::Relaxed),
1347            fully_degraded,
1348            engine_capped: engine_capped.load(Ordering::Relaxed),
1349            walk_truncated: false,
1350        }
1351    }
1352
1353    fn empty_grep_result(&self) -> GrepResult {
1354        GrepResult {
1355            matches: Vec::new(),
1356            total_matches: 0,
1357            files_searched: 0,
1358            files_with_matches: 0,
1359            index_status: if self.ready {
1360                IndexStatus::Ready
1361            } else {
1362                IndexStatus::Building
1363            },
1364            truncated: false,
1365            fully_degraded: false,
1366            engine_capped: false,
1367            walk_truncated: false,
1368        }
1369    }
1370
1371    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
1372        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
1373            Ok(filters) => filters,
1374            Err(_) => return Vec::new(),
1375        };
1376        let search_root = canonicalize_or_normalize(search_root);
1377        let mut entries = self
1378            .files
1379            .iter()
1380            .filter(|file| !file.path.as_os_str().is_empty())
1381            .filter(|file| is_within_search_root(&search_root, &file.path))
1382            .filter(|file| filters.matches(&self.project_root, &file.path))
1383            .map(|file| (file.path.clone(), file.modified))
1384            .collect::<Vec<_>>();
1385
1386        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
1387            right_mtime
1388                .cmp(left_mtime)
1389                .then_with(|| left_path.cmp(right_path))
1390        });
1391
1392        entries.into_iter().map(|(path, _)| path).collect()
1393    }
1394
1395    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
1396        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
1397            return self.active_file_ids();
1398        }
1399
1400        let mut and_trigrams = query.and_trigrams.clone();
1401        and_trigrams.sort_unstable_by_key(|trigram| self.posting_count(*trigram));
1402
1403        let mut current: Option<Vec<u32>> = None;
1404
1405        for trigram in and_trigrams {
1406            let filter = query.and_filters.get(&trigram).copied();
1407            let matches = self.postings_for_trigram(trigram, filter);
1408            current = Some(match current.take() {
1409                Some(existing) => intersect_sorted_ids(&existing, &matches),
1410                None => matches,
1411            });
1412
1413            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
1414                break;
1415            }
1416        }
1417
1418        let mut current = current.unwrap_or_else(|| self.active_file_ids());
1419
1420        for (index, group) in query.or_groups.iter().enumerate() {
1421            let mut group_matches = Vec::new();
1422            let filters = query.or_filters.get(index);
1423
1424            for trigram in group {
1425                let filter = filters.and_then(|filters| filters.get(trigram).copied());
1426                let matches = self.postings_for_trigram(*trigram, filter);
1427                if group_matches.is_empty() {
1428                    group_matches = matches;
1429                } else {
1430                    group_matches = union_sorted_ids(&group_matches, &matches);
1431                }
1432            }
1433
1434            current = intersect_sorted_ids(&current, &group_matches);
1435            if current.is_empty() {
1436                break;
1437            }
1438        }
1439
1440        let mut unindexed = self
1441            .unindexed_files
1442            .iter()
1443            .copied()
1444            .filter(|file_id| self.is_active_file(*file_id))
1445            .collect::<Vec<_>>();
1446        if !unindexed.is_empty() {
1447            unindexed.sort_unstable();
1448            current = union_sorted_ids(&current, &unindexed);
1449        }
1450
1451        current
1452    }
1453
1454    fn posting_count(&self, trigram: u32) -> usize {
1455        let base_count = self
1456            .base
1457            .as_ref()
1458            .and_then(|base| base.lookup_entry(trigram))
1459            .map_or(0usize, |entry| entry.count as usize);
1460        base_count.saturating_add(self.delta_postings.get(&trigram).map_or(0usize, Vec::len))
1461    }
1462
1463    fn active_file_ids(&self) -> Vec<u32> {
1464        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1465        ids.retain(|file_id| self.is_active_file(*file_id));
1466        ids.sort_unstable();
1467        ids
1468    }
1469
1470    fn is_active_file(&self, file_id: u32) -> bool {
1471        if self.superseded.contains(&file_id) {
1472            return false;
1473        }
1474        self.files
1475            .get(file_id as usize)
1476            .map(|file| !file.path.as_os_str().is_empty())
1477            .unwrap_or(false)
1478    }
1479
1480    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1481        let mut matches = Vec::new();
1482
1483        if let Some(base_entry) = self
1484            .base
1485            .as_ref()
1486            .and_then(|base| base.lookup_entry(trigram))
1487        {
1488            if let Some(base) = &self.base {
1489                if let Ok(postings) = base.read_postings(base_entry) {
1490                    matches.reserve(postings.len());
1491                    for posting in postings {
1492                        if self.superseded.contains(&posting.file_id) {
1493                            continue;
1494                        }
1495                        if !posting_matches_filter(&posting, filter) {
1496                            continue;
1497                        }
1498                        if self.is_active_file(posting.file_id) {
1499                            matches.push(posting.file_id);
1500                        }
1501                    }
1502                }
1503            }
1504        }
1505
1506        if let Some(postings) = self.delta_postings.get(&trigram) {
1507            matches.reserve(postings.len());
1508            for posting in postings {
1509                if !posting_matches_filter(posting, filter) {
1510                    continue;
1511                }
1512                if self.is_active_file(posting.file_id) {
1513                    matches.push(posting.file_id);
1514                }
1515            }
1516        }
1517
1518        if matches.len() > 1 {
1519            matches.sort_unstable();
1520            matches.dedup();
1521        }
1522        matches
1523    }
1524}
1525
1526fn posting_matches_filter(posting: &Posting, filter: Option<PostingFilter>) -> bool {
1527    if let Some(filter) = filter {
1528        // next_mask is a bloom filter: the character following this trigram in
1529        // the query must also appear after this trigram somewhere in the file.
1530        if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1531            return false;
1532        }
1533        // loc_mask is persisted for future adjacency checks. It is intentionally
1534        // not used as a single-trigram filter because query positions do not
1535        // correspond to file positions.
1536    }
1537    true
1538}
1539
1540fn search_candidate_file(
1541    file: &FileEntry,
1542    matcher: &SearchMatcher,
1543    max_results: usize,
1544    stop_after: usize,
1545    total_matches: &AtomicUsize,
1546    files_searched: &AtomicUsize,
1547    files_with_matches: &AtomicUsize,
1548    truncated: &AtomicBool,
1549    engine_capped: &AtomicBool,
1550    stop_scan: Option<&Arc<AtomicBool>>,
1551) -> Vec<SharedGrepMatch> {
1552    if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1553        engine_capped.store(true, Ordering::Relaxed);
1554        return Vec::new();
1555    }
1556
1557    let content = match read_indexed_file_bytes(&file.path) {
1558        Some(content) => content,
1559        None => return Vec::new(),
1560    };
1561    // Defense in depth: even though indexing tries to filter binaries via
1562    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1563    // query time. content_inspector is fast (~bytes-per-cycle on a small
1564    // preview) and this guarantees we never surface matches inside binary
1565    // files even if the indexer somehow let one through (e.g. file changed
1566    // between indexing and query).
1567    if is_binary_bytes(&content) {
1568        return Vec::new();
1569    }
1570    files_searched.fetch_add(1, Ordering::Relaxed);
1571
1572    let shared_path = Arc::new(file.path.clone());
1573    let mut matches = Vec::new();
1574    let mut line_starts = None;
1575    let mut seen_lines = HashSet::new();
1576    let mut matched_this_file = false;
1577
1578    match matcher {
1579        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1580            let needle = &literal.needle;
1581            let finder = memchr::memmem::Finder::new(needle);
1582            let mut start = 0;
1583
1584            while let Some(position) = finder.find(&content[start..]) {
1585                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1586                    engine_capped.store(true, Ordering::Relaxed);
1587                    break;
1588                }
1589
1590                let offset = start + position;
1591                start = offset + 1;
1592
1593                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1594                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1595                if !seen_lines.insert(line) {
1596                    continue;
1597                }
1598
1599                matched_this_file = true;
1600                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1601                if match_number > max_results {
1602                    truncated.store(true, Ordering::Relaxed);
1603                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1604                    break;
1605                }
1606
1607                let end = offset + needle.len();
1608                matches.push(SharedGrepMatch {
1609                    file: shared_path.clone(),
1610                    line,
1611                    column,
1612                    line_text,
1613                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1614                });
1615            }
1616        }
1617        SearchMatcher::Literal(literal) => {
1618            let needle = &literal.needle;
1619            let search_content = content.to_ascii_lowercase();
1620            let finder = memchr::memmem::Finder::new(needle);
1621            let mut start = 0;
1622
1623            while let Some(position) = finder.find(&search_content[start..]) {
1624                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1625                    engine_capped.store(true, Ordering::Relaxed);
1626                    break;
1627                }
1628
1629                let offset = start + position;
1630                start = offset + 1;
1631
1632                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1633                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1634                if !seen_lines.insert(line) {
1635                    continue;
1636                }
1637
1638                matched_this_file = true;
1639                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1640                if match_number > max_results {
1641                    truncated.store(true, Ordering::Relaxed);
1642                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1643                    break;
1644                }
1645
1646                let end = offset + needle.len();
1647                matches.push(SharedGrepMatch {
1648                    file: shared_path.clone(),
1649                    line,
1650                    column,
1651                    line_text,
1652                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1653                });
1654            }
1655        }
1656        SearchMatcher::Regex(regex) => {
1657            for matched in regex.find_iter(&content) {
1658                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1659                    engine_capped.store(true, Ordering::Relaxed);
1660                    break;
1661                }
1662
1663                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1664                let (line, column, line_text) =
1665                    line_details_bytes(&content, line_starts, matched.start());
1666                if !seen_lines.insert(line) {
1667                    continue;
1668                }
1669
1670                matched_this_file = true;
1671                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1672                if match_number > max_results {
1673                    truncated.store(true, Ordering::Relaxed);
1674                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1675                    break;
1676                }
1677
1678                matches.push(SharedGrepMatch {
1679                    file: shared_path.clone(),
1680                    line,
1681                    column,
1682                    line_text,
1683                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1684                });
1685            }
1686        }
1687    }
1688
1689    if matched_this_file {
1690        files_with_matches.fetch_add(1, Ordering::Relaxed);
1691    }
1692
1693    matches
1694}
1695
1696fn should_stop_search(
1697    truncated: &AtomicBool,
1698    total_matches: &AtomicUsize,
1699    stop_after: usize,
1700) -> bool {
1701    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1702}
1703
1704fn grep_scan_should_stop(
1705    stop_scan: Option<&Arc<AtomicBool>>,
1706    truncated: &AtomicBool,
1707    total_matches: &AtomicUsize,
1708    stop_after: usize,
1709) -> bool {
1710    stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1711        || should_stop_search(truncated, total_matches, stop_after)
1712}
1713
1714fn signal_grep_scan_cap(
1715    stop_scan: Option<&Arc<AtomicBool>>,
1716    total_matches: &AtomicUsize,
1717    stop_after: usize,
1718) {
1719    if let Some(flag) = stop_scan {
1720        if total_matches.load(Ordering::Relaxed) >= stop_after {
1721            flag.store(true, Ordering::Relaxed);
1722        }
1723    }
1724}
1725
1726fn search_file_metadata(metadata: &fs::Metadata) -> SearchFileMetadata {
1727    SearchFileMetadata {
1728        size: metadata.len(),
1729        modified: metadata.modified().unwrap_or(UNIX_EPOCH),
1730    }
1731}
1732
1733fn metadata_for_indexed_content(path: &Path, size_hint: u64) -> SearchFileMetadata {
1734    fs::metadata(path)
1735        .ok()
1736        .map(|metadata| search_file_metadata(&metadata))
1737        .unwrap_or(SearchFileMetadata {
1738            size: size_hint,
1739            modified: UNIX_EPOCH,
1740        })
1741}
1742
1743fn prepare_search_path(path: &Path, max_file_size: u64) -> PreparedSearchPath {
1744    let metadata = match fs::metadata(path) {
1745        Ok(metadata) if metadata.is_file() => search_file_metadata(&metadata),
1746        _ => return PreparedSearchPath::Skipped,
1747    };
1748
1749    if is_binary_path(path, metadata.size) || metadata.size > max_file_size {
1750        return PreparedSearchPath::Unindexed(metadata);
1751    }
1752
1753    let content = match fs::read(path) {
1754        Ok(content) => content,
1755        Err(_) => return PreparedSearchPath::Skipped,
1756    };
1757
1758    if is_binary_bytes(&content) {
1759        return PreparedSearchPath::Unindexed(metadata);
1760    }
1761
1762    PreparedSearchPath::Indexed(PreparedIndexedFile {
1763        metadata,
1764        content_hash: cache_freshness::hash_bytes(&content),
1765        trigram_map: trigram_filter_map(&content, true),
1766    })
1767}
1768
1769/// Returns the worker pool size for cold search-index builds: half of available
1770/// cores, capped at 8 to keep the same limit used by the callgraph store.
1771fn search_index_build_pool_size() -> usize {
1772    std::thread::available_parallelism()
1773        .map(|parallelism| parallelism.get())
1774        .unwrap_or(1)
1775        .div_ceil(2)
1776        .clamp(1, 8)
1777}
1778
1779#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1780struct SpillRecord {
1781    trigram: u32,
1782    file_id: u32,
1783    next_mask: u8,
1784    loc_mask: u8,
1785}
1786
1787struct CacheWritePlan {
1788    project_root: PathBuf,
1789    git_head: Option<String>,
1790    ignore_fingerprint: String,
1791    max_file_size: u64,
1792    files: Vec<FileEntry>,
1793    path_to_id: HashMap<PathBuf, u32>,
1794    unindexed_files: HashSet<u32>,
1795    file_trigram_count: Vec<u32>,
1796    id_map: Arc<HashMap<u32, u32>>,
1797}
1798
1799impl CacheWritePlan {
1800    fn from_index(index: &SearchIndex, git_head: Option<&str>) -> Option<Self> {
1801        let active_ids = index.active_file_ids();
1802        let mut id_map = HashMap::with_capacity(active_ids.len());
1803        for (new_id, old_id) in active_ids.iter().enumerate() {
1804            let new_id = u32::try_from(new_id).ok()?;
1805            id_map.insert(*old_id, new_id);
1806        }
1807
1808        let mut files = Vec::with_capacity(active_ids.len());
1809        let mut path_to_id = HashMap::with_capacity(active_ids.len());
1810        let mut unindexed_files = HashSet::new();
1811        let mut file_trigram_count = Vec::with_capacity(active_ids.len());
1812        for old_id in active_ids {
1813            let new_id = *id_map.get(&old_id)?;
1814            let file = index.files.get(old_id as usize)?.clone();
1815            if file.path.as_os_str().is_empty() {
1816                continue;
1817            }
1818            path_to_id.insert(file.path.clone(), new_id);
1819            if index.unindexed_files.contains(&old_id) {
1820                unindexed_files.insert(new_id);
1821            }
1822            file_trigram_count.push(
1823                index
1824                    .file_trigram_count
1825                    .get(old_id as usize)
1826                    .copied()
1827                    .unwrap_or(0),
1828            );
1829            files.push(file);
1830        }
1831
1832        Some(Self {
1833            project_root: index.project_root.clone(),
1834            git_head: git_head.map(ToOwned::to_owned),
1835            ignore_fingerprint: if index.ignore_rules_fingerprint.is_empty() {
1836                ignore_rules_fingerprint(&index.project_root)
1837            } else {
1838                index.ignore_rules_fingerprint.clone()
1839            },
1840            max_file_size: index.max_file_size,
1841            files,
1842            path_to_id,
1843            unindexed_files,
1844            file_trigram_count,
1845            id_map: Arc::new(id_map),
1846        })
1847    }
1848}
1849
1850trait PostingRecordSource {
1851    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>>;
1852}
1853
1854struct VecRecordSource {
1855    records: Vec<SpillRecord>,
1856    index: usize,
1857}
1858
1859impl VecRecordSource {
1860    fn new(records: Vec<SpillRecord>) -> Self {
1861        Self { records, index: 0 }
1862    }
1863}
1864
1865impl PostingRecordSource for VecRecordSource {
1866    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1867        let record = self.records.get(self.index).copied();
1868        if record.is_some() {
1869            self.index += 1;
1870        }
1871        Ok(record)
1872    }
1873}
1874
1875struct SpillSegmentSource {
1876    reader: BufReader<File>,
1877    remaining_records: u64,
1878    current_trigram: u32,
1879    remaining_in_group: u32,
1880}
1881
1882impl SpillSegmentSource {
1883    fn open(path: &Path) -> std::io::Result<Self> {
1884        let mut reader = BufReader::new(File::open(path)?);
1885        let mut magic = [0u8; 8];
1886        reader.read_exact(&mut magic)?;
1887        if &magic != SPILL_MAGIC {
1888            return Err(std::io::Error::other("invalid search spill magic"));
1889        }
1890        if read_u32(&mut reader)? != INDEX_VERSION {
1891            return Err(std::io::Error::other("invalid search spill version"));
1892        }
1893        let remaining_records = read_u64(&mut reader)?;
1894        Ok(Self {
1895            reader,
1896            remaining_records,
1897            current_trigram: 0,
1898            remaining_in_group: 0,
1899        })
1900    }
1901}
1902
1903impl PostingRecordSource for SpillSegmentSource {
1904    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1905        if self.remaining_records == 0 {
1906            return Ok(None);
1907        }
1908        if self.remaining_in_group == 0 {
1909            self.current_trigram = read_u32(&mut self.reader)?;
1910            self.remaining_in_group = read_u32(&mut self.reader)?;
1911            if self.remaining_in_group == 0 {
1912                return Err(std::io::Error::other("empty search spill group"));
1913            }
1914        }
1915        let mut file_id = [0u8; 4];
1916        self.reader.read_exact(&mut file_id)?;
1917        let mut masks = [0u8; 2];
1918        self.reader.read_exact(&mut masks)?;
1919        self.remaining_in_group -= 1;
1920        self.remaining_records -= 1;
1921        Ok(Some(SpillRecord {
1922            trigram: self.current_trigram,
1923            file_id: u32::from_le_bytes(file_id),
1924            next_mask: masks[0],
1925            loc_mask: masks[1],
1926        }))
1927    }
1928}
1929
1930struct BaseRecordSource {
1931    base: Arc<BasePostings>,
1932    id_map: Arc<HashMap<u32, u32>>,
1933    superseded: Arc<HashSet<u32>>,
1934    lookup_index: usize,
1935    current: Vec<SpillRecord>,
1936    current_index: usize,
1937}
1938
1939impl BaseRecordSource {
1940    fn new(
1941        base: Arc<BasePostings>,
1942        id_map: Arc<HashMap<u32, u32>>,
1943        superseded: Arc<HashSet<u32>>,
1944    ) -> Self {
1945        Self {
1946            base,
1947            id_map,
1948            superseded,
1949            lookup_index: 0,
1950            current: Vec::new(),
1951            current_index: 0,
1952        }
1953    }
1954
1955    fn load_next_group(&mut self) -> std::io::Result<bool> {
1956        while let Some(entry) = self.base.lookup.get(self.lookup_index).copied() {
1957            self.lookup_index += 1;
1958            let postings = self.base.read_postings(entry)?;
1959            self.current.clear();
1960            self.current_index = 0;
1961            for posting in postings {
1962                if self.superseded.contains(&posting.file_id) {
1963                    continue;
1964                }
1965                let Some(mapped_file_id) = self.id_map.get(&posting.file_id).copied() else {
1966                    continue;
1967                };
1968                self.current.push(SpillRecord {
1969                    trigram: entry.trigram,
1970                    file_id: mapped_file_id,
1971                    next_mask: posting.next_mask,
1972                    loc_mask: posting.loc_mask,
1973                });
1974            }
1975            if !self.current.is_empty() {
1976                return Ok(true);
1977            }
1978        }
1979        Ok(false)
1980    }
1981}
1982
1983impl PostingRecordSource for BaseRecordSource {
1984    fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1985        if self.current_index >= self.current.len() && !self.load_next_group()? {
1986            return Ok(None);
1987        }
1988        let record = self.current[self.current_index];
1989        self.current_index += 1;
1990        Ok(Some(record))
1991    }
1992}
1993
1994#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1995struct HeapItem {
1996    record: SpillRecord,
1997    source_index: usize,
1998}
1999
2000impl Ord for HeapItem {
2001    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
2002        other
2003            .record
2004            .trigram
2005            .cmp(&self.record.trigram)
2006            .then_with(|| other.record.file_id.cmp(&self.record.file_id))
2007            .then_with(|| other.source_index.cmp(&self.source_index))
2008    }
2009}
2010
2011impl PartialOrd for HeapItem {
2012    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
2013        Some(self.cmp(other))
2014    }
2015}
2016
2017fn build_streaming_index(
2018    root: &Path,
2019    max_file_size: u64,
2020    cache_dir: &Path,
2021) -> std::io::Result<(SearchIndex, usize)> {
2022    fs::create_dir_all(cache_dir)?;
2023    sweep_stale_search_build_dirs(cache_dir);
2024    let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
2025    let ignore_fingerprint = ignore_rules_fingerprint(&project_root);
2026    let filters = PathFilters::default();
2027    let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
2028    let pool_size = search_index_build_pool_size();
2029    let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
2030    let pool = rayon::ThreadPoolBuilder::new()
2031        .num_threads(pool_size)
2032        .thread_name(|index| format!("aft-search-build-{index}"))
2033        .stack_size(8 * 1024 * 1024)
2034        .build()
2035        .ok();
2036
2037    let spill_dir = create_spill_dir(cache_dir)?;
2038    let mut spill_paths = Vec::new();
2039    let mut spill_seq = 0usize;
2040    let mut block: Vec<SpillRecord> = Vec::new();
2041    let mut files = Vec::new();
2042    let mut path_to_id = HashMap::new();
2043    let mut unindexed_files = HashSet::new();
2044    let mut file_trigram_count = Vec::new();
2045    let mut indexed = 0usize;
2046
2047    let build_result = (|| -> std::io::Result<BasePostings> {
2048        for chunk in paths.chunks(chunk_size) {
2049            let prepare_chunk = || -> Vec<PreparedSearchPath> {
2050                chunk
2051                    .par_iter()
2052                    .map(|path| prepare_search_path(path, max_file_size))
2053                    .collect()
2054            };
2055            let prepared = match &pool {
2056                Some(pool) => pool.install(prepare_chunk),
2057                None => prepare_chunk(),
2058            };
2059
2060            for (path, prepared) in chunk.iter().zip(prepared) {
2061                match prepared {
2062                    PreparedSearchPath::Indexed(file) => {
2063                        let file_id = u32::try_from(files.len())
2064                            .map_err(|_| std::io::Error::other("too many files to index"))?;
2065                        files.push(FileEntry {
2066                            path: path.clone(),
2067                            size: file.metadata.size,
2068                            modified: file.metadata.modified,
2069                            content_hash: file.content_hash,
2070                        });
2071                        path_to_id.insert(path.clone(), file_id);
2072                        file_trigram_count.push(file.trigram_map.len() as u32);
2073                        for (trigram, filter) in file.trigram_map {
2074                            block.push(SpillRecord {
2075                                trigram,
2076                                file_id,
2077                                next_mask: filter.next_mask,
2078                                loc_mask: filter.loc_mask,
2079                            });
2080                        }
2081                        indexed += 1;
2082                    }
2083                    PreparedSearchPath::Unindexed(metadata) => {
2084                        let file_id = u32::try_from(files.len())
2085                            .map_err(|_| std::io::Error::other("too many files to index"))?;
2086                        files.push(FileEntry {
2087                            path: path.clone(),
2088                            size: metadata.size,
2089                            modified: metadata.modified,
2090                            content_hash: cache_freshness::zero_hash(),
2091                        });
2092                        path_to_id.insert(path.clone(), file_id);
2093                        unindexed_files.insert(file_id);
2094                        file_trigram_count.push(0);
2095                        indexed += 1;
2096                    }
2097                    PreparedSearchPath::Skipped => {}
2098                }
2099
2100                let block_bytes = block.len().saturating_mul(SPILL_RECORD_ESTIMATED_BYTES);
2101                if block_bytes >= SPIMI_SOFT_LIMIT_BYTES || block_bytes >= SPIMI_HARD_LIMIT_BYTES {
2102                    let path = flush_spill_segment(&spill_dir, spill_seq, &mut block)?;
2103                    spill_paths.push(path);
2104                    spill_seq += 1;
2105                }
2106            }
2107        }
2108
2109        block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2110        let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
2111        for path in &spill_paths {
2112            sources.push(Box::new(SpillSegmentSource::open(path)?));
2113        }
2114        if !block.is_empty() {
2115            sources.push(Box::new(VecRecordSource::new(std::mem::take(&mut block))));
2116        }
2117
2118        let plan = CacheWritePlan {
2119            project_root: project_root.clone(),
2120            git_head: current_git_head(&project_root),
2121            ignore_fingerprint: ignore_fingerprint.clone(),
2122            max_file_size,
2123            files: files.clone(),
2124            path_to_id: path_to_id.clone(),
2125            unindexed_files: unindexed_files.clone(),
2126            file_trigram_count: file_trigram_count.clone(),
2127            id_map: Arc::new(
2128                (0..files.len())
2129                    .filter_map(|id| {
2130                        let id = u32::try_from(id).ok()?;
2131                        Some((id, id))
2132                    })
2133                    .collect(),
2134            ),
2135        };
2136        write_cache_file_from_sources(cache_dir, &plan, &mut sources)
2137    })();
2138
2139    let _ = fs::remove_dir_all(&spill_dir);
2140    let base = build_result?;
2141    let base_file_count =
2142        u32::try_from(files.len()).map_err(|_| std::io::Error::other("too many files to index"))?;
2143    let git_head = current_git_head(&project_root);
2144    let index = SearchIndex {
2145        base: Some(Arc::new(base)),
2146        delta_postings: HashMap::new(),
2147        delta_file_trigrams: HashMap::new(),
2148        files: Arc::new(files),
2149        path_to_id: Arc::new(path_to_id),
2150        ready: false,
2151        project_root,
2152        git_head,
2153        max_file_size,
2154        ignore_rules_fingerprint: ignore_fingerprint,
2155        file_trigram_count: Arc::new(file_trigram_count),
2156        unindexed_files: Arc::new(unindexed_files),
2157        superseded: HashSet::new(),
2158        base_file_count,
2159        delta_packed_bytes: 0,
2160        compaction_state: Arc::new(Mutex::new(CompactionState::default())),
2161    };
2162    Ok((index, indexed))
2163}
2164
2165fn write_cache_file_from_sources(
2166    cache_dir: &Path,
2167    plan: &CacheWritePlan,
2168    sources: &mut [Box<dyn PostingRecordSource>],
2169) -> std::io::Result<BasePostings> {
2170    fs::create_dir_all(cache_dir)?;
2171    sweep_stale_search_build_dirs(cache_dir);
2172    let cache_path = cache_dir.join("cache.bin");
2173    let tmp_cache = cache_dir.join(format!(
2174        "cache.bin.tmp.{}.{}",
2175        std::process::id(),
2176        SystemTime::now()
2177            .duration_since(UNIX_EPOCH)
2178            .unwrap_or(Duration::ZERO)
2179            .as_nanos()
2180    ));
2181
2182    let write_result = (|| -> std::io::Result<BasePostings> {
2183        let raw = OpenOptions::new()
2184            .write(true)
2185            .create_new(true)
2186            .open(&tmp_cache)?;
2187        let mut writer = BufWriter::new(raw);
2188        write_u32(&mut writer, CACHE_MAGIC)?;
2189        write_u32(&mut writer, INDEX_VERSION)?;
2190        let postings_len_patch = writer.stream_position()?;
2191        write_u64(&mut writer, 0)?;
2192
2193        let postings_section_start = writer.stream_position()?;
2194        let postings_header = build_postings_header_bytes(plan)?;
2195        writer.write_all(&postings_header)?;
2196        let postings_blob_len_patch = writer.stream_position()?;
2197        write_u64(&mut writer, 0)?;
2198        let postings_blob_start = writer.stream_position()?;
2199
2200        let (lookup_entries, postings_blob_len) = merge_sources_to_writer(sources, &mut writer)?;
2201        let extension = build_file_trigram_count_extension(&plan.file_trigram_count)?;
2202        writer.write_all(&extension)?;
2203        let postings_crc_end = writer.stream_position()?;
2204
2205        writer.flush()?;
2206        writer.seek(SeekFrom::Start(postings_blob_len_patch))?;
2207        write_u64(&mut writer, postings_blob_len)?;
2208        writer.flush()?;
2209
2210        let checksum = crc32_file_range(
2211            &tmp_cache,
2212            postings_section_start,
2213            postings_crc_end.saturating_sub(postings_section_start),
2214        )?;
2215        writer.seek(SeekFrom::Start(postings_crc_end))?;
2216        writer.write_all(&checksum.to_le_bytes())?;
2217        let postings_section_end = writer.stream_position()?;
2218        let postings_len_total = postings_section_end.saturating_sub(postings_section_start);
2219        writer.seek(SeekFrom::Start(postings_len_patch))?;
2220        write_u64(&mut writer, postings_len_total)?;
2221        writer.seek(SeekFrom::Start(postings_section_end))?;
2222
2223        let lookup_blob = build_lookup_section_bytes(&lookup_entries)?;
2224        writer.write_all(&lookup_blob)?;
2225        writer.flush()?;
2226        writer.get_ref().sync_all()?;
2227        drop(writer);
2228
2229        fs::rename(&tmp_cache, &cache_path)?;
2230        sync_parent_dir(&cache_path);
2231        let file = open_cache_file_read(&cache_path)?;
2232        Ok(BasePostings {
2233            file: Arc::new(file),
2234            postings_blob_start,
2235            postings_blob_len,
2236            lookup: Arc::new(lookup_entries),
2237        })
2238    })();
2239
2240    if write_result.is_err() {
2241        let _ = fs::remove_file(&tmp_cache);
2242    }
2243    write_result
2244}
2245
2246fn merge_sources_to_writer(
2247    sources: &mut [Box<dyn PostingRecordSource>],
2248    writer: &mut BufWriter<File>,
2249) -> std::io::Result<(Vec<LookupEntry>, u64)> {
2250    let mut heap = BinaryHeap::new();
2251    for (source_index, source) in sources.iter_mut().enumerate() {
2252        if let Some(record) = source.next_record()? {
2253            heap.push(HeapItem {
2254                record,
2255                source_index,
2256            });
2257        }
2258    }
2259
2260    let mut lookup_entries = Vec::new();
2261    let mut postings_blob_len = 0u64;
2262    let mut current_trigram: Option<u32> = None;
2263    let mut current_offset = 0u64;
2264    let mut current_count = 0u32;
2265
2266    while let Some(item) = heap.pop() {
2267        let record = item.record;
2268        if current_trigram != Some(record.trigram) {
2269            if let Some(trigram) = current_trigram {
2270                lookup_entries.push(LookupEntry {
2271                    trigram,
2272                    offset: current_offset,
2273                    count: current_count,
2274                });
2275            }
2276            current_trigram = Some(record.trigram);
2277            current_offset = postings_blob_len;
2278            current_count = 0;
2279        }
2280
2281        writer.write_all(&record.file_id.to_le_bytes())?;
2282        writer.write_all(&[record.next_mask, record.loc_mask])?;
2283        postings_blob_len = postings_blob_len
2284            .checked_add(POSTING_BYTES as u64)
2285            .ok_or_else(|| std::io::Error::other("postings blob too large"))?;
2286        current_count = current_count
2287            .checked_add(1)
2288            .ok_or_else(|| std::io::Error::other("posting list too large"))?;
2289
2290        if let Some(next) = sources[item.source_index].next_record()? {
2291            heap.push(HeapItem {
2292                record: next,
2293                source_index: item.source_index,
2294            });
2295        }
2296    }
2297
2298    if let Some(trigram) = current_trigram {
2299        lookup_entries.push(LookupEntry {
2300            trigram,
2301            offset: current_offset,
2302            count: current_count,
2303        });
2304    }
2305
2306    Ok((lookup_entries, postings_blob_len))
2307}
2308
2309fn build_postings_header_bytes(plan: &CacheWritePlan) -> std::io::Result<Vec<u8>> {
2310    let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2311    writer.write_all(INDEX_MAGIC)?;
2312    write_u32(&mut writer, INDEX_VERSION)?;
2313
2314    let head = plan.git_head.as_deref().unwrap_or_default();
2315    let root = plan.project_root.to_string_lossy();
2316    let head_len = u32::try_from(head.len())
2317        .map_err(|_| std::io::Error::other("git head too large to cache"))?;
2318    let root_len = u32::try_from(root.len())
2319        .map_err(|_| std::io::Error::other("project root too large to cache"))?;
2320    let ignore_fingerprint_len = u32::try_from(plan.ignore_fingerprint.len())
2321        .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
2322    let file_count = u32::try_from(plan.files.len())
2323        .map_err(|_| std::io::Error::other("too many files to cache"))?;
2324
2325    write_u32(&mut writer, head_len)?;
2326    write_u32(&mut writer, root_len)?;
2327    write_u32(&mut writer, ignore_fingerprint_len)?;
2328    write_u64(&mut writer, plan.max_file_size)?;
2329    write_u32(&mut writer, file_count)?;
2330    writer.write_all(head.as_bytes())?;
2331    writer.write_all(root.as_bytes())?;
2332    writer.write_all(plan.ignore_fingerprint.as_bytes())?;
2333
2334    for (file_id, file) in plan.files.iter().enumerate() {
2335        let file_id =
2336            u32::try_from(file_id).map_err(|_| std::io::Error::other("too many files to cache"))?;
2337        let path = cache_relative_path(&plan.project_root, &file.path)
2338            .or_else(|| {
2339                fs::canonicalize(&file.path)
2340                    .ok()
2341                    .and_then(|canonical| cache_relative_path(&plan.project_root, &canonical))
2342            })
2343            .ok_or_else(|| {
2344                std::io::Error::other(format!(
2345                    "refusing to cache path outside project root: {}",
2346                    file.path.display()
2347                ))
2348            })?;
2349        let path = path.to_string_lossy();
2350        let path_len = u32::try_from(path.len())
2351            .map_err(|_| std::io::Error::other("cached path too large"))?;
2352        let modified = file
2353            .modified
2354            .duration_since(UNIX_EPOCH)
2355            .unwrap_or(Duration::ZERO);
2356        let unindexed = if plan.unindexed_files.contains(&file_id) {
2357            1u8
2358        } else {
2359            0u8
2360        };
2361
2362        writer.write_all(&[unindexed])?;
2363        write_u32(&mut writer, path_len)?;
2364        write_u64(&mut writer, file.size)?;
2365        write_u64(&mut writer, modified.as_secs())?;
2366        write_u32(&mut writer, modified.subsec_nanos())?;
2367        writer.write_all(file.content_hash.as_bytes())?;
2368        writer.write_all(path.as_bytes())?;
2369    }
2370
2371    writer.flush()?;
2372    Ok(writer
2373        .into_inner()
2374        .map_err(|error| std::io::Error::other(error.to_string()))?
2375        .into_inner())
2376}
2377
2378fn build_lookup_section_bytes(lookup_entries: &[LookupEntry]) -> std::io::Result<Vec<u8>> {
2379    let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2380    let entry_count = u32::try_from(lookup_entries.len())
2381        .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
2382    writer.write_all(LOOKUP_MAGIC)?;
2383    write_u32(&mut writer, INDEX_VERSION)?;
2384    write_u32(&mut writer, entry_count)?;
2385    for entry in lookup_entries {
2386        write_u32(&mut writer, entry.trigram)?;
2387        write_u64(&mut writer, entry.offset)?;
2388        write_u32(&mut writer, entry.count)?;
2389    }
2390    writer.flush()?;
2391    let mut lookup_blob = writer
2392        .into_inner()
2393        .map_err(|error| std::io::Error::other(error.to_string()))?
2394        .into_inner();
2395    let checksum = crc32fast::hash(&lookup_blob);
2396    lookup_blob.extend_from_slice(&checksum.to_le_bytes());
2397    Ok(lookup_blob)
2398}
2399
2400fn build_file_trigram_count_extension(counts: &[u32]) -> std::io::Result<Vec<u8>> {
2401    let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2402    writer.write_all(FILE_TRIGRAM_COUNT_MAGIC)?;
2403    write_u32(&mut writer, INDEX_VERSION)?;
2404    write_u32(
2405        &mut writer,
2406        u32::try_from(counts.len())
2407            .map_err(|_| std::io::Error::other("too many file trigram counts"))?,
2408    )?;
2409    for count in counts {
2410        write_u32(&mut writer, *count)?;
2411    }
2412    writer.flush()?;
2413    Ok(writer
2414        .into_inner()
2415        .map_err(|error| std::io::Error::other(error.to_string()))?
2416        .into_inner())
2417}
2418
2419fn flush_spill_segment(
2420    spill_dir: &Path,
2421    seq: usize,
2422    block: &mut Vec<SpillRecord>,
2423) -> std::io::Result<PathBuf> {
2424    if block.is_empty() {
2425        return Err(std::io::Error::other(
2426            "refusing to write empty search spill",
2427        ));
2428    }
2429    block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2430    let path = spill_dir.join(format!("segment.{seq:06}.bin"));
2431    let mut writer = BufWriter::new(File::create(&path)?);
2432    writer.write_all(SPILL_MAGIC)?;
2433    write_u32(&mut writer, INDEX_VERSION)?;
2434    write_u64(
2435        &mut writer,
2436        u64::try_from(block.len()).map_err(|_| std::io::Error::other("search spill too large"))?,
2437    )?;
2438
2439    let mut index = 0usize;
2440    while index < block.len() {
2441        let trigram = block[index].trigram;
2442        let group_start = index;
2443        while index < block.len() && block[index].trigram == trigram {
2444            index += 1;
2445        }
2446        write_u32(&mut writer, trigram)?;
2447        write_u32(
2448            &mut writer,
2449            u32::try_from(index - group_start)
2450                .map_err(|_| std::io::Error::other("search spill group too large"))?,
2451        )?;
2452        for record in &block[group_start..index] {
2453            writer.write_all(&record.file_id.to_le_bytes())?;
2454            writer.write_all(&[record.next_mask, record.loc_mask])?;
2455        }
2456    }
2457    writer.flush()?;
2458    writer.get_ref().sync_all()?;
2459    block.clear();
2460    Ok(path)
2461}
2462
2463fn create_spill_dir(cache_dir: &Path) -> std::io::Result<PathBuf> {
2464    let dir = cache_dir.join(format!(
2465        "search-build.tmp.{}.{}",
2466        std::process::id(),
2467        SystemTime::now()
2468            .duration_since(UNIX_EPOCH)
2469            .unwrap_or(Duration::ZERO)
2470            .as_nanos()
2471    ));
2472    fs::create_dir_all(&dir)?;
2473    Ok(dir)
2474}
2475
2476fn sweep_stale_search_build_dirs(cache_dir: &Path) {
2477    let Ok(entries) = fs::read_dir(cache_dir) else {
2478        return;
2479    };
2480    for entry in entries.flatten() {
2481        let file_name = entry.file_name();
2482        if file_name.to_string_lossy().starts_with("search-build.tmp.") {
2483            let _ = fs::remove_dir_all(entry.path());
2484        }
2485    }
2486}
2487
2488fn transient_search_cache_dir(root: &Path) -> PathBuf {
2489    std::env::temp_dir().join(format!(
2490        "aft-search-cache.{}.{}.{}",
2491        artifact_cache_key(root),
2492        std::process::id(),
2493        SystemTime::now()
2494            .duration_since(UNIX_EPOCH)
2495            .unwrap_or(Duration::ZERO)
2496            .as_nanos()
2497    ))
2498}
2499
2500fn read_file_trigram_count_extension(
2501    base: &BasePostings,
2502    extension_start: u64,
2503    postings_body_end: u64,
2504    file_count: usize,
2505) -> std::io::Result<Option<Vec<u32>>> {
2506    if extension_start >= postings_body_end {
2507        return Ok(None);
2508    }
2509    let extension_len = postings_body_end - extension_start;
2510    if extension_len < 16 {
2511        return Ok(None);
2512    }
2513    let mut header = [0u8; 16];
2514    pread_exact(&base.file, extension_start, &mut header)?;
2515    if &header[..8] != FILE_TRIGRAM_COUNT_MAGIC {
2516        return Ok(None);
2517    }
2518    let version = u32::from_le_bytes([header[8], header[9], header[10], header[11]]);
2519    if version != INDEX_VERSION {
2520        return Err(std::io::Error::other("invalid file trigram count version"));
2521    }
2522    let count = u32::from_le_bytes([header[12], header[13], header[14], header[15]]) as usize;
2523    if count != file_count {
2524        return Err(std::io::Error::other("file trigram count length mismatch"));
2525    }
2526    let counts_len = count
2527        .checked_mul(4)
2528        .ok_or_else(|| std::io::Error::other("file trigram count extension too large"))?;
2529    if 16u64 + counts_len as u64 > extension_len {
2530        return Err(std::io::Error::other(
2531            "truncated file trigram count extension",
2532        ));
2533    }
2534    let mut bytes = vec![0u8; counts_len];
2535    pread_exact(&base.file, extension_start + 16, &mut bytes)?;
2536    let mut counts = Vec::with_capacity(count);
2537    for chunk in bytes.chunks_exact(4) {
2538        counts.push(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
2539    }
2540    Ok(Some(counts))
2541}
2542
2543fn compute_file_trigram_counts_from_base(
2544    base: &BasePostings,
2545    file_count: usize,
2546) -> std::io::Result<Vec<u32>> {
2547    let mut counts = vec![0u32; file_count];
2548    for entry in base.lookup.iter().copied() {
2549        for posting in base.read_postings(entry)? {
2550            let Some(count) = counts.get_mut(posting.file_id as usize) else {
2551                return Err(std::io::Error::other("posting references missing file"));
2552            };
2553            *count = count.saturating_add(1);
2554        }
2555    }
2556    Ok(counts)
2557}
2558
2559fn ensure_count_slot(counts: &mut Vec<u32>, file_id: u32) {
2560    let len = file_id as usize + 1;
2561    if counts.len() < len {
2562        counts.resize(len, 0);
2563    }
2564}
2565
2566fn reader_has_remaining<R: Seek>(
2567    reader: &mut R,
2568    absolute_end: u64,
2569    len: usize,
2570) -> std::io::Result<bool> {
2571    let position = reader.stream_position()?;
2572    Ok(position <= absolute_end && (len as u64) <= absolute_end - position)
2573}
2574
2575fn crc32_file_range(path: &Path, start: u64, len: u64) -> std::io::Result<u32> {
2576    let mut file = File::open(path)?;
2577    file.seek(SeekFrom::Start(start))?;
2578    let mut hasher = crc32fast::Hasher::new();
2579    let mut remaining = len;
2580    let mut buffer = vec![0u8; 1024 * 1024];
2581    while remaining > 0 {
2582        let read_len = buffer.len().min(remaining as usize);
2583        let bytes_read = file.read(&mut buffer[..read_len])?;
2584        if bytes_read == 0 {
2585            return Err(std::io::Error::new(
2586                std::io::ErrorKind::UnexpectedEof,
2587                "truncated cache while checksumming",
2588            ));
2589        }
2590        hasher.update(&buffer[..bytes_read]);
2591        remaining -= bytes_read as u64;
2592    }
2593    Ok(hasher.finalize())
2594}
2595
2596fn sync_parent_dir(path: &Path) {
2597    if let Some(parent) = path.parent() {
2598        if let Ok(dir) = File::open(parent) {
2599            let _ = dir.sync_all();
2600        }
2601    }
2602}
2603
2604fn open_cache_file_read(path: &Path) -> std::io::Result<File> {
2605    let mut options = OpenOptions::new();
2606    options.read(true);
2607    #[cfg(windows)]
2608    {
2609        use std::os::windows::fs::OpenOptionsExt;
2610        const FILE_SHARE_READ: u32 = 0x0000_0001;
2611        const FILE_SHARE_WRITE: u32 = 0x0000_0002;
2612        const FILE_SHARE_DELETE: u32 = 0x0000_0004;
2613        options.share_mode(FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE);
2614    }
2615    options.open(path)
2616}
2617
2618#[cfg(unix)]
2619fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
2620    use std::os::unix::fs::FileExt;
2621    while !buffer.is_empty() {
2622        let bytes_read = file.read_at(buffer, offset)?;
2623        if bytes_read == 0 {
2624            return Err(std::io::Error::new(
2625                std::io::ErrorKind::UnexpectedEof,
2626                "short pread from search cache",
2627            ));
2628        }
2629        offset += bytes_read as u64;
2630        let (_, rest) = buffer.split_at_mut(bytes_read);
2631        buffer = rest;
2632    }
2633    Ok(())
2634}
2635
2636#[cfg(windows)]
2637fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
2638    use std::os::windows::fs::FileExt;
2639    while !buffer.is_empty() {
2640        let bytes_read = file.seek_read(buffer, offset)?;
2641        if bytes_read == 0 {
2642            return Err(std::io::Error::new(
2643                std::io::ErrorKind::UnexpectedEof,
2644                "short pread from search cache",
2645            ));
2646        }
2647        offset += bytes_read as u64;
2648        let (_, rest) = buffer.split_at_mut(bytes_read);
2649        buffer = rest;
2650    }
2651    Ok(())
2652}
2653
2654fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
2655    let mut merged = Vec::with_capacity(left.len().min(right.len()));
2656    let mut left_index = 0;
2657    let mut right_index = 0;
2658
2659    while left_index < left.len() && right_index < right.len() {
2660        match left[left_index].cmp(&right[right_index]) {
2661            std::cmp::Ordering::Less => left_index += 1,
2662            std::cmp::Ordering::Greater => right_index += 1,
2663            std::cmp::Ordering::Equal => {
2664                merged.push(left[left_index]);
2665                left_index += 1;
2666                right_index += 1;
2667            }
2668        }
2669    }
2670
2671    merged
2672}
2673
2674fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
2675    let mut merged = Vec::with_capacity(left.len() + right.len());
2676    let mut left_index = 0;
2677    let mut right_index = 0;
2678
2679    while left_index < left.len() && right_index < right.len() {
2680        match left[left_index].cmp(&right[right_index]) {
2681            std::cmp::Ordering::Less => {
2682                merged.push(left[left_index]);
2683                left_index += 1;
2684            }
2685            std::cmp::Ordering::Greater => {
2686                merged.push(right[right_index]);
2687                right_index += 1;
2688            }
2689            std::cmp::Ordering::Equal => {
2690                merged.push(left[left_index]);
2691                left_index += 1;
2692                right_index += 1;
2693            }
2694        }
2695    }
2696
2697    merged.extend_from_slice(&left[left_index..]);
2698    merged.extend_from_slice(&right[right_index..]);
2699    merged
2700}
2701
2702pub fn decompose_regex(pattern: &str) -> RegexQuery {
2703    let hir = match regex_syntax::parse(pattern) {
2704        Ok(hir) => hir,
2705        Err(_) => return RegexQuery::default(),
2706    };
2707
2708    let build = build_query(&hir);
2709    build.into_query()
2710}
2711
2712pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
2713    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
2714}
2715
2716pub fn normalize_char(c: u8) -> u8 {
2717    c.to_ascii_lowercase()
2718}
2719
2720fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
2721    if content.len() < 3 {
2722        return;
2723    }
2724
2725    for start in 0..=content.len() - 3 {
2726        let trigram = pack_trigram(
2727            normalize_char(content[start]),
2728            normalize_char(content[start + 1]),
2729            normalize_char(content[start + 2]),
2730        );
2731        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
2732        visit(trigram, next_char, start);
2733    }
2734}
2735
2736pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
2737    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
2738    scan_trigrams(content, |trigram, next_char, position| {
2739        trigrams.push((trigram, next_char, position));
2740    });
2741    trigrams
2742}
2743
2744fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
2745    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2746    scan_trigrams(content, |trigram, next_char, position| {
2747        let entry = filters.entry(trigram).or_default();
2748        if include_eof_next_char || next_char != EOF_SENTINEL {
2749            entry.next_mask |= mask_for_next_char(next_char);
2750        }
2751        entry.loc_mask |= mask_for_position(position);
2752    });
2753    filters
2754}
2755
2756pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
2757    let mut seen = HashSet::new();
2758    let mut out = Vec::new();
2759    for token in tokens {
2760        scan_trigrams(token.as_bytes(), |trigram, _, _| {
2761            if seen.insert(trigram) {
2762                out.push(trigram);
2763            }
2764        });
2765    }
2766    out
2767}
2768
2769pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
2770    lexical_score_snapshot(&index.snapshot(), query_trigrams, file_id)
2771}
2772
2773fn lexical_score_snapshot(
2774    index: &SearchIndexSnapshot,
2775    query_trigrams: &[u32],
2776    file_id: u32,
2777) -> f32 {
2778    if query_trigrams.is_empty() {
2779        return 0.0;
2780    }
2781
2782    let mut hits = 0u32;
2783    for &trigram in query_trigrams {
2784        let postings = index.postings_for_trigram(trigram, None);
2785        if postings.binary_search(&file_id).is_ok() {
2786            hits += 1;
2787        }
2788    }
2789
2790    if hits == 0 {
2791        return 0.0;
2792    }
2793
2794    let file_trigram_count = index
2795        .file_trigram_count
2796        .get(file_id as usize)
2797        .copied()
2798        .unwrap_or(1)
2799        .max(1) as f32;
2800    (hits as f32) / (1.0 + file_trigram_count.ln())
2801}
2802
2803pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
2804    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
2805    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
2806        return PathBuf::from(override_dir)
2807            .join("index")
2808            .join(artifact_cache_key(project_root));
2809    }
2810    // Use configured storage dir (from plugin, XDG-compliant)
2811    if let Some(dir) = storage_dir {
2812        return dir.join("index").join(artifact_cache_key(project_root));
2813    }
2814    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
2815    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
2816    // If neither is set, fall back to a temp directory rather than `"."`
2817    // because the search-index code reads/writes absolute paths.
2818    let home = std::env::var_os("HOME")
2819        .or_else(|| std::env::var_os("USERPROFILE"))
2820        .map(PathBuf::from)
2821        .unwrap_or_else(std::env::temp_dir);
2822    home.join(".cache")
2823        .join("aft")
2824        .join("index")
2825        .join(artifact_cache_key(project_root))
2826}
2827
2828pub(crate) fn build_path_filters(
2829    include: &[String],
2830    exclude: &[String],
2831) -> Result<PathFilters, String> {
2832    Ok(PathFilters {
2833        includes: build_globset(include)?,
2834        excludes: build_globset(exclude)?,
2835    })
2836}
2837
2838pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
2839    walk_project_files_from(root, root, filters)
2840}
2841
2842pub fn walk_project_files_bounded_default(
2843    root: &Path,
2844    max_files: usize,
2845) -> Result<Vec<PathBuf>, usize> {
2846    walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
2847}
2848
2849pub(crate) fn walk_project_files_bounded_matching<F>(
2850    root: &Path,
2851    filters: &PathFilters,
2852    max_files: usize,
2853    matches_file: F,
2854) -> Result<Vec<PathBuf>, usize>
2855where
2856    F: Fn(&Path) -> bool,
2857{
2858    walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
2859}
2860
2861pub fn walk_project_files_bounded_default_matching<F>(
2862    root: &Path,
2863    max_files: usize,
2864    matches_file: F,
2865) -> Result<Vec<PathBuf>, usize>
2866where
2867    F: Fn(&Path) -> bool,
2868{
2869    walk_project_files_from_inner_matching(
2870        root,
2871        root,
2872        &PathFilters::default(),
2873        Some(max_files),
2874        matches_file,
2875        true,
2876    )
2877}
2878
2879pub(crate) fn walk_project_files_from(
2880    filter_root: &Path,
2881    search_root: &Path,
2882    filters: &PathFilters,
2883) -> Vec<PathBuf> {
2884    walk_project_files_from_inner(filter_root, search_root, filters, None, true)
2885        .expect("unbounded project walk cannot exceed a file limit")
2886}
2887
2888pub(crate) fn has_any_project_file_from(
2889    filter_root: &Path,
2890    search_root: &Path,
2891    filters: &PathFilters,
2892) -> bool {
2893    walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
2894}
2895
2896fn walk_project_files_from_inner(
2897    filter_root: &Path,
2898    search_root: &Path,
2899    filters: &PathFilters,
2900    max_files: Option<usize>,
2901    sort_by_mtime: bool,
2902) -> Result<Vec<PathBuf>, usize> {
2903    walk_project_files_from_inner_matching(
2904        filter_root,
2905        search_root,
2906        filters,
2907        max_files,
2908        |_| true,
2909        sort_by_mtime,
2910    )
2911}
2912
2913fn project_walk_builder(search_root: &Path) -> WalkBuilder {
2914    let mut builder = WalkBuilder::new(search_root);
2915    builder
2916        .hidden(false)
2917        .git_ignore(true)
2918        .git_global(true)
2919        .git_exclude(true)
2920        .add_custom_ignore_filename(".aftignore")
2921        .filter_entry(|entry| {
2922            let name = entry.file_name().to_string_lossy();
2923            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2924                return !matches!(
2925                    name.as_ref(),
2926                    "node_modules"
2927                        | "target"
2928                        | "venv"
2929                        | ".venv"
2930                        | ".git"
2931                        | "__pycache__"
2932                        | ".tox"
2933                        | "dist"
2934                        | "build"
2935                );
2936            }
2937            true
2938        });
2939    builder
2940}
2941
2942fn walk_project_files_from_inner_matching<F>(
2943    filter_root: &Path,
2944    search_root: &Path,
2945    filters: &PathFilters,
2946    max_files: Option<usize>,
2947    matches_file: F,
2948    sort_by_mtime: bool,
2949) -> Result<Vec<PathBuf>, usize>
2950where
2951    F: Fn(&Path) -> bool,
2952{
2953    let builder = project_walk_builder(search_root);
2954
2955    let mut files = Vec::new();
2956    for entry in builder.build().filter_map(|entry| entry.ok()) {
2957        if !entry
2958            .file_type()
2959            .map_or(false, |file_type| file_type.is_file())
2960        {
2961            continue;
2962        }
2963        let path = entry.into_path();
2964        if filters.matches(filter_root, &path) && matches_file(&path) {
2965            files.push(path);
2966            if max_files.is_some_and(|limit| files.len() > limit) {
2967                return Err(files.len());
2968            }
2969        }
2970    }
2971
2972    if sort_by_mtime {
2973        sort_paths_by_mtime_desc(&mut files);
2974    }
2975    Ok(files)
2976}
2977
2978pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
2979    let bytes = fs::read(path).ok()?;
2980    if is_binary_bytes(&bytes) {
2981        return None;
2982    }
2983    String::from_utf8(bytes).ok()
2984}
2985
2986fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
2987    fs::read(path).ok()
2988}
2989
2990pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
2991    path.strip_prefix(root)
2992        .map(PathBuf::from)
2993        .unwrap_or_else(|_| path.to_path_buf())
2994}
2995
2996pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
2997    let normalized_root = normalize_path(root);
2998    let normalized_path = normalize_path(path);
2999    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
3000    validate_cached_relative_path(relative)
3001}
3002
3003pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
3004    let relative = validate_cached_relative_path(relative_path)?;
3005    let normalized_root = normalize_path(root);
3006    let full_path = normalize_path(&normalized_root.join(relative));
3007
3008    match fs::canonicalize(&full_path) {
3009        Ok(canonical_path) => {
3010            if canonical_path.starts_with(&normalized_root) {
3011                return Some(full_path);
3012            }
3013
3014            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
3015            canonical_path
3016                .starts_with(&canonical_root)
3017                .then_some(full_path)
3018        }
3019        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
3020    }
3021}
3022
3023pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
3024    if path.is_absolute() {
3025        return None;
3026    }
3027
3028    let mut normalized = PathBuf::new();
3029    for component in path.components() {
3030        match component {
3031            Component::Normal(part) => normalized.push(part),
3032            Component::CurDir => {}
3033            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
3034        }
3035    }
3036    (!normalized.as_os_str().is_empty()).then_some(normalized)
3037}
3038
3039/// Sort paths newest-first by mtime, falling back to normalized display-path order.
3040///
3041/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
3042/// `sort_by()` closure. That made the comparator non-deterministic — a
3043/// `stat()` syscall for the same path can return different values across
3044/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
3045/// concurrent file-watcher activity), and Rust's slice::sort panics at
3046/// runtime when it detects a non-total-order comparator. CI hit this on
3047/// a Pi e2e test where the bridge invalidated files in parallel with grep.
3048///
3049/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
3050/// from the map inside the closure. Pure function ⇒ guaranteed total order.
3051pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
3052    use std::collections::HashMap;
3053    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
3054    let mut display_paths: HashMap<PathBuf, String> = HashMap::with_capacity(paths.len());
3055    for path in paths.iter() {
3056        mtimes
3057            .entry(path.clone())
3058            .or_insert_with(|| path_modified_time(path));
3059        display_paths
3060            .entry(path.clone())
3061            .or_insert_with(|| normalized_display_sort_key(None, path));
3062    }
3063    paths.sort_by(|left, right| {
3064        let left_mtime = mtimes.get(left).and_then(|v| *v);
3065        let right_mtime = mtimes.get(right).and_then(|v| *v);
3066        let left_display = display_paths
3067            .get(left)
3068            .map(String::as_bytes)
3069            .unwrap_or_default();
3070        let right_display = display_paths
3071            .get(right)
3072            .map(String::as_bytes)
3073            .unwrap_or_default();
3074        right_mtime
3075            .cmp(&left_mtime)
3076            .then_with(|| left_display.cmp(right_display))
3077    });
3078}
3079
3080/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
3081/// the sort. Same fix, applied to grep matches that share files.
3082pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
3083    use std::collections::HashMap;
3084    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
3085    let mut display_paths: HashMap<PathBuf, String> = HashMap::with_capacity(matches.len());
3086    for m in matches.iter() {
3087        mtimes.entry(m.file.clone()).or_insert_with(|| {
3088            let resolved = resolve_match_path(project_root, &m.file);
3089            path_modified_time(&resolved)
3090        });
3091        display_paths
3092            .entry(m.file.clone())
3093            .or_insert_with(|| normalized_display_sort_key(Some(project_root), &m.file));
3094    }
3095    matches.sort_by(|left, right| {
3096        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
3097        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
3098        let left_display = display_paths
3099            .get(&left.file)
3100            .map(String::as_bytes)
3101            .unwrap_or_default();
3102        let right_display = display_paths
3103            .get(&right.file)
3104            .map(String::as_bytes)
3105            .unwrap_or_default();
3106        // The display-path tiebreak makes complete result sets deterministic.
3107        // If a parallel grep stops early after hitting a cap, the capped subset
3108        // can still depend on which worker reaches the cap first.
3109        right_mtime
3110            .cmp(&left_mtime)
3111            .then_with(|| left_display.cmp(right_display))
3112            .then_with(|| left.line.cmp(&right.line))
3113            .then_with(|| left.column.cmp(&right.column))
3114    });
3115}
3116
3117/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
3118/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
3119/// table from the search index), but it can still return different values if
3120/// the file is modified mid-sort. Snapshot once.
3121fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
3122    matches: &mut [SharedGrepMatch],
3123    project_root: &Path,
3124    modified_for_path: F,
3125) where
3126    F: Fn(&Path) -> Option<SystemTime>,
3127{
3128    use std::collections::HashMap;
3129    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
3130    let mut display_paths: HashMap<PathBuf, String> = HashMap::with_capacity(matches.len());
3131    for m in matches.iter() {
3132        let path = m.file.as_path().to_path_buf();
3133        mtimes
3134            .entry(path.clone())
3135            .or_insert_with(|| modified_for_path(&path));
3136        display_paths
3137            .entry(path.clone())
3138            .or_insert_with(|| normalized_display_sort_key(Some(project_root), &path));
3139    }
3140    matches.sort_by(|left, right| {
3141        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
3142        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
3143        let left_display = display_paths
3144            .get(left.file.as_path())
3145            .map(String::as_bytes)
3146            .unwrap_or_default();
3147        let right_display = display_paths
3148            .get(right.file.as_path())
3149            .map(String::as_bytes)
3150            .unwrap_or_default();
3151        // The display-path tiebreak makes complete result sets deterministic.
3152        // If a parallel grep stops early after hitting a cap, the capped subset
3153        // can still depend on which worker reaches the cap first.
3154        right_mtime
3155            .cmp(&left_mtime)
3156            .then_with(|| left_display.cmp(right_display))
3157            .then_with(|| left.line.cmp(&right.line))
3158            .then_with(|| left.column.cmp(&right.column))
3159    });
3160}
3161
3162pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
3163    let resolved_project_root = canonicalize_or_normalize(project_root);
3164    let root = match path {
3165        Some(path) => {
3166            let path = PathBuf::from(path);
3167            if path.is_absolute() {
3168                canonicalize_or_normalize(&path)
3169            } else {
3170                normalize_path(&resolved_project_root.join(path))
3171            }
3172        }
3173        None => resolved_project_root.clone(),
3174    };
3175
3176    let use_index = is_within_search_root(&resolved_project_root, &root);
3177    SearchScope { root, use_index }
3178}
3179
3180pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
3181    content_inspector::inspect(content).is_binary()
3182}
3183
3184pub(crate) fn current_git_head(root: &Path) -> Option<String> {
3185    run_git(root, &["rev-parse", "HEAD"])
3186}
3187
3188/// On-disk ARTIFACT cache key (search, semantic, symbol, callgraph, inspect).
3189///
3190/// For git repos this is the repository ROOT COMMIT — so a linked worktree
3191/// shares the main checkout's index (opened read-only), the deliberate
3192/// worktree-sharing mechanism. For non-git it is the canonical filesystem path.
3193///
3194/// This is the per-REPOSITORY identity. It is intentionally DISTINCT from
3195/// [`crate::path_identity::project_scope_key`] (the per-CHECKOUT identity used
3196/// for bash/compression/backup/checkpoint scoping). Its value is unchanged from
3197/// the historical `project_cache_key`, so existing on-disk caches are NOT
3198/// invalidated by the P0 identity split.
3199pub fn artifact_cache_key(project_root: &Path) -> String {
3200    use sha2::{Digest, Sha256};
3201
3202    let mut hasher = Sha256::new();
3203
3204    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
3205        // Git repo: root commit is the unique identity.
3206        // Same repo cloned anywhere produces the same key.
3207        hasher.update(root_commit.as_bytes());
3208    } else {
3209        // Non-git project: use the canonical filesystem path as identity.
3210        let canonical_root = canonicalize_or_normalize(project_root);
3211        hasher.update(canonical_root.to_string_lossy().as_bytes());
3212    }
3213
3214    let digest = format!("{:x}", hasher.finalize());
3215    digest[..16].to_string()
3216}
3217
3218/// Fingerprint corpus-shaping ignore rules that are not represented by git HEAD.
3219///
3220/// The search cache stores this value next to the file mtimes. If `.gitignore`,
3221/// `.aftignore`, or `.git/info/exclude` changes while AFT is not running, a
3222/// matching HEAD + matching file mtimes is not enough to safely reuse the old
3223/// cache: files that are now ignored may still be indexed. Hashing the ignore
3224/// files themselves makes cold-start cache reuse agree with the current walker.
3225pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
3226    use sha2::{Digest, Sha256};
3227
3228    let root = canonicalize_or_normalize(project_root);
3229    let mut files = Vec::new();
3230    collect_ignore_rule_files(&root, &mut files);
3231    if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
3232        if global_ignore.is_file() {
3233            files.push(global_ignore);
3234        }
3235    }
3236    let info_exclude = git_info_exclude_path(&root);
3237    if info_exclude.is_file() {
3238        files.push(info_exclude);
3239    }
3240    files.sort();
3241    files.dedup();
3242
3243    let mut hasher = Sha256::new();
3244    hasher.update(b"aft-ignore-rules-v1\0");
3245    for path in files {
3246        if let Some(relative) = cache_relative_path(&root, &path) {
3247            hasher.update(relative.to_string_lossy().as_bytes());
3248        } else {
3249            hasher.update(path.to_string_lossy().as_bytes());
3250        }
3251        hasher.update(b"\0");
3252        match fs::read(&path) {
3253            Ok(bytes) => hasher.update(&bytes),
3254            Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
3255        }
3256        hasher.update(b"\0");
3257    }
3258
3259    format!("{:x}", hasher.finalize())
3260}
3261
3262fn git_info_exclude_path(root: &Path) -> PathBuf {
3263    run_git(
3264        root,
3265        &["rev-parse", "--path-format=absolute", "--git-common-dir"],
3266    )
3267    .map(PathBuf::from)
3268    .unwrap_or_else(|| root.join(".git"))
3269    .join("info")
3270    .join("exclude")
3271}
3272
3273fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
3274    let mut builder = WalkBuilder::new(root);
3275    builder
3276        .hidden(false)
3277        .git_ignore(true)
3278        .git_global(true)
3279        .git_exclude(true)
3280        .add_custom_ignore_filename(".aftignore")
3281        .filter_entry(|entry| {
3282            let name = entry.file_name().to_string_lossy();
3283            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
3284                return !matches!(
3285                    name.as_ref(),
3286                    ".git"
3287                        | "node_modules"
3288                        | "target"
3289                        | "venv"
3290                        | ".venv"
3291                        | "__pycache__"
3292                        | ".tox"
3293                        | "dist"
3294                        | "build"
3295                );
3296            }
3297            true
3298        });
3299
3300    for entry in builder.build().filter_map(|entry| entry.ok()) {
3301        if !entry
3302            .file_type()
3303            .map_or(false, |file_type| file_type.is_file())
3304        {
3305            continue;
3306        }
3307        let file_name = entry.file_name();
3308        if file_name == ".gitignore" || file_name == ".aftignore" {
3309            files.push(entry.into_path());
3310        }
3311    }
3312}
3313
3314/// Count directories visited when discovering ignore rule files (for perf regression tests).
3315#[cfg(test)]
3316pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
3317    let mut dirs = 0usize;
3318    let mut builder = WalkBuilder::new(root);
3319    builder
3320        .hidden(false)
3321        .git_ignore(true)
3322        .git_global(true)
3323        .git_exclude(true)
3324        .add_custom_ignore_filename(".aftignore");
3325    for entry in builder.build().filter_map(|entry| entry.ok()) {
3326        if entry.file_type().map_or(false, |ft| ft.is_dir()) {
3327            dirs += 1;
3328        }
3329    }
3330    dirs
3331}
3332
3333/// Legacy stack-based discovery (pre ignore-walker fix); used only in perf tests.
3334#[cfg(test)]
3335pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
3336    let mut stack = vec![root.to_path_buf()];
3337    let mut dirs = 0usize;
3338    while let Some(dir) = stack.pop() {
3339        dirs += 1;
3340        let Ok(entries) = fs::read_dir(&dir) else {
3341            continue;
3342        };
3343        for entry in entries.flatten() {
3344            let path = entry.path();
3345            let file_name = entry.file_name();
3346            if file_name == ".gitignore" || file_name == ".aftignore" {
3347                continue;
3348            }
3349            let Ok(file_type) = entry.file_type() else {
3350                continue;
3351            };
3352            if !file_type.is_dir() || file_type.is_symlink() {
3353                continue;
3354            }
3355            if matches!(
3356                file_name.to_str().unwrap_or(""),
3357                ".git"
3358                    | "node_modules"
3359                    | "target"
3360                    | "venv"
3361                    | ".venv"
3362                    | "__pycache__"
3363                    | ".tox"
3364                    | "dist"
3365                    | "build"
3366            ) {
3367                continue;
3368            }
3369            stack.push(path);
3370        }
3371    }
3372    dirs
3373}
3374
3375impl PathFilters {
3376    pub(crate) fn matches(&self, root: &Path, path: &Path) -> bool {
3377        let relative = to_glob_path(&relative_to_root(root, path));
3378        if self
3379            .includes
3380            .as_ref()
3381            .is_some_and(|includes| !includes.is_match(&relative))
3382        {
3383            return false;
3384        }
3385        if self
3386            .excludes
3387            .as_ref()
3388            .is_some_and(|excludes| excludes.is_match(&relative))
3389        {
3390            return false;
3391        }
3392        true
3393    }
3394}
3395
3396fn canonicalize_or_normalize(path: &Path) -> PathBuf {
3397    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
3398}
3399
3400fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
3401    if path.is_absolute() {
3402        path.to_path_buf()
3403    } else {
3404        project_root.join(path)
3405    }
3406}
3407
3408fn path_modified_time(path: &Path) -> Option<SystemTime> {
3409    fs::metadata(path)
3410        .and_then(|metadata| metadata.modified())
3411        .ok()
3412}
3413
3414fn normalized_display_sort_key(project_root: Option<&Path>, path: &Path) -> String {
3415    let display_path = project_root
3416        .and_then(|root| path.strip_prefix(root).ok())
3417        .unwrap_or(path);
3418    to_glob_path(display_path)
3419}
3420
3421fn normalize_path(path: &Path) -> PathBuf {
3422    let mut result = PathBuf::new();
3423    for component in path.components() {
3424        match component {
3425            Component::ParentDir => {
3426                if !result.pop() {
3427                    result.push(component);
3428                }
3429            }
3430            Component::CurDir => {}
3431            _ => result.push(component),
3432        }
3433    }
3434    result
3435}
3436
3437fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
3438    if let Ok(canonical) = fs::canonicalize(path) {
3439        return canonical;
3440    }
3441
3442    let Some(parent) = path.parent() else {
3443        return path.to_path_buf();
3444    };
3445    let Some(file_name) = path.file_name() else {
3446        return path.to_path_buf();
3447    };
3448
3449    fs::canonicalize(parent)
3450        .map(|canonical_parent| canonical_parent.join(file_name))
3451        .unwrap_or_else(|_| path.to_path_buf())
3452}
3453
3454/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
3455/// since the index was last written. Also detect new files and deleted files.
3456fn verify_file_mtimes(index: &mut SearchIndex) {
3457    let filters = PathFilters::default();
3458    let current_files = walk_project_files(&index.project_root, &filters);
3459    let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
3460    let mut stale_paths = Vec::new();
3461    let mut removed_paths = Vec::new();
3462
3463    for entry in Arc::make_mut(&mut index.files).iter_mut() {
3464        if entry.path.as_os_str().is_empty() {
3465            continue; // tombstoned entry
3466        }
3467        if !current_file_set.contains(&entry.path) {
3468            removed_paths.push(entry.path.clone());
3469            continue;
3470        }
3471        let cached = FileFreshness {
3472            mtime: entry.modified,
3473            size: entry.size,
3474            content_hash: entry.content_hash,
3475        };
3476        match cache_freshness::verify_file_strict(&entry.path, &cached) {
3477            FreshnessVerdict::HotFresh => {}
3478            FreshnessVerdict::ContentFresh {
3479                new_mtime,
3480                new_size,
3481            } => {
3482                entry.modified = new_mtime;
3483                entry.size = new_size;
3484            }
3485            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
3486                stale_paths.push(entry.path.clone())
3487            }
3488        }
3489    }
3490
3491    for path in &removed_paths {
3492        index.remove_file(path);
3493    }
3494
3495    // Re-index stale files that are still in the current walk set. If an ignore
3496    // rule changed while AFT was down but the fingerprint missed it, this keeps
3497    // warm-cache verification from resurrecting now-ignored cached entries.
3498    for path in &stale_paths {
3499        if current_file_set.contains(path) {
3500            index.update_file(path);
3501        } else {
3502            index.remove_file(path);
3503        }
3504    }
3505
3506    // Detect new files not in the index
3507    for path in current_files {
3508        if !index.path_to_id.contains_key(&path) {
3509            index.update_file(&path);
3510        }
3511    }
3512
3513    if !stale_paths.is_empty() {
3514        crate::slog_info!(
3515            "search index: refreshed {} stale file(s) from disk cache",
3516            stale_paths.len()
3517        );
3518    }
3519}
3520
3521fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
3522    normalize_path(path).starts_with(normalize_path(search_root))
3523}
3524
3525impl QueryBuild {
3526    fn into_query(self) -> RegexQuery {
3527        let mut query = RegexQuery::default();
3528
3529        for run in self.and_runs {
3530            add_run_to_and_query(&mut query, &run);
3531        }
3532
3533        for group in self.or_groups {
3534            let mut trigrams = BTreeSet::new();
3535            let mut filters = HashMap::new();
3536            for run in group {
3537                for (trigram, filter) in trigram_filters(&run) {
3538                    trigrams.insert(trigram);
3539                    merge_filter(filters.entry(trigram).or_default(), filter);
3540                }
3541            }
3542            if !trigrams.is_empty() {
3543                query.or_groups.push(trigrams.into_iter().collect());
3544                query.or_filters.push(filters);
3545            }
3546        }
3547
3548        query
3549    }
3550}
3551
3552fn build_query(hir: &Hir) -> QueryBuild {
3553    match hir.kind() {
3554        HirKind::Literal(literal) => {
3555            if literal.0.len() >= 3 {
3556                QueryBuild {
3557                    and_runs: vec![literal.0.to_vec()],
3558                    or_groups: Vec::new(),
3559                }
3560            } else {
3561                QueryBuild::default()
3562            }
3563        }
3564        HirKind::Capture(capture) => build_query(&capture.sub),
3565        HirKind::Concat(parts) => {
3566            let mut build = QueryBuild::default();
3567            for part in parts {
3568                let part_build = build_query(part);
3569                build.and_runs.extend(part_build.and_runs);
3570                build.or_groups.extend(part_build.or_groups);
3571            }
3572            build
3573        }
3574        HirKind::Alternation(parts) => {
3575            let mut group = Vec::new();
3576            for part in parts {
3577                let Some(mut choices) = guaranteed_run_choices(part) else {
3578                    return QueryBuild::default();
3579                };
3580                group.append(&mut choices);
3581            }
3582            if group.is_empty() {
3583                QueryBuild::default()
3584            } else {
3585                QueryBuild {
3586                    and_runs: Vec::new(),
3587                    or_groups: vec![group],
3588                }
3589            }
3590        }
3591        HirKind::Repetition(repetition) => {
3592            if repetition.min == 0 {
3593                QueryBuild::default()
3594            } else {
3595                build_query(&repetition.sub)
3596            }
3597        }
3598        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
3599    }
3600}
3601
3602fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
3603    match hir.kind() {
3604        HirKind::Literal(literal) => {
3605            if literal.0.len() >= 3 {
3606                Some(vec![literal.0.to_vec()])
3607            } else {
3608                None
3609            }
3610        }
3611        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
3612        HirKind::Concat(parts) => {
3613            let mut runs = Vec::new();
3614            for part in parts {
3615                if let Some(mut part_runs) = guaranteed_run_choices(part) {
3616                    runs.append(&mut part_runs);
3617                }
3618            }
3619            if runs.is_empty() {
3620                None
3621            } else {
3622                Some(runs)
3623            }
3624        }
3625        HirKind::Alternation(parts) => {
3626            let mut runs = Vec::new();
3627            for part in parts {
3628                let Some(mut part_runs) = guaranteed_run_choices(part) else {
3629                    return None;
3630                };
3631                runs.append(&mut part_runs);
3632            }
3633            if runs.is_empty() {
3634                None
3635            } else {
3636                Some(runs)
3637            }
3638        }
3639        HirKind::Repetition(repetition) => {
3640            if repetition.min == 0 {
3641                None
3642            } else {
3643                guaranteed_run_choices(&repetition.sub)
3644            }
3645        }
3646        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
3647    }
3648}
3649
3650fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
3651    for (trigram, filter) in trigram_filters(run) {
3652        if !query.and_trigrams.contains(&trigram) {
3653            query.and_trigrams.push(trigram);
3654        }
3655        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
3656    }
3657}
3658
3659fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
3660    trigram_filter_map(run, false).into_iter().collect()
3661}
3662
3663fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
3664    target.next_mask |= filter.next_mask;
3665    target.loc_mask |= filter.loc_mask;
3666}
3667
3668fn mask_for_next_char(next_char: u8) -> u8 {
3669    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
3670    1u8 << bit
3671}
3672
3673fn mask_for_position(position: usize) -> u8 {
3674    1u8 << (position % 8)
3675}
3676
3677fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
3678    if patterns.is_empty() {
3679        return Ok(None);
3680    }
3681
3682    let mut builder = GlobSetBuilder::new();
3683    for pattern in patterns {
3684        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
3685        builder.add(glob);
3686    }
3687    builder.build().map(Some).map_err(|error| error.to_string())
3688}
3689
3690fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
3691    let mut buffer = [0u8; 4];
3692    reader.read_exact(&mut buffer)?;
3693    Ok(u32::from_le_bytes(buffer))
3694}
3695
3696fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
3697    let mut buffer = [0u8; 8];
3698    reader.read_exact(&mut buffer)?;
3699    Ok(u64::from_le_bytes(buffer))
3700}
3701
3702fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
3703    writer.write_all(&value.to_le_bytes())
3704}
3705
3706fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
3707    writer.write_all(&value.to_le_bytes())
3708}
3709
3710fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
3711    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
3712        return Err(std::io::Error::other("search index checksum missing"));
3713    };
3714    let expected = u32::from_le_bytes(*stored);
3715    let actual = crc32fast::hash(body);
3716    if actual != expected {
3717        return Err(std::io::Error::other("search index checksum mismatch"));
3718    }
3719    Ok(())
3720}
3721
3722fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
3723    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
3724    total_len.checked_sub(pos)
3725}
3726
3727fn run_git(root: &Path, args: &[&str]) -> Option<String> {
3728    let output = Command::new("git")
3729        .arg("-C")
3730        .arg(root)
3731        .args(args)
3732        .output()
3733        .ok()?;
3734    if !output.status.success() {
3735        return None;
3736    }
3737    let value = String::from_utf8(output.stdout).ok()?;
3738    let value = value.trim().to_string();
3739    if value.is_empty() {
3740        None
3741    } else {
3742        Some(value)
3743    }
3744}
3745
3746fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
3747    let diff_range = format!("{}..{}", from, to);
3748    let output = match Command::new("git")
3749        .arg("-C")
3750        .arg(root)
3751        .args(["diff", "--name-status", "-M", &diff_range])
3752        .output()
3753    {
3754        Ok(output) => output,
3755        Err(_) => return false,
3756    };
3757
3758    if !output.status.success() {
3759        return false;
3760    }
3761
3762    let Ok(diff) = String::from_utf8(output.stdout) else {
3763        return false;
3764    };
3765
3766    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
3767        let mut fields = line.split('\t');
3768        let Some(status) = fields.next() else {
3769            continue;
3770        };
3771
3772        if status.starts_with('R') {
3773            let Some(old_path) = fields
3774                .next()
3775                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3776            else {
3777                continue;
3778            };
3779            let Some(new_path) = fields
3780                .next()
3781                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3782            else {
3783                continue;
3784            };
3785            index.remove_file(&old_path);
3786            index.update_file(&new_path);
3787            continue;
3788        }
3789
3790        let Some(path) = fields
3791            .next()
3792            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3793        else {
3794            continue;
3795        };
3796        if status.starts_with('D') || !path.exists() {
3797            index.remove_file(&path);
3798        } else {
3799            index.update_file(&path);
3800        }
3801    }
3802
3803    true
3804}
3805
3806fn is_binary_path(path: &Path, size: u64) -> bool {
3807    if size == 0 {
3808        return false;
3809    }
3810
3811    let mut file = match File::open(path) {
3812        Ok(file) => file,
3813        Err(_) => return true,
3814    };
3815
3816    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
3817    match file.read(&mut preview) {
3818        Ok(read) => is_binary_bytes(&preview[..read]),
3819        Err(_) => true,
3820    }
3821}
3822
3823fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
3824    let mut starts = vec![0usize];
3825    for (index, byte) in content.iter().copied().enumerate() {
3826        if byte == b'\n' {
3827            starts.push(index + 1);
3828        }
3829    }
3830    starts
3831}
3832
3833fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
3834    let line_index = match line_starts.binary_search(&offset) {
3835        Ok(index) => index,
3836        Err(index) => index.saturating_sub(1),
3837    };
3838    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
3839    let line_end = content[line_start..]
3840        .iter()
3841        .position(|byte| *byte == b'\n')
3842        .map(|length| line_start + length)
3843        .unwrap_or(content.len());
3844    let mut line_slice = &content[line_start..line_end];
3845    if line_slice.ends_with(b"\r") {
3846        line_slice = &line_slice[..line_slice.len() - 1];
3847    }
3848    let line_text = String::from_utf8_lossy(line_slice).into_owned();
3849    let column = String::from_utf8_lossy(&content[line_start..offset])
3850        .chars()
3851        .count() as u32
3852        + 1;
3853    (line_index as u32 + 1, column, line_text)
3854}
3855
3856fn to_glob_path(path: &Path) -> String {
3857    path.to_string_lossy().replace('\\', "/")
3858}
3859
3860#[cfg(test)]
3861mod tests {
3862    use std::process::Command;
3863
3864    use super::*;
3865
3866    #[test]
3867    fn cached_path_under_root_allows_missing_lexical_child() {
3868        let dir = tempfile::tempdir().expect("create temp dir");
3869        let project = dir.path().join("project");
3870        fs::create_dir_all(&project).expect("create project dir");
3871        let root = fs::canonicalize(&project).expect("canonicalize project");
3872
3873        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
3874            .expect("missing child should fall back to lexical validation");
3875
3876        assert_eq!(path, root.join("future/file.rs"));
3877    }
3878
3879    #[cfg(unix)]
3880    #[test]
3881    fn cached_path_under_root_rejects_symlink_escape() {
3882        let dir = tempfile::tempdir().expect("create temp dir");
3883        let project = dir.path().join("project");
3884        let outside = dir.path().join("outside");
3885        fs::create_dir_all(&project).expect("create project dir");
3886        fs::create_dir_all(&outside).expect("create outside dir");
3887        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
3888        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
3889        let root = fs::canonicalize(&project).expect("canonicalize project");
3890
3891        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
3892    }
3893
3894    #[test]
3895    fn extract_trigrams_tracks_next_char_and_position() {
3896        let trigrams = extract_trigrams(b"Rust");
3897        assert_eq!(trigrams.len(), 2);
3898        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
3899        assert_eq!(
3900            trigrams[1],
3901            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
3902        );
3903    }
3904
3905    #[test]
3906    fn index_file_trigram_filters_match_legacy_extraction() {
3907        let dir = tempfile::tempdir().expect("create temp dir");
3908        let path = dir.path().join("sample.txt");
3909        let content = b"Rust rust RUST\nxy";
3910        fs::write(&path, content).expect("write sample");
3911
3912        let mut expected = BTreeMap::new();
3913        for (trigram, next_char, position) in extract_trigrams(content) {
3914            let entry: &mut PostingFilter = expected.entry(trigram).or_default();
3915            entry.next_mask |= mask_for_next_char(next_char);
3916            entry.loc_mask |= mask_for_position(position);
3917        }
3918
3919        let mut index = SearchIndex::new();
3920        index.project_root = dir.path().to_path_buf();
3921        index.index_file(&path, content);
3922
3923        let file_id = *index.path_to_id.get(&path).expect("file indexed");
3924        let file_trigrams = index
3925            .delta_file_trigrams
3926            .get(&file_id)
3927            .expect("delta file trigrams");
3928        assert_eq!(file_trigrams, &expected.keys().copied().collect::<Vec<_>>());
3929        for (trigram, filter) in expected {
3930            let postings = index
3931                .delta_postings
3932                .get(&trigram)
3933                .expect("delta posting list");
3934            assert_eq!(postings.len(), 1);
3935            assert_eq!(postings[0].file_id, file_id);
3936            assert_eq!(postings[0].next_mask, filter.next_mask);
3937            assert_eq!(postings[0].loc_mask, filter.loc_mask);
3938        }
3939    }
3940
3941    #[test]
3942    fn decompose_regex_extracts_literals_and_alternations() {
3943        let query = decompose_regex("abc(def|ghi)xyz");
3944        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
3945        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
3946        assert_eq!(query.or_groups.len(), 1);
3947        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
3948        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
3949    }
3950
3951    #[test]
3952    fn candidates_intersect_posting_lists() {
3953        let mut index = SearchIndex::new();
3954        let dir = tempfile::tempdir().expect("create temp dir");
3955        let alpha = dir.path().join("alpha.txt");
3956        let beta = dir.path().join("beta.txt");
3957        fs::write(&alpha, "abcdef").expect("write alpha");
3958        fs::write(&beta, "abcxyz").expect("write beta");
3959        index.project_root = dir.path().to_path_buf();
3960        index.index_file(&alpha, b"abcdef");
3961        index.index_file(&beta, b"abcxyz");
3962
3963        let query = RegexQuery {
3964            and_trigrams: vec![
3965                pack_trigram(b'a', b'b', b'c'),
3966                pack_trigram(b'd', b'e', b'f'),
3967            ],
3968            ..RegexQuery::default()
3969        };
3970
3971        let candidates = index.candidates(&query);
3972        assert_eq!(candidates.len(), 1);
3973        assert_eq!(index.files[candidates[0] as usize].path, alpha);
3974    }
3975
3976    #[test]
3977    fn candidates_apply_bloom_filters() {
3978        let mut index = SearchIndex::new();
3979        let dir = tempfile::tempdir().expect("create temp dir");
3980        let file = dir.path().join("sample.txt");
3981        fs::write(&file, "abcd efgh").expect("write sample");
3982        index.project_root = dir.path().to_path_buf();
3983        index.index_file(&file, b"abcd efgh");
3984
3985        let trigram = pack_trigram(b'a', b'b', b'c');
3986        let matching_filter = PostingFilter {
3987            next_mask: mask_for_next_char(b'd'),
3988            loc_mask: mask_for_position(0),
3989        };
3990        let non_matching_filter = PostingFilter {
3991            next_mask: mask_for_next_char(b'z'),
3992            loc_mask: mask_for_position(0),
3993        };
3994
3995        assert_eq!(
3996            index
3997                .postings_for_trigram(trigram, Some(matching_filter))
3998                .len(),
3999            1
4000        );
4001        assert!(index
4002            .postings_for_trigram(trigram, Some(non_matching_filter))
4003            .is_empty());
4004    }
4005
4006    #[test]
4007    fn base_delta_readd_masks_base_and_keeps_postings_sorted() {
4008        let dir = tempfile::tempdir().expect("create temp dir");
4009        let project = dir.path().join("project");
4010        fs::create_dir_all(&project).expect("create project dir");
4011        let a = project.join("a.txt");
4012        let b = project.join("b.txt");
4013        fs::write(&a, "abc old").expect("write a");
4014        fs::write(&b, "abc base").expect("write b");
4015
4016        let mut built = SearchIndex::build(&project);
4017        let cache_dir = dir.path().join("cache");
4018        built.write_to_disk(&cache_dir, None);
4019        let mut index = SearchIndex::read_from_disk(&cache_dir, &project).expect("load base");
4020        assert_eq!(index.base_file_count, 2);
4021
4022        let old_a_id = *index.path_to_id.get(&a).expect("original a id");
4023        let b_id = *index.path_to_id.get(&b).expect("original b id");
4024        index.remove_file(&a);
4025        index.index_file(&a, b"abc new");
4026        let new_id = *index.path_to_id.get(&a).expect("re-added file id");
4027        assert!(new_id >= index.base_file_count);
4028        let abc = pack_trigram(b'a', b'b', b'c');
4029        let ids = index.postings_for_trigram(abc, None);
4030        assert_eq!(ids, {
4031            let mut expected = vec![b_id, new_id];
4032            expected.sort_unstable();
4033            expected
4034        });
4035        assert!(!ids.contains(&old_a_id));
4036    }
4037
4038    #[test]
4039    fn write_to_disk_compacts_base_and_delta() {
4040        let dir = tempfile::tempdir().expect("create temp dir");
4041        let project = dir.path().join("project");
4042        fs::create_dir_all(&project).expect("create project dir");
4043        let file = project.join("src.txt");
4044        fs::write(&file, "abcdef").expect("write source");
4045        let mut index = SearchIndex::build(&project);
4046        let cache_dir = dir.path().join("cache");
4047        index.write_to_disk(&cache_dir, None);
4048        fs::write(&file, "abcxyz").expect("edit source");
4049        index.update_file(&file);
4050        assert!(!index.delta_postings.is_empty());
4051        index.write_to_disk(&cache_dir, None);
4052        assert!(index.delta_postings.is_empty());
4053        assert!(index.superseded.is_empty());
4054        assert_eq!(
4055            index.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4056            vec![0]
4057        );
4058        assert!(index
4059            .postings_for_trigram(pack_trigram(b'd', b'e', b'f'), None)
4060            .is_empty());
4061    }
4062
4063    #[test]
4064    fn legacy_cache_without_file_trigram_count_migrates_streaming_counts() {
4065        let dir = tempfile::tempdir().expect("create temp dir");
4066        let project = dir.path().join("project");
4067        fs::create_dir_all(&project).expect("create project dir");
4068        fs::write(project.join("src.txt"), "abcdef").expect("write source");
4069        let cache_dir = dir.path().join("cache");
4070        let mut index = SearchIndex::build(&project);
4071        index.write_to_disk(&cache_dir, None);
4072        let cache_path = cache_dir.join("cache.bin");
4073        strip_file_trigram_count_extension(&cache_path);
4074        assert!(!cache_has_file_trigram_count_extension(&cache_path));
4075
4076        let loaded = SearchIndex::read_from_disk(&cache_dir, &project).expect("load legacy cache");
4077        assert_eq!(loaded.file_trigram_count.as_ref(), &[4]);
4078        assert!(loaded.delta_postings.is_empty());
4079        assert!(cache_has_file_trigram_count_extension(&cache_path));
4080    }
4081
4082    #[test]
4083    fn compaction_flags_buffer_paths_while_running() {
4084        let dir = tempfile::tempdir().expect("create temp dir");
4085        let project = dir.path().join("project");
4086        fs::create_dir_all(&project).expect("create project dir");
4087        let file = project.join("src.txt");
4088        fs::write(&file, "abcdef").expect("write source");
4089        let mut index = SearchIndex::new();
4090        index.project_root = project.clone();
4091        {
4092            let mut state = index.compaction_state.lock().expect("compaction state");
4093            state.running = true;
4094        }
4095        index.update_file(&file);
4096        let state = index.compaction_state.lock().expect("compaction state");
4097        assert!(state.requested_again || !index.delta_postings.is_empty());
4098        assert!(state.buffered_paths.contains(&file));
4099    }
4100
4101    fn cache_has_file_trigram_count_extension(cache_path: &Path) -> bool {
4102        file_trigram_count_extension_range(cache_path).is_some()
4103    }
4104
4105    fn strip_file_trigram_count_extension(cache_path: &Path) {
4106        let mut bytes = fs::read(cache_path).expect("read cache");
4107        let (start, end) = file_trigram_count_extension_range_from_bytes(&bytes)
4108            .expect("file trigram count extension");
4109        bytes.drain(start..end);
4110        let postings_len_total = u64::from_le_bytes(bytes[8..16].try_into().unwrap())
4111            - u64::try_from(end - start).unwrap();
4112        bytes[8..16].copy_from_slice(&postings_len_total.to_le_bytes());
4113        let checksum_pos = 16 + usize::try_from(postings_len_total).unwrap() - 4;
4114        let checksum = crc32fast::hash(&bytes[16..checksum_pos]);
4115        bytes[checksum_pos..checksum_pos + 4].copy_from_slice(&checksum.to_le_bytes());
4116        fs::write(cache_path, bytes).expect("write legacy cache");
4117    }
4118
4119    fn file_trigram_count_extension_range(cache_path: &Path) -> Option<(usize, usize)> {
4120        let bytes = fs::read(cache_path).ok()?;
4121        file_trigram_count_extension_range_from_bytes(&bytes)
4122    }
4123
4124    fn file_trigram_count_extension_range_from_bytes(bytes: &[u8]) -> Option<(usize, usize)> {
4125        let postings_len_total = u64::from_le_bytes(bytes.get(8..16)?.try_into().ok()?) as usize;
4126        let postings_start = 16usize;
4127        let postings_end = postings_start.checked_add(postings_len_total)?;
4128        let postings_body_end = postings_end.checked_sub(4)?;
4129        let mut reader = Cursor::new(&bytes[postings_start..postings_body_end]);
4130        let mut magic = [0u8; 8];
4131        reader.read_exact(&mut magic).ok()?;
4132        if &magic != INDEX_MAGIC {
4133            return None;
4134        }
4135        read_u32(&mut reader).ok()?;
4136        let head_len = read_u32(&mut reader).ok()? as u64;
4137        let root_len = read_u32(&mut reader).ok()? as u64;
4138        let ignore_len = read_u32(&mut reader).ok()? as u64;
4139        read_u64(&mut reader).ok()?;
4140        let file_count = read_u32(&mut reader).ok()? as usize;
4141        let skip = head_len.checked_add(root_len)?.checked_add(ignore_len)?;
4142        reader.seek(SeekFrom::Current(skip as i64)).ok()?;
4143        for _ in 0..file_count {
4144            let mut unindexed = [0u8; 1];
4145            reader.read_exact(&mut unindexed).ok()?;
4146            let path_len = read_u32(&mut reader).ok()? as u64;
4147            read_u64(&mut reader).ok()?;
4148            read_u64(&mut reader).ok()?;
4149            read_u32(&mut reader).ok()?;
4150            let mut hash = [0u8; 32];
4151            reader.read_exact(&mut hash).ok()?;
4152            reader.seek(SeekFrom::Current(path_len as i64)).ok()?;
4153        }
4154        let postings_blob_len = read_u64(&mut reader).ok()? as usize;
4155        let extension_start = postings_start
4156            .checked_add(reader.position() as usize)?
4157            .checked_add(postings_blob_len)?;
4158        if extension_start + 16 > postings_body_end {
4159            return None;
4160        }
4161        if bytes.get(extension_start..extension_start + 8)? != FILE_TRIGRAM_COUNT_MAGIC {
4162            return None;
4163        }
4164        let count = u32::from_le_bytes(
4165            bytes[extension_start + 12..extension_start + 16]
4166                .try_into()
4167                .ok()?,
4168        ) as usize;
4169        let extension_end = extension_start
4170            .checked_add(16)?
4171            .checked_add(count.checked_mul(4)?)?;
4172        (extension_end <= postings_body_end).then_some((extension_start, extension_end))
4173    }
4174
4175    #[test]
4176    fn disk_round_trip_preserves_postings_and_files() {
4177        let dir = tempfile::tempdir().expect("create temp dir");
4178        let project = dir.path().join("project");
4179        fs::create_dir_all(&project).expect("create project dir");
4180        let file = project.join("src.txt");
4181        fs::write(&file, "abcdef").expect("write source");
4182
4183        let mut index = SearchIndex::build(&project);
4184        index.git_head = Some("deadbeef".to_string());
4185        let cache_dir = dir.path().join("cache");
4186        let head = index.git_head.clone();
4187        index.write_to_disk(&cache_dir, head.as_deref());
4188
4189        let loaded =
4190            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
4191        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
4192        assert_eq!(loaded.files.len(), 1);
4193        assert_eq!(
4194            relative_to_root(&loaded.project_root, &loaded.files[0].path),
4195            PathBuf::from("src.txt")
4196        );
4197        assert_eq!(loaded.trigram_count(), index.trigram_count());
4198        assert_eq!(
4199            loaded.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4200            vec![0]
4201        );
4202        assert_eq!(
4203            loaded.file_trigram_count.as_ref(),
4204            index.file_trigram_count.as_ref()
4205        );
4206    }
4207
4208    #[test]
4209    fn cache_path_helpers_reject_absolute_and_parent_paths() {
4210        let root = PathBuf::from("/tmp/aft-project");
4211
4212        assert_eq!(
4213            cache_relative_path(&root, &root.join("src/lib.rs")),
4214            Some(PathBuf::from("src/lib.rs"))
4215        );
4216        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
4217        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
4218        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
4219        assert_eq!(
4220            cached_path_under_root(&root, Path::new("src/./lib.rs")),
4221            Some(root.join("src/lib.rs"))
4222        );
4223    }
4224
4225    #[test]
4226    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
4227        let dir = tempfile::tempdir().expect("create temp dir");
4228        let project = dir.path().join("project");
4229        fs::create_dir_all(&project).expect("create project dir");
4230        let canonical_project = fs::canonicalize(&project).expect("canonical project");
4231        fs::write(project.join("old.txt"), "old token\n").expect("write old");
4232        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
4233
4234        Command::new("git")
4235            .arg("init")
4236            .arg(&project)
4237            .status()
4238            .expect("git init");
4239        for args in [
4240            ["config", "user.email", "aft@example.invalid"],
4241            ["config", "user.name", "AFT Test"],
4242        ] {
4243            Command::new("git")
4244                .arg("-C")
4245                .arg(&project)
4246                .args(args)
4247                .status()
4248                .expect("git config");
4249        }
4250        Command::new("git")
4251            .arg("-C")
4252            .arg(&project)
4253            .args(["add", "."])
4254            .status()
4255            .expect("git add initial");
4256        Command::new("git")
4257            .arg("-C")
4258            .arg(&project)
4259            .args(["commit", "-m", "initial"])
4260            .status()
4261            .expect("git commit initial");
4262        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
4263        let mut baseline = SearchIndex::build(&project);
4264        baseline.git_head = Some(previous.clone());
4265
4266        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
4267        Command::new("git")
4268            .arg("-C")
4269            .arg(&project)
4270            .args(["add", "-A"])
4271            .status()
4272            .expect("git add rename");
4273        Command::new("git")
4274            .arg("-C")
4275            .arg(&project)
4276            .args(["commit", "-m", "rename"])
4277            .status()
4278            .expect("git commit rename");
4279        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
4280
4281        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
4282        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
4283
4284        let refreshed = SearchIndex::rebuild_or_refresh(
4285            &project,
4286            DEFAULT_MAX_FILE_SIZE,
4287            Some(current),
4288            Some(baseline),
4289            None,
4290        );
4291
4292        assert!(!refreshed
4293            .path_to_id
4294            .contains_key(&canonical_project.join("old.txt")));
4295        assert!(refreshed
4296            .path_to_id
4297            .contains_key(&canonical_project.join("new.txt")));
4298        assert!(refreshed
4299            .path_to_id
4300            .contains_key(&canonical_project.join("untracked.txt")));
4301        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
4302        assert_eq!(matches.matches.len(), 1);
4303    }
4304
4305    #[test]
4306    fn read_from_disk_rejects_corrupt_lookup_checksum() {
4307        let dir = tempfile::tempdir().expect("create temp dir");
4308        let project = dir.path().join("project");
4309        fs::create_dir_all(&project).expect("create project dir");
4310        fs::write(project.join("src.txt"), "abcdef").expect("write source");
4311
4312        let mut index = SearchIndex::build(&project);
4313        let cache_dir = dir.path().join("cache");
4314        index.write_to_disk(&cache_dir, None);
4315
4316        let cache_path = cache_dir.join("cache.bin");
4317        let mut bytes = fs::read(&cache_path).expect("read cache");
4318        let last = bytes.len() - 1;
4319        bytes[last] ^= 0xff;
4320        fs::write(&cache_path, bytes).expect("write corrupted cache");
4321
4322        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
4323    }
4324
4325    #[test]
4326    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
4327        let dir = tempfile::tempdir().expect("create temp dir");
4328        let project = dir.path().join("project");
4329        fs::create_dir_all(&project).expect("create project dir");
4330        fs::write(project.join("src.txt"), "abcdef").expect("write source");
4331
4332        let mut index = SearchIndex::build(&project);
4333        let cache_dir = dir.path().join("cache");
4334        index.write_to_disk(&cache_dir, None);
4335
4336        assert!(cache_dir.join("cache.bin").is_file());
4337        assert!(fs::read_dir(&cache_dir)
4338            .expect("read cache dir")
4339            .all(|entry| !entry
4340                .expect("cache entry")
4341                .file_name()
4342                .to_string_lossy()
4343                .contains(".tmp.")));
4344    }
4345
4346    #[test]
4347    fn concurrent_search_index_writes_do_not_corrupt() {
4348        let dir = tempfile::tempdir().expect("create temp dir");
4349        let project = dir.path().join("project");
4350        fs::create_dir_all(&project).expect("create project dir");
4351        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
4352        let cache_dir = dir.path().join("cache");
4353
4354        let a_project = project.clone();
4355        let a_cache = cache_dir.clone();
4356        let a = std::thread::spawn(move || {
4357            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
4358            let mut index = SearchIndex::build(&a_project);
4359            index.write_to_disk(&a_cache, None);
4360        });
4361        let b_project = project.clone();
4362        let b_cache = cache_dir.clone();
4363        let b = std::thread::spawn(move || {
4364            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
4365            let mut index = SearchIndex::build(&b_project);
4366            index.write_to_disk(&b_cache, None);
4367        });
4368        a.join().expect("writer a");
4369        b.join().expect("writer b");
4370
4371        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
4372    }
4373
4374    #[test]
4375    fn search_index_atomic_rename_survives_partial_write() {
4376        let dir = tempfile::tempdir().expect("create temp dir");
4377        let cache_dir = dir.path().join("cache");
4378        fs::create_dir_all(&cache_dir).expect("create cache dir");
4379        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
4380
4381        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
4382    }
4383
4384    #[test]
4385    fn artifact_cache_key_shared_across_clones_of_same_repo() {
4386        let dir = tempfile::tempdir().expect("create temp dir");
4387        let source = dir.path().join("source");
4388        fs::create_dir_all(&source).expect("create source repo dir");
4389        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
4390
4391        assert!(Command::new("git")
4392            .current_dir(&source)
4393            .args(["init"])
4394            .status()
4395            .expect("init git repo")
4396            .success());
4397        assert!(Command::new("git")
4398            .current_dir(&source)
4399            .args(["add", "."])
4400            .status()
4401            .expect("git add")
4402            .success());
4403        assert!(Command::new("git")
4404            .current_dir(&source)
4405            .args([
4406                "-c",
4407                "user.name=AFT Tests",
4408                "-c",
4409                "user.email=aft-tests@example.com",
4410                "commit",
4411                "-m",
4412                "initial",
4413            ])
4414            .status()
4415            .expect("git commit")
4416            .success());
4417
4418        let clone = dir.path().join("clone");
4419        assert!(Command::new("git")
4420            .args(["clone", "--quiet"])
4421            .arg(&source)
4422            .arg(&clone)
4423            .status()
4424            .expect("git clone")
4425            .success());
4426
4427        let source_key = artifact_cache_key(&source);
4428        let clone_key = artifact_cache_key(&clone);
4429
4430        assert_eq!(source_key.len(), 16);
4431        assert_eq!(clone_key.len(), 16);
4432        // Same repo (same root commit) → same cache key regardless of clone path
4433        assert_eq!(source_key, clone_key);
4434    }
4435
4436    #[test]
4437    fn git_head_unchanged_picks_up_local_edits() {
4438        let dir = tempfile::tempdir().expect("create temp dir");
4439        let project = dir.path().join("repo");
4440        fs::create_dir_all(&project).expect("create repo dir");
4441        let file = project.join("tracked.txt");
4442        fs::write(&file, "oldtoken\n").expect("write file");
4443        assert!(Command::new("git")
4444            .current_dir(&project)
4445            .arg("init")
4446            .status()
4447            .unwrap()
4448            .success());
4449        assert!(Command::new("git")
4450            .current_dir(&project)
4451            .args(["add", "."])
4452            .status()
4453            .unwrap()
4454            .success());
4455        assert!(Command::new("git")
4456            .current_dir(&project)
4457            .args([
4458                "-c",
4459                "user.name=AFT Tests",
4460                "-c",
4461                "user.email=aft-tests@example.com",
4462                "commit",
4463                "-m",
4464                "initial"
4465            ])
4466            .status()
4467            .unwrap()
4468            .success());
4469        let head = current_git_head(&project);
4470        let mut baseline = SearchIndex::build(&project);
4471        baseline.git_head = head.clone();
4472        fs::write(&file, "newtoken\n").expect("edit tracked file");
4473
4474        let refreshed = SearchIndex::rebuild_or_refresh(
4475            &project,
4476            DEFAULT_MAX_FILE_SIZE,
4477            head,
4478            Some(baseline),
4479            None,
4480        );
4481        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
4482
4483        assert_eq!(result.total_matches, 1);
4484    }
4485
4486    #[test]
4487    fn non_git_project_reuses_cache_when_files_unchanged() {
4488        let dir = tempfile::tempdir().expect("create temp dir");
4489        let project = dir.path().join("project");
4490        fs::create_dir_all(&project).expect("create project dir");
4491        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
4492        let baseline = SearchIndex::build(&project);
4493        let baseline_file_count = baseline.file_count();
4494
4495        let refreshed = SearchIndex::rebuild_or_refresh(
4496            &project,
4497            DEFAULT_MAX_FILE_SIZE,
4498            None,
4499            Some(baseline),
4500            None,
4501        );
4502
4503        assert_eq!(refreshed.file_count(), baseline_file_count);
4504        assert_eq!(
4505            refreshed
4506                .grep("unchangedtoken", true, &[], &[], &project, 10)
4507                .total_matches,
4508            1
4509        );
4510    }
4511
4512    #[test]
4513    fn resolve_search_scope_disables_index_for_external_path() {
4514        let dir = tempfile::tempdir().expect("create temp dir");
4515        let project = dir.path().join("project");
4516        let outside = dir.path().join("outside");
4517        fs::create_dir_all(&project).expect("create project dir");
4518        fs::create_dir_all(&outside).expect("create outside dir");
4519
4520        let scope = resolve_search_scope(&project, outside.to_str());
4521
4522        assert_eq!(
4523            scope.root,
4524            fs::canonicalize(&outside).expect("canonicalize outside")
4525        );
4526        assert!(!scope.use_index);
4527    }
4528
4529    #[test]
4530    fn grep_filters_matches_to_search_root() {
4531        let dir = tempfile::tempdir().expect("create temp dir");
4532        let project = dir.path().join("project");
4533        let src = project.join("src");
4534        let docs = project.join("docs");
4535        fs::create_dir_all(&src).expect("create src dir");
4536        fs::create_dir_all(&docs).expect("create docs dir");
4537        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
4538        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
4539
4540        let index = SearchIndex::build(&project);
4541        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
4542
4543        assert_eq!(result.files_searched, 1);
4544        assert_eq!(result.files_with_matches, 1);
4545        assert_eq!(result.matches.len(), 1);
4546        // Index stores canonicalized paths; on macOS /var → /private/var
4547        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
4548        assert_eq!(result.matches[0].file, expected);
4549    }
4550
4551    #[test]
4552    fn grep_deduplicates_multiple_matches_on_same_line() {
4553        let dir = tempfile::tempdir().expect("create temp dir");
4554        let project = dir.path().join("project");
4555        let src = project.join("src");
4556        fs::create_dir_all(&src).expect("create src dir");
4557        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
4558
4559        let index = SearchIndex::build(&project);
4560        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
4561
4562        assert_eq!(result.total_matches, 1);
4563        assert_eq!(result.matches.len(), 1);
4564    }
4565
4566    #[test]
4567    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
4568        let dir = tempfile::tempdir().expect("create temp dir");
4569        let project = dir.path().join("project");
4570        fs::create_dir_all(&project).expect("create project dir");
4571        let file = project.join("unicode.txt");
4572        fs::write(&file, "äbc\n").expect("write unicode file");
4573
4574        let index = SearchIndex::build(&project);
4575        let result = index.grep("Äbc", false, &[], &[], &project, 10);
4576
4577        assert_eq!(result.total_matches, 1);
4578        assert_eq!(result.matches.len(), 1);
4579        assert_eq!(
4580            result.matches[0].file,
4581            fs::canonicalize(file).expect("canonicalize unicode file")
4582        );
4583    }
4584
4585    #[test]
4586    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
4587        let dir = tempfile::tempdir().expect("create temp dir");
4588        let project = dir.path().join("project");
4589        fs::create_dir_all(&project).expect("create project dir");
4590        let file = project.join("tokens.txt");
4591        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
4592        fs::write(&file, "alpha").expect("write original file");
4593        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
4594
4595        let baseline = SearchIndex::build(&project);
4596        fs::write(&file, "bravo").expect("write same-size edit");
4597        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
4598
4599        let refreshed = SearchIndex::rebuild_or_refresh(
4600            &project,
4601            DEFAULT_MAX_FILE_SIZE,
4602            None,
4603            Some(baseline),
4604            None,
4605        );
4606        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
4607        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
4608        let refreshed_id = *refreshed
4609            .path_to_id
4610            .get(&canonical_file)
4611            .expect("file remains indexed");
4612
4613        assert_eq!(result.total_matches, 1);
4614        assert!(refreshed
4615            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
4616            .contains(&refreshed_id));
4617        assert!(!refreshed
4618            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
4619            .contains(&refreshed_id));
4620    }
4621
4622    #[test]
4623    fn grep_reports_total_matches_before_truncation() {
4624        let dir = tempfile::tempdir().expect("create temp dir");
4625        let project = dir.path().join("project");
4626        let src = project.join("src");
4627        fs::create_dir_all(&src).expect("create src dir");
4628        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
4629
4630        let index = SearchIndex::build(&project);
4631        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
4632
4633        assert_eq!(result.total_matches, 2);
4634        assert_eq!(result.matches.len(), 1);
4635        assert!(result.truncated);
4636    }
4637
4638    #[test]
4639    fn glob_filters_results_to_search_root() {
4640        let dir = tempfile::tempdir().expect("create temp dir");
4641        let project = dir.path().join("project");
4642        let src = project.join("src");
4643        let scripts = project.join("scripts");
4644        fs::create_dir_all(&src).expect("create src dir");
4645        fs::create_dir_all(&scripts).expect("create scripts dir");
4646        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
4647        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
4648
4649        let index = SearchIndex::build(&project);
4650        let files = index.glob("**/*.rs", &src);
4651
4652        assert_eq!(
4653            files,
4654            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
4655        );
4656    }
4657
4658    #[test]
4659    fn glob_includes_hidden_and_binary_files() {
4660        let dir = tempfile::tempdir().expect("create temp dir");
4661        let project = dir.path().join("project");
4662        let hidden_dir = project.join(".hidden");
4663        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
4664        let hidden_file = hidden_dir.join("data.bin");
4665        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
4666
4667        let index = SearchIndex::build(&project);
4668        let files = index.glob("**/*.bin", &project);
4669
4670        assert_eq!(
4671            files,
4672            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
4673        );
4674    }
4675
4676    #[test]
4677    fn read_from_disk_rejects_invalid_nanos() {
4678        let dir = tempfile::tempdir().expect("create temp dir");
4679        let cache_dir = dir.path().join("cache");
4680        fs::create_dir_all(&cache_dir).expect("create cache dir");
4681
4682        let mut postings = Vec::new();
4683        postings.extend_from_slice(INDEX_MAGIC);
4684        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4685        postings.extend_from_slice(&0u32.to_le_bytes());
4686        postings.extend_from_slice(&1u32.to_le_bytes());
4687        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
4688        postings.extend_from_slice(&1u32.to_le_bytes());
4689        postings.extend_from_slice(b"/");
4690        postings.push(0u8);
4691        postings.extend_from_slice(&1u32.to_le_bytes());
4692        postings.extend_from_slice(&0u64.to_le_bytes());
4693        postings.extend_from_slice(&0u64.to_le_bytes());
4694        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
4695        postings.extend_from_slice(b"a");
4696        postings.extend_from_slice(&0u64.to_le_bytes());
4697
4698        let mut lookup = Vec::new();
4699        lookup.extend_from_slice(LOOKUP_MAGIC);
4700        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4701        lookup.extend_from_slice(&0u32.to_le_bytes());
4702
4703        let postings_checksum = crc32fast::hash(&postings);
4704        postings.extend_from_slice(&postings_checksum.to_le_bytes());
4705        let lookup_checksum = crc32fast::hash(&lookup);
4706        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
4707        let mut cache = Vec::new();
4708        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
4709        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4710        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
4711        cache.extend_from_slice(&postings);
4712        cache.extend_from_slice(&lookup);
4713        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
4714
4715        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
4716    }
4717
4718    #[test]
4719    fn parallel_cold_build_matches_serial_index() {
4720        let dir = tempfile::tempdir().expect("create temp dir");
4721        let project = dir.path().join("project");
4722        for index in 0..80 {
4723            let sub = project.join(format!("pkg_{index:03}"));
4724            fs::create_dir_all(&sub).expect("create subdir");
4725            fs::write(
4726                sub.join("lib.rs"),
4727                format!(
4728                    "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
4729                ),
4730            )
4731            .expect("write lib");
4732        }
4733
4734        let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
4735        let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);
4736
4737        assert_eq!(serial.file_count(), parallel.file_count());
4738        assert_eq!(serial.trigram_count(), parallel.trigram_count());
4739        assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
4740        assert_eq!(
4741            serial.file_trigram_count.as_ref(),
4742            parallel.file_trigram_count.as_ref()
4743        );
4744        for (path, id) in serial.path_to_id.iter() {
4745            assert_eq!(parallel.path_to_id.get(path), Some(id));
4746        }
4747        for (serial_file, parallel_file) in serial.files.iter().zip(parallel.files.iter()) {
4748            assert_eq!(serial_file.path, parallel_file.path);
4749            assert_eq!(serial_file.size, parallel_file.size);
4750            assert_eq!(serial_file.modified, parallel_file.modified);
4751            assert_eq!(serial_file.content_hash, parallel_file.content_hash);
4752        }
4753
4754        let serial_grep = serial.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
4755        let parallel_grep = parallel.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
4756        assert_eq!(serial_grep.matches, parallel_grep.matches);
4757        assert_eq!(serial_grep.total_matches, parallel_grep.total_matches);
4758        assert_eq!(serial_grep.files_searched, parallel_grep.files_searched);
4759        assert_eq!(
4760            serial_grep.files_with_matches,
4761            parallel_grep.files_with_matches
4762        );
4763    }
4764
4765    #[test]
4766    fn ignore_rule_discovery_respects_gitignore() {
4767        let dir = tempfile::tempdir().expect("create temp dir");
4768        let project = dir.path().join("project");
4769        fs::create_dir_all(project.join("src")).expect("mkdir src");
4770        fs::write(project.join("src/.gitignore"), "data/\n").expect("write gitignore");
4771        let data = project.join("src/data");
4772        fs::create_dir_all(&data).expect("mkdir data");
4773        for index in 0..200 {
4774            fs::create_dir_all(data.join(format!("d{index}"))).expect("mkdir nested");
4775            fs::write(data.join(format!("d{index}/f.rs")), "fn ignored() {}\n")
4776                .expect("write ignored file");
4777        }
4778
4779        Command::new("git")
4780            .arg("init")
4781            .arg(&project)
4782            .status()
4783            .expect("git init");
4784        for args in [
4785            ["config", "user.email", "aft@example.invalid"],
4786            ["config", "user.name", "AFT Test"],
4787        ] {
4788            Command::new("git")
4789                .arg("-C")
4790                .arg(&project)
4791                .args(args)
4792                .status()
4793                .expect("git config");
4794        }
4795        Command::new("git")
4796            .arg("-C")
4797            .arg(&project)
4798            .args(["add", "."])
4799            .status()
4800            .expect("git add");
4801        Command::new("git")
4802            .arg("-C")
4803            .arg(&project)
4804            .args(["commit", "-m", "initial"])
4805            .status()
4806            .expect("git commit");
4807
4808        let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project);
4809        let walker_dirs = count_ignore_rule_discovery_dirs(&project);
4810        assert!(
4811            legacy_dirs > walker_dirs,
4812            "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
4813        );
4814        assert!(
4815            walker_dirs < 50,
4816            "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
4817        );
4818    }
4819
4820    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
4821    /// changed between cmp() calls.
4822    ///
4823    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
4824    /// which does a `stat()` syscall. If the file was deleted, modified, or
4825    /// touched mid-sort, the comparator returned different values for the
4826    /// same input pair on different invocations. Rust's slice::sort detects
4827    /// this and panics with "user-provided comparison function does not
4828    /// correctly implement a total order".
4829    ///
4830    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
4831    /// bridge invalidated files in parallel with grep's sort path. This
4832    /// test simulates the worst case: most paths don't exist (Err from
4833    /// fs::metadata) and sort still completes successfully.
4834    #[test]
4835    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
4836        // Mix of existing and non-existing paths in deliberately
4837        // non-monotonic order — pre-fix, the sort would call stat() at
4838        // least N log N times and any flakiness would trigger the panic.
4839        let dir = tempfile::tempdir().expect("create tempdir");
4840        let mut paths: Vec<PathBuf> = Vec::new();
4841        for i in 0..30 {
4842            // Half exist, half don't.
4843            let path = if i % 2 == 0 {
4844                let p = dir.path().join(format!("real-{i}.rs"));
4845                fs::write(&p, format!("// {i}\n")).expect("write");
4846                p
4847            } else {
4848                dir.path().join(format!("missing-{i}.rs"))
4849            };
4850            paths.push(path);
4851        }
4852
4853        // Run the sort many times to maximise the chance of catching any
4854        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
4855        for _ in 0..50 {
4856            let mut copy = paths.clone();
4857            sort_paths_by_mtime_desc(&mut copy);
4858            assert_eq!(copy.len(), paths.len());
4859        }
4860    }
4861
4862    /// Regression: the indexed parallel search's reduce() combine closure must
4863    /// NOT set engine_capped. reduce runs on every partial-result merge in a
4864    /// multi-chunk parallel search (>10 candidate files), capped or not — an
4865    /// unconditional store there falsely reported every such grep as capped,
4866    /// lying to the agent that results were truncated.
4867    #[test]
4868    fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
4869        let dir = tempfile::tempdir().expect("create tempdir");
4870        // >10 files so the parallel (reduce) branch is taken, each with exactly
4871        // one match, and a generous cap so the search is NOT actually capped.
4872        for i in 0..40 {
4873            fs::write(
4874                dir.path().join(format!("file-{i}.rs")),
4875                format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n"),
4876            )
4877            .expect("write");
4878        }
4879        let index = SearchIndex::build_with_limit(dir.path(), DEFAULT_MAX_FILE_SIZE);
4880        let result = index.grep("needle_token", false, &[], &[], dir.path(), 1000);
4881        assert!(
4882            result.matches.len() >= 40,
4883            "expected a match per file, got {}",
4884            result.matches.len()
4885        );
4886        assert!(
4887            !result.engine_capped,
4888            "an uncapped grep over >10 files must not report engine_capped"
4889        );
4890        assert!(!result.truncated, "uncapped grep must not be truncated");
4891    }
4892
4893    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
4894    /// the same conditions as sort_paths_by_mtime_desc. See the
4895    /// sort_paths_... test above for the full rationale.
4896    #[test]
4897    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
4898        let dir = tempfile::tempdir().expect("create tempdir");
4899        let mut matches: Vec<GrepMatch> = Vec::new();
4900        for i in 0..30 {
4901            let file = if i % 2 == 0 {
4902                let p = dir.path().join(format!("real-{i}.rs"));
4903                fs::write(&p, format!("// {i}\n")).expect("write");
4904                p
4905            } else {
4906                dir.path().join(format!("missing-{i}.rs"))
4907            };
4908            matches.push(GrepMatch {
4909                file,
4910                line: u32::try_from(i).unwrap_or(0),
4911                column: 0,
4912                line_text: format!("match {i}"),
4913                match_text: format!("match {i}"),
4914            });
4915        }
4916
4917        for _ in 0..50 {
4918            let mut copy = matches.clone();
4919            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
4920            assert_eq!(copy.len(), matches.len());
4921        }
4922    }
4923}
aft/search_index.rs

aft/
search_index.rs