aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const INDEX_VERSION: u32 = 4;
27const PREVIEW_BYTES: usize = 8 * 1024;
28const PARALLEL_INGEST_CHUNK_SIZE: usize = 256;
29const EOF_SENTINEL: u8 = 0;
30const MAX_ENTRIES: usize = 10_000_000;
31const MIN_FILE_ENTRY_BYTES: usize = 57;
32const LOOKUP_ENTRY_BYTES: usize = 16;
33const POSTING_BYTES: usize = 6;
34static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
35
36pub struct CacheLock {
37    _guard: fs_lock::LockGuard,
38}
39
40impl CacheLock {
41    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
42        fs::create_dir_all(cache_dir)?;
43        let path = cache_dir.join("cache.lock");
44        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
45            .lock()
46            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
47        fs_lock::try_acquire(&path, Duration::from_secs(2))
48            .map(|guard| Self { _guard: guard })
49            .map_err(|error| match error {
50                fs_lock::AcquireError::Timeout => {
51                    std::io::Error::other("timed out acquiring search cache lock")
52                }
53                fs_lock::AcquireError::Io(error) => error,
54            })
55    }
56}
57
58#[derive(Clone, Debug)]
59pub struct SearchIndex {
60    pub postings: HashMap<u32, Vec<Posting>>,
61    pub files: Vec<FileEntry>,
62    pub path_to_id: HashMap<PathBuf, u32>,
63    pub ready: bool,
64    project_root: PathBuf,
65    git_head: Option<String>,
66    max_file_size: u64,
67    ignore_rules_fingerprint: String,
68    pub file_trigrams: HashMap<u32, Vec<u32>>,
69    unindexed_files: HashSet<u32>,
70}
71
72#[derive(Clone, Debug, Default)]
73pub struct LexicalRankResult {
74    pub files: Vec<(PathBuf, f32)>,
75    pub engine_capped: bool,
76}
77
78impl SearchIndex {
79    /// Number of indexed files.
80    pub fn file_count(&self) -> usize {
81        self.files.len()
82    }
83
84    /// Number of unique trigrams in the index.
85    pub fn trigram_count(&self) -> usize {
86        self.postings.len()
87    }
88
89    /// Compute distinct query trigrams from literal tokens.
90    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
91        query_trigrams_from_tokens(tokens)
92    }
93
94    /// Score-rank file candidates by lexical relevance to query trigrams.
95    pub fn lexical_rank(
96        &self,
97        query_trigrams: &[u32],
98        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
99        max_files: usize,
100    ) -> Vec<(PathBuf, f32)> {
101        self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
102            .files
103    }
104
105    /// Score-rank file candidates and report whether pre-filter candidate
106    /// enumeration hit the internal 200/500 cap before ranking.
107    pub fn lexical_rank_with_stats(
108        &self,
109        query_trigrams: &[u32],
110        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
111        max_files: usize,
112    ) -> LexicalRankResult {
113        if query_trigrams.is_empty() || max_files == 0 {
114            return LexicalRankResult::default();
115        }
116
117        let mut non_zero: Vec<(u32, usize)> = query_trigrams
118            .iter()
119            .filter_map(|trigram| {
120                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
121                (posting_count > 0).then_some((*trigram, posting_count))
122            })
123            .collect();
124        if non_zero.is_empty() {
125            return LexicalRankResult::default();
126        }
127
128        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
129        let selected_count = non_zero.len().min(3);
130        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
131
132        let mut candidate_ids = BTreeSet::new();
133        for (trigram, _) in non_zero.iter().take(selected_count) {
134            if let Some(postings) = self.postings.get(trigram) {
135                for posting in postings {
136                    if self.is_active_file(posting.file_id) {
137                        candidate_ids.insert(posting.file_id);
138                    }
139                }
140            }
141        }
142        let pre_filter_candidate_count = candidate_ids.len();
143        let engine_capped = pre_filter_candidate_count > candidate_cap;
144        let filtered_candidates = candidate_ids
145            .into_iter()
146            .filter_map(|file_id| {
147                self.files
148                    .get(file_id as usize)
149                    .map(|entry| (file_id, entry))
150            })
151            .filter(|(_, entry)| {
152                if let Some(filter) = candidate_filter {
153                    filter(&entry.path)
154                } else {
155                    true
156                }
157            })
158            .collect::<Vec<_>>();
159
160        let mut ranked = Vec::new();
161        for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
162            let score = lexical_score(self, query_trigrams, file_id);
163            if score > 0.0 {
164                ranked.push((entry.path.clone(), score));
165            }
166        }
167
168        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
169        ranked.truncate(max_files);
170        LexicalRankResult {
171            files: ranked,
172            engine_capped,
173        }
174    }
175}
176
177#[derive(Clone, Debug, PartialEq, Eq)]
178pub struct Posting {
179    pub file_id: u32,
180    pub next_mask: u8,
181    pub loc_mask: u8,
182}
183
184#[derive(Clone, Debug)]
185pub struct FileEntry {
186    pub path: PathBuf,
187    pub size: u64,
188    pub modified: SystemTime,
189    pub content_hash: blake3::Hash,
190}
191
192#[derive(Clone, Debug, PartialEq, Eq)]
193pub struct GrepMatch {
194    pub file: PathBuf,
195    pub line: u32,
196    pub column: u32,
197    pub line_text: String,
198    pub match_text: String,
199}
200
201#[derive(Clone, Debug)]
202pub struct GrepResult {
203    pub matches: Vec<GrepMatch>,
204    pub total_matches: usize,
205    pub files_searched: usize,
206    pub files_with_matches: usize,
207    pub index_status: IndexStatus,
208    pub truncated: bool,
209    pub fully_degraded: bool,
210    pub engine_capped: bool,
211    /// True when a fallback directory walk stopped early due to file-count or time budget.
212    pub walk_truncated: bool,
213}
214
215#[derive(Clone, Copy, Debug, PartialEq, Eq)]
216pub enum IndexStatus {
217    Ready,
218    Building,
219    Fallback,
220    Disabled,
221}
222
223impl IndexStatus {
224    pub fn as_str(&self) -> &'static str {
225        match self {
226            IndexStatus::Ready => "Ready",
227            IndexStatus::Building => "Building",
228            IndexStatus::Fallback => "Fallback",
229            IndexStatus::Disabled => "Disabled",
230        }
231    }
232}
233
234#[derive(Clone, Debug, Default)]
235pub struct RegexQuery {
236    pub and_trigrams: Vec<u32>,
237    pub or_groups: Vec<Vec<u32>>,
238    pub(crate) and_filters: HashMap<u32, PostingFilter>,
239    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
240}
241
242#[derive(Clone, Copy, Debug, Default)]
243pub(crate) struct PostingFilter {
244    next_mask: u8,
245    loc_mask: u8,
246}
247
248#[derive(Clone, Copy)]
249struct SearchFileMetadata {
250    size: u64,
251    modified: SystemTime,
252}
253
254struct PreparedIndexedFile {
255    metadata: SearchFileMetadata,
256    content_hash: blake3::Hash,
257    trigram_map: BTreeMap<u32, PostingFilter>,
258}
259
260enum PreparedSearchPath {
261    Indexed(PreparedIndexedFile),
262    Unindexed(SearchFileMetadata),
263    Skipped,
264}
265
266#[derive(Clone, Debug, Default)]
267struct QueryBuild {
268    and_runs: Vec<Vec<u8>>,
269    or_groups: Vec<Vec<Vec<u8>>>,
270}
271
272#[derive(Clone, Debug, Default)]
273pub(crate) struct PathFilters {
274    includes: Option<GlobSet>,
275    excludes: Option<GlobSet>,
276}
277
278#[derive(Clone, Debug)]
279pub(crate) struct SearchScope {
280    pub root: PathBuf,
281    pub use_index: bool,
282}
283
284#[derive(Clone, Debug)]
285struct SharedGrepMatch {
286    file: Arc<PathBuf>,
287    line: u32,
288    column: u32,
289    line_text: String,
290    match_text: String,
291}
292
293#[derive(Clone, Debug)]
294enum SearchMatcher {
295    Literal(LiteralSearch),
296    Regex(Regex),
297}
298
299impl SearchIndex {
300    pub fn new() -> Self {
301        SearchIndex {
302            postings: HashMap::new(),
303            files: Vec::new(),
304            path_to_id: HashMap::new(),
305            ready: false,
306            project_root: PathBuf::new(),
307            git_head: None,
308            max_file_size: DEFAULT_MAX_FILE_SIZE,
309            ignore_rules_fingerprint: String::new(),
310            file_trigrams: HashMap::new(),
311            unindexed_files: HashSet::new(),
312        }
313    }
314
315    pub fn build(root: &Path) -> Self {
316        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
317    }
318
319    pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
320        let started = std::time::Instant::now();
321        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
322        let mut index = SearchIndex {
323            project_root: project_root.clone(),
324            max_file_size,
325            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
326            ..SearchIndex::new()
327        };
328
329        let filters = PathFilters::default();
330        let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
331        let indexed = index.ingest_paths_parallel(&paths);
332
333        index.git_head = current_git_head(&project_root);
334        index.ready = true;
335        crate::slog_info!(
336            "search index cold build: {} files, {} trigrams, {} ms (pool={})",
337            indexed,
338            index.postings.len(),
339            started.elapsed().as_millis(),
340            search_index_build_pool_size()
341        );
342        index
343    }
344
345    /// Serial cold build for tests and parity checks against [`build_with_limit`].
346    #[cfg(test)]
347    pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
348        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
349        let mut index = SearchIndex {
350            project_root: project_root.clone(),
351            max_file_size,
352            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
353            ..SearchIndex::new()
354        };
355        let filters = PathFilters::default();
356        for path in walk_project_files(&project_root, &filters) {
357            index.update_file(&path);
358        }
359        index.git_head = current_git_head(&project_root);
360        index.ready = true;
361        index
362    }
363
364    fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
365        let max_file_size = self.max_file_size;
366        let pool = match rayon::ThreadPoolBuilder::new()
367            .num_threads(search_index_build_pool_size())
368            .thread_name(|index| format!("aft-search-build-{index}"))
369            .stack_size(8 * 1024 * 1024)
370            .build()
371        {
372            Ok(pool) => Some(pool),
373            Err(error) => {
374                log::warn!(
375                    "search index: bounded build pool unavailable ({error}); using global pool"
376                );
377                None
378            }
379        };
380
381        let mut indexed = 0usize;
382        for chunk in paths.chunks(PARALLEL_INGEST_CHUNK_SIZE) {
383            let prepare_chunk = || -> Vec<PreparedSearchPath> {
384                chunk
385                    .par_iter()
386                    .map(|path| prepare_search_path(path, max_file_size))
387                    .collect()
388            };
389            let prepared = match &pool {
390                Some(pool) => pool.install(prepare_chunk),
391                None => prepare_chunk(),
392            };
393
394            for (path, prepared) in chunk.iter().zip(prepared) {
395                let inserted = match prepared {
396                    PreparedSearchPath::Indexed(file) => self.index_prepared_new_file(path, file),
397                    PreparedSearchPath::Unindexed(metadata) => {
398                        self.track_unindexed_file_with_metadata(path, metadata)
399                    }
400                    PreparedSearchPath::Skipped => false,
401                };
402                if inserted {
403                    indexed += 1;
404                }
405            }
406        }
407
408        indexed
409    }
410
411    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
412        self.remove_file(path);
413        let metadata = metadata_for_indexed_content(path, content.len() as u64);
414        self.index_file_with_metadata(path, content, metadata);
415    }
416
417    fn index_file_with_metadata(
418        &mut self,
419        path: &Path,
420        content: &[u8],
421        metadata: SearchFileMetadata,
422    ) -> bool {
423        self.index_prepared_new_file(
424            path,
425            PreparedIndexedFile {
426                metadata,
427                content_hash: cache_freshness::hash_bytes(content),
428                trigram_map: trigram_filter_map(content, true),
429            },
430        )
431    }
432
433    fn index_prepared_new_file(&mut self, path: &Path, file: PreparedIndexedFile) -> bool {
434        let file_id = match self.allocate_file_id_with_metadata(path, file.metadata) {
435            Some(file_id) => file_id,
436            None => return false,
437        };
438        if let Some(entry) = self.files.get_mut(file_id as usize) {
439            entry.content_hash = file.content_hash;
440        }
441
442        let mut file_trigrams = Vec::with_capacity(file.trigram_map.len());
443        for (trigram, filter) in file.trigram_map {
444            let postings = self.postings.entry(trigram).or_default();
445            postings.push(Posting {
446                file_id,
447                next_mask: filter.next_mask,
448                loc_mask: filter.loc_mask,
449            });
450            // Posting lists are kept sorted by file_id for binary search during
451            // intersection. Since file_ids are allocated incrementally, the new
452            // entry is usually already in order. Only sort when needed.
453            if postings.len() > 1
454                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
455            {
456                postings.sort_unstable_by_key(|p| p.file_id);
457            }
458            file_trigrams.push(trigram);
459        }
460
461        self.file_trigrams.insert(file_id, file_trigrams);
462        self.unindexed_files.remove(&file_id);
463        true
464    }
465
466    pub fn remove_file(&mut self, path: &Path) {
467        let canonical_path = canonicalize_existing_or_deleted_path(path);
468        let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
469            file_id
470        } else if canonical_path.as_path() != path {
471            let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
472                return;
473            };
474            file_id
475        } else {
476            return;
477        };
478
479        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
480            for trigram in trigrams {
481                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
482                    postings.retain(|posting| posting.file_id != file_id);
483                    postings.is_empty()
484                } else {
485                    false
486                };
487
488                if should_remove {
489                    self.postings.remove(&trigram);
490                }
491            }
492        }
493
494        self.unindexed_files.remove(&file_id);
495        if let Some(file) = self.files.get_mut(file_id as usize) {
496            file.path = PathBuf::new();
497            file.size = 0;
498            file.modified = UNIX_EPOCH;
499            file.content_hash = cache_freshness::zero_hash();
500        }
501    }
502
503    pub fn update_file(&mut self, path: &Path) {
504        self.remove_file(path);
505
506        let metadata = match fs::metadata(path) {
507            Ok(metadata) if metadata.is_file() => metadata,
508            _ => return,
509        };
510
511        let metadata = search_file_metadata(&metadata);
512
513        if is_binary_path(path, metadata.size) {
514            self.track_unindexed_file_with_metadata(path, metadata);
515            return;
516        }
517
518        if metadata.size > self.max_file_size {
519            self.track_unindexed_file_with_metadata(path, metadata);
520            return;
521        }
522
523        let content = match fs::read(path) {
524            Ok(content) => content,
525            Err(_) => return,
526        };
527
528        if is_binary_bytes(&content) {
529            self.track_unindexed_file_with_metadata(path, metadata);
530            return;
531        }
532
533        self.index_file_with_metadata(path, &content, metadata);
534    }
535
536    pub fn grep(
537        &self,
538        pattern: &str,
539        case_sensitive: bool,
540        include: &[String],
541        exclude: &[String],
542        search_root: &Path,
543        max_results: usize,
544    ) -> GrepResult {
545        match pattern_compile::compile(
546            pattern,
547            CompileOpts {
548                case_insensitive: !case_sensitive,
549                ..CompileOpts::default()
550            },
551        ) {
552            CompileResult::Ok(compiled) => {
553                self.search_grep(&compiled, include, exclude, search_root, max_results)
554            }
555            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
556                self.empty_grep_result()
557            }
558        }
559    }
560
561    pub fn search_grep(
562        &self,
563        pattern: &CompiledPattern,
564        include: &[String],
565        exclude: &[String],
566        search_root: &Path,
567        max_results: usize,
568    ) -> GrepResult {
569        let matcher = match pattern {
570            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
571            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
572        };
573
574        let filters = match build_path_filters(include, exclude) {
575            Ok(filters) => filters,
576            Err(_) => PathFilters::default(),
577        };
578        let search_root = canonicalize_or_normalize(search_root);
579
580        let raw_pattern = pattern.raw_pattern_for_trigrams();
581        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
582            RegexQuery::default()
583        } else {
584            decompose_regex(&raw_pattern)
585        };
586        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
587        let candidate_ids = self.candidates(&query);
588
589        let candidate_files: Vec<&FileEntry> = candidate_ids
590            .into_iter()
591            .filter_map(|file_id| self.files.get(file_id as usize))
592            .filter(|file| !file.path.as_os_str().is_empty())
593            .filter(|file| is_within_search_root(&search_root, &file.path))
594            .filter(|file| filters.matches(&self.project_root, &file.path))
595            .collect();
596
597        let total_matches = AtomicUsize::new(0);
598        let files_searched = AtomicUsize::new(0);
599        let files_with_matches = AtomicUsize::new(0);
600        let truncated = AtomicBool::new(false);
601        let engine_capped = AtomicBool::new(false);
602        let stop_after = max_results.saturating_mul(2);
603        let stop_scan = Arc::new(AtomicBool::new(false));
604
605        let mut matches = if candidate_files.len() > 10 {
606            candidate_files
607                .par_iter()
608                .map(|file| {
609                    if grep_scan_should_stop(
610                        Some(&stop_scan),
611                        &truncated,
612                        &total_matches,
613                        stop_after,
614                    ) {
615                        engine_capped.store(true, Ordering::Relaxed);
616                        return Vec::new();
617                    }
618                    search_candidate_file(
619                        file,
620                        &matcher,
621                        max_results,
622                        stop_after,
623                        &total_matches,
624                        &files_searched,
625                        &files_with_matches,
626                        &truncated,
627                        &engine_capped,
628                        Some(&stop_scan),
629                    )
630                })
631                .reduce(Vec::new, |mut left, mut right| {
632                    // NOTE: do NOT set engine_capped here. reduce() runs on every
633                    // partial-result merge in a multi-chunk parallel search,
634                    // capped or not — setting it unconditionally would falsely
635                    // report every indexed grep over >10 candidates as capped.
636                    // Cap detection is owned by grep_scan_should_stop (in the map
637                    // closure) and should_stop_search (serial path), which set
638                    // engine_capped only when the result cap is actually reached.
639                    left.append(&mut right);
640                    left
641                })
642        } else {
643            let mut matches = Vec::new();
644            for file in candidate_files {
645                matches.extend(search_candidate_file(
646                    file,
647                    &matcher,
648                    max_results,
649                    stop_after,
650                    &total_matches,
651                    &files_searched,
652                    &files_with_matches,
653                    &truncated,
654                    &engine_capped,
655                    None,
656                ));
657
658                if should_stop_search(&truncated, &total_matches, stop_after) {
659                    engine_capped.store(true, Ordering::Relaxed);
660                    break;
661                }
662            }
663            matches
664        };
665
666        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
667            self.path_to_id
668                .get(path)
669                .and_then(|file_id| self.files.get(*file_id as usize))
670                .map(|file| file.modified)
671        });
672
673        let matches = matches
674            .into_iter()
675            .map(|matched| GrepMatch {
676                file: matched.file.as_ref().clone(),
677                line: matched.line,
678                column: matched.column,
679                line_text: matched.line_text,
680                match_text: matched.match_text,
681            })
682            .collect();
683
684        GrepResult {
685            total_matches: total_matches.load(Ordering::Relaxed),
686            matches,
687            files_searched: files_searched.load(Ordering::Relaxed),
688            files_with_matches: files_with_matches.load(Ordering::Relaxed),
689            index_status: if self.ready {
690                IndexStatus::Ready
691            } else {
692                IndexStatus::Building
693            },
694            truncated: truncated.load(Ordering::Relaxed),
695            fully_degraded,
696            engine_capped: engine_capped.load(Ordering::Relaxed),
697            walk_truncated: false,
698        }
699    }
700
701    fn empty_grep_result(&self) -> GrepResult {
702        GrepResult {
703            matches: Vec::new(),
704            total_matches: 0,
705            files_searched: 0,
706            files_with_matches: 0,
707            index_status: if self.ready {
708                IndexStatus::Ready
709            } else {
710                IndexStatus::Building
711            },
712            truncated: false,
713            fully_degraded: false,
714            engine_capped: false,
715            walk_truncated: false,
716        }
717    }
718
719    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
720        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
721            Ok(filters) => filters,
722            Err(_) => return Vec::new(),
723        };
724        let search_root = canonicalize_or_normalize(search_root);
725        let mut entries = self
726            .files
727            .iter()
728            .filter(|file| !file.path.as_os_str().is_empty())
729            .filter(|file| is_within_search_root(&search_root, &file.path))
730            .filter(|file| filters.matches(&self.project_root, &file.path))
731            .map(|file| (file.path.clone(), file.modified))
732            .collect::<Vec<_>>();
733
734        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
735            right_mtime
736                .cmp(left_mtime)
737                .then_with(|| left_path.cmp(right_path))
738        });
739
740        entries.into_iter().map(|(path, _)| path).collect()
741    }
742
743    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
744        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
745            return self.active_file_ids();
746        }
747
748        let mut and_trigrams = query.and_trigrams.clone();
749        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
750
751        let mut current: Option<Vec<u32>> = None;
752
753        for trigram in and_trigrams {
754            let filter = query.and_filters.get(&trigram).copied();
755            let matches = self.postings_for_trigram(trigram, filter);
756            current = Some(match current.take() {
757                Some(existing) => intersect_sorted_ids(&existing, &matches),
758                None => matches,
759            });
760
761            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
762                break;
763            }
764        }
765
766        let mut current = current.unwrap_or_else(|| self.active_file_ids());
767
768        for (index, group) in query.or_groups.iter().enumerate() {
769            let mut group_matches = Vec::new();
770            let filters = query.or_filters.get(index);
771
772            for trigram in group {
773                let filter = filters.and_then(|filters| filters.get(trigram).copied());
774                let matches = self.postings_for_trigram(*trigram, filter);
775                if group_matches.is_empty() {
776                    group_matches = matches;
777                } else {
778                    group_matches = union_sorted_ids(&group_matches, &matches);
779                }
780            }
781
782            current = intersect_sorted_ids(&current, &group_matches);
783            if current.is_empty() {
784                break;
785            }
786        }
787
788        let mut unindexed = self
789            .unindexed_files
790            .iter()
791            .copied()
792            .filter(|file_id| self.is_active_file(*file_id))
793            .collect::<Vec<_>>();
794        if !unindexed.is_empty() {
795            unindexed.sort_unstable();
796            current = union_sorted_ids(&current, &unindexed);
797        }
798
799        current
800    }
801
802    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
803        if fs::create_dir_all(cache_dir).is_err() {
804            return;
805        }
806
807        let cache_path = cache_dir.join("cache.bin");
808        let tmp_cache = cache_dir.join(format!(
809            "cache.bin.tmp.{}.{}",
810            std::process::id(),
811            SystemTime::now()
812                .duration_since(UNIX_EPOCH)
813                .unwrap_or(Duration::ZERO)
814                .as_nanos()
815        ));
816
817        let active_ids = self.active_file_ids();
818        let mut id_map = HashMap::new();
819        for (new_id, old_id) in active_ids.iter().enumerate() {
820            let Ok(new_id_u32) = u32::try_from(new_id) else {
821                return;
822            };
823            id_map.insert(*old_id, new_id_u32);
824        }
825
826        let write_result = (|| -> std::io::Result<()> {
827            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
828
829            postings_writer.write_all(INDEX_MAGIC)?;
830            write_u32(&mut postings_writer, INDEX_VERSION)?;
831
832            let head = git_head.unwrap_or_default();
833            let root = self.project_root.to_string_lossy();
834            let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
835                ignore_rules_fingerprint(&self.project_root)
836            } else {
837                self.ignore_rules_fingerprint.clone()
838            };
839            let head_len = u32::try_from(head.len())
840                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
841            let root_len = u32::try_from(root.len())
842                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
843            let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
844                .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
845            let file_count = u32::try_from(active_ids.len())
846                .map_err(|_| std::io::Error::other("too many files to cache"))?;
847
848            write_u32(&mut postings_writer, head_len)?;
849            write_u32(&mut postings_writer, root_len)?;
850            write_u32(&mut postings_writer, ignore_fingerprint_len)?;
851            write_u64(&mut postings_writer, self.max_file_size)?;
852            write_u32(&mut postings_writer, file_count)?;
853            postings_writer.write_all(head.as_bytes())?;
854            postings_writer.write_all(root.as_bytes())?;
855            postings_writer.write_all(ignore_fingerprint.as_bytes())?;
856
857            for old_id in &active_ids {
858                let Some(file) = self.files.get(*old_id as usize) else {
859                    return Err(std::io::Error::other("missing file entry for cache write"));
860                };
861                let path =
862                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
863                        std::io::Error::other(format!(
864                            "refusing to cache path outside project root: {}",
865                            file.path.display()
866                        ))
867                    })?;
868                let path = path.to_string_lossy();
869                let path_len = u32::try_from(path.len())
870                    .map_err(|_| std::io::Error::other("cached path too large"))?;
871                let modified = file
872                    .modified
873                    .duration_since(UNIX_EPOCH)
874                    .unwrap_or(Duration::ZERO);
875                let unindexed = if self.unindexed_files.contains(old_id) {
876                    1u8
877                } else {
878                    0u8
879                };
880
881                postings_writer.write_all(&[unindexed])?;
882                write_u32(&mut postings_writer, path_len)?;
883                write_u64(&mut postings_writer, file.size)?;
884                write_u64(&mut postings_writer, modified.as_secs())?;
885                write_u32(&mut postings_writer, modified.subsec_nanos())?;
886                postings_writer.write_all(file.content_hash.as_bytes())?;
887                postings_writer.write_all(path.as_bytes())?;
888            }
889
890            let mut lookup_entries = Vec::new();
891            let mut postings_blob = Vec::new();
892            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
893            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
894
895            for (trigram, postings) in sorted_postings {
896                let offset = u64::try_from(postings_blob.len())
897                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
898                let mut count = 0u32;
899
900                for posting in postings {
901                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
902                        continue;
903                    };
904
905                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
906                    postings_blob.push(posting.next_mask);
907                    postings_blob.push(posting.loc_mask);
908                    count = count.saturating_add(1);
909                }
910
911                if count > 0 {
912                    lookup_entries.push((*trigram, offset, count));
913                }
914            }
915
916            write_u64(
917                &mut postings_writer,
918                u64::try_from(postings_blob.len())
919                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
920            )?;
921            postings_writer.write_all(&postings_blob)?;
922            postings_writer.flush()?;
923            let mut postings_blob_file = postings_writer
924                .into_inner()
925                .map_err(|error| std::io::Error::other(error.to_string()))?
926                .into_inner();
927            let checksum = crc32fast::hash(&postings_blob_file);
928            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
929
930            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
931            let entry_count = u32::try_from(lookup_entries.len())
932                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
933
934            lookup_writer.write_all(LOOKUP_MAGIC)?;
935            write_u32(&mut lookup_writer, INDEX_VERSION)?;
936            write_u32(&mut lookup_writer, entry_count)?;
937
938            for (trigram, offset, count) in lookup_entries {
939                write_u32(&mut lookup_writer, trigram)?;
940                write_u64(&mut lookup_writer, offset)?;
941                write_u32(&mut lookup_writer, count)?;
942            }
943
944            lookup_writer.flush()?;
945            let mut lookup_blob_file = lookup_writer
946                .into_inner()
947                .map_err(|error| std::io::Error::other(error.to_string()))?
948                .into_inner();
949            let checksum = crc32fast::hash(&lookup_blob_file);
950            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
951
952            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
953            write_u32(&mut cache_writer, CACHE_MAGIC)?;
954            write_u32(&mut cache_writer, INDEX_VERSION)?;
955            write_u64(
956                &mut cache_writer,
957                u64::try_from(postings_blob_file.len())
958                    .map_err(|_| std::io::Error::other("postings section too large"))?,
959            )?;
960            cache_writer.write_all(&postings_blob_file)?;
961            cache_writer.write_all(&lookup_blob_file)?;
962            cache_writer.flush()?;
963            cache_writer.get_ref().sync_all()?;
964            drop(cache_writer);
965            fs::rename(&tmp_cache, &cache_path)?;
966
967            Ok(())
968        })();
969
970        if write_result.is_err() {
971            let _ = fs::remove_file(&tmp_cache);
972        }
973    }
974
975    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
976        debug_assert!(current_canonical_root.is_absolute());
977        let cache_path = cache_dir.join("cache.bin");
978        let cache_bytes = fs::read(&cache_path).ok()?;
979        if cache_bytes.len() < 16 {
980            return None;
981        }
982        let mut header = Cursor::new(&cache_bytes);
983        if read_u32(&mut header).ok()? != CACHE_MAGIC {
984            return None;
985        }
986        if read_u32(&mut header).ok()? != INDEX_VERSION {
987            return None;
988        }
989        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
990        let start = usize::try_from(header.position()).ok()?;
991        let postings_end = start.checked_add(postings_len_total)?;
992        if postings_end > cache_bytes.len() {
993            return None;
994        }
995        let postings_bytes = &cache_bytes[start..postings_end];
996        let lookup_bytes = &cache_bytes[postings_end..];
997        let lookup_len_total = lookup_bytes.len();
998        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
999        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
1000        if postings_len_total < 4 || lookup_len_total < 4 {
1001            return None;
1002        }
1003        verify_crc32_bytes_slice(postings_bytes).ok()?;
1004        verify_crc32_bytes_slice(lookup_bytes).ok()?;
1005
1006        let mut magic = [0u8; 8];
1007        postings_reader.read_exact(&mut magic).ok()?;
1008        if &magic != INDEX_MAGIC {
1009            return None;
1010        }
1011        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
1012            return None;
1013        }
1014
1015        let head_len = read_u32(&mut postings_reader).ok()? as usize;
1016        let root_len = read_u32(&mut postings_reader).ok()? as usize;
1017        let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
1018        let max_file_size = read_u64(&mut postings_reader).ok()?;
1019        let file_count = read_u32(&mut postings_reader).ok()? as usize;
1020        if file_count > MAX_ENTRIES {
1021            return None;
1022        }
1023        let postings_body_len = postings_len_total.checked_sub(4)?;
1024        let lookup_body_len = lookup_len_total.checked_sub(4)?;
1025
1026        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
1027        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
1028        if minimum_file_bytes > remaining_postings {
1029            return None;
1030        }
1031
1032        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1033            return None;
1034        }
1035        let mut head_bytes = vec![0u8; head_len];
1036        postings_reader.read_exact(&mut head_bytes).ok()?;
1037        let git_head = String::from_utf8(head_bytes)
1038            .ok()
1039            .filter(|head| !head.is_empty());
1040
1041        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1042            return None;
1043        }
1044        let mut root_bytes = vec![0u8; root_len];
1045        postings_reader.read_exact(&mut root_bytes).ok()?;
1046        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
1047        let project_root = current_canonical_root.to_path_buf();
1048
1049        if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1050            return None;
1051        }
1052        let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
1053        postings_reader
1054            .read_exact(&mut ignore_fingerprint_bytes)
1055            .ok()?;
1056        let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
1057        let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
1058        if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1059            return None;
1060        }
1061
1062        let mut files = Vec::with_capacity(file_count);
1063        let mut path_to_id = HashMap::new();
1064        let mut unindexed_files = HashSet::new();
1065
1066        for file_id in 0..file_count {
1067            let mut unindexed = [0u8; 1];
1068            postings_reader.read_exact(&mut unindexed).ok()?;
1069            let path_len = read_u32(&mut postings_reader).ok()? as usize;
1070            let size = read_u64(&mut postings_reader).ok()?;
1071            let secs = read_u64(&mut postings_reader).ok()?;
1072            let nanos = read_u32(&mut postings_reader).ok()?;
1073            let mut hash_bytes = [0u8; 32];
1074            postings_reader.read_exact(&mut hash_bytes).ok()?;
1075            let content_hash = blake3::Hash::from_bytes(hash_bytes);
1076            if nanos >= 1_000_000_000 {
1077                return None;
1078            }
1079            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1080                return None;
1081            }
1082            let mut path_bytes = vec![0u8; path_len];
1083            postings_reader.read_exact(&mut path_bytes).ok()?;
1084            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
1085            let full_path = cached_path_under_root(&project_root, &relative_path)?;
1086            let file_id_u32 = u32::try_from(file_id).ok()?;
1087
1088            files.push(FileEntry {
1089                path: full_path.clone(),
1090                size,
1091                modified: UNIX_EPOCH + Duration::new(secs, nanos),
1092                content_hash,
1093            });
1094            path_to_id.insert(full_path, file_id_u32);
1095            if unindexed[0] == 1 {
1096                unindexed_files.insert(file_id_u32);
1097            }
1098        }
1099
1100        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
1101        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
1102        if postings_len > max_postings_bytes {
1103            return None;
1104        }
1105        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1106            return None;
1107        }
1108        let mut postings_blob = vec![0u8; postings_len];
1109        postings_reader.read_exact(&mut postings_blob).ok()?;
1110
1111        let mut lookup_magic = [0u8; 8];
1112        lookup_reader.read_exact(&mut lookup_magic).ok()?;
1113        if &lookup_magic != LOOKUP_MAGIC {
1114            return None;
1115        }
1116        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
1117            return None;
1118        }
1119        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
1120        if entry_count > MAX_ENTRIES {
1121            return None;
1122        }
1123        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
1124        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
1125        if minimum_lookup_bytes > remaining_lookup {
1126            return None;
1127        }
1128
1129        let mut postings = HashMap::new();
1130        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1131
1132        for _ in 0..entry_count {
1133            let trigram = read_u32(&mut lookup_reader).ok()?;
1134            let offset = read_u64(&mut lookup_reader).ok()? as usize;
1135            let count = read_u32(&mut lookup_reader).ok()? as usize;
1136            if count > MAX_ENTRIES {
1137                return None;
1138            }
1139            let bytes_len = count.checked_mul(POSTING_BYTES)?;
1140            let end = offset.checked_add(bytes_len)?;
1141            if end > postings_blob.len() {
1142                return None;
1143            }
1144
1145            let mut trigram_postings = Vec::with_capacity(count);
1146            for chunk in postings_blob[offset..end].chunks_exact(6) {
1147                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1148                let posting = Posting {
1149                    file_id,
1150                    next_mask: chunk[4],
1151                    loc_mask: chunk[5],
1152                };
1153                trigram_postings.push(posting.clone());
1154                file_trigrams.entry(file_id).or_default().push(trigram);
1155            }
1156            postings.insert(trigram, trigram_postings);
1157        }
1158
1159        Some(SearchIndex {
1160            postings,
1161            files,
1162            path_to_id,
1163            ready: false,
1164            project_root,
1165            git_head,
1166            max_file_size,
1167            ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1168            file_trigrams,
1169            unindexed_files,
1170        })
1171    }
1172
1173    pub fn stored_git_head(&self) -> Option<&str> {
1174        self.git_head.as_deref()
1175    }
1176
1177    pub(crate) fn set_ready(&mut self, ready: bool) {
1178        self.ready = ready;
1179    }
1180
1181    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1182        self.git_head = current_head;
1183        verify_file_mtimes(self);
1184        self.ready = true;
1185    }
1186
1187    #[cfg(debug_assertions)]
1188    #[doc(hidden)]
1189    pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1190        self.verify_against_disk(current_head);
1191    }
1192
1193    pub(crate) fn rebuild_or_refresh(
1194        root: &Path,
1195        max_file_size: u64,
1196        current_head: Option<String>,
1197        baseline: Option<SearchIndex>,
1198    ) -> Self {
1199        if let Some(mut baseline) = baseline {
1200            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1201            baseline.max_file_size = max_file_size;
1202            let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1203            if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1204                return SearchIndex::build_with_limit(root, max_file_size);
1205            }
1206            baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1207
1208            if baseline.git_head == current_head || current_head.is_none() {
1209                // HEAD matches, but files may have changed on disk since the index was
1210                // last written (e.g., uncommitted edits, stash pop, manual file changes
1211                // while OpenCode was closed). Verify mtimes and re-index stale files.
1212                // Non-git projects also use this per-file (path, mtime, size)
1213                // fingerprint so unchanged trees reuse the disk cache instead of
1214                // rebuilding every configure.
1215                baseline.git_head = current_head;
1216                verify_file_mtimes(&mut baseline);
1217                baseline.ready = true;
1218                return baseline;
1219            }
1220
1221            if let (Some(previous), Some(current)) =
1222                (baseline.git_head.clone(), current_head.clone())
1223            {
1224                let project_root = baseline.project_root.clone();
1225                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1226                    baseline.git_head = Some(current);
1227                    verify_file_mtimes(&mut baseline);
1228                    baseline.ready = true;
1229                    return baseline;
1230                }
1231            }
1232        }
1233
1234        SearchIndex::build_with_limit(root, max_file_size)
1235    }
1236
1237    fn allocate_file_id_with_metadata(
1238        &mut self,
1239        path: &Path,
1240        metadata: SearchFileMetadata,
1241    ) -> Option<u32> {
1242        let file_id = u32::try_from(self.files.len()).ok()?;
1243        self.files.push(FileEntry {
1244            path: path.to_path_buf(),
1245            size: metadata.size,
1246            modified: metadata.modified,
1247            content_hash: cache_freshness::zero_hash(),
1248        });
1249        self.path_to_id.insert(path.to_path_buf(), file_id);
1250        Some(file_id)
1251    }
1252
1253    fn track_unindexed_file_with_metadata(
1254        &mut self,
1255        path: &Path,
1256        metadata: SearchFileMetadata,
1257    ) -> bool {
1258        let Some(file_id) = self.allocate_file_id_with_metadata(path, metadata) else {
1259            return false;
1260        };
1261        self.unindexed_files.insert(file_id);
1262        self.file_trigrams.insert(file_id, Vec::new());
1263        true
1264    }
1265
1266    fn active_file_ids(&self) -> Vec<u32> {
1267        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1268        ids.sort_unstable();
1269        ids
1270    }
1271
1272    fn is_active_file(&self, file_id: u32) -> bool {
1273        self.files
1274            .get(file_id as usize)
1275            .map(|file| !file.path.as_os_str().is_empty())
1276            .unwrap_or(false)
1277    }
1278
1279    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1280        let Some(postings) = self.postings.get(&trigram) else {
1281            return Vec::new();
1282        };
1283
1284        let mut matches = Vec::with_capacity(postings.len());
1285
1286        for posting in postings {
1287            if let Some(filter) = filter {
1288                // next_mask: bloom filter check — the character following this trigram in the
1289                // query must also appear after this trigram somewhere in the file.
1290                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1291                    continue;
1292                }
1293                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1294                // between consecutive trigram pairs, but is NOT used as a single-trigram
1295                // filter because the position in the query string has no relationship to
1296                // the position in the file. Using it here causes false negatives.
1297            }
1298            if self.is_active_file(posting.file_id) {
1299                matches.push(posting.file_id);
1300            }
1301        }
1302
1303        matches
1304    }
1305}
1306
1307fn search_candidate_file(
1308    file: &FileEntry,
1309    matcher: &SearchMatcher,
1310    max_results: usize,
1311    stop_after: usize,
1312    total_matches: &AtomicUsize,
1313    files_searched: &AtomicUsize,
1314    files_with_matches: &AtomicUsize,
1315    truncated: &AtomicBool,
1316    engine_capped: &AtomicBool,
1317    stop_scan: Option<&Arc<AtomicBool>>,
1318) -> Vec<SharedGrepMatch> {
1319    if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1320        engine_capped.store(true, Ordering::Relaxed);
1321        return Vec::new();
1322    }
1323
1324    let content = match read_indexed_file_bytes(&file.path) {
1325        Some(content) => content,
1326        None => return Vec::new(),
1327    };
1328    // Defense in depth: even though indexing tries to filter binaries via
1329    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1330    // query time. content_inspector is fast (~bytes-per-cycle on a small
1331    // preview) and this guarantees we never surface matches inside binary
1332    // files even if the indexer somehow let one through (e.g. file changed
1333    // between indexing and query).
1334    if is_binary_bytes(&content) {
1335        return Vec::new();
1336    }
1337    files_searched.fetch_add(1, Ordering::Relaxed);
1338
1339    let shared_path = Arc::new(file.path.clone());
1340    let mut matches = Vec::new();
1341    let mut line_starts = None;
1342    let mut seen_lines = HashSet::new();
1343    let mut matched_this_file = false;
1344
1345    match matcher {
1346        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1347            let needle = &literal.needle;
1348            let finder = memchr::memmem::Finder::new(needle);
1349            let mut start = 0;
1350
1351            while let Some(position) = finder.find(&content[start..]) {
1352                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1353                    engine_capped.store(true, Ordering::Relaxed);
1354                    break;
1355                }
1356
1357                let offset = start + position;
1358                start = offset + 1;
1359
1360                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1361                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1362                if !seen_lines.insert(line) {
1363                    continue;
1364                }
1365
1366                matched_this_file = true;
1367                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1368                if match_number > max_results {
1369                    truncated.store(true, Ordering::Relaxed);
1370                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1371                    break;
1372                }
1373
1374                let end = offset + needle.len();
1375                matches.push(SharedGrepMatch {
1376                    file: shared_path.clone(),
1377                    line,
1378                    column,
1379                    line_text,
1380                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1381                });
1382            }
1383        }
1384        SearchMatcher::Literal(literal) => {
1385            let needle = &literal.needle;
1386            let search_content = content.to_ascii_lowercase();
1387            let finder = memchr::memmem::Finder::new(needle);
1388            let mut start = 0;
1389
1390            while let Some(position) = finder.find(&search_content[start..]) {
1391                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1392                    engine_capped.store(true, Ordering::Relaxed);
1393                    break;
1394                }
1395
1396                let offset = start + position;
1397                start = offset + 1;
1398
1399                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1400                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1401                if !seen_lines.insert(line) {
1402                    continue;
1403                }
1404
1405                matched_this_file = true;
1406                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1407                if match_number > max_results {
1408                    truncated.store(true, Ordering::Relaxed);
1409                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1410                    break;
1411                }
1412
1413                let end = offset + needle.len();
1414                matches.push(SharedGrepMatch {
1415                    file: shared_path.clone(),
1416                    line,
1417                    column,
1418                    line_text,
1419                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1420                });
1421            }
1422        }
1423        SearchMatcher::Regex(regex) => {
1424            for matched in regex.find_iter(&content) {
1425                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1426                    engine_capped.store(true, Ordering::Relaxed);
1427                    break;
1428                }
1429
1430                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1431                let (line, column, line_text) =
1432                    line_details_bytes(&content, line_starts, matched.start());
1433                if !seen_lines.insert(line) {
1434                    continue;
1435                }
1436
1437                matched_this_file = true;
1438                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1439                if match_number > max_results {
1440                    truncated.store(true, Ordering::Relaxed);
1441                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1442                    break;
1443                }
1444
1445                matches.push(SharedGrepMatch {
1446                    file: shared_path.clone(),
1447                    line,
1448                    column,
1449                    line_text,
1450                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1451                });
1452            }
1453        }
1454    }
1455
1456    if matched_this_file {
1457        files_with_matches.fetch_add(1, Ordering::Relaxed);
1458    }
1459
1460    matches
1461}
1462
1463fn should_stop_search(
1464    truncated: &AtomicBool,
1465    total_matches: &AtomicUsize,
1466    stop_after: usize,
1467) -> bool {
1468    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1469}
1470
1471fn grep_scan_should_stop(
1472    stop_scan: Option<&Arc<AtomicBool>>,
1473    truncated: &AtomicBool,
1474    total_matches: &AtomicUsize,
1475    stop_after: usize,
1476) -> bool {
1477    stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1478        || should_stop_search(truncated, total_matches, stop_after)
1479}
1480
1481fn signal_grep_scan_cap(
1482    stop_scan: Option<&Arc<AtomicBool>>,
1483    total_matches: &AtomicUsize,
1484    stop_after: usize,
1485) {
1486    if let Some(flag) = stop_scan {
1487        if total_matches.load(Ordering::Relaxed) >= stop_after {
1488            flag.store(true, Ordering::Relaxed);
1489        }
1490    }
1491}
1492
1493fn search_file_metadata(metadata: &fs::Metadata) -> SearchFileMetadata {
1494    SearchFileMetadata {
1495        size: metadata.len(),
1496        modified: metadata.modified().unwrap_or(UNIX_EPOCH),
1497    }
1498}
1499
1500fn metadata_for_indexed_content(path: &Path, size_hint: u64) -> SearchFileMetadata {
1501    fs::metadata(path)
1502        .ok()
1503        .map(|metadata| search_file_metadata(&metadata))
1504        .unwrap_or(SearchFileMetadata {
1505            size: size_hint,
1506            modified: UNIX_EPOCH,
1507        })
1508}
1509
1510fn prepare_search_path(path: &Path, max_file_size: u64) -> PreparedSearchPath {
1511    let metadata = match fs::metadata(path) {
1512        Ok(metadata) if metadata.is_file() => search_file_metadata(&metadata),
1513        _ => return PreparedSearchPath::Skipped,
1514    };
1515
1516    if is_binary_path(path, metadata.size) || metadata.size > max_file_size {
1517        return PreparedSearchPath::Unindexed(metadata);
1518    }
1519
1520    let content = match fs::read(path) {
1521        Ok(content) => content,
1522        Err(_) => return PreparedSearchPath::Skipped,
1523    };
1524
1525    if is_binary_bytes(&content) {
1526        return PreparedSearchPath::Unindexed(metadata);
1527    }
1528
1529    PreparedSearchPath::Indexed(PreparedIndexedFile {
1530        metadata,
1531        content_hash: cache_freshness::hash_bytes(&content),
1532        trigram_map: trigram_filter_map(&content, true),
1533    })
1534}
1535
1536/// Bound cold search-index ingestion to half the cores (cap 8), matching callgraph store.
1537fn search_index_build_pool_size() -> usize {
1538    std::thread::available_parallelism()
1539        .map(|parallelism| parallelism.get())
1540        .unwrap_or(1)
1541        .div_ceil(2)
1542        .clamp(1, 8)
1543}
1544
1545fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1546    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1547    let mut left_index = 0;
1548    let mut right_index = 0;
1549
1550    while left_index < left.len() && right_index < right.len() {
1551        match left[left_index].cmp(&right[right_index]) {
1552            std::cmp::Ordering::Less => left_index += 1,
1553            std::cmp::Ordering::Greater => right_index += 1,
1554            std::cmp::Ordering::Equal => {
1555                merged.push(left[left_index]);
1556                left_index += 1;
1557                right_index += 1;
1558            }
1559        }
1560    }
1561
1562    merged
1563}
1564
1565fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1566    let mut merged = Vec::with_capacity(left.len() + right.len());
1567    let mut left_index = 0;
1568    let mut right_index = 0;
1569
1570    while left_index < left.len() && right_index < right.len() {
1571        match left[left_index].cmp(&right[right_index]) {
1572            std::cmp::Ordering::Less => {
1573                merged.push(left[left_index]);
1574                left_index += 1;
1575            }
1576            std::cmp::Ordering::Greater => {
1577                merged.push(right[right_index]);
1578                right_index += 1;
1579            }
1580            std::cmp::Ordering::Equal => {
1581                merged.push(left[left_index]);
1582                left_index += 1;
1583                right_index += 1;
1584            }
1585        }
1586    }
1587
1588    merged.extend_from_slice(&left[left_index..]);
1589    merged.extend_from_slice(&right[right_index..]);
1590    merged
1591}
1592
1593pub fn decompose_regex(pattern: &str) -> RegexQuery {
1594    let hir = match regex_syntax::parse(pattern) {
1595        Ok(hir) => hir,
1596        Err(_) => return RegexQuery::default(),
1597    };
1598
1599    let build = build_query(&hir);
1600    build.into_query()
1601}
1602
1603pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1604    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1605}
1606
1607pub fn normalize_char(c: u8) -> u8 {
1608    c.to_ascii_lowercase()
1609}
1610
1611fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
1612    if content.len() < 3 {
1613        return;
1614    }
1615
1616    for start in 0..=content.len() - 3 {
1617        let trigram = pack_trigram(
1618            normalize_char(content[start]),
1619            normalize_char(content[start + 1]),
1620            normalize_char(content[start + 2]),
1621        );
1622        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1623        visit(trigram, next_char, start);
1624    }
1625}
1626
1627pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1628    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1629    scan_trigrams(content, |trigram, next_char, position| {
1630        trigrams.push((trigram, next_char, position));
1631    });
1632    trigrams
1633}
1634
1635fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
1636    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1637    scan_trigrams(content, |trigram, next_char, position| {
1638        let entry = filters.entry(trigram).or_default();
1639        if include_eof_next_char || next_char != EOF_SENTINEL {
1640            entry.next_mask |= mask_for_next_char(next_char);
1641        }
1642        entry.loc_mask |= mask_for_position(position);
1643    });
1644    filters
1645}
1646
1647pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1648    let mut seen = HashSet::new();
1649    let mut out = Vec::new();
1650    for token in tokens {
1651        scan_trigrams(token.as_bytes(), |trigram, _, _| {
1652            if seen.insert(trigram) {
1653                out.push(trigram);
1654            }
1655        });
1656    }
1657    out
1658}
1659
1660pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1661    if query_trigrams.is_empty() {
1662        return 0.0;
1663    }
1664
1665    let mut hits = 0u32;
1666    for &trigram in query_trigrams {
1667        if let Some(postings) = index.postings.get(&trigram) {
1668            if postings
1669                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1670                .is_ok()
1671            {
1672                hits += 1;
1673            }
1674        }
1675    }
1676
1677    if hits == 0 {
1678        return 0.0;
1679    }
1680
1681    let file_trigram_count = index
1682        .file_trigrams
1683        .get(&file_id)
1684        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1685    (hits as f32) / (1.0 + file_trigram_count.ln())
1686}
1687
1688pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1689    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1690    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1691        return PathBuf::from(override_dir)
1692            .join("index")
1693            .join(artifact_cache_key(project_root));
1694    }
1695    // Use configured storage dir (from plugin, XDG-compliant)
1696    if let Some(dir) = storage_dir {
1697        return dir.join("index").join(artifact_cache_key(project_root));
1698    }
1699    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1700    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1701    // If neither is set, fall back to a temp directory rather than `"."`
1702    // because the search-index code reads/writes absolute paths.
1703    let home = std::env::var_os("HOME")
1704        .or_else(|| std::env::var_os("USERPROFILE"))
1705        .map(PathBuf::from)
1706        .unwrap_or_else(std::env::temp_dir);
1707    home.join(".cache")
1708        .join("aft")
1709        .join("index")
1710        .join(artifact_cache_key(project_root))
1711}
1712
1713pub(crate) fn build_path_filters(
1714    include: &[String],
1715    exclude: &[String],
1716) -> Result<PathFilters, String> {
1717    Ok(PathFilters {
1718        includes: build_globset(include)?,
1719        excludes: build_globset(exclude)?,
1720    })
1721}
1722
1723pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1724    walk_project_files_from(root, root, filters)
1725}
1726
1727pub fn walk_project_files_bounded_default(
1728    root: &Path,
1729    max_files: usize,
1730) -> Result<Vec<PathBuf>, usize> {
1731    walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
1732}
1733
1734pub(crate) fn walk_project_files_bounded_matching<F>(
1735    root: &Path,
1736    filters: &PathFilters,
1737    max_files: usize,
1738    matches_file: F,
1739) -> Result<Vec<PathBuf>, usize>
1740where
1741    F: Fn(&Path) -> bool,
1742{
1743    walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
1744}
1745
1746pub fn walk_project_files_bounded_default_matching<F>(
1747    root: &Path,
1748    max_files: usize,
1749    matches_file: F,
1750) -> Result<Vec<PathBuf>, usize>
1751where
1752    F: Fn(&Path) -> bool,
1753{
1754    walk_project_files_from_inner_matching(
1755        root,
1756        root,
1757        &PathFilters::default(),
1758        Some(max_files),
1759        matches_file,
1760        true,
1761    )
1762}
1763
1764pub(crate) fn walk_project_files_from(
1765    filter_root: &Path,
1766    search_root: &Path,
1767    filters: &PathFilters,
1768) -> Vec<PathBuf> {
1769    walk_project_files_from_inner(filter_root, search_root, filters, None, true)
1770        .expect("unbounded project walk cannot exceed a file limit")
1771}
1772
1773pub(crate) fn has_any_project_file_from(
1774    filter_root: &Path,
1775    search_root: &Path,
1776    filters: &PathFilters,
1777) -> bool {
1778    walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
1779}
1780
1781fn walk_project_files_from_inner(
1782    filter_root: &Path,
1783    search_root: &Path,
1784    filters: &PathFilters,
1785    max_files: Option<usize>,
1786    sort_by_mtime: bool,
1787) -> Result<Vec<PathBuf>, usize> {
1788    walk_project_files_from_inner_matching(
1789        filter_root,
1790        search_root,
1791        filters,
1792        max_files,
1793        |_| true,
1794        sort_by_mtime,
1795    )
1796}
1797
1798fn project_walk_builder(search_root: &Path) -> WalkBuilder {
1799    let mut builder = WalkBuilder::new(search_root);
1800    builder
1801        .hidden(false)
1802        .git_ignore(true)
1803        .git_global(true)
1804        .git_exclude(true)
1805        .add_custom_ignore_filename(".aftignore")
1806        .filter_entry(|entry| {
1807            let name = entry.file_name().to_string_lossy();
1808            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1809                return !matches!(
1810                    name.as_ref(),
1811                    "node_modules"
1812                        | "target"
1813                        | "venv"
1814                        | ".venv"
1815                        | ".git"
1816                        | "__pycache__"
1817                        | ".tox"
1818                        | "dist"
1819                        | "build"
1820                );
1821            }
1822            true
1823        });
1824    builder
1825}
1826
1827fn walk_project_files_from_inner_matching<F>(
1828    filter_root: &Path,
1829    search_root: &Path,
1830    filters: &PathFilters,
1831    max_files: Option<usize>,
1832    matches_file: F,
1833    sort_by_mtime: bool,
1834) -> Result<Vec<PathBuf>, usize>
1835where
1836    F: Fn(&Path) -> bool,
1837{
1838    let builder = project_walk_builder(search_root);
1839
1840    let mut files = Vec::new();
1841    for entry in builder.build().filter_map(|entry| entry.ok()) {
1842        if !entry
1843            .file_type()
1844            .map_or(false, |file_type| file_type.is_file())
1845        {
1846            continue;
1847        }
1848        let path = entry.into_path();
1849        if filters.matches(filter_root, &path) && matches_file(&path) {
1850            files.push(path);
1851            if max_files.is_some_and(|limit| files.len() > limit) {
1852                return Err(files.len());
1853            }
1854        }
1855    }
1856
1857    if sort_by_mtime {
1858        sort_paths_by_mtime_desc(&mut files);
1859    }
1860    Ok(files)
1861}
1862
1863pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1864    let bytes = fs::read(path).ok()?;
1865    if is_binary_bytes(&bytes) {
1866        return None;
1867    }
1868    String::from_utf8(bytes).ok()
1869}
1870
1871fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1872    fs::read(path).ok()
1873}
1874
1875pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1876    path.strip_prefix(root)
1877        .map(PathBuf::from)
1878        .unwrap_or_else(|_| path.to_path_buf())
1879}
1880
1881pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1882    let normalized_root = normalize_path(root);
1883    let normalized_path = normalize_path(path);
1884    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1885    validate_cached_relative_path(relative)
1886}
1887
1888pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1889    let relative = validate_cached_relative_path(relative_path)?;
1890    let normalized_root = normalize_path(root);
1891    let full_path = normalize_path(&normalized_root.join(relative));
1892
1893    match fs::canonicalize(&full_path) {
1894        Ok(canonical_path) => {
1895            if canonical_path.starts_with(&normalized_root) {
1896                return Some(full_path);
1897            }
1898
1899            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1900            canonical_path
1901                .starts_with(&canonical_root)
1902                .then_some(full_path)
1903        }
1904        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1905    }
1906}
1907
1908pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1909    if path.is_absolute() {
1910        return None;
1911    }
1912
1913    let mut normalized = PathBuf::new();
1914    for component in path.components() {
1915        match component {
1916            Component::Normal(part) => normalized.push(part),
1917            Component::CurDir => {}
1918            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1919        }
1920    }
1921    (!normalized.as_os_str().is_empty()).then_some(normalized)
1922}
1923
1924/// Sort paths newest-first by mtime, falling back to lexicographic order.
1925///
1926/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1927/// `sort_by()` closure. That made the comparator non-deterministic — a
1928/// `stat()` syscall for the same path can return different values across
1929/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1930/// concurrent file-watcher activity), and Rust's slice::sort panics at
1931/// runtime when it detects a non-total-order comparator. CI hit this on
1932/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1933///
1934/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1935/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1936pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1937    use std::collections::HashMap;
1938    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1939    for path in paths.iter() {
1940        mtimes
1941            .entry(path.clone())
1942            .or_insert_with(|| path_modified_time(path));
1943    }
1944    paths.sort_by(|left, right| {
1945        let left_mtime = mtimes.get(left).and_then(|v| *v);
1946        let right_mtime = mtimes.get(right).and_then(|v| *v);
1947        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1948    });
1949}
1950
1951/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1952/// the sort. Same fix, applied to grep matches that share files.
1953pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1954    use std::collections::HashMap;
1955    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1956    for m in matches.iter() {
1957        mtimes.entry(m.file.clone()).or_insert_with(|| {
1958            let resolved = resolve_match_path(project_root, &m.file);
1959            path_modified_time(&resolved)
1960        });
1961    }
1962    matches.sort_by(|left, right| {
1963        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1964        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1965        right_mtime
1966            .cmp(&left_mtime)
1967            .then_with(|| left.file.cmp(&right.file))
1968            .then_with(|| left.line.cmp(&right.line))
1969            .then_with(|| left.column.cmp(&right.column))
1970    });
1971}
1972
1973/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1974/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1975/// table from the search index), but it can still return different values if
1976/// the file is modified mid-sort. Snapshot once.
1977fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1978    matches: &mut [SharedGrepMatch],
1979    modified_for_path: F,
1980) where
1981    F: Fn(&Path) -> Option<SystemTime>,
1982{
1983    use std::collections::HashMap;
1984    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1985    for m in matches.iter() {
1986        let path = m.file.as_path().to_path_buf();
1987        mtimes
1988            .entry(path.clone())
1989            .or_insert_with(|| modified_for_path(&path));
1990    }
1991    matches.sort_by(|left, right| {
1992        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1993        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1994        right_mtime
1995            .cmp(&left_mtime)
1996            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1997            .then_with(|| left.line.cmp(&right.line))
1998            .then_with(|| left.column.cmp(&right.column))
1999    });
2000}
2001
2002pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
2003    let resolved_project_root = canonicalize_or_normalize(project_root);
2004    let root = match path {
2005        Some(path) => {
2006            let path = PathBuf::from(path);
2007            if path.is_absolute() {
2008                canonicalize_or_normalize(&path)
2009            } else {
2010                normalize_path(&resolved_project_root.join(path))
2011            }
2012        }
2013        None => resolved_project_root.clone(),
2014    };
2015
2016    let use_index = is_within_search_root(&resolved_project_root, &root);
2017    SearchScope { root, use_index }
2018}
2019
2020pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
2021    content_inspector::inspect(content).is_binary()
2022}
2023
2024pub(crate) fn current_git_head(root: &Path) -> Option<String> {
2025    run_git(root, &["rev-parse", "HEAD"])
2026}
2027
2028/// On-disk ARTIFACT cache key (search, semantic, symbol, callgraph, inspect).
2029///
2030/// For git repos this is the repository ROOT COMMIT — so a linked worktree
2031/// shares the main checkout's index (opened read-only), the deliberate
2032/// worktree-sharing mechanism. For non-git it is the canonical filesystem path.
2033///
2034/// This is the per-REPOSITORY identity. It is intentionally DISTINCT from
2035/// [`crate::path_identity::project_scope_key`] (the per-CHECKOUT identity used
2036/// for bash/compression/backup/checkpoint scoping). Its value is unchanged from
2037/// the historical `project_cache_key`, so existing on-disk caches are NOT
2038/// invalidated by the P0 identity split.
2039pub fn artifact_cache_key(project_root: &Path) -> String {
2040    use sha2::{Digest, Sha256};
2041
2042    let mut hasher = Sha256::new();
2043
2044    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
2045        // Git repo: root commit is the unique identity.
2046        // Same repo cloned anywhere produces the same key.
2047        hasher.update(root_commit.as_bytes());
2048    } else {
2049        // Non-git project: use the canonical filesystem path as identity.
2050        let canonical_root = canonicalize_or_normalize(project_root);
2051        hasher.update(canonical_root.to_string_lossy().as_bytes());
2052    }
2053
2054    let digest = format!("{:x}", hasher.finalize());
2055    digest[..16].to_string()
2056}
2057
2058/// Fingerprint corpus-shaping ignore rules that are not represented by git HEAD.
2059///
2060/// The search cache stores this value next to the file mtimes. If `.gitignore`,
2061/// `.aftignore`, or `.git/info/exclude` changes while AFT is not running, a
2062/// matching HEAD + matching file mtimes is not enough to safely reuse the old
2063/// cache: files that are now ignored may still be indexed. Hashing the ignore
2064/// files themselves makes cold-start cache reuse agree with the current walker.
2065pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
2066    use sha2::{Digest, Sha256};
2067
2068    let root = canonicalize_or_normalize(project_root);
2069    let mut files = Vec::new();
2070    collect_ignore_rule_files(&root, &mut files);
2071    if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
2072        if global_ignore.is_file() {
2073            files.push(global_ignore);
2074        }
2075    }
2076    let info_exclude = git_info_exclude_path(&root);
2077    if info_exclude.is_file() {
2078        files.push(info_exclude);
2079    }
2080    files.sort();
2081    files.dedup();
2082
2083    let mut hasher = Sha256::new();
2084    hasher.update(b"aft-ignore-rules-v1\0");
2085    for path in files {
2086        if let Some(relative) = cache_relative_path(&root, &path) {
2087            hasher.update(relative.to_string_lossy().as_bytes());
2088        } else {
2089            hasher.update(path.to_string_lossy().as_bytes());
2090        }
2091        hasher.update(b"\0");
2092        match fs::read(&path) {
2093            Ok(bytes) => hasher.update(&bytes),
2094            Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
2095        }
2096        hasher.update(b"\0");
2097    }
2098
2099    format!("{:x}", hasher.finalize())
2100}
2101
2102fn git_info_exclude_path(root: &Path) -> PathBuf {
2103    run_git(
2104        root,
2105        &["rev-parse", "--path-format=absolute", "--git-common-dir"],
2106    )
2107    .map(PathBuf::from)
2108    .unwrap_or_else(|| root.join(".git"))
2109    .join("info")
2110    .join("exclude")
2111}
2112
2113fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
2114    let mut builder = WalkBuilder::new(root);
2115    builder
2116        .hidden(false)
2117        .git_ignore(true)
2118        .git_global(true)
2119        .git_exclude(true)
2120        .add_custom_ignore_filename(".aftignore")
2121        .filter_entry(|entry| {
2122            let name = entry.file_name().to_string_lossy();
2123            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2124                return !matches!(
2125                    name.as_ref(),
2126                    ".git"
2127                        | "node_modules"
2128                        | "target"
2129                        | "venv"
2130                        | ".venv"
2131                        | "__pycache__"
2132                        | ".tox"
2133                        | "dist"
2134                        | "build"
2135                );
2136            }
2137            true
2138        });
2139
2140    for entry in builder.build().filter_map(|entry| entry.ok()) {
2141        if !entry
2142            .file_type()
2143            .map_or(false, |file_type| file_type.is_file())
2144        {
2145            continue;
2146        }
2147        let file_name = entry.file_name();
2148        if file_name == ".gitignore" || file_name == ".aftignore" {
2149            files.push(entry.into_path());
2150        }
2151    }
2152}
2153
2154/// Count directories visited when discovering ignore rule files (for perf regression tests).
2155#[cfg(test)]
2156pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
2157    let mut dirs = 0usize;
2158    let mut builder = WalkBuilder::new(root);
2159    builder
2160        .hidden(false)
2161        .git_ignore(true)
2162        .git_global(true)
2163        .git_exclude(true)
2164        .add_custom_ignore_filename(".aftignore");
2165    for entry in builder.build().filter_map(|entry| entry.ok()) {
2166        if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2167            dirs += 1;
2168        }
2169    }
2170    dirs
2171}
2172
2173/// Legacy stack-based discovery (pre ignore-walker fix); used only in perf tests.
2174#[cfg(test)]
2175pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
2176    let mut stack = vec![root.to_path_buf()];
2177    let mut dirs = 0usize;
2178    while let Some(dir) = stack.pop() {
2179        dirs += 1;
2180        let Ok(entries) = fs::read_dir(&dir) else {
2181            continue;
2182        };
2183        for entry in entries.flatten() {
2184            let path = entry.path();
2185            let file_name = entry.file_name();
2186            if file_name == ".gitignore" || file_name == ".aftignore" {
2187                continue;
2188            }
2189            let Ok(file_type) = entry.file_type() else {
2190                continue;
2191            };
2192            if !file_type.is_dir() || file_type.is_symlink() {
2193                continue;
2194            }
2195            if matches!(
2196                file_name.to_str().unwrap_or(""),
2197                ".git"
2198                    | "node_modules"
2199                    | "target"
2200                    | "venv"
2201                    | ".venv"
2202                    | "__pycache__"
2203                    | ".tox"
2204                    | "dist"
2205                    | "build"
2206            ) {
2207                continue;
2208            }
2209            stack.push(path);
2210        }
2211    }
2212    dirs
2213}
2214
2215impl PathFilters {
2216    pub(crate) fn matches(&self, root: &Path, path: &Path) -> bool {
2217        let relative = to_glob_path(&relative_to_root(root, path));
2218        if self
2219            .includes
2220            .as_ref()
2221            .is_some_and(|includes| !includes.is_match(&relative))
2222        {
2223            return false;
2224        }
2225        if self
2226            .excludes
2227            .as_ref()
2228            .is_some_and(|excludes| excludes.is_match(&relative))
2229        {
2230            return false;
2231        }
2232        true
2233    }
2234}
2235
2236fn canonicalize_or_normalize(path: &Path) -> PathBuf {
2237    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
2238}
2239
2240fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
2241    if path.is_absolute() {
2242        path.to_path_buf()
2243    } else {
2244        project_root.join(path)
2245    }
2246}
2247
2248fn path_modified_time(path: &Path) -> Option<SystemTime> {
2249    fs::metadata(path)
2250        .and_then(|metadata| metadata.modified())
2251        .ok()
2252}
2253
2254fn normalize_path(path: &Path) -> PathBuf {
2255    let mut result = PathBuf::new();
2256    for component in path.components() {
2257        match component {
2258            Component::ParentDir => {
2259                if !result.pop() {
2260                    result.push(component);
2261                }
2262            }
2263            Component::CurDir => {}
2264            _ => result.push(component),
2265        }
2266    }
2267    result
2268}
2269
2270fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
2271    if let Ok(canonical) = fs::canonicalize(path) {
2272        return canonical;
2273    }
2274
2275    let Some(parent) = path.parent() else {
2276        return path.to_path_buf();
2277    };
2278    let Some(file_name) = path.file_name() else {
2279        return path.to_path_buf();
2280    };
2281
2282    fs::canonicalize(parent)
2283        .map(|canonical_parent| canonical_parent.join(file_name))
2284        .unwrap_or_else(|_| path.to_path_buf())
2285}
2286
2287/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
2288/// since the index was last written. Also detect new files and deleted files.
2289fn verify_file_mtimes(index: &mut SearchIndex) {
2290    let filters = PathFilters::default();
2291    let current_files = walk_project_files(&index.project_root, &filters);
2292    let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
2293    let mut stale_paths = Vec::new();
2294    let mut removed_paths = Vec::new();
2295
2296    for entry in &mut index.files {
2297        if entry.path.as_os_str().is_empty() {
2298            continue; // tombstoned entry
2299        }
2300        if !current_file_set.contains(&entry.path) {
2301            removed_paths.push(entry.path.clone());
2302            continue;
2303        }
2304        let cached = FileFreshness {
2305            mtime: entry.modified,
2306            size: entry.size,
2307            content_hash: entry.content_hash,
2308        };
2309        match cache_freshness::verify_file_strict(&entry.path, &cached) {
2310            FreshnessVerdict::HotFresh => {}
2311            FreshnessVerdict::ContentFresh {
2312                new_mtime,
2313                new_size,
2314            } => {
2315                entry.modified = new_mtime;
2316                entry.size = new_size;
2317            }
2318            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
2319                stale_paths.push(entry.path.clone())
2320            }
2321        }
2322    }
2323
2324    for path in &removed_paths {
2325        index.remove_file(path);
2326    }
2327
2328    // Re-index stale files that are still in the current walk set. If an ignore
2329    // rule changed while AFT was down but the fingerprint missed it, this keeps
2330    // warm-cache verification from resurrecting now-ignored cached entries.
2331    for path in &stale_paths {
2332        if current_file_set.contains(path) {
2333            index.update_file(path);
2334        } else {
2335            index.remove_file(path);
2336        }
2337    }
2338
2339    // Detect new files not in the index
2340    for path in current_files {
2341        if !index.path_to_id.contains_key(&path) {
2342            index.update_file(&path);
2343        }
2344    }
2345
2346    if !stale_paths.is_empty() {
2347        crate::slog_info!(
2348            "search index: refreshed {} stale file(s) from disk cache",
2349            stale_paths.len()
2350        );
2351    }
2352}
2353
2354fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2355    normalize_path(path).starts_with(normalize_path(search_root))
2356}
2357
2358impl QueryBuild {
2359    fn into_query(self) -> RegexQuery {
2360        let mut query = RegexQuery::default();
2361
2362        for run in self.and_runs {
2363            add_run_to_and_query(&mut query, &run);
2364        }
2365
2366        for group in self.or_groups {
2367            let mut trigrams = BTreeSet::new();
2368            let mut filters = HashMap::new();
2369            for run in group {
2370                for (trigram, filter) in trigram_filters(&run) {
2371                    trigrams.insert(trigram);
2372                    merge_filter(filters.entry(trigram).or_default(), filter);
2373                }
2374            }
2375            if !trigrams.is_empty() {
2376                query.or_groups.push(trigrams.into_iter().collect());
2377                query.or_filters.push(filters);
2378            }
2379        }
2380
2381        query
2382    }
2383}
2384
2385fn build_query(hir: &Hir) -> QueryBuild {
2386    match hir.kind() {
2387        HirKind::Literal(literal) => {
2388            if literal.0.len() >= 3 {
2389                QueryBuild {
2390                    and_runs: vec![literal.0.to_vec()],
2391                    or_groups: Vec::new(),
2392                }
2393            } else {
2394                QueryBuild::default()
2395            }
2396        }
2397        HirKind::Capture(capture) => build_query(&capture.sub),
2398        HirKind::Concat(parts) => {
2399            let mut build = QueryBuild::default();
2400            for part in parts {
2401                let part_build = build_query(part);
2402                build.and_runs.extend(part_build.and_runs);
2403                build.or_groups.extend(part_build.or_groups);
2404            }
2405            build
2406        }
2407        HirKind::Alternation(parts) => {
2408            let mut group = Vec::new();
2409            for part in parts {
2410                let Some(mut choices) = guaranteed_run_choices(part) else {
2411                    return QueryBuild::default();
2412                };
2413                group.append(&mut choices);
2414            }
2415            if group.is_empty() {
2416                QueryBuild::default()
2417            } else {
2418                QueryBuild {
2419                    and_runs: Vec::new(),
2420                    or_groups: vec![group],
2421                }
2422            }
2423        }
2424        HirKind::Repetition(repetition) => {
2425            if repetition.min == 0 {
2426                QueryBuild::default()
2427            } else {
2428                build_query(&repetition.sub)
2429            }
2430        }
2431        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2432    }
2433}
2434
2435fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2436    match hir.kind() {
2437        HirKind::Literal(literal) => {
2438            if literal.0.len() >= 3 {
2439                Some(vec![literal.0.to_vec()])
2440            } else {
2441                None
2442            }
2443        }
2444        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2445        HirKind::Concat(parts) => {
2446            let mut runs = Vec::new();
2447            for part in parts {
2448                if let Some(mut part_runs) = guaranteed_run_choices(part) {
2449                    runs.append(&mut part_runs);
2450                }
2451            }
2452            if runs.is_empty() {
2453                None
2454            } else {
2455                Some(runs)
2456            }
2457        }
2458        HirKind::Alternation(parts) => {
2459            let mut runs = Vec::new();
2460            for part in parts {
2461                let Some(mut part_runs) = guaranteed_run_choices(part) else {
2462                    return None;
2463                };
2464                runs.append(&mut part_runs);
2465            }
2466            if runs.is_empty() {
2467                None
2468            } else {
2469                Some(runs)
2470            }
2471        }
2472        HirKind::Repetition(repetition) => {
2473            if repetition.min == 0 {
2474                None
2475            } else {
2476                guaranteed_run_choices(&repetition.sub)
2477            }
2478        }
2479        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2480    }
2481}
2482
2483fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2484    for (trigram, filter) in trigram_filters(run) {
2485        if !query.and_trigrams.contains(&trigram) {
2486            query.and_trigrams.push(trigram);
2487        }
2488        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2489    }
2490}
2491
2492fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2493    trigram_filter_map(run, false).into_iter().collect()
2494}
2495
2496fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2497    target.next_mask |= filter.next_mask;
2498    target.loc_mask |= filter.loc_mask;
2499}
2500
2501fn mask_for_next_char(next_char: u8) -> u8 {
2502    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2503    1u8 << bit
2504}
2505
2506fn mask_for_position(position: usize) -> u8 {
2507    1u8 << (position % 8)
2508}
2509
2510fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2511    if patterns.is_empty() {
2512        return Ok(None);
2513    }
2514
2515    let mut builder = GlobSetBuilder::new();
2516    for pattern in patterns {
2517        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2518        builder.add(glob);
2519    }
2520    builder.build().map(Some).map_err(|error| error.to_string())
2521}
2522
2523fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
2524    let mut buffer = [0u8; 4];
2525    reader.read_exact(&mut buffer)?;
2526    Ok(u32::from_le_bytes(buffer))
2527}
2528
2529fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
2530    let mut buffer = [0u8; 8];
2531    reader.read_exact(&mut buffer)?;
2532    Ok(u64::from_le_bytes(buffer))
2533}
2534
2535fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
2536    writer.write_all(&value.to_le_bytes())
2537}
2538
2539fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
2540    writer.write_all(&value.to_le_bytes())
2541}
2542
2543fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2544    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2545        return Err(std::io::Error::other("search index checksum missing"));
2546    };
2547    let expected = u32::from_le_bytes(*stored);
2548    let actual = crc32fast::hash(body);
2549    if actual != expected {
2550        return Err(std::io::Error::other("search index checksum mismatch"));
2551    }
2552    Ok(())
2553}
2554
2555fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2556    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2557    total_len.checked_sub(pos)
2558}
2559
2560fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2561    let output = Command::new("git")
2562        .arg("-C")
2563        .arg(root)
2564        .args(args)
2565        .output()
2566        .ok()?;
2567    if !output.status.success() {
2568        return None;
2569    }
2570    let value = String::from_utf8(output.stdout).ok()?;
2571    let value = value.trim().to_string();
2572    if value.is_empty() {
2573        None
2574    } else {
2575        Some(value)
2576    }
2577}
2578
2579fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2580    let diff_range = format!("{}..{}", from, to);
2581    let output = match Command::new("git")
2582        .arg("-C")
2583        .arg(root)
2584        .args(["diff", "--name-status", "-M", &diff_range])
2585        .output()
2586    {
2587        Ok(output) => output,
2588        Err(_) => return false,
2589    };
2590
2591    if !output.status.success() {
2592        return false;
2593    }
2594
2595    let Ok(diff) = String::from_utf8(output.stdout) else {
2596        return false;
2597    };
2598
2599    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2600        let mut fields = line.split('\t');
2601        let Some(status) = fields.next() else {
2602            continue;
2603        };
2604
2605        if status.starts_with('R') {
2606            let Some(old_path) = fields
2607                .next()
2608                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2609            else {
2610                continue;
2611            };
2612            let Some(new_path) = fields
2613                .next()
2614                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2615            else {
2616                continue;
2617            };
2618            index.remove_file(&old_path);
2619            index.update_file(&new_path);
2620            continue;
2621        }
2622
2623        let Some(path) = fields
2624            .next()
2625            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2626        else {
2627            continue;
2628        };
2629        if status.starts_with('D') || !path.exists() {
2630            index.remove_file(&path);
2631        } else {
2632            index.update_file(&path);
2633        }
2634    }
2635
2636    true
2637}
2638
2639fn is_binary_path(path: &Path, size: u64) -> bool {
2640    if size == 0 {
2641        return false;
2642    }
2643
2644    let mut file = match File::open(path) {
2645        Ok(file) => file,
2646        Err(_) => return true,
2647    };
2648
2649    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2650    match file.read(&mut preview) {
2651        Ok(read) => is_binary_bytes(&preview[..read]),
2652        Err(_) => true,
2653    }
2654}
2655
2656fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2657    let mut starts = vec![0usize];
2658    for (index, byte) in content.iter().copied().enumerate() {
2659        if byte == b'\n' {
2660            starts.push(index + 1);
2661        }
2662    }
2663    starts
2664}
2665
2666fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2667    let line_index = match line_starts.binary_search(&offset) {
2668        Ok(index) => index,
2669        Err(index) => index.saturating_sub(1),
2670    };
2671    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2672    let line_end = content[line_start..]
2673        .iter()
2674        .position(|byte| *byte == b'\n')
2675        .map(|length| line_start + length)
2676        .unwrap_or(content.len());
2677    let mut line_slice = &content[line_start..line_end];
2678    if line_slice.ends_with(b"\r") {
2679        line_slice = &line_slice[..line_slice.len() - 1];
2680    }
2681    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2682    let column = String::from_utf8_lossy(&content[line_start..offset])
2683        .chars()
2684        .count() as u32
2685        + 1;
2686    (line_index as u32 + 1, column, line_text)
2687}
2688
2689fn to_glob_path(path: &Path) -> String {
2690    path.to_string_lossy().replace('\\', "/")
2691}
2692
2693#[cfg(test)]
2694mod tests {
2695    use std::process::Command;
2696
2697    use super::*;
2698
2699    #[test]
2700    fn cached_path_under_root_allows_missing_lexical_child() {
2701        let dir = tempfile::tempdir().expect("create temp dir");
2702        let project = dir.path().join("project");
2703        fs::create_dir_all(&project).expect("create project dir");
2704        let root = fs::canonicalize(&project).expect("canonicalize project");
2705
2706        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2707            .expect("missing child should fall back to lexical validation");
2708
2709        assert_eq!(path, root.join("future/file.rs"));
2710    }
2711
2712    #[cfg(unix)]
2713    #[test]
2714    fn cached_path_under_root_rejects_symlink_escape() {
2715        let dir = tempfile::tempdir().expect("create temp dir");
2716        let project = dir.path().join("project");
2717        let outside = dir.path().join("outside");
2718        fs::create_dir_all(&project).expect("create project dir");
2719        fs::create_dir_all(&outside).expect("create outside dir");
2720        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2721        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2722        let root = fs::canonicalize(&project).expect("canonicalize project");
2723
2724        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2725    }
2726
2727    #[test]
2728    fn extract_trigrams_tracks_next_char_and_position() {
2729        let trigrams = extract_trigrams(b"Rust");
2730        assert_eq!(trigrams.len(), 2);
2731        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2732        assert_eq!(
2733            trigrams[1],
2734            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2735        );
2736    }
2737
2738    #[test]
2739    fn index_file_trigram_filters_match_legacy_extraction() {
2740        let dir = tempfile::tempdir().expect("create temp dir");
2741        let path = dir.path().join("sample.txt");
2742        let content = b"Rust rust RUST\nxy";
2743        fs::write(&path, content).expect("write sample");
2744
2745        let mut expected = BTreeMap::new();
2746        for (trigram, next_char, position) in extract_trigrams(content) {
2747            let entry: &mut PostingFilter = expected.entry(trigram).or_default();
2748            entry.next_mask |= mask_for_next_char(next_char);
2749            entry.loc_mask |= mask_for_position(position);
2750        }
2751
2752        let mut index = SearchIndex::new();
2753        index.project_root = dir.path().to_path_buf();
2754        index.index_file(&path, content);
2755
2756        let file_id = *index.path_to_id.get(&path).expect("file indexed");
2757        let file_trigrams = index.file_trigrams.get(&file_id).expect("file trigrams");
2758        assert_eq!(file_trigrams, &expected.keys().copied().collect::<Vec<_>>());
2759        for (trigram, filter) in expected {
2760            let postings = index.postings.get(&trigram).expect("posting list");
2761            assert_eq!(postings.len(), 1);
2762            assert_eq!(postings[0].file_id, file_id);
2763            assert_eq!(postings[0].next_mask, filter.next_mask);
2764            assert_eq!(postings[0].loc_mask, filter.loc_mask);
2765        }
2766    }
2767
2768    #[test]
2769    fn decompose_regex_extracts_literals_and_alternations() {
2770        let query = decompose_regex("abc(def|ghi)xyz");
2771        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2772        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2773        assert_eq!(query.or_groups.len(), 1);
2774        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2775        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2776    }
2777
2778    #[test]
2779    fn candidates_intersect_posting_lists() {
2780        let mut index = SearchIndex::new();
2781        let dir = tempfile::tempdir().expect("create temp dir");
2782        let alpha = dir.path().join("alpha.txt");
2783        let beta = dir.path().join("beta.txt");
2784        fs::write(&alpha, "abcdef").expect("write alpha");
2785        fs::write(&beta, "abcxyz").expect("write beta");
2786        index.project_root = dir.path().to_path_buf();
2787        index.index_file(&alpha, b"abcdef");
2788        index.index_file(&beta, b"abcxyz");
2789
2790        let query = RegexQuery {
2791            and_trigrams: vec![
2792                pack_trigram(b'a', b'b', b'c'),
2793                pack_trigram(b'd', b'e', b'f'),
2794            ],
2795            ..RegexQuery::default()
2796        };
2797
2798        let candidates = index.candidates(&query);
2799        assert_eq!(candidates.len(), 1);
2800        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2801    }
2802
2803    #[test]
2804    fn candidates_apply_bloom_filters() {
2805        let mut index = SearchIndex::new();
2806        let dir = tempfile::tempdir().expect("create temp dir");
2807        let file = dir.path().join("sample.txt");
2808        fs::write(&file, "abcd efgh").expect("write sample");
2809        index.project_root = dir.path().to_path_buf();
2810        index.index_file(&file, b"abcd efgh");
2811
2812        let trigram = pack_trigram(b'a', b'b', b'c');
2813        let matching_filter = PostingFilter {
2814            next_mask: mask_for_next_char(b'd'),
2815            loc_mask: mask_for_position(0),
2816        };
2817        let non_matching_filter = PostingFilter {
2818            next_mask: mask_for_next_char(b'z'),
2819            loc_mask: mask_for_position(0),
2820        };
2821
2822        assert_eq!(
2823            index
2824                .postings_for_trigram(trigram, Some(matching_filter))
2825                .len(),
2826            1
2827        );
2828        assert!(index
2829            .postings_for_trigram(trigram, Some(non_matching_filter))
2830            .is_empty());
2831    }
2832
2833    #[test]
2834    fn disk_round_trip_preserves_postings_and_files() {
2835        let dir = tempfile::tempdir().expect("create temp dir");
2836        let project = dir.path().join("project");
2837        fs::create_dir_all(&project).expect("create project dir");
2838        let file = project.join("src.txt");
2839        fs::write(&file, "abcdef").expect("write source");
2840
2841        let mut index = SearchIndex::build(&project);
2842        index.git_head = Some("deadbeef".to_string());
2843        let cache_dir = dir.path().join("cache");
2844        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2845
2846        let loaded =
2847            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2848        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2849        assert_eq!(loaded.files.len(), 1);
2850        assert_eq!(
2851            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2852            PathBuf::from("src.txt")
2853        );
2854        assert_eq!(loaded.postings.len(), index.postings.len());
2855        assert!(loaded
2856            .postings
2857            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2858    }
2859
2860    #[test]
2861    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2862        let root = PathBuf::from("/tmp/aft-project");
2863
2864        assert_eq!(
2865            cache_relative_path(&root, &root.join("src/lib.rs")),
2866            Some(PathBuf::from("src/lib.rs"))
2867        );
2868        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2869        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2870        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2871        assert_eq!(
2872            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2873            Some(root.join("src/lib.rs"))
2874        );
2875    }
2876
2877    #[test]
2878    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2879        let dir = tempfile::tempdir().expect("create temp dir");
2880        let project = dir.path().join("project");
2881        fs::create_dir_all(&project).expect("create project dir");
2882        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2883        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2884        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2885
2886        Command::new("git")
2887            .arg("init")
2888            .arg(&project)
2889            .status()
2890            .expect("git init");
2891        for args in [
2892            ["config", "user.email", "aft@example.invalid"],
2893            ["config", "user.name", "AFT Test"],
2894        ] {
2895            Command::new("git")
2896                .arg("-C")
2897                .arg(&project)
2898                .args(args)
2899                .status()
2900                .expect("git config");
2901        }
2902        Command::new("git")
2903            .arg("-C")
2904            .arg(&project)
2905            .args(["add", "."])
2906            .status()
2907            .expect("git add initial");
2908        Command::new("git")
2909            .arg("-C")
2910            .arg(&project)
2911            .args(["commit", "-m", "initial"])
2912            .status()
2913            .expect("git commit initial");
2914        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2915        let mut baseline = SearchIndex::build(&project);
2916        baseline.git_head = Some(previous.clone());
2917
2918        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2919        Command::new("git")
2920            .arg("-C")
2921            .arg(&project)
2922            .args(["add", "-A"])
2923            .status()
2924            .expect("git add rename");
2925        Command::new("git")
2926            .arg("-C")
2927            .arg(&project)
2928            .args(["commit", "-m", "rename"])
2929            .status()
2930            .expect("git commit rename");
2931        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2932
2933        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2934        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2935
2936        let refreshed = SearchIndex::rebuild_or_refresh(
2937            &project,
2938            DEFAULT_MAX_FILE_SIZE,
2939            Some(current),
2940            Some(baseline),
2941        );
2942
2943        assert!(!refreshed
2944            .path_to_id
2945            .contains_key(&canonical_project.join("old.txt")));
2946        assert!(refreshed
2947            .path_to_id
2948            .contains_key(&canonical_project.join("new.txt")));
2949        assert!(refreshed
2950            .path_to_id
2951            .contains_key(&canonical_project.join("untracked.txt")));
2952        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2953        assert_eq!(matches.matches.len(), 1);
2954    }
2955
2956    #[test]
2957    fn read_from_disk_rejects_corrupt_postings_checksum() {
2958        let dir = tempfile::tempdir().expect("create temp dir");
2959        let project = dir.path().join("project");
2960        fs::create_dir_all(&project).expect("create project dir");
2961        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2962
2963        let index = SearchIndex::build(&project);
2964        let cache_dir = dir.path().join("cache");
2965        index.write_to_disk(&cache_dir, None);
2966
2967        let cache_path = cache_dir.join("cache.bin");
2968        let mut bytes = fs::read(&cache_path).expect("read cache");
2969        let middle = bytes.len() / 2;
2970        bytes[middle] ^= 0xff;
2971        fs::write(&cache_path, bytes).expect("write corrupted cache");
2972
2973        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2974    }
2975
2976    #[test]
2977    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2978        let dir = tempfile::tempdir().expect("create temp dir");
2979        let project = dir.path().join("project");
2980        fs::create_dir_all(&project).expect("create project dir");
2981        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2982
2983        let index = SearchIndex::build(&project);
2984        let cache_dir = dir.path().join("cache");
2985        index.write_to_disk(&cache_dir, None);
2986
2987        assert!(cache_dir.join("cache.bin").is_file());
2988        assert!(fs::read_dir(&cache_dir)
2989            .expect("read cache dir")
2990            .all(|entry| !entry
2991                .expect("cache entry")
2992                .file_name()
2993                .to_string_lossy()
2994                .contains(".tmp.")));
2995    }
2996
2997    #[test]
2998    fn concurrent_search_index_writes_do_not_corrupt() {
2999        let dir = tempfile::tempdir().expect("create temp dir");
3000        let project = dir.path().join("project");
3001        fs::create_dir_all(&project).expect("create project dir");
3002        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
3003        let cache_dir = dir.path().join("cache");
3004
3005        let a_project = project.clone();
3006        let a_cache = cache_dir.clone();
3007        let a = std::thread::spawn(move || {
3008            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
3009            let index = SearchIndex::build(&a_project);
3010            index.write_to_disk(&a_cache, None);
3011        });
3012        let b_project = project.clone();
3013        let b_cache = cache_dir.clone();
3014        let b = std::thread::spawn(move || {
3015            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
3016            let index = SearchIndex::build(&b_project);
3017            index.write_to_disk(&b_cache, None);
3018        });
3019        a.join().expect("writer a");
3020        b.join().expect("writer b");
3021
3022        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
3023    }
3024
3025    #[test]
3026    fn search_index_atomic_rename_survives_partial_write() {
3027        let dir = tempfile::tempdir().expect("create temp dir");
3028        let cache_dir = dir.path().join("cache");
3029        fs::create_dir_all(&cache_dir).expect("create cache dir");
3030        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
3031
3032        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
3033    }
3034
3035    #[test]
3036    fn artifact_cache_key_shared_across_clones_of_same_repo() {
3037        let dir = tempfile::tempdir().expect("create temp dir");
3038        let source = dir.path().join("source");
3039        fs::create_dir_all(&source).expect("create source repo dir");
3040        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
3041
3042        assert!(Command::new("git")
3043            .current_dir(&source)
3044            .args(["init"])
3045            .status()
3046            .expect("init git repo")
3047            .success());
3048        assert!(Command::new("git")
3049            .current_dir(&source)
3050            .args(["add", "."])
3051            .status()
3052            .expect("git add")
3053            .success());
3054        assert!(Command::new("git")
3055            .current_dir(&source)
3056            .args([
3057                "-c",
3058                "user.name=AFT Tests",
3059                "-c",
3060                "user.email=aft-tests@example.com",
3061                "commit",
3062                "-m",
3063                "initial",
3064            ])
3065            .status()
3066            .expect("git commit")
3067            .success());
3068
3069        let clone = dir.path().join("clone");
3070        assert!(Command::new("git")
3071            .args(["clone", "--quiet"])
3072            .arg(&source)
3073            .arg(&clone)
3074            .status()
3075            .expect("git clone")
3076            .success());
3077
3078        let source_key = artifact_cache_key(&source);
3079        let clone_key = artifact_cache_key(&clone);
3080
3081        assert_eq!(source_key.len(), 16);
3082        assert_eq!(clone_key.len(), 16);
3083        // Same repo (same root commit) → same cache key regardless of clone path
3084        assert_eq!(source_key, clone_key);
3085    }
3086
3087    #[test]
3088    fn git_head_unchanged_picks_up_local_edits() {
3089        let dir = tempfile::tempdir().expect("create temp dir");
3090        let project = dir.path().join("repo");
3091        fs::create_dir_all(&project).expect("create repo dir");
3092        let file = project.join("tracked.txt");
3093        fs::write(&file, "oldtoken\n").expect("write file");
3094        assert!(Command::new("git")
3095            .current_dir(&project)
3096            .arg("init")
3097            .status()
3098            .unwrap()
3099            .success());
3100        assert!(Command::new("git")
3101            .current_dir(&project)
3102            .args(["add", "."])
3103            .status()
3104            .unwrap()
3105            .success());
3106        assert!(Command::new("git")
3107            .current_dir(&project)
3108            .args([
3109                "-c",
3110                "user.name=AFT Tests",
3111                "-c",
3112                "user.email=aft-tests@example.com",
3113                "commit",
3114                "-m",
3115                "initial"
3116            ])
3117            .status()
3118            .unwrap()
3119            .success());
3120        let head = current_git_head(&project);
3121        let mut baseline = SearchIndex::build(&project);
3122        baseline.git_head = head.clone();
3123        fs::write(&file, "newtoken\n").expect("edit tracked file");
3124
3125        let refreshed =
3126            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
3127        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
3128
3129        assert_eq!(result.total_matches, 1);
3130    }
3131
3132    #[test]
3133    fn non_git_project_reuses_cache_when_files_unchanged() {
3134        let dir = tempfile::tempdir().expect("create temp dir");
3135        let project = dir.path().join("project");
3136        fs::create_dir_all(&project).expect("create project dir");
3137        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
3138        let baseline = SearchIndex::build(&project);
3139        let baseline_file_count = baseline.file_count();
3140
3141        let refreshed =
3142            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
3143
3144        assert_eq!(refreshed.file_count(), baseline_file_count);
3145        assert_eq!(
3146            refreshed
3147                .grep("unchangedtoken", true, &[], &[], &project, 10)
3148                .total_matches,
3149            1
3150        );
3151    }
3152
3153    #[test]
3154    fn resolve_search_scope_disables_index_for_external_path() {
3155        let dir = tempfile::tempdir().expect("create temp dir");
3156        let project = dir.path().join("project");
3157        let outside = dir.path().join("outside");
3158        fs::create_dir_all(&project).expect("create project dir");
3159        fs::create_dir_all(&outside).expect("create outside dir");
3160
3161        let scope = resolve_search_scope(&project, outside.to_str());
3162
3163        assert_eq!(
3164            scope.root,
3165            fs::canonicalize(&outside).expect("canonicalize outside")
3166        );
3167        assert!(!scope.use_index);
3168    }
3169
3170    #[test]
3171    fn grep_filters_matches_to_search_root() {
3172        let dir = tempfile::tempdir().expect("create temp dir");
3173        let project = dir.path().join("project");
3174        let src = project.join("src");
3175        let docs = project.join("docs");
3176        fs::create_dir_all(&src).expect("create src dir");
3177        fs::create_dir_all(&docs).expect("create docs dir");
3178        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
3179        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
3180
3181        let index = SearchIndex::build(&project);
3182        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
3183
3184        assert_eq!(result.files_searched, 1);
3185        assert_eq!(result.files_with_matches, 1);
3186        assert_eq!(result.matches.len(), 1);
3187        // Index stores canonicalized paths; on macOS /var → /private/var
3188        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
3189        assert_eq!(result.matches[0].file, expected);
3190    }
3191
3192    #[test]
3193    fn grep_deduplicates_multiple_matches_on_same_line() {
3194        let dir = tempfile::tempdir().expect("create temp dir");
3195        let project = dir.path().join("project");
3196        let src = project.join("src");
3197        fs::create_dir_all(&src).expect("create src dir");
3198        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
3199
3200        let index = SearchIndex::build(&project);
3201        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
3202
3203        assert_eq!(result.total_matches, 1);
3204        assert_eq!(result.matches.len(), 1);
3205    }
3206
3207    #[test]
3208    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
3209        let dir = tempfile::tempdir().expect("create temp dir");
3210        let project = dir.path().join("project");
3211        fs::create_dir_all(&project).expect("create project dir");
3212        let file = project.join("unicode.txt");
3213        fs::write(&file, "äbc\n").expect("write unicode file");
3214
3215        let index = SearchIndex::build(&project);
3216        let result = index.grep("Äbc", false, &[], &[], &project, 10);
3217
3218        assert_eq!(result.total_matches, 1);
3219        assert_eq!(result.matches.len(), 1);
3220        assert_eq!(
3221            result.matches[0].file,
3222            fs::canonicalize(file).expect("canonicalize unicode file")
3223        );
3224    }
3225
3226    #[test]
3227    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
3228        let dir = tempfile::tempdir().expect("create temp dir");
3229        let project = dir.path().join("project");
3230        fs::create_dir_all(&project).expect("create project dir");
3231        let file = project.join("tokens.txt");
3232        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
3233        fs::write(&file, "alpha").expect("write original file");
3234        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
3235
3236        let baseline = SearchIndex::build(&project);
3237        fs::write(&file, "bravo").expect("write same-size edit");
3238        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
3239
3240        let refreshed =
3241            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
3242        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
3243        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
3244        let refreshed_id = *refreshed
3245            .path_to_id
3246            .get(&canonical_file)
3247            .expect("file remains indexed");
3248
3249        assert_eq!(result.total_matches, 1);
3250        assert!(refreshed
3251            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
3252            .contains(&refreshed_id));
3253        assert!(!refreshed
3254            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
3255            .contains(&refreshed_id));
3256    }
3257
3258    #[test]
3259    fn grep_reports_total_matches_before_truncation() {
3260        let dir = tempfile::tempdir().expect("create temp dir");
3261        let project = dir.path().join("project");
3262        let src = project.join("src");
3263        fs::create_dir_all(&src).expect("create src dir");
3264        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
3265
3266        let index = SearchIndex::build(&project);
3267        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
3268
3269        assert_eq!(result.total_matches, 2);
3270        assert_eq!(result.matches.len(), 1);
3271        assert!(result.truncated);
3272    }
3273
3274    #[test]
3275    fn glob_filters_results_to_search_root() {
3276        let dir = tempfile::tempdir().expect("create temp dir");
3277        let project = dir.path().join("project");
3278        let src = project.join("src");
3279        let scripts = project.join("scripts");
3280        fs::create_dir_all(&src).expect("create src dir");
3281        fs::create_dir_all(&scripts).expect("create scripts dir");
3282        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
3283        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
3284
3285        let index = SearchIndex::build(&project);
3286        let files = index.glob("**/*.rs", &src);
3287
3288        assert_eq!(
3289            files,
3290            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
3291        );
3292    }
3293
3294    #[test]
3295    fn glob_includes_hidden_and_binary_files() {
3296        let dir = tempfile::tempdir().expect("create temp dir");
3297        let project = dir.path().join("project");
3298        let hidden_dir = project.join(".hidden");
3299        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
3300        let hidden_file = hidden_dir.join("data.bin");
3301        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
3302
3303        let index = SearchIndex::build(&project);
3304        let files = index.glob("**/*.bin", &project);
3305
3306        assert_eq!(
3307            files,
3308            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
3309        );
3310    }
3311
3312    #[test]
3313    fn read_from_disk_rejects_invalid_nanos() {
3314        let dir = tempfile::tempdir().expect("create temp dir");
3315        let cache_dir = dir.path().join("cache");
3316        fs::create_dir_all(&cache_dir).expect("create cache dir");
3317
3318        let mut postings = Vec::new();
3319        postings.extend_from_slice(INDEX_MAGIC);
3320        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3321        postings.extend_from_slice(&0u32.to_le_bytes());
3322        postings.extend_from_slice(&1u32.to_le_bytes());
3323        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
3324        postings.extend_from_slice(&1u32.to_le_bytes());
3325        postings.extend_from_slice(b"/");
3326        postings.push(0u8);
3327        postings.extend_from_slice(&1u32.to_le_bytes());
3328        postings.extend_from_slice(&0u64.to_le_bytes());
3329        postings.extend_from_slice(&0u64.to_le_bytes());
3330        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
3331        postings.extend_from_slice(b"a");
3332        postings.extend_from_slice(&0u64.to_le_bytes());
3333
3334        let mut lookup = Vec::new();
3335        lookup.extend_from_slice(LOOKUP_MAGIC);
3336        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3337        lookup.extend_from_slice(&0u32.to_le_bytes());
3338
3339        let postings_checksum = crc32fast::hash(&postings);
3340        postings.extend_from_slice(&postings_checksum.to_le_bytes());
3341        let lookup_checksum = crc32fast::hash(&lookup);
3342        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
3343        let mut cache = Vec::new();
3344        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
3345        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3346        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
3347        cache.extend_from_slice(&postings);
3348        cache.extend_from_slice(&lookup);
3349        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
3350
3351        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
3352    }
3353
3354    #[test]
3355    fn parallel_cold_build_matches_serial_index() {
3356        let dir = tempfile::tempdir().expect("create temp dir");
3357        let project = dir.path().join("project");
3358        for index in 0..80 {
3359            let sub = project.join(format!("pkg_{index:03}"));
3360            fs::create_dir_all(&sub).expect("create subdir");
3361            fs::write(
3362                sub.join("lib.rs"),
3363                format!(
3364                    "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
3365                ),
3366            )
3367            .expect("write lib");
3368        }
3369
3370        let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
3371        let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);
3372
3373        assert_eq!(serial.file_count(), parallel.file_count());
3374        assert_eq!(serial.trigram_count(), parallel.trigram_count());
3375        assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
3376        assert_eq!(serial.postings, parallel.postings);
3377        assert_eq!(serial.file_trigrams, parallel.file_trigrams);
3378        for (path, id) in &serial.path_to_id {
3379            assert_eq!(parallel.path_to_id.get(path), Some(id));
3380        }
3381        for (serial_file, parallel_file) in serial.files.iter().zip(&parallel.files) {
3382            assert_eq!(serial_file.path, parallel_file.path);
3383            assert_eq!(serial_file.size, parallel_file.size);
3384            assert_eq!(serial_file.modified, parallel_file.modified);
3385            assert_eq!(serial_file.content_hash, parallel_file.content_hash);
3386        }
3387
3388        let serial_grep = serial.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
3389        let parallel_grep = parallel.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
3390        assert_eq!(serial_grep.matches, parallel_grep.matches);
3391        assert_eq!(serial_grep.total_matches, parallel_grep.total_matches);
3392        assert_eq!(serial_grep.files_searched, parallel_grep.files_searched);
3393        assert_eq!(
3394            serial_grep.files_with_matches,
3395            parallel_grep.files_with_matches
3396        );
3397    }
3398
3399    #[test]
3400    fn ignore_rule_discovery_respects_gitignore() {
3401        let dir = tempfile::tempdir().expect("create temp dir");
3402        let project = dir.path().join("project");
3403        fs::create_dir_all(project.join("src")).expect("mkdir src");
3404        fs::write(project.join("src/.gitignore"), "data/\n").expect("write gitignore");
3405        let data = project.join("src/data");
3406        fs::create_dir_all(&data).expect("mkdir data");
3407        for index in 0..200 {
3408            fs::create_dir_all(data.join(format!("d{index}"))).expect("mkdir nested");
3409            fs::write(data.join(format!("d{index}/f.rs")), "fn ignored() {}\n")
3410                .expect("write ignored file");
3411        }
3412
3413        Command::new("git")
3414            .arg("init")
3415            .arg(&project)
3416            .status()
3417            .expect("git init");
3418        for args in [
3419            ["config", "user.email", "aft@example.invalid"],
3420            ["config", "user.name", "AFT Test"],
3421        ] {
3422            Command::new("git")
3423                .arg("-C")
3424                .arg(&project)
3425                .args(args)
3426                .status()
3427                .expect("git config");
3428        }
3429        Command::new("git")
3430            .arg("-C")
3431            .arg(&project)
3432            .args(["add", "."])
3433            .status()
3434            .expect("git add");
3435        Command::new("git")
3436            .arg("-C")
3437            .arg(&project)
3438            .args(["commit", "-m", "initial"])
3439            .status()
3440            .expect("git commit");
3441
3442        let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project);
3443        let walker_dirs = count_ignore_rule_discovery_dirs(&project);
3444        assert!(
3445            legacy_dirs > walker_dirs,
3446            "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
3447        );
3448        assert!(
3449            walker_dirs < 50,
3450            "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
3451        );
3452    }
3453
3454    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
3455    /// changed between cmp() calls.
3456    ///
3457    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
3458    /// which does a `stat()` syscall. If the file was deleted, modified, or
3459    /// touched mid-sort, the comparator returned different values for the
3460    /// same input pair on different invocations. Rust's slice::sort detects
3461    /// this and panics with "user-provided comparison function does not
3462    /// correctly implement a total order".
3463    ///
3464    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
3465    /// bridge invalidated files in parallel with grep's sort path. This
3466    /// test simulates the worst case: most paths don't exist (Err from
3467    /// fs::metadata) and sort still completes successfully.
3468    #[test]
3469    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
3470        // Mix of existing and non-existing paths in deliberately
3471        // non-monotonic order — pre-fix, the sort would call stat() at
3472        // least N log N times and any flakiness would trigger the panic.
3473        let dir = tempfile::tempdir().expect("create tempdir");
3474        let mut paths: Vec<PathBuf> = Vec::new();
3475        for i in 0..30 {
3476            // Half exist, half don't.
3477            let path = if i % 2 == 0 {
3478                let p = dir.path().join(format!("real-{i}.rs"));
3479                fs::write(&p, format!("// {i}\n")).expect("write");
3480                p
3481            } else {
3482                dir.path().join(format!("missing-{i}.rs"))
3483            };
3484            paths.push(path);
3485        }
3486
3487        // Run the sort many times to maximise the chance of catching any
3488        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
3489        for _ in 0..50 {
3490            let mut copy = paths.clone();
3491            sort_paths_by_mtime_desc(&mut copy);
3492            assert_eq!(copy.len(), paths.len());
3493        }
3494    }
3495
3496    /// Regression: the indexed parallel search's reduce() combine closure must
3497    /// NOT set engine_capped. reduce runs on every partial-result merge in a
3498    /// multi-chunk parallel search (>10 candidate files), capped or not — an
3499    /// unconditional store there falsely reported every such grep as capped,
3500    /// lying to the agent that results were truncated.
3501    #[test]
3502    fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
3503        let dir = tempfile::tempdir().expect("create tempdir");
3504        // >10 files so the parallel (reduce) branch is taken, each with exactly
3505        // one match, and a generous cap so the search is NOT actually capped.
3506        for i in 0..40 {
3507            fs::write(
3508                dir.path().join(format!("file-{i}.rs")),
3509                format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n"),
3510            )
3511            .expect("write");
3512        }
3513        let index = SearchIndex::build_with_limit(dir.path(), DEFAULT_MAX_FILE_SIZE);
3514        let result = index.grep("needle_token", false, &[], &[], dir.path(), 1000);
3515        assert!(
3516            result.matches.len() >= 40,
3517            "expected a match per file, got {}",
3518            result.matches.len()
3519        );
3520        assert!(
3521            !result.engine_capped,
3522            "an uncapped grep over >10 files must not report engine_capped"
3523        );
3524        assert!(!result.truncated, "uncapped grep must not be truncated");
3525    }
3526
3527    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
3528    /// the same conditions as sort_paths_by_mtime_desc. See the
3529    /// sort_paths_... test above for the full rationale.
3530    #[test]
3531    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
3532        let dir = tempfile::tempdir().expect("create tempdir");
3533        let mut matches: Vec<GrepMatch> = Vec::new();
3534        for i in 0..30 {
3535            let file = if i % 2 == 0 {
3536                let p = dir.path().join(format!("real-{i}.rs"));
3537                fs::write(&p, format!("// {i}\n")).expect("write");
3538                p
3539            } else {
3540                dir.path().join(format!("missing-{i}.rs"))
3541            };
3542            matches.push(GrepMatch {
3543                file,
3544                line: u32::try_from(i).unwrap_or(0),
3545                column: 0,
3546                line_text: format!("match {i}"),
3547                match_text: format!("match {i}"),
3548            });
3549        }
3550
3551        for _ in 0..50 {
3552            let mut copy = matches.clone();
3553            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
3554            assert_eq!(copy.len(), matches.len());
3555        }
3556    }
3557}
aft/search_index.rs

aft/
search_index.rs