aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const INDEX_VERSION: u32 = 4;
27const PREVIEW_BYTES: usize = 8 * 1024;
28const EOF_SENTINEL: u8 = 0;
29const MAX_ENTRIES: usize = 10_000_000;
30const MIN_FILE_ENTRY_BYTES: usize = 57;
31const LOOKUP_ENTRY_BYTES: usize = 16;
32const POSTING_BYTES: usize = 6;
33static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
35pub struct CacheLock {
36    _guard: fs_lock::LockGuard,
37}
38
39impl CacheLock {
40    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41        fs::create_dir_all(cache_dir)?;
42        let path = cache_dir.join("cache.lock");
43        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44            .lock()
45            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46        fs_lock::try_acquire(&path, Duration::from_secs(2))
47            .map(|guard| Self { _guard: guard })
48            .map_err(|error| match error {
49                fs_lock::AcquireError::Timeout => {
50                    std::io::Error::other("timed out acquiring search cache lock")
51                }
52                fs_lock::AcquireError::Io(error) => error,
53            })
54    }
55}
56
57#[derive(Clone, Debug)]
58pub struct SearchIndex {
59    pub postings: HashMap<u32, Vec<Posting>>,
60    pub files: Vec<FileEntry>,
61    pub path_to_id: HashMap<PathBuf, u32>,
62    pub ready: bool,
63    project_root: PathBuf,
64    git_head: Option<String>,
65    max_file_size: u64,
66    ignore_rules_fingerprint: String,
67    pub file_trigrams: HashMap<u32, Vec<u32>>,
68    unindexed_files: HashSet<u32>,
69}
70
71#[derive(Clone, Debug, Default)]
72pub struct LexicalRankResult {
73    pub files: Vec<(PathBuf, f32)>,
74    pub engine_capped: bool,
75}
76
77impl SearchIndex {
78    /// Number of indexed files.
79    pub fn file_count(&self) -> usize {
80        self.files.len()
81    }
82
83    /// Number of unique trigrams in the index.
84    pub fn trigram_count(&self) -> usize {
85        self.postings.len()
86    }
87
88    /// Compute distinct query trigrams from literal tokens.
89    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
90        query_trigrams_from_tokens(tokens)
91    }
92
93    /// Score-rank file candidates by lexical relevance to query trigrams.
94    pub fn lexical_rank(
95        &self,
96        query_trigrams: &[u32],
97        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
98        max_files: usize,
99    ) -> Vec<(PathBuf, f32)> {
100        self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
101            .files
102    }
103
104    /// Score-rank file candidates and report whether pre-filter candidate
105    /// enumeration hit the internal 200/500 cap before ranking.
106    pub fn lexical_rank_with_stats(
107        &self,
108        query_trigrams: &[u32],
109        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
110        max_files: usize,
111    ) -> LexicalRankResult {
112        if query_trigrams.is_empty() || max_files == 0 {
113            return LexicalRankResult::default();
114        }
115
116        let mut non_zero: Vec<(u32, usize)> = query_trigrams
117            .iter()
118            .filter_map(|trigram| {
119                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
120                (posting_count > 0).then_some((*trigram, posting_count))
121            })
122            .collect();
123        if non_zero.is_empty() {
124            return LexicalRankResult::default();
125        }
126
127        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
128        let selected_count = non_zero.len().min(3);
129        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
130
131        let mut candidate_ids = BTreeSet::new();
132        for (trigram, _) in non_zero.iter().take(selected_count) {
133            if let Some(postings) = self.postings.get(trigram) {
134                for posting in postings {
135                    if self.is_active_file(posting.file_id) {
136                        candidate_ids.insert(posting.file_id);
137                    }
138                }
139            }
140        }
141        let pre_filter_candidate_count = candidate_ids.len();
142        let engine_capped = pre_filter_candidate_count > candidate_cap;
143        let filtered_candidates = candidate_ids
144            .into_iter()
145            .filter_map(|file_id| {
146                self.files
147                    .get(file_id as usize)
148                    .map(|entry| (file_id, entry))
149            })
150            .filter(|(_, entry)| {
151                if let Some(filter) = candidate_filter {
152                    filter(&entry.path)
153                } else {
154                    true
155                }
156            })
157            .collect::<Vec<_>>();
158
159        let mut ranked = Vec::new();
160        for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
161            let score = lexical_score(self, query_trigrams, file_id);
162            if score > 0.0 {
163                ranked.push((entry.path.clone(), score));
164            }
165        }
166
167        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
168        ranked.truncate(max_files);
169        LexicalRankResult {
170            files: ranked,
171            engine_capped,
172        }
173    }
174}
175
176#[derive(Clone, Debug, PartialEq, Eq)]
177pub struct Posting {
178    pub file_id: u32,
179    pub next_mask: u8,
180    pub loc_mask: u8,
181}
182
183#[derive(Clone, Debug)]
184pub struct FileEntry {
185    pub path: PathBuf,
186    pub size: u64,
187    pub modified: SystemTime,
188    pub content_hash: blake3::Hash,
189}
190
191#[derive(Clone, Debug, PartialEq, Eq)]
192pub struct GrepMatch {
193    pub file: PathBuf,
194    pub line: u32,
195    pub column: u32,
196    pub line_text: String,
197    pub match_text: String,
198}
199
200#[derive(Clone, Debug)]
201pub struct GrepResult {
202    pub matches: Vec<GrepMatch>,
203    pub total_matches: usize,
204    pub files_searched: usize,
205    pub files_with_matches: usize,
206    pub index_status: IndexStatus,
207    pub truncated: bool,
208    pub fully_degraded: bool,
209    pub engine_capped: bool,
210}
211
212#[derive(Clone, Copy, Debug, PartialEq, Eq)]
213pub enum IndexStatus {
214    Ready,
215    Building,
216    Fallback,
217    Disabled,
218}
219
220impl IndexStatus {
221    pub fn as_str(&self) -> &'static str {
222        match self {
223            IndexStatus::Ready => "Ready",
224            IndexStatus::Building => "Building",
225            IndexStatus::Fallback => "Fallback",
226            IndexStatus::Disabled => "Disabled",
227        }
228    }
229}
230
231#[derive(Clone, Debug, Default)]
232pub struct RegexQuery {
233    pub and_trigrams: Vec<u32>,
234    pub or_groups: Vec<Vec<u32>>,
235    pub(crate) and_filters: HashMap<u32, PostingFilter>,
236    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
237}
238
239#[derive(Clone, Copy, Debug, Default)]
240pub(crate) struct PostingFilter {
241    next_mask: u8,
242    loc_mask: u8,
243}
244
245#[derive(Clone, Debug, Default)]
246struct QueryBuild {
247    and_runs: Vec<Vec<u8>>,
248    or_groups: Vec<Vec<Vec<u8>>>,
249}
250
251#[derive(Clone, Debug, Default)]
252pub(crate) struct PathFilters {
253    includes: Option<GlobSet>,
254    excludes: Option<GlobSet>,
255}
256
257#[derive(Clone, Debug)]
258pub(crate) struct SearchScope {
259    pub root: PathBuf,
260    pub use_index: bool,
261}
262
263#[derive(Clone, Debug)]
264struct SharedGrepMatch {
265    file: Arc<PathBuf>,
266    line: u32,
267    column: u32,
268    line_text: String,
269    match_text: String,
270}
271
272#[derive(Clone, Debug)]
273enum SearchMatcher {
274    Literal(LiteralSearch),
275    Regex(Regex),
276}
277
278impl SearchIndex {
279    pub fn new() -> Self {
280        SearchIndex {
281            postings: HashMap::new(),
282            files: Vec::new(),
283            path_to_id: HashMap::new(),
284            ready: false,
285            project_root: PathBuf::new(),
286            git_head: None,
287            max_file_size: DEFAULT_MAX_FILE_SIZE,
288            ignore_rules_fingerprint: String::new(),
289            file_trigrams: HashMap::new(),
290            unindexed_files: HashSet::new(),
291        }
292    }
293
294    pub fn build(root: &Path) -> Self {
295        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
296    }
297
298    pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
299        let started = std::time::Instant::now();
300        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
301        let mut index = SearchIndex {
302            project_root: project_root.clone(),
303            max_file_size,
304            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
305            ..SearchIndex::new()
306        };
307
308        let filters = PathFilters::default();
309        let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
310        let indexed = index.ingest_paths_parallel(&paths);
311
312        index.git_head = current_git_head(&project_root);
313        index.ready = true;
314        crate::slog_info!(
315            "search index cold build: {} files, {} trigrams, {} ms (pool={})",
316            indexed,
317            index.postings.len(),
318            started.elapsed().as_millis(),
319            search_index_build_pool_size()
320        );
321        index
322    }
323
324    /// Serial cold build for tests and parity checks against [`build_with_limit`].
325    #[cfg(test)]
326    pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
327        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
328        let mut index = SearchIndex {
329            project_root: project_root.clone(),
330            max_file_size,
331            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
332            ..SearchIndex::new()
333        };
334        let filters = PathFilters::default();
335        for path in walk_project_files(&project_root, &filters) {
336            index.update_file(&path);
337        }
338        index.git_head = current_git_head(&project_root);
339        index.ready = true;
340        index
341    }
342
343    fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
344        let max_file_size = self.max_file_size;
345
346        let parallel_read = || -> Vec<(PathBuf, Vec<u8>)> {
347            paths
348                .par_iter()
349                .filter_map(|path| {
350                    let metadata = fs::metadata(path).ok()?;
351                    if !metadata.is_file() {
352                        return None;
353                    }
354                    if is_binary_path(path, metadata.len()) || metadata.len() > max_file_size {
355                        return None;
356                    }
357                    let content = fs::read(path).ok()?;
358                    if is_binary_bytes(&content) {
359                        return None;
360                    }
361                    Some((path.clone(), content))
362                })
363                .collect()
364        };
365
366        let readable: Vec<(PathBuf, Vec<u8>)> = match rayon::ThreadPoolBuilder::new()
367            .num_threads(search_index_build_pool_size())
368            .thread_name(|index| format!("aft-search-build-{index}"))
369            .stack_size(8 * 1024 * 1024)
370            .build()
371        {
372            Ok(pool) => pool.install(parallel_read),
373            Err(error) => {
374                log::warn!(
375                    "search index: bounded build pool unavailable ({error}); using global pool"
376                );
377                parallel_read()
378            }
379        };
380
381        let mut readable_by_path: HashMap<PathBuf, Vec<u8>> = readable.into_iter().collect();
382        let mut indexed = 0usize;
383        for path in paths {
384            if let Some(content) = readable_by_path.remove(path) {
385                self.index_file(path, &content);
386                indexed += 1;
387            } else {
388                self.update_file(path);
389                if self.path_to_id.contains_key(path) {
390                    indexed += 1;
391                }
392            }
393        }
394        indexed
395    }
396
397    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
398        self.remove_file(path);
399
400        let file_id = match self.allocate_file_id(path, content.len() as u64) {
401            Some(file_id) => file_id,
402            None => return,
403        };
404        if let Some(file) = self.files.get_mut(file_id as usize) {
405            file.content_hash = cache_freshness::hash_bytes(content);
406        }
407
408        let trigram_map = trigram_filter_map(content, true);
409
410        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
411        for (trigram, filter) in trigram_map {
412            let postings = self.postings.entry(trigram).or_default();
413            postings.push(Posting {
414                file_id,
415                next_mask: filter.next_mask,
416                loc_mask: filter.loc_mask,
417            });
418            // Posting lists are kept sorted by file_id for binary search during
419            // intersection. Since file_ids are allocated incrementally, the new
420            // entry is usually already in order. Only sort when needed.
421            if postings.len() > 1
422                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
423            {
424                postings.sort_unstable_by_key(|p| p.file_id);
425            }
426            file_trigrams.push(trigram);
427        }
428
429        self.file_trigrams.insert(file_id, file_trigrams);
430        self.unindexed_files.remove(&file_id);
431    }
432
433    pub fn remove_file(&mut self, path: &Path) {
434        let canonical_path = canonicalize_existing_or_deleted_path(path);
435        let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
436            file_id
437        } else if canonical_path.as_path() != path {
438            let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
439                return;
440            };
441            file_id
442        } else {
443            return;
444        };
445
446        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
447            for trigram in trigrams {
448                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
449                    postings.retain(|posting| posting.file_id != file_id);
450                    postings.is_empty()
451                } else {
452                    false
453                };
454
455                if should_remove {
456                    self.postings.remove(&trigram);
457                }
458            }
459        }
460
461        self.unindexed_files.remove(&file_id);
462        if let Some(file) = self.files.get_mut(file_id as usize) {
463            file.path = PathBuf::new();
464            file.size = 0;
465            file.modified = UNIX_EPOCH;
466            file.content_hash = cache_freshness::zero_hash();
467        }
468    }
469
470    pub fn update_file(&mut self, path: &Path) {
471        self.remove_file(path);
472
473        let metadata = match fs::metadata(path) {
474            Ok(metadata) if metadata.is_file() => metadata,
475            _ => return,
476        };
477
478        if is_binary_path(path, metadata.len()) {
479            self.track_unindexed_file(path, &metadata);
480            return;
481        }
482
483        if metadata.len() > self.max_file_size {
484            self.track_unindexed_file(path, &metadata);
485            return;
486        }
487
488        let content = match fs::read(path) {
489            Ok(content) => content,
490            Err(_) => return,
491        };
492
493        if is_binary_bytes(&content) {
494            self.track_unindexed_file(path, &metadata);
495            return;
496        }
497
498        self.index_file(path, &content);
499    }
500
501    pub fn grep(
502        &self,
503        pattern: &str,
504        case_sensitive: bool,
505        include: &[String],
506        exclude: &[String],
507        search_root: &Path,
508        max_results: usize,
509    ) -> GrepResult {
510        match pattern_compile::compile(
511            pattern,
512            CompileOpts {
513                case_insensitive: !case_sensitive,
514                ..CompileOpts::default()
515            },
516        ) {
517            CompileResult::Ok(compiled) => {
518                self.search_grep(&compiled, include, exclude, search_root, max_results)
519            }
520            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
521                self.empty_grep_result()
522            }
523        }
524    }
525
526    pub fn search_grep(
527        &self,
528        pattern: &CompiledPattern,
529        include: &[String],
530        exclude: &[String],
531        search_root: &Path,
532        max_results: usize,
533    ) -> GrepResult {
534        let matcher = match pattern {
535            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
536            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
537        };
538
539        let filters = match build_path_filters(include, exclude) {
540            Ok(filters) => filters,
541            Err(_) => PathFilters::default(),
542        };
543        let search_root = canonicalize_or_normalize(search_root);
544
545        let raw_pattern = pattern.raw_pattern_for_trigrams();
546        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
547            RegexQuery::default()
548        } else {
549            decompose_regex(&raw_pattern)
550        };
551        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
552        let candidate_ids = self.candidates(&query);
553
554        let candidate_files: Vec<&FileEntry> = candidate_ids
555            .into_iter()
556            .filter_map(|file_id| self.files.get(file_id as usize))
557            .filter(|file| !file.path.as_os_str().is_empty())
558            .filter(|file| is_within_search_root(&search_root, &file.path))
559            .filter(|file| filters.matches(&self.project_root, &file.path))
560            .collect();
561
562        let total_matches = AtomicUsize::new(0);
563        let files_searched = AtomicUsize::new(0);
564        let files_with_matches = AtomicUsize::new(0);
565        let truncated = AtomicBool::new(false);
566        let engine_capped = AtomicBool::new(false);
567        let stop_after = max_results.saturating_mul(2);
568        let stop_scan = Arc::new(AtomicBool::new(false));
569
570        let mut matches = if candidate_files.len() > 10 {
571            candidate_files
572                .par_iter()
573                .map(|file| {
574                    if grep_scan_should_stop(
575                        Some(&stop_scan),
576                        &truncated,
577                        &total_matches,
578                        stop_after,
579                    ) {
580                        engine_capped.store(true, Ordering::Relaxed);
581                        return Vec::new();
582                    }
583                    search_candidate_file(
584                        file,
585                        &matcher,
586                        max_results,
587                        stop_after,
588                        &total_matches,
589                        &files_searched,
590                        &files_with_matches,
591                        &truncated,
592                        &engine_capped,
593                        Some(&stop_scan),
594                    )
595                })
596                .reduce(Vec::new, |mut left, mut right| {
597                    // NOTE: do NOT set engine_capped here. reduce() runs on every
598                    // partial-result merge in a multi-chunk parallel search,
599                    // capped or not — setting it unconditionally would falsely
600                    // report every indexed grep over >10 candidates as capped.
601                    // Cap detection is owned by grep_scan_should_stop (in the map
602                    // closure) and should_stop_search (serial path), which set
603                    // engine_capped only when the result cap is actually reached.
604                    left.append(&mut right);
605                    left
606                })
607        } else {
608            let mut matches = Vec::new();
609            for file in candidate_files {
610                matches.extend(search_candidate_file(
611                    file,
612                    &matcher,
613                    max_results,
614                    stop_after,
615                    &total_matches,
616                    &files_searched,
617                    &files_with_matches,
618                    &truncated,
619                    &engine_capped,
620                    None,
621                ));
622
623                if should_stop_search(&truncated, &total_matches, stop_after) {
624                    engine_capped.store(true, Ordering::Relaxed);
625                    break;
626                }
627            }
628            matches
629        };
630
631        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
632            self.path_to_id
633                .get(path)
634                .and_then(|file_id| self.files.get(*file_id as usize))
635                .map(|file| file.modified)
636        });
637
638        let matches = matches
639            .into_iter()
640            .map(|matched| GrepMatch {
641                file: matched.file.as_ref().clone(),
642                line: matched.line,
643                column: matched.column,
644                line_text: matched.line_text,
645                match_text: matched.match_text,
646            })
647            .collect();
648
649        GrepResult {
650            total_matches: total_matches.load(Ordering::Relaxed),
651            matches,
652            files_searched: files_searched.load(Ordering::Relaxed),
653            files_with_matches: files_with_matches.load(Ordering::Relaxed),
654            index_status: if self.ready {
655                IndexStatus::Ready
656            } else {
657                IndexStatus::Building
658            },
659            truncated: truncated.load(Ordering::Relaxed),
660            fully_degraded,
661            engine_capped: engine_capped.load(Ordering::Relaxed),
662        }
663    }
664
665    fn empty_grep_result(&self) -> GrepResult {
666        GrepResult {
667            matches: Vec::new(),
668            total_matches: 0,
669            files_searched: 0,
670            files_with_matches: 0,
671            index_status: if self.ready {
672                IndexStatus::Ready
673            } else {
674                IndexStatus::Building
675            },
676            truncated: false,
677            fully_degraded: false,
678            engine_capped: false,
679        }
680    }
681
682    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
683        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
684            Ok(filters) => filters,
685            Err(_) => return Vec::new(),
686        };
687        let search_root = canonicalize_or_normalize(search_root);
688        let mut entries = self
689            .files
690            .iter()
691            .filter(|file| !file.path.as_os_str().is_empty())
692            .filter(|file| is_within_search_root(&search_root, &file.path))
693            .filter(|file| filters.matches(&self.project_root, &file.path))
694            .map(|file| (file.path.clone(), file.modified))
695            .collect::<Vec<_>>();
696
697        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
698            right_mtime
699                .cmp(left_mtime)
700                .then_with(|| left_path.cmp(right_path))
701        });
702
703        entries.into_iter().map(|(path, _)| path).collect()
704    }
705
706    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
707        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
708            return self.active_file_ids();
709        }
710
711        let mut and_trigrams = query.and_trigrams.clone();
712        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
713
714        let mut current: Option<Vec<u32>> = None;
715
716        for trigram in and_trigrams {
717            let filter = query.and_filters.get(&trigram).copied();
718            let matches = self.postings_for_trigram(trigram, filter);
719            current = Some(match current.take() {
720                Some(existing) => intersect_sorted_ids(&existing, &matches),
721                None => matches,
722            });
723
724            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
725                break;
726            }
727        }
728
729        let mut current = current.unwrap_or_else(|| self.active_file_ids());
730
731        for (index, group) in query.or_groups.iter().enumerate() {
732            let mut group_matches = Vec::new();
733            let filters = query.or_filters.get(index);
734
735            for trigram in group {
736                let filter = filters.and_then(|filters| filters.get(trigram).copied());
737                let matches = self.postings_for_trigram(*trigram, filter);
738                if group_matches.is_empty() {
739                    group_matches = matches;
740                } else {
741                    group_matches = union_sorted_ids(&group_matches, &matches);
742                }
743            }
744
745            current = intersect_sorted_ids(&current, &group_matches);
746            if current.is_empty() {
747                break;
748            }
749        }
750
751        let mut unindexed = self
752            .unindexed_files
753            .iter()
754            .copied()
755            .filter(|file_id| self.is_active_file(*file_id))
756            .collect::<Vec<_>>();
757        if !unindexed.is_empty() {
758            unindexed.sort_unstable();
759            current = union_sorted_ids(&current, &unindexed);
760        }
761
762        current
763    }
764
765    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
766        if fs::create_dir_all(cache_dir).is_err() {
767            return;
768        }
769
770        let cache_path = cache_dir.join("cache.bin");
771        let tmp_cache = cache_dir.join(format!(
772            "cache.bin.tmp.{}.{}",
773            std::process::id(),
774            SystemTime::now()
775                .duration_since(UNIX_EPOCH)
776                .unwrap_or(Duration::ZERO)
777                .as_nanos()
778        ));
779
780        let active_ids = self.active_file_ids();
781        let mut id_map = HashMap::new();
782        for (new_id, old_id) in active_ids.iter().enumerate() {
783            let Ok(new_id_u32) = u32::try_from(new_id) else {
784                return;
785            };
786            id_map.insert(*old_id, new_id_u32);
787        }
788
789        let write_result = (|| -> std::io::Result<()> {
790            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
791
792            postings_writer.write_all(INDEX_MAGIC)?;
793            write_u32(&mut postings_writer, INDEX_VERSION)?;
794
795            let head = git_head.unwrap_or_default();
796            let root = self.project_root.to_string_lossy();
797            let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
798                ignore_rules_fingerprint(&self.project_root)
799            } else {
800                self.ignore_rules_fingerprint.clone()
801            };
802            let head_len = u32::try_from(head.len())
803                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
804            let root_len = u32::try_from(root.len())
805                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
806            let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
807                .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
808            let file_count = u32::try_from(active_ids.len())
809                .map_err(|_| std::io::Error::other("too many files to cache"))?;
810
811            write_u32(&mut postings_writer, head_len)?;
812            write_u32(&mut postings_writer, root_len)?;
813            write_u32(&mut postings_writer, ignore_fingerprint_len)?;
814            write_u64(&mut postings_writer, self.max_file_size)?;
815            write_u32(&mut postings_writer, file_count)?;
816            postings_writer.write_all(head.as_bytes())?;
817            postings_writer.write_all(root.as_bytes())?;
818            postings_writer.write_all(ignore_fingerprint.as_bytes())?;
819
820            for old_id in &active_ids {
821                let Some(file) = self.files.get(*old_id as usize) else {
822                    return Err(std::io::Error::other("missing file entry for cache write"));
823                };
824                let path =
825                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
826                        std::io::Error::other(format!(
827                            "refusing to cache path outside project root: {}",
828                            file.path.display()
829                        ))
830                    })?;
831                let path = path.to_string_lossy();
832                let path_len = u32::try_from(path.len())
833                    .map_err(|_| std::io::Error::other("cached path too large"))?;
834                let modified = file
835                    .modified
836                    .duration_since(UNIX_EPOCH)
837                    .unwrap_or(Duration::ZERO);
838                let unindexed = if self.unindexed_files.contains(old_id) {
839                    1u8
840                } else {
841                    0u8
842                };
843
844                postings_writer.write_all(&[unindexed])?;
845                write_u32(&mut postings_writer, path_len)?;
846                write_u64(&mut postings_writer, file.size)?;
847                write_u64(&mut postings_writer, modified.as_secs())?;
848                write_u32(&mut postings_writer, modified.subsec_nanos())?;
849                postings_writer.write_all(file.content_hash.as_bytes())?;
850                postings_writer.write_all(path.as_bytes())?;
851            }
852
853            let mut lookup_entries = Vec::new();
854            let mut postings_blob = Vec::new();
855            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
856            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
857
858            for (trigram, postings) in sorted_postings {
859                let offset = u64::try_from(postings_blob.len())
860                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
861                let mut count = 0u32;
862
863                for posting in postings {
864                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
865                        continue;
866                    };
867
868                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
869                    postings_blob.push(posting.next_mask);
870                    postings_blob.push(posting.loc_mask);
871                    count = count.saturating_add(1);
872                }
873
874                if count > 0 {
875                    lookup_entries.push((*trigram, offset, count));
876                }
877            }
878
879            write_u64(
880                &mut postings_writer,
881                u64::try_from(postings_blob.len())
882                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
883            )?;
884            postings_writer.write_all(&postings_blob)?;
885            postings_writer.flush()?;
886            let mut postings_blob_file = postings_writer
887                .into_inner()
888                .map_err(|error| std::io::Error::other(error.to_string()))?
889                .into_inner();
890            let checksum = crc32fast::hash(&postings_blob_file);
891            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
892
893            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
894            let entry_count = u32::try_from(lookup_entries.len())
895                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
896
897            lookup_writer.write_all(LOOKUP_MAGIC)?;
898            write_u32(&mut lookup_writer, INDEX_VERSION)?;
899            write_u32(&mut lookup_writer, entry_count)?;
900
901            for (trigram, offset, count) in lookup_entries {
902                write_u32(&mut lookup_writer, trigram)?;
903                write_u64(&mut lookup_writer, offset)?;
904                write_u32(&mut lookup_writer, count)?;
905            }
906
907            lookup_writer.flush()?;
908            let mut lookup_blob_file = lookup_writer
909                .into_inner()
910                .map_err(|error| std::io::Error::other(error.to_string()))?
911                .into_inner();
912            let checksum = crc32fast::hash(&lookup_blob_file);
913            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
914
915            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
916            write_u32(&mut cache_writer, CACHE_MAGIC)?;
917            write_u32(&mut cache_writer, INDEX_VERSION)?;
918            write_u64(
919                &mut cache_writer,
920                u64::try_from(postings_blob_file.len())
921                    .map_err(|_| std::io::Error::other("postings section too large"))?,
922            )?;
923            cache_writer.write_all(&postings_blob_file)?;
924            cache_writer.write_all(&lookup_blob_file)?;
925            cache_writer.flush()?;
926            cache_writer.get_ref().sync_all()?;
927            drop(cache_writer);
928            fs::rename(&tmp_cache, &cache_path)?;
929
930            Ok(())
931        })();
932
933        if write_result.is_err() {
934            let _ = fs::remove_file(&tmp_cache);
935        }
936    }
937
938    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
939        debug_assert!(current_canonical_root.is_absolute());
940        let cache_path = cache_dir.join("cache.bin");
941        let cache_bytes = fs::read(&cache_path).ok()?;
942        if cache_bytes.len() < 16 {
943            return None;
944        }
945        let mut header = Cursor::new(&cache_bytes);
946        if read_u32(&mut header).ok()? != CACHE_MAGIC {
947            return None;
948        }
949        if read_u32(&mut header).ok()? != INDEX_VERSION {
950            return None;
951        }
952        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
953        let start = usize::try_from(header.position()).ok()?;
954        let postings_end = start.checked_add(postings_len_total)?;
955        if postings_end > cache_bytes.len() {
956            return None;
957        }
958        let postings_bytes = &cache_bytes[start..postings_end];
959        let lookup_bytes = &cache_bytes[postings_end..];
960        let lookup_len_total = lookup_bytes.len();
961        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
962        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
963        if postings_len_total < 4 || lookup_len_total < 4 {
964            return None;
965        }
966        verify_crc32_bytes_slice(postings_bytes).ok()?;
967        verify_crc32_bytes_slice(lookup_bytes).ok()?;
968
969        let mut magic = [0u8; 8];
970        postings_reader.read_exact(&mut magic).ok()?;
971        if &magic != INDEX_MAGIC {
972            return None;
973        }
974        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
975            return None;
976        }
977
978        let head_len = read_u32(&mut postings_reader).ok()? as usize;
979        let root_len = read_u32(&mut postings_reader).ok()? as usize;
980        let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
981        let max_file_size = read_u64(&mut postings_reader).ok()?;
982        let file_count = read_u32(&mut postings_reader).ok()? as usize;
983        if file_count > MAX_ENTRIES {
984            return None;
985        }
986        let postings_body_len = postings_len_total.checked_sub(4)?;
987        let lookup_body_len = lookup_len_total.checked_sub(4)?;
988
989        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
990        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
991        if minimum_file_bytes > remaining_postings {
992            return None;
993        }
994
995        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
996            return None;
997        }
998        let mut head_bytes = vec![0u8; head_len];
999        postings_reader.read_exact(&mut head_bytes).ok()?;
1000        let git_head = String::from_utf8(head_bytes)
1001            .ok()
1002            .filter(|head| !head.is_empty());
1003
1004        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1005            return None;
1006        }
1007        let mut root_bytes = vec![0u8; root_len];
1008        postings_reader.read_exact(&mut root_bytes).ok()?;
1009        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
1010        let project_root = current_canonical_root.to_path_buf();
1011
1012        if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1013            return None;
1014        }
1015        let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
1016        postings_reader
1017            .read_exact(&mut ignore_fingerprint_bytes)
1018            .ok()?;
1019        let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
1020        let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
1021        if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1022            return None;
1023        }
1024
1025        let mut files = Vec::with_capacity(file_count);
1026        let mut path_to_id = HashMap::new();
1027        let mut unindexed_files = HashSet::new();
1028
1029        for file_id in 0..file_count {
1030            let mut unindexed = [0u8; 1];
1031            postings_reader.read_exact(&mut unindexed).ok()?;
1032            let path_len = read_u32(&mut postings_reader).ok()? as usize;
1033            let size = read_u64(&mut postings_reader).ok()?;
1034            let secs = read_u64(&mut postings_reader).ok()?;
1035            let nanos = read_u32(&mut postings_reader).ok()?;
1036            let mut hash_bytes = [0u8; 32];
1037            postings_reader.read_exact(&mut hash_bytes).ok()?;
1038            let content_hash = blake3::Hash::from_bytes(hash_bytes);
1039            if nanos >= 1_000_000_000 {
1040                return None;
1041            }
1042            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1043                return None;
1044            }
1045            let mut path_bytes = vec![0u8; path_len];
1046            postings_reader.read_exact(&mut path_bytes).ok()?;
1047            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
1048            let full_path = cached_path_under_root(&project_root, &relative_path)?;
1049            let file_id_u32 = u32::try_from(file_id).ok()?;
1050
1051            files.push(FileEntry {
1052                path: full_path.clone(),
1053                size,
1054                modified: UNIX_EPOCH + Duration::new(secs, nanos),
1055                content_hash,
1056            });
1057            path_to_id.insert(full_path, file_id_u32);
1058            if unindexed[0] == 1 {
1059                unindexed_files.insert(file_id_u32);
1060            }
1061        }
1062
1063        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
1064        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
1065        if postings_len > max_postings_bytes {
1066            return None;
1067        }
1068        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1069            return None;
1070        }
1071        let mut postings_blob = vec![0u8; postings_len];
1072        postings_reader.read_exact(&mut postings_blob).ok()?;
1073
1074        let mut lookup_magic = [0u8; 8];
1075        lookup_reader.read_exact(&mut lookup_magic).ok()?;
1076        if &lookup_magic != LOOKUP_MAGIC {
1077            return None;
1078        }
1079        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
1080            return None;
1081        }
1082        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
1083        if entry_count > MAX_ENTRIES {
1084            return None;
1085        }
1086        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
1087        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
1088        if minimum_lookup_bytes > remaining_lookup {
1089            return None;
1090        }
1091
1092        let mut postings = HashMap::new();
1093        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1094
1095        for _ in 0..entry_count {
1096            let trigram = read_u32(&mut lookup_reader).ok()?;
1097            let offset = read_u64(&mut lookup_reader).ok()? as usize;
1098            let count = read_u32(&mut lookup_reader).ok()? as usize;
1099            if count > MAX_ENTRIES {
1100                return None;
1101            }
1102            let bytes_len = count.checked_mul(POSTING_BYTES)?;
1103            let end = offset.checked_add(bytes_len)?;
1104            if end > postings_blob.len() {
1105                return None;
1106            }
1107
1108            let mut trigram_postings = Vec::with_capacity(count);
1109            for chunk in postings_blob[offset..end].chunks_exact(6) {
1110                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1111                let posting = Posting {
1112                    file_id,
1113                    next_mask: chunk[4],
1114                    loc_mask: chunk[5],
1115                };
1116                trigram_postings.push(posting.clone());
1117                file_trigrams.entry(file_id).or_default().push(trigram);
1118            }
1119            postings.insert(trigram, trigram_postings);
1120        }
1121
1122        Some(SearchIndex {
1123            postings,
1124            files,
1125            path_to_id,
1126            ready: false,
1127            project_root,
1128            git_head,
1129            max_file_size,
1130            ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1131            file_trigrams,
1132            unindexed_files,
1133        })
1134    }
1135
1136    pub fn stored_git_head(&self) -> Option<&str> {
1137        self.git_head.as_deref()
1138    }
1139
1140    pub(crate) fn set_ready(&mut self, ready: bool) {
1141        self.ready = ready;
1142    }
1143
1144    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1145        self.git_head = current_head;
1146        verify_file_mtimes(self);
1147        self.ready = true;
1148    }
1149
1150    #[cfg(debug_assertions)]
1151    #[doc(hidden)]
1152    pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1153        self.verify_against_disk(current_head);
1154    }
1155
1156    pub(crate) fn rebuild_or_refresh(
1157        root: &Path,
1158        max_file_size: u64,
1159        current_head: Option<String>,
1160        baseline: Option<SearchIndex>,
1161    ) -> Self {
1162        if let Some(mut baseline) = baseline {
1163            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1164            baseline.max_file_size = max_file_size;
1165            let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1166            if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1167                return SearchIndex::build_with_limit(root, max_file_size);
1168            }
1169            baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1170
1171            if baseline.git_head == current_head || current_head.is_none() {
1172                // HEAD matches, but files may have changed on disk since the index was
1173                // last written (e.g., uncommitted edits, stash pop, manual file changes
1174                // while OpenCode was closed). Verify mtimes and re-index stale files.
1175                // Non-git projects also use this per-file (path, mtime, size)
1176                // fingerprint so unchanged trees reuse the disk cache instead of
1177                // rebuilding every configure.
1178                baseline.git_head = current_head;
1179                verify_file_mtimes(&mut baseline);
1180                baseline.ready = true;
1181                return baseline;
1182            }
1183
1184            if let (Some(previous), Some(current)) =
1185                (baseline.git_head.clone(), current_head.clone())
1186            {
1187                let project_root = baseline.project_root.clone();
1188                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1189                    baseline.git_head = Some(current);
1190                    verify_file_mtimes(&mut baseline);
1191                    baseline.ready = true;
1192                    return baseline;
1193                }
1194            }
1195        }
1196
1197        SearchIndex::build_with_limit(root, max_file_size)
1198    }
1199
1200    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1201        let file_id = u32::try_from(self.files.len()).ok()?;
1202        let metadata = fs::metadata(path).ok();
1203        let size = metadata
1204            .as_ref()
1205            .map_or(size_hint, |metadata| metadata.len());
1206        let modified = metadata
1207            .and_then(|metadata| metadata.modified().ok())
1208            .unwrap_or(UNIX_EPOCH);
1209
1210        self.files.push(FileEntry {
1211            path: path.to_path_buf(),
1212            size,
1213            modified,
1214            content_hash: cache_freshness::zero_hash(),
1215        });
1216        self.path_to_id.insert(path.to_path_buf(), file_id);
1217        Some(file_id)
1218    }
1219
1220    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1221        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1222            return;
1223        };
1224        self.unindexed_files.insert(file_id);
1225        self.file_trigrams.insert(file_id, Vec::new());
1226    }
1227
1228    fn active_file_ids(&self) -> Vec<u32> {
1229        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1230        ids.sort_unstable();
1231        ids
1232    }
1233
1234    fn is_active_file(&self, file_id: u32) -> bool {
1235        self.files
1236            .get(file_id as usize)
1237            .map(|file| !file.path.as_os_str().is_empty())
1238            .unwrap_or(false)
1239    }
1240
1241    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1242        let Some(postings) = self.postings.get(&trigram) else {
1243            return Vec::new();
1244        };
1245
1246        let mut matches = Vec::with_capacity(postings.len());
1247
1248        for posting in postings {
1249            if let Some(filter) = filter {
1250                // next_mask: bloom filter check — the character following this trigram in the
1251                // query must also appear after this trigram somewhere in the file.
1252                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1253                    continue;
1254                }
1255                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1256                // between consecutive trigram pairs, but is NOT used as a single-trigram
1257                // filter because the position in the query string has no relationship to
1258                // the position in the file. Using it here causes false negatives.
1259            }
1260            if self.is_active_file(posting.file_id) {
1261                matches.push(posting.file_id);
1262            }
1263        }
1264
1265        matches
1266    }
1267}
1268
1269fn search_candidate_file(
1270    file: &FileEntry,
1271    matcher: &SearchMatcher,
1272    max_results: usize,
1273    stop_after: usize,
1274    total_matches: &AtomicUsize,
1275    files_searched: &AtomicUsize,
1276    files_with_matches: &AtomicUsize,
1277    truncated: &AtomicBool,
1278    engine_capped: &AtomicBool,
1279    stop_scan: Option<&Arc<AtomicBool>>,
1280) -> Vec<SharedGrepMatch> {
1281    if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1282        engine_capped.store(true, Ordering::Relaxed);
1283        return Vec::new();
1284    }
1285
1286    let content = match read_indexed_file_bytes(&file.path) {
1287        Some(content) => content,
1288        None => return Vec::new(),
1289    };
1290    // Defense in depth: even though indexing tries to filter binaries via
1291    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1292    // query time. content_inspector is fast (~bytes-per-cycle on a small
1293    // preview) and this guarantees we never surface matches inside binary
1294    // files even if the indexer somehow let one through (e.g. file changed
1295    // between indexing and query).
1296    if is_binary_bytes(&content) {
1297        return Vec::new();
1298    }
1299    files_searched.fetch_add(1, Ordering::Relaxed);
1300
1301    let shared_path = Arc::new(file.path.clone());
1302    let mut matches = Vec::new();
1303    let mut line_starts = None;
1304    let mut seen_lines = HashSet::new();
1305    let mut matched_this_file = false;
1306
1307    match matcher {
1308        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1309            let needle = &literal.needle;
1310            let finder = memchr::memmem::Finder::new(needle);
1311            let mut start = 0;
1312
1313            while let Some(position) = finder.find(&content[start..]) {
1314                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1315                    engine_capped.store(true, Ordering::Relaxed);
1316                    break;
1317                }
1318
1319                let offset = start + position;
1320                start = offset + 1;
1321
1322                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1323                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1324                if !seen_lines.insert(line) {
1325                    continue;
1326                }
1327
1328                matched_this_file = true;
1329                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1330                if match_number > max_results {
1331                    truncated.store(true, Ordering::Relaxed);
1332                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1333                    break;
1334                }
1335
1336                let end = offset + needle.len();
1337                matches.push(SharedGrepMatch {
1338                    file: shared_path.clone(),
1339                    line,
1340                    column,
1341                    line_text,
1342                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1343                });
1344            }
1345        }
1346        SearchMatcher::Literal(literal) => {
1347            let needle = &literal.needle;
1348            let search_content = content.to_ascii_lowercase();
1349            let finder = memchr::memmem::Finder::new(needle);
1350            let mut start = 0;
1351
1352            while let Some(position) = finder.find(&search_content[start..]) {
1353                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1354                    engine_capped.store(true, Ordering::Relaxed);
1355                    break;
1356                }
1357
1358                let offset = start + position;
1359                start = offset + 1;
1360
1361                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1362                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1363                if !seen_lines.insert(line) {
1364                    continue;
1365                }
1366
1367                matched_this_file = true;
1368                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1369                if match_number > max_results {
1370                    truncated.store(true, Ordering::Relaxed);
1371                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1372                    break;
1373                }
1374
1375                let end = offset + needle.len();
1376                matches.push(SharedGrepMatch {
1377                    file: shared_path.clone(),
1378                    line,
1379                    column,
1380                    line_text,
1381                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1382                });
1383            }
1384        }
1385        SearchMatcher::Regex(regex) => {
1386            for matched in regex.find_iter(&content) {
1387                if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1388                    engine_capped.store(true, Ordering::Relaxed);
1389                    break;
1390                }
1391
1392                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1393                let (line, column, line_text) =
1394                    line_details_bytes(&content, line_starts, matched.start());
1395                if !seen_lines.insert(line) {
1396                    continue;
1397                }
1398
1399                matched_this_file = true;
1400                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1401                if match_number > max_results {
1402                    truncated.store(true, Ordering::Relaxed);
1403                    signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1404                    break;
1405                }
1406
1407                matches.push(SharedGrepMatch {
1408                    file: shared_path.clone(),
1409                    line,
1410                    column,
1411                    line_text,
1412                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1413                });
1414            }
1415        }
1416    }
1417
1418    if matched_this_file {
1419        files_with_matches.fetch_add(1, Ordering::Relaxed);
1420    }
1421
1422    matches
1423}
1424
1425fn should_stop_search(
1426    truncated: &AtomicBool,
1427    total_matches: &AtomicUsize,
1428    stop_after: usize,
1429) -> bool {
1430    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1431}
1432
1433fn grep_scan_should_stop(
1434    stop_scan: Option<&Arc<AtomicBool>>,
1435    truncated: &AtomicBool,
1436    total_matches: &AtomicUsize,
1437    stop_after: usize,
1438) -> bool {
1439    stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1440        || should_stop_search(truncated, total_matches, stop_after)
1441}
1442
1443fn signal_grep_scan_cap(
1444    stop_scan: Option<&Arc<AtomicBool>>,
1445    total_matches: &AtomicUsize,
1446    stop_after: usize,
1447) {
1448    if let Some(flag) = stop_scan {
1449        if total_matches.load(Ordering::Relaxed) >= stop_after {
1450            flag.store(true, Ordering::Relaxed);
1451        }
1452    }
1453}
1454
1455/// Bound cold search-index ingestion to half the cores (cap 8), matching callgraph store.
1456fn search_index_build_pool_size() -> usize {
1457    std::thread::available_parallelism()
1458        .map(|parallelism| parallelism.get())
1459        .unwrap_or(1)
1460        .div_ceil(2)
1461        .clamp(1, 8)
1462}
1463
1464fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1465    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1466    let mut left_index = 0;
1467    let mut right_index = 0;
1468
1469    while left_index < left.len() && right_index < right.len() {
1470        match left[left_index].cmp(&right[right_index]) {
1471            std::cmp::Ordering::Less => left_index += 1,
1472            std::cmp::Ordering::Greater => right_index += 1,
1473            std::cmp::Ordering::Equal => {
1474                merged.push(left[left_index]);
1475                left_index += 1;
1476                right_index += 1;
1477            }
1478        }
1479    }
1480
1481    merged
1482}
1483
1484fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1485    let mut merged = Vec::with_capacity(left.len() + right.len());
1486    let mut left_index = 0;
1487    let mut right_index = 0;
1488
1489    while left_index < left.len() && right_index < right.len() {
1490        match left[left_index].cmp(&right[right_index]) {
1491            std::cmp::Ordering::Less => {
1492                merged.push(left[left_index]);
1493                left_index += 1;
1494            }
1495            std::cmp::Ordering::Greater => {
1496                merged.push(right[right_index]);
1497                right_index += 1;
1498            }
1499            std::cmp::Ordering::Equal => {
1500                merged.push(left[left_index]);
1501                left_index += 1;
1502                right_index += 1;
1503            }
1504        }
1505    }
1506
1507    merged.extend_from_slice(&left[left_index..]);
1508    merged.extend_from_slice(&right[right_index..]);
1509    merged
1510}
1511
1512pub fn decompose_regex(pattern: &str) -> RegexQuery {
1513    let hir = match regex_syntax::parse(pattern) {
1514        Ok(hir) => hir,
1515        Err(_) => return RegexQuery::default(),
1516    };
1517
1518    let build = build_query(&hir);
1519    build.into_query()
1520}
1521
1522pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1523    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1524}
1525
1526pub fn normalize_char(c: u8) -> u8 {
1527    c.to_ascii_lowercase()
1528}
1529
1530fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
1531    if content.len() < 3 {
1532        return;
1533    }
1534
1535    for start in 0..=content.len() - 3 {
1536        let trigram = pack_trigram(
1537            normalize_char(content[start]),
1538            normalize_char(content[start + 1]),
1539            normalize_char(content[start + 2]),
1540        );
1541        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1542        visit(trigram, next_char, start);
1543    }
1544}
1545
1546pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1547    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1548    scan_trigrams(content, |trigram, next_char, position| {
1549        trigrams.push((trigram, next_char, position));
1550    });
1551    trigrams
1552}
1553
1554fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
1555    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1556    scan_trigrams(content, |trigram, next_char, position| {
1557        let entry = filters.entry(trigram).or_default();
1558        if include_eof_next_char || next_char != EOF_SENTINEL {
1559            entry.next_mask |= mask_for_next_char(next_char);
1560        }
1561        entry.loc_mask |= mask_for_position(position);
1562    });
1563    filters
1564}
1565
1566pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1567    let mut seen = HashSet::new();
1568    let mut out = Vec::new();
1569    for token in tokens {
1570        scan_trigrams(token.as_bytes(), |trigram, _, _| {
1571            if seen.insert(trigram) {
1572                out.push(trigram);
1573            }
1574        });
1575    }
1576    out
1577}
1578
1579pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1580    if query_trigrams.is_empty() {
1581        return 0.0;
1582    }
1583
1584    let mut hits = 0u32;
1585    for &trigram in query_trigrams {
1586        if let Some(postings) = index.postings.get(&trigram) {
1587            if postings
1588                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1589                .is_ok()
1590            {
1591                hits += 1;
1592            }
1593        }
1594    }
1595
1596    if hits == 0 {
1597        return 0.0;
1598    }
1599
1600    let file_trigram_count = index
1601        .file_trigrams
1602        .get(&file_id)
1603        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1604    (hits as f32) / (1.0 + file_trigram_count.ln())
1605}
1606
1607pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1608    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1609    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1610        return PathBuf::from(override_dir)
1611            .join("index")
1612            .join(project_cache_key(project_root));
1613    }
1614    // Use configured storage dir (from plugin, XDG-compliant)
1615    if let Some(dir) = storage_dir {
1616        return dir.join("index").join(project_cache_key(project_root));
1617    }
1618    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1619    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1620    // If neither is set, fall back to a temp directory rather than `"."`
1621    // because the search-index code reads/writes absolute paths.
1622    let home = std::env::var_os("HOME")
1623        .or_else(|| std::env::var_os("USERPROFILE"))
1624        .map(PathBuf::from)
1625        .unwrap_or_else(std::env::temp_dir);
1626    home.join(".cache")
1627        .join("aft")
1628        .join("index")
1629        .join(project_cache_key(project_root))
1630}
1631
1632pub(crate) fn build_path_filters(
1633    include: &[String],
1634    exclude: &[String],
1635) -> Result<PathFilters, String> {
1636    Ok(PathFilters {
1637        includes: build_globset(include)?,
1638        excludes: build_globset(exclude)?,
1639    })
1640}
1641
1642pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1643    walk_project_files_from(root, root, filters)
1644}
1645
1646pub fn walk_project_files_bounded_default(
1647    root: &Path,
1648    max_files: usize,
1649) -> Result<Vec<PathBuf>, usize> {
1650    walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
1651}
1652
1653pub(crate) fn walk_project_files_bounded_matching<F>(
1654    root: &Path,
1655    filters: &PathFilters,
1656    max_files: usize,
1657    matches_file: F,
1658) -> Result<Vec<PathBuf>, usize>
1659where
1660    F: Fn(&Path) -> bool,
1661{
1662    walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
1663}
1664
1665pub fn walk_project_files_bounded_default_matching<F>(
1666    root: &Path,
1667    max_files: usize,
1668    matches_file: F,
1669) -> Result<Vec<PathBuf>, usize>
1670where
1671    F: Fn(&Path) -> bool,
1672{
1673    walk_project_files_from_inner_matching(
1674        root,
1675        root,
1676        &PathFilters::default(),
1677        Some(max_files),
1678        matches_file,
1679        true,
1680    )
1681}
1682
1683pub(crate) fn walk_project_files_from(
1684    filter_root: &Path,
1685    search_root: &Path,
1686    filters: &PathFilters,
1687) -> Vec<PathBuf> {
1688    walk_project_files_from_inner(filter_root, search_root, filters, None, true)
1689        .expect("unbounded project walk cannot exceed a file limit")
1690}
1691
1692pub(crate) fn for_each_walk_project_file_from<F>(
1693    filter_root: &Path,
1694    search_root: &Path,
1695    filters: &PathFilters,
1696    mut on_file: F,
1697) where
1698    F: FnMut(&PathBuf),
1699{
1700    let builder = project_walk_builder(search_root);
1701    for entry in builder.build().filter_map(|entry| entry.ok()) {
1702        if !entry
1703            .file_type()
1704            .map_or(false, |file_type| file_type.is_file())
1705        {
1706            continue;
1707        }
1708        let path = entry.into_path();
1709        if filters.matches(filter_root, &path) {
1710            on_file(&path);
1711        }
1712    }
1713}
1714
1715pub(crate) fn has_any_project_file_from(
1716    filter_root: &Path,
1717    search_root: &Path,
1718    filters: &PathFilters,
1719) -> bool {
1720    walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
1721}
1722
1723fn walk_project_files_from_inner(
1724    filter_root: &Path,
1725    search_root: &Path,
1726    filters: &PathFilters,
1727    max_files: Option<usize>,
1728    sort_by_mtime: bool,
1729) -> Result<Vec<PathBuf>, usize> {
1730    walk_project_files_from_inner_matching(
1731        filter_root,
1732        search_root,
1733        filters,
1734        max_files,
1735        |_| true,
1736        sort_by_mtime,
1737    )
1738}
1739
1740fn project_walk_builder(search_root: &Path) -> WalkBuilder {
1741    let mut builder = WalkBuilder::new(search_root);
1742    builder
1743        .hidden(false)
1744        .git_ignore(true)
1745        .git_global(true)
1746        .git_exclude(true)
1747        .add_custom_ignore_filename(".aftignore")
1748        .filter_entry(|entry| {
1749            let name = entry.file_name().to_string_lossy();
1750            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1751                return !matches!(
1752                    name.as_ref(),
1753                    "node_modules"
1754                        | "target"
1755                        | "venv"
1756                        | ".venv"
1757                        | ".git"
1758                        | "__pycache__"
1759                        | ".tox"
1760                        | "dist"
1761                        | "build"
1762                );
1763            }
1764            true
1765        });
1766    builder
1767}
1768
1769fn walk_project_files_from_inner_matching<F>(
1770    filter_root: &Path,
1771    search_root: &Path,
1772    filters: &PathFilters,
1773    max_files: Option<usize>,
1774    matches_file: F,
1775    sort_by_mtime: bool,
1776) -> Result<Vec<PathBuf>, usize>
1777where
1778    F: Fn(&Path) -> bool,
1779{
1780    let builder = project_walk_builder(search_root);
1781
1782    let mut files = Vec::new();
1783    for entry in builder.build().filter_map(|entry| entry.ok()) {
1784        if !entry
1785            .file_type()
1786            .map_or(false, |file_type| file_type.is_file())
1787        {
1788            continue;
1789        }
1790        let path = entry.into_path();
1791        if filters.matches(filter_root, &path) && matches_file(&path) {
1792            files.push(path);
1793            if max_files.is_some_and(|limit| files.len() > limit) {
1794                return Err(files.len());
1795            }
1796        }
1797    }
1798
1799    if sort_by_mtime {
1800        sort_paths_by_mtime_desc(&mut files);
1801    }
1802    Ok(files)
1803}
1804
1805pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1806    let bytes = fs::read(path).ok()?;
1807    if is_binary_bytes(&bytes) {
1808        return None;
1809    }
1810    String::from_utf8(bytes).ok()
1811}
1812
1813fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1814    fs::read(path).ok()
1815}
1816
1817pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1818    path.strip_prefix(root)
1819        .map(PathBuf::from)
1820        .unwrap_or_else(|_| path.to_path_buf())
1821}
1822
1823pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1824    let normalized_root = normalize_path(root);
1825    let normalized_path = normalize_path(path);
1826    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1827    validate_cached_relative_path(relative)
1828}
1829
1830pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1831    let relative = validate_cached_relative_path(relative_path)?;
1832    let normalized_root = normalize_path(root);
1833    let full_path = normalize_path(&normalized_root.join(relative));
1834
1835    match fs::canonicalize(&full_path) {
1836        Ok(canonical_path) => {
1837            if canonical_path.starts_with(&normalized_root) {
1838                return Some(full_path);
1839            }
1840
1841            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1842            canonical_path
1843                .starts_with(&canonical_root)
1844                .then_some(full_path)
1845        }
1846        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1847    }
1848}
1849
1850pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1851    if path.is_absolute() {
1852        return None;
1853    }
1854
1855    let mut normalized = PathBuf::new();
1856    for component in path.components() {
1857        match component {
1858            Component::Normal(part) => normalized.push(part),
1859            Component::CurDir => {}
1860            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1861        }
1862    }
1863    (!normalized.as_os_str().is_empty()).then_some(normalized)
1864}
1865
1866/// Sort paths newest-first by mtime, falling back to lexicographic order.
1867///
1868/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1869/// `sort_by()` closure. That made the comparator non-deterministic — a
1870/// `stat()` syscall for the same path can return different values across
1871/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1872/// concurrent file-watcher activity), and Rust's slice::sort panics at
1873/// runtime when it detects a non-total-order comparator. CI hit this on
1874/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1875///
1876/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1877/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1878pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1879    use std::collections::HashMap;
1880    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1881    for path in paths.iter() {
1882        mtimes
1883            .entry(path.clone())
1884            .or_insert_with(|| path_modified_time(path));
1885    }
1886    paths.sort_by(|left, right| {
1887        let left_mtime = mtimes.get(left).and_then(|v| *v);
1888        let right_mtime = mtimes.get(right).and_then(|v| *v);
1889        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1890    });
1891}
1892
1893/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1894/// the sort. Same fix, applied to grep matches that share files.
1895pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1896    use std::collections::HashMap;
1897    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1898    for m in matches.iter() {
1899        mtimes.entry(m.file.clone()).or_insert_with(|| {
1900            let resolved = resolve_match_path(project_root, &m.file);
1901            path_modified_time(&resolved)
1902        });
1903    }
1904    matches.sort_by(|left, right| {
1905        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1906        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1907        right_mtime
1908            .cmp(&left_mtime)
1909            .then_with(|| left.file.cmp(&right.file))
1910            .then_with(|| left.line.cmp(&right.line))
1911            .then_with(|| left.column.cmp(&right.column))
1912    });
1913}
1914
1915/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1916/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1917/// table from the search index), but it can still return different values if
1918/// the file is modified mid-sort. Snapshot once.
1919fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1920    matches: &mut [SharedGrepMatch],
1921    modified_for_path: F,
1922) where
1923    F: Fn(&Path) -> Option<SystemTime>,
1924{
1925    use std::collections::HashMap;
1926    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1927    for m in matches.iter() {
1928        let path = m.file.as_path().to_path_buf();
1929        mtimes
1930            .entry(path.clone())
1931            .or_insert_with(|| modified_for_path(&path));
1932    }
1933    matches.sort_by(|left, right| {
1934        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1935        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1936        right_mtime
1937            .cmp(&left_mtime)
1938            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1939            .then_with(|| left.line.cmp(&right.line))
1940            .then_with(|| left.column.cmp(&right.column))
1941    });
1942}
1943
1944pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1945    let resolved_project_root = canonicalize_or_normalize(project_root);
1946    let root = match path {
1947        Some(path) => {
1948            let path = PathBuf::from(path);
1949            if path.is_absolute() {
1950                canonicalize_or_normalize(&path)
1951            } else {
1952                normalize_path(&resolved_project_root.join(path))
1953            }
1954        }
1955        None => resolved_project_root.clone(),
1956    };
1957
1958    let use_index = is_within_search_root(&resolved_project_root, &root);
1959    SearchScope { root, use_index }
1960}
1961
1962pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1963    content_inspector::inspect(content).is_binary()
1964}
1965
1966pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1967    run_git(root, &["rev-parse", "HEAD"])
1968}
1969
1970pub fn project_cache_key(project_root: &Path) -> String {
1971    use sha2::{Digest, Sha256};
1972
1973    let mut hasher = Sha256::new();
1974
1975    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1976        // Git repo: root commit is the unique identity.
1977        // Same repo cloned anywhere produces the same key.
1978        hasher.update(root_commit.as_bytes());
1979    } else {
1980        // Non-git project: use the canonical filesystem path as identity.
1981        let canonical_root = canonicalize_or_normalize(project_root);
1982        hasher.update(canonical_root.to_string_lossy().as_bytes());
1983    }
1984
1985    let digest = format!("{:x}", hasher.finalize());
1986    digest[..16].to_string()
1987}
1988
1989/// Fingerprint corpus-shaping ignore rules that are not represented by git HEAD.
1990///
1991/// The search cache stores this value next to the file mtimes. If `.gitignore`,
1992/// `.aftignore`, or `.git/info/exclude` changes while AFT is not running, a
1993/// matching HEAD + matching file mtimes is not enough to safely reuse the old
1994/// cache: files that are now ignored may still be indexed. Hashing the ignore
1995/// files themselves makes cold-start cache reuse agree with the current walker.
1996pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
1997    use sha2::{Digest, Sha256};
1998
1999    let root = canonicalize_or_normalize(project_root);
2000    let mut files = Vec::new();
2001    collect_ignore_rule_files(&root, &mut files);
2002    if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
2003        if global_ignore.is_file() {
2004            files.push(global_ignore);
2005        }
2006    }
2007    let info_exclude = git_info_exclude_path(&root);
2008    if info_exclude.is_file() {
2009        files.push(info_exclude);
2010    }
2011    files.sort();
2012    files.dedup();
2013
2014    let mut hasher = Sha256::new();
2015    hasher.update(b"aft-ignore-rules-v1\0");
2016    for path in files {
2017        if let Some(relative) = cache_relative_path(&root, &path) {
2018            hasher.update(relative.to_string_lossy().as_bytes());
2019        } else {
2020            hasher.update(path.to_string_lossy().as_bytes());
2021        }
2022        hasher.update(b"\0");
2023        match fs::read(&path) {
2024            Ok(bytes) => hasher.update(&bytes),
2025            Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
2026        }
2027        hasher.update(b"\0");
2028    }
2029
2030    format!("{:x}", hasher.finalize())
2031}
2032
2033fn git_info_exclude_path(root: &Path) -> PathBuf {
2034    run_git(
2035        root,
2036        &["rev-parse", "--path-format=absolute", "--git-common-dir"],
2037    )
2038    .map(PathBuf::from)
2039    .unwrap_or_else(|| root.join(".git"))
2040    .join("info")
2041    .join("exclude")
2042}
2043
2044fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
2045    let mut builder = WalkBuilder::new(root);
2046    builder
2047        .hidden(false)
2048        .git_ignore(true)
2049        .git_global(true)
2050        .git_exclude(true)
2051        .add_custom_ignore_filename(".aftignore")
2052        .filter_entry(|entry| {
2053            let name = entry.file_name().to_string_lossy();
2054            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2055                return !matches!(
2056                    name.as_ref(),
2057                    ".git"
2058                        | "node_modules"
2059                        | "target"
2060                        | "venv"
2061                        | ".venv"
2062                        | "__pycache__"
2063                        | ".tox"
2064                        | "dist"
2065                        | "build"
2066                );
2067            }
2068            true
2069        });
2070
2071    for entry in builder.build().filter_map(|entry| entry.ok()) {
2072        if !entry
2073            .file_type()
2074            .map_or(false, |file_type| file_type.is_file())
2075        {
2076            continue;
2077        }
2078        let file_name = entry.file_name();
2079        if file_name == ".gitignore" || file_name == ".aftignore" {
2080            files.push(entry.into_path());
2081        }
2082    }
2083}
2084
2085/// Count directories visited when discovering ignore rule files (for perf regression tests).
2086#[cfg(test)]
2087pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
2088    let mut dirs = 0usize;
2089    let mut builder = WalkBuilder::new(root);
2090    builder
2091        .hidden(false)
2092        .git_ignore(true)
2093        .git_global(true)
2094        .git_exclude(true)
2095        .add_custom_ignore_filename(".aftignore");
2096    for entry in builder.build().filter_map(|entry| entry.ok()) {
2097        if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2098            dirs += 1;
2099        }
2100    }
2101    dirs
2102}
2103
2104/// Legacy stack-based discovery (pre ignore-walker fix); used only in perf tests.
2105#[cfg(test)]
2106pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
2107    let mut stack = vec![root.to_path_buf()];
2108    let mut dirs = 0usize;
2109    while let Some(dir) = stack.pop() {
2110        dirs += 1;
2111        let Ok(entries) = fs::read_dir(&dir) else {
2112            continue;
2113        };
2114        for entry in entries.flatten() {
2115            let path = entry.path();
2116            let file_name = entry.file_name();
2117            if file_name == ".gitignore" || file_name == ".aftignore" {
2118                continue;
2119            }
2120            let Ok(file_type) = entry.file_type() else {
2121                continue;
2122            };
2123            if !file_type.is_dir() || file_type.is_symlink() {
2124                continue;
2125            }
2126            if matches!(
2127                file_name.to_str().unwrap_or(""),
2128                ".git"
2129                    | "node_modules"
2130                    | "target"
2131                    | "venv"
2132                    | ".venv"
2133                    | "__pycache__"
2134                    | ".tox"
2135                    | "dist"
2136                    | "build"
2137            ) {
2138                continue;
2139            }
2140            stack.push(path);
2141        }
2142    }
2143    dirs
2144}
2145
2146impl PathFilters {
2147    fn matches(&self, root: &Path, path: &Path) -> bool {
2148        let relative = to_glob_path(&relative_to_root(root, path));
2149        if self
2150            .includes
2151            .as_ref()
2152            .is_some_and(|includes| !includes.is_match(&relative))
2153        {
2154            return false;
2155        }
2156        if self
2157            .excludes
2158            .as_ref()
2159            .is_some_and(|excludes| excludes.is_match(&relative))
2160        {
2161            return false;
2162        }
2163        true
2164    }
2165}
2166
2167fn canonicalize_or_normalize(path: &Path) -> PathBuf {
2168    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
2169}
2170
2171fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
2172    if path.is_absolute() {
2173        path.to_path_buf()
2174    } else {
2175        project_root.join(path)
2176    }
2177}
2178
2179fn path_modified_time(path: &Path) -> Option<SystemTime> {
2180    fs::metadata(path)
2181        .and_then(|metadata| metadata.modified())
2182        .ok()
2183}
2184
2185fn normalize_path(path: &Path) -> PathBuf {
2186    let mut result = PathBuf::new();
2187    for component in path.components() {
2188        match component {
2189            Component::ParentDir => {
2190                if !result.pop() {
2191                    result.push(component);
2192                }
2193            }
2194            Component::CurDir => {}
2195            _ => result.push(component),
2196        }
2197    }
2198    result
2199}
2200
2201fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
2202    if let Ok(canonical) = fs::canonicalize(path) {
2203        return canonical;
2204    }
2205
2206    let Some(parent) = path.parent() else {
2207        return path.to_path_buf();
2208    };
2209    let Some(file_name) = path.file_name() else {
2210        return path.to_path_buf();
2211    };
2212
2213    fs::canonicalize(parent)
2214        .map(|canonical_parent| canonical_parent.join(file_name))
2215        .unwrap_or_else(|_| path.to_path_buf())
2216}
2217
2218/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
2219/// since the index was last written. Also detect new files and deleted files.
2220fn verify_file_mtimes(index: &mut SearchIndex) {
2221    let filters = PathFilters::default();
2222    let current_files = walk_project_files(&index.project_root, &filters);
2223    let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
2224    let mut stale_paths = Vec::new();
2225    let mut removed_paths = Vec::new();
2226
2227    for entry in &mut index.files {
2228        if entry.path.as_os_str().is_empty() {
2229            continue; // tombstoned entry
2230        }
2231        if !current_file_set.contains(&entry.path) {
2232            removed_paths.push(entry.path.clone());
2233            continue;
2234        }
2235        let cached = FileFreshness {
2236            mtime: entry.modified,
2237            size: entry.size,
2238            content_hash: entry.content_hash,
2239        };
2240        match cache_freshness::verify_file_strict(&entry.path, &cached) {
2241            FreshnessVerdict::HotFresh => {}
2242            FreshnessVerdict::ContentFresh {
2243                new_mtime,
2244                new_size,
2245            } => {
2246                entry.modified = new_mtime;
2247                entry.size = new_size;
2248            }
2249            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
2250                stale_paths.push(entry.path.clone())
2251            }
2252        }
2253    }
2254
2255    for path in &removed_paths {
2256        index.remove_file(path);
2257    }
2258
2259    // Re-index stale files that are still in the current walk set. If an ignore
2260    // rule changed while AFT was down but the fingerprint missed it, this keeps
2261    // warm-cache verification from resurrecting now-ignored cached entries.
2262    for path in &stale_paths {
2263        if current_file_set.contains(path) {
2264            index.update_file(path);
2265        } else {
2266            index.remove_file(path);
2267        }
2268    }
2269
2270    // Detect new files not in the index
2271    for path in current_files {
2272        if !index.path_to_id.contains_key(&path) {
2273            index.update_file(&path);
2274        }
2275    }
2276
2277    if !stale_paths.is_empty() {
2278        crate::slog_info!(
2279            "search index: refreshed {} stale file(s) from disk cache",
2280            stale_paths.len()
2281        );
2282    }
2283}
2284
2285fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2286    normalize_path(path).starts_with(normalize_path(search_root))
2287}
2288
2289impl QueryBuild {
2290    fn into_query(self) -> RegexQuery {
2291        let mut query = RegexQuery::default();
2292
2293        for run in self.and_runs {
2294            add_run_to_and_query(&mut query, &run);
2295        }
2296
2297        for group in self.or_groups {
2298            let mut trigrams = BTreeSet::new();
2299            let mut filters = HashMap::new();
2300            for run in group {
2301                for (trigram, filter) in trigram_filters(&run) {
2302                    trigrams.insert(trigram);
2303                    merge_filter(filters.entry(trigram).or_default(), filter);
2304                }
2305            }
2306            if !trigrams.is_empty() {
2307                query.or_groups.push(trigrams.into_iter().collect());
2308                query.or_filters.push(filters);
2309            }
2310        }
2311
2312        query
2313    }
2314}
2315
2316fn build_query(hir: &Hir) -> QueryBuild {
2317    match hir.kind() {
2318        HirKind::Literal(literal) => {
2319            if literal.0.len() >= 3 {
2320                QueryBuild {
2321                    and_runs: vec![literal.0.to_vec()],
2322                    or_groups: Vec::new(),
2323                }
2324            } else {
2325                QueryBuild::default()
2326            }
2327        }
2328        HirKind::Capture(capture) => build_query(&capture.sub),
2329        HirKind::Concat(parts) => {
2330            let mut build = QueryBuild::default();
2331            for part in parts {
2332                let part_build = build_query(part);
2333                build.and_runs.extend(part_build.and_runs);
2334                build.or_groups.extend(part_build.or_groups);
2335            }
2336            build
2337        }
2338        HirKind::Alternation(parts) => {
2339            let mut group = Vec::new();
2340            for part in parts {
2341                let Some(mut choices) = guaranteed_run_choices(part) else {
2342                    return QueryBuild::default();
2343                };
2344                group.append(&mut choices);
2345            }
2346            if group.is_empty() {
2347                QueryBuild::default()
2348            } else {
2349                QueryBuild {
2350                    and_runs: Vec::new(),
2351                    or_groups: vec![group],
2352                }
2353            }
2354        }
2355        HirKind::Repetition(repetition) => {
2356            if repetition.min == 0 {
2357                QueryBuild::default()
2358            } else {
2359                build_query(&repetition.sub)
2360            }
2361        }
2362        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2363    }
2364}
2365
2366fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2367    match hir.kind() {
2368        HirKind::Literal(literal) => {
2369            if literal.0.len() >= 3 {
2370                Some(vec![literal.0.to_vec()])
2371            } else {
2372                None
2373            }
2374        }
2375        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2376        HirKind::Concat(parts) => {
2377            let mut runs = Vec::new();
2378            for part in parts {
2379                if let Some(mut part_runs) = guaranteed_run_choices(part) {
2380                    runs.append(&mut part_runs);
2381                }
2382            }
2383            if runs.is_empty() {
2384                None
2385            } else {
2386                Some(runs)
2387            }
2388        }
2389        HirKind::Alternation(parts) => {
2390            let mut runs = Vec::new();
2391            for part in parts {
2392                let Some(mut part_runs) = guaranteed_run_choices(part) else {
2393                    return None;
2394                };
2395                runs.append(&mut part_runs);
2396            }
2397            if runs.is_empty() {
2398                None
2399            } else {
2400                Some(runs)
2401            }
2402        }
2403        HirKind::Repetition(repetition) => {
2404            if repetition.min == 0 {
2405                None
2406            } else {
2407                guaranteed_run_choices(&repetition.sub)
2408            }
2409        }
2410        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2411    }
2412}
2413
2414fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2415    for (trigram, filter) in trigram_filters(run) {
2416        if !query.and_trigrams.contains(&trigram) {
2417            query.and_trigrams.push(trigram);
2418        }
2419        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2420    }
2421}
2422
2423fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2424    trigram_filter_map(run, false).into_iter().collect()
2425}
2426
2427fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2428    target.next_mask |= filter.next_mask;
2429    target.loc_mask |= filter.loc_mask;
2430}
2431
2432fn mask_for_next_char(next_char: u8) -> u8 {
2433    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2434    1u8 << bit
2435}
2436
2437fn mask_for_position(position: usize) -> u8 {
2438    1u8 << (position % 8)
2439}
2440
2441fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2442    if patterns.is_empty() {
2443        return Ok(None);
2444    }
2445
2446    let mut builder = GlobSetBuilder::new();
2447    for pattern in patterns {
2448        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2449        builder.add(glob);
2450    }
2451    builder.build().map(Some).map_err(|error| error.to_string())
2452}
2453
2454fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
2455    let mut buffer = [0u8; 4];
2456    reader.read_exact(&mut buffer)?;
2457    Ok(u32::from_le_bytes(buffer))
2458}
2459
2460fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
2461    let mut buffer = [0u8; 8];
2462    reader.read_exact(&mut buffer)?;
2463    Ok(u64::from_le_bytes(buffer))
2464}
2465
2466fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
2467    writer.write_all(&value.to_le_bytes())
2468}
2469
2470fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
2471    writer.write_all(&value.to_le_bytes())
2472}
2473
2474fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2475    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2476        return Err(std::io::Error::other("search index checksum missing"));
2477    };
2478    let expected = u32::from_le_bytes(*stored);
2479    let actual = crc32fast::hash(body);
2480    if actual != expected {
2481        return Err(std::io::Error::other("search index checksum mismatch"));
2482    }
2483    Ok(())
2484}
2485
2486fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2487    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2488    total_len.checked_sub(pos)
2489}
2490
2491fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2492    let output = Command::new("git")
2493        .arg("-C")
2494        .arg(root)
2495        .args(args)
2496        .output()
2497        .ok()?;
2498    if !output.status.success() {
2499        return None;
2500    }
2501    let value = String::from_utf8(output.stdout).ok()?;
2502    let value = value.trim().to_string();
2503    if value.is_empty() {
2504        None
2505    } else {
2506        Some(value)
2507    }
2508}
2509
2510fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2511    let diff_range = format!("{}..{}", from, to);
2512    let output = match Command::new("git")
2513        .arg("-C")
2514        .arg(root)
2515        .args(["diff", "--name-status", "-M", &diff_range])
2516        .output()
2517    {
2518        Ok(output) => output,
2519        Err(_) => return false,
2520    };
2521
2522    if !output.status.success() {
2523        return false;
2524    }
2525
2526    let Ok(diff) = String::from_utf8(output.stdout) else {
2527        return false;
2528    };
2529
2530    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2531        let mut fields = line.split('\t');
2532        let Some(status) = fields.next() else {
2533            continue;
2534        };
2535
2536        if status.starts_with('R') {
2537            let Some(old_path) = fields
2538                .next()
2539                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2540            else {
2541                continue;
2542            };
2543            let Some(new_path) = fields
2544                .next()
2545                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2546            else {
2547                continue;
2548            };
2549            index.remove_file(&old_path);
2550            index.update_file(&new_path);
2551            continue;
2552        }
2553
2554        let Some(path) = fields
2555            .next()
2556            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2557        else {
2558            continue;
2559        };
2560        if status.starts_with('D') || !path.exists() {
2561            index.remove_file(&path);
2562        } else {
2563            index.update_file(&path);
2564        }
2565    }
2566
2567    true
2568}
2569
2570fn is_binary_path(path: &Path, size: u64) -> bool {
2571    if size == 0 {
2572        return false;
2573    }
2574
2575    let mut file = match File::open(path) {
2576        Ok(file) => file,
2577        Err(_) => return true,
2578    };
2579
2580    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2581    match file.read(&mut preview) {
2582        Ok(read) => is_binary_bytes(&preview[..read]),
2583        Err(_) => true,
2584    }
2585}
2586
2587fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2588    let mut starts = vec![0usize];
2589    for (index, byte) in content.iter().copied().enumerate() {
2590        if byte == b'\n' {
2591            starts.push(index + 1);
2592        }
2593    }
2594    starts
2595}
2596
2597fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2598    let line_index = match line_starts.binary_search(&offset) {
2599        Ok(index) => index,
2600        Err(index) => index.saturating_sub(1),
2601    };
2602    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2603    let line_end = content[line_start..]
2604        .iter()
2605        .position(|byte| *byte == b'\n')
2606        .map(|length| line_start + length)
2607        .unwrap_or(content.len());
2608    let mut line_slice = &content[line_start..line_end];
2609    if line_slice.ends_with(b"\r") {
2610        line_slice = &line_slice[..line_slice.len() - 1];
2611    }
2612    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2613    let column = String::from_utf8_lossy(&content[line_start..offset])
2614        .chars()
2615        .count() as u32
2616        + 1;
2617    (line_index as u32 + 1, column, line_text)
2618}
2619
2620fn to_glob_path(path: &Path) -> String {
2621    path.to_string_lossy().replace('\\', "/")
2622}
2623
2624#[cfg(test)]
2625mod tests {
2626    use std::process::Command;
2627
2628    use super::*;
2629
2630    #[test]
2631    fn cached_path_under_root_allows_missing_lexical_child() {
2632        let dir = tempfile::tempdir().expect("create temp dir");
2633        let project = dir.path().join("project");
2634        fs::create_dir_all(&project).expect("create project dir");
2635        let root = fs::canonicalize(&project).expect("canonicalize project");
2636
2637        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2638            .expect("missing child should fall back to lexical validation");
2639
2640        assert_eq!(path, root.join("future/file.rs"));
2641    }
2642
2643    #[cfg(unix)]
2644    #[test]
2645    fn cached_path_under_root_rejects_symlink_escape() {
2646        let dir = tempfile::tempdir().expect("create temp dir");
2647        let project = dir.path().join("project");
2648        let outside = dir.path().join("outside");
2649        fs::create_dir_all(&project).expect("create project dir");
2650        fs::create_dir_all(&outside).expect("create outside dir");
2651        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2652        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2653        let root = fs::canonicalize(&project).expect("canonicalize project");
2654
2655        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2656    }
2657
2658    #[test]
2659    fn extract_trigrams_tracks_next_char_and_position() {
2660        let trigrams = extract_trigrams(b"Rust");
2661        assert_eq!(trigrams.len(), 2);
2662        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2663        assert_eq!(
2664            trigrams[1],
2665            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2666        );
2667    }
2668
2669    #[test]
2670    fn index_file_trigram_filters_match_legacy_extraction() {
2671        let dir = tempfile::tempdir().expect("create temp dir");
2672        let path = dir.path().join("sample.txt");
2673        let content = b"Rust rust RUST\nxy";
2674        fs::write(&path, content).expect("write sample");
2675
2676        let mut expected = BTreeMap::new();
2677        for (trigram, next_char, position) in extract_trigrams(content) {
2678            let entry: &mut PostingFilter = expected.entry(trigram).or_default();
2679            entry.next_mask |= mask_for_next_char(next_char);
2680            entry.loc_mask |= mask_for_position(position);
2681        }
2682
2683        let mut index = SearchIndex::new();
2684        index.project_root = dir.path().to_path_buf();
2685        index.index_file(&path, content);
2686
2687        let file_id = *index.path_to_id.get(&path).expect("file indexed");
2688        let file_trigrams = index.file_trigrams.get(&file_id).expect("file trigrams");
2689        assert_eq!(file_trigrams, &expected.keys().copied().collect::<Vec<_>>());
2690        for (trigram, filter) in expected {
2691            let postings = index.postings.get(&trigram).expect("posting list");
2692            assert_eq!(postings.len(), 1);
2693            assert_eq!(postings[0].file_id, file_id);
2694            assert_eq!(postings[0].next_mask, filter.next_mask);
2695            assert_eq!(postings[0].loc_mask, filter.loc_mask);
2696        }
2697    }
2698
2699    #[test]
2700    fn decompose_regex_extracts_literals_and_alternations() {
2701        let query = decompose_regex("abc(def|ghi)xyz");
2702        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2703        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2704        assert_eq!(query.or_groups.len(), 1);
2705        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2706        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2707    }
2708
2709    #[test]
2710    fn candidates_intersect_posting_lists() {
2711        let mut index = SearchIndex::new();
2712        let dir = tempfile::tempdir().expect("create temp dir");
2713        let alpha = dir.path().join("alpha.txt");
2714        let beta = dir.path().join("beta.txt");
2715        fs::write(&alpha, "abcdef").expect("write alpha");
2716        fs::write(&beta, "abcxyz").expect("write beta");
2717        index.project_root = dir.path().to_path_buf();
2718        index.index_file(&alpha, b"abcdef");
2719        index.index_file(&beta, b"abcxyz");
2720
2721        let query = RegexQuery {
2722            and_trigrams: vec![
2723                pack_trigram(b'a', b'b', b'c'),
2724                pack_trigram(b'd', b'e', b'f'),
2725            ],
2726            ..RegexQuery::default()
2727        };
2728
2729        let candidates = index.candidates(&query);
2730        assert_eq!(candidates.len(), 1);
2731        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2732    }
2733
2734    #[test]
2735    fn candidates_apply_bloom_filters() {
2736        let mut index = SearchIndex::new();
2737        let dir = tempfile::tempdir().expect("create temp dir");
2738        let file = dir.path().join("sample.txt");
2739        fs::write(&file, "abcd efgh").expect("write sample");
2740        index.project_root = dir.path().to_path_buf();
2741        index.index_file(&file, b"abcd efgh");
2742
2743        let trigram = pack_trigram(b'a', b'b', b'c');
2744        let matching_filter = PostingFilter {
2745            next_mask: mask_for_next_char(b'd'),
2746            loc_mask: mask_for_position(0),
2747        };
2748        let non_matching_filter = PostingFilter {
2749            next_mask: mask_for_next_char(b'z'),
2750            loc_mask: mask_for_position(0),
2751        };
2752
2753        assert_eq!(
2754            index
2755                .postings_for_trigram(trigram, Some(matching_filter))
2756                .len(),
2757            1
2758        );
2759        assert!(index
2760            .postings_for_trigram(trigram, Some(non_matching_filter))
2761            .is_empty());
2762    }
2763
2764    #[test]
2765    fn disk_round_trip_preserves_postings_and_files() {
2766        let dir = tempfile::tempdir().expect("create temp dir");
2767        let project = dir.path().join("project");
2768        fs::create_dir_all(&project).expect("create project dir");
2769        let file = project.join("src.txt");
2770        fs::write(&file, "abcdef").expect("write source");
2771
2772        let mut index = SearchIndex::build(&project);
2773        index.git_head = Some("deadbeef".to_string());
2774        let cache_dir = dir.path().join("cache");
2775        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2776
2777        let loaded =
2778            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2779        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2780        assert_eq!(loaded.files.len(), 1);
2781        assert_eq!(
2782            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2783            PathBuf::from("src.txt")
2784        );
2785        assert_eq!(loaded.postings.len(), index.postings.len());
2786        assert!(loaded
2787            .postings
2788            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2789    }
2790
2791    #[test]
2792    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2793        let root = PathBuf::from("/tmp/aft-project");
2794
2795        assert_eq!(
2796            cache_relative_path(&root, &root.join("src/lib.rs")),
2797            Some(PathBuf::from("src/lib.rs"))
2798        );
2799        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2800        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2801        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2802        assert_eq!(
2803            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2804            Some(root.join("src/lib.rs"))
2805        );
2806    }
2807
2808    #[test]
2809    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2810        let dir = tempfile::tempdir().expect("create temp dir");
2811        let project = dir.path().join("project");
2812        fs::create_dir_all(&project).expect("create project dir");
2813        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2814        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2815        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2816
2817        Command::new("git")
2818            .arg("init")
2819            .arg(&project)
2820            .status()
2821            .expect("git init");
2822        for args in [
2823            ["config", "user.email", "aft@example.invalid"],
2824            ["config", "user.name", "AFT Test"],
2825        ] {
2826            Command::new("git")
2827                .arg("-C")
2828                .arg(&project)
2829                .args(args)
2830                .status()
2831                .expect("git config");
2832        }
2833        Command::new("git")
2834            .arg("-C")
2835            .arg(&project)
2836            .args(["add", "."])
2837            .status()
2838            .expect("git add initial");
2839        Command::new("git")
2840            .arg("-C")
2841            .arg(&project)
2842            .args(["commit", "-m", "initial"])
2843            .status()
2844            .expect("git commit initial");
2845        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2846        let mut baseline = SearchIndex::build(&project);
2847        baseline.git_head = Some(previous.clone());
2848
2849        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2850        Command::new("git")
2851            .arg("-C")
2852            .arg(&project)
2853            .args(["add", "-A"])
2854            .status()
2855            .expect("git add rename");
2856        Command::new("git")
2857            .arg("-C")
2858            .arg(&project)
2859            .args(["commit", "-m", "rename"])
2860            .status()
2861            .expect("git commit rename");
2862        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2863
2864        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2865        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2866
2867        let refreshed = SearchIndex::rebuild_or_refresh(
2868            &project,
2869            DEFAULT_MAX_FILE_SIZE,
2870            Some(current),
2871            Some(baseline),
2872        );
2873
2874        assert!(!refreshed
2875            .path_to_id
2876            .contains_key(&canonical_project.join("old.txt")));
2877        assert!(refreshed
2878            .path_to_id
2879            .contains_key(&canonical_project.join("new.txt")));
2880        assert!(refreshed
2881            .path_to_id
2882            .contains_key(&canonical_project.join("untracked.txt")));
2883        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2884        assert_eq!(matches.matches.len(), 1);
2885    }
2886
2887    #[test]
2888    fn read_from_disk_rejects_corrupt_postings_checksum() {
2889        let dir = tempfile::tempdir().expect("create temp dir");
2890        let project = dir.path().join("project");
2891        fs::create_dir_all(&project).expect("create project dir");
2892        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2893
2894        let index = SearchIndex::build(&project);
2895        let cache_dir = dir.path().join("cache");
2896        index.write_to_disk(&cache_dir, None);
2897
2898        let cache_path = cache_dir.join("cache.bin");
2899        let mut bytes = fs::read(&cache_path).expect("read cache");
2900        let middle = bytes.len() / 2;
2901        bytes[middle] ^= 0xff;
2902        fs::write(&cache_path, bytes).expect("write corrupted cache");
2903
2904        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2905    }
2906
2907    #[test]
2908    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2909        let dir = tempfile::tempdir().expect("create temp dir");
2910        let project = dir.path().join("project");
2911        fs::create_dir_all(&project).expect("create project dir");
2912        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2913
2914        let index = SearchIndex::build(&project);
2915        let cache_dir = dir.path().join("cache");
2916        index.write_to_disk(&cache_dir, None);
2917
2918        assert!(cache_dir.join("cache.bin").is_file());
2919        assert!(fs::read_dir(&cache_dir)
2920            .expect("read cache dir")
2921            .all(|entry| !entry
2922                .expect("cache entry")
2923                .file_name()
2924                .to_string_lossy()
2925                .contains(".tmp.")));
2926    }
2927
2928    #[test]
2929    fn concurrent_search_index_writes_do_not_corrupt() {
2930        let dir = tempfile::tempdir().expect("create temp dir");
2931        let project = dir.path().join("project");
2932        fs::create_dir_all(&project).expect("create project dir");
2933        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2934        let cache_dir = dir.path().join("cache");
2935
2936        let a_project = project.clone();
2937        let a_cache = cache_dir.clone();
2938        let a = std::thread::spawn(move || {
2939            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2940            let index = SearchIndex::build(&a_project);
2941            index.write_to_disk(&a_cache, None);
2942        });
2943        let b_project = project.clone();
2944        let b_cache = cache_dir.clone();
2945        let b = std::thread::spawn(move || {
2946            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2947            let index = SearchIndex::build(&b_project);
2948            index.write_to_disk(&b_cache, None);
2949        });
2950        a.join().expect("writer a");
2951        b.join().expect("writer b");
2952
2953        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2954    }
2955
2956    #[test]
2957    fn search_index_atomic_rename_survives_partial_write() {
2958        let dir = tempfile::tempdir().expect("create temp dir");
2959        let cache_dir = dir.path().join("cache");
2960        fs::create_dir_all(&cache_dir).expect("create cache dir");
2961        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2962
2963        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2964    }
2965
2966    #[test]
2967    fn project_cache_key_includes_checkout_path() {
2968        let dir = tempfile::tempdir().expect("create temp dir");
2969        let source = dir.path().join("source");
2970        fs::create_dir_all(&source).expect("create source repo dir");
2971        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2972
2973        assert!(Command::new("git")
2974            .current_dir(&source)
2975            .args(["init"])
2976            .status()
2977            .expect("init git repo")
2978            .success());
2979        assert!(Command::new("git")
2980            .current_dir(&source)
2981            .args(["add", "."])
2982            .status()
2983            .expect("git add")
2984            .success());
2985        assert!(Command::new("git")
2986            .current_dir(&source)
2987            .args([
2988                "-c",
2989                "user.name=AFT Tests",
2990                "-c",
2991                "user.email=aft-tests@example.com",
2992                "commit",
2993                "-m",
2994                "initial",
2995            ])
2996            .status()
2997            .expect("git commit")
2998            .success());
2999
3000        let clone = dir.path().join("clone");
3001        assert!(Command::new("git")
3002            .args(["clone", "--quiet"])
3003            .arg(&source)
3004            .arg(&clone)
3005            .status()
3006            .expect("git clone")
3007            .success());
3008
3009        let source_key = project_cache_key(&source);
3010        let clone_key = project_cache_key(&clone);
3011
3012        assert_eq!(source_key.len(), 16);
3013        assert_eq!(clone_key.len(), 16);
3014        // Same repo (same root commit) → same cache key regardless of clone path
3015        assert_eq!(source_key, clone_key);
3016    }
3017
3018    #[test]
3019    fn git_head_unchanged_picks_up_local_edits() {
3020        let dir = tempfile::tempdir().expect("create temp dir");
3021        let project = dir.path().join("repo");
3022        fs::create_dir_all(&project).expect("create repo dir");
3023        let file = project.join("tracked.txt");
3024        fs::write(&file, "oldtoken\n").expect("write file");
3025        assert!(Command::new("git")
3026            .current_dir(&project)
3027            .arg("init")
3028            .status()
3029            .unwrap()
3030            .success());
3031        assert!(Command::new("git")
3032            .current_dir(&project)
3033            .args(["add", "."])
3034            .status()
3035            .unwrap()
3036            .success());
3037        assert!(Command::new("git")
3038            .current_dir(&project)
3039            .args([
3040                "-c",
3041                "user.name=AFT Tests",
3042                "-c",
3043                "user.email=aft-tests@example.com",
3044                "commit",
3045                "-m",
3046                "initial"
3047            ])
3048            .status()
3049            .unwrap()
3050            .success());
3051        let head = current_git_head(&project);
3052        let mut baseline = SearchIndex::build(&project);
3053        baseline.git_head = head.clone();
3054        fs::write(&file, "newtoken\n").expect("edit tracked file");
3055
3056        let refreshed =
3057            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
3058        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
3059
3060        assert_eq!(result.total_matches, 1);
3061    }
3062
3063    #[test]
3064    fn non_git_project_reuses_cache_when_files_unchanged() {
3065        let dir = tempfile::tempdir().expect("create temp dir");
3066        let project = dir.path().join("project");
3067        fs::create_dir_all(&project).expect("create project dir");
3068        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
3069        let baseline = SearchIndex::build(&project);
3070        let baseline_file_count = baseline.file_count();
3071
3072        let refreshed =
3073            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
3074
3075        assert_eq!(refreshed.file_count(), baseline_file_count);
3076        assert_eq!(
3077            refreshed
3078                .grep("unchangedtoken", true, &[], &[], &project, 10)
3079                .total_matches,
3080            1
3081        );
3082    }
3083
3084    #[test]
3085    fn resolve_search_scope_disables_index_for_external_path() {
3086        let dir = tempfile::tempdir().expect("create temp dir");
3087        let project = dir.path().join("project");
3088        let outside = dir.path().join("outside");
3089        fs::create_dir_all(&project).expect("create project dir");
3090        fs::create_dir_all(&outside).expect("create outside dir");
3091
3092        let scope = resolve_search_scope(&project, outside.to_str());
3093
3094        assert_eq!(
3095            scope.root,
3096            fs::canonicalize(&outside).expect("canonicalize outside")
3097        );
3098        assert!(!scope.use_index);
3099    }
3100
3101    #[test]
3102    fn grep_filters_matches_to_search_root() {
3103        let dir = tempfile::tempdir().expect("create temp dir");
3104        let project = dir.path().join("project");
3105        let src = project.join("src");
3106        let docs = project.join("docs");
3107        fs::create_dir_all(&src).expect("create src dir");
3108        fs::create_dir_all(&docs).expect("create docs dir");
3109        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
3110        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
3111
3112        let index = SearchIndex::build(&project);
3113        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
3114
3115        assert_eq!(result.files_searched, 1);
3116        assert_eq!(result.files_with_matches, 1);
3117        assert_eq!(result.matches.len(), 1);
3118        // Index stores canonicalized paths; on macOS /var → /private/var
3119        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
3120        assert_eq!(result.matches[0].file, expected);
3121    }
3122
3123    #[test]
3124    fn grep_deduplicates_multiple_matches_on_same_line() {
3125        let dir = tempfile::tempdir().expect("create temp dir");
3126        let project = dir.path().join("project");
3127        let src = project.join("src");
3128        fs::create_dir_all(&src).expect("create src dir");
3129        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
3130
3131        let index = SearchIndex::build(&project);
3132        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
3133
3134        assert_eq!(result.total_matches, 1);
3135        assert_eq!(result.matches.len(), 1);
3136    }
3137
3138    #[test]
3139    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
3140        let dir = tempfile::tempdir().expect("create temp dir");
3141        let project = dir.path().join("project");
3142        fs::create_dir_all(&project).expect("create project dir");
3143        let file = project.join("unicode.txt");
3144        fs::write(&file, "äbc\n").expect("write unicode file");
3145
3146        let index = SearchIndex::build(&project);
3147        let result = index.grep("Äbc", false, &[], &[], &project, 10);
3148
3149        assert_eq!(result.total_matches, 1);
3150        assert_eq!(result.matches.len(), 1);
3151        assert_eq!(
3152            result.matches[0].file,
3153            fs::canonicalize(file).expect("canonicalize unicode file")
3154        );
3155    }
3156
3157    #[test]
3158    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
3159        let dir = tempfile::tempdir().expect("create temp dir");
3160        let project = dir.path().join("project");
3161        fs::create_dir_all(&project).expect("create project dir");
3162        let file = project.join("tokens.txt");
3163        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
3164        fs::write(&file, "alpha").expect("write original file");
3165        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
3166
3167        let baseline = SearchIndex::build(&project);
3168        fs::write(&file, "bravo").expect("write same-size edit");
3169        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
3170
3171        let refreshed =
3172            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
3173        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
3174        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
3175        let refreshed_id = *refreshed
3176            .path_to_id
3177            .get(&canonical_file)
3178            .expect("file remains indexed");
3179
3180        assert_eq!(result.total_matches, 1);
3181        assert!(refreshed
3182            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
3183            .contains(&refreshed_id));
3184        assert!(!refreshed
3185            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
3186            .contains(&refreshed_id));
3187    }
3188
3189    #[test]
3190    fn grep_reports_total_matches_before_truncation() {
3191        let dir = tempfile::tempdir().expect("create temp dir");
3192        let project = dir.path().join("project");
3193        let src = project.join("src");
3194        fs::create_dir_all(&src).expect("create src dir");
3195        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
3196
3197        let index = SearchIndex::build(&project);
3198        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
3199
3200        assert_eq!(result.total_matches, 2);
3201        assert_eq!(result.matches.len(), 1);
3202        assert!(result.truncated);
3203    }
3204
3205    #[test]
3206    fn glob_filters_results_to_search_root() {
3207        let dir = tempfile::tempdir().expect("create temp dir");
3208        let project = dir.path().join("project");
3209        let src = project.join("src");
3210        let scripts = project.join("scripts");
3211        fs::create_dir_all(&src).expect("create src dir");
3212        fs::create_dir_all(&scripts).expect("create scripts dir");
3213        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
3214        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
3215
3216        let index = SearchIndex::build(&project);
3217        let files = index.glob("**/*.rs", &src);
3218
3219        assert_eq!(
3220            files,
3221            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
3222        );
3223    }
3224
3225    #[test]
3226    fn glob_includes_hidden_and_binary_files() {
3227        let dir = tempfile::tempdir().expect("create temp dir");
3228        let project = dir.path().join("project");
3229        let hidden_dir = project.join(".hidden");
3230        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
3231        let hidden_file = hidden_dir.join("data.bin");
3232        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
3233
3234        let index = SearchIndex::build(&project);
3235        let files = index.glob("**/*.bin", &project);
3236
3237        assert_eq!(
3238            files,
3239            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
3240        );
3241    }
3242
3243    #[test]
3244    fn read_from_disk_rejects_invalid_nanos() {
3245        let dir = tempfile::tempdir().expect("create temp dir");
3246        let cache_dir = dir.path().join("cache");
3247        fs::create_dir_all(&cache_dir).expect("create cache dir");
3248
3249        let mut postings = Vec::new();
3250        postings.extend_from_slice(INDEX_MAGIC);
3251        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3252        postings.extend_from_slice(&0u32.to_le_bytes());
3253        postings.extend_from_slice(&1u32.to_le_bytes());
3254        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
3255        postings.extend_from_slice(&1u32.to_le_bytes());
3256        postings.extend_from_slice(b"/");
3257        postings.push(0u8);
3258        postings.extend_from_slice(&1u32.to_le_bytes());
3259        postings.extend_from_slice(&0u64.to_le_bytes());
3260        postings.extend_from_slice(&0u64.to_le_bytes());
3261        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
3262        postings.extend_from_slice(b"a");
3263        postings.extend_from_slice(&0u64.to_le_bytes());
3264
3265        let mut lookup = Vec::new();
3266        lookup.extend_from_slice(LOOKUP_MAGIC);
3267        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3268        lookup.extend_from_slice(&0u32.to_le_bytes());
3269
3270        let postings_checksum = crc32fast::hash(&postings);
3271        postings.extend_from_slice(&postings_checksum.to_le_bytes());
3272        let lookup_checksum = crc32fast::hash(&lookup);
3273        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
3274        let mut cache = Vec::new();
3275        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
3276        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3277        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
3278        cache.extend_from_slice(&postings);
3279        cache.extend_from_slice(&lookup);
3280        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
3281
3282        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
3283    }
3284
3285    #[test]
3286    fn parallel_cold_build_matches_serial_index() {
3287        let dir = tempfile::tempdir().expect("create temp dir");
3288        let project = dir.path().join("project");
3289        for index in 0..80 {
3290            let sub = project.join(format!("pkg_{index:03}"));
3291            fs::create_dir_all(&sub).expect("create subdir");
3292            fs::write(
3293                sub.join("lib.rs"),
3294                format!(
3295                    "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
3296                ),
3297            )
3298            .expect("write lib");
3299        }
3300
3301        let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
3302        let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);
3303
3304        assert_eq!(serial.file_count(), parallel.file_count());
3305        assert_eq!(serial.trigram_count(), parallel.trigram_count());
3306        assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
3307        for (path, id) in &serial.path_to_id {
3308            assert_eq!(parallel.path_to_id.get(path), Some(id));
3309        }
3310    }
3311
3312    #[test]
3313    fn ignore_rule_discovery_respects_gitignore() {
3314        let dir = tempfile::tempdir().expect("create temp dir");
3315        let project = dir.path().join("project");
3316        fs::create_dir_all(project.join("src")).expect("mkdir src");
3317        fs::write(project.join("src/.gitignore"), "data/\n").expect("write gitignore");
3318        let data = project.join("src/data");
3319        fs::create_dir_all(&data).expect("mkdir data");
3320        for index in 0..200 {
3321            fs::create_dir_all(data.join(format!("d{index}"))).expect("mkdir nested");
3322            fs::write(data.join(format!("d{index}/f.rs")), "fn ignored() {}\n")
3323                .expect("write ignored file");
3324        }
3325
3326        Command::new("git")
3327            .arg("init")
3328            .arg(&project)
3329            .status()
3330            .expect("git init");
3331        for args in [
3332            ["config", "user.email", "aft@example.invalid"],
3333            ["config", "user.name", "AFT Test"],
3334        ] {
3335            Command::new("git")
3336                .arg("-C")
3337                .arg(&project)
3338                .args(args)
3339                .status()
3340                .expect("git config");
3341        }
3342        Command::new("git")
3343            .arg("-C")
3344            .arg(&project)
3345            .args(["add", "."])
3346            .status()
3347            .expect("git add");
3348        Command::new("git")
3349            .arg("-C")
3350            .arg(&project)
3351            .args(["commit", "-m", "initial"])
3352            .status()
3353            .expect("git commit");
3354
3355        let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project);
3356        let walker_dirs = count_ignore_rule_discovery_dirs(&project);
3357        assert!(
3358            legacy_dirs > walker_dirs,
3359            "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
3360        );
3361        assert!(
3362            walker_dirs < 50,
3363            "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
3364        );
3365    }
3366
3367    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
3368    /// changed between cmp() calls.
3369    ///
3370    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
3371    /// which does a `stat()` syscall. If the file was deleted, modified, or
3372    /// touched mid-sort, the comparator returned different values for the
3373    /// same input pair on different invocations. Rust's slice::sort detects
3374    /// this and panics with "user-provided comparison function does not
3375    /// correctly implement a total order".
3376    ///
3377    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
3378    /// bridge invalidated files in parallel with grep's sort path. This
3379    /// test simulates the worst case: most paths don't exist (Err from
3380    /// fs::metadata) and sort still completes successfully.
3381    #[test]
3382    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
3383        // Mix of existing and non-existing paths in deliberately
3384        // non-monotonic order — pre-fix, the sort would call stat() at
3385        // least N log N times and any flakiness would trigger the panic.
3386        let dir = tempfile::tempdir().expect("create tempdir");
3387        let mut paths: Vec<PathBuf> = Vec::new();
3388        for i in 0..30 {
3389            // Half exist, half don't.
3390            let path = if i % 2 == 0 {
3391                let p = dir.path().join(format!("real-{i}.rs"));
3392                fs::write(&p, format!("// {i}\n")).expect("write");
3393                p
3394            } else {
3395                dir.path().join(format!("missing-{i}.rs"))
3396            };
3397            paths.push(path);
3398        }
3399
3400        // Run the sort many times to maximise the chance of catching any
3401        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
3402        for _ in 0..50 {
3403            let mut copy = paths.clone();
3404            sort_paths_by_mtime_desc(&mut copy);
3405            assert_eq!(copy.len(), paths.len());
3406        }
3407    }
3408
3409    /// Regression: the indexed parallel search's reduce() combine closure must
3410    /// NOT set engine_capped. reduce runs on every partial-result merge in a
3411    /// multi-chunk parallel search (>10 candidate files), capped or not — an
3412    /// unconditional store there falsely reported every such grep as capped,
3413    /// lying to the agent that results were truncated.
3414    #[test]
3415    fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
3416        let dir = tempfile::tempdir().expect("create tempdir");
3417        // >10 files so the parallel (reduce) branch is taken, each with exactly
3418        // one match, and a generous cap so the search is NOT actually capped.
3419        for i in 0..40 {
3420            fs::write(
3421                dir.path().join(format!("file-{i}.rs")),
3422                format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n"),
3423            )
3424            .expect("write");
3425        }
3426        let index = SearchIndex::build_with_limit(dir.path(), DEFAULT_MAX_FILE_SIZE);
3427        let result = index.grep("needle_token", false, &[], &[], dir.path(), 1000);
3428        assert!(
3429            result.matches.len() >= 40,
3430            "expected a match per file, got {}",
3431            result.matches.len()
3432        );
3433        assert!(
3434            !result.engine_capped,
3435            "an uncapped grep over >10 files must not report engine_capped"
3436        );
3437        assert!(!result.truncated, "uncapped grep must not be truncated");
3438    }
3439
3440    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
3441    /// the same conditions as sort_paths_by_mtime_desc. See the
3442    /// sort_paths_... test above for the full rationale.
3443    #[test]
3444    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
3445        let dir = tempfile::tempdir().expect("create tempdir");
3446        let mut matches: Vec<GrepMatch> = Vec::new();
3447        for i in 0..30 {
3448            let file = if i % 2 == 0 {
3449                let p = dir.path().join(format!("real-{i}.rs"));
3450                fs::write(&p, format!("// {i}\n")).expect("write");
3451                p
3452            } else {
3453                dir.path().join(format!("missing-{i}.rs"))
3454            };
3455            matches.push(GrepMatch {
3456                file,
3457                line: u32::try_from(i).unwrap_or(0),
3458                column: 0,
3459                line_text: format!("match {i}"),
3460                match_text: format!("match {i}"),
3461            });
3462        }
3463
3464        for _ in 0..50 {
3465            let mut copy = matches.clone();
3466            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
3467            assert_eq!(copy.len(), matches.len());
3468        }
3469    }
3470}
aft/search_index.rs

aft/
search_index.rs