Skip to main content

aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const INDEX_VERSION: u32 = 4;
27const PREVIEW_BYTES: usize = 8 * 1024;
28const EOF_SENTINEL: u8 = 0;
29const MAX_ENTRIES: usize = 10_000_000;
30const MIN_FILE_ENTRY_BYTES: usize = 57;
31const LOOKUP_ENTRY_BYTES: usize = 16;
32const POSTING_BYTES: usize = 6;
33static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
35pub struct CacheLock {
36    _guard: fs_lock::LockGuard,
37}
38
39impl CacheLock {
40    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41        fs::create_dir_all(cache_dir)?;
42        let path = cache_dir.join("cache.lock");
43        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44            .lock()
45            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46        fs_lock::try_acquire(&path, Duration::from_secs(2))
47            .map(|guard| Self { _guard: guard })
48            .map_err(|error| match error {
49                fs_lock::AcquireError::Timeout => {
50                    std::io::Error::other("timed out acquiring search cache lock")
51                }
52                fs_lock::AcquireError::Io(error) => error,
53            })
54    }
55}
56
57#[derive(Clone, Debug)]
58pub struct SearchIndex {
59    pub postings: HashMap<u32, Vec<Posting>>,
60    pub files: Vec<FileEntry>,
61    pub path_to_id: HashMap<PathBuf, u32>,
62    pub ready: bool,
63    project_root: PathBuf,
64    git_head: Option<String>,
65    max_file_size: u64,
66    ignore_rules_fingerprint: String,
67    pub file_trigrams: HashMap<u32, Vec<u32>>,
68    unindexed_files: HashSet<u32>,
69}
70
71#[derive(Clone, Debug, Default)]
72pub struct LexicalRankResult {
73    pub files: Vec<(PathBuf, f32)>,
74    pub engine_capped: bool,
75}
76
77impl SearchIndex {
78    /// Number of indexed files.
79    pub fn file_count(&self) -> usize {
80        self.files.len()
81    }
82
83    /// Number of unique trigrams in the index.
84    pub fn trigram_count(&self) -> usize {
85        self.postings.len()
86    }
87
88    /// Compute distinct query trigrams from literal tokens.
89    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
90        query_trigrams_from_tokens(tokens)
91    }
92
93    /// Score-rank file candidates by lexical relevance to query trigrams.
94    pub fn lexical_rank(
95        &self,
96        query_trigrams: &[u32],
97        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
98        max_files: usize,
99    ) -> Vec<(PathBuf, f32)> {
100        self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
101            .files
102    }
103
104    /// Score-rank file candidates and report whether pre-filter candidate
105    /// enumeration hit the internal 200/500 cap before ranking.
106    pub fn lexical_rank_with_stats(
107        &self,
108        query_trigrams: &[u32],
109        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
110        max_files: usize,
111    ) -> LexicalRankResult {
112        if query_trigrams.is_empty() || max_files == 0 {
113            return LexicalRankResult::default();
114        }
115
116        let mut non_zero: Vec<(u32, usize)> = query_trigrams
117            .iter()
118            .filter_map(|trigram| {
119                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
120                (posting_count > 0).then_some((*trigram, posting_count))
121            })
122            .collect();
123        if non_zero.is_empty() {
124            return LexicalRankResult::default();
125        }
126
127        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
128        let selected_count = non_zero.len().min(3);
129        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
130
131        let mut candidate_ids = BTreeSet::new();
132        for (trigram, _) in non_zero.iter().take(selected_count) {
133            if let Some(postings) = self.postings.get(trigram) {
134                for posting in postings {
135                    if self.is_active_file(posting.file_id) {
136                        candidate_ids.insert(posting.file_id);
137                    }
138                }
139            }
140        }
141        let pre_filter_candidate_count = candidate_ids.len();
142        let engine_capped = pre_filter_candidate_count > candidate_cap;
143        let filtered_candidates = candidate_ids
144            .into_iter()
145            .filter_map(|file_id| {
146                self.files
147                    .get(file_id as usize)
148                    .map(|entry| (file_id, entry))
149            })
150            .filter(|(_, entry)| {
151                if let Some(filter) = candidate_filter {
152                    filter(&entry.path)
153                } else {
154                    true
155                }
156            })
157            .collect::<Vec<_>>();
158
159        let mut ranked = Vec::new();
160        for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
161            let score = lexical_score(self, query_trigrams, file_id);
162            if score > 0.0 {
163                ranked.push((entry.path.clone(), score));
164            }
165        }
166
167        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
168        ranked.truncate(max_files);
169        LexicalRankResult {
170            files: ranked,
171            engine_capped,
172        }
173    }
174}
175
176#[derive(Clone, Debug, PartialEq, Eq)]
177pub struct Posting {
178    pub file_id: u32,
179    pub next_mask: u8,
180    pub loc_mask: u8,
181}
182
183#[derive(Clone, Debug)]
184pub struct FileEntry {
185    pub path: PathBuf,
186    pub size: u64,
187    pub modified: SystemTime,
188    pub content_hash: blake3::Hash,
189}
190
191#[derive(Clone, Debug, PartialEq, Eq)]
192pub struct GrepMatch {
193    pub file: PathBuf,
194    pub line: u32,
195    pub column: u32,
196    pub line_text: String,
197    pub match_text: String,
198}
199
200#[derive(Clone, Debug)]
201pub struct GrepResult {
202    pub matches: Vec<GrepMatch>,
203    pub total_matches: usize,
204    pub files_searched: usize,
205    pub files_with_matches: usize,
206    pub index_status: IndexStatus,
207    pub truncated: bool,
208    pub fully_degraded: bool,
209    pub engine_capped: bool,
210}
211
212#[derive(Clone, Copy, Debug, PartialEq, Eq)]
213pub enum IndexStatus {
214    Ready,
215    Building,
216    Fallback,
217    Disabled,
218}
219
220impl IndexStatus {
221    pub fn as_str(&self) -> &'static str {
222        match self {
223            IndexStatus::Ready => "Ready",
224            IndexStatus::Building => "Building",
225            IndexStatus::Fallback => "Fallback",
226            IndexStatus::Disabled => "Disabled",
227        }
228    }
229}
230
231#[derive(Clone, Debug, Default)]
232pub struct RegexQuery {
233    pub and_trigrams: Vec<u32>,
234    pub or_groups: Vec<Vec<u32>>,
235    pub(crate) and_filters: HashMap<u32, PostingFilter>,
236    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
237}
238
239#[derive(Clone, Copy, Debug, Default)]
240pub(crate) struct PostingFilter {
241    next_mask: u8,
242    loc_mask: u8,
243}
244
245#[derive(Clone, Debug, Default)]
246struct QueryBuild {
247    and_runs: Vec<Vec<u8>>,
248    or_groups: Vec<Vec<Vec<u8>>>,
249}
250
251#[derive(Clone, Debug, Default)]
252pub(crate) struct PathFilters {
253    includes: Option<GlobSet>,
254    excludes: Option<GlobSet>,
255}
256
257#[derive(Clone, Debug)]
258pub(crate) struct SearchScope {
259    pub root: PathBuf,
260    pub use_index: bool,
261}
262
263#[derive(Clone, Debug)]
264struct SharedGrepMatch {
265    file: Arc<PathBuf>,
266    line: u32,
267    column: u32,
268    line_text: String,
269    match_text: String,
270}
271
272#[derive(Clone, Debug)]
273enum SearchMatcher {
274    Literal(LiteralSearch),
275    Regex(Regex),
276}
277
278impl SearchIndex {
279    pub fn new() -> Self {
280        SearchIndex {
281            postings: HashMap::new(),
282            files: Vec::new(),
283            path_to_id: HashMap::new(),
284            ready: false,
285            project_root: PathBuf::new(),
286            git_head: None,
287            max_file_size: DEFAULT_MAX_FILE_SIZE,
288            ignore_rules_fingerprint: String::new(),
289            file_trigrams: HashMap::new(),
290            unindexed_files: HashSet::new(),
291        }
292    }
293
294    pub fn build(root: &Path) -> Self {
295        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
296    }
297
298    pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
299        let started = std::time::Instant::now();
300        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
301        let mut index = SearchIndex {
302            project_root: project_root.clone(),
303            max_file_size,
304            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
305            ..SearchIndex::new()
306        };
307
308        let filters = PathFilters::default();
309        let mut indexed = 0usize;
310        for path in walk_project_files(&project_root, &filters) {
311            index.update_file(&path);
312            indexed += 1;
313        }
314
315        index.git_head = current_git_head(&project_root);
316        index.ready = true;
317        crate::slog_info!(
318            "search index cold build: {} files, {} trigrams, {} ms",
319            indexed,
320            index.postings.len(),
321            started.elapsed().as_millis()
322        );
323        index
324    }
325
326    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
327        self.remove_file(path);
328
329        let file_id = match self.allocate_file_id(path, content.len() as u64) {
330            Some(file_id) => file_id,
331            None => return,
332        };
333        if let Some(file) = self.files.get_mut(file_id as usize) {
334            file.content_hash = cache_freshness::hash_bytes(content);
335        }
336
337        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
338        for (trigram, next_char, position) in extract_trigrams(content) {
339            let entry = trigram_map.entry(trigram).or_default();
340            entry.next_mask |= mask_for_next_char(next_char);
341            entry.loc_mask |= mask_for_position(position);
342        }
343
344        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
345        for (trigram, filter) in trigram_map {
346            let postings = self.postings.entry(trigram).or_default();
347            postings.push(Posting {
348                file_id,
349                next_mask: filter.next_mask,
350                loc_mask: filter.loc_mask,
351            });
352            // Posting lists are kept sorted by file_id for binary search during
353            // intersection. Since file_ids are allocated incrementally, the new
354            // entry is usually already in order. Only sort when needed.
355            if postings.len() > 1
356                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
357            {
358                postings.sort_unstable_by_key(|p| p.file_id);
359            }
360            file_trigrams.push(trigram);
361        }
362
363        self.file_trigrams.insert(file_id, file_trigrams);
364        self.unindexed_files.remove(&file_id);
365    }
366
367    pub fn remove_file(&mut self, path: &Path) {
368        let canonical_path = canonicalize_existing_or_deleted_path(path);
369        let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
370            file_id
371        } else if canonical_path.as_path() != path {
372            let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
373                return;
374            };
375            file_id
376        } else {
377            return;
378        };
379
380        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
381            for trigram in trigrams {
382                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
383                    postings.retain(|posting| posting.file_id != file_id);
384                    postings.is_empty()
385                } else {
386                    false
387                };
388
389                if should_remove {
390                    self.postings.remove(&trigram);
391                }
392            }
393        }
394
395        self.unindexed_files.remove(&file_id);
396        if let Some(file) = self.files.get_mut(file_id as usize) {
397            file.path = PathBuf::new();
398            file.size = 0;
399            file.modified = UNIX_EPOCH;
400            file.content_hash = cache_freshness::zero_hash();
401        }
402    }
403
404    pub fn update_file(&mut self, path: &Path) {
405        self.remove_file(path);
406
407        let metadata = match fs::metadata(path) {
408            Ok(metadata) if metadata.is_file() => metadata,
409            _ => return,
410        };
411
412        if is_binary_path(path, metadata.len()) {
413            self.track_unindexed_file(path, &metadata);
414            return;
415        }
416
417        if metadata.len() > self.max_file_size {
418            self.track_unindexed_file(path, &metadata);
419            return;
420        }
421
422        let content = match fs::read(path) {
423            Ok(content) => content,
424            Err(_) => return,
425        };
426
427        if is_binary_bytes(&content) {
428            self.track_unindexed_file(path, &metadata);
429            return;
430        }
431
432        self.index_file(path, &content);
433    }
434
435    pub fn grep(
436        &self,
437        pattern: &str,
438        case_sensitive: bool,
439        include: &[String],
440        exclude: &[String],
441        search_root: &Path,
442        max_results: usize,
443    ) -> GrepResult {
444        match pattern_compile::compile(
445            pattern,
446            CompileOpts {
447                case_insensitive: !case_sensitive,
448                ..CompileOpts::default()
449            },
450        ) {
451            CompileResult::Ok(compiled) => {
452                self.search_grep(&compiled, include, exclude, search_root, max_results)
453            }
454            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
455                self.empty_grep_result()
456            }
457        }
458    }
459
460    pub fn search_grep(
461        &self,
462        pattern: &CompiledPattern,
463        include: &[String],
464        exclude: &[String],
465        search_root: &Path,
466        max_results: usize,
467    ) -> GrepResult {
468        let matcher = match pattern {
469            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
470            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
471        };
472
473        let filters = match build_path_filters(include, exclude) {
474            Ok(filters) => filters,
475            Err(_) => PathFilters::default(),
476        };
477        let search_root = canonicalize_or_normalize(search_root);
478
479        let raw_pattern = pattern.raw_pattern_for_trigrams();
480        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
481            RegexQuery::default()
482        } else {
483            decompose_regex(&raw_pattern)
484        };
485        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
486        let candidate_ids = self.candidates(&query);
487
488        let candidate_files: Vec<&FileEntry> = candidate_ids
489            .into_iter()
490            .filter_map(|file_id| self.files.get(file_id as usize))
491            .filter(|file| !file.path.as_os_str().is_empty())
492            .filter(|file| is_within_search_root(&search_root, &file.path))
493            .filter(|file| filters.matches(&self.project_root, &file.path))
494            .collect();
495
496        let total_matches = AtomicUsize::new(0);
497        let files_searched = AtomicUsize::new(0);
498        let files_with_matches = AtomicUsize::new(0);
499        let truncated = AtomicBool::new(false);
500        let engine_capped = AtomicBool::new(false);
501        let stop_after = max_results.saturating_mul(2);
502
503        let mut matches = if candidate_files.len() > 10 {
504            candidate_files
505                .par_iter()
506                .map(|file| {
507                    search_candidate_file(
508                        file,
509                        &matcher,
510                        max_results,
511                        stop_after,
512                        &total_matches,
513                        &files_searched,
514                        &files_with_matches,
515                        &truncated,
516                        &engine_capped,
517                    )
518                })
519                .reduce(Vec::new, |mut left, mut right| {
520                    left.append(&mut right);
521                    left
522                })
523        } else {
524            let mut matches = Vec::new();
525            for file in candidate_files {
526                matches.extend(search_candidate_file(
527                    file,
528                    &matcher,
529                    max_results,
530                    stop_after,
531                    &total_matches,
532                    &files_searched,
533                    &files_with_matches,
534                    &truncated,
535                    &engine_capped,
536                ));
537
538                if should_stop_search(&truncated, &total_matches, stop_after) {
539                    engine_capped.store(true, Ordering::Relaxed);
540                    break;
541                }
542            }
543            matches
544        };
545
546        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
547            self.path_to_id
548                .get(path)
549                .and_then(|file_id| self.files.get(*file_id as usize))
550                .map(|file| file.modified)
551        });
552
553        let matches = matches
554            .into_iter()
555            .map(|matched| GrepMatch {
556                file: matched.file.as_ref().clone(),
557                line: matched.line,
558                column: matched.column,
559                line_text: matched.line_text,
560                match_text: matched.match_text,
561            })
562            .collect();
563
564        GrepResult {
565            total_matches: total_matches.load(Ordering::Relaxed),
566            matches,
567            files_searched: files_searched.load(Ordering::Relaxed),
568            files_with_matches: files_with_matches.load(Ordering::Relaxed),
569            index_status: if self.ready {
570                IndexStatus::Ready
571            } else {
572                IndexStatus::Building
573            },
574            truncated: truncated.load(Ordering::Relaxed),
575            fully_degraded,
576            engine_capped: engine_capped.load(Ordering::Relaxed),
577        }
578    }
579
580    fn empty_grep_result(&self) -> GrepResult {
581        GrepResult {
582            matches: Vec::new(),
583            total_matches: 0,
584            files_searched: 0,
585            files_with_matches: 0,
586            index_status: if self.ready {
587                IndexStatus::Ready
588            } else {
589                IndexStatus::Building
590            },
591            truncated: false,
592            fully_degraded: false,
593            engine_capped: false,
594        }
595    }
596
597    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
598        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
599            Ok(filters) => filters,
600            Err(_) => return Vec::new(),
601        };
602        let search_root = canonicalize_or_normalize(search_root);
603        let mut entries = self
604            .files
605            .iter()
606            .filter(|file| !file.path.as_os_str().is_empty())
607            .filter(|file| is_within_search_root(&search_root, &file.path))
608            .filter(|file| filters.matches(&self.project_root, &file.path))
609            .map(|file| (file.path.clone(), file.modified))
610            .collect::<Vec<_>>();
611
612        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
613            right_mtime
614                .cmp(left_mtime)
615                .then_with(|| left_path.cmp(right_path))
616        });
617
618        entries.into_iter().map(|(path, _)| path).collect()
619    }
620
621    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
622        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
623            return self.active_file_ids();
624        }
625
626        let mut and_trigrams = query.and_trigrams.clone();
627        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
628
629        let mut current: Option<Vec<u32>> = None;
630
631        for trigram in and_trigrams {
632            let filter = query.and_filters.get(&trigram).copied();
633            let matches = self.postings_for_trigram(trigram, filter);
634            current = Some(match current.take() {
635                Some(existing) => intersect_sorted_ids(&existing, &matches),
636                None => matches,
637            });
638
639            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
640                break;
641            }
642        }
643
644        let mut current = current.unwrap_or_else(|| self.active_file_ids());
645
646        for (index, group) in query.or_groups.iter().enumerate() {
647            let mut group_matches = Vec::new();
648            let filters = query.or_filters.get(index);
649
650            for trigram in group {
651                let filter = filters.and_then(|filters| filters.get(trigram).copied());
652                let matches = self.postings_for_trigram(*trigram, filter);
653                if group_matches.is_empty() {
654                    group_matches = matches;
655                } else {
656                    group_matches = union_sorted_ids(&group_matches, &matches);
657                }
658            }
659
660            current = intersect_sorted_ids(&current, &group_matches);
661            if current.is_empty() {
662                break;
663            }
664        }
665
666        let mut unindexed = self
667            .unindexed_files
668            .iter()
669            .copied()
670            .filter(|file_id| self.is_active_file(*file_id))
671            .collect::<Vec<_>>();
672        if !unindexed.is_empty() {
673            unindexed.sort_unstable();
674            current = union_sorted_ids(&current, &unindexed);
675        }
676
677        current
678    }
679
680    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
681        if fs::create_dir_all(cache_dir).is_err() {
682            return;
683        }
684
685        let cache_path = cache_dir.join("cache.bin");
686        let tmp_cache = cache_dir.join(format!(
687            "cache.bin.tmp.{}.{}",
688            std::process::id(),
689            SystemTime::now()
690                .duration_since(UNIX_EPOCH)
691                .unwrap_or(Duration::ZERO)
692                .as_nanos()
693        ));
694
695        let active_ids = self.active_file_ids();
696        let mut id_map = HashMap::new();
697        for (new_id, old_id) in active_ids.iter().enumerate() {
698            let Ok(new_id_u32) = u32::try_from(new_id) else {
699                return;
700            };
701            id_map.insert(*old_id, new_id_u32);
702        }
703
704        let write_result = (|| -> std::io::Result<()> {
705            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
706
707            postings_writer.write_all(INDEX_MAGIC)?;
708            write_u32(&mut postings_writer, INDEX_VERSION)?;
709
710            let head = git_head.unwrap_or_default();
711            let root = self.project_root.to_string_lossy();
712            let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
713                ignore_rules_fingerprint(&self.project_root)
714            } else {
715                self.ignore_rules_fingerprint.clone()
716            };
717            let head_len = u32::try_from(head.len())
718                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
719            let root_len = u32::try_from(root.len())
720                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
721            let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
722                .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
723            let file_count = u32::try_from(active_ids.len())
724                .map_err(|_| std::io::Error::other("too many files to cache"))?;
725
726            write_u32(&mut postings_writer, head_len)?;
727            write_u32(&mut postings_writer, root_len)?;
728            write_u32(&mut postings_writer, ignore_fingerprint_len)?;
729            write_u64(&mut postings_writer, self.max_file_size)?;
730            write_u32(&mut postings_writer, file_count)?;
731            postings_writer.write_all(head.as_bytes())?;
732            postings_writer.write_all(root.as_bytes())?;
733            postings_writer.write_all(ignore_fingerprint.as_bytes())?;
734
735            for old_id in &active_ids {
736                let Some(file) = self.files.get(*old_id as usize) else {
737                    return Err(std::io::Error::other("missing file entry for cache write"));
738                };
739                let path =
740                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
741                        std::io::Error::other(format!(
742                            "refusing to cache path outside project root: {}",
743                            file.path.display()
744                        ))
745                    })?;
746                let path = path.to_string_lossy();
747                let path_len = u32::try_from(path.len())
748                    .map_err(|_| std::io::Error::other("cached path too large"))?;
749                let modified = file
750                    .modified
751                    .duration_since(UNIX_EPOCH)
752                    .unwrap_or(Duration::ZERO);
753                let unindexed = if self.unindexed_files.contains(old_id) {
754                    1u8
755                } else {
756                    0u8
757                };
758
759                postings_writer.write_all(&[unindexed])?;
760                write_u32(&mut postings_writer, path_len)?;
761                write_u64(&mut postings_writer, file.size)?;
762                write_u64(&mut postings_writer, modified.as_secs())?;
763                write_u32(&mut postings_writer, modified.subsec_nanos())?;
764                postings_writer.write_all(file.content_hash.as_bytes())?;
765                postings_writer.write_all(path.as_bytes())?;
766            }
767
768            let mut lookup_entries = Vec::new();
769            let mut postings_blob = Vec::new();
770            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
771            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
772
773            for (trigram, postings) in sorted_postings {
774                let offset = u64::try_from(postings_blob.len())
775                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
776                let mut count = 0u32;
777
778                for posting in postings {
779                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
780                        continue;
781                    };
782
783                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
784                    postings_blob.push(posting.next_mask);
785                    postings_blob.push(posting.loc_mask);
786                    count = count.saturating_add(1);
787                }
788
789                if count > 0 {
790                    lookup_entries.push((*trigram, offset, count));
791                }
792            }
793
794            write_u64(
795                &mut postings_writer,
796                u64::try_from(postings_blob.len())
797                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
798            )?;
799            postings_writer.write_all(&postings_blob)?;
800            postings_writer.flush()?;
801            let mut postings_blob_file = postings_writer
802                .into_inner()
803                .map_err(|error| std::io::Error::other(error.to_string()))?
804                .into_inner();
805            let checksum = crc32fast::hash(&postings_blob_file);
806            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
807
808            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
809            let entry_count = u32::try_from(lookup_entries.len())
810                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
811
812            lookup_writer.write_all(LOOKUP_MAGIC)?;
813            write_u32(&mut lookup_writer, INDEX_VERSION)?;
814            write_u32(&mut lookup_writer, entry_count)?;
815
816            for (trigram, offset, count) in lookup_entries {
817                write_u32(&mut lookup_writer, trigram)?;
818                write_u64(&mut lookup_writer, offset)?;
819                write_u32(&mut lookup_writer, count)?;
820            }
821
822            lookup_writer.flush()?;
823            let mut lookup_blob_file = lookup_writer
824                .into_inner()
825                .map_err(|error| std::io::Error::other(error.to_string()))?
826                .into_inner();
827            let checksum = crc32fast::hash(&lookup_blob_file);
828            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
829
830            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
831            write_u32(&mut cache_writer, CACHE_MAGIC)?;
832            write_u32(&mut cache_writer, INDEX_VERSION)?;
833            write_u64(
834                &mut cache_writer,
835                u64::try_from(postings_blob_file.len())
836                    .map_err(|_| std::io::Error::other("postings section too large"))?,
837            )?;
838            cache_writer.write_all(&postings_blob_file)?;
839            cache_writer.write_all(&lookup_blob_file)?;
840            cache_writer.flush()?;
841            cache_writer.get_ref().sync_all()?;
842            drop(cache_writer);
843            fs::rename(&tmp_cache, &cache_path)?;
844
845            Ok(())
846        })();
847
848        if write_result.is_err() {
849            let _ = fs::remove_file(&tmp_cache);
850        }
851    }
852
853    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
854        debug_assert!(current_canonical_root.is_absolute());
855        let cache_path = cache_dir.join("cache.bin");
856        let cache_bytes = fs::read(&cache_path).ok()?;
857        if cache_bytes.len() < 16 {
858            return None;
859        }
860        let mut header = Cursor::new(&cache_bytes);
861        if read_u32(&mut header).ok()? != CACHE_MAGIC {
862            return None;
863        }
864        if read_u32(&mut header).ok()? != INDEX_VERSION {
865            return None;
866        }
867        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
868        let start = usize::try_from(header.position()).ok()?;
869        let postings_end = start.checked_add(postings_len_total)?;
870        if postings_end > cache_bytes.len() {
871            return None;
872        }
873        let postings_bytes = &cache_bytes[start..postings_end];
874        let lookup_bytes = &cache_bytes[postings_end..];
875        let lookup_len_total = lookup_bytes.len();
876        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
877        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
878        if postings_len_total < 4 || lookup_len_total < 4 {
879            return None;
880        }
881        verify_crc32_bytes_slice(postings_bytes).ok()?;
882        verify_crc32_bytes_slice(lookup_bytes).ok()?;
883
884        let mut magic = [0u8; 8];
885        postings_reader.read_exact(&mut magic).ok()?;
886        if &magic != INDEX_MAGIC {
887            return None;
888        }
889        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
890            return None;
891        }
892
893        let head_len = read_u32(&mut postings_reader).ok()? as usize;
894        let root_len = read_u32(&mut postings_reader).ok()? as usize;
895        let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
896        let max_file_size = read_u64(&mut postings_reader).ok()?;
897        let file_count = read_u32(&mut postings_reader).ok()? as usize;
898        if file_count > MAX_ENTRIES {
899            return None;
900        }
901        let postings_body_len = postings_len_total.checked_sub(4)?;
902        let lookup_body_len = lookup_len_total.checked_sub(4)?;
903
904        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
905        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
906        if minimum_file_bytes > remaining_postings {
907            return None;
908        }
909
910        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
911            return None;
912        }
913        let mut head_bytes = vec![0u8; head_len];
914        postings_reader.read_exact(&mut head_bytes).ok()?;
915        let git_head = String::from_utf8(head_bytes)
916            .ok()
917            .filter(|head| !head.is_empty());
918
919        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
920            return None;
921        }
922        let mut root_bytes = vec![0u8; root_len];
923        postings_reader.read_exact(&mut root_bytes).ok()?;
924        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
925        let project_root = current_canonical_root.to_path_buf();
926
927        if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
928            return None;
929        }
930        let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
931        postings_reader
932            .read_exact(&mut ignore_fingerprint_bytes)
933            .ok()?;
934        let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
935        let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
936        if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
937            return None;
938        }
939
940        let mut files = Vec::with_capacity(file_count);
941        let mut path_to_id = HashMap::new();
942        let mut unindexed_files = HashSet::new();
943
944        for file_id in 0..file_count {
945            let mut unindexed = [0u8; 1];
946            postings_reader.read_exact(&mut unindexed).ok()?;
947            let path_len = read_u32(&mut postings_reader).ok()? as usize;
948            let size = read_u64(&mut postings_reader).ok()?;
949            let secs = read_u64(&mut postings_reader).ok()?;
950            let nanos = read_u32(&mut postings_reader).ok()?;
951            let mut hash_bytes = [0u8; 32];
952            postings_reader.read_exact(&mut hash_bytes).ok()?;
953            let content_hash = blake3::Hash::from_bytes(hash_bytes);
954            if nanos >= 1_000_000_000 {
955                return None;
956            }
957            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
958                return None;
959            }
960            let mut path_bytes = vec![0u8; path_len];
961            postings_reader.read_exact(&mut path_bytes).ok()?;
962            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
963            let full_path = cached_path_under_root(&project_root, &relative_path)?;
964            let file_id_u32 = u32::try_from(file_id).ok()?;
965
966            files.push(FileEntry {
967                path: full_path.clone(),
968                size,
969                modified: UNIX_EPOCH + Duration::new(secs, nanos),
970                content_hash,
971            });
972            path_to_id.insert(full_path, file_id_u32);
973            if unindexed[0] == 1 {
974                unindexed_files.insert(file_id_u32);
975            }
976        }
977
978        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
979        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
980        if postings_len > max_postings_bytes {
981            return None;
982        }
983        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
984            return None;
985        }
986        let mut postings_blob = vec![0u8; postings_len];
987        postings_reader.read_exact(&mut postings_blob).ok()?;
988
989        let mut lookup_magic = [0u8; 8];
990        lookup_reader.read_exact(&mut lookup_magic).ok()?;
991        if &lookup_magic != LOOKUP_MAGIC {
992            return None;
993        }
994        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
995            return None;
996        }
997        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
998        if entry_count > MAX_ENTRIES {
999            return None;
1000        }
1001        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
1002        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
1003        if minimum_lookup_bytes > remaining_lookup {
1004            return None;
1005        }
1006
1007        let mut postings = HashMap::new();
1008        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1009
1010        for _ in 0..entry_count {
1011            let trigram = read_u32(&mut lookup_reader).ok()?;
1012            let offset = read_u64(&mut lookup_reader).ok()? as usize;
1013            let count = read_u32(&mut lookup_reader).ok()? as usize;
1014            if count > MAX_ENTRIES {
1015                return None;
1016            }
1017            let bytes_len = count.checked_mul(POSTING_BYTES)?;
1018            let end = offset.checked_add(bytes_len)?;
1019            if end > postings_blob.len() {
1020                return None;
1021            }
1022
1023            let mut trigram_postings = Vec::with_capacity(count);
1024            for chunk in postings_blob[offset..end].chunks_exact(6) {
1025                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1026                let posting = Posting {
1027                    file_id,
1028                    next_mask: chunk[4],
1029                    loc_mask: chunk[5],
1030                };
1031                trigram_postings.push(posting.clone());
1032                file_trigrams.entry(file_id).or_default().push(trigram);
1033            }
1034            postings.insert(trigram, trigram_postings);
1035        }
1036
1037        Some(SearchIndex {
1038            postings,
1039            files,
1040            path_to_id,
1041            ready: false,
1042            project_root,
1043            git_head,
1044            max_file_size,
1045            ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1046            file_trigrams,
1047            unindexed_files,
1048        })
1049    }
1050
1051    pub fn stored_git_head(&self) -> Option<&str> {
1052        self.git_head.as_deref()
1053    }
1054
1055    pub(crate) fn set_ready(&mut self, ready: bool) {
1056        self.ready = ready;
1057    }
1058
1059    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1060        self.git_head = current_head;
1061        verify_file_mtimes(self);
1062        self.ready = true;
1063    }
1064
1065    #[cfg(debug_assertions)]
1066    #[doc(hidden)]
1067    pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1068        self.verify_against_disk(current_head);
1069    }
1070
1071    pub(crate) fn rebuild_or_refresh(
1072        root: &Path,
1073        max_file_size: u64,
1074        current_head: Option<String>,
1075        baseline: Option<SearchIndex>,
1076    ) -> Self {
1077        if let Some(mut baseline) = baseline {
1078            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1079            baseline.max_file_size = max_file_size;
1080            let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1081            if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1082                return SearchIndex::build_with_limit(root, max_file_size);
1083            }
1084            baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1085
1086            if baseline.git_head == current_head || current_head.is_none() {
1087                // HEAD matches, but files may have changed on disk since the index was
1088                // last written (e.g., uncommitted edits, stash pop, manual file changes
1089                // while OpenCode was closed). Verify mtimes and re-index stale files.
1090                // Non-git projects also use this per-file (path, mtime, size)
1091                // fingerprint so unchanged trees reuse the disk cache instead of
1092                // rebuilding every configure.
1093                baseline.git_head = current_head;
1094                verify_file_mtimes(&mut baseline);
1095                baseline.ready = true;
1096                return baseline;
1097            }
1098
1099            if let (Some(previous), Some(current)) =
1100                (baseline.git_head.clone(), current_head.clone())
1101            {
1102                let project_root = baseline.project_root.clone();
1103                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1104                    baseline.git_head = Some(current);
1105                    verify_file_mtimes(&mut baseline);
1106                    baseline.ready = true;
1107                    return baseline;
1108                }
1109            }
1110        }
1111
1112        SearchIndex::build_with_limit(root, max_file_size)
1113    }
1114
1115    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1116        let file_id = u32::try_from(self.files.len()).ok()?;
1117        let metadata = fs::metadata(path).ok();
1118        let size = metadata
1119            .as_ref()
1120            .map_or(size_hint, |metadata| metadata.len());
1121        let modified = metadata
1122            .and_then(|metadata| metadata.modified().ok())
1123            .unwrap_or(UNIX_EPOCH);
1124
1125        self.files.push(FileEntry {
1126            path: path.to_path_buf(),
1127            size,
1128            modified,
1129            content_hash: cache_freshness::zero_hash(),
1130        });
1131        self.path_to_id.insert(path.to_path_buf(), file_id);
1132        Some(file_id)
1133    }
1134
1135    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1136        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1137            return;
1138        };
1139        self.unindexed_files.insert(file_id);
1140        self.file_trigrams.insert(file_id, Vec::new());
1141    }
1142
1143    fn active_file_ids(&self) -> Vec<u32> {
1144        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1145        ids.sort_unstable();
1146        ids
1147    }
1148
1149    fn is_active_file(&self, file_id: u32) -> bool {
1150        self.files
1151            .get(file_id as usize)
1152            .map(|file| !file.path.as_os_str().is_empty())
1153            .unwrap_or(false)
1154    }
1155
1156    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1157        let Some(postings) = self.postings.get(&trigram) else {
1158            return Vec::new();
1159        };
1160
1161        let mut matches = Vec::with_capacity(postings.len());
1162
1163        for posting in postings {
1164            if let Some(filter) = filter {
1165                // next_mask: bloom filter check — the character following this trigram in the
1166                // query must also appear after this trigram somewhere in the file.
1167                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1168                    continue;
1169                }
1170                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1171                // between consecutive trigram pairs, but is NOT used as a single-trigram
1172                // filter because the position in the query string has no relationship to
1173                // the position in the file. Using it here causes false negatives.
1174            }
1175            if self.is_active_file(posting.file_id) {
1176                matches.push(posting.file_id);
1177            }
1178        }
1179
1180        matches
1181    }
1182}
1183
1184fn search_candidate_file(
1185    file: &FileEntry,
1186    matcher: &SearchMatcher,
1187    max_results: usize,
1188    stop_after: usize,
1189    total_matches: &AtomicUsize,
1190    files_searched: &AtomicUsize,
1191    files_with_matches: &AtomicUsize,
1192    truncated: &AtomicBool,
1193    engine_capped: &AtomicBool,
1194) -> Vec<SharedGrepMatch> {
1195    if should_stop_search(truncated, total_matches, stop_after) {
1196        engine_capped.store(true, Ordering::Relaxed);
1197        return Vec::new();
1198    }
1199
1200    let content = match read_indexed_file_bytes(&file.path) {
1201        Some(content) => content,
1202        None => return Vec::new(),
1203    };
1204    // Defense in depth: even though indexing tries to filter binaries via
1205    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1206    // query time. content_inspector is fast (~bytes-per-cycle on a small
1207    // preview) and this guarantees we never surface matches inside binary
1208    // files even if the indexer somehow let one through (e.g. file changed
1209    // between indexing and query).
1210    if is_binary_bytes(&content) {
1211        return Vec::new();
1212    }
1213    files_searched.fetch_add(1, Ordering::Relaxed);
1214
1215    let shared_path = Arc::new(file.path.clone());
1216    let mut matches = Vec::new();
1217    let mut line_starts = None;
1218    let mut seen_lines = HashSet::new();
1219    let mut matched_this_file = false;
1220
1221    match matcher {
1222        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1223            let needle = &literal.needle;
1224            let finder = memchr::memmem::Finder::new(needle);
1225            let mut start = 0;
1226
1227            while let Some(position) = finder.find(&content[start..]) {
1228                if should_stop_search(truncated, total_matches, stop_after) {
1229                    engine_capped.store(true, Ordering::Relaxed);
1230                    break;
1231                }
1232
1233                let offset = start + position;
1234                start = offset + 1;
1235
1236                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1237                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1238                if !seen_lines.insert(line) {
1239                    continue;
1240                }
1241
1242                matched_this_file = true;
1243                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1244                if match_number > max_results {
1245                    truncated.store(true, Ordering::Relaxed);
1246                    break;
1247                }
1248
1249                let end = offset + needle.len();
1250                matches.push(SharedGrepMatch {
1251                    file: shared_path.clone(),
1252                    line,
1253                    column,
1254                    line_text,
1255                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1256                });
1257            }
1258        }
1259        SearchMatcher::Literal(literal) => {
1260            let needle = &literal.needle;
1261            let search_content = content.to_ascii_lowercase();
1262            let finder = memchr::memmem::Finder::new(needle);
1263            let mut start = 0;
1264
1265            while let Some(position) = finder.find(&search_content[start..]) {
1266                if should_stop_search(truncated, total_matches, stop_after) {
1267                    engine_capped.store(true, Ordering::Relaxed);
1268                    break;
1269                }
1270
1271                let offset = start + position;
1272                start = offset + 1;
1273
1274                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1275                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1276                if !seen_lines.insert(line) {
1277                    continue;
1278                }
1279
1280                matched_this_file = true;
1281                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1282                if match_number > max_results {
1283                    truncated.store(true, Ordering::Relaxed);
1284                    break;
1285                }
1286
1287                let end = offset + needle.len();
1288                matches.push(SharedGrepMatch {
1289                    file: shared_path.clone(),
1290                    line,
1291                    column,
1292                    line_text,
1293                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1294                });
1295            }
1296        }
1297        SearchMatcher::Regex(regex) => {
1298            for matched in regex.find_iter(&content) {
1299                if should_stop_search(truncated, total_matches, stop_after) {
1300                    engine_capped.store(true, Ordering::Relaxed);
1301                    break;
1302                }
1303
1304                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1305                let (line, column, line_text) =
1306                    line_details_bytes(&content, line_starts, matched.start());
1307                if !seen_lines.insert(line) {
1308                    continue;
1309                }
1310
1311                matched_this_file = true;
1312                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1313                if match_number > max_results {
1314                    truncated.store(true, Ordering::Relaxed);
1315                    break;
1316                }
1317
1318                matches.push(SharedGrepMatch {
1319                    file: shared_path.clone(),
1320                    line,
1321                    column,
1322                    line_text,
1323                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1324                });
1325            }
1326        }
1327    }
1328
1329    if matched_this_file {
1330        files_with_matches.fetch_add(1, Ordering::Relaxed);
1331    }
1332
1333    matches
1334}
1335
1336fn should_stop_search(
1337    truncated: &AtomicBool,
1338    total_matches: &AtomicUsize,
1339    stop_after: usize,
1340) -> bool {
1341    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1342}
1343
1344fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1345    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1346    let mut left_index = 0;
1347    let mut right_index = 0;
1348
1349    while left_index < left.len() && right_index < right.len() {
1350        match left[left_index].cmp(&right[right_index]) {
1351            std::cmp::Ordering::Less => left_index += 1,
1352            std::cmp::Ordering::Greater => right_index += 1,
1353            std::cmp::Ordering::Equal => {
1354                merged.push(left[left_index]);
1355                left_index += 1;
1356                right_index += 1;
1357            }
1358        }
1359    }
1360
1361    merged
1362}
1363
1364fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1365    let mut merged = Vec::with_capacity(left.len() + right.len());
1366    let mut left_index = 0;
1367    let mut right_index = 0;
1368
1369    while left_index < left.len() && right_index < right.len() {
1370        match left[left_index].cmp(&right[right_index]) {
1371            std::cmp::Ordering::Less => {
1372                merged.push(left[left_index]);
1373                left_index += 1;
1374            }
1375            std::cmp::Ordering::Greater => {
1376                merged.push(right[right_index]);
1377                right_index += 1;
1378            }
1379            std::cmp::Ordering::Equal => {
1380                merged.push(left[left_index]);
1381                left_index += 1;
1382                right_index += 1;
1383            }
1384        }
1385    }
1386
1387    merged.extend_from_slice(&left[left_index..]);
1388    merged.extend_from_slice(&right[right_index..]);
1389    merged
1390}
1391
1392pub fn decompose_regex(pattern: &str) -> RegexQuery {
1393    let hir = match regex_syntax::parse(pattern) {
1394        Ok(hir) => hir,
1395        Err(_) => return RegexQuery::default(),
1396    };
1397
1398    let build = build_query(&hir);
1399    build.into_query()
1400}
1401
1402pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1403    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1404}
1405
1406pub fn normalize_char(c: u8) -> u8 {
1407    c.to_ascii_lowercase()
1408}
1409
1410pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1411    if content.len() < 3 {
1412        return Vec::new();
1413    }
1414
1415    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1416    for start in 0..=content.len() - 3 {
1417        let trigram = pack_trigram(
1418            normalize_char(content[start]),
1419            normalize_char(content[start + 1]),
1420            normalize_char(content[start + 2]),
1421        );
1422        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1423        trigrams.push((trigram, next_char, start));
1424    }
1425    trigrams
1426}
1427
1428pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1429    let mut seen = HashSet::new();
1430    let mut out = Vec::new();
1431    for token in tokens {
1432        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1433            if seen.insert(trigram) {
1434                out.push(trigram);
1435            }
1436        }
1437    }
1438    out
1439}
1440
1441pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1442    if query_trigrams.is_empty() {
1443        return 0.0;
1444    }
1445
1446    let mut hits = 0u32;
1447    for &trigram in query_trigrams {
1448        if let Some(postings) = index.postings.get(&trigram) {
1449            if postings
1450                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1451                .is_ok()
1452            {
1453                hits += 1;
1454            }
1455        }
1456    }
1457
1458    if hits == 0 {
1459        return 0.0;
1460    }
1461
1462    let file_trigram_count = index
1463        .file_trigrams
1464        .get(&file_id)
1465        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1466    (hits as f32) / (1.0 + file_trigram_count.ln())
1467}
1468
1469pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1470    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1471    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1472        return PathBuf::from(override_dir)
1473            .join("index")
1474            .join(project_cache_key(project_root));
1475    }
1476    // Use configured storage dir (from plugin, XDG-compliant)
1477    if let Some(dir) = storage_dir {
1478        return dir.join("index").join(project_cache_key(project_root));
1479    }
1480    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1481    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1482    // If neither is set, fall back to a temp directory rather than `"."`
1483    // because the search-index code reads/writes absolute paths.
1484    let home = std::env::var_os("HOME")
1485        .or_else(|| std::env::var_os("USERPROFILE"))
1486        .map(PathBuf::from)
1487        .unwrap_or_else(std::env::temp_dir);
1488    home.join(".cache")
1489        .join("aft")
1490        .join("index")
1491        .join(project_cache_key(project_root))
1492}
1493
1494pub(crate) fn build_path_filters(
1495    include: &[String],
1496    exclude: &[String],
1497) -> Result<PathFilters, String> {
1498    Ok(PathFilters {
1499        includes: build_globset(include)?,
1500        excludes: build_globset(exclude)?,
1501    })
1502}
1503
1504pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1505    walk_project_files_from(root, root, filters)
1506}
1507
1508pub fn walk_project_files_bounded_default(
1509    root: &Path,
1510    max_files: usize,
1511) -> Result<Vec<PathBuf>, usize> {
1512    walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files))
1513}
1514
1515pub(crate) fn walk_project_files_bounded_matching<F>(
1516    root: &Path,
1517    filters: &PathFilters,
1518    max_files: usize,
1519    matches_file: F,
1520) -> Result<Vec<PathBuf>, usize>
1521where
1522    F: Fn(&Path) -> bool,
1523{
1524    walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file)
1525}
1526
1527pub fn walk_project_files_bounded_default_matching<F>(
1528    root: &Path,
1529    max_files: usize,
1530    matches_file: F,
1531) -> Result<Vec<PathBuf>, usize>
1532where
1533    F: Fn(&Path) -> bool,
1534{
1535    walk_project_files_from_inner_matching(
1536        root,
1537        root,
1538        &PathFilters::default(),
1539        Some(max_files),
1540        matches_file,
1541    )
1542}
1543
1544pub(crate) fn walk_project_files_from(
1545    filter_root: &Path,
1546    search_root: &Path,
1547    filters: &PathFilters,
1548) -> Vec<PathBuf> {
1549    walk_project_files_from_inner(filter_root, search_root, filters, None)
1550        .expect("unbounded project walk cannot exceed a file limit")
1551}
1552
1553pub(crate) fn has_any_project_file_from(
1554    filter_root: &Path,
1555    search_root: &Path,
1556    filters: &PathFilters,
1557) -> bool {
1558    walk_project_files_from_inner(filter_root, search_root, filters, Some(0)).is_err()
1559}
1560
1561fn walk_project_files_from_inner(
1562    filter_root: &Path,
1563    search_root: &Path,
1564    filters: &PathFilters,
1565    max_files: Option<usize>,
1566) -> Result<Vec<PathBuf>, usize> {
1567    walk_project_files_from_inner_matching(filter_root, search_root, filters, max_files, |_| true)
1568}
1569
1570fn walk_project_files_from_inner_matching<F>(
1571    filter_root: &Path,
1572    search_root: &Path,
1573    filters: &PathFilters,
1574    max_files: Option<usize>,
1575    matches_file: F,
1576) -> Result<Vec<PathBuf>, usize>
1577where
1578    F: Fn(&Path) -> bool,
1579{
1580    let mut builder = WalkBuilder::new(search_root);
1581    builder
1582        .hidden(false)
1583        .git_ignore(true)
1584        .git_global(true)
1585        .git_exclude(true)
1586        // `.aftignore` — AFT-specific ignores layered on top of .gitignore.
1587        // Honored hierarchically like .gitignore (works in non-git projects too),
1588        // so users can exclude paths git can't (e.g. submodules) from indexing.
1589        .add_custom_ignore_filename(".aftignore")
1590        .filter_entry(|entry| {
1591            let name = entry.file_name().to_string_lossy();
1592            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1593                return !matches!(
1594                    name.as_ref(),
1595                    "node_modules"
1596                        | "target"
1597                        | "venv"
1598                        | ".venv"
1599                        | ".git"
1600                        | "__pycache__"
1601                        | ".tox"
1602                        | "dist"
1603                        | "build"
1604                );
1605            }
1606            true
1607        });
1608
1609    let mut files = Vec::new();
1610    for entry in builder.build().filter_map(|entry| entry.ok()) {
1611        if !entry
1612            .file_type()
1613            .map_or(false, |file_type| file_type.is_file())
1614        {
1615            continue;
1616        }
1617        let path = entry.into_path();
1618        if filters.matches(filter_root, &path) && matches_file(&path) {
1619            files.push(path);
1620            if max_files.is_some_and(|limit| files.len() > limit) {
1621                return Err(files.len());
1622            }
1623        }
1624    }
1625
1626    sort_paths_by_mtime_desc(&mut files);
1627    Ok(files)
1628}
1629
1630pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1631    let bytes = fs::read(path).ok()?;
1632    if is_binary_bytes(&bytes) {
1633        return None;
1634    }
1635    String::from_utf8(bytes).ok()
1636}
1637
1638fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1639    fs::read(path).ok()
1640}
1641
1642pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1643    path.strip_prefix(root)
1644        .map(PathBuf::from)
1645        .unwrap_or_else(|_| path.to_path_buf())
1646}
1647
1648pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1649    let normalized_root = normalize_path(root);
1650    let normalized_path = normalize_path(path);
1651    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1652    validate_cached_relative_path(relative)
1653}
1654
1655pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1656    let relative = validate_cached_relative_path(relative_path)?;
1657    let normalized_root = normalize_path(root);
1658    let full_path = normalize_path(&normalized_root.join(relative));
1659
1660    match fs::canonicalize(&full_path) {
1661        Ok(canonical_path) => {
1662            if canonical_path.starts_with(&normalized_root) {
1663                return Some(full_path);
1664            }
1665
1666            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1667            canonical_path
1668                .starts_with(&canonical_root)
1669                .then_some(full_path)
1670        }
1671        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1672    }
1673}
1674
1675pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1676    if path.is_absolute() {
1677        return None;
1678    }
1679
1680    let mut normalized = PathBuf::new();
1681    for component in path.components() {
1682        match component {
1683            Component::Normal(part) => normalized.push(part),
1684            Component::CurDir => {}
1685            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1686        }
1687    }
1688    (!normalized.as_os_str().is_empty()).then_some(normalized)
1689}
1690
1691/// Sort paths newest-first by mtime, falling back to lexicographic order.
1692///
1693/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1694/// `sort_by()` closure. That made the comparator non-deterministic — a
1695/// `stat()` syscall for the same path can return different values across
1696/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1697/// concurrent file-watcher activity), and Rust's slice::sort panics at
1698/// runtime when it detects a non-total-order comparator. CI hit this on
1699/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1700///
1701/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1702/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1703pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1704    use std::collections::HashMap;
1705    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1706    for path in paths.iter() {
1707        mtimes
1708            .entry(path.clone())
1709            .or_insert_with(|| path_modified_time(path));
1710    }
1711    paths.sort_by(|left, right| {
1712        let left_mtime = mtimes.get(left).and_then(|v| *v);
1713        let right_mtime = mtimes.get(right).and_then(|v| *v);
1714        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1715    });
1716}
1717
1718/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1719/// the sort. Same fix, applied to grep matches that share files.
1720pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1721    use std::collections::HashMap;
1722    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1723    for m in matches.iter() {
1724        mtimes.entry(m.file.clone()).or_insert_with(|| {
1725            let resolved = resolve_match_path(project_root, &m.file);
1726            path_modified_time(&resolved)
1727        });
1728    }
1729    matches.sort_by(|left, right| {
1730        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1731        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1732        right_mtime
1733            .cmp(&left_mtime)
1734            .then_with(|| left.file.cmp(&right.file))
1735            .then_with(|| left.line.cmp(&right.line))
1736            .then_with(|| left.column.cmp(&right.column))
1737    });
1738}
1739
1740/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1741/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1742/// table from the search index), but it can still return different values if
1743/// the file is modified mid-sort. Snapshot once.
1744fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1745    matches: &mut [SharedGrepMatch],
1746    modified_for_path: F,
1747) where
1748    F: Fn(&Path) -> Option<SystemTime>,
1749{
1750    use std::collections::HashMap;
1751    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1752    for m in matches.iter() {
1753        let path = m.file.as_path().to_path_buf();
1754        mtimes
1755            .entry(path.clone())
1756            .or_insert_with(|| modified_for_path(&path));
1757    }
1758    matches.sort_by(|left, right| {
1759        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1760        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1761        right_mtime
1762            .cmp(&left_mtime)
1763            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1764            .then_with(|| left.line.cmp(&right.line))
1765            .then_with(|| left.column.cmp(&right.column))
1766    });
1767}
1768
1769pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1770    let resolved_project_root = canonicalize_or_normalize(project_root);
1771    let root = match path {
1772        Some(path) => {
1773            let path = PathBuf::from(path);
1774            if path.is_absolute() {
1775                canonicalize_or_normalize(&path)
1776            } else {
1777                normalize_path(&resolved_project_root.join(path))
1778            }
1779        }
1780        None => resolved_project_root.clone(),
1781    };
1782
1783    let use_index = is_within_search_root(&resolved_project_root, &root);
1784    SearchScope { root, use_index }
1785}
1786
1787pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1788    content_inspector::inspect(content).is_binary()
1789}
1790
1791pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1792    run_git(root, &["rev-parse", "HEAD"])
1793}
1794
1795pub fn project_cache_key(project_root: &Path) -> String {
1796    use sha2::{Digest, Sha256};
1797
1798    let mut hasher = Sha256::new();
1799
1800    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1801        // Git repo: root commit is the unique identity.
1802        // Same repo cloned anywhere produces the same key.
1803        hasher.update(root_commit.as_bytes());
1804    } else {
1805        // Non-git project: use the canonical filesystem path as identity.
1806        let canonical_root = canonicalize_or_normalize(project_root);
1807        hasher.update(canonical_root.to_string_lossy().as_bytes());
1808    }
1809
1810    let digest = format!("{:x}", hasher.finalize());
1811    digest[..16].to_string()
1812}
1813
1814/// Fingerprint corpus-shaping ignore rules that are not represented by git HEAD.
1815///
1816/// The search cache stores this value next to the file mtimes. If `.gitignore`,
1817/// `.aftignore`, or `.git/info/exclude` changes while AFT is not running, a
1818/// matching HEAD + matching file mtimes is not enough to safely reuse the old
1819/// cache: files that are now ignored may still be indexed. Hashing the ignore
1820/// files themselves makes cold-start cache reuse agree with the current walker.
1821pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
1822    use sha2::{Digest, Sha256};
1823
1824    let root = canonicalize_or_normalize(project_root);
1825    let mut files = Vec::new();
1826    collect_ignore_rule_files(&root, &mut files);
1827    if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
1828        if global_ignore.is_file() {
1829            files.push(global_ignore);
1830        }
1831    }
1832    let info_exclude = git_info_exclude_path(&root);
1833    if info_exclude.is_file() {
1834        files.push(info_exclude);
1835    }
1836    files.sort();
1837    files.dedup();
1838
1839    let mut hasher = Sha256::new();
1840    hasher.update(b"aft-ignore-rules-v1\0");
1841    for path in files {
1842        if let Some(relative) = cache_relative_path(&root, &path) {
1843            hasher.update(relative.to_string_lossy().as_bytes());
1844        } else {
1845            hasher.update(path.to_string_lossy().as_bytes());
1846        }
1847        hasher.update(b"\0");
1848        match fs::read(&path) {
1849            Ok(bytes) => hasher.update(&bytes),
1850            Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
1851        }
1852        hasher.update(b"\0");
1853    }
1854
1855    format!("{:x}", hasher.finalize())
1856}
1857
1858fn git_info_exclude_path(root: &Path) -> PathBuf {
1859    run_git(
1860        root,
1861        &["rev-parse", "--path-format=absolute", "--git-common-dir"],
1862    )
1863    .map(PathBuf::from)
1864    .unwrap_or_else(|| root.join(".git"))
1865    .join("info")
1866    .join("exclude")
1867}
1868
1869fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
1870    let mut stack = vec![root.to_path_buf()];
1871    while let Some(dir) = stack.pop() {
1872        let Ok(entries) = fs::read_dir(&dir) else {
1873            continue;
1874        };
1875        for entry in entries.flatten() {
1876            let path = entry.path();
1877            let file_name = entry.file_name();
1878            if file_name == ".gitignore" || file_name == ".aftignore" {
1879                if path.is_file() {
1880                    files.push(path);
1881                }
1882                continue;
1883            }
1884
1885            let Ok(file_type) = entry.file_type() else {
1886                continue;
1887            };
1888            if !file_type.is_dir() || file_type.is_symlink() {
1889                continue;
1890            }
1891            if ignore_rule_fingerprint_skips_dir(&file_name) {
1892                continue;
1893            }
1894            stack.push(path);
1895        }
1896    }
1897}
1898
1899fn ignore_rule_fingerprint_skips_dir(name: &std::ffi::OsStr) -> bool {
1900    matches!(
1901        name.to_str().unwrap_or(""),
1902        ".git"
1903            | "node_modules"
1904            | "target"
1905            | "venv"
1906            | ".venv"
1907            | "__pycache__"
1908            | ".tox"
1909            | "dist"
1910            | "build"
1911    )
1912}
1913
1914impl PathFilters {
1915    fn matches(&self, root: &Path, path: &Path) -> bool {
1916        let relative = to_glob_path(&relative_to_root(root, path));
1917        if self
1918            .includes
1919            .as_ref()
1920            .is_some_and(|includes| !includes.is_match(&relative))
1921        {
1922            return false;
1923        }
1924        if self
1925            .excludes
1926            .as_ref()
1927            .is_some_and(|excludes| excludes.is_match(&relative))
1928        {
1929            return false;
1930        }
1931        true
1932    }
1933}
1934
1935fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1936    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1937}
1938
1939fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1940    if path.is_absolute() {
1941        path.to_path_buf()
1942    } else {
1943        project_root.join(path)
1944    }
1945}
1946
1947fn path_modified_time(path: &Path) -> Option<SystemTime> {
1948    fs::metadata(path)
1949        .and_then(|metadata| metadata.modified())
1950        .ok()
1951}
1952
1953fn normalize_path(path: &Path) -> PathBuf {
1954    let mut result = PathBuf::new();
1955    for component in path.components() {
1956        match component {
1957            Component::ParentDir => {
1958                if !result.pop() {
1959                    result.push(component);
1960                }
1961            }
1962            Component::CurDir => {}
1963            _ => result.push(component),
1964        }
1965    }
1966    result
1967}
1968
1969fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
1970    if let Ok(canonical) = fs::canonicalize(path) {
1971        return canonical;
1972    }
1973
1974    let Some(parent) = path.parent() else {
1975        return path.to_path_buf();
1976    };
1977    let Some(file_name) = path.file_name() else {
1978        return path.to_path_buf();
1979    };
1980
1981    fs::canonicalize(parent)
1982        .map(|canonical_parent| canonical_parent.join(file_name))
1983        .unwrap_or_else(|_| path.to_path_buf())
1984}
1985
1986/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1987/// since the index was last written. Also detect new files and deleted files.
1988fn verify_file_mtimes(index: &mut SearchIndex) {
1989    let filters = PathFilters::default();
1990    let current_files = walk_project_files(&index.project_root, &filters);
1991    let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
1992    let mut stale_paths = Vec::new();
1993    let mut removed_paths = Vec::new();
1994
1995    for entry in &mut index.files {
1996        if entry.path.as_os_str().is_empty() {
1997            continue; // tombstoned entry
1998        }
1999        if !current_file_set.contains(&entry.path) {
2000            removed_paths.push(entry.path.clone());
2001            continue;
2002        }
2003        let cached = FileFreshness {
2004            mtime: entry.modified,
2005            size: entry.size,
2006            content_hash: entry.content_hash,
2007        };
2008        match cache_freshness::verify_file_strict(&entry.path, &cached) {
2009            FreshnessVerdict::HotFresh => {}
2010            FreshnessVerdict::ContentFresh {
2011                new_mtime,
2012                new_size,
2013            } => {
2014                entry.modified = new_mtime;
2015                entry.size = new_size;
2016            }
2017            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
2018                stale_paths.push(entry.path.clone())
2019            }
2020        }
2021    }
2022
2023    for path in &removed_paths {
2024        index.remove_file(path);
2025    }
2026
2027    // Re-index stale files that are still in the current walk set. If an ignore
2028    // rule changed while AFT was down but the fingerprint missed it, this keeps
2029    // warm-cache verification from resurrecting now-ignored cached entries.
2030    for path in &stale_paths {
2031        if current_file_set.contains(path) {
2032            index.update_file(path);
2033        } else {
2034            index.remove_file(path);
2035        }
2036    }
2037
2038    // Detect new files not in the index
2039    for path in current_files {
2040        if !index.path_to_id.contains_key(&path) {
2041            index.update_file(&path);
2042        }
2043    }
2044
2045    if !stale_paths.is_empty() {
2046        crate::slog_info!(
2047            "search index: refreshed {} stale file(s) from disk cache",
2048            stale_paths.len()
2049        );
2050    }
2051}
2052
2053fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2054    normalize_path(path).starts_with(normalize_path(search_root))
2055}
2056
2057impl QueryBuild {
2058    fn into_query(self) -> RegexQuery {
2059        let mut query = RegexQuery::default();
2060
2061        for run in self.and_runs {
2062            add_run_to_and_query(&mut query, &run);
2063        }
2064
2065        for group in self.or_groups {
2066            let mut trigrams = BTreeSet::new();
2067            let mut filters = HashMap::new();
2068            for run in group {
2069                for (trigram, filter) in trigram_filters(&run) {
2070                    trigrams.insert(trigram);
2071                    merge_filter(filters.entry(trigram).or_default(), filter);
2072                }
2073            }
2074            if !trigrams.is_empty() {
2075                query.or_groups.push(trigrams.into_iter().collect());
2076                query.or_filters.push(filters);
2077            }
2078        }
2079
2080        query
2081    }
2082}
2083
2084fn build_query(hir: &Hir) -> QueryBuild {
2085    match hir.kind() {
2086        HirKind::Literal(literal) => {
2087            if literal.0.len() >= 3 {
2088                QueryBuild {
2089                    and_runs: vec![literal.0.to_vec()],
2090                    or_groups: Vec::new(),
2091                }
2092            } else {
2093                QueryBuild::default()
2094            }
2095        }
2096        HirKind::Capture(capture) => build_query(&capture.sub),
2097        HirKind::Concat(parts) => {
2098            let mut build = QueryBuild::default();
2099            for part in parts {
2100                let part_build = build_query(part);
2101                build.and_runs.extend(part_build.and_runs);
2102                build.or_groups.extend(part_build.or_groups);
2103            }
2104            build
2105        }
2106        HirKind::Alternation(parts) => {
2107            let mut group = Vec::new();
2108            for part in parts {
2109                let Some(mut choices) = guaranteed_run_choices(part) else {
2110                    return QueryBuild::default();
2111                };
2112                group.append(&mut choices);
2113            }
2114            if group.is_empty() {
2115                QueryBuild::default()
2116            } else {
2117                QueryBuild {
2118                    and_runs: Vec::new(),
2119                    or_groups: vec![group],
2120                }
2121            }
2122        }
2123        HirKind::Repetition(repetition) => {
2124            if repetition.min == 0 {
2125                QueryBuild::default()
2126            } else {
2127                build_query(&repetition.sub)
2128            }
2129        }
2130        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2131    }
2132}
2133
2134fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2135    match hir.kind() {
2136        HirKind::Literal(literal) => {
2137            if literal.0.len() >= 3 {
2138                Some(vec![literal.0.to_vec()])
2139            } else {
2140                None
2141            }
2142        }
2143        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2144        HirKind::Concat(parts) => {
2145            let mut runs = Vec::new();
2146            for part in parts {
2147                if let Some(mut part_runs) = guaranteed_run_choices(part) {
2148                    runs.append(&mut part_runs);
2149                }
2150            }
2151            if runs.is_empty() {
2152                None
2153            } else {
2154                Some(runs)
2155            }
2156        }
2157        HirKind::Alternation(parts) => {
2158            let mut runs = Vec::new();
2159            for part in parts {
2160                let Some(mut part_runs) = guaranteed_run_choices(part) else {
2161                    return None;
2162                };
2163                runs.append(&mut part_runs);
2164            }
2165            if runs.is_empty() {
2166                None
2167            } else {
2168                Some(runs)
2169            }
2170        }
2171        HirKind::Repetition(repetition) => {
2172            if repetition.min == 0 {
2173                None
2174            } else {
2175                guaranteed_run_choices(&repetition.sub)
2176            }
2177        }
2178        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2179    }
2180}
2181
2182fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2183    for (trigram, filter) in trigram_filters(run) {
2184        if !query.and_trigrams.contains(&trigram) {
2185            query.and_trigrams.push(trigram);
2186        }
2187        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2188    }
2189}
2190
2191fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2192    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2193    for (trigram, next_char, position) in extract_trigrams(run) {
2194        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
2195        if next_char != EOF_SENTINEL {
2196            entry.next_mask |= mask_for_next_char(next_char);
2197        }
2198        entry.loc_mask |= mask_for_position(position);
2199    }
2200    filters.into_iter().collect()
2201}
2202
2203fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2204    target.next_mask |= filter.next_mask;
2205    target.loc_mask |= filter.loc_mask;
2206}
2207
2208fn mask_for_next_char(next_char: u8) -> u8 {
2209    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2210    1u8 << bit
2211}
2212
2213fn mask_for_position(position: usize) -> u8 {
2214    1u8 << (position % 8)
2215}
2216
2217fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2218    if patterns.is_empty() {
2219        return Ok(None);
2220    }
2221
2222    let mut builder = GlobSetBuilder::new();
2223    for pattern in patterns {
2224        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2225        builder.add(glob);
2226    }
2227    builder.build().map(Some).map_err(|error| error.to_string())
2228}
2229
2230fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
2231    let mut buffer = [0u8; 4];
2232    reader.read_exact(&mut buffer)?;
2233    Ok(u32::from_le_bytes(buffer))
2234}
2235
2236fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
2237    let mut buffer = [0u8; 8];
2238    reader.read_exact(&mut buffer)?;
2239    Ok(u64::from_le_bytes(buffer))
2240}
2241
2242fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
2243    writer.write_all(&value.to_le_bytes())
2244}
2245
2246fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
2247    writer.write_all(&value.to_le_bytes())
2248}
2249
2250fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2251    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2252        return Err(std::io::Error::other("search index checksum missing"));
2253    };
2254    let expected = u32::from_le_bytes(*stored);
2255    let actual = crc32fast::hash(body);
2256    if actual != expected {
2257        return Err(std::io::Error::other("search index checksum mismatch"));
2258    }
2259    Ok(())
2260}
2261
2262fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2263    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2264    total_len.checked_sub(pos)
2265}
2266
2267fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2268    let output = Command::new("git")
2269        .arg("-C")
2270        .arg(root)
2271        .args(args)
2272        .output()
2273        .ok()?;
2274    if !output.status.success() {
2275        return None;
2276    }
2277    let value = String::from_utf8(output.stdout).ok()?;
2278    let value = value.trim().to_string();
2279    if value.is_empty() {
2280        None
2281    } else {
2282        Some(value)
2283    }
2284}
2285
2286fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2287    let diff_range = format!("{}..{}", from, to);
2288    let output = match Command::new("git")
2289        .arg("-C")
2290        .arg(root)
2291        .args(["diff", "--name-status", "-M", &diff_range])
2292        .output()
2293    {
2294        Ok(output) => output,
2295        Err(_) => return false,
2296    };
2297
2298    if !output.status.success() {
2299        return false;
2300    }
2301
2302    let Ok(diff) = String::from_utf8(output.stdout) else {
2303        return false;
2304    };
2305
2306    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2307        let mut fields = line.split('\t');
2308        let Some(status) = fields.next() else {
2309            continue;
2310        };
2311
2312        if status.starts_with('R') {
2313            let Some(old_path) = fields
2314                .next()
2315                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2316            else {
2317                continue;
2318            };
2319            let Some(new_path) = fields
2320                .next()
2321                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2322            else {
2323                continue;
2324            };
2325            index.remove_file(&old_path);
2326            index.update_file(&new_path);
2327            continue;
2328        }
2329
2330        let Some(path) = fields
2331            .next()
2332            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2333        else {
2334            continue;
2335        };
2336        if status.starts_with('D') || !path.exists() {
2337            index.remove_file(&path);
2338        } else {
2339            index.update_file(&path);
2340        }
2341    }
2342
2343    true
2344}
2345
2346fn is_binary_path(path: &Path, size: u64) -> bool {
2347    if size == 0 {
2348        return false;
2349    }
2350
2351    let mut file = match File::open(path) {
2352        Ok(file) => file,
2353        Err(_) => return true,
2354    };
2355
2356    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2357    match file.read(&mut preview) {
2358        Ok(read) => is_binary_bytes(&preview[..read]),
2359        Err(_) => true,
2360    }
2361}
2362
2363fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2364    let mut starts = vec![0usize];
2365    for (index, byte) in content.iter().copied().enumerate() {
2366        if byte == b'\n' {
2367            starts.push(index + 1);
2368        }
2369    }
2370    starts
2371}
2372
2373fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2374    let line_index = match line_starts.binary_search(&offset) {
2375        Ok(index) => index,
2376        Err(index) => index.saturating_sub(1),
2377    };
2378    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2379    let line_end = content[line_start..]
2380        .iter()
2381        .position(|byte| *byte == b'\n')
2382        .map(|length| line_start + length)
2383        .unwrap_or(content.len());
2384    let mut line_slice = &content[line_start..line_end];
2385    if line_slice.ends_with(b"\r") {
2386        line_slice = &line_slice[..line_slice.len() - 1];
2387    }
2388    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2389    let column = String::from_utf8_lossy(&content[line_start..offset])
2390        .chars()
2391        .count() as u32
2392        + 1;
2393    (line_index as u32 + 1, column, line_text)
2394}
2395
2396fn to_glob_path(path: &Path) -> String {
2397    path.to_string_lossy().replace('\\', "/")
2398}
2399
2400#[cfg(test)]
2401mod tests {
2402    use std::process::Command;
2403
2404    use super::*;
2405
2406    #[test]
2407    fn cached_path_under_root_allows_missing_lexical_child() {
2408        let dir = tempfile::tempdir().expect("create temp dir");
2409        let project = dir.path().join("project");
2410        fs::create_dir_all(&project).expect("create project dir");
2411        let root = fs::canonicalize(&project).expect("canonicalize project");
2412
2413        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2414            .expect("missing child should fall back to lexical validation");
2415
2416        assert_eq!(path, root.join("future/file.rs"));
2417    }
2418
2419    #[cfg(unix)]
2420    #[test]
2421    fn cached_path_under_root_rejects_symlink_escape() {
2422        let dir = tempfile::tempdir().expect("create temp dir");
2423        let project = dir.path().join("project");
2424        let outside = dir.path().join("outside");
2425        fs::create_dir_all(&project).expect("create project dir");
2426        fs::create_dir_all(&outside).expect("create outside dir");
2427        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2428        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2429        let root = fs::canonicalize(&project).expect("canonicalize project");
2430
2431        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2432    }
2433
2434    #[test]
2435    fn extract_trigrams_tracks_next_char_and_position() {
2436        let trigrams = extract_trigrams(b"Rust");
2437        assert_eq!(trigrams.len(), 2);
2438        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2439        assert_eq!(
2440            trigrams[1],
2441            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2442        );
2443    }
2444
2445    #[test]
2446    fn decompose_regex_extracts_literals_and_alternations() {
2447        let query = decompose_regex("abc(def|ghi)xyz");
2448        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2449        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2450        assert_eq!(query.or_groups.len(), 1);
2451        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2452        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2453    }
2454
2455    #[test]
2456    fn candidates_intersect_posting_lists() {
2457        let mut index = SearchIndex::new();
2458        let dir = tempfile::tempdir().expect("create temp dir");
2459        let alpha = dir.path().join("alpha.txt");
2460        let beta = dir.path().join("beta.txt");
2461        fs::write(&alpha, "abcdef").expect("write alpha");
2462        fs::write(&beta, "abcxyz").expect("write beta");
2463        index.project_root = dir.path().to_path_buf();
2464        index.index_file(&alpha, b"abcdef");
2465        index.index_file(&beta, b"abcxyz");
2466
2467        let query = RegexQuery {
2468            and_trigrams: vec![
2469                pack_trigram(b'a', b'b', b'c'),
2470                pack_trigram(b'd', b'e', b'f'),
2471            ],
2472            ..RegexQuery::default()
2473        };
2474
2475        let candidates = index.candidates(&query);
2476        assert_eq!(candidates.len(), 1);
2477        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2478    }
2479
2480    #[test]
2481    fn candidates_apply_bloom_filters() {
2482        let mut index = SearchIndex::new();
2483        let dir = tempfile::tempdir().expect("create temp dir");
2484        let file = dir.path().join("sample.txt");
2485        fs::write(&file, "abcd efgh").expect("write sample");
2486        index.project_root = dir.path().to_path_buf();
2487        index.index_file(&file, b"abcd efgh");
2488
2489        let trigram = pack_trigram(b'a', b'b', b'c');
2490        let matching_filter = PostingFilter {
2491            next_mask: mask_for_next_char(b'd'),
2492            loc_mask: mask_for_position(0),
2493        };
2494        let non_matching_filter = PostingFilter {
2495            next_mask: mask_for_next_char(b'z'),
2496            loc_mask: mask_for_position(0),
2497        };
2498
2499        assert_eq!(
2500            index
2501                .postings_for_trigram(trigram, Some(matching_filter))
2502                .len(),
2503            1
2504        );
2505        assert!(index
2506            .postings_for_trigram(trigram, Some(non_matching_filter))
2507            .is_empty());
2508    }
2509
2510    #[test]
2511    fn disk_round_trip_preserves_postings_and_files() {
2512        let dir = tempfile::tempdir().expect("create temp dir");
2513        let project = dir.path().join("project");
2514        fs::create_dir_all(&project).expect("create project dir");
2515        let file = project.join("src.txt");
2516        fs::write(&file, "abcdef").expect("write source");
2517
2518        let mut index = SearchIndex::build(&project);
2519        index.git_head = Some("deadbeef".to_string());
2520        let cache_dir = dir.path().join("cache");
2521        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2522
2523        let loaded =
2524            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2525        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2526        assert_eq!(loaded.files.len(), 1);
2527        assert_eq!(
2528            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2529            PathBuf::from("src.txt")
2530        );
2531        assert_eq!(loaded.postings.len(), index.postings.len());
2532        assert!(loaded
2533            .postings
2534            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2535    }
2536
2537    #[test]
2538    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2539        let root = PathBuf::from("/tmp/aft-project");
2540
2541        assert_eq!(
2542            cache_relative_path(&root, &root.join("src/lib.rs")),
2543            Some(PathBuf::from("src/lib.rs"))
2544        );
2545        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2546        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2547        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2548        assert_eq!(
2549            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2550            Some(root.join("src/lib.rs"))
2551        );
2552    }
2553
2554    #[test]
2555    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2556        let dir = tempfile::tempdir().expect("create temp dir");
2557        let project = dir.path().join("project");
2558        fs::create_dir_all(&project).expect("create project dir");
2559        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2560        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2561        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2562
2563        Command::new("git")
2564            .arg("init")
2565            .arg(&project)
2566            .status()
2567            .expect("git init");
2568        for args in [
2569            ["config", "user.email", "aft@example.invalid"],
2570            ["config", "user.name", "AFT Test"],
2571        ] {
2572            Command::new("git")
2573                .arg("-C")
2574                .arg(&project)
2575                .args(args)
2576                .status()
2577                .expect("git config");
2578        }
2579        Command::new("git")
2580            .arg("-C")
2581            .arg(&project)
2582            .args(["add", "."])
2583            .status()
2584            .expect("git add initial");
2585        Command::new("git")
2586            .arg("-C")
2587            .arg(&project)
2588            .args(["commit", "-m", "initial"])
2589            .status()
2590            .expect("git commit initial");
2591        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2592        let mut baseline = SearchIndex::build(&project);
2593        baseline.git_head = Some(previous.clone());
2594
2595        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2596        Command::new("git")
2597            .arg("-C")
2598            .arg(&project)
2599            .args(["add", "-A"])
2600            .status()
2601            .expect("git add rename");
2602        Command::new("git")
2603            .arg("-C")
2604            .arg(&project)
2605            .args(["commit", "-m", "rename"])
2606            .status()
2607            .expect("git commit rename");
2608        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2609
2610        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2611        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2612
2613        let refreshed = SearchIndex::rebuild_or_refresh(
2614            &project,
2615            DEFAULT_MAX_FILE_SIZE,
2616            Some(current),
2617            Some(baseline),
2618        );
2619
2620        assert!(!refreshed
2621            .path_to_id
2622            .contains_key(&canonical_project.join("old.txt")));
2623        assert!(refreshed
2624            .path_to_id
2625            .contains_key(&canonical_project.join("new.txt")));
2626        assert!(refreshed
2627            .path_to_id
2628            .contains_key(&canonical_project.join("untracked.txt")));
2629        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2630        assert_eq!(matches.matches.len(), 1);
2631    }
2632
2633    #[test]
2634    fn read_from_disk_rejects_corrupt_postings_checksum() {
2635        let dir = tempfile::tempdir().expect("create temp dir");
2636        let project = dir.path().join("project");
2637        fs::create_dir_all(&project).expect("create project dir");
2638        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2639
2640        let index = SearchIndex::build(&project);
2641        let cache_dir = dir.path().join("cache");
2642        index.write_to_disk(&cache_dir, None);
2643
2644        let cache_path = cache_dir.join("cache.bin");
2645        let mut bytes = fs::read(&cache_path).expect("read cache");
2646        let middle = bytes.len() / 2;
2647        bytes[middle] ^= 0xff;
2648        fs::write(&cache_path, bytes).expect("write corrupted cache");
2649
2650        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2651    }
2652
2653    #[test]
2654    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2655        let dir = tempfile::tempdir().expect("create temp dir");
2656        let project = dir.path().join("project");
2657        fs::create_dir_all(&project).expect("create project dir");
2658        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2659
2660        let index = SearchIndex::build(&project);
2661        let cache_dir = dir.path().join("cache");
2662        index.write_to_disk(&cache_dir, None);
2663
2664        assert!(cache_dir.join("cache.bin").is_file());
2665        assert!(fs::read_dir(&cache_dir)
2666            .expect("read cache dir")
2667            .all(|entry| !entry
2668                .expect("cache entry")
2669                .file_name()
2670                .to_string_lossy()
2671                .contains(".tmp.")));
2672    }
2673
2674    #[test]
2675    fn concurrent_search_index_writes_do_not_corrupt() {
2676        let dir = tempfile::tempdir().expect("create temp dir");
2677        let project = dir.path().join("project");
2678        fs::create_dir_all(&project).expect("create project dir");
2679        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2680        let cache_dir = dir.path().join("cache");
2681
2682        let a_project = project.clone();
2683        let a_cache = cache_dir.clone();
2684        let a = std::thread::spawn(move || {
2685            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2686            let index = SearchIndex::build(&a_project);
2687            index.write_to_disk(&a_cache, None);
2688        });
2689        let b_project = project.clone();
2690        let b_cache = cache_dir.clone();
2691        let b = std::thread::spawn(move || {
2692            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2693            let index = SearchIndex::build(&b_project);
2694            index.write_to_disk(&b_cache, None);
2695        });
2696        a.join().expect("writer a");
2697        b.join().expect("writer b");
2698
2699        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2700    }
2701
2702    #[test]
2703    fn search_index_atomic_rename_survives_partial_write() {
2704        let dir = tempfile::tempdir().expect("create temp dir");
2705        let cache_dir = dir.path().join("cache");
2706        fs::create_dir_all(&cache_dir).expect("create cache dir");
2707        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2708
2709        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2710    }
2711
2712    #[test]
2713    fn project_cache_key_includes_checkout_path() {
2714        let dir = tempfile::tempdir().expect("create temp dir");
2715        let source = dir.path().join("source");
2716        fs::create_dir_all(&source).expect("create source repo dir");
2717        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2718
2719        assert!(Command::new("git")
2720            .current_dir(&source)
2721            .args(["init"])
2722            .status()
2723            .expect("init git repo")
2724            .success());
2725        assert!(Command::new("git")
2726            .current_dir(&source)
2727            .args(["add", "."])
2728            .status()
2729            .expect("git add")
2730            .success());
2731        assert!(Command::new("git")
2732            .current_dir(&source)
2733            .args([
2734                "-c",
2735                "user.name=AFT Tests",
2736                "-c",
2737                "user.email=aft-tests@example.com",
2738                "commit",
2739                "-m",
2740                "initial",
2741            ])
2742            .status()
2743            .expect("git commit")
2744            .success());
2745
2746        let clone = dir.path().join("clone");
2747        assert!(Command::new("git")
2748            .args(["clone", "--quiet"])
2749            .arg(&source)
2750            .arg(&clone)
2751            .status()
2752            .expect("git clone")
2753            .success());
2754
2755        let source_key = project_cache_key(&source);
2756        let clone_key = project_cache_key(&clone);
2757
2758        assert_eq!(source_key.len(), 16);
2759        assert_eq!(clone_key.len(), 16);
2760        // Same repo (same root commit) → same cache key regardless of clone path
2761        assert_eq!(source_key, clone_key);
2762    }
2763
2764    #[test]
2765    fn git_head_unchanged_picks_up_local_edits() {
2766        let dir = tempfile::tempdir().expect("create temp dir");
2767        let project = dir.path().join("repo");
2768        fs::create_dir_all(&project).expect("create repo dir");
2769        let file = project.join("tracked.txt");
2770        fs::write(&file, "oldtoken\n").expect("write file");
2771        assert!(Command::new("git")
2772            .current_dir(&project)
2773            .arg("init")
2774            .status()
2775            .unwrap()
2776            .success());
2777        assert!(Command::new("git")
2778            .current_dir(&project)
2779            .args(["add", "."])
2780            .status()
2781            .unwrap()
2782            .success());
2783        assert!(Command::new("git")
2784            .current_dir(&project)
2785            .args([
2786                "-c",
2787                "user.name=AFT Tests",
2788                "-c",
2789                "user.email=aft-tests@example.com",
2790                "commit",
2791                "-m",
2792                "initial"
2793            ])
2794            .status()
2795            .unwrap()
2796            .success());
2797        let head = current_git_head(&project);
2798        let mut baseline = SearchIndex::build(&project);
2799        baseline.git_head = head.clone();
2800        fs::write(&file, "newtoken\n").expect("edit tracked file");
2801
2802        let refreshed =
2803            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2804        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
2805
2806        assert_eq!(result.total_matches, 1);
2807    }
2808
2809    #[test]
2810    fn non_git_project_reuses_cache_when_files_unchanged() {
2811        let dir = tempfile::tempdir().expect("create temp dir");
2812        let project = dir.path().join("project");
2813        fs::create_dir_all(&project).expect("create project dir");
2814        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2815        let baseline = SearchIndex::build(&project);
2816        let baseline_file_count = baseline.file_count();
2817
2818        let refreshed =
2819            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2820
2821        assert_eq!(refreshed.file_count(), baseline_file_count);
2822        assert_eq!(
2823            refreshed
2824                .grep("unchangedtoken", true, &[], &[], &project, 10)
2825                .total_matches,
2826            1
2827        );
2828    }
2829
2830    #[test]
2831    fn resolve_search_scope_disables_index_for_external_path() {
2832        let dir = tempfile::tempdir().expect("create temp dir");
2833        let project = dir.path().join("project");
2834        let outside = dir.path().join("outside");
2835        fs::create_dir_all(&project).expect("create project dir");
2836        fs::create_dir_all(&outside).expect("create outside dir");
2837
2838        let scope = resolve_search_scope(&project, outside.to_str());
2839
2840        assert_eq!(
2841            scope.root,
2842            fs::canonicalize(&outside).expect("canonicalize outside")
2843        );
2844        assert!(!scope.use_index);
2845    }
2846
2847    #[test]
2848    fn grep_filters_matches_to_search_root() {
2849        let dir = tempfile::tempdir().expect("create temp dir");
2850        let project = dir.path().join("project");
2851        let src = project.join("src");
2852        let docs = project.join("docs");
2853        fs::create_dir_all(&src).expect("create src dir");
2854        fs::create_dir_all(&docs).expect("create docs dir");
2855        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2856        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2857
2858        let index = SearchIndex::build(&project);
2859        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2860
2861        assert_eq!(result.files_searched, 1);
2862        assert_eq!(result.files_with_matches, 1);
2863        assert_eq!(result.matches.len(), 1);
2864        // Index stores canonicalized paths; on macOS /var → /private/var
2865        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2866        assert_eq!(result.matches[0].file, expected);
2867    }
2868
2869    #[test]
2870    fn grep_deduplicates_multiple_matches_on_same_line() {
2871        let dir = tempfile::tempdir().expect("create temp dir");
2872        let project = dir.path().join("project");
2873        let src = project.join("src");
2874        fs::create_dir_all(&src).expect("create src dir");
2875        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2876
2877        let index = SearchIndex::build(&project);
2878        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2879
2880        assert_eq!(result.total_matches, 1);
2881        assert_eq!(result.matches.len(), 1);
2882    }
2883
2884    #[test]
2885    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
2886        let dir = tempfile::tempdir().expect("create temp dir");
2887        let project = dir.path().join("project");
2888        fs::create_dir_all(&project).expect("create project dir");
2889        let file = project.join("unicode.txt");
2890        fs::write(&file, "äbc\n").expect("write unicode file");
2891
2892        let index = SearchIndex::build(&project);
2893        let result = index.grep("Äbc", false, &[], &[], &project, 10);
2894
2895        assert_eq!(result.total_matches, 1);
2896        assert_eq!(result.matches.len(), 1);
2897        assert_eq!(
2898            result.matches[0].file,
2899            fs::canonicalize(file).expect("canonicalize unicode file")
2900        );
2901    }
2902
2903    #[test]
2904    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
2905        let dir = tempfile::tempdir().expect("create temp dir");
2906        let project = dir.path().join("project");
2907        fs::create_dir_all(&project).expect("create project dir");
2908        let file = project.join("tokens.txt");
2909        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
2910        fs::write(&file, "alpha").expect("write original file");
2911        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
2912
2913        let baseline = SearchIndex::build(&project);
2914        fs::write(&file, "bravo").expect("write same-size edit");
2915        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
2916
2917        let refreshed =
2918            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2919        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
2920        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
2921        let refreshed_id = *refreshed
2922            .path_to_id
2923            .get(&canonical_file)
2924            .expect("file remains indexed");
2925
2926        assert_eq!(result.total_matches, 1);
2927        assert!(refreshed
2928            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
2929            .contains(&refreshed_id));
2930        assert!(!refreshed
2931            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
2932            .contains(&refreshed_id));
2933    }
2934
2935    #[test]
2936    fn grep_reports_total_matches_before_truncation() {
2937        let dir = tempfile::tempdir().expect("create temp dir");
2938        let project = dir.path().join("project");
2939        let src = project.join("src");
2940        fs::create_dir_all(&src).expect("create src dir");
2941        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2942
2943        let index = SearchIndex::build(&project);
2944        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
2945
2946        assert_eq!(result.total_matches, 2);
2947        assert_eq!(result.matches.len(), 1);
2948        assert!(result.truncated);
2949    }
2950
2951    #[test]
2952    fn glob_filters_results_to_search_root() {
2953        let dir = tempfile::tempdir().expect("create temp dir");
2954        let project = dir.path().join("project");
2955        let src = project.join("src");
2956        let scripts = project.join("scripts");
2957        fs::create_dir_all(&src).expect("create src dir");
2958        fs::create_dir_all(&scripts).expect("create scripts dir");
2959        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2960        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2961
2962        let index = SearchIndex::build(&project);
2963        let files = index.glob("**/*.rs", &src);
2964
2965        assert_eq!(
2966            files,
2967            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2968        );
2969    }
2970
2971    #[test]
2972    fn glob_includes_hidden_and_binary_files() {
2973        let dir = tempfile::tempdir().expect("create temp dir");
2974        let project = dir.path().join("project");
2975        let hidden_dir = project.join(".hidden");
2976        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2977        let hidden_file = hidden_dir.join("data.bin");
2978        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2979
2980        let index = SearchIndex::build(&project);
2981        let files = index.glob("**/*.bin", &project);
2982
2983        assert_eq!(
2984            files,
2985            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2986        );
2987    }
2988
2989    #[test]
2990    fn read_from_disk_rejects_invalid_nanos() {
2991        let dir = tempfile::tempdir().expect("create temp dir");
2992        let cache_dir = dir.path().join("cache");
2993        fs::create_dir_all(&cache_dir).expect("create cache dir");
2994
2995        let mut postings = Vec::new();
2996        postings.extend_from_slice(INDEX_MAGIC);
2997        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2998        postings.extend_from_slice(&0u32.to_le_bytes());
2999        postings.extend_from_slice(&1u32.to_le_bytes());
3000        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
3001        postings.extend_from_slice(&1u32.to_le_bytes());
3002        postings.extend_from_slice(b"/");
3003        postings.push(0u8);
3004        postings.extend_from_slice(&1u32.to_le_bytes());
3005        postings.extend_from_slice(&0u64.to_le_bytes());
3006        postings.extend_from_slice(&0u64.to_le_bytes());
3007        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
3008        postings.extend_from_slice(b"a");
3009        postings.extend_from_slice(&0u64.to_le_bytes());
3010
3011        let mut lookup = Vec::new();
3012        lookup.extend_from_slice(LOOKUP_MAGIC);
3013        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3014        lookup.extend_from_slice(&0u32.to_le_bytes());
3015
3016        let postings_checksum = crc32fast::hash(&postings);
3017        postings.extend_from_slice(&postings_checksum.to_le_bytes());
3018        let lookup_checksum = crc32fast::hash(&lookup);
3019        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
3020        let mut cache = Vec::new();
3021        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
3022        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3023        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
3024        cache.extend_from_slice(&postings);
3025        cache.extend_from_slice(&lookup);
3026        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
3027
3028        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
3029    }
3030
3031    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
3032    /// changed between cmp() calls.
3033    ///
3034    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
3035    /// which does a `stat()` syscall. If the file was deleted, modified, or
3036    /// touched mid-sort, the comparator returned different values for the
3037    /// same input pair on different invocations. Rust's slice::sort detects
3038    /// this and panics with "user-provided comparison function does not
3039    /// correctly implement a total order".
3040    ///
3041    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
3042    /// bridge invalidated files in parallel with grep's sort path. This
3043    /// test simulates the worst case: most paths don't exist (Err from
3044    /// fs::metadata) and sort still completes successfully.
3045    #[test]
3046    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
3047        // Mix of existing and non-existing paths in deliberately
3048        // non-monotonic order — pre-fix, the sort would call stat() at
3049        // least N log N times and any flakiness would trigger the panic.
3050        let dir = tempfile::tempdir().expect("create tempdir");
3051        let mut paths: Vec<PathBuf> = Vec::new();
3052        for i in 0..30 {
3053            // Half exist, half don't.
3054            let path = if i % 2 == 0 {
3055                let p = dir.path().join(format!("real-{i}.rs"));
3056                fs::write(&p, format!("// {i}\n")).expect("write");
3057                p
3058            } else {
3059                dir.path().join(format!("missing-{i}.rs"))
3060            };
3061            paths.push(path);
3062        }
3063
3064        // Run the sort many times to maximise the chance of catching any
3065        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
3066        for _ in 0..50 {
3067            let mut copy = paths.clone();
3068            sort_paths_by_mtime_desc(&mut copy);
3069            assert_eq!(copy.len(), paths.len());
3070        }
3071    }
3072
3073    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
3074    /// the same conditions as sort_paths_by_mtime_desc. See the
3075    /// sort_paths_... test above for the full rationale.
3076    #[test]
3077    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
3078        let dir = tempfile::tempdir().expect("create tempdir");
3079        let mut matches: Vec<GrepMatch> = Vec::new();
3080        for i in 0..30 {
3081            let file = if i % 2 == 0 {
3082                let p = dir.path().join(format!("real-{i}.rs"));
3083                fs::write(&p, format!("// {i}\n")).expect("write");
3084                p
3085            } else {
3086                dir.path().join(format!("missing-{i}.rs"))
3087            };
3088            matches.push(GrepMatch {
3089                file,
3090                line: u32::try_from(i).unwrap_or(0),
3091                column: 0,
3092                line_text: format!("match {i}"),
3093                match_text: format!("match {i}"),
3094            });
3095        }
3096
3097        for _ in 0..50 {
3098            let mut copy = matches.clone();
3099            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
3100            assert_eq!(copy.len(), matches.len());
3101        }
3102    }
3103}