aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const INDEX_VERSION: u32 = 4;
27const PREVIEW_BYTES: usize = 8 * 1024;
28const EOF_SENTINEL: u8 = 0;
29const MAX_ENTRIES: usize = 10_000_000;
30const MIN_FILE_ENTRY_BYTES: usize = 57;
31const LOOKUP_ENTRY_BYTES: usize = 16;
32const POSTING_BYTES: usize = 6;
33static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
35pub struct CacheLock {
36    _guard: fs_lock::LockGuard,
37}
38
39impl CacheLock {
40    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41        fs::create_dir_all(cache_dir)?;
42        let path = cache_dir.join("cache.lock");
43        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44            .lock()
45            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46        fs_lock::try_acquire(&path, Duration::from_secs(2))
47            .map(|guard| Self { _guard: guard })
48            .map_err(|error| match error {
49                fs_lock::AcquireError::Timeout => {
50                    std::io::Error::other("timed out acquiring search cache lock")
51                }
52                fs_lock::AcquireError::Io(error) => error,
53            })
54    }
55}
56
57#[derive(Clone, Debug)]
58pub struct SearchIndex {
59    pub postings: HashMap<u32, Vec<Posting>>,
60    pub files: Vec<FileEntry>,
61    pub path_to_id: HashMap<PathBuf, u32>,
62    pub ready: bool,
63    project_root: PathBuf,
64    git_head: Option<String>,
65    max_file_size: u64,
66    ignore_rules_fingerprint: String,
67    pub file_trigrams: HashMap<u32, Vec<u32>>,
68    unindexed_files: HashSet<u32>,
69}
70
71#[derive(Clone, Debug, Default)]
72pub struct LexicalRankResult {
73    pub files: Vec<(PathBuf, f32)>,
74    pub engine_capped: bool,
75}
76
77impl SearchIndex {
78    /// Number of indexed files.
79    pub fn file_count(&self) -> usize {
80        self.files.len()
81    }
82
83    /// Number of unique trigrams in the index.
84    pub fn trigram_count(&self) -> usize {
85        self.postings.len()
86    }
87
88    /// Compute distinct query trigrams from literal tokens.
89    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
90        query_trigrams_from_tokens(tokens)
91    }
92
93    /// Score-rank file candidates by lexical relevance to query trigrams.
94    pub fn lexical_rank(
95        &self,
96        query_trigrams: &[u32],
97        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
98        max_files: usize,
99    ) -> Vec<(PathBuf, f32)> {
100        self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
101            .files
102    }
103
104    /// Score-rank file candidates and report whether pre-filter candidate
105    /// enumeration hit the internal 200/500 cap before ranking.
106    pub fn lexical_rank_with_stats(
107        &self,
108        query_trigrams: &[u32],
109        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
110        max_files: usize,
111    ) -> LexicalRankResult {
112        if query_trigrams.is_empty() || max_files == 0 {
113            return LexicalRankResult::default();
114        }
115
116        let mut non_zero: Vec<(u32, usize)> = query_trigrams
117            .iter()
118            .filter_map(|trigram| {
119                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
120                (posting_count > 0).then_some((*trigram, posting_count))
121            })
122            .collect();
123        if non_zero.is_empty() {
124            return LexicalRankResult::default();
125        }
126
127        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
128        let selected_count = non_zero.len().min(3);
129        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
130
131        let mut candidate_ids = BTreeSet::new();
132        for (trigram, _) in non_zero.iter().take(selected_count) {
133            if let Some(postings) = self.postings.get(trigram) {
134                for posting in postings {
135                    if self.is_active_file(posting.file_id) {
136                        candidate_ids.insert(posting.file_id);
137                    }
138                }
139            }
140        }
141        let pre_filter_candidate_count = candidate_ids.len();
142        let engine_capped = pre_filter_candidate_count > candidate_cap;
143        let filtered_candidates = candidate_ids
144            .into_iter()
145            .filter_map(|file_id| {
146                self.files
147                    .get(file_id as usize)
148                    .map(|entry| (file_id, entry))
149            })
150            .filter(|(_, entry)| {
151                if let Some(filter) = candidate_filter {
152                    filter(&entry.path)
153                } else {
154                    true
155                }
156            })
157            .collect::<Vec<_>>();
158
159        let mut ranked = Vec::new();
160        for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
161            let score = lexical_score(self, query_trigrams, file_id);
162            if score > 0.0 {
163                ranked.push((entry.path.clone(), score));
164            }
165        }
166
167        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
168        ranked.truncate(max_files);
169        LexicalRankResult {
170            files: ranked,
171            engine_capped,
172        }
173    }
174}
175
176#[derive(Clone, Debug, PartialEq, Eq)]
177pub struct Posting {
178    pub file_id: u32,
179    pub next_mask: u8,
180    pub loc_mask: u8,
181}
182
183#[derive(Clone, Debug)]
184pub struct FileEntry {
185    pub path: PathBuf,
186    pub size: u64,
187    pub modified: SystemTime,
188    pub content_hash: blake3::Hash,
189}
190
191#[derive(Clone, Debug, PartialEq, Eq)]
192pub struct GrepMatch {
193    pub file: PathBuf,
194    pub line: u32,
195    pub column: u32,
196    pub line_text: String,
197    pub match_text: String,
198}
199
200#[derive(Clone, Debug)]
201pub struct GrepResult {
202    pub matches: Vec<GrepMatch>,
203    pub total_matches: usize,
204    pub files_searched: usize,
205    pub files_with_matches: usize,
206    pub index_status: IndexStatus,
207    pub truncated: bool,
208    pub fully_degraded: bool,
209    pub engine_capped: bool,
210}
211
212#[derive(Clone, Copy, Debug, PartialEq, Eq)]
213pub enum IndexStatus {
214    Ready,
215    Building,
216    Fallback,
217    Disabled,
218}
219
220impl IndexStatus {
221    pub fn as_str(&self) -> &'static str {
222        match self {
223            IndexStatus::Ready => "Ready",
224            IndexStatus::Building => "Building",
225            IndexStatus::Fallback => "Fallback",
226            IndexStatus::Disabled => "Disabled",
227        }
228    }
229}
230
231#[derive(Clone, Debug, Default)]
232pub struct RegexQuery {
233    pub and_trigrams: Vec<u32>,
234    pub or_groups: Vec<Vec<u32>>,
235    pub(crate) and_filters: HashMap<u32, PostingFilter>,
236    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
237}
238
239#[derive(Clone, Copy, Debug, Default)]
240pub(crate) struct PostingFilter {
241    next_mask: u8,
242    loc_mask: u8,
243}
244
245#[derive(Clone, Debug, Default)]
246struct QueryBuild {
247    and_runs: Vec<Vec<u8>>,
248    or_groups: Vec<Vec<Vec<u8>>>,
249}
250
251#[derive(Clone, Debug, Default)]
252pub(crate) struct PathFilters {
253    includes: Option<GlobSet>,
254    excludes: Option<GlobSet>,
255}
256
257#[derive(Clone, Debug)]
258pub(crate) struct SearchScope {
259    pub root: PathBuf,
260    pub use_index: bool,
261}
262
263#[derive(Clone, Debug)]
264struct SharedGrepMatch {
265    file: Arc<PathBuf>,
266    line: u32,
267    column: u32,
268    line_text: String,
269    match_text: String,
270}
271
272#[derive(Clone, Debug)]
273enum SearchMatcher {
274    Literal(LiteralSearch),
275    Regex(Regex),
276}
277
278impl SearchIndex {
279    pub fn new() -> Self {
280        SearchIndex {
281            postings: HashMap::new(),
282            files: Vec::new(),
283            path_to_id: HashMap::new(),
284            ready: false,
285            project_root: PathBuf::new(),
286            git_head: None,
287            max_file_size: DEFAULT_MAX_FILE_SIZE,
288            ignore_rules_fingerprint: String::new(),
289            file_trigrams: HashMap::new(),
290            unindexed_files: HashSet::new(),
291        }
292    }
293
294    pub fn build(root: &Path) -> Self {
295        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
296    }
297
298    pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
299        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
300        let mut index = SearchIndex {
301            project_root: project_root.clone(),
302            max_file_size,
303            ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
304            ..SearchIndex::new()
305        };
306
307        let filters = PathFilters::default();
308        for path in walk_project_files(&project_root, &filters) {
309            index.update_file(&path);
310        }
311
312        index.git_head = current_git_head(&project_root);
313        index.ready = true;
314        index
315    }
316
317    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
318        self.remove_file(path);
319
320        let file_id = match self.allocate_file_id(path, content.len() as u64) {
321            Some(file_id) => file_id,
322            None => return,
323        };
324        if let Some(file) = self.files.get_mut(file_id as usize) {
325            file.content_hash = cache_freshness::hash_bytes(content);
326        }
327
328        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
329        for (trigram, next_char, position) in extract_trigrams(content) {
330            let entry = trigram_map.entry(trigram).or_default();
331            entry.next_mask |= mask_for_next_char(next_char);
332            entry.loc_mask |= mask_for_position(position);
333        }
334
335        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
336        for (trigram, filter) in trigram_map {
337            let postings = self.postings.entry(trigram).or_default();
338            postings.push(Posting {
339                file_id,
340                next_mask: filter.next_mask,
341                loc_mask: filter.loc_mask,
342            });
343            // Posting lists are kept sorted by file_id for binary search during
344            // intersection. Since file_ids are allocated incrementally, the new
345            // entry is usually already in order. Only sort when needed.
346            if postings.len() > 1
347                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
348            {
349                postings.sort_unstable_by_key(|p| p.file_id);
350            }
351            file_trigrams.push(trigram);
352        }
353
354        self.file_trigrams.insert(file_id, file_trigrams);
355        self.unindexed_files.remove(&file_id);
356    }
357
358    pub fn remove_file(&mut self, path: &Path) {
359        let canonical_path = canonicalize_existing_or_deleted_path(path);
360        let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
361            file_id
362        } else if canonical_path.as_path() != path {
363            let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
364                return;
365            };
366            file_id
367        } else {
368            return;
369        };
370
371        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
372            for trigram in trigrams {
373                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
374                    postings.retain(|posting| posting.file_id != file_id);
375                    postings.is_empty()
376                } else {
377                    false
378                };
379
380                if should_remove {
381                    self.postings.remove(&trigram);
382                }
383            }
384        }
385
386        self.unindexed_files.remove(&file_id);
387        if let Some(file) = self.files.get_mut(file_id as usize) {
388            file.path = PathBuf::new();
389            file.size = 0;
390            file.modified = UNIX_EPOCH;
391            file.content_hash = cache_freshness::zero_hash();
392        }
393    }
394
395    pub fn update_file(&mut self, path: &Path) {
396        self.remove_file(path);
397
398        let metadata = match fs::metadata(path) {
399            Ok(metadata) if metadata.is_file() => metadata,
400            _ => return,
401        };
402
403        if is_binary_path(path, metadata.len()) {
404            self.track_unindexed_file(path, &metadata);
405            return;
406        }
407
408        if metadata.len() > self.max_file_size {
409            self.track_unindexed_file(path, &metadata);
410            return;
411        }
412
413        let content = match fs::read(path) {
414            Ok(content) => content,
415            Err(_) => return,
416        };
417
418        if is_binary_bytes(&content) {
419            self.track_unindexed_file(path, &metadata);
420            return;
421        }
422
423        self.index_file(path, &content);
424    }
425
426    pub fn grep(
427        &self,
428        pattern: &str,
429        case_sensitive: bool,
430        include: &[String],
431        exclude: &[String],
432        search_root: &Path,
433        max_results: usize,
434    ) -> GrepResult {
435        match pattern_compile::compile(
436            pattern,
437            CompileOpts {
438                case_insensitive: !case_sensitive,
439                ..CompileOpts::default()
440            },
441        ) {
442            CompileResult::Ok(compiled) => {
443                self.search_grep(&compiled, include, exclude, search_root, max_results)
444            }
445            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
446                self.empty_grep_result()
447            }
448        }
449    }
450
451    pub fn search_grep(
452        &self,
453        pattern: &CompiledPattern,
454        include: &[String],
455        exclude: &[String],
456        search_root: &Path,
457        max_results: usize,
458    ) -> GrepResult {
459        let matcher = match pattern {
460            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
461            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
462        };
463
464        let filters = match build_path_filters(include, exclude) {
465            Ok(filters) => filters,
466            Err(_) => PathFilters::default(),
467        };
468        let search_root = canonicalize_or_normalize(search_root);
469
470        let raw_pattern = pattern.raw_pattern_for_trigrams();
471        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
472            RegexQuery::default()
473        } else {
474            decompose_regex(&raw_pattern)
475        };
476        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
477        let candidate_ids = self.candidates(&query);
478
479        let candidate_files: Vec<&FileEntry> = candidate_ids
480            .into_iter()
481            .filter_map(|file_id| self.files.get(file_id as usize))
482            .filter(|file| !file.path.as_os_str().is_empty())
483            .filter(|file| is_within_search_root(&search_root, &file.path))
484            .filter(|file| filters.matches(&self.project_root, &file.path))
485            .collect();
486
487        let total_matches = AtomicUsize::new(0);
488        let files_searched = AtomicUsize::new(0);
489        let files_with_matches = AtomicUsize::new(0);
490        let truncated = AtomicBool::new(false);
491        let engine_capped = AtomicBool::new(false);
492        let stop_after = max_results.saturating_mul(2);
493
494        let mut matches = if candidate_files.len() > 10 {
495            candidate_files
496                .par_iter()
497                .map(|file| {
498                    search_candidate_file(
499                        file,
500                        &matcher,
501                        max_results,
502                        stop_after,
503                        &total_matches,
504                        &files_searched,
505                        &files_with_matches,
506                        &truncated,
507                        &engine_capped,
508                    )
509                })
510                .reduce(Vec::new, |mut left, mut right| {
511                    left.append(&mut right);
512                    left
513                })
514        } else {
515            let mut matches = Vec::new();
516            for file in candidate_files {
517                matches.extend(search_candidate_file(
518                    file,
519                    &matcher,
520                    max_results,
521                    stop_after,
522                    &total_matches,
523                    &files_searched,
524                    &files_with_matches,
525                    &truncated,
526                    &engine_capped,
527                ));
528
529                if should_stop_search(&truncated, &total_matches, stop_after) {
530                    engine_capped.store(true, Ordering::Relaxed);
531                    break;
532                }
533            }
534            matches
535        };
536
537        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
538            self.path_to_id
539                .get(path)
540                .and_then(|file_id| self.files.get(*file_id as usize))
541                .map(|file| file.modified)
542        });
543
544        let matches = matches
545            .into_iter()
546            .map(|matched| GrepMatch {
547                file: matched.file.as_ref().clone(),
548                line: matched.line,
549                column: matched.column,
550                line_text: matched.line_text,
551                match_text: matched.match_text,
552            })
553            .collect();
554
555        GrepResult {
556            total_matches: total_matches.load(Ordering::Relaxed),
557            matches,
558            files_searched: files_searched.load(Ordering::Relaxed),
559            files_with_matches: files_with_matches.load(Ordering::Relaxed),
560            index_status: if self.ready {
561                IndexStatus::Ready
562            } else {
563                IndexStatus::Building
564            },
565            truncated: truncated.load(Ordering::Relaxed),
566            fully_degraded,
567            engine_capped: engine_capped.load(Ordering::Relaxed),
568        }
569    }
570
571    fn empty_grep_result(&self) -> GrepResult {
572        GrepResult {
573            matches: Vec::new(),
574            total_matches: 0,
575            files_searched: 0,
576            files_with_matches: 0,
577            index_status: if self.ready {
578                IndexStatus::Ready
579            } else {
580                IndexStatus::Building
581            },
582            truncated: false,
583            fully_degraded: false,
584            engine_capped: false,
585        }
586    }
587
588    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
589        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
590            Ok(filters) => filters,
591            Err(_) => return Vec::new(),
592        };
593        let search_root = canonicalize_or_normalize(search_root);
594        let mut entries = self
595            .files
596            .iter()
597            .filter(|file| !file.path.as_os_str().is_empty())
598            .filter(|file| is_within_search_root(&search_root, &file.path))
599            .filter(|file| filters.matches(&self.project_root, &file.path))
600            .map(|file| (file.path.clone(), file.modified))
601            .collect::<Vec<_>>();
602
603        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
604            right_mtime
605                .cmp(left_mtime)
606                .then_with(|| left_path.cmp(right_path))
607        });
608
609        entries.into_iter().map(|(path, _)| path).collect()
610    }
611
612    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
613        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
614            return self.active_file_ids();
615        }
616
617        let mut and_trigrams = query.and_trigrams.clone();
618        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
619
620        let mut current: Option<Vec<u32>> = None;
621
622        for trigram in and_trigrams {
623            let filter = query.and_filters.get(&trigram).copied();
624            let matches = self.postings_for_trigram(trigram, filter);
625            current = Some(match current.take() {
626                Some(existing) => intersect_sorted_ids(&existing, &matches),
627                None => matches,
628            });
629
630            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
631                break;
632            }
633        }
634
635        let mut current = current.unwrap_or_else(|| self.active_file_ids());
636
637        for (index, group) in query.or_groups.iter().enumerate() {
638            let mut group_matches = Vec::new();
639            let filters = query.or_filters.get(index);
640
641            for trigram in group {
642                let filter = filters.and_then(|filters| filters.get(trigram).copied());
643                let matches = self.postings_for_trigram(*trigram, filter);
644                if group_matches.is_empty() {
645                    group_matches = matches;
646                } else {
647                    group_matches = union_sorted_ids(&group_matches, &matches);
648                }
649            }
650
651            current = intersect_sorted_ids(&current, &group_matches);
652            if current.is_empty() {
653                break;
654            }
655        }
656
657        let mut unindexed = self
658            .unindexed_files
659            .iter()
660            .copied()
661            .filter(|file_id| self.is_active_file(*file_id))
662            .collect::<Vec<_>>();
663        if !unindexed.is_empty() {
664            unindexed.sort_unstable();
665            current = union_sorted_ids(&current, &unindexed);
666        }
667
668        current
669    }
670
671    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
672        if fs::create_dir_all(cache_dir).is_err() {
673            return;
674        }
675
676        let cache_path = cache_dir.join("cache.bin");
677        let tmp_cache = cache_dir.join(format!(
678            "cache.bin.tmp.{}.{}",
679            std::process::id(),
680            SystemTime::now()
681                .duration_since(UNIX_EPOCH)
682                .unwrap_or(Duration::ZERO)
683                .as_nanos()
684        ));
685
686        let active_ids = self.active_file_ids();
687        let mut id_map = HashMap::new();
688        for (new_id, old_id) in active_ids.iter().enumerate() {
689            let Ok(new_id_u32) = u32::try_from(new_id) else {
690                return;
691            };
692            id_map.insert(*old_id, new_id_u32);
693        }
694
695        let write_result = (|| -> std::io::Result<()> {
696            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
697
698            postings_writer.write_all(INDEX_MAGIC)?;
699            write_u32(&mut postings_writer, INDEX_VERSION)?;
700
701            let head = git_head.unwrap_or_default();
702            let root = self.project_root.to_string_lossy();
703            let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
704                ignore_rules_fingerprint(&self.project_root)
705            } else {
706                self.ignore_rules_fingerprint.clone()
707            };
708            let head_len = u32::try_from(head.len())
709                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
710            let root_len = u32::try_from(root.len())
711                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
712            let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
713                .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
714            let file_count = u32::try_from(active_ids.len())
715                .map_err(|_| std::io::Error::other("too many files to cache"))?;
716
717            write_u32(&mut postings_writer, head_len)?;
718            write_u32(&mut postings_writer, root_len)?;
719            write_u32(&mut postings_writer, ignore_fingerprint_len)?;
720            write_u64(&mut postings_writer, self.max_file_size)?;
721            write_u32(&mut postings_writer, file_count)?;
722            postings_writer.write_all(head.as_bytes())?;
723            postings_writer.write_all(root.as_bytes())?;
724            postings_writer.write_all(ignore_fingerprint.as_bytes())?;
725
726            for old_id in &active_ids {
727                let Some(file) = self.files.get(*old_id as usize) else {
728                    return Err(std::io::Error::other("missing file entry for cache write"));
729                };
730                let path =
731                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
732                        std::io::Error::other(format!(
733                            "refusing to cache path outside project root: {}",
734                            file.path.display()
735                        ))
736                    })?;
737                let path = path.to_string_lossy();
738                let path_len = u32::try_from(path.len())
739                    .map_err(|_| std::io::Error::other("cached path too large"))?;
740                let modified = file
741                    .modified
742                    .duration_since(UNIX_EPOCH)
743                    .unwrap_or(Duration::ZERO);
744                let unindexed = if self.unindexed_files.contains(old_id) {
745                    1u8
746                } else {
747                    0u8
748                };
749
750                postings_writer.write_all(&[unindexed])?;
751                write_u32(&mut postings_writer, path_len)?;
752                write_u64(&mut postings_writer, file.size)?;
753                write_u64(&mut postings_writer, modified.as_secs())?;
754                write_u32(&mut postings_writer, modified.subsec_nanos())?;
755                postings_writer.write_all(file.content_hash.as_bytes())?;
756                postings_writer.write_all(path.as_bytes())?;
757            }
758
759            let mut lookup_entries = Vec::new();
760            let mut postings_blob = Vec::new();
761            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
762            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
763
764            for (trigram, postings) in sorted_postings {
765                let offset = u64::try_from(postings_blob.len())
766                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
767                let mut count = 0u32;
768
769                for posting in postings {
770                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
771                        continue;
772                    };
773
774                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
775                    postings_blob.push(posting.next_mask);
776                    postings_blob.push(posting.loc_mask);
777                    count = count.saturating_add(1);
778                }
779
780                if count > 0 {
781                    lookup_entries.push((*trigram, offset, count));
782                }
783            }
784
785            write_u64(
786                &mut postings_writer,
787                u64::try_from(postings_blob.len())
788                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
789            )?;
790            postings_writer.write_all(&postings_blob)?;
791            postings_writer.flush()?;
792            let mut postings_blob_file = postings_writer
793                .into_inner()
794                .map_err(|error| std::io::Error::other(error.to_string()))?
795                .into_inner();
796            let checksum = crc32fast::hash(&postings_blob_file);
797            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
798
799            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
800            let entry_count = u32::try_from(lookup_entries.len())
801                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
802
803            lookup_writer.write_all(LOOKUP_MAGIC)?;
804            write_u32(&mut lookup_writer, INDEX_VERSION)?;
805            write_u32(&mut lookup_writer, entry_count)?;
806
807            for (trigram, offset, count) in lookup_entries {
808                write_u32(&mut lookup_writer, trigram)?;
809                write_u64(&mut lookup_writer, offset)?;
810                write_u32(&mut lookup_writer, count)?;
811            }
812
813            lookup_writer.flush()?;
814            let mut lookup_blob_file = lookup_writer
815                .into_inner()
816                .map_err(|error| std::io::Error::other(error.to_string()))?
817                .into_inner();
818            let checksum = crc32fast::hash(&lookup_blob_file);
819            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
820
821            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
822            write_u32(&mut cache_writer, CACHE_MAGIC)?;
823            write_u32(&mut cache_writer, INDEX_VERSION)?;
824            write_u64(
825                &mut cache_writer,
826                u64::try_from(postings_blob_file.len())
827                    .map_err(|_| std::io::Error::other("postings section too large"))?,
828            )?;
829            cache_writer.write_all(&postings_blob_file)?;
830            cache_writer.write_all(&lookup_blob_file)?;
831            cache_writer.flush()?;
832            cache_writer.get_ref().sync_all()?;
833            drop(cache_writer);
834            fs::rename(&tmp_cache, &cache_path)?;
835
836            Ok(())
837        })();
838
839        if write_result.is_err() {
840            let _ = fs::remove_file(&tmp_cache);
841        }
842    }
843
844    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
845        debug_assert!(current_canonical_root.is_absolute());
846        let cache_path = cache_dir.join("cache.bin");
847        let cache_bytes = fs::read(&cache_path).ok()?;
848        if cache_bytes.len() < 16 {
849            return None;
850        }
851        let mut header = Cursor::new(&cache_bytes);
852        if read_u32(&mut header).ok()? != CACHE_MAGIC {
853            return None;
854        }
855        if read_u32(&mut header).ok()? != INDEX_VERSION {
856            return None;
857        }
858        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
859        let start = usize::try_from(header.position()).ok()?;
860        let postings_end = start.checked_add(postings_len_total)?;
861        if postings_end > cache_bytes.len() {
862            return None;
863        }
864        let postings_bytes = &cache_bytes[start..postings_end];
865        let lookup_bytes = &cache_bytes[postings_end..];
866        let lookup_len_total = lookup_bytes.len();
867        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
868        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
869        if postings_len_total < 4 || lookup_len_total < 4 {
870            return None;
871        }
872        verify_crc32_bytes_slice(postings_bytes).ok()?;
873        verify_crc32_bytes_slice(lookup_bytes).ok()?;
874
875        let mut magic = [0u8; 8];
876        postings_reader.read_exact(&mut magic).ok()?;
877        if &magic != INDEX_MAGIC {
878            return None;
879        }
880        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
881            return None;
882        }
883
884        let head_len = read_u32(&mut postings_reader).ok()? as usize;
885        let root_len = read_u32(&mut postings_reader).ok()? as usize;
886        let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
887        let max_file_size = read_u64(&mut postings_reader).ok()?;
888        let file_count = read_u32(&mut postings_reader).ok()? as usize;
889        if file_count > MAX_ENTRIES {
890            return None;
891        }
892        let postings_body_len = postings_len_total.checked_sub(4)?;
893        let lookup_body_len = lookup_len_total.checked_sub(4)?;
894
895        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
896        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
897        if minimum_file_bytes > remaining_postings {
898            return None;
899        }
900
901        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
902            return None;
903        }
904        let mut head_bytes = vec![0u8; head_len];
905        postings_reader.read_exact(&mut head_bytes).ok()?;
906        let git_head = String::from_utf8(head_bytes)
907            .ok()
908            .filter(|head| !head.is_empty());
909
910        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
911            return None;
912        }
913        let mut root_bytes = vec![0u8; root_len];
914        postings_reader.read_exact(&mut root_bytes).ok()?;
915        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
916        let project_root = current_canonical_root.to_path_buf();
917
918        if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
919            return None;
920        }
921        let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
922        postings_reader
923            .read_exact(&mut ignore_fingerprint_bytes)
924            .ok()?;
925        let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
926        let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
927        if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
928            return None;
929        }
930
931        let mut files = Vec::with_capacity(file_count);
932        let mut path_to_id = HashMap::new();
933        let mut unindexed_files = HashSet::new();
934
935        for file_id in 0..file_count {
936            let mut unindexed = [0u8; 1];
937            postings_reader.read_exact(&mut unindexed).ok()?;
938            let path_len = read_u32(&mut postings_reader).ok()? as usize;
939            let size = read_u64(&mut postings_reader).ok()?;
940            let secs = read_u64(&mut postings_reader).ok()?;
941            let nanos = read_u32(&mut postings_reader).ok()?;
942            let mut hash_bytes = [0u8; 32];
943            postings_reader.read_exact(&mut hash_bytes).ok()?;
944            let content_hash = blake3::Hash::from_bytes(hash_bytes);
945            if nanos >= 1_000_000_000 {
946                return None;
947            }
948            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
949                return None;
950            }
951            let mut path_bytes = vec![0u8; path_len];
952            postings_reader.read_exact(&mut path_bytes).ok()?;
953            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
954            let full_path = cached_path_under_root(&project_root, &relative_path)?;
955            let file_id_u32 = u32::try_from(file_id).ok()?;
956
957            files.push(FileEntry {
958                path: full_path.clone(),
959                size,
960                modified: UNIX_EPOCH + Duration::new(secs, nanos),
961                content_hash,
962            });
963            path_to_id.insert(full_path, file_id_u32);
964            if unindexed[0] == 1 {
965                unindexed_files.insert(file_id_u32);
966            }
967        }
968
969        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
970        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
971        if postings_len > max_postings_bytes {
972            return None;
973        }
974        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
975            return None;
976        }
977        let mut postings_blob = vec![0u8; postings_len];
978        postings_reader.read_exact(&mut postings_blob).ok()?;
979
980        let mut lookup_magic = [0u8; 8];
981        lookup_reader.read_exact(&mut lookup_magic).ok()?;
982        if &lookup_magic != LOOKUP_MAGIC {
983            return None;
984        }
985        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
986            return None;
987        }
988        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
989        if entry_count > MAX_ENTRIES {
990            return None;
991        }
992        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
993        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
994        if minimum_lookup_bytes > remaining_lookup {
995            return None;
996        }
997
998        let mut postings = HashMap::new();
999        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1000
1001        for _ in 0..entry_count {
1002            let trigram = read_u32(&mut lookup_reader).ok()?;
1003            let offset = read_u64(&mut lookup_reader).ok()? as usize;
1004            let count = read_u32(&mut lookup_reader).ok()? as usize;
1005            if count > MAX_ENTRIES {
1006                return None;
1007            }
1008            let bytes_len = count.checked_mul(POSTING_BYTES)?;
1009            let end = offset.checked_add(bytes_len)?;
1010            if end > postings_blob.len() {
1011                return None;
1012            }
1013
1014            let mut trigram_postings = Vec::with_capacity(count);
1015            for chunk in postings_blob[offset..end].chunks_exact(6) {
1016                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1017                let posting = Posting {
1018                    file_id,
1019                    next_mask: chunk[4],
1020                    loc_mask: chunk[5],
1021                };
1022                trigram_postings.push(posting.clone());
1023                file_trigrams.entry(file_id).or_default().push(trigram);
1024            }
1025            postings.insert(trigram, trigram_postings);
1026        }
1027
1028        Some(SearchIndex {
1029            postings,
1030            files,
1031            path_to_id,
1032            ready: false,
1033            project_root,
1034            git_head,
1035            max_file_size,
1036            ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1037            file_trigrams,
1038            unindexed_files,
1039        })
1040    }
1041
1042    pub fn stored_git_head(&self) -> Option<&str> {
1043        self.git_head.as_deref()
1044    }
1045
1046    pub(crate) fn set_ready(&mut self, ready: bool) {
1047        self.ready = ready;
1048    }
1049
1050    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1051        self.git_head = current_head;
1052        verify_file_mtimes(self);
1053        self.ready = true;
1054    }
1055
1056    #[cfg(debug_assertions)]
1057    #[doc(hidden)]
1058    pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1059        self.verify_against_disk(current_head);
1060    }
1061
1062    pub(crate) fn rebuild_or_refresh(
1063        root: &Path,
1064        max_file_size: u64,
1065        current_head: Option<String>,
1066        baseline: Option<SearchIndex>,
1067    ) -> Self {
1068        if let Some(mut baseline) = baseline {
1069            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1070            baseline.max_file_size = max_file_size;
1071            let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1072            if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1073                return SearchIndex::build_with_limit(root, max_file_size);
1074            }
1075            baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1076
1077            if baseline.git_head == current_head || current_head.is_none() {
1078                // HEAD matches, but files may have changed on disk since the index was
1079                // last written (e.g., uncommitted edits, stash pop, manual file changes
1080                // while OpenCode was closed). Verify mtimes and re-index stale files.
1081                // Non-git projects also use this per-file (path, mtime, size)
1082                // fingerprint so unchanged trees reuse the disk cache instead of
1083                // rebuilding every configure.
1084                baseline.git_head = current_head;
1085                verify_file_mtimes(&mut baseline);
1086                baseline.ready = true;
1087                return baseline;
1088            }
1089
1090            if let (Some(previous), Some(current)) =
1091                (baseline.git_head.clone(), current_head.clone())
1092            {
1093                let project_root = baseline.project_root.clone();
1094                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1095                    baseline.git_head = Some(current);
1096                    verify_file_mtimes(&mut baseline);
1097                    baseline.ready = true;
1098                    return baseline;
1099                }
1100            }
1101        }
1102
1103        SearchIndex::build_with_limit(root, max_file_size)
1104    }
1105
1106    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1107        let file_id = u32::try_from(self.files.len()).ok()?;
1108        let metadata = fs::metadata(path).ok();
1109        let size = metadata
1110            .as_ref()
1111            .map_or(size_hint, |metadata| metadata.len());
1112        let modified = metadata
1113            .and_then(|metadata| metadata.modified().ok())
1114            .unwrap_or(UNIX_EPOCH);
1115
1116        self.files.push(FileEntry {
1117            path: path.to_path_buf(),
1118            size,
1119            modified,
1120            content_hash: cache_freshness::zero_hash(),
1121        });
1122        self.path_to_id.insert(path.to_path_buf(), file_id);
1123        Some(file_id)
1124    }
1125
1126    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1127        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1128            return;
1129        };
1130        self.unindexed_files.insert(file_id);
1131        self.file_trigrams.insert(file_id, Vec::new());
1132    }
1133
1134    fn active_file_ids(&self) -> Vec<u32> {
1135        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1136        ids.sort_unstable();
1137        ids
1138    }
1139
1140    fn is_active_file(&self, file_id: u32) -> bool {
1141        self.files
1142            .get(file_id as usize)
1143            .map(|file| !file.path.as_os_str().is_empty())
1144            .unwrap_or(false)
1145    }
1146
1147    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1148        let Some(postings) = self.postings.get(&trigram) else {
1149            return Vec::new();
1150        };
1151
1152        let mut matches = Vec::with_capacity(postings.len());
1153
1154        for posting in postings {
1155            if let Some(filter) = filter {
1156                // next_mask: bloom filter check — the character following this trigram in the
1157                // query must also appear after this trigram somewhere in the file.
1158                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1159                    continue;
1160                }
1161                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1162                // between consecutive trigram pairs, but is NOT used as a single-trigram
1163                // filter because the position in the query string has no relationship to
1164                // the position in the file. Using it here causes false negatives.
1165            }
1166            if self.is_active_file(posting.file_id) {
1167                matches.push(posting.file_id);
1168            }
1169        }
1170
1171        matches
1172    }
1173}
1174
1175fn search_candidate_file(
1176    file: &FileEntry,
1177    matcher: &SearchMatcher,
1178    max_results: usize,
1179    stop_after: usize,
1180    total_matches: &AtomicUsize,
1181    files_searched: &AtomicUsize,
1182    files_with_matches: &AtomicUsize,
1183    truncated: &AtomicBool,
1184    engine_capped: &AtomicBool,
1185) -> Vec<SharedGrepMatch> {
1186    if should_stop_search(truncated, total_matches, stop_after) {
1187        engine_capped.store(true, Ordering::Relaxed);
1188        return Vec::new();
1189    }
1190
1191    let content = match read_indexed_file_bytes(&file.path) {
1192        Some(content) => content,
1193        None => return Vec::new(),
1194    };
1195    // Defense in depth: even though indexing tries to filter binaries via
1196    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1197    // query time. content_inspector is fast (~bytes-per-cycle on a small
1198    // preview) and this guarantees we never surface matches inside binary
1199    // files even if the indexer somehow let one through (e.g. file changed
1200    // between indexing and query).
1201    if is_binary_bytes(&content) {
1202        return Vec::new();
1203    }
1204    files_searched.fetch_add(1, Ordering::Relaxed);
1205
1206    let shared_path = Arc::new(file.path.clone());
1207    let mut matches = Vec::new();
1208    let mut line_starts = None;
1209    let mut seen_lines = HashSet::new();
1210    let mut matched_this_file = false;
1211
1212    match matcher {
1213        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1214            let needle = &literal.needle;
1215            let finder = memchr::memmem::Finder::new(needle);
1216            let mut start = 0;
1217
1218            while let Some(position) = finder.find(&content[start..]) {
1219                if should_stop_search(truncated, total_matches, stop_after) {
1220                    engine_capped.store(true, Ordering::Relaxed);
1221                    break;
1222                }
1223
1224                let offset = start + position;
1225                start = offset + 1;
1226
1227                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1228                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1229                if !seen_lines.insert(line) {
1230                    continue;
1231                }
1232
1233                matched_this_file = true;
1234                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1235                if match_number > max_results {
1236                    truncated.store(true, Ordering::Relaxed);
1237                    break;
1238                }
1239
1240                let end = offset + needle.len();
1241                matches.push(SharedGrepMatch {
1242                    file: shared_path.clone(),
1243                    line,
1244                    column,
1245                    line_text,
1246                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1247                });
1248            }
1249        }
1250        SearchMatcher::Literal(literal) => {
1251            let needle = &literal.needle;
1252            let search_content = content.to_ascii_lowercase();
1253            let finder = memchr::memmem::Finder::new(needle);
1254            let mut start = 0;
1255
1256            while let Some(position) = finder.find(&search_content[start..]) {
1257                if should_stop_search(truncated, total_matches, stop_after) {
1258                    engine_capped.store(true, Ordering::Relaxed);
1259                    break;
1260                }
1261
1262                let offset = start + position;
1263                start = offset + 1;
1264
1265                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1266                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1267                if !seen_lines.insert(line) {
1268                    continue;
1269                }
1270
1271                matched_this_file = true;
1272                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1273                if match_number > max_results {
1274                    truncated.store(true, Ordering::Relaxed);
1275                    break;
1276                }
1277
1278                let end = offset + needle.len();
1279                matches.push(SharedGrepMatch {
1280                    file: shared_path.clone(),
1281                    line,
1282                    column,
1283                    line_text,
1284                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1285                });
1286            }
1287        }
1288        SearchMatcher::Regex(regex) => {
1289            for matched in regex.find_iter(&content) {
1290                if should_stop_search(truncated, total_matches, stop_after) {
1291                    engine_capped.store(true, Ordering::Relaxed);
1292                    break;
1293                }
1294
1295                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1296                let (line, column, line_text) =
1297                    line_details_bytes(&content, line_starts, matched.start());
1298                if !seen_lines.insert(line) {
1299                    continue;
1300                }
1301
1302                matched_this_file = true;
1303                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1304                if match_number > max_results {
1305                    truncated.store(true, Ordering::Relaxed);
1306                    break;
1307                }
1308
1309                matches.push(SharedGrepMatch {
1310                    file: shared_path.clone(),
1311                    line,
1312                    column,
1313                    line_text,
1314                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1315                });
1316            }
1317        }
1318    }
1319
1320    if matched_this_file {
1321        files_with_matches.fetch_add(1, Ordering::Relaxed);
1322    }
1323
1324    matches
1325}
1326
1327fn should_stop_search(
1328    truncated: &AtomicBool,
1329    total_matches: &AtomicUsize,
1330    stop_after: usize,
1331) -> bool {
1332    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1333}
1334
1335fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1336    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1337    let mut left_index = 0;
1338    let mut right_index = 0;
1339
1340    while left_index < left.len() && right_index < right.len() {
1341        match left[left_index].cmp(&right[right_index]) {
1342            std::cmp::Ordering::Less => left_index += 1,
1343            std::cmp::Ordering::Greater => right_index += 1,
1344            std::cmp::Ordering::Equal => {
1345                merged.push(left[left_index]);
1346                left_index += 1;
1347                right_index += 1;
1348            }
1349        }
1350    }
1351
1352    merged
1353}
1354
1355fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1356    let mut merged = Vec::with_capacity(left.len() + right.len());
1357    let mut left_index = 0;
1358    let mut right_index = 0;
1359
1360    while left_index < left.len() && right_index < right.len() {
1361        match left[left_index].cmp(&right[right_index]) {
1362            std::cmp::Ordering::Less => {
1363                merged.push(left[left_index]);
1364                left_index += 1;
1365            }
1366            std::cmp::Ordering::Greater => {
1367                merged.push(right[right_index]);
1368                right_index += 1;
1369            }
1370            std::cmp::Ordering::Equal => {
1371                merged.push(left[left_index]);
1372                left_index += 1;
1373                right_index += 1;
1374            }
1375        }
1376    }
1377
1378    merged.extend_from_slice(&left[left_index..]);
1379    merged.extend_from_slice(&right[right_index..]);
1380    merged
1381}
1382
1383pub fn decompose_regex(pattern: &str) -> RegexQuery {
1384    let hir = match regex_syntax::parse(pattern) {
1385        Ok(hir) => hir,
1386        Err(_) => return RegexQuery::default(),
1387    };
1388
1389    let build = build_query(&hir);
1390    build.into_query()
1391}
1392
1393pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1394    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1395}
1396
1397pub fn normalize_char(c: u8) -> u8 {
1398    c.to_ascii_lowercase()
1399}
1400
1401pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1402    if content.len() < 3 {
1403        return Vec::new();
1404    }
1405
1406    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1407    for start in 0..=content.len() - 3 {
1408        let trigram = pack_trigram(
1409            normalize_char(content[start]),
1410            normalize_char(content[start + 1]),
1411            normalize_char(content[start + 2]),
1412        );
1413        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1414        trigrams.push((trigram, next_char, start));
1415    }
1416    trigrams
1417}
1418
1419pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1420    let mut seen = HashSet::new();
1421    let mut out = Vec::new();
1422    for token in tokens {
1423        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1424            if seen.insert(trigram) {
1425                out.push(trigram);
1426            }
1427        }
1428    }
1429    out
1430}
1431
1432pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1433    if query_trigrams.is_empty() {
1434        return 0.0;
1435    }
1436
1437    let mut hits = 0u32;
1438    for &trigram in query_trigrams {
1439        if let Some(postings) = index.postings.get(&trigram) {
1440            if postings
1441                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1442                .is_ok()
1443            {
1444                hits += 1;
1445            }
1446        }
1447    }
1448
1449    if hits == 0 {
1450        return 0.0;
1451    }
1452
1453    let file_trigram_count = index
1454        .file_trigrams
1455        .get(&file_id)
1456        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1457    (hits as f32) / (1.0 + file_trigram_count.ln())
1458}
1459
1460pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1461    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1462    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1463        return PathBuf::from(override_dir)
1464            .join("index")
1465            .join(project_cache_key(project_root));
1466    }
1467    // Use configured storage dir (from plugin, XDG-compliant)
1468    if let Some(dir) = storage_dir {
1469        return dir.join("index").join(project_cache_key(project_root));
1470    }
1471    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1472    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1473    // If neither is set, fall back to a temp directory rather than `"."`
1474    // because the search-index code reads/writes absolute paths.
1475    let home = std::env::var_os("HOME")
1476        .or_else(|| std::env::var_os("USERPROFILE"))
1477        .map(PathBuf::from)
1478        .unwrap_or_else(std::env::temp_dir);
1479    home.join(".cache")
1480        .join("aft")
1481        .join("index")
1482        .join(project_cache_key(project_root))
1483}
1484
1485pub(crate) fn build_path_filters(
1486    include: &[String],
1487    exclude: &[String],
1488) -> Result<PathFilters, String> {
1489    Ok(PathFilters {
1490        includes: build_globset(include)?,
1491        excludes: build_globset(exclude)?,
1492    })
1493}
1494
1495pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1496    walk_project_files_from(root, root, filters)
1497}
1498
1499pub fn walk_project_files_bounded_default(
1500    root: &Path,
1501    max_files: usize,
1502) -> Result<Vec<PathBuf>, usize> {
1503    walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files))
1504}
1505
1506pub(crate) fn walk_project_files_bounded_matching<F>(
1507    root: &Path,
1508    filters: &PathFilters,
1509    max_files: usize,
1510    matches_file: F,
1511) -> Result<Vec<PathBuf>, usize>
1512where
1513    F: Fn(&Path) -> bool,
1514{
1515    walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file)
1516}
1517
1518pub fn walk_project_files_bounded_default_matching<F>(
1519    root: &Path,
1520    max_files: usize,
1521    matches_file: F,
1522) -> Result<Vec<PathBuf>, usize>
1523where
1524    F: Fn(&Path) -> bool,
1525{
1526    walk_project_files_from_inner_matching(
1527        root,
1528        root,
1529        &PathFilters::default(),
1530        Some(max_files),
1531        matches_file,
1532    )
1533}
1534
1535pub(crate) fn walk_project_files_from(
1536    filter_root: &Path,
1537    search_root: &Path,
1538    filters: &PathFilters,
1539) -> Vec<PathBuf> {
1540    walk_project_files_from_inner(filter_root, search_root, filters, None)
1541        .expect("unbounded project walk cannot exceed a file limit")
1542}
1543
1544pub(crate) fn has_any_project_file_from(
1545    filter_root: &Path,
1546    search_root: &Path,
1547    filters: &PathFilters,
1548) -> bool {
1549    walk_project_files_from_inner(filter_root, search_root, filters, Some(0)).is_err()
1550}
1551
1552fn walk_project_files_from_inner(
1553    filter_root: &Path,
1554    search_root: &Path,
1555    filters: &PathFilters,
1556    max_files: Option<usize>,
1557) -> Result<Vec<PathBuf>, usize> {
1558    walk_project_files_from_inner_matching(filter_root, search_root, filters, max_files, |_| true)
1559}
1560
1561fn walk_project_files_from_inner_matching<F>(
1562    filter_root: &Path,
1563    search_root: &Path,
1564    filters: &PathFilters,
1565    max_files: Option<usize>,
1566    matches_file: F,
1567) -> Result<Vec<PathBuf>, usize>
1568where
1569    F: Fn(&Path) -> bool,
1570{
1571    let mut builder = WalkBuilder::new(search_root);
1572    builder
1573        .hidden(false)
1574        .git_ignore(true)
1575        .git_global(true)
1576        .git_exclude(true)
1577        // `.aftignore` — AFT-specific ignores layered on top of .gitignore.
1578        // Honored hierarchically like .gitignore (works in non-git projects too),
1579        // so users can exclude paths git can't (e.g. submodules) from indexing.
1580        .add_custom_ignore_filename(".aftignore")
1581        .filter_entry(|entry| {
1582            let name = entry.file_name().to_string_lossy();
1583            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1584                return !matches!(
1585                    name.as_ref(),
1586                    "node_modules"
1587                        | "target"
1588                        | "venv"
1589                        | ".venv"
1590                        | ".git"
1591                        | "__pycache__"
1592                        | ".tox"
1593                        | "dist"
1594                        | "build"
1595                );
1596            }
1597            true
1598        });
1599
1600    let mut files = Vec::new();
1601    for entry in builder.build().filter_map(|entry| entry.ok()) {
1602        if !entry
1603            .file_type()
1604            .map_or(false, |file_type| file_type.is_file())
1605        {
1606            continue;
1607        }
1608        let path = entry.into_path();
1609        if filters.matches(filter_root, &path) && matches_file(&path) {
1610            files.push(path);
1611            if max_files.is_some_and(|limit| files.len() > limit) {
1612                return Err(files.len());
1613            }
1614        }
1615    }
1616
1617    sort_paths_by_mtime_desc(&mut files);
1618    Ok(files)
1619}
1620
1621pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1622    let bytes = fs::read(path).ok()?;
1623    if is_binary_bytes(&bytes) {
1624        return None;
1625    }
1626    String::from_utf8(bytes).ok()
1627}
1628
1629fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1630    fs::read(path).ok()
1631}
1632
1633pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1634    path.strip_prefix(root)
1635        .map(PathBuf::from)
1636        .unwrap_or_else(|_| path.to_path_buf())
1637}
1638
1639pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1640    let normalized_root = normalize_path(root);
1641    let normalized_path = normalize_path(path);
1642    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1643    validate_cached_relative_path(relative)
1644}
1645
1646pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1647    let relative = validate_cached_relative_path(relative_path)?;
1648    let normalized_root = normalize_path(root);
1649    let full_path = normalize_path(&normalized_root.join(relative));
1650
1651    match fs::canonicalize(&full_path) {
1652        Ok(canonical_path) => {
1653            if canonical_path.starts_with(&normalized_root) {
1654                return Some(full_path);
1655            }
1656
1657            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1658            canonical_path
1659                .starts_with(&canonical_root)
1660                .then_some(full_path)
1661        }
1662        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1663    }
1664}
1665
1666pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1667    if path.is_absolute() {
1668        return None;
1669    }
1670
1671    let mut normalized = PathBuf::new();
1672    for component in path.components() {
1673        match component {
1674            Component::Normal(part) => normalized.push(part),
1675            Component::CurDir => {}
1676            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1677        }
1678    }
1679    (!normalized.as_os_str().is_empty()).then_some(normalized)
1680}
1681
1682/// Sort paths newest-first by mtime, falling back to lexicographic order.
1683///
1684/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1685/// `sort_by()` closure. That made the comparator non-deterministic — a
1686/// `stat()` syscall for the same path can return different values across
1687/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1688/// concurrent file-watcher activity), and Rust's slice::sort panics at
1689/// runtime when it detects a non-total-order comparator. CI hit this on
1690/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1691///
1692/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1693/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1694pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1695    use std::collections::HashMap;
1696    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1697    for path in paths.iter() {
1698        mtimes
1699            .entry(path.clone())
1700            .or_insert_with(|| path_modified_time(path));
1701    }
1702    paths.sort_by(|left, right| {
1703        let left_mtime = mtimes.get(left).and_then(|v| *v);
1704        let right_mtime = mtimes.get(right).and_then(|v| *v);
1705        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1706    });
1707}
1708
1709/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1710/// the sort. Same fix, applied to grep matches that share files.
1711pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1712    use std::collections::HashMap;
1713    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1714    for m in matches.iter() {
1715        mtimes.entry(m.file.clone()).or_insert_with(|| {
1716            let resolved = resolve_match_path(project_root, &m.file);
1717            path_modified_time(&resolved)
1718        });
1719    }
1720    matches.sort_by(|left, right| {
1721        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1722        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1723        right_mtime
1724            .cmp(&left_mtime)
1725            .then_with(|| left.file.cmp(&right.file))
1726            .then_with(|| left.line.cmp(&right.line))
1727            .then_with(|| left.column.cmp(&right.column))
1728    });
1729}
1730
1731/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1732/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1733/// table from the search index), but it can still return different values if
1734/// the file is modified mid-sort. Snapshot once.
1735fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1736    matches: &mut [SharedGrepMatch],
1737    modified_for_path: F,
1738) where
1739    F: Fn(&Path) -> Option<SystemTime>,
1740{
1741    use std::collections::HashMap;
1742    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1743    for m in matches.iter() {
1744        let path = m.file.as_path().to_path_buf();
1745        mtimes
1746            .entry(path.clone())
1747            .or_insert_with(|| modified_for_path(&path));
1748    }
1749    matches.sort_by(|left, right| {
1750        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1751        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1752        right_mtime
1753            .cmp(&left_mtime)
1754            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1755            .then_with(|| left.line.cmp(&right.line))
1756            .then_with(|| left.column.cmp(&right.column))
1757    });
1758}
1759
1760pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1761    let resolved_project_root = canonicalize_or_normalize(project_root);
1762    let root = match path {
1763        Some(path) => {
1764            let path = PathBuf::from(path);
1765            if path.is_absolute() {
1766                canonicalize_or_normalize(&path)
1767            } else {
1768                normalize_path(&resolved_project_root.join(path))
1769            }
1770        }
1771        None => resolved_project_root.clone(),
1772    };
1773
1774    let use_index = is_within_search_root(&resolved_project_root, &root);
1775    SearchScope { root, use_index }
1776}
1777
1778pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1779    content_inspector::inspect(content).is_binary()
1780}
1781
1782pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1783    run_git(root, &["rev-parse", "HEAD"])
1784}
1785
1786pub fn project_cache_key(project_root: &Path) -> String {
1787    use sha2::{Digest, Sha256};
1788
1789    let mut hasher = Sha256::new();
1790
1791    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1792        // Git repo: root commit is the unique identity.
1793        // Same repo cloned anywhere produces the same key.
1794        hasher.update(root_commit.as_bytes());
1795    } else {
1796        // Non-git project: use the canonical filesystem path as identity.
1797        let canonical_root = canonicalize_or_normalize(project_root);
1798        hasher.update(canonical_root.to_string_lossy().as_bytes());
1799    }
1800
1801    let digest = format!("{:x}", hasher.finalize());
1802    digest[..16].to_string()
1803}
1804
1805/// Fingerprint corpus-shaping ignore rules that are not represented by git HEAD.
1806///
1807/// The search cache stores this value next to the file mtimes. If `.gitignore`,
1808/// `.aftignore`, or `.git/info/exclude` changes while AFT is not running, a
1809/// matching HEAD + matching file mtimes is not enough to safely reuse the old
1810/// cache: files that are now ignored may still be indexed. Hashing the ignore
1811/// files themselves makes cold-start cache reuse agree with the current walker.
1812pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
1813    use sha2::{Digest, Sha256};
1814
1815    let root = canonicalize_or_normalize(project_root);
1816    let mut files = Vec::new();
1817    collect_ignore_rule_files(&root, &mut files);
1818    if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
1819        if global_ignore.is_file() {
1820            files.push(global_ignore);
1821        }
1822    }
1823    let info_exclude = git_info_exclude_path(&root);
1824    if info_exclude.is_file() {
1825        files.push(info_exclude);
1826    }
1827    files.sort();
1828    files.dedup();
1829
1830    let mut hasher = Sha256::new();
1831    hasher.update(b"aft-ignore-rules-v1\0");
1832    for path in files {
1833        if let Some(relative) = cache_relative_path(&root, &path) {
1834            hasher.update(relative.to_string_lossy().as_bytes());
1835        } else {
1836            hasher.update(path.to_string_lossy().as_bytes());
1837        }
1838        hasher.update(b"\0");
1839        match fs::read(&path) {
1840            Ok(bytes) => hasher.update(&bytes),
1841            Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
1842        }
1843        hasher.update(b"\0");
1844    }
1845
1846    format!("{:x}", hasher.finalize())
1847}
1848
1849fn git_info_exclude_path(root: &Path) -> PathBuf {
1850    run_git(
1851        root,
1852        &["rev-parse", "--path-format=absolute", "--git-common-dir"],
1853    )
1854    .map(PathBuf::from)
1855    .unwrap_or_else(|| root.join(".git"))
1856    .join("info")
1857    .join("exclude")
1858}
1859
1860fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
1861    let mut stack = vec![root.to_path_buf()];
1862    while let Some(dir) = stack.pop() {
1863        let Ok(entries) = fs::read_dir(&dir) else {
1864            continue;
1865        };
1866        for entry in entries.flatten() {
1867            let path = entry.path();
1868            let file_name = entry.file_name();
1869            if file_name == ".gitignore" || file_name == ".aftignore" {
1870                if path.is_file() {
1871                    files.push(path);
1872                }
1873                continue;
1874            }
1875
1876            let Ok(file_type) = entry.file_type() else {
1877                continue;
1878            };
1879            if !file_type.is_dir() || file_type.is_symlink() {
1880                continue;
1881            }
1882            if ignore_rule_fingerprint_skips_dir(&file_name) {
1883                continue;
1884            }
1885            stack.push(path);
1886        }
1887    }
1888}
1889
1890fn ignore_rule_fingerprint_skips_dir(name: &std::ffi::OsStr) -> bool {
1891    matches!(
1892        name.to_str().unwrap_or(""),
1893        ".git"
1894            | "node_modules"
1895            | "target"
1896            | "venv"
1897            | ".venv"
1898            | "__pycache__"
1899            | ".tox"
1900            | "dist"
1901            | "build"
1902    )
1903}
1904
1905impl PathFilters {
1906    fn matches(&self, root: &Path, path: &Path) -> bool {
1907        let relative = to_glob_path(&relative_to_root(root, path));
1908        if self
1909            .includes
1910            .as_ref()
1911            .is_some_and(|includes| !includes.is_match(&relative))
1912        {
1913            return false;
1914        }
1915        if self
1916            .excludes
1917            .as_ref()
1918            .is_some_and(|excludes| excludes.is_match(&relative))
1919        {
1920            return false;
1921        }
1922        true
1923    }
1924}
1925
1926fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1927    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1928}
1929
1930fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1931    if path.is_absolute() {
1932        path.to_path_buf()
1933    } else {
1934        project_root.join(path)
1935    }
1936}
1937
1938fn path_modified_time(path: &Path) -> Option<SystemTime> {
1939    fs::metadata(path)
1940        .and_then(|metadata| metadata.modified())
1941        .ok()
1942}
1943
1944fn normalize_path(path: &Path) -> PathBuf {
1945    let mut result = PathBuf::new();
1946    for component in path.components() {
1947        match component {
1948            Component::ParentDir => {
1949                if !result.pop() {
1950                    result.push(component);
1951                }
1952            }
1953            Component::CurDir => {}
1954            _ => result.push(component),
1955        }
1956    }
1957    result
1958}
1959
1960fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
1961    if let Ok(canonical) = fs::canonicalize(path) {
1962        return canonical;
1963    }
1964
1965    let Some(parent) = path.parent() else {
1966        return path.to_path_buf();
1967    };
1968    let Some(file_name) = path.file_name() else {
1969        return path.to_path_buf();
1970    };
1971
1972    fs::canonicalize(parent)
1973        .map(|canonical_parent| canonical_parent.join(file_name))
1974        .unwrap_or_else(|_| path.to_path_buf())
1975}
1976
1977/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1978/// since the index was last written. Also detect new files and deleted files.
1979fn verify_file_mtimes(index: &mut SearchIndex) {
1980    let filters = PathFilters::default();
1981    let current_files = walk_project_files(&index.project_root, &filters);
1982    let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
1983    let mut stale_paths = Vec::new();
1984    let mut removed_paths = Vec::new();
1985
1986    for entry in &mut index.files {
1987        if entry.path.as_os_str().is_empty() {
1988            continue; // tombstoned entry
1989        }
1990        if !current_file_set.contains(&entry.path) {
1991            removed_paths.push(entry.path.clone());
1992            continue;
1993        }
1994        let cached = FileFreshness {
1995            mtime: entry.modified,
1996            size: entry.size,
1997            content_hash: entry.content_hash,
1998        };
1999        match cache_freshness::verify_file_strict(&entry.path, &cached) {
2000            FreshnessVerdict::HotFresh => {}
2001            FreshnessVerdict::ContentFresh {
2002                new_mtime,
2003                new_size,
2004            } => {
2005                entry.modified = new_mtime;
2006                entry.size = new_size;
2007            }
2008            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
2009                stale_paths.push(entry.path.clone())
2010            }
2011        }
2012    }
2013
2014    for path in &removed_paths {
2015        index.remove_file(path);
2016    }
2017
2018    // Re-index stale files that are still in the current walk set. If an ignore
2019    // rule changed while AFT was down but the fingerprint missed it, this keeps
2020    // warm-cache verification from resurrecting now-ignored cached entries.
2021    for path in &stale_paths {
2022        if current_file_set.contains(path) {
2023            index.update_file(path);
2024        } else {
2025            index.remove_file(path);
2026        }
2027    }
2028
2029    // Detect new files not in the index
2030    for path in current_files {
2031        if !index.path_to_id.contains_key(&path) {
2032            index.update_file(&path);
2033        }
2034    }
2035
2036    if !stale_paths.is_empty() {
2037        crate::slog_info!(
2038            "search index: refreshed {} stale file(s) from disk cache",
2039            stale_paths.len()
2040        );
2041    }
2042}
2043
2044fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2045    normalize_path(path).starts_with(normalize_path(search_root))
2046}
2047
2048impl QueryBuild {
2049    fn into_query(self) -> RegexQuery {
2050        let mut query = RegexQuery::default();
2051
2052        for run in self.and_runs {
2053            add_run_to_and_query(&mut query, &run);
2054        }
2055
2056        for group in self.or_groups {
2057            let mut trigrams = BTreeSet::new();
2058            let mut filters = HashMap::new();
2059            for run in group {
2060                for (trigram, filter) in trigram_filters(&run) {
2061                    trigrams.insert(trigram);
2062                    merge_filter(filters.entry(trigram).or_default(), filter);
2063                }
2064            }
2065            if !trigrams.is_empty() {
2066                query.or_groups.push(trigrams.into_iter().collect());
2067                query.or_filters.push(filters);
2068            }
2069        }
2070
2071        query
2072    }
2073}
2074
2075fn build_query(hir: &Hir) -> QueryBuild {
2076    match hir.kind() {
2077        HirKind::Literal(literal) => {
2078            if literal.0.len() >= 3 {
2079                QueryBuild {
2080                    and_runs: vec![literal.0.to_vec()],
2081                    or_groups: Vec::new(),
2082                }
2083            } else {
2084                QueryBuild::default()
2085            }
2086        }
2087        HirKind::Capture(capture) => build_query(&capture.sub),
2088        HirKind::Concat(parts) => {
2089            let mut build = QueryBuild::default();
2090            for part in parts {
2091                let part_build = build_query(part);
2092                build.and_runs.extend(part_build.and_runs);
2093                build.or_groups.extend(part_build.or_groups);
2094            }
2095            build
2096        }
2097        HirKind::Alternation(parts) => {
2098            let mut group = Vec::new();
2099            for part in parts {
2100                let Some(mut choices) = guaranteed_run_choices(part) else {
2101                    return QueryBuild::default();
2102                };
2103                group.append(&mut choices);
2104            }
2105            if group.is_empty() {
2106                QueryBuild::default()
2107            } else {
2108                QueryBuild {
2109                    and_runs: Vec::new(),
2110                    or_groups: vec![group],
2111                }
2112            }
2113        }
2114        HirKind::Repetition(repetition) => {
2115            if repetition.min == 0 {
2116                QueryBuild::default()
2117            } else {
2118                build_query(&repetition.sub)
2119            }
2120        }
2121        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2122    }
2123}
2124
2125fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2126    match hir.kind() {
2127        HirKind::Literal(literal) => {
2128            if literal.0.len() >= 3 {
2129                Some(vec![literal.0.to_vec()])
2130            } else {
2131                None
2132            }
2133        }
2134        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2135        HirKind::Concat(parts) => {
2136            let mut runs = Vec::new();
2137            for part in parts {
2138                if let Some(mut part_runs) = guaranteed_run_choices(part) {
2139                    runs.append(&mut part_runs);
2140                }
2141            }
2142            if runs.is_empty() {
2143                None
2144            } else {
2145                Some(runs)
2146            }
2147        }
2148        HirKind::Alternation(parts) => {
2149            let mut runs = Vec::new();
2150            for part in parts {
2151                let Some(mut part_runs) = guaranteed_run_choices(part) else {
2152                    return None;
2153                };
2154                runs.append(&mut part_runs);
2155            }
2156            if runs.is_empty() {
2157                None
2158            } else {
2159                Some(runs)
2160            }
2161        }
2162        HirKind::Repetition(repetition) => {
2163            if repetition.min == 0 {
2164                None
2165            } else {
2166                guaranteed_run_choices(&repetition.sub)
2167            }
2168        }
2169        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2170    }
2171}
2172
2173fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2174    for (trigram, filter) in trigram_filters(run) {
2175        if !query.and_trigrams.contains(&trigram) {
2176            query.and_trigrams.push(trigram);
2177        }
2178        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2179    }
2180}
2181
2182fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2183    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2184    for (trigram, next_char, position) in extract_trigrams(run) {
2185        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
2186        if next_char != EOF_SENTINEL {
2187            entry.next_mask |= mask_for_next_char(next_char);
2188        }
2189        entry.loc_mask |= mask_for_position(position);
2190    }
2191    filters.into_iter().collect()
2192}
2193
2194fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2195    target.next_mask |= filter.next_mask;
2196    target.loc_mask |= filter.loc_mask;
2197}
2198
2199fn mask_for_next_char(next_char: u8) -> u8 {
2200    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2201    1u8 << bit
2202}
2203
2204fn mask_for_position(position: usize) -> u8 {
2205    1u8 << (position % 8)
2206}
2207
2208fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2209    if patterns.is_empty() {
2210        return Ok(None);
2211    }
2212
2213    let mut builder = GlobSetBuilder::new();
2214    for pattern in patterns {
2215        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2216        builder.add(glob);
2217    }
2218    builder.build().map(Some).map_err(|error| error.to_string())
2219}
2220
2221fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
2222    let mut buffer = [0u8; 4];
2223    reader.read_exact(&mut buffer)?;
2224    Ok(u32::from_le_bytes(buffer))
2225}
2226
2227fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
2228    let mut buffer = [0u8; 8];
2229    reader.read_exact(&mut buffer)?;
2230    Ok(u64::from_le_bytes(buffer))
2231}
2232
2233fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
2234    writer.write_all(&value.to_le_bytes())
2235}
2236
2237fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
2238    writer.write_all(&value.to_le_bytes())
2239}
2240
2241fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2242    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2243        return Err(std::io::Error::other("search index checksum missing"));
2244    };
2245    let expected = u32::from_le_bytes(*stored);
2246    let actual = crc32fast::hash(body);
2247    if actual != expected {
2248        return Err(std::io::Error::other("search index checksum mismatch"));
2249    }
2250    Ok(())
2251}
2252
2253fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2254    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2255    total_len.checked_sub(pos)
2256}
2257
2258fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2259    let output = Command::new("git")
2260        .arg("-C")
2261        .arg(root)
2262        .args(args)
2263        .output()
2264        .ok()?;
2265    if !output.status.success() {
2266        return None;
2267    }
2268    let value = String::from_utf8(output.stdout).ok()?;
2269    let value = value.trim().to_string();
2270    if value.is_empty() {
2271        None
2272    } else {
2273        Some(value)
2274    }
2275}
2276
2277fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2278    let diff_range = format!("{}..{}", from, to);
2279    let output = match Command::new("git")
2280        .arg("-C")
2281        .arg(root)
2282        .args(["diff", "--name-status", "-M", &diff_range])
2283        .output()
2284    {
2285        Ok(output) => output,
2286        Err(_) => return false,
2287    };
2288
2289    if !output.status.success() {
2290        return false;
2291    }
2292
2293    let Ok(diff) = String::from_utf8(output.stdout) else {
2294        return false;
2295    };
2296
2297    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2298        let mut fields = line.split('\t');
2299        let Some(status) = fields.next() else {
2300            continue;
2301        };
2302
2303        if status.starts_with('R') {
2304            let Some(old_path) = fields
2305                .next()
2306                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2307            else {
2308                continue;
2309            };
2310            let Some(new_path) = fields
2311                .next()
2312                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2313            else {
2314                continue;
2315            };
2316            index.remove_file(&old_path);
2317            index.update_file(&new_path);
2318            continue;
2319        }
2320
2321        let Some(path) = fields
2322            .next()
2323            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2324        else {
2325            continue;
2326        };
2327        if status.starts_with('D') || !path.exists() {
2328            index.remove_file(&path);
2329        } else {
2330            index.update_file(&path);
2331        }
2332    }
2333
2334    true
2335}
2336
2337fn is_binary_path(path: &Path, size: u64) -> bool {
2338    if size == 0 {
2339        return false;
2340    }
2341
2342    let mut file = match File::open(path) {
2343        Ok(file) => file,
2344        Err(_) => return true,
2345    };
2346
2347    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2348    match file.read(&mut preview) {
2349        Ok(read) => is_binary_bytes(&preview[..read]),
2350        Err(_) => true,
2351    }
2352}
2353
2354fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2355    let mut starts = vec![0usize];
2356    for (index, byte) in content.iter().copied().enumerate() {
2357        if byte == b'\n' {
2358            starts.push(index + 1);
2359        }
2360    }
2361    starts
2362}
2363
2364fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2365    let line_index = match line_starts.binary_search(&offset) {
2366        Ok(index) => index,
2367        Err(index) => index.saturating_sub(1),
2368    };
2369    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2370    let line_end = content[line_start..]
2371        .iter()
2372        .position(|byte| *byte == b'\n')
2373        .map(|length| line_start + length)
2374        .unwrap_or(content.len());
2375    let mut line_slice = &content[line_start..line_end];
2376    if line_slice.ends_with(b"\r") {
2377        line_slice = &line_slice[..line_slice.len() - 1];
2378    }
2379    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2380    let column = String::from_utf8_lossy(&content[line_start..offset])
2381        .chars()
2382        .count() as u32
2383        + 1;
2384    (line_index as u32 + 1, column, line_text)
2385}
2386
2387fn to_glob_path(path: &Path) -> String {
2388    path.to_string_lossy().replace('\\', "/")
2389}
2390
2391#[cfg(test)]
2392mod tests {
2393    use std::process::Command;
2394
2395    use super::*;
2396
2397    #[test]
2398    fn cached_path_under_root_allows_missing_lexical_child() {
2399        let dir = tempfile::tempdir().expect("create temp dir");
2400        let project = dir.path().join("project");
2401        fs::create_dir_all(&project).expect("create project dir");
2402        let root = fs::canonicalize(&project).expect("canonicalize project");
2403
2404        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2405            .expect("missing child should fall back to lexical validation");
2406
2407        assert_eq!(path, root.join("future/file.rs"));
2408    }
2409
2410    #[cfg(unix)]
2411    #[test]
2412    fn cached_path_under_root_rejects_symlink_escape() {
2413        let dir = tempfile::tempdir().expect("create temp dir");
2414        let project = dir.path().join("project");
2415        let outside = dir.path().join("outside");
2416        fs::create_dir_all(&project).expect("create project dir");
2417        fs::create_dir_all(&outside).expect("create outside dir");
2418        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2419        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2420        let root = fs::canonicalize(&project).expect("canonicalize project");
2421
2422        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2423    }
2424
2425    #[test]
2426    fn extract_trigrams_tracks_next_char_and_position() {
2427        let trigrams = extract_trigrams(b"Rust");
2428        assert_eq!(trigrams.len(), 2);
2429        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2430        assert_eq!(
2431            trigrams[1],
2432            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2433        );
2434    }
2435
2436    #[test]
2437    fn decompose_regex_extracts_literals_and_alternations() {
2438        let query = decompose_regex("abc(def|ghi)xyz");
2439        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2440        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2441        assert_eq!(query.or_groups.len(), 1);
2442        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2443        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2444    }
2445
2446    #[test]
2447    fn candidates_intersect_posting_lists() {
2448        let mut index = SearchIndex::new();
2449        let dir = tempfile::tempdir().expect("create temp dir");
2450        let alpha = dir.path().join("alpha.txt");
2451        let beta = dir.path().join("beta.txt");
2452        fs::write(&alpha, "abcdef").expect("write alpha");
2453        fs::write(&beta, "abcxyz").expect("write beta");
2454        index.project_root = dir.path().to_path_buf();
2455        index.index_file(&alpha, b"abcdef");
2456        index.index_file(&beta, b"abcxyz");
2457
2458        let query = RegexQuery {
2459            and_trigrams: vec![
2460                pack_trigram(b'a', b'b', b'c'),
2461                pack_trigram(b'd', b'e', b'f'),
2462            ],
2463            ..RegexQuery::default()
2464        };
2465
2466        let candidates = index.candidates(&query);
2467        assert_eq!(candidates.len(), 1);
2468        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2469    }
2470
2471    #[test]
2472    fn candidates_apply_bloom_filters() {
2473        let mut index = SearchIndex::new();
2474        let dir = tempfile::tempdir().expect("create temp dir");
2475        let file = dir.path().join("sample.txt");
2476        fs::write(&file, "abcd efgh").expect("write sample");
2477        index.project_root = dir.path().to_path_buf();
2478        index.index_file(&file, b"abcd efgh");
2479
2480        let trigram = pack_trigram(b'a', b'b', b'c');
2481        let matching_filter = PostingFilter {
2482            next_mask: mask_for_next_char(b'd'),
2483            loc_mask: mask_for_position(0),
2484        };
2485        let non_matching_filter = PostingFilter {
2486            next_mask: mask_for_next_char(b'z'),
2487            loc_mask: mask_for_position(0),
2488        };
2489
2490        assert_eq!(
2491            index
2492                .postings_for_trigram(trigram, Some(matching_filter))
2493                .len(),
2494            1
2495        );
2496        assert!(index
2497            .postings_for_trigram(trigram, Some(non_matching_filter))
2498            .is_empty());
2499    }
2500
2501    #[test]
2502    fn disk_round_trip_preserves_postings_and_files() {
2503        let dir = tempfile::tempdir().expect("create temp dir");
2504        let project = dir.path().join("project");
2505        fs::create_dir_all(&project).expect("create project dir");
2506        let file = project.join("src.txt");
2507        fs::write(&file, "abcdef").expect("write source");
2508
2509        let mut index = SearchIndex::build(&project);
2510        index.git_head = Some("deadbeef".to_string());
2511        let cache_dir = dir.path().join("cache");
2512        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2513
2514        let loaded =
2515            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2516        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2517        assert_eq!(loaded.files.len(), 1);
2518        assert_eq!(
2519            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2520            PathBuf::from("src.txt")
2521        );
2522        assert_eq!(loaded.postings.len(), index.postings.len());
2523        assert!(loaded
2524            .postings
2525            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2526    }
2527
2528    #[test]
2529    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2530        let root = PathBuf::from("/tmp/aft-project");
2531
2532        assert_eq!(
2533            cache_relative_path(&root, &root.join("src/lib.rs")),
2534            Some(PathBuf::from("src/lib.rs"))
2535        );
2536        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2537        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2538        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2539        assert_eq!(
2540            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2541            Some(root.join("src/lib.rs"))
2542        );
2543    }
2544
2545    #[test]
2546    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2547        let dir = tempfile::tempdir().expect("create temp dir");
2548        let project = dir.path().join("project");
2549        fs::create_dir_all(&project).expect("create project dir");
2550        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2551        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2552        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2553
2554        Command::new("git")
2555            .arg("init")
2556            .arg(&project)
2557            .status()
2558            .expect("git init");
2559        for args in [
2560            ["config", "user.email", "aft@example.invalid"],
2561            ["config", "user.name", "AFT Test"],
2562        ] {
2563            Command::new("git")
2564                .arg("-C")
2565                .arg(&project)
2566                .args(args)
2567                .status()
2568                .expect("git config");
2569        }
2570        Command::new("git")
2571            .arg("-C")
2572            .arg(&project)
2573            .args(["add", "."])
2574            .status()
2575            .expect("git add initial");
2576        Command::new("git")
2577            .arg("-C")
2578            .arg(&project)
2579            .args(["commit", "-m", "initial"])
2580            .status()
2581            .expect("git commit initial");
2582        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2583        let mut baseline = SearchIndex::build(&project);
2584        baseline.git_head = Some(previous.clone());
2585
2586        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2587        Command::new("git")
2588            .arg("-C")
2589            .arg(&project)
2590            .args(["add", "-A"])
2591            .status()
2592            .expect("git add rename");
2593        Command::new("git")
2594            .arg("-C")
2595            .arg(&project)
2596            .args(["commit", "-m", "rename"])
2597            .status()
2598            .expect("git commit rename");
2599        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2600
2601        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2602        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2603
2604        let refreshed = SearchIndex::rebuild_or_refresh(
2605            &project,
2606            DEFAULT_MAX_FILE_SIZE,
2607            Some(current),
2608            Some(baseline),
2609        );
2610
2611        assert!(!refreshed
2612            .path_to_id
2613            .contains_key(&canonical_project.join("old.txt")));
2614        assert!(refreshed
2615            .path_to_id
2616            .contains_key(&canonical_project.join("new.txt")));
2617        assert!(refreshed
2618            .path_to_id
2619            .contains_key(&canonical_project.join("untracked.txt")));
2620        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2621        assert_eq!(matches.matches.len(), 1);
2622    }
2623
2624    #[test]
2625    fn read_from_disk_rejects_corrupt_postings_checksum() {
2626        let dir = tempfile::tempdir().expect("create temp dir");
2627        let project = dir.path().join("project");
2628        fs::create_dir_all(&project).expect("create project dir");
2629        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2630
2631        let index = SearchIndex::build(&project);
2632        let cache_dir = dir.path().join("cache");
2633        index.write_to_disk(&cache_dir, None);
2634
2635        let cache_path = cache_dir.join("cache.bin");
2636        let mut bytes = fs::read(&cache_path).expect("read cache");
2637        let middle = bytes.len() / 2;
2638        bytes[middle] ^= 0xff;
2639        fs::write(&cache_path, bytes).expect("write corrupted cache");
2640
2641        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2642    }
2643
2644    #[test]
2645    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2646        let dir = tempfile::tempdir().expect("create temp dir");
2647        let project = dir.path().join("project");
2648        fs::create_dir_all(&project).expect("create project dir");
2649        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2650
2651        let index = SearchIndex::build(&project);
2652        let cache_dir = dir.path().join("cache");
2653        index.write_to_disk(&cache_dir, None);
2654
2655        assert!(cache_dir.join("cache.bin").is_file());
2656        assert!(fs::read_dir(&cache_dir)
2657            .expect("read cache dir")
2658            .all(|entry| !entry
2659                .expect("cache entry")
2660                .file_name()
2661                .to_string_lossy()
2662                .contains(".tmp.")));
2663    }
2664
2665    #[test]
2666    fn concurrent_search_index_writes_do_not_corrupt() {
2667        let dir = tempfile::tempdir().expect("create temp dir");
2668        let project = dir.path().join("project");
2669        fs::create_dir_all(&project).expect("create project dir");
2670        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2671        let cache_dir = dir.path().join("cache");
2672
2673        let a_project = project.clone();
2674        let a_cache = cache_dir.clone();
2675        let a = std::thread::spawn(move || {
2676            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2677            let index = SearchIndex::build(&a_project);
2678            index.write_to_disk(&a_cache, None);
2679        });
2680        let b_project = project.clone();
2681        let b_cache = cache_dir.clone();
2682        let b = std::thread::spawn(move || {
2683            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2684            let index = SearchIndex::build(&b_project);
2685            index.write_to_disk(&b_cache, None);
2686        });
2687        a.join().expect("writer a");
2688        b.join().expect("writer b");
2689
2690        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2691    }
2692
2693    #[test]
2694    fn search_index_atomic_rename_survives_partial_write() {
2695        let dir = tempfile::tempdir().expect("create temp dir");
2696        let cache_dir = dir.path().join("cache");
2697        fs::create_dir_all(&cache_dir).expect("create cache dir");
2698        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2699
2700        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2701    }
2702
2703    #[test]
2704    fn project_cache_key_includes_checkout_path() {
2705        let dir = tempfile::tempdir().expect("create temp dir");
2706        let source = dir.path().join("source");
2707        fs::create_dir_all(&source).expect("create source repo dir");
2708        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2709
2710        assert!(Command::new("git")
2711            .current_dir(&source)
2712            .args(["init"])
2713            .status()
2714            .expect("init git repo")
2715            .success());
2716        assert!(Command::new("git")
2717            .current_dir(&source)
2718            .args(["add", "."])
2719            .status()
2720            .expect("git add")
2721            .success());
2722        assert!(Command::new("git")
2723            .current_dir(&source)
2724            .args([
2725                "-c",
2726                "user.name=AFT Tests",
2727                "-c",
2728                "user.email=aft-tests@example.com",
2729                "commit",
2730                "-m",
2731                "initial",
2732            ])
2733            .status()
2734            .expect("git commit")
2735            .success());
2736
2737        let clone = dir.path().join("clone");
2738        assert!(Command::new("git")
2739            .args(["clone", "--quiet"])
2740            .arg(&source)
2741            .arg(&clone)
2742            .status()
2743            .expect("git clone")
2744            .success());
2745
2746        let source_key = project_cache_key(&source);
2747        let clone_key = project_cache_key(&clone);
2748
2749        assert_eq!(source_key.len(), 16);
2750        assert_eq!(clone_key.len(), 16);
2751        // Same repo (same root commit) → same cache key regardless of clone path
2752        assert_eq!(source_key, clone_key);
2753    }
2754
2755    #[test]
2756    fn git_head_unchanged_picks_up_local_edits() {
2757        let dir = tempfile::tempdir().expect("create temp dir");
2758        let project = dir.path().join("repo");
2759        fs::create_dir_all(&project).expect("create repo dir");
2760        let file = project.join("tracked.txt");
2761        fs::write(&file, "oldtoken\n").expect("write file");
2762        assert!(Command::new("git")
2763            .current_dir(&project)
2764            .arg("init")
2765            .status()
2766            .unwrap()
2767            .success());
2768        assert!(Command::new("git")
2769            .current_dir(&project)
2770            .args(["add", "."])
2771            .status()
2772            .unwrap()
2773            .success());
2774        assert!(Command::new("git")
2775            .current_dir(&project)
2776            .args([
2777                "-c",
2778                "user.name=AFT Tests",
2779                "-c",
2780                "user.email=aft-tests@example.com",
2781                "commit",
2782                "-m",
2783                "initial"
2784            ])
2785            .status()
2786            .unwrap()
2787            .success());
2788        let head = current_git_head(&project);
2789        let mut baseline = SearchIndex::build(&project);
2790        baseline.git_head = head.clone();
2791        fs::write(&file, "newtoken\n").expect("edit tracked file");
2792
2793        let refreshed =
2794            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2795        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
2796
2797        assert_eq!(result.total_matches, 1);
2798    }
2799
2800    #[test]
2801    fn non_git_project_reuses_cache_when_files_unchanged() {
2802        let dir = tempfile::tempdir().expect("create temp dir");
2803        let project = dir.path().join("project");
2804        fs::create_dir_all(&project).expect("create project dir");
2805        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2806        let baseline = SearchIndex::build(&project);
2807        let baseline_file_count = baseline.file_count();
2808
2809        let refreshed =
2810            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2811
2812        assert_eq!(refreshed.file_count(), baseline_file_count);
2813        assert_eq!(
2814            refreshed
2815                .grep("unchangedtoken", true, &[], &[], &project, 10)
2816                .total_matches,
2817            1
2818        );
2819    }
2820
2821    #[test]
2822    fn resolve_search_scope_disables_index_for_external_path() {
2823        let dir = tempfile::tempdir().expect("create temp dir");
2824        let project = dir.path().join("project");
2825        let outside = dir.path().join("outside");
2826        fs::create_dir_all(&project).expect("create project dir");
2827        fs::create_dir_all(&outside).expect("create outside dir");
2828
2829        let scope = resolve_search_scope(&project, outside.to_str());
2830
2831        assert_eq!(
2832            scope.root,
2833            fs::canonicalize(&outside).expect("canonicalize outside")
2834        );
2835        assert!(!scope.use_index);
2836    }
2837
2838    #[test]
2839    fn grep_filters_matches_to_search_root() {
2840        let dir = tempfile::tempdir().expect("create temp dir");
2841        let project = dir.path().join("project");
2842        let src = project.join("src");
2843        let docs = project.join("docs");
2844        fs::create_dir_all(&src).expect("create src dir");
2845        fs::create_dir_all(&docs).expect("create docs dir");
2846        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2847        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2848
2849        let index = SearchIndex::build(&project);
2850        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2851
2852        assert_eq!(result.files_searched, 1);
2853        assert_eq!(result.files_with_matches, 1);
2854        assert_eq!(result.matches.len(), 1);
2855        // Index stores canonicalized paths; on macOS /var → /private/var
2856        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2857        assert_eq!(result.matches[0].file, expected);
2858    }
2859
2860    #[test]
2861    fn grep_deduplicates_multiple_matches_on_same_line() {
2862        let dir = tempfile::tempdir().expect("create temp dir");
2863        let project = dir.path().join("project");
2864        let src = project.join("src");
2865        fs::create_dir_all(&src).expect("create src dir");
2866        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2867
2868        let index = SearchIndex::build(&project);
2869        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2870
2871        assert_eq!(result.total_matches, 1);
2872        assert_eq!(result.matches.len(), 1);
2873    }
2874
2875    #[test]
2876    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
2877        let dir = tempfile::tempdir().expect("create temp dir");
2878        let project = dir.path().join("project");
2879        fs::create_dir_all(&project).expect("create project dir");
2880        let file = project.join("unicode.txt");
2881        fs::write(&file, "äbc\n").expect("write unicode file");
2882
2883        let index = SearchIndex::build(&project);
2884        let result = index.grep("Äbc", false, &[], &[], &project, 10);
2885
2886        assert_eq!(result.total_matches, 1);
2887        assert_eq!(result.matches.len(), 1);
2888        assert_eq!(
2889            result.matches[0].file,
2890            fs::canonicalize(file).expect("canonicalize unicode file")
2891        );
2892    }
2893
2894    #[test]
2895    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
2896        let dir = tempfile::tempdir().expect("create temp dir");
2897        let project = dir.path().join("project");
2898        fs::create_dir_all(&project).expect("create project dir");
2899        let file = project.join("tokens.txt");
2900        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
2901        fs::write(&file, "alpha").expect("write original file");
2902        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
2903
2904        let baseline = SearchIndex::build(&project);
2905        fs::write(&file, "bravo").expect("write same-size edit");
2906        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
2907
2908        let refreshed =
2909            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2910        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
2911        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
2912        let refreshed_id = *refreshed
2913            .path_to_id
2914            .get(&canonical_file)
2915            .expect("file remains indexed");
2916
2917        assert_eq!(result.total_matches, 1);
2918        assert!(refreshed
2919            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
2920            .contains(&refreshed_id));
2921        assert!(!refreshed
2922            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
2923            .contains(&refreshed_id));
2924    }
2925
2926    #[test]
2927    fn grep_reports_total_matches_before_truncation() {
2928        let dir = tempfile::tempdir().expect("create temp dir");
2929        let project = dir.path().join("project");
2930        let src = project.join("src");
2931        fs::create_dir_all(&src).expect("create src dir");
2932        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2933
2934        let index = SearchIndex::build(&project);
2935        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
2936
2937        assert_eq!(result.total_matches, 2);
2938        assert_eq!(result.matches.len(), 1);
2939        assert!(result.truncated);
2940    }
2941
2942    #[test]
2943    fn glob_filters_results_to_search_root() {
2944        let dir = tempfile::tempdir().expect("create temp dir");
2945        let project = dir.path().join("project");
2946        let src = project.join("src");
2947        let scripts = project.join("scripts");
2948        fs::create_dir_all(&src).expect("create src dir");
2949        fs::create_dir_all(&scripts).expect("create scripts dir");
2950        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2951        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2952
2953        let index = SearchIndex::build(&project);
2954        let files = index.glob("**/*.rs", &src);
2955
2956        assert_eq!(
2957            files,
2958            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2959        );
2960    }
2961
2962    #[test]
2963    fn glob_includes_hidden_and_binary_files() {
2964        let dir = tempfile::tempdir().expect("create temp dir");
2965        let project = dir.path().join("project");
2966        let hidden_dir = project.join(".hidden");
2967        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2968        let hidden_file = hidden_dir.join("data.bin");
2969        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2970
2971        let index = SearchIndex::build(&project);
2972        let files = index.glob("**/*.bin", &project);
2973
2974        assert_eq!(
2975            files,
2976            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2977        );
2978    }
2979
2980    #[test]
2981    fn read_from_disk_rejects_invalid_nanos() {
2982        let dir = tempfile::tempdir().expect("create temp dir");
2983        let cache_dir = dir.path().join("cache");
2984        fs::create_dir_all(&cache_dir).expect("create cache dir");
2985
2986        let mut postings = Vec::new();
2987        postings.extend_from_slice(INDEX_MAGIC);
2988        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2989        postings.extend_from_slice(&0u32.to_le_bytes());
2990        postings.extend_from_slice(&1u32.to_le_bytes());
2991        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2992        postings.extend_from_slice(&1u32.to_le_bytes());
2993        postings.extend_from_slice(b"/");
2994        postings.push(0u8);
2995        postings.extend_from_slice(&1u32.to_le_bytes());
2996        postings.extend_from_slice(&0u64.to_le_bytes());
2997        postings.extend_from_slice(&0u64.to_le_bytes());
2998        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2999        postings.extend_from_slice(b"a");
3000        postings.extend_from_slice(&0u64.to_le_bytes());
3001
3002        let mut lookup = Vec::new();
3003        lookup.extend_from_slice(LOOKUP_MAGIC);
3004        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3005        lookup.extend_from_slice(&0u32.to_le_bytes());
3006
3007        let postings_checksum = crc32fast::hash(&postings);
3008        postings.extend_from_slice(&postings_checksum.to_le_bytes());
3009        let lookup_checksum = crc32fast::hash(&lookup);
3010        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
3011        let mut cache = Vec::new();
3012        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
3013        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3014        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
3015        cache.extend_from_slice(&postings);
3016        cache.extend_from_slice(&lookup);
3017        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
3018
3019        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
3020    }
3021
3022    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
3023    /// changed between cmp() calls.
3024    ///
3025    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
3026    /// which does a `stat()` syscall. If the file was deleted, modified, or
3027    /// touched mid-sort, the comparator returned different values for the
3028    /// same input pair on different invocations. Rust's slice::sort detects
3029    /// this and panics with "user-provided comparison function does not
3030    /// correctly implement a total order".
3031    ///
3032    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
3033    /// bridge invalidated files in parallel with grep's sort path. This
3034    /// test simulates the worst case: most paths don't exist (Err from
3035    /// fs::metadata) and sort still completes successfully.
3036    #[test]
3037    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
3038        // Mix of existing and non-existing paths in deliberately
3039        // non-monotonic order — pre-fix, the sort would call stat() at
3040        // least N log N times and any flakiness would trigger the panic.
3041        let dir = tempfile::tempdir().expect("create tempdir");
3042        let mut paths: Vec<PathBuf> = Vec::new();
3043        for i in 0..30 {
3044            // Half exist, half don't.
3045            let path = if i % 2 == 0 {
3046                let p = dir.path().join(format!("real-{i}.rs"));
3047                fs::write(&p, format!("// {i}\n")).expect("write");
3048                p
3049            } else {
3050                dir.path().join(format!("missing-{i}.rs"))
3051            };
3052            paths.push(path);
3053        }
3054
3055        // Run the sort many times to maximise the chance of catching any
3056        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
3057        for _ in 0..50 {
3058            let mut copy = paths.clone();
3059            sort_paths_by_mtime_desc(&mut copy);
3060            assert_eq!(copy.len(), paths.len());
3061        }
3062    }
3063
3064    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
3065    /// the same conditions as sort_paths_by_mtime_desc. See the
3066    /// sort_paths_... test above for the full rationale.
3067    #[test]
3068    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
3069        let dir = tempfile::tempdir().expect("create tempdir");
3070        let mut matches: Vec<GrepMatch> = Vec::new();
3071        for i in 0..30 {
3072            let file = if i % 2 == 0 {
3073                let p = dir.path().join(format!("real-{i}.rs"));
3074                fs::write(&p, format!("// {i}\n")).expect("write");
3075                p
3076            } else {
3077                dir.path().join(format!("missing-{i}.rs"))
3078            };
3079            matches.push(GrepMatch {
3080                file,
3081                line: u32::try_from(i).unwrap_or(0),
3082                column: 0,
3083                line_text: format!("match {i}"),
3084                match_text: format!("match {i}"),
3085            });
3086        }
3087
3088        for _ in 0..50 {
3089            let mut copy = matches.clone();
3090            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
3091            assert_eq!(copy.len(), matches.len());
3092        }
3093    }
3094}
aft/search_index.rs

aft/
search_index.rs