Skip to main content

aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20
21const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
22const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
23const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
24const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
25const INDEX_VERSION: u32 = 3;
26const PREVIEW_BYTES: usize = 8 * 1024;
27const EOF_SENTINEL: u8 = 0;
28const MAX_ENTRIES: usize = 10_000_000;
29const MIN_FILE_ENTRY_BYTES: usize = 57;
30const LOOKUP_ENTRY_BYTES: usize = 16;
31const POSTING_BYTES: usize = 6;
32static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
33
34pub struct CacheLock {
35    _guard: fs_lock::LockGuard,
36}
37
38impl CacheLock {
39    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
40        fs::create_dir_all(cache_dir)?;
41        let path = cache_dir.join("cache.lock");
42        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
43            .lock()
44            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
45        fs_lock::try_acquire(&path, Duration::from_secs(2))
46            .map(|guard| Self { _guard: guard })
47            .map_err(|error| match error {
48                fs_lock::AcquireError::Timeout => {
49                    std::io::Error::other("timed out acquiring search cache lock")
50                }
51                fs_lock::AcquireError::Io(error) => error,
52            })
53    }
54}
55
56#[derive(Clone, Debug)]
57pub struct SearchIndex {
58    pub postings: HashMap<u32, Vec<Posting>>,
59    pub files: Vec<FileEntry>,
60    pub path_to_id: HashMap<PathBuf, u32>,
61    pub ready: bool,
62    project_root: PathBuf,
63    git_head: Option<String>,
64    max_file_size: u64,
65    pub file_trigrams: HashMap<u32, Vec<u32>>,
66    unindexed_files: HashSet<u32>,
67}
68
69impl SearchIndex {
70    /// Number of indexed files.
71    pub fn file_count(&self) -> usize {
72        self.files.len()
73    }
74
75    /// Number of unique trigrams in the index.
76    pub fn trigram_count(&self) -> usize {
77        self.postings.len()
78    }
79
80    /// Compute distinct query trigrams from literal tokens.
81    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
82        query_trigrams_from_tokens(tokens)
83    }
84
85    /// Score-rank file candidates by lexical relevance to query trigrams.
86    pub fn lexical_rank(
87        &self,
88        query_trigrams: &[u32],
89        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
90        max_files: usize,
91    ) -> Vec<(PathBuf, f32)> {
92        if query_trigrams.is_empty() || max_files == 0 {
93            return Vec::new();
94        }
95
96        let mut non_zero: Vec<(u32, usize)> = query_trigrams
97            .iter()
98            .filter_map(|trigram| {
99                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
100                (posting_count > 0).then_some((*trigram, posting_count))
101            })
102            .collect();
103        if non_zero.is_empty() {
104            return Vec::new();
105        }
106
107        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
108        let selected_count = non_zero.len().min(3);
109        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
110
111        let mut candidate_ids = BTreeSet::new();
112        for (trigram, _) in non_zero.iter().take(selected_count) {
113            if let Some(postings) = self.postings.get(trigram) {
114                for posting in postings {
115                    if self.is_active_file(posting.file_id) {
116                        candidate_ids.insert(posting.file_id);
117                    }
118                }
119            }
120        }
121
122        let mut ranked = Vec::new();
123        for file_id in candidate_ids.into_iter().take(candidate_cap) {
124            let Some(entry) = self.files.get(file_id as usize) else {
125                continue;
126            };
127            if let Some(filter) = candidate_filter {
128                if !filter(&entry.path) {
129                    continue;
130                }
131            }
132            let score = lexical_score(self, query_trigrams, file_id);
133            if score > 0.0 {
134                ranked.push((entry.path.clone(), score));
135            }
136        }
137
138        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
139        ranked.truncate(max_files);
140        ranked
141    }
142}
143
144#[derive(Clone, Debug, PartialEq, Eq)]
145pub struct Posting {
146    pub file_id: u32,
147    pub next_mask: u8,
148    pub loc_mask: u8,
149}
150
151#[derive(Clone, Debug)]
152pub struct FileEntry {
153    pub path: PathBuf,
154    pub size: u64,
155    pub modified: SystemTime,
156    pub content_hash: blake3::Hash,
157}
158
159#[derive(Clone, Debug, PartialEq, Eq)]
160pub struct GrepMatch {
161    pub file: PathBuf,
162    pub line: u32,
163    pub column: u32,
164    pub line_text: String,
165    pub match_text: String,
166}
167
168#[derive(Clone, Debug)]
169pub struct GrepResult {
170    pub matches: Vec<GrepMatch>,
171    pub total_matches: usize,
172    pub files_searched: usize,
173    pub files_with_matches: usize,
174    pub index_status: IndexStatus,
175    pub truncated: bool,
176}
177
178#[derive(Clone, Copy, Debug, PartialEq, Eq)]
179pub enum IndexStatus {
180    Ready,
181    Building,
182    Fallback,
183}
184
185impl IndexStatus {
186    pub fn as_str(&self) -> &'static str {
187        match self {
188            IndexStatus::Ready => "Ready",
189            IndexStatus::Building => "Building",
190            IndexStatus::Fallback => "Fallback",
191        }
192    }
193}
194
195#[derive(Clone, Debug, Default)]
196pub struct RegexQuery {
197    pub and_trigrams: Vec<u32>,
198    pub or_groups: Vec<Vec<u32>>,
199    pub(crate) and_filters: HashMap<u32, PostingFilter>,
200    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
201}
202
203#[derive(Clone, Copy, Debug, Default)]
204pub(crate) struct PostingFilter {
205    next_mask: u8,
206    loc_mask: u8,
207}
208
209#[derive(Clone, Debug, Default)]
210struct QueryBuild {
211    and_runs: Vec<Vec<u8>>,
212    or_groups: Vec<Vec<Vec<u8>>>,
213}
214
215#[derive(Clone, Debug, Default)]
216pub(crate) struct PathFilters {
217    includes: Option<GlobSet>,
218    excludes: Option<GlobSet>,
219}
220
221#[derive(Clone, Debug)]
222pub(crate) struct SearchScope {
223    pub root: PathBuf,
224    pub use_index: bool,
225}
226
227#[derive(Clone, Debug)]
228struct SharedGrepMatch {
229    file: Arc<PathBuf>,
230    line: u32,
231    column: u32,
232    line_text: String,
233    match_text: String,
234}
235
236#[derive(Clone, Debug)]
237enum SearchMatcher {
238    Literal(LiteralSearch),
239    Regex(Regex),
240}
241
242#[derive(Clone, Debug)]
243enum LiteralSearch {
244    CaseSensitive(Vec<u8>),
245    AsciiCaseInsensitive(Vec<u8>),
246}
247
248impl SearchIndex {
249    pub fn new() -> Self {
250        SearchIndex {
251            postings: HashMap::new(),
252            files: Vec::new(),
253            path_to_id: HashMap::new(),
254            ready: false,
255            project_root: PathBuf::new(),
256            git_head: None,
257            max_file_size: DEFAULT_MAX_FILE_SIZE,
258            file_trigrams: HashMap::new(),
259            unindexed_files: HashSet::new(),
260        }
261    }
262
263    pub fn build(root: &Path) -> Self {
264        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
265    }
266
267    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
268        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
269        let mut index = SearchIndex {
270            project_root: project_root.clone(),
271            max_file_size,
272            ..SearchIndex::new()
273        };
274
275        let filters = PathFilters::default();
276        for path in walk_project_files(&project_root, &filters) {
277            index.update_file(&path);
278        }
279
280        index.git_head = current_git_head(&project_root);
281        index.ready = true;
282        index
283    }
284
285    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
286        self.remove_file(path);
287
288        let file_id = match self.allocate_file_id(path, content.len() as u64) {
289            Some(file_id) => file_id,
290            None => return,
291        };
292        if let Some(file) = self.files.get_mut(file_id as usize) {
293            file.content_hash = cache_freshness::hash_bytes(content);
294        }
295
296        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
297        for (trigram, next_char, position) in extract_trigrams(content) {
298            let entry = trigram_map.entry(trigram).or_default();
299            entry.next_mask |= mask_for_next_char(next_char);
300            entry.loc_mask |= mask_for_position(position);
301        }
302
303        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
304        for (trigram, filter) in trigram_map {
305            let postings = self.postings.entry(trigram).or_default();
306            postings.push(Posting {
307                file_id,
308                next_mask: filter.next_mask,
309                loc_mask: filter.loc_mask,
310            });
311            // Posting lists are kept sorted by file_id for binary search during
312            // intersection. Since file_ids are allocated incrementally, the new
313            // entry is usually already in order. Only sort when needed.
314            if postings.len() > 1
315                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
316            {
317                postings.sort_unstable_by_key(|p| p.file_id);
318            }
319            file_trigrams.push(trigram);
320        }
321
322        self.file_trigrams.insert(file_id, file_trigrams);
323        self.unindexed_files.remove(&file_id);
324    }
325
326    pub fn remove_file(&mut self, path: &Path) {
327        let Some(file_id) = self.path_to_id.remove(path) else {
328            return;
329        };
330
331        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
332            for trigram in trigrams {
333                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
334                    postings.retain(|posting| posting.file_id != file_id);
335                    postings.is_empty()
336                } else {
337                    false
338                };
339
340                if should_remove {
341                    self.postings.remove(&trigram);
342                }
343            }
344        }
345
346        self.unindexed_files.remove(&file_id);
347        if let Some(file) = self.files.get_mut(file_id as usize) {
348            file.path = PathBuf::new();
349            file.size = 0;
350            file.modified = UNIX_EPOCH;
351            file.content_hash = cache_freshness::zero_hash();
352        }
353    }
354
355    pub fn update_file(&mut self, path: &Path) {
356        self.remove_file(path);
357
358        let metadata = match fs::metadata(path) {
359            Ok(metadata) if metadata.is_file() => metadata,
360            _ => return,
361        };
362
363        if is_binary_path(path, metadata.len()) {
364            self.track_unindexed_file(path, &metadata);
365            return;
366        }
367
368        if metadata.len() > self.max_file_size {
369            self.track_unindexed_file(path, &metadata);
370            return;
371        }
372
373        let content = match fs::read(path) {
374            Ok(content) => content,
375            Err(_) => return,
376        };
377
378        if is_binary_bytes(&content) {
379            self.track_unindexed_file(path, &metadata);
380            return;
381        }
382
383        self.index_file(path, &content);
384    }
385
386    pub fn grep(
387        &self,
388        pattern: &str,
389        case_sensitive: bool,
390        include: &[String],
391        exclude: &[String],
392        search_root: &Path,
393        max_results: usize,
394    ) -> GrepResult {
395        self.search_grep(
396            pattern,
397            case_sensitive,
398            include,
399            exclude,
400            search_root,
401            max_results,
402        )
403    }
404
405    pub fn search_grep(
406        &self,
407        pattern: &str,
408        case_sensitive: bool,
409        include: &[String],
410        exclude: &[String],
411        search_root: &Path,
412        max_results: usize,
413    ) -> GrepResult {
414        // Detect if pattern is a plain literal (no regex metacharacters).
415        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
416        let is_literal = !pattern.chars().any(|c| {
417            matches!(
418                c,
419                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
420            )
421        });
422
423        let literal_search = if is_literal {
424            if case_sensitive {
425                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
426            } else if pattern.is_ascii() {
427                Some(LiteralSearch::AsciiCaseInsensitive(
428                    pattern
429                        .as_bytes()
430                        .iter()
431                        .map(|byte| byte.to_ascii_lowercase())
432                        .collect(),
433                ))
434            } else {
435                None
436            }
437        } else {
438            None
439        };
440
441        // Build the regex for non-literal patterns (or literal Unicode fallback).
442        let regex = if literal_search.is_some() {
443            None
444        } else {
445            let regex_pattern = if is_literal {
446                regex::escape(pattern)
447            } else {
448                pattern.to_string()
449            };
450            let mut builder = RegexBuilder::new(&regex_pattern);
451            builder.case_insensitive(!case_sensitive);
452            // Treat `^` and `$` as line anchors (grep semantics), not file anchors.
453            builder.multi_line(true);
454            match builder.build() {
455                Ok(r) => Some(r),
456                Err(_) => {
457                    return GrepResult {
458                        matches: Vec::new(),
459                        total_matches: 0,
460                        files_searched: 0,
461                        files_with_matches: 0,
462                        index_status: if self.ready {
463                            IndexStatus::Ready
464                        } else {
465                            IndexStatus::Building
466                        },
467                        truncated: false,
468                    };
469                }
470            }
471        };
472
473        let matcher = if let Some(literal_search) = literal_search {
474            SearchMatcher::Literal(literal_search)
475        } else {
476            SearchMatcher::Regex(
477                regex.expect("regex should exist when literal matcher is unavailable"),
478            )
479        };
480
481        let filters = match build_path_filters(include, exclude) {
482            Ok(filters) => filters,
483            Err(_) => PathFilters::default(),
484        };
485        let search_root = canonicalize_or_normalize(search_root);
486
487        let query = if !case_sensitive && !pattern.is_ascii() {
488            RegexQuery::default()
489        } else {
490            decompose_regex(pattern)
491        };
492        let candidate_ids = self.candidates(&query);
493
494        let candidate_files: Vec<&FileEntry> = candidate_ids
495            .into_iter()
496            .filter_map(|file_id| self.files.get(file_id as usize))
497            .filter(|file| !file.path.as_os_str().is_empty())
498            .filter(|file| is_within_search_root(&search_root, &file.path))
499            .filter(|file| filters.matches(&self.project_root, &file.path))
500            .collect();
501
502        let total_matches = AtomicUsize::new(0);
503        let files_searched = AtomicUsize::new(0);
504        let files_with_matches = AtomicUsize::new(0);
505        let truncated = AtomicBool::new(false);
506        let stop_after = max_results.saturating_mul(2);
507
508        let mut matches = if candidate_files.len() > 10 {
509            candidate_files
510                .par_iter()
511                .map(|file| {
512                    search_candidate_file(
513                        file,
514                        &matcher,
515                        max_results,
516                        stop_after,
517                        &total_matches,
518                        &files_searched,
519                        &files_with_matches,
520                        &truncated,
521                    )
522                })
523                .reduce(Vec::new, |mut left, mut right| {
524                    left.append(&mut right);
525                    left
526                })
527        } else {
528            let mut matches = Vec::new();
529            for file in candidate_files {
530                matches.extend(search_candidate_file(
531                    file,
532                    &matcher,
533                    max_results,
534                    stop_after,
535                    &total_matches,
536                    &files_searched,
537                    &files_with_matches,
538                    &truncated,
539                ));
540
541                if should_stop_search(&truncated, &total_matches, stop_after) {
542                    break;
543                }
544            }
545            matches
546        };
547
548        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
549            self.path_to_id
550                .get(path)
551                .and_then(|file_id| self.files.get(*file_id as usize))
552                .map(|file| file.modified)
553        });
554
555        let matches = matches
556            .into_iter()
557            .map(|matched| GrepMatch {
558                file: matched.file.as_ref().clone(),
559                line: matched.line,
560                column: matched.column,
561                line_text: matched.line_text,
562                match_text: matched.match_text,
563            })
564            .collect();
565
566        GrepResult {
567            total_matches: total_matches.load(Ordering::Relaxed),
568            matches,
569            files_searched: files_searched.load(Ordering::Relaxed),
570            files_with_matches: files_with_matches.load(Ordering::Relaxed),
571            index_status: if self.ready {
572                IndexStatus::Ready
573            } else {
574                IndexStatus::Building
575            },
576            truncated: truncated.load(Ordering::Relaxed),
577        }
578    }
579
580    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
581        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
582            Ok(filters) => filters,
583            Err(_) => return Vec::new(),
584        };
585        let search_root = canonicalize_or_normalize(search_root);
586        let mut entries = self
587            .files
588            .iter()
589            .filter(|file| !file.path.as_os_str().is_empty())
590            .filter(|file| is_within_search_root(&search_root, &file.path))
591            .filter(|file| filters.matches(&self.project_root, &file.path))
592            .map(|file| (file.path.clone(), file.modified))
593            .collect::<Vec<_>>();
594
595        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
596            right_mtime
597                .cmp(left_mtime)
598                .then_with(|| left_path.cmp(right_path))
599        });
600
601        entries.into_iter().map(|(path, _)| path).collect()
602    }
603
604    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
605        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
606            return self.active_file_ids();
607        }
608
609        let mut and_trigrams = query.and_trigrams.clone();
610        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
611
612        let mut current: Option<Vec<u32>> = None;
613
614        for trigram in and_trigrams {
615            let filter = query.and_filters.get(&trigram).copied();
616            let matches = self.postings_for_trigram(trigram, filter);
617            current = Some(match current.take() {
618                Some(existing) => intersect_sorted_ids(&existing, &matches),
619                None => matches,
620            });
621
622            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
623                break;
624            }
625        }
626
627        let mut current = current.unwrap_or_else(|| self.active_file_ids());
628
629        for (index, group) in query.or_groups.iter().enumerate() {
630            let mut group_matches = Vec::new();
631            let filters = query.or_filters.get(index);
632
633            for trigram in group {
634                let filter = filters.and_then(|filters| filters.get(trigram).copied());
635                let matches = self.postings_for_trigram(*trigram, filter);
636                if group_matches.is_empty() {
637                    group_matches = matches;
638                } else {
639                    group_matches = union_sorted_ids(&group_matches, &matches);
640                }
641            }
642
643            current = intersect_sorted_ids(&current, &group_matches);
644            if current.is_empty() {
645                break;
646            }
647        }
648
649        let mut unindexed = self
650            .unindexed_files
651            .iter()
652            .copied()
653            .filter(|file_id| self.is_active_file(*file_id))
654            .collect::<Vec<_>>();
655        if !unindexed.is_empty() {
656            unindexed.sort_unstable();
657            current = union_sorted_ids(&current, &unindexed);
658        }
659
660        current
661    }
662
663    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
664        if fs::create_dir_all(cache_dir).is_err() {
665            return;
666        }
667
668        let cache_path = cache_dir.join("cache.bin");
669        let tmp_cache = cache_dir.join(format!(
670            "cache.bin.tmp.{}.{}",
671            std::process::id(),
672            SystemTime::now()
673                .duration_since(UNIX_EPOCH)
674                .unwrap_or(Duration::ZERO)
675                .as_nanos()
676        ));
677
678        let active_ids = self.active_file_ids();
679        let mut id_map = HashMap::new();
680        for (new_id, old_id) in active_ids.iter().enumerate() {
681            let Ok(new_id_u32) = u32::try_from(new_id) else {
682                return;
683            };
684            id_map.insert(*old_id, new_id_u32);
685        }
686
687        let write_result = (|| -> std::io::Result<()> {
688            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
689
690            postings_writer.write_all(INDEX_MAGIC)?;
691            write_u32(&mut postings_writer, INDEX_VERSION)?;
692
693            let head = git_head.unwrap_or_default();
694            let root = self.project_root.to_string_lossy();
695            let head_len = u32::try_from(head.len())
696                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
697            let root_len = u32::try_from(root.len())
698                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
699            let file_count = u32::try_from(active_ids.len())
700                .map_err(|_| std::io::Error::other("too many files to cache"))?;
701
702            write_u32(&mut postings_writer, head_len)?;
703            write_u32(&mut postings_writer, root_len)?;
704            write_u64(&mut postings_writer, self.max_file_size)?;
705            write_u32(&mut postings_writer, file_count)?;
706            postings_writer.write_all(head.as_bytes())?;
707            postings_writer.write_all(root.as_bytes())?;
708
709            for old_id in &active_ids {
710                let Some(file) = self.files.get(*old_id as usize) else {
711                    return Err(std::io::Error::other("missing file entry for cache write"));
712                };
713                let path =
714                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
715                        std::io::Error::other(format!(
716                            "refusing to cache path outside project root: {}",
717                            file.path.display()
718                        ))
719                    })?;
720                let path = path.to_string_lossy();
721                let path_len = u32::try_from(path.len())
722                    .map_err(|_| std::io::Error::other("cached path too large"))?;
723                let modified = file
724                    .modified
725                    .duration_since(UNIX_EPOCH)
726                    .unwrap_or(Duration::ZERO);
727                let unindexed = if self.unindexed_files.contains(old_id) {
728                    1u8
729                } else {
730                    0u8
731                };
732
733                postings_writer.write_all(&[unindexed])?;
734                write_u32(&mut postings_writer, path_len)?;
735                write_u64(&mut postings_writer, file.size)?;
736                write_u64(&mut postings_writer, modified.as_secs())?;
737                write_u32(&mut postings_writer, modified.subsec_nanos())?;
738                postings_writer.write_all(file.content_hash.as_bytes())?;
739                postings_writer.write_all(path.as_bytes())?;
740            }
741
742            let mut lookup_entries = Vec::new();
743            let mut postings_blob = Vec::new();
744            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
745            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
746
747            for (trigram, postings) in sorted_postings {
748                let offset = u64::try_from(postings_blob.len())
749                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
750                let mut count = 0u32;
751
752                for posting in postings {
753                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
754                        continue;
755                    };
756
757                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
758                    postings_blob.push(posting.next_mask);
759                    postings_blob.push(posting.loc_mask);
760                    count = count.saturating_add(1);
761                }
762
763                if count > 0 {
764                    lookup_entries.push((*trigram, offset, count));
765                }
766            }
767
768            write_u64(
769                &mut postings_writer,
770                u64::try_from(postings_blob.len())
771                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
772            )?;
773            postings_writer.write_all(&postings_blob)?;
774            postings_writer.flush()?;
775            let mut postings_blob_file = postings_writer
776                .into_inner()
777                .map_err(|error| std::io::Error::other(error.to_string()))?
778                .into_inner();
779            let checksum = crc32fast::hash(&postings_blob_file);
780            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
781
782            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
783            let entry_count = u32::try_from(lookup_entries.len())
784                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
785
786            lookup_writer.write_all(LOOKUP_MAGIC)?;
787            write_u32(&mut lookup_writer, INDEX_VERSION)?;
788            write_u32(&mut lookup_writer, entry_count)?;
789
790            for (trigram, offset, count) in lookup_entries {
791                write_u32(&mut lookup_writer, trigram)?;
792                write_u64(&mut lookup_writer, offset)?;
793                write_u32(&mut lookup_writer, count)?;
794            }
795
796            lookup_writer.flush()?;
797            let mut lookup_blob_file = lookup_writer
798                .into_inner()
799                .map_err(|error| std::io::Error::other(error.to_string()))?
800                .into_inner();
801            let checksum = crc32fast::hash(&lookup_blob_file);
802            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
803
804            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
805            write_u32(&mut cache_writer, CACHE_MAGIC)?;
806            write_u32(&mut cache_writer, INDEX_VERSION)?;
807            write_u64(
808                &mut cache_writer,
809                u64::try_from(postings_blob_file.len())
810                    .map_err(|_| std::io::Error::other("postings section too large"))?,
811            )?;
812            cache_writer.write_all(&postings_blob_file)?;
813            cache_writer.write_all(&lookup_blob_file)?;
814            cache_writer.flush()?;
815            cache_writer.get_ref().sync_all()?;
816            drop(cache_writer);
817            fs::rename(&tmp_cache, &cache_path)?;
818
819            Ok(())
820        })();
821
822        if write_result.is_err() {
823            let _ = fs::remove_file(&tmp_cache);
824        }
825    }
826
827    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
828        debug_assert!(current_canonical_root.is_absolute());
829        let cache_path = cache_dir.join("cache.bin");
830        let cache_bytes = fs::read(&cache_path).ok()?;
831        if cache_bytes.len() < 16 {
832            return None;
833        }
834        let mut header = Cursor::new(&cache_bytes);
835        if read_u32(&mut header).ok()? != CACHE_MAGIC {
836            return None;
837        }
838        if read_u32(&mut header).ok()? != INDEX_VERSION {
839            return None;
840        }
841        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
842        let start = usize::try_from(header.position()).ok()?;
843        let postings_end = start.checked_add(postings_len_total)?;
844        if postings_end > cache_bytes.len() {
845            return None;
846        }
847        let postings_bytes = &cache_bytes[start..postings_end];
848        let lookup_bytes = &cache_bytes[postings_end..];
849        let lookup_len_total = lookup_bytes.len();
850        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
851        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
852        if postings_len_total < 4 || lookup_len_total < 4 {
853            return None;
854        }
855        verify_crc32_bytes_slice(postings_bytes).ok()?;
856        verify_crc32_bytes_slice(lookup_bytes).ok()?;
857
858        let mut magic = [0u8; 8];
859        postings_reader.read_exact(&mut magic).ok()?;
860        if &magic != INDEX_MAGIC {
861            return None;
862        }
863        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
864            return None;
865        }
866
867        let head_len = read_u32(&mut postings_reader).ok()? as usize;
868        let root_len = read_u32(&mut postings_reader).ok()? as usize;
869        let max_file_size = read_u64(&mut postings_reader).ok()?;
870        let file_count = read_u32(&mut postings_reader).ok()? as usize;
871        if file_count > MAX_ENTRIES {
872            return None;
873        }
874        let postings_body_len = postings_len_total.checked_sub(4)?;
875        let lookup_body_len = lookup_len_total.checked_sub(4)?;
876
877        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
878        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
879        if minimum_file_bytes > remaining_postings {
880            return None;
881        }
882
883        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
884            return None;
885        }
886        let mut head_bytes = vec![0u8; head_len];
887        postings_reader.read_exact(&mut head_bytes).ok()?;
888        let git_head = String::from_utf8(head_bytes)
889            .ok()
890            .filter(|head| !head.is_empty());
891
892        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
893            return None;
894        }
895        let mut root_bytes = vec![0u8; root_len];
896        postings_reader.read_exact(&mut root_bytes).ok()?;
897        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
898        let project_root = current_canonical_root.to_path_buf();
899
900        let mut files = Vec::with_capacity(file_count);
901        let mut path_to_id = HashMap::new();
902        let mut unindexed_files = HashSet::new();
903
904        for file_id in 0..file_count {
905            let mut unindexed = [0u8; 1];
906            postings_reader.read_exact(&mut unindexed).ok()?;
907            let path_len = read_u32(&mut postings_reader).ok()? as usize;
908            let size = read_u64(&mut postings_reader).ok()?;
909            let secs = read_u64(&mut postings_reader).ok()?;
910            let nanos = read_u32(&mut postings_reader).ok()?;
911            let mut hash_bytes = [0u8; 32];
912            postings_reader.read_exact(&mut hash_bytes).ok()?;
913            let content_hash = blake3::Hash::from_bytes(hash_bytes);
914            if nanos >= 1_000_000_000 {
915                return None;
916            }
917            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
918                return None;
919            }
920            let mut path_bytes = vec![0u8; path_len];
921            postings_reader.read_exact(&mut path_bytes).ok()?;
922            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
923            let full_path = cached_path_under_root(&project_root, &relative_path)?;
924            let file_id_u32 = u32::try_from(file_id).ok()?;
925
926            files.push(FileEntry {
927                path: full_path.clone(),
928                size,
929                modified: UNIX_EPOCH + Duration::new(secs, nanos),
930                content_hash,
931            });
932            path_to_id.insert(full_path, file_id_u32);
933            if unindexed[0] == 1 {
934                unindexed_files.insert(file_id_u32);
935            }
936        }
937
938        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
939        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
940        if postings_len > max_postings_bytes {
941            return None;
942        }
943        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
944            return None;
945        }
946        let mut postings_blob = vec![0u8; postings_len];
947        postings_reader.read_exact(&mut postings_blob).ok()?;
948
949        let mut lookup_magic = [0u8; 8];
950        lookup_reader.read_exact(&mut lookup_magic).ok()?;
951        if &lookup_magic != LOOKUP_MAGIC {
952            return None;
953        }
954        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
955            return None;
956        }
957        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
958        if entry_count > MAX_ENTRIES {
959            return None;
960        }
961        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
962        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
963        if minimum_lookup_bytes > remaining_lookup {
964            return None;
965        }
966
967        let mut postings = HashMap::new();
968        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
969
970        for _ in 0..entry_count {
971            let trigram = read_u32(&mut lookup_reader).ok()?;
972            let offset = read_u64(&mut lookup_reader).ok()? as usize;
973            let count = read_u32(&mut lookup_reader).ok()? as usize;
974            if count > MAX_ENTRIES {
975                return None;
976            }
977            let bytes_len = count.checked_mul(POSTING_BYTES)?;
978            let end = offset.checked_add(bytes_len)?;
979            if end > postings_blob.len() {
980                return None;
981            }
982
983            let mut trigram_postings = Vec::with_capacity(count);
984            for chunk in postings_blob[offset..end].chunks_exact(6) {
985                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
986                let posting = Posting {
987                    file_id,
988                    next_mask: chunk[4],
989                    loc_mask: chunk[5],
990                };
991                trigram_postings.push(posting.clone());
992                file_trigrams.entry(file_id).or_default().push(trigram);
993            }
994            postings.insert(trigram, trigram_postings);
995        }
996
997        Some(SearchIndex {
998            postings,
999            files,
1000            path_to_id,
1001            ready: false,
1002            project_root,
1003            git_head,
1004            max_file_size,
1005            file_trigrams,
1006            unindexed_files,
1007        })
1008    }
1009
1010    pub(crate) fn stored_git_head(&self) -> Option<&str> {
1011        self.git_head.as_deref()
1012    }
1013
1014    pub(crate) fn set_ready(&mut self, ready: bool) {
1015        self.ready = ready;
1016    }
1017
1018    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1019        self.git_head = current_head;
1020        verify_file_mtimes(self);
1021        self.ready = true;
1022    }
1023
1024    pub(crate) fn rebuild_or_refresh(
1025        root: &Path,
1026        max_file_size: u64,
1027        current_head: Option<String>,
1028        baseline: Option<SearchIndex>,
1029    ) -> Self {
1030        if let Some(mut baseline) = baseline {
1031            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1032            baseline.max_file_size = max_file_size;
1033
1034            if baseline.git_head == current_head || current_head.is_none() {
1035                // HEAD matches, but files may have changed on disk since the index was
1036                // last written (e.g., uncommitted edits, stash pop, manual file changes
1037                // while OpenCode was closed). Verify mtimes and re-index stale files.
1038                // Non-git projects also use this per-file (path, mtime, size)
1039                // fingerprint so unchanged trees reuse the disk cache instead of
1040                // rebuilding every configure.
1041                baseline.git_head = current_head;
1042                verify_file_mtimes(&mut baseline);
1043                baseline.ready = true;
1044                return baseline;
1045            }
1046
1047            if let (Some(previous), Some(current)) =
1048                (baseline.git_head.clone(), current_head.clone())
1049            {
1050                let project_root = baseline.project_root.clone();
1051                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1052                    baseline.git_head = Some(current);
1053                    verify_file_mtimes(&mut baseline);
1054                    baseline.ready = true;
1055                    return baseline;
1056                }
1057            }
1058        }
1059
1060        SearchIndex::build_with_limit(root, max_file_size)
1061    }
1062
1063    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1064        let file_id = u32::try_from(self.files.len()).ok()?;
1065        let metadata = fs::metadata(path).ok();
1066        let size = metadata
1067            .as_ref()
1068            .map_or(size_hint, |metadata| metadata.len());
1069        let modified = metadata
1070            .and_then(|metadata| metadata.modified().ok())
1071            .unwrap_or(UNIX_EPOCH);
1072
1073        self.files.push(FileEntry {
1074            path: path.to_path_buf(),
1075            size,
1076            modified,
1077            content_hash: cache_freshness::zero_hash(),
1078        });
1079        self.path_to_id.insert(path.to_path_buf(), file_id);
1080        Some(file_id)
1081    }
1082
1083    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1084        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1085            return;
1086        };
1087        self.unindexed_files.insert(file_id);
1088        self.file_trigrams.insert(file_id, Vec::new());
1089    }
1090
1091    fn active_file_ids(&self) -> Vec<u32> {
1092        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1093        ids.sort_unstable();
1094        ids
1095    }
1096
1097    fn is_active_file(&self, file_id: u32) -> bool {
1098        self.files
1099            .get(file_id as usize)
1100            .map(|file| !file.path.as_os_str().is_empty())
1101            .unwrap_or(false)
1102    }
1103
1104    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1105        let Some(postings) = self.postings.get(&trigram) else {
1106            return Vec::new();
1107        };
1108
1109        let mut matches = Vec::with_capacity(postings.len());
1110
1111        for posting in postings {
1112            if let Some(filter) = filter {
1113                // next_mask: bloom filter check — the character following this trigram in the
1114                // query must also appear after this trigram somewhere in the file.
1115                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1116                    continue;
1117                }
1118                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1119                // between consecutive trigram pairs, but is NOT used as a single-trigram
1120                // filter because the position in the query string has no relationship to
1121                // the position in the file. Using it here causes false negatives.
1122            }
1123            if self.is_active_file(posting.file_id) {
1124                matches.push(posting.file_id);
1125            }
1126        }
1127
1128        matches
1129    }
1130}
1131
1132fn search_candidate_file(
1133    file: &FileEntry,
1134    matcher: &SearchMatcher,
1135    max_results: usize,
1136    stop_after: usize,
1137    total_matches: &AtomicUsize,
1138    files_searched: &AtomicUsize,
1139    files_with_matches: &AtomicUsize,
1140    truncated: &AtomicBool,
1141) -> Vec<SharedGrepMatch> {
1142    if should_stop_search(truncated, total_matches, stop_after) {
1143        return Vec::new();
1144    }
1145
1146    let content = match read_indexed_file_bytes(&file.path) {
1147        Some(content) => content,
1148        None => return Vec::new(),
1149    };
1150    // Defense in depth: even though indexing tries to filter binaries via
1151    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1152    // query time. content_inspector is fast (~bytes-per-cycle on a small
1153    // preview) and this guarantees we never surface matches inside binary
1154    // files even if the indexer somehow let one through (e.g. file changed
1155    // between indexing and query).
1156    if is_binary_bytes(&content) {
1157        return Vec::new();
1158    }
1159    files_searched.fetch_add(1, Ordering::Relaxed);
1160
1161    let shared_path = Arc::new(file.path.clone());
1162    let mut matches = Vec::new();
1163    let mut line_starts = None;
1164    let mut seen_lines = HashSet::new();
1165    let mut matched_this_file = false;
1166
1167    match matcher {
1168        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1169            let finder = memchr::memmem::Finder::new(needle);
1170            let mut start = 0;
1171
1172            while let Some(position) = finder.find(&content[start..]) {
1173                if should_stop_search(truncated, total_matches, stop_after) {
1174                    break;
1175                }
1176
1177                let offset = start + position;
1178                start = offset + 1;
1179
1180                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1181                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1182                if !seen_lines.insert(line) {
1183                    continue;
1184                }
1185
1186                matched_this_file = true;
1187                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1188                if match_number > max_results {
1189                    truncated.store(true, Ordering::Relaxed);
1190                    break;
1191                }
1192
1193                let end = offset + needle.len();
1194                matches.push(SharedGrepMatch {
1195                    file: shared_path.clone(),
1196                    line,
1197                    column,
1198                    line_text,
1199                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1200                });
1201            }
1202        }
1203        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1204            let search_content = content.to_ascii_lowercase();
1205            let finder = memchr::memmem::Finder::new(needle);
1206            let mut start = 0;
1207
1208            while let Some(position) = finder.find(&search_content[start..]) {
1209                if should_stop_search(truncated, total_matches, stop_after) {
1210                    break;
1211                }
1212
1213                let offset = start + position;
1214                start = offset + 1;
1215
1216                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1217                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1218                if !seen_lines.insert(line) {
1219                    continue;
1220                }
1221
1222                matched_this_file = true;
1223                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1224                if match_number > max_results {
1225                    truncated.store(true, Ordering::Relaxed);
1226                    break;
1227                }
1228
1229                let end = offset + needle.len();
1230                matches.push(SharedGrepMatch {
1231                    file: shared_path.clone(),
1232                    line,
1233                    column,
1234                    line_text,
1235                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1236                });
1237            }
1238        }
1239        SearchMatcher::Regex(regex) => {
1240            for matched in regex.find_iter(&content) {
1241                if should_stop_search(truncated, total_matches, stop_after) {
1242                    break;
1243                }
1244
1245                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1246                let (line, column, line_text) =
1247                    line_details_bytes(&content, line_starts, matched.start());
1248                if !seen_lines.insert(line) {
1249                    continue;
1250                }
1251
1252                matched_this_file = true;
1253                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1254                if match_number > max_results {
1255                    truncated.store(true, Ordering::Relaxed);
1256                    break;
1257                }
1258
1259                matches.push(SharedGrepMatch {
1260                    file: shared_path.clone(),
1261                    line,
1262                    column,
1263                    line_text,
1264                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1265                });
1266            }
1267        }
1268    }
1269
1270    if matched_this_file {
1271        files_with_matches.fetch_add(1, Ordering::Relaxed);
1272    }
1273
1274    matches
1275}
1276
1277fn should_stop_search(
1278    truncated: &AtomicBool,
1279    total_matches: &AtomicUsize,
1280    stop_after: usize,
1281) -> bool {
1282    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1283}
1284
1285fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1286    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1287    let mut left_index = 0;
1288    let mut right_index = 0;
1289
1290    while left_index < left.len() && right_index < right.len() {
1291        match left[left_index].cmp(&right[right_index]) {
1292            std::cmp::Ordering::Less => left_index += 1,
1293            std::cmp::Ordering::Greater => right_index += 1,
1294            std::cmp::Ordering::Equal => {
1295                merged.push(left[left_index]);
1296                left_index += 1;
1297                right_index += 1;
1298            }
1299        }
1300    }
1301
1302    merged
1303}
1304
1305fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1306    let mut merged = Vec::with_capacity(left.len() + right.len());
1307    let mut left_index = 0;
1308    let mut right_index = 0;
1309
1310    while left_index < left.len() && right_index < right.len() {
1311        match left[left_index].cmp(&right[right_index]) {
1312            std::cmp::Ordering::Less => {
1313                merged.push(left[left_index]);
1314                left_index += 1;
1315            }
1316            std::cmp::Ordering::Greater => {
1317                merged.push(right[right_index]);
1318                right_index += 1;
1319            }
1320            std::cmp::Ordering::Equal => {
1321                merged.push(left[left_index]);
1322                left_index += 1;
1323                right_index += 1;
1324            }
1325        }
1326    }
1327
1328    merged.extend_from_slice(&left[left_index..]);
1329    merged.extend_from_slice(&right[right_index..]);
1330    merged
1331}
1332
1333pub fn decompose_regex(pattern: &str) -> RegexQuery {
1334    let hir = match regex_syntax::parse(pattern) {
1335        Ok(hir) => hir,
1336        Err(_) => return RegexQuery::default(),
1337    };
1338
1339    let build = build_query(&hir);
1340    build.into_query()
1341}
1342
1343pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1344    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1345}
1346
1347pub fn normalize_char(c: u8) -> u8 {
1348    c.to_ascii_lowercase()
1349}
1350
1351pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1352    if content.len() < 3 {
1353        return Vec::new();
1354    }
1355
1356    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1357    for start in 0..=content.len() - 3 {
1358        let trigram = pack_trigram(
1359            normalize_char(content[start]),
1360            normalize_char(content[start + 1]),
1361            normalize_char(content[start + 2]),
1362        );
1363        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1364        trigrams.push((trigram, next_char, start));
1365    }
1366    trigrams
1367}
1368
1369pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1370    let mut seen = HashSet::new();
1371    let mut out = Vec::new();
1372    for token in tokens {
1373        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1374            if seen.insert(trigram) {
1375                out.push(trigram);
1376            }
1377        }
1378    }
1379    out
1380}
1381
1382pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1383    if query_trigrams.is_empty() {
1384        return 0.0;
1385    }
1386
1387    let mut hits = 0u32;
1388    for &trigram in query_trigrams {
1389        if let Some(postings) = index.postings.get(&trigram) {
1390            if postings
1391                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1392                .is_ok()
1393            {
1394                hits += 1;
1395            }
1396        }
1397    }
1398
1399    if hits == 0 {
1400        return 0.0;
1401    }
1402
1403    let file_trigram_count = index
1404        .file_trigrams
1405        .get(&file_id)
1406        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1407    (hits as f32) / (1.0 + file_trigram_count.ln())
1408}
1409
1410pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1411    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1412    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1413        return PathBuf::from(override_dir)
1414            .join("index")
1415            .join(project_cache_key(project_root));
1416    }
1417    // Use configured storage dir (from plugin, XDG-compliant)
1418    if let Some(dir) = storage_dir {
1419        return dir.join("index").join(project_cache_key(project_root));
1420    }
1421    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1422    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1423    // If neither is set, fall back to a temp directory rather than `"."`
1424    // because the search-index code reads/writes absolute paths.
1425    let home = std::env::var_os("HOME")
1426        .or_else(|| std::env::var_os("USERPROFILE"))
1427        .map(PathBuf::from)
1428        .unwrap_or_else(std::env::temp_dir);
1429    home.join(".cache")
1430        .join("aft")
1431        .join("index")
1432        .join(project_cache_key(project_root))
1433}
1434
1435pub(crate) fn build_path_filters(
1436    include: &[String],
1437    exclude: &[String],
1438) -> Result<PathFilters, String> {
1439    Ok(PathFilters {
1440        includes: build_globset(include)?,
1441        excludes: build_globset(exclude)?,
1442    })
1443}
1444
1445pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1446    walk_project_files_from(root, root, filters)
1447}
1448
1449pub(crate) fn walk_project_files_from(
1450    filter_root: &Path,
1451    search_root: &Path,
1452    filters: &PathFilters,
1453) -> Vec<PathBuf> {
1454    let mut builder = WalkBuilder::new(search_root);
1455    builder
1456        .hidden(false)
1457        .git_ignore(true)
1458        .git_global(true)
1459        .git_exclude(true)
1460        .filter_entry(|entry| {
1461            let name = entry.file_name().to_string_lossy();
1462            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1463                return !matches!(
1464                    name.as_ref(),
1465                    "node_modules"
1466                        | "target"
1467                        | "venv"
1468                        | ".venv"
1469                        | ".git"
1470                        | "__pycache__"
1471                        | ".tox"
1472                        | "dist"
1473                        | "build"
1474                );
1475            }
1476            true
1477        });
1478
1479    let mut files = Vec::new();
1480    for entry in builder.build().filter_map(|entry| entry.ok()) {
1481        if !entry
1482            .file_type()
1483            .map_or(false, |file_type| file_type.is_file())
1484        {
1485            continue;
1486        }
1487        let path = entry.into_path();
1488        if filters.matches(filter_root, &path) {
1489            files.push(path);
1490        }
1491    }
1492
1493    sort_paths_by_mtime_desc(&mut files);
1494    files
1495}
1496
1497pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1498    let bytes = fs::read(path).ok()?;
1499    if is_binary_bytes(&bytes) {
1500        return None;
1501    }
1502    String::from_utf8(bytes).ok()
1503}
1504
1505fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1506    fs::read(path).ok()
1507}
1508
1509pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1510    path.strip_prefix(root)
1511        .map(PathBuf::from)
1512        .unwrap_or_else(|_| path.to_path_buf())
1513}
1514
1515pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1516    let normalized_root = normalize_path(root);
1517    let normalized_path = normalize_path(path);
1518    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1519    validate_cached_relative_path(relative)
1520}
1521
1522pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1523    let relative = validate_cached_relative_path(relative_path)?;
1524    let normalized_root = normalize_path(root);
1525    let full_path = normalize_path(&normalized_root.join(relative));
1526
1527    match fs::canonicalize(&full_path) {
1528        Ok(canonical_path) => {
1529            if canonical_path.starts_with(&normalized_root) {
1530                return Some(full_path);
1531            }
1532
1533            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1534            canonical_path
1535                .starts_with(&canonical_root)
1536                .then_some(full_path)
1537        }
1538        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1539    }
1540}
1541
1542pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1543    if path.is_absolute() {
1544        return None;
1545    }
1546
1547    let mut normalized = PathBuf::new();
1548    for component in path.components() {
1549        match component {
1550            Component::Normal(part) => normalized.push(part),
1551            Component::CurDir => {}
1552            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1553        }
1554    }
1555    (!normalized.as_os_str().is_empty()).then_some(normalized)
1556}
1557
1558/// Sort paths newest-first by mtime, falling back to lexicographic order.
1559///
1560/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1561/// `sort_by()` closure. That made the comparator non-deterministic — a
1562/// `stat()` syscall for the same path can return different values across
1563/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1564/// concurrent file-watcher activity), and Rust's slice::sort panics at
1565/// runtime when it detects a non-total-order comparator. CI hit this on
1566/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1567///
1568/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1569/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1570pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1571    use std::collections::HashMap;
1572    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1573    for path in paths.iter() {
1574        mtimes
1575            .entry(path.clone())
1576            .or_insert_with(|| path_modified_time(path));
1577    }
1578    paths.sort_by(|left, right| {
1579        let left_mtime = mtimes.get(left).and_then(|v| *v);
1580        let right_mtime = mtimes.get(right).and_then(|v| *v);
1581        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1582    });
1583}
1584
1585/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1586/// the sort. Same fix, applied to grep matches that share files.
1587pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1588    use std::collections::HashMap;
1589    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1590    for m in matches.iter() {
1591        mtimes.entry(m.file.clone()).or_insert_with(|| {
1592            let resolved = resolve_match_path(project_root, &m.file);
1593            path_modified_time(&resolved)
1594        });
1595    }
1596    matches.sort_by(|left, right| {
1597        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1598        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1599        right_mtime
1600            .cmp(&left_mtime)
1601            .then_with(|| left.file.cmp(&right.file))
1602            .then_with(|| left.line.cmp(&right.line))
1603            .then_with(|| left.column.cmp(&right.column))
1604    });
1605}
1606
1607/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1608/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1609/// table from the search index), but it can still return different values if
1610/// the file is modified mid-sort. Snapshot once.
1611fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1612    matches: &mut [SharedGrepMatch],
1613    modified_for_path: F,
1614) where
1615    F: Fn(&Path) -> Option<SystemTime>,
1616{
1617    use std::collections::HashMap;
1618    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1619    for m in matches.iter() {
1620        let path = m.file.as_path().to_path_buf();
1621        mtimes
1622            .entry(path.clone())
1623            .or_insert_with(|| modified_for_path(&path));
1624    }
1625    matches.sort_by(|left, right| {
1626        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1627        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1628        right_mtime
1629            .cmp(&left_mtime)
1630            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1631            .then_with(|| left.line.cmp(&right.line))
1632            .then_with(|| left.column.cmp(&right.column))
1633    });
1634}
1635
1636pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1637    let resolved_project_root = canonicalize_or_normalize(project_root);
1638    let root = match path {
1639        Some(path) => {
1640            let path = PathBuf::from(path);
1641            if path.is_absolute() {
1642                canonicalize_or_normalize(&path)
1643            } else {
1644                normalize_path(&resolved_project_root.join(path))
1645            }
1646        }
1647        None => resolved_project_root.clone(),
1648    };
1649
1650    let use_index = is_within_search_root(&resolved_project_root, &root);
1651    SearchScope { root, use_index }
1652}
1653
1654pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1655    content_inspector::inspect(content).is_binary()
1656}
1657
1658pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1659    run_git(root, &["rev-parse", "HEAD"])
1660}
1661
1662pub fn project_cache_key(project_root: &Path) -> String {
1663    use sha2::{Digest, Sha256};
1664
1665    let mut hasher = Sha256::new();
1666
1667    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1668        // Git repo: root commit is the unique identity.
1669        // Same repo cloned anywhere produces the same key.
1670        hasher.update(root_commit.as_bytes());
1671    } else {
1672        // Non-git project: use the canonical filesystem path as identity.
1673        let canonical_root = canonicalize_or_normalize(project_root);
1674        hasher.update(canonical_root.to_string_lossy().as_bytes());
1675    }
1676
1677    let digest = format!("{:x}", hasher.finalize());
1678    digest[..16].to_string()
1679}
1680
1681impl PathFilters {
1682    fn matches(&self, root: &Path, path: &Path) -> bool {
1683        let relative = to_glob_path(&relative_to_root(root, path));
1684        if self
1685            .includes
1686            .as_ref()
1687            .is_some_and(|includes| !includes.is_match(&relative))
1688        {
1689            return false;
1690        }
1691        if self
1692            .excludes
1693            .as_ref()
1694            .is_some_and(|excludes| excludes.is_match(&relative))
1695        {
1696            return false;
1697        }
1698        true
1699    }
1700}
1701
1702fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1703    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1704}
1705
1706fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1707    if path.is_absolute() {
1708        path.to_path_buf()
1709    } else {
1710        project_root.join(path)
1711    }
1712}
1713
1714fn path_modified_time(path: &Path) -> Option<SystemTime> {
1715    fs::metadata(path)
1716        .and_then(|metadata| metadata.modified())
1717        .ok()
1718}
1719
1720fn normalize_path(path: &Path) -> PathBuf {
1721    let mut result = PathBuf::new();
1722    for component in path.components() {
1723        match component {
1724            Component::ParentDir => {
1725                if !result.pop() {
1726                    result.push(component);
1727                }
1728            }
1729            Component::CurDir => {}
1730            _ => result.push(component),
1731        }
1732    }
1733    result
1734}
1735
1736/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1737/// since the index was last written. Also detect new files and deleted files.
1738fn verify_file_mtimes(index: &mut SearchIndex) {
1739    // Collect stale files (mtime mismatch or deleted)
1740    let mut stale_paths = Vec::new();
1741    for entry in &index.files {
1742        if entry.path.as_os_str().is_empty() {
1743            continue; // tombstoned entry
1744        }
1745        let cached = FileFreshness {
1746            mtime: entry.modified,
1747            size: entry.size,
1748            content_hash: entry.content_hash,
1749        };
1750        match cache_freshness::verify_file_strict(&entry.path, &cached) {
1751            FreshnessVerdict::HotFresh | FreshnessVerdict::ContentFresh { .. } => {}
1752            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
1753                stale_paths.push(entry.path.clone())
1754            }
1755        }
1756    }
1757
1758    for entry in &mut index.files {
1759        if entry.path.as_os_str().is_empty() {
1760            continue;
1761        }
1762        let cached = FileFreshness {
1763            mtime: entry.modified,
1764            size: entry.size,
1765            content_hash: entry.content_hash,
1766        };
1767        if let FreshnessVerdict::ContentFresh {
1768            new_mtime,
1769            new_size,
1770        } = cache_freshness::verify_file_strict(&entry.path, &cached)
1771        {
1772            entry.modified = new_mtime;
1773            entry.size = new_size;
1774        }
1775    }
1776
1777    // Re-index stale files
1778    for path in &stale_paths {
1779        index.update_file(path);
1780    }
1781
1782    // Detect new files not in the index
1783    let filters = PathFilters::default();
1784    for path in walk_project_files(&index.project_root, &filters) {
1785        if !index.path_to_id.contains_key(&path) {
1786            index.update_file(&path);
1787        }
1788    }
1789
1790    if !stale_paths.is_empty() {
1791        crate::slog_info!(
1792            "search index: refreshed {} stale file(s) from disk cache",
1793            stale_paths.len()
1794        );
1795    }
1796}
1797
1798fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1799    normalize_path(path).starts_with(normalize_path(search_root))
1800}
1801
1802impl QueryBuild {
1803    fn into_query(self) -> RegexQuery {
1804        let mut query = RegexQuery::default();
1805
1806        for run in self.and_runs {
1807            add_run_to_and_query(&mut query, &run);
1808        }
1809
1810        for group in self.or_groups {
1811            let mut trigrams = BTreeSet::new();
1812            let mut filters = HashMap::new();
1813            for run in group {
1814                for (trigram, filter) in trigram_filters(&run) {
1815                    trigrams.insert(trigram);
1816                    merge_filter(filters.entry(trigram).or_default(), filter);
1817                }
1818            }
1819            if !trigrams.is_empty() {
1820                query.or_groups.push(trigrams.into_iter().collect());
1821                query.or_filters.push(filters);
1822            }
1823        }
1824
1825        query
1826    }
1827}
1828
1829fn build_query(hir: &Hir) -> QueryBuild {
1830    match hir.kind() {
1831        HirKind::Literal(literal) => {
1832            if literal.0.len() >= 3 {
1833                QueryBuild {
1834                    and_runs: vec![literal.0.to_vec()],
1835                    or_groups: Vec::new(),
1836                }
1837            } else {
1838                QueryBuild::default()
1839            }
1840        }
1841        HirKind::Capture(capture) => build_query(&capture.sub),
1842        HirKind::Concat(parts) => {
1843            let mut build = QueryBuild::default();
1844            for part in parts {
1845                let part_build = build_query(part);
1846                build.and_runs.extend(part_build.and_runs);
1847                build.or_groups.extend(part_build.or_groups);
1848            }
1849            build
1850        }
1851        HirKind::Alternation(parts) => {
1852            let mut group = Vec::new();
1853            for part in parts {
1854                let Some(mut choices) = guaranteed_run_choices(part) else {
1855                    return QueryBuild::default();
1856                };
1857                group.append(&mut choices);
1858            }
1859            if group.is_empty() {
1860                QueryBuild::default()
1861            } else {
1862                QueryBuild {
1863                    and_runs: Vec::new(),
1864                    or_groups: vec![group],
1865                }
1866            }
1867        }
1868        HirKind::Repetition(repetition) => {
1869            if repetition.min == 0 {
1870                QueryBuild::default()
1871            } else {
1872                build_query(&repetition.sub)
1873            }
1874        }
1875        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1876    }
1877}
1878
1879fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1880    match hir.kind() {
1881        HirKind::Literal(literal) => {
1882            if literal.0.len() >= 3 {
1883                Some(vec![literal.0.to_vec()])
1884            } else {
1885                None
1886            }
1887        }
1888        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1889        HirKind::Concat(parts) => {
1890            let mut runs = Vec::new();
1891            for part in parts {
1892                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1893                    runs.append(&mut part_runs);
1894                }
1895            }
1896            if runs.is_empty() {
1897                None
1898            } else {
1899                Some(runs)
1900            }
1901        }
1902        HirKind::Alternation(parts) => {
1903            let mut runs = Vec::new();
1904            for part in parts {
1905                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1906                    return None;
1907                };
1908                runs.append(&mut part_runs);
1909            }
1910            if runs.is_empty() {
1911                None
1912            } else {
1913                Some(runs)
1914            }
1915        }
1916        HirKind::Repetition(repetition) => {
1917            if repetition.min == 0 {
1918                None
1919            } else {
1920                guaranteed_run_choices(&repetition.sub)
1921            }
1922        }
1923        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1924    }
1925}
1926
1927fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1928    for (trigram, filter) in trigram_filters(run) {
1929        if !query.and_trigrams.contains(&trigram) {
1930            query.and_trigrams.push(trigram);
1931        }
1932        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1933    }
1934}
1935
1936fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1937    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1938    for (trigram, next_char, position) in extract_trigrams(run) {
1939        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1940        if next_char != EOF_SENTINEL {
1941            entry.next_mask |= mask_for_next_char(next_char);
1942        }
1943        entry.loc_mask |= mask_for_position(position);
1944    }
1945    filters.into_iter().collect()
1946}
1947
1948fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1949    target.next_mask |= filter.next_mask;
1950    target.loc_mask |= filter.loc_mask;
1951}
1952
1953fn mask_for_next_char(next_char: u8) -> u8 {
1954    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1955    1u8 << bit
1956}
1957
1958fn mask_for_position(position: usize) -> u8 {
1959    1u8 << (position % 8)
1960}
1961
1962fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1963    if patterns.is_empty() {
1964        return Ok(None);
1965    }
1966
1967    let mut builder = GlobSetBuilder::new();
1968    for pattern in patterns {
1969        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1970        builder.add(glob);
1971    }
1972    builder.build().map(Some).map_err(|error| error.to_string())
1973}
1974
1975fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1976    let mut buffer = [0u8; 4];
1977    reader.read_exact(&mut buffer)?;
1978    Ok(u32::from_le_bytes(buffer))
1979}
1980
1981fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1982    let mut buffer = [0u8; 8];
1983    reader.read_exact(&mut buffer)?;
1984    Ok(u64::from_le_bytes(buffer))
1985}
1986
1987fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1988    writer.write_all(&value.to_le_bytes())
1989}
1990
1991fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1992    writer.write_all(&value.to_le_bytes())
1993}
1994
1995fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1996    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1997        return Err(std::io::Error::other("search index checksum missing"));
1998    };
1999    let expected = u32::from_le_bytes(*stored);
2000    let actual = crc32fast::hash(body);
2001    if actual != expected {
2002        return Err(std::io::Error::other("search index checksum mismatch"));
2003    }
2004    Ok(())
2005}
2006
2007fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2008    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2009    total_len.checked_sub(pos)
2010}
2011
2012fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2013    let output = Command::new("git")
2014        .arg("-C")
2015        .arg(root)
2016        .args(args)
2017        .output()
2018        .ok()?;
2019    if !output.status.success() {
2020        return None;
2021    }
2022    let value = String::from_utf8(output.stdout).ok()?;
2023    let value = value.trim().to_string();
2024    if value.is_empty() {
2025        None
2026    } else {
2027        Some(value)
2028    }
2029}
2030
2031fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2032    let diff_range = format!("{}..{}", from, to);
2033    let output = match Command::new("git")
2034        .arg("-C")
2035        .arg(root)
2036        .args(["diff", "--name-status", "-M", &diff_range])
2037        .output()
2038    {
2039        Ok(output) => output,
2040        Err(_) => return false,
2041    };
2042
2043    if !output.status.success() {
2044        return false;
2045    }
2046
2047    let Ok(diff) = String::from_utf8(output.stdout) else {
2048        return false;
2049    };
2050
2051    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2052        let mut fields = line.split('\t');
2053        let Some(status) = fields.next() else {
2054            continue;
2055        };
2056
2057        if status.starts_with('R') {
2058            let Some(old_path) = fields
2059                .next()
2060                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2061            else {
2062                continue;
2063            };
2064            let Some(new_path) = fields
2065                .next()
2066                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2067            else {
2068                continue;
2069            };
2070            index.remove_file(&old_path);
2071            index.update_file(&new_path);
2072            continue;
2073        }
2074
2075        let Some(path) = fields
2076            .next()
2077            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2078        else {
2079            continue;
2080        };
2081        if status.starts_with('D') || !path.exists() {
2082            index.remove_file(&path);
2083        } else {
2084            index.update_file(&path);
2085        }
2086    }
2087
2088    true
2089}
2090
2091fn is_binary_path(path: &Path, size: u64) -> bool {
2092    if size == 0 {
2093        return false;
2094    }
2095
2096    let mut file = match File::open(path) {
2097        Ok(file) => file,
2098        Err(_) => return true,
2099    };
2100
2101    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2102    match file.read(&mut preview) {
2103        Ok(read) => is_binary_bytes(&preview[..read]),
2104        Err(_) => true,
2105    }
2106}
2107
2108fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2109    let mut starts = vec![0usize];
2110    for (index, byte) in content.iter().copied().enumerate() {
2111        if byte == b'\n' {
2112            starts.push(index + 1);
2113        }
2114    }
2115    starts
2116}
2117
2118fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2119    let line_index = match line_starts.binary_search(&offset) {
2120        Ok(index) => index,
2121        Err(index) => index.saturating_sub(1),
2122    };
2123    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2124    let line_end = content[line_start..]
2125        .iter()
2126        .position(|byte| *byte == b'\n')
2127        .map(|length| line_start + length)
2128        .unwrap_or(content.len());
2129    let mut line_slice = &content[line_start..line_end];
2130    if line_slice.ends_with(b"\r") {
2131        line_slice = &line_slice[..line_slice.len() - 1];
2132    }
2133    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2134    let column = String::from_utf8_lossy(&content[line_start..offset])
2135        .chars()
2136        .count() as u32
2137        + 1;
2138    (line_index as u32 + 1, column, line_text)
2139}
2140
2141fn to_glob_path(path: &Path) -> String {
2142    path.to_string_lossy().replace('\\', "/")
2143}
2144
2145#[cfg(test)]
2146mod tests {
2147    use std::process::Command;
2148
2149    use super::*;
2150
2151    #[test]
2152    fn cached_path_under_root_allows_missing_lexical_child() {
2153        let dir = tempfile::tempdir().expect("create temp dir");
2154        let project = dir.path().join("project");
2155        fs::create_dir_all(&project).expect("create project dir");
2156        let root = fs::canonicalize(&project).expect("canonicalize project");
2157
2158        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2159            .expect("missing child should fall back to lexical validation");
2160
2161        assert_eq!(path, root.join("future/file.rs"));
2162    }
2163
2164    #[cfg(unix)]
2165    #[test]
2166    fn cached_path_under_root_rejects_symlink_escape() {
2167        let dir = tempfile::tempdir().expect("create temp dir");
2168        let project = dir.path().join("project");
2169        let outside = dir.path().join("outside");
2170        fs::create_dir_all(&project).expect("create project dir");
2171        fs::create_dir_all(&outside).expect("create outside dir");
2172        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2173        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2174        let root = fs::canonicalize(&project).expect("canonicalize project");
2175
2176        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2177    }
2178
2179    #[test]
2180    fn extract_trigrams_tracks_next_char_and_position() {
2181        let trigrams = extract_trigrams(b"Rust");
2182        assert_eq!(trigrams.len(), 2);
2183        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2184        assert_eq!(
2185            trigrams[1],
2186            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2187        );
2188    }
2189
2190    #[test]
2191    fn decompose_regex_extracts_literals_and_alternations() {
2192        let query = decompose_regex("abc(def|ghi)xyz");
2193        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2194        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2195        assert_eq!(query.or_groups.len(), 1);
2196        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2197        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2198    }
2199
2200    #[test]
2201    fn candidates_intersect_posting_lists() {
2202        let mut index = SearchIndex::new();
2203        let dir = tempfile::tempdir().expect("create temp dir");
2204        let alpha = dir.path().join("alpha.txt");
2205        let beta = dir.path().join("beta.txt");
2206        fs::write(&alpha, "abcdef").expect("write alpha");
2207        fs::write(&beta, "abcxyz").expect("write beta");
2208        index.project_root = dir.path().to_path_buf();
2209        index.index_file(&alpha, b"abcdef");
2210        index.index_file(&beta, b"abcxyz");
2211
2212        let query = RegexQuery {
2213            and_trigrams: vec![
2214                pack_trigram(b'a', b'b', b'c'),
2215                pack_trigram(b'd', b'e', b'f'),
2216            ],
2217            ..RegexQuery::default()
2218        };
2219
2220        let candidates = index.candidates(&query);
2221        assert_eq!(candidates.len(), 1);
2222        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2223    }
2224
2225    #[test]
2226    fn candidates_apply_bloom_filters() {
2227        let mut index = SearchIndex::new();
2228        let dir = tempfile::tempdir().expect("create temp dir");
2229        let file = dir.path().join("sample.txt");
2230        fs::write(&file, "abcd efgh").expect("write sample");
2231        index.project_root = dir.path().to_path_buf();
2232        index.index_file(&file, b"abcd efgh");
2233
2234        let trigram = pack_trigram(b'a', b'b', b'c');
2235        let matching_filter = PostingFilter {
2236            next_mask: mask_for_next_char(b'd'),
2237            loc_mask: mask_for_position(0),
2238        };
2239        let non_matching_filter = PostingFilter {
2240            next_mask: mask_for_next_char(b'z'),
2241            loc_mask: mask_for_position(0),
2242        };
2243
2244        assert_eq!(
2245            index
2246                .postings_for_trigram(trigram, Some(matching_filter))
2247                .len(),
2248            1
2249        );
2250        assert!(index
2251            .postings_for_trigram(trigram, Some(non_matching_filter))
2252            .is_empty());
2253    }
2254
2255    #[test]
2256    fn disk_round_trip_preserves_postings_and_files() {
2257        let dir = tempfile::tempdir().expect("create temp dir");
2258        let project = dir.path().join("project");
2259        fs::create_dir_all(&project).expect("create project dir");
2260        let file = project.join("src.txt");
2261        fs::write(&file, "abcdef").expect("write source");
2262
2263        let mut index = SearchIndex::build(&project);
2264        index.git_head = Some("deadbeef".to_string());
2265        let cache_dir = dir.path().join("cache");
2266        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2267
2268        let loaded =
2269            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2270        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2271        assert_eq!(loaded.files.len(), 1);
2272        assert_eq!(
2273            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2274            PathBuf::from("src.txt")
2275        );
2276        assert_eq!(loaded.postings.len(), index.postings.len());
2277        assert!(loaded
2278            .postings
2279            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2280    }
2281
2282    #[test]
2283    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2284        let root = PathBuf::from("/tmp/aft-project");
2285
2286        assert_eq!(
2287            cache_relative_path(&root, &root.join("src/lib.rs")),
2288            Some(PathBuf::from("src/lib.rs"))
2289        );
2290        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2291        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2292        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2293        assert_eq!(
2294            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2295            Some(root.join("src/lib.rs"))
2296        );
2297    }
2298
2299    #[test]
2300    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2301        let dir = tempfile::tempdir().expect("create temp dir");
2302        let project = dir.path().join("project");
2303        fs::create_dir_all(&project).expect("create project dir");
2304        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2305        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2306        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2307
2308        Command::new("git")
2309            .arg("init")
2310            .arg(&project)
2311            .status()
2312            .expect("git init");
2313        for args in [
2314            ["config", "user.email", "aft@example.invalid"],
2315            ["config", "user.name", "AFT Test"],
2316        ] {
2317            Command::new("git")
2318                .arg("-C")
2319                .arg(&project)
2320                .args(args)
2321                .status()
2322                .expect("git config");
2323        }
2324        Command::new("git")
2325            .arg("-C")
2326            .arg(&project)
2327            .args(["add", "."])
2328            .status()
2329            .expect("git add initial");
2330        Command::new("git")
2331            .arg("-C")
2332            .arg(&project)
2333            .args(["commit", "-m", "initial"])
2334            .status()
2335            .expect("git commit initial");
2336        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2337        let mut baseline = SearchIndex::build(&project);
2338        baseline.git_head = Some(previous.clone());
2339
2340        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2341        Command::new("git")
2342            .arg("-C")
2343            .arg(&project)
2344            .args(["add", "-A"])
2345            .status()
2346            .expect("git add rename");
2347        Command::new("git")
2348            .arg("-C")
2349            .arg(&project)
2350            .args(["commit", "-m", "rename"])
2351            .status()
2352            .expect("git commit rename");
2353        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2354
2355        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2356        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2357
2358        let refreshed = SearchIndex::rebuild_or_refresh(
2359            &project,
2360            DEFAULT_MAX_FILE_SIZE,
2361            Some(current),
2362            Some(baseline),
2363        );
2364
2365        assert!(!refreshed
2366            .path_to_id
2367            .contains_key(&canonical_project.join("old.txt")));
2368        assert!(refreshed
2369            .path_to_id
2370            .contains_key(&canonical_project.join("new.txt")));
2371        assert!(refreshed
2372            .path_to_id
2373            .contains_key(&canonical_project.join("untracked.txt")));
2374        let matches =
2375            refreshed.search_grep("after local edit", true, &[], &[], &canonical_project, 10);
2376        assert_eq!(matches.matches.len(), 1);
2377    }
2378
2379    #[test]
2380    fn read_from_disk_rejects_corrupt_postings_checksum() {
2381        let dir = tempfile::tempdir().expect("create temp dir");
2382        let project = dir.path().join("project");
2383        fs::create_dir_all(&project).expect("create project dir");
2384        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2385
2386        let index = SearchIndex::build(&project);
2387        let cache_dir = dir.path().join("cache");
2388        index.write_to_disk(&cache_dir, None);
2389
2390        let cache_path = cache_dir.join("cache.bin");
2391        let mut bytes = fs::read(&cache_path).expect("read cache");
2392        let middle = bytes.len() / 2;
2393        bytes[middle] ^= 0xff;
2394        fs::write(&cache_path, bytes).expect("write corrupted cache");
2395
2396        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2397    }
2398
2399    #[test]
2400    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2401        let dir = tempfile::tempdir().expect("create temp dir");
2402        let project = dir.path().join("project");
2403        fs::create_dir_all(&project).expect("create project dir");
2404        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2405
2406        let index = SearchIndex::build(&project);
2407        let cache_dir = dir.path().join("cache");
2408        index.write_to_disk(&cache_dir, None);
2409
2410        assert!(cache_dir.join("cache.bin").is_file());
2411        assert!(fs::read_dir(&cache_dir)
2412            .expect("read cache dir")
2413            .all(|entry| !entry
2414                .expect("cache entry")
2415                .file_name()
2416                .to_string_lossy()
2417                .contains(".tmp.")));
2418    }
2419
2420    #[test]
2421    fn concurrent_search_index_writes_do_not_corrupt() {
2422        let dir = tempfile::tempdir().expect("create temp dir");
2423        let project = dir.path().join("project");
2424        fs::create_dir_all(&project).expect("create project dir");
2425        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2426        let cache_dir = dir.path().join("cache");
2427
2428        let a_project = project.clone();
2429        let a_cache = cache_dir.clone();
2430        let a = std::thread::spawn(move || {
2431            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2432            let index = SearchIndex::build(&a_project);
2433            index.write_to_disk(&a_cache, None);
2434        });
2435        let b_project = project.clone();
2436        let b_cache = cache_dir.clone();
2437        let b = std::thread::spawn(move || {
2438            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2439            let index = SearchIndex::build(&b_project);
2440            index.write_to_disk(&b_cache, None);
2441        });
2442        a.join().expect("writer a");
2443        b.join().expect("writer b");
2444
2445        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2446    }
2447
2448    #[test]
2449    fn search_index_atomic_rename_survives_partial_write() {
2450        let dir = tempfile::tempdir().expect("create temp dir");
2451        let cache_dir = dir.path().join("cache");
2452        fs::create_dir_all(&cache_dir).expect("create cache dir");
2453        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2454
2455        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2456    }
2457
2458    #[test]
2459    fn project_cache_key_includes_checkout_path() {
2460        let dir = tempfile::tempdir().expect("create temp dir");
2461        let source = dir.path().join("source");
2462        fs::create_dir_all(&source).expect("create source repo dir");
2463        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2464
2465        assert!(Command::new("git")
2466            .current_dir(&source)
2467            .args(["init"])
2468            .status()
2469            .expect("init git repo")
2470            .success());
2471        assert!(Command::new("git")
2472            .current_dir(&source)
2473            .args(["add", "."])
2474            .status()
2475            .expect("git add")
2476            .success());
2477        assert!(Command::new("git")
2478            .current_dir(&source)
2479            .args([
2480                "-c",
2481                "user.name=AFT Tests",
2482                "-c",
2483                "user.email=aft-tests@example.com",
2484                "commit",
2485                "-m",
2486                "initial",
2487            ])
2488            .status()
2489            .expect("git commit")
2490            .success());
2491
2492        let clone = dir.path().join("clone");
2493        assert!(Command::new("git")
2494            .args(["clone", "--quiet"])
2495            .arg(&source)
2496            .arg(&clone)
2497            .status()
2498            .expect("git clone")
2499            .success());
2500
2501        let source_key = project_cache_key(&source);
2502        let clone_key = project_cache_key(&clone);
2503
2504        assert_eq!(source_key.len(), 16);
2505        assert_eq!(clone_key.len(), 16);
2506        // Same repo (same root commit) → same cache key regardless of clone path
2507        assert_eq!(source_key, clone_key);
2508    }
2509
2510    #[test]
2511    fn git_head_unchanged_picks_up_local_edits() {
2512        let dir = tempfile::tempdir().expect("create temp dir");
2513        let project = dir.path().join("repo");
2514        fs::create_dir_all(&project).expect("create repo dir");
2515        let file = project.join("tracked.txt");
2516        fs::write(&file, "oldtoken\n").expect("write file");
2517        assert!(Command::new("git")
2518            .current_dir(&project)
2519            .arg("init")
2520            .status()
2521            .unwrap()
2522            .success());
2523        assert!(Command::new("git")
2524            .current_dir(&project)
2525            .args(["add", "."])
2526            .status()
2527            .unwrap()
2528            .success());
2529        assert!(Command::new("git")
2530            .current_dir(&project)
2531            .args([
2532                "-c",
2533                "user.name=AFT Tests",
2534                "-c",
2535                "user.email=aft-tests@example.com",
2536                "commit",
2537                "-m",
2538                "initial"
2539            ])
2540            .status()
2541            .unwrap()
2542            .success());
2543        let head = current_git_head(&project);
2544        let mut baseline = SearchIndex::build(&project);
2545        baseline.git_head = head.clone();
2546        fs::write(&file, "newtoken\n").expect("edit tracked file");
2547
2548        let refreshed =
2549            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2550        let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2551
2552        assert_eq!(result.total_matches, 1);
2553    }
2554
2555    #[test]
2556    fn non_git_project_reuses_cache_when_files_unchanged() {
2557        let dir = tempfile::tempdir().expect("create temp dir");
2558        let project = dir.path().join("project");
2559        fs::create_dir_all(&project).expect("create project dir");
2560        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2561        let baseline = SearchIndex::build(&project);
2562        let baseline_file_count = baseline.file_count();
2563
2564        let refreshed =
2565            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2566
2567        assert_eq!(refreshed.file_count(), baseline_file_count);
2568        assert_eq!(
2569            refreshed
2570                .search_grep("unchangedtoken", true, &[], &[], &project, 10)
2571                .total_matches,
2572            1
2573        );
2574    }
2575
2576    #[test]
2577    fn resolve_search_scope_disables_index_for_external_path() {
2578        let dir = tempfile::tempdir().expect("create temp dir");
2579        let project = dir.path().join("project");
2580        let outside = dir.path().join("outside");
2581        fs::create_dir_all(&project).expect("create project dir");
2582        fs::create_dir_all(&outside).expect("create outside dir");
2583
2584        let scope = resolve_search_scope(&project, outside.to_str());
2585
2586        assert_eq!(
2587            scope.root,
2588            fs::canonicalize(&outside).expect("canonicalize outside")
2589        );
2590        assert!(!scope.use_index);
2591    }
2592
2593    #[test]
2594    fn grep_filters_matches_to_search_root() {
2595        let dir = tempfile::tempdir().expect("create temp dir");
2596        let project = dir.path().join("project");
2597        let src = project.join("src");
2598        let docs = project.join("docs");
2599        fs::create_dir_all(&src).expect("create src dir");
2600        fs::create_dir_all(&docs).expect("create docs dir");
2601        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2602        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2603
2604        let index = SearchIndex::build(&project);
2605        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2606
2607        assert_eq!(result.files_searched, 1);
2608        assert_eq!(result.files_with_matches, 1);
2609        assert_eq!(result.matches.len(), 1);
2610        // Index stores canonicalized paths; on macOS /var → /private/var
2611        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2612        assert_eq!(result.matches[0].file, expected);
2613    }
2614
2615    #[test]
2616    fn grep_deduplicates_multiple_matches_on_same_line() {
2617        let dir = tempfile::tempdir().expect("create temp dir");
2618        let project = dir.path().join("project");
2619        let src = project.join("src");
2620        fs::create_dir_all(&src).expect("create src dir");
2621        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2622
2623        let index = SearchIndex::build(&project);
2624        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2625
2626        assert_eq!(result.total_matches, 1);
2627        assert_eq!(result.matches.len(), 1);
2628    }
2629
2630    #[test]
2631    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
2632        let dir = tempfile::tempdir().expect("create temp dir");
2633        let project = dir.path().join("project");
2634        fs::create_dir_all(&project).expect("create project dir");
2635        let file = project.join("unicode.txt");
2636        fs::write(&file, "äbc\n").expect("write unicode file");
2637
2638        let index = SearchIndex::build(&project);
2639        let result = index.search_grep("Äbc", false, &[], &[], &project, 10);
2640
2641        assert_eq!(result.total_matches, 1);
2642        assert_eq!(result.matches.len(), 1);
2643        assert_eq!(
2644            result.matches[0].file,
2645            fs::canonicalize(file).expect("canonicalize unicode file")
2646        );
2647    }
2648
2649    #[test]
2650    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
2651        let dir = tempfile::tempdir().expect("create temp dir");
2652        let project = dir.path().join("project");
2653        fs::create_dir_all(&project).expect("create project dir");
2654        let file = project.join("tokens.txt");
2655        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
2656        fs::write(&file, "alpha").expect("write original file");
2657        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
2658
2659        let baseline = SearchIndex::build(&project);
2660        fs::write(&file, "bravo").expect("write same-size edit");
2661        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
2662
2663        let refreshed =
2664            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2665        let result = refreshed.search_grep("bravo", true, &[], &[], &project, 10);
2666        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
2667        let refreshed_id = *refreshed
2668            .path_to_id
2669            .get(&canonical_file)
2670            .expect("file remains indexed");
2671
2672        assert_eq!(result.total_matches, 1);
2673        assert!(refreshed
2674            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
2675            .contains(&refreshed_id));
2676        assert!(!refreshed
2677            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
2678            .contains(&refreshed_id));
2679    }
2680
2681    #[test]
2682    fn grep_reports_total_matches_before_truncation() {
2683        let dir = tempfile::tempdir().expect("create temp dir");
2684        let project = dir.path().join("project");
2685        let src = project.join("src");
2686        fs::create_dir_all(&src).expect("create src dir");
2687        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2688
2689        let index = SearchIndex::build(&project);
2690        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2691
2692        assert_eq!(result.total_matches, 2);
2693        assert_eq!(result.matches.len(), 1);
2694        assert!(result.truncated);
2695    }
2696
2697    #[test]
2698    fn glob_filters_results_to_search_root() {
2699        let dir = tempfile::tempdir().expect("create temp dir");
2700        let project = dir.path().join("project");
2701        let src = project.join("src");
2702        let scripts = project.join("scripts");
2703        fs::create_dir_all(&src).expect("create src dir");
2704        fs::create_dir_all(&scripts).expect("create scripts dir");
2705        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2706        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2707
2708        let index = SearchIndex::build(&project);
2709        let files = index.glob("**/*.rs", &src);
2710
2711        assert_eq!(
2712            files,
2713            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2714        );
2715    }
2716
2717    #[test]
2718    fn glob_includes_hidden_and_binary_files() {
2719        let dir = tempfile::tempdir().expect("create temp dir");
2720        let project = dir.path().join("project");
2721        let hidden_dir = project.join(".hidden");
2722        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2723        let hidden_file = hidden_dir.join("data.bin");
2724        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2725
2726        let index = SearchIndex::build(&project);
2727        let files = index.glob("**/*.bin", &project);
2728
2729        assert_eq!(
2730            files,
2731            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2732        );
2733    }
2734
2735    #[test]
2736    fn read_from_disk_rejects_invalid_nanos() {
2737        let dir = tempfile::tempdir().expect("create temp dir");
2738        let cache_dir = dir.path().join("cache");
2739        fs::create_dir_all(&cache_dir).expect("create cache dir");
2740
2741        let mut postings = Vec::new();
2742        postings.extend_from_slice(INDEX_MAGIC);
2743        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2744        postings.extend_from_slice(&0u32.to_le_bytes());
2745        postings.extend_from_slice(&1u32.to_le_bytes());
2746        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2747        postings.extend_from_slice(&1u32.to_le_bytes());
2748        postings.extend_from_slice(b"/");
2749        postings.push(0u8);
2750        postings.extend_from_slice(&1u32.to_le_bytes());
2751        postings.extend_from_slice(&0u64.to_le_bytes());
2752        postings.extend_from_slice(&0u64.to_le_bytes());
2753        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2754        postings.extend_from_slice(b"a");
2755        postings.extend_from_slice(&0u64.to_le_bytes());
2756
2757        let mut lookup = Vec::new();
2758        lookup.extend_from_slice(LOOKUP_MAGIC);
2759        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2760        lookup.extend_from_slice(&0u32.to_le_bytes());
2761
2762        let postings_checksum = crc32fast::hash(&postings);
2763        postings.extend_from_slice(&postings_checksum.to_le_bytes());
2764        let lookup_checksum = crc32fast::hash(&lookup);
2765        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2766        let mut cache = Vec::new();
2767        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2768        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2769        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2770        cache.extend_from_slice(&postings);
2771        cache.extend_from_slice(&lookup);
2772        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2773
2774        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2775    }
2776
2777    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2778    /// changed between cmp() calls.
2779    ///
2780    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2781    /// which does a `stat()` syscall. If the file was deleted, modified, or
2782    /// touched mid-sort, the comparator returned different values for the
2783    /// same input pair on different invocations. Rust's slice::sort detects
2784    /// this and panics with "user-provided comparison function does not
2785    /// correctly implement a total order".
2786    ///
2787    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2788    /// bridge invalidated files in parallel with grep's sort path. This
2789    /// test simulates the worst case: most paths don't exist (Err from
2790    /// fs::metadata) and sort still completes successfully.
2791    #[test]
2792    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2793        // Mix of existing and non-existing paths in deliberately
2794        // non-monotonic order — pre-fix, the sort would call stat() at
2795        // least N log N times and any flakiness would trigger the panic.
2796        let dir = tempfile::tempdir().expect("create tempdir");
2797        let mut paths: Vec<PathBuf> = Vec::new();
2798        for i in 0..30 {
2799            // Half exist, half don't.
2800            let path = if i % 2 == 0 {
2801                let p = dir.path().join(format!("real-{i}.rs"));
2802                fs::write(&p, format!("// {i}\n")).expect("write");
2803                p
2804            } else {
2805                dir.path().join(format!("missing-{i}.rs"))
2806            };
2807            paths.push(path);
2808        }
2809
2810        // Run the sort many times to maximise the chance of catching any
2811        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2812        for _ in 0..50 {
2813            let mut copy = paths.clone();
2814            sort_paths_by_mtime_desc(&mut copy);
2815            assert_eq!(copy.len(), paths.len());
2816        }
2817    }
2818
2819    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2820    /// the same conditions as sort_paths_by_mtime_desc. See the
2821    /// sort_paths_... test above for the full rationale.
2822    #[test]
2823    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2824        let dir = tempfile::tempdir().expect("create tempdir");
2825        let mut matches: Vec<GrepMatch> = Vec::new();
2826        for i in 0..30 {
2827            let file = if i % 2 == 0 {
2828                let p = dir.path().join(format!("real-{i}.rs"));
2829                fs::write(&p, format!("// {i}\n")).expect("write");
2830                p
2831            } else {
2832                dir.path().join(format!("missing-{i}.rs"))
2833            };
2834            matches.push(GrepMatch {
2835                file,
2836                line: u32::try_from(i).unwrap_or(0),
2837                column: 0,
2838                line_text: format!("match {i}"),
2839                match_text: format!("match {i}"),
2840            });
2841        }
2842
2843        for _ in 0..50 {
2844            let mut copy = matches.clone();
2845            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2846            assert_eq!(copy.len(), matches.len());
2847        }
2848    }
2849}