Skip to main content

aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
19const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
20const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
21const INDEX_VERSION: u32 = 2;
22const PREVIEW_BYTES: usize = 8 * 1024;
23const EOF_SENTINEL: u8 = 0;
24const MAX_ENTRIES: usize = 10_000_000;
25const MIN_FILE_ENTRY_BYTES: usize = 25;
26const LOOKUP_ENTRY_BYTES: usize = 16;
27const POSTING_BYTES: usize = 6;
28
29#[derive(Clone, Debug)]
30pub struct SearchIndex {
31    pub postings: HashMap<u32, Vec<Posting>>,
32    pub files: Vec<FileEntry>,
33    pub path_to_id: HashMap<PathBuf, u32>,
34    pub ready: bool,
35    project_root: PathBuf,
36    git_head: Option<String>,
37    max_file_size: u64,
38    file_trigrams: HashMap<u32, Vec<u32>>,
39    unindexed_files: HashSet<u32>,
40}
41
42impl SearchIndex {
43    /// Number of indexed files.
44    pub fn file_count(&self) -> usize {
45        self.files.len()
46    }
47
48    /// Number of unique trigrams in the index.
49    pub fn trigram_count(&self) -> usize {
50        self.postings.len()
51    }
52}
53
54#[derive(Clone, Debug, PartialEq, Eq)]
55pub struct Posting {
56    pub file_id: u32,
57    pub next_mask: u8,
58    pub loc_mask: u8,
59}
60
61#[derive(Clone, Debug)]
62pub struct FileEntry {
63    pub path: PathBuf,
64    pub size: u64,
65    pub modified: SystemTime,
66}
67
68#[derive(Clone, Debug, PartialEq, Eq)]
69pub struct GrepMatch {
70    pub file: PathBuf,
71    pub line: u32,
72    pub column: u32,
73    pub line_text: String,
74    pub match_text: String,
75}
76
77#[derive(Clone, Debug)]
78pub struct GrepResult {
79    pub matches: Vec<GrepMatch>,
80    pub total_matches: usize,
81    pub files_searched: usize,
82    pub files_with_matches: usize,
83    pub index_status: IndexStatus,
84    pub truncated: bool,
85}
86
87#[derive(Clone, Copy, Debug, PartialEq, Eq)]
88pub enum IndexStatus {
89    Ready,
90    Building,
91    Fallback,
92}
93
94impl IndexStatus {
95    pub fn as_str(&self) -> &'static str {
96        match self {
97            IndexStatus::Ready => "Ready",
98            IndexStatus::Building => "Building",
99            IndexStatus::Fallback => "Fallback",
100        }
101    }
102}
103
104#[derive(Clone, Debug, Default)]
105pub struct RegexQuery {
106    pub and_trigrams: Vec<u32>,
107    pub or_groups: Vec<Vec<u32>>,
108    pub(crate) and_filters: HashMap<u32, PostingFilter>,
109    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
110}
111
112#[derive(Clone, Copy, Debug, Default)]
113pub(crate) struct PostingFilter {
114    next_mask: u8,
115    loc_mask: u8,
116}
117
118#[derive(Clone, Debug, Default)]
119struct QueryBuild {
120    and_runs: Vec<Vec<u8>>,
121    or_groups: Vec<Vec<Vec<u8>>>,
122}
123
124#[derive(Clone, Debug, Default)]
125pub(crate) struct PathFilters {
126    includes: Option<GlobSet>,
127    excludes: Option<GlobSet>,
128}
129
130#[derive(Clone, Debug)]
131pub(crate) struct SearchScope {
132    pub root: PathBuf,
133    pub use_index: bool,
134}
135
136#[derive(Clone, Debug)]
137struct SharedGrepMatch {
138    file: Arc<PathBuf>,
139    line: u32,
140    column: u32,
141    line_text: String,
142    match_text: String,
143}
144
145#[derive(Clone, Debug)]
146enum SearchMatcher {
147    Literal(LiteralSearch),
148    Regex(Regex),
149}
150
151#[derive(Clone, Debug)]
152enum LiteralSearch {
153    CaseSensitive(Vec<u8>),
154    AsciiCaseInsensitive(Vec<u8>),
155}
156
157impl SearchIndex {
158    pub fn new() -> Self {
159        SearchIndex {
160            postings: HashMap::new(),
161            files: Vec::new(),
162            path_to_id: HashMap::new(),
163            ready: false,
164            project_root: PathBuf::new(),
165            git_head: None,
166            max_file_size: DEFAULT_MAX_FILE_SIZE,
167            file_trigrams: HashMap::new(),
168            unindexed_files: HashSet::new(),
169        }
170    }
171
172    pub fn build(root: &Path) -> Self {
173        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
174    }
175
176    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
177        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
178        let mut index = SearchIndex {
179            project_root: project_root.clone(),
180            max_file_size,
181            ..SearchIndex::new()
182        };
183
184        let filters = PathFilters::default();
185        for path in walk_project_files(&project_root, &filters) {
186            index.update_file(&path);
187        }
188
189        index.git_head = current_git_head(&project_root);
190        index.ready = true;
191        index
192    }
193
194    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
195        self.remove_file(path);
196
197        let file_id = match self.allocate_file_id(path, content.len() as u64) {
198            Some(file_id) => file_id,
199            None => return,
200        };
201
202        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
203        for (trigram, next_char, position) in extract_trigrams(content) {
204            let entry = trigram_map.entry(trigram).or_default();
205            entry.next_mask |= mask_for_next_char(next_char);
206            entry.loc_mask |= mask_for_position(position);
207        }
208
209        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
210        for (trigram, filter) in trigram_map {
211            let postings = self.postings.entry(trigram).or_default();
212            postings.push(Posting {
213                file_id,
214                next_mask: filter.next_mask,
215                loc_mask: filter.loc_mask,
216            });
217            // Posting lists are kept sorted by file_id for binary search during
218            // intersection. Since file_ids are allocated incrementally, the new
219            // entry is usually already in order. Only sort when needed.
220            if postings.len() > 1
221                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
222            {
223                postings.sort_unstable_by_key(|p| p.file_id);
224            }
225            file_trigrams.push(trigram);
226        }
227
228        self.file_trigrams.insert(file_id, file_trigrams);
229        self.unindexed_files.remove(&file_id);
230    }
231
232    pub fn remove_file(&mut self, path: &Path) {
233        let Some(file_id) = self.path_to_id.remove(path) else {
234            return;
235        };
236
237        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
238            for trigram in trigrams {
239                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
240                    postings.retain(|posting| posting.file_id != file_id);
241                    postings.is_empty()
242                } else {
243                    false
244                };
245
246                if should_remove {
247                    self.postings.remove(&trigram);
248                }
249            }
250        }
251
252        self.unindexed_files.remove(&file_id);
253        if let Some(file) = self.files.get_mut(file_id as usize) {
254            file.path = PathBuf::new();
255            file.size = 0;
256            file.modified = UNIX_EPOCH;
257        }
258    }
259
260    pub fn update_file(&mut self, path: &Path) {
261        self.remove_file(path);
262
263        let metadata = match fs::metadata(path) {
264            Ok(metadata) if metadata.is_file() => metadata,
265            _ => return,
266        };
267
268        if is_binary_path(path, metadata.len()) {
269            self.track_unindexed_file(path, &metadata);
270            return;
271        }
272
273        if metadata.len() > self.max_file_size {
274            self.track_unindexed_file(path, &metadata);
275            return;
276        }
277
278        let content = match fs::read(path) {
279            Ok(content) => content,
280            Err(_) => return,
281        };
282
283        if is_binary_bytes(&content) {
284            self.track_unindexed_file(path, &metadata);
285            return;
286        }
287
288        self.index_file(path, &content);
289    }
290
291    pub fn grep(
292        &self,
293        pattern: &str,
294        case_sensitive: bool,
295        include: &[String],
296        exclude: &[String],
297        search_root: &Path,
298        max_results: usize,
299    ) -> GrepResult {
300        self.search_grep(
301            pattern,
302            case_sensitive,
303            include,
304            exclude,
305            search_root,
306            max_results,
307        )
308    }
309
310    pub fn search_grep(
311        &self,
312        pattern: &str,
313        case_sensitive: bool,
314        include: &[String],
315        exclude: &[String],
316        search_root: &Path,
317        max_results: usize,
318    ) -> GrepResult {
319        // Detect if pattern is a plain literal (no regex metacharacters).
320        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
321        let is_literal = !pattern.chars().any(|c| {
322            matches!(
323                c,
324                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
325            )
326        });
327
328        let literal_search = if is_literal {
329            if case_sensitive {
330                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
331            } else if pattern.is_ascii() {
332                Some(LiteralSearch::AsciiCaseInsensitive(
333                    pattern
334                        .as_bytes()
335                        .iter()
336                        .map(|byte| byte.to_ascii_lowercase())
337                        .collect(),
338                ))
339            } else {
340                None
341            }
342        } else {
343            None
344        };
345
346        // Build the regex for non-literal patterns (or literal Unicode fallback).
347        let regex = if literal_search.is_some() {
348            None
349        } else {
350            let regex_pattern = if is_literal {
351                regex::escape(pattern)
352            } else {
353                pattern.to_string()
354            };
355            let mut builder = RegexBuilder::new(&regex_pattern);
356            builder.case_insensitive(!case_sensitive);
357            // Treat `^` and `$` as line anchors (grep semantics), not file anchors.
358            builder.multi_line(true);
359            match builder.build() {
360                Ok(r) => Some(r),
361                Err(_) => {
362                    return GrepResult {
363                        matches: Vec::new(),
364                        total_matches: 0,
365                        files_searched: 0,
366                        files_with_matches: 0,
367                        index_status: if self.ready {
368                            IndexStatus::Ready
369                        } else {
370                            IndexStatus::Building
371                        },
372                        truncated: false,
373                    };
374                }
375            }
376        };
377
378        let matcher = if let Some(literal_search) = literal_search {
379            SearchMatcher::Literal(literal_search)
380        } else {
381            SearchMatcher::Regex(
382                regex.expect("regex should exist when literal matcher is unavailable"),
383            )
384        };
385
386        let filters = match build_path_filters(include, exclude) {
387            Ok(filters) => filters,
388            Err(_) => PathFilters::default(),
389        };
390        let search_root = canonicalize_or_normalize(search_root);
391
392        let query = decompose_regex(pattern);
393        let candidate_ids = self.candidates(&query);
394
395        let candidate_files: Vec<&FileEntry> = candidate_ids
396            .into_iter()
397            .filter_map(|file_id| self.files.get(file_id as usize))
398            .filter(|file| !file.path.as_os_str().is_empty())
399            .filter(|file| is_within_search_root(&search_root, &file.path))
400            .filter(|file| filters.matches(&self.project_root, &file.path))
401            .collect();
402
403        let total_matches = AtomicUsize::new(0);
404        let files_searched = AtomicUsize::new(0);
405        let files_with_matches = AtomicUsize::new(0);
406        let truncated = AtomicBool::new(false);
407        let stop_after = max_results.saturating_mul(2);
408
409        let mut matches = if candidate_files.len() > 10 {
410            candidate_files
411                .par_iter()
412                .map(|file| {
413                    search_candidate_file(
414                        file,
415                        &matcher,
416                        max_results,
417                        stop_after,
418                        &total_matches,
419                        &files_searched,
420                        &files_with_matches,
421                        &truncated,
422                    )
423                })
424                .reduce(Vec::new, |mut left, mut right| {
425                    left.append(&mut right);
426                    left
427                })
428        } else {
429            let mut matches = Vec::new();
430            for file in candidate_files {
431                matches.extend(search_candidate_file(
432                    file,
433                    &matcher,
434                    max_results,
435                    stop_after,
436                    &total_matches,
437                    &files_searched,
438                    &files_with_matches,
439                    &truncated,
440                ));
441
442                if should_stop_search(&truncated, &total_matches, stop_after) {
443                    break;
444                }
445            }
446            matches
447        };
448
449        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
450            self.path_to_id
451                .get(path)
452                .and_then(|file_id| self.files.get(*file_id as usize))
453                .map(|file| file.modified)
454        });
455
456        let matches = matches
457            .into_iter()
458            .map(|matched| GrepMatch {
459                file: matched.file.as_ref().clone(),
460                line: matched.line,
461                column: matched.column,
462                line_text: matched.line_text,
463                match_text: matched.match_text,
464            })
465            .collect();
466
467        GrepResult {
468            total_matches: total_matches.load(Ordering::Relaxed),
469            matches,
470            files_searched: files_searched.load(Ordering::Relaxed),
471            files_with_matches: files_with_matches.load(Ordering::Relaxed),
472            index_status: if self.ready {
473                IndexStatus::Ready
474            } else {
475                IndexStatus::Building
476            },
477            truncated: truncated.load(Ordering::Relaxed),
478        }
479    }
480
481    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
482        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
483            Ok(filters) => filters,
484            Err(_) => return Vec::new(),
485        };
486        let search_root = canonicalize_or_normalize(search_root);
487        let mut entries = self
488            .files
489            .iter()
490            .filter(|file| !file.path.as_os_str().is_empty())
491            .filter(|file| is_within_search_root(&search_root, &file.path))
492            .filter(|file| filters.matches(&self.project_root, &file.path))
493            .map(|file| (file.path.clone(), file.modified))
494            .collect::<Vec<_>>();
495
496        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
497            right_mtime
498                .cmp(left_mtime)
499                .then_with(|| left_path.cmp(right_path))
500        });
501
502        entries.into_iter().map(|(path, _)| path).collect()
503    }
504
505    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
506        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
507            return self.active_file_ids();
508        }
509
510        let mut and_trigrams = query.and_trigrams.clone();
511        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
512
513        let mut current: Option<Vec<u32>> = None;
514
515        for trigram in and_trigrams {
516            let filter = query.and_filters.get(&trigram).copied();
517            let matches = self.postings_for_trigram(trigram, filter);
518            current = Some(match current.take() {
519                Some(existing) => intersect_sorted_ids(&existing, &matches),
520                None => matches,
521            });
522
523            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
524                break;
525            }
526        }
527
528        let mut current = current.unwrap_or_else(|| self.active_file_ids());
529
530        for (index, group) in query.or_groups.iter().enumerate() {
531            let mut group_matches = Vec::new();
532            let filters = query.or_filters.get(index);
533
534            for trigram in group {
535                let filter = filters.and_then(|filters| filters.get(trigram).copied());
536                let matches = self.postings_for_trigram(*trigram, filter);
537                if group_matches.is_empty() {
538                    group_matches = matches;
539                } else {
540                    group_matches = union_sorted_ids(&group_matches, &matches);
541                }
542            }
543
544            current = intersect_sorted_ids(&current, &group_matches);
545            if current.is_empty() {
546                break;
547            }
548        }
549
550        let mut unindexed = self
551            .unindexed_files
552            .iter()
553            .copied()
554            .filter(|file_id| self.is_active_file(*file_id))
555            .collect::<Vec<_>>();
556        if !unindexed.is_empty() {
557            unindexed.sort_unstable();
558            current = union_sorted_ids(&current, &unindexed);
559        }
560
561        current
562    }
563
564    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
565        if fs::create_dir_all(cache_dir).is_err() {
566            return;
567        }
568
569        let postings_path = cache_dir.join("postings.bin");
570        let lookup_path = cache_dir.join("lookup.bin");
571        let tmp_postings = cache_dir.join("postings.bin.tmp");
572        let tmp_lookup = cache_dir.join("lookup.bin.tmp");
573
574        let active_ids = self.active_file_ids();
575        let mut id_map = HashMap::new();
576        for (new_id, old_id) in active_ids.iter().enumerate() {
577            let Ok(new_id_u32) = u32::try_from(new_id) else {
578                return;
579            };
580            id_map.insert(*old_id, new_id_u32);
581        }
582
583        let write_result = (|| -> std::io::Result<()> {
584            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);
585
586            postings_writer.write_all(INDEX_MAGIC)?;
587            write_u32(&mut postings_writer, INDEX_VERSION)?;
588
589            let head = git_head.unwrap_or_default();
590            let root = self.project_root.to_string_lossy();
591            let head_len = u32::try_from(head.len())
592                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
593            let root_len = u32::try_from(root.len())
594                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
595            let file_count = u32::try_from(active_ids.len())
596                .map_err(|_| std::io::Error::other("too many files to cache"))?;
597
598            write_u32(&mut postings_writer, head_len)?;
599            write_u32(&mut postings_writer, root_len)?;
600            write_u64(&mut postings_writer, self.max_file_size)?;
601            write_u32(&mut postings_writer, file_count)?;
602            postings_writer.write_all(head.as_bytes())?;
603            postings_writer.write_all(root.as_bytes())?;
604
605            for old_id in &active_ids {
606                let Some(file) = self.files.get(*old_id as usize) else {
607                    return Err(std::io::Error::other("missing file entry for cache write"));
608                };
609                let path = relative_to_root(&self.project_root, &file.path);
610                let path = path.to_string_lossy();
611                let path_len = u32::try_from(path.len())
612                    .map_err(|_| std::io::Error::other("cached path too large"))?;
613                let modified = file
614                    .modified
615                    .duration_since(UNIX_EPOCH)
616                    .unwrap_or(Duration::ZERO);
617                let unindexed = if self.unindexed_files.contains(old_id) {
618                    1u8
619                } else {
620                    0u8
621                };
622
623                postings_writer.write_all(&[unindexed])?;
624                write_u32(&mut postings_writer, path_len)?;
625                write_u64(&mut postings_writer, file.size)?;
626                write_u64(&mut postings_writer, modified.as_secs())?;
627                write_u32(&mut postings_writer, modified.subsec_nanos())?;
628                postings_writer.write_all(path.as_bytes())?;
629            }
630
631            let mut lookup_entries = Vec::new();
632            let mut postings_blob = Vec::new();
633            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
634            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
635
636            for (trigram, postings) in sorted_postings {
637                let offset = u64::try_from(postings_blob.len())
638                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
639                let mut count = 0u32;
640
641                for posting in postings {
642                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
643                        continue;
644                    };
645
646                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
647                    postings_blob.push(posting.next_mask);
648                    postings_blob.push(posting.loc_mask);
649                    count = count.saturating_add(1);
650                }
651
652                if count > 0 {
653                    lookup_entries.push((*trigram, offset, count));
654                }
655            }
656
657            write_u64(
658                &mut postings_writer,
659                u64::try_from(postings_blob.len())
660                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
661            )?;
662            postings_writer.write_all(&postings_blob)?;
663            write_crc32(&mut postings_writer, &tmp_postings)?;
664            postings_writer.flush()?;
665            drop(postings_writer);
666
667            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
668            let entry_count = u32::try_from(lookup_entries.len())
669                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
670
671            lookup_writer.write_all(LOOKUP_MAGIC)?;
672            write_u32(&mut lookup_writer, INDEX_VERSION)?;
673            write_u32(&mut lookup_writer, entry_count)?;
674
675            for (trigram, offset, count) in lookup_entries {
676                write_u32(&mut lookup_writer, trigram)?;
677                write_u64(&mut lookup_writer, offset)?;
678                write_u32(&mut lookup_writer, count)?;
679            }
680
681            write_crc32(&mut lookup_writer, &tmp_lookup)?;
682            lookup_writer.flush()?;
683            drop(lookup_writer);
684
685            fs::rename(&tmp_postings, &postings_path)?;
686            fs::rename(&tmp_lookup, &lookup_path)?;
687
688            Ok(())
689        })();
690
691        if write_result.is_err() {
692            let _ = fs::remove_file(&tmp_postings);
693            let _ = fs::remove_file(&tmp_lookup);
694        }
695    }
696
697    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
698        let postings_path = cache_dir.join("postings.bin");
699        let lookup_path = cache_dir.join("lookup.bin");
700
701        let mut postings_reader = BufReader::new(File::open(&postings_path).ok()?);
702        let mut lookup_reader = BufReader::new(File::open(&lookup_path).ok()?);
703        let postings_len_total =
704            usize::try_from(postings_reader.get_ref().metadata().ok()?.len()).ok()?;
705        let lookup_len_total =
706            usize::try_from(lookup_reader.get_ref().metadata().ok()?.len()).ok()?;
707        if postings_len_total < 4 || lookup_len_total < 4 {
708            return None;
709        }
710        verify_crc32(&postings_path).ok()?;
711        verify_crc32(&lookup_path).ok()?;
712
713        let mut magic = [0u8; 8];
714        postings_reader.read_exact(&mut magic).ok()?;
715        if &magic != INDEX_MAGIC {
716            return None;
717        }
718        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
719            return None;
720        }
721
722        let head_len = read_u32(&mut postings_reader).ok()? as usize;
723        let root_len = read_u32(&mut postings_reader).ok()? as usize;
724        let max_file_size = read_u64(&mut postings_reader).ok()?;
725        let file_count = read_u32(&mut postings_reader).ok()? as usize;
726        if file_count > MAX_ENTRIES {
727            return None;
728        }
729        let postings_body_len = postings_len_total.checked_sub(4)?;
730        let lookup_body_len = lookup_len_total.checked_sub(4)?;
731
732        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
733        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
734        if minimum_file_bytes > remaining_postings {
735            return None;
736        }
737
738        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
739            return None;
740        }
741        let mut head_bytes = vec![0u8; head_len];
742        postings_reader.read_exact(&mut head_bytes).ok()?;
743        let git_head = String::from_utf8(head_bytes)
744            .ok()
745            .filter(|head| !head.is_empty());
746
747        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
748            return None;
749        }
750        let mut root_bytes = vec![0u8; root_len];
751        postings_reader.read_exact(&mut root_bytes).ok()?;
752        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
753
754        let mut files = Vec::with_capacity(file_count);
755        let mut path_to_id = HashMap::new();
756        let mut unindexed_files = HashSet::new();
757
758        for file_id in 0..file_count {
759            let mut unindexed = [0u8; 1];
760            postings_reader.read_exact(&mut unindexed).ok()?;
761            let path_len = read_u32(&mut postings_reader).ok()? as usize;
762            let size = read_u64(&mut postings_reader).ok()?;
763            let secs = read_u64(&mut postings_reader).ok()?;
764            let nanos = read_u32(&mut postings_reader).ok()?;
765            if nanos >= 1_000_000_000 {
766                return None;
767            }
768            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
769                return None;
770            }
771            let mut path_bytes = vec![0u8; path_len];
772            postings_reader.read_exact(&mut path_bytes).ok()?;
773            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
774            let full_path = project_root.join(relative_path);
775            let file_id_u32 = u32::try_from(file_id).ok()?;
776
777            files.push(FileEntry {
778                path: full_path.clone(),
779                size,
780                modified: UNIX_EPOCH + Duration::new(secs, nanos),
781            });
782            path_to_id.insert(full_path, file_id_u32);
783            if unindexed[0] == 1 {
784                unindexed_files.insert(file_id_u32);
785            }
786        }
787
788        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
789        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
790        if postings_len > max_postings_bytes {
791            return None;
792        }
793        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
794            return None;
795        }
796        let mut postings_blob = vec![0u8; postings_len];
797        postings_reader.read_exact(&mut postings_blob).ok()?;
798
799        let mut lookup_magic = [0u8; 8];
800        lookup_reader.read_exact(&mut lookup_magic).ok()?;
801        if &lookup_magic != LOOKUP_MAGIC {
802            return None;
803        }
804        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
805            return None;
806        }
807        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
808        if entry_count > MAX_ENTRIES {
809            return None;
810        }
811        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
812        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
813        if minimum_lookup_bytes > remaining_lookup {
814            return None;
815        }
816
817        let mut postings = HashMap::new();
818        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
819
820        for _ in 0..entry_count {
821            let trigram = read_u32(&mut lookup_reader).ok()?;
822            let offset = read_u64(&mut lookup_reader).ok()? as usize;
823            let count = read_u32(&mut lookup_reader).ok()? as usize;
824            if count > MAX_ENTRIES {
825                return None;
826            }
827            let bytes_len = count.checked_mul(POSTING_BYTES)?;
828            let end = offset.checked_add(bytes_len)?;
829            if end > postings_blob.len() {
830                return None;
831            }
832
833            let mut trigram_postings = Vec::with_capacity(count);
834            for chunk in postings_blob[offset..end].chunks_exact(6) {
835                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
836                let posting = Posting {
837                    file_id,
838                    next_mask: chunk[4],
839                    loc_mask: chunk[5],
840                };
841                trigram_postings.push(posting.clone());
842                file_trigrams.entry(file_id).or_default().push(trigram);
843            }
844            postings.insert(trigram, trigram_postings);
845        }
846
847        Some(SearchIndex {
848            postings,
849            files,
850            path_to_id,
851            ready: true,
852            project_root,
853            git_head,
854            max_file_size,
855            file_trigrams,
856            unindexed_files,
857        })
858    }
859
860    pub(crate) fn stored_git_head(&self) -> Option<&str> {
861        self.git_head.as_deref()
862    }
863
864    pub(crate) fn set_ready(&mut self, ready: bool) {
865        self.ready = ready;
866    }
867
868    pub(crate) fn rebuild_or_refresh(
869        root: &Path,
870        max_file_size: u64,
871        current_head: Option<String>,
872        baseline: Option<SearchIndex>,
873    ) -> Self {
874        if current_head.is_none() {
875            return SearchIndex::build_with_limit(root, max_file_size);
876        }
877
878        if let Some(mut baseline) = baseline {
879            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
880            baseline.max_file_size = max_file_size;
881
882            if baseline.git_head == current_head {
883                // HEAD matches, but files may have changed on disk since the index was
884                // last written (e.g., uncommitted edits, stash pop, manual file changes
885                // while OpenCode was closed). Verify mtimes and re-index stale files.
886                verify_file_mtimes(&mut baseline);
887                baseline.ready = true;
888                return baseline;
889            }
890
891            if let (Some(previous), Some(current)) =
892                (baseline.git_head.clone(), current_head.clone())
893            {
894                let project_root = baseline.project_root.clone();
895                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
896                    baseline.git_head = Some(current);
897                    baseline.ready = true;
898                    return baseline;
899                }
900            }
901        }
902
903        SearchIndex::build_with_limit(root, max_file_size)
904    }
905
906    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
907        let file_id = u32::try_from(self.files.len()).ok()?;
908        let metadata = fs::metadata(path).ok();
909        let size = metadata
910            .as_ref()
911            .map_or(size_hint, |metadata| metadata.len());
912        let modified = metadata
913            .and_then(|metadata| metadata.modified().ok())
914            .unwrap_or(UNIX_EPOCH);
915
916        self.files.push(FileEntry {
917            path: path.to_path_buf(),
918            size,
919            modified,
920        });
921        self.path_to_id.insert(path.to_path_buf(), file_id);
922        Some(file_id)
923    }
924
925    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
926        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
927            return;
928        };
929        self.unindexed_files.insert(file_id);
930        self.file_trigrams.insert(file_id, Vec::new());
931    }
932
933    fn active_file_ids(&self) -> Vec<u32> {
934        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
935        ids.sort_unstable();
936        ids
937    }
938
939    fn is_active_file(&self, file_id: u32) -> bool {
940        self.files
941            .get(file_id as usize)
942            .map(|file| !file.path.as_os_str().is_empty())
943            .unwrap_or(false)
944    }
945
946    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
947        let Some(postings) = self.postings.get(&trigram) else {
948            return Vec::new();
949        };
950
951        let mut matches = Vec::with_capacity(postings.len());
952
953        for posting in postings {
954            if let Some(filter) = filter {
955                // next_mask: bloom filter check — the character following this trigram in the
956                // query must also appear after this trigram somewhere in the file.
957                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
958                    continue;
959                }
960                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
961                // between consecutive trigram pairs, but is NOT used as a single-trigram
962                // filter because the position in the query string has no relationship to
963                // the position in the file. Using it here causes false negatives.
964            }
965            if self.is_active_file(posting.file_id) {
966                matches.push(posting.file_id);
967            }
968        }
969
970        matches
971    }
972}
973
974fn search_candidate_file(
975    file: &FileEntry,
976    matcher: &SearchMatcher,
977    max_results: usize,
978    stop_after: usize,
979    total_matches: &AtomicUsize,
980    files_searched: &AtomicUsize,
981    files_with_matches: &AtomicUsize,
982    truncated: &AtomicBool,
983) -> Vec<SharedGrepMatch> {
984    if should_stop_search(truncated, total_matches, stop_after) {
985        return Vec::new();
986    }
987
988    let content = match read_indexed_file_bytes(&file.path) {
989        Some(content) => content,
990        None => return Vec::new(),
991    };
992    // Defense in depth: even though indexing tries to filter binaries via
993    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
994    // query time. content_inspector is fast (~bytes-per-cycle on a small
995    // preview) and this guarantees we never surface matches inside binary
996    // files even if the indexer somehow let one through (e.g. file changed
997    // between indexing and query).
998    if is_binary_bytes(&content) {
999        return Vec::new();
1000    }
1001    files_searched.fetch_add(1, Ordering::Relaxed);
1002
1003    let shared_path = Arc::new(file.path.clone());
1004    let mut matches = Vec::new();
1005    let mut line_starts = None;
1006    let mut seen_lines = HashSet::new();
1007    let mut matched_this_file = false;
1008
1009    match matcher {
1010        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1011            let finder = memchr::memmem::Finder::new(needle);
1012            let mut start = 0;
1013
1014            while let Some(position) = finder.find(&content[start..]) {
1015                if should_stop_search(truncated, total_matches, stop_after) {
1016                    break;
1017                }
1018
1019                let offset = start + position;
1020                start = offset + 1;
1021
1022                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1023                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1024                if !seen_lines.insert(line) {
1025                    continue;
1026                }
1027
1028                matched_this_file = true;
1029                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1030                if match_number > max_results {
1031                    truncated.store(true, Ordering::Relaxed);
1032                    break;
1033                }
1034
1035                let end = offset + needle.len();
1036                matches.push(SharedGrepMatch {
1037                    file: shared_path.clone(),
1038                    line,
1039                    column,
1040                    line_text,
1041                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1042                });
1043            }
1044        }
1045        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1046            let search_content = content.to_ascii_lowercase();
1047            let finder = memchr::memmem::Finder::new(needle);
1048            let mut start = 0;
1049
1050            while let Some(position) = finder.find(&search_content[start..]) {
1051                if should_stop_search(truncated, total_matches, stop_after) {
1052                    break;
1053                }
1054
1055                let offset = start + position;
1056                start = offset + 1;
1057
1058                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1059                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1060                if !seen_lines.insert(line) {
1061                    continue;
1062                }
1063
1064                matched_this_file = true;
1065                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1066                if match_number > max_results {
1067                    truncated.store(true, Ordering::Relaxed);
1068                    break;
1069                }
1070
1071                let end = offset + needle.len();
1072                matches.push(SharedGrepMatch {
1073                    file: shared_path.clone(),
1074                    line,
1075                    column,
1076                    line_text,
1077                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1078                });
1079            }
1080        }
1081        SearchMatcher::Regex(regex) => {
1082            for matched in regex.find_iter(&content) {
1083                if should_stop_search(truncated, total_matches, stop_after) {
1084                    break;
1085                }
1086
1087                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1088                let (line, column, line_text) =
1089                    line_details_bytes(&content, line_starts, matched.start());
1090                if !seen_lines.insert(line) {
1091                    continue;
1092                }
1093
1094                matched_this_file = true;
1095                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1096                if match_number > max_results {
1097                    truncated.store(true, Ordering::Relaxed);
1098                    break;
1099                }
1100
1101                matches.push(SharedGrepMatch {
1102                    file: shared_path.clone(),
1103                    line,
1104                    column,
1105                    line_text,
1106                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1107                });
1108            }
1109        }
1110    }
1111
1112    if matched_this_file {
1113        files_with_matches.fetch_add(1, Ordering::Relaxed);
1114    }
1115
1116    matches
1117}
1118
1119fn should_stop_search(
1120    truncated: &AtomicBool,
1121    total_matches: &AtomicUsize,
1122    stop_after: usize,
1123) -> bool {
1124    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1125}
1126
1127fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1128    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1129    let mut left_index = 0;
1130    let mut right_index = 0;
1131
1132    while left_index < left.len() && right_index < right.len() {
1133        match left[left_index].cmp(&right[right_index]) {
1134            std::cmp::Ordering::Less => left_index += 1,
1135            std::cmp::Ordering::Greater => right_index += 1,
1136            std::cmp::Ordering::Equal => {
1137                merged.push(left[left_index]);
1138                left_index += 1;
1139                right_index += 1;
1140            }
1141        }
1142    }
1143
1144    merged
1145}
1146
1147fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1148    let mut merged = Vec::with_capacity(left.len() + right.len());
1149    let mut left_index = 0;
1150    let mut right_index = 0;
1151
1152    while left_index < left.len() && right_index < right.len() {
1153        match left[left_index].cmp(&right[right_index]) {
1154            std::cmp::Ordering::Less => {
1155                merged.push(left[left_index]);
1156                left_index += 1;
1157            }
1158            std::cmp::Ordering::Greater => {
1159                merged.push(right[right_index]);
1160                right_index += 1;
1161            }
1162            std::cmp::Ordering::Equal => {
1163                merged.push(left[left_index]);
1164                left_index += 1;
1165                right_index += 1;
1166            }
1167        }
1168    }
1169
1170    merged.extend_from_slice(&left[left_index..]);
1171    merged.extend_from_slice(&right[right_index..]);
1172    merged
1173}
1174
1175pub fn decompose_regex(pattern: &str) -> RegexQuery {
1176    let hir = match regex_syntax::parse(pattern) {
1177        Ok(hir) => hir,
1178        Err(_) => return RegexQuery::default(),
1179    };
1180
1181    let build = build_query(&hir);
1182    build.into_query()
1183}
1184
1185pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1186    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1187}
1188
1189pub fn normalize_char(c: u8) -> u8 {
1190    c.to_ascii_lowercase()
1191}
1192
1193pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1194    if content.len() < 3 {
1195        return Vec::new();
1196    }
1197
1198    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1199    for start in 0..=content.len() - 3 {
1200        let trigram = pack_trigram(
1201            normalize_char(content[start]),
1202            normalize_char(content[start + 1]),
1203            normalize_char(content[start + 2]),
1204        );
1205        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1206        trigrams.push((trigram, next_char, start));
1207    }
1208    trigrams
1209}
1210
1211pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1212    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1213    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1214        return PathBuf::from(override_dir)
1215            .join("index")
1216            .join(project_cache_key(project_root));
1217    }
1218    // Use configured storage dir (from plugin, XDG-compliant)
1219    if let Some(dir) = storage_dir {
1220        return dir.join("index").join(project_cache_key(project_root));
1221    }
1222    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1223    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1224    // If neither is set, fall back to a temp directory rather than `"."`
1225    // because the search-index code reads/writes absolute paths.
1226    let home = std::env::var_os("HOME")
1227        .or_else(|| std::env::var_os("USERPROFILE"))
1228        .map(PathBuf::from)
1229        .unwrap_or_else(std::env::temp_dir);
1230    home.join(".cache")
1231        .join("aft")
1232        .join("index")
1233        .join(project_cache_key(project_root))
1234}
1235
1236pub(crate) fn build_path_filters(
1237    include: &[String],
1238    exclude: &[String],
1239) -> Result<PathFilters, String> {
1240    Ok(PathFilters {
1241        includes: build_globset(include)?,
1242        excludes: build_globset(exclude)?,
1243    })
1244}
1245
1246pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1247    walk_project_files_from(root, root, filters)
1248}
1249
1250pub(crate) fn walk_project_files_from(
1251    filter_root: &Path,
1252    search_root: &Path,
1253    filters: &PathFilters,
1254) -> Vec<PathBuf> {
1255    let mut builder = WalkBuilder::new(search_root);
1256    builder
1257        .hidden(false)
1258        .git_ignore(true)
1259        .git_global(true)
1260        .git_exclude(true)
1261        .filter_entry(|entry| {
1262            let name = entry.file_name().to_string_lossy();
1263            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1264                return !matches!(
1265                    name.as_ref(),
1266                    "node_modules"
1267                        | "target"
1268                        | "venv"
1269                        | ".venv"
1270                        | ".git"
1271                        | "__pycache__"
1272                        | ".tox"
1273                        | "dist"
1274                        | "build"
1275                );
1276            }
1277            true
1278        });
1279
1280    let mut files = Vec::new();
1281    for entry in builder.build().filter_map(|entry| entry.ok()) {
1282        if !entry
1283            .file_type()
1284            .map_or(false, |file_type| file_type.is_file())
1285        {
1286            continue;
1287        }
1288        let path = entry.into_path();
1289        if filters.matches(filter_root, &path) {
1290            files.push(path);
1291        }
1292    }
1293
1294    sort_paths_by_mtime_desc(&mut files);
1295    files
1296}
1297
1298pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1299    let bytes = fs::read(path).ok()?;
1300    if is_binary_bytes(&bytes) {
1301        return None;
1302    }
1303    String::from_utf8(bytes).ok()
1304}
1305
1306fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1307    fs::read(path).ok()
1308}
1309
1310pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1311    path.strip_prefix(root)
1312        .map(PathBuf::from)
1313        .unwrap_or_else(|_| path.to_path_buf())
1314}
1315
1316/// Sort paths newest-first by mtime, falling back to lexicographic order.
1317///
1318/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1319/// `sort_by()` closure. That made the comparator non-deterministic — a
1320/// `stat()` syscall for the same path can return different values across
1321/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1322/// concurrent file-watcher activity), and Rust's slice::sort panics at
1323/// runtime when it detects a non-total-order comparator. CI hit this on
1324/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1325///
1326/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1327/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1328pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1329    use std::collections::HashMap;
1330    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1331    for path in paths.iter() {
1332        mtimes
1333            .entry(path.clone())
1334            .or_insert_with(|| path_modified_time(path));
1335    }
1336    paths.sort_by(|left, right| {
1337        let left_mtime = mtimes.get(left).and_then(|v| *v);
1338        let right_mtime = mtimes.get(right).and_then(|v| *v);
1339        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1340    });
1341}
1342
1343/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1344/// the sort. Same fix, applied to grep matches that share files.
1345pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1346    use std::collections::HashMap;
1347    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1348    for m in matches.iter() {
1349        mtimes.entry(m.file.clone()).or_insert_with(|| {
1350            let resolved = resolve_match_path(project_root, &m.file);
1351            path_modified_time(&resolved)
1352        });
1353    }
1354    matches.sort_by(|left, right| {
1355        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1356        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1357        right_mtime
1358            .cmp(&left_mtime)
1359            .then_with(|| left.file.cmp(&right.file))
1360            .then_with(|| left.line.cmp(&right.line))
1361            .then_with(|| left.column.cmp(&right.column))
1362    });
1363}
1364
1365/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1366/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1367/// table from the search index), but it can still return different values if
1368/// the file is modified mid-sort. Snapshot once.
1369fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1370    matches: &mut [SharedGrepMatch],
1371    modified_for_path: F,
1372) where
1373    F: Fn(&Path) -> Option<SystemTime>,
1374{
1375    use std::collections::HashMap;
1376    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1377    for m in matches.iter() {
1378        let path = m.file.as_path().to_path_buf();
1379        mtimes
1380            .entry(path.clone())
1381            .or_insert_with(|| modified_for_path(&path));
1382    }
1383    matches.sort_by(|left, right| {
1384        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1385        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1386        right_mtime
1387            .cmp(&left_mtime)
1388            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1389            .then_with(|| left.line.cmp(&right.line))
1390            .then_with(|| left.column.cmp(&right.column))
1391    });
1392}
1393
1394pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1395    let resolved_project_root = canonicalize_or_normalize(project_root);
1396    let root = match path {
1397        Some(path) => {
1398            let path = PathBuf::from(path);
1399            if path.is_absolute() {
1400                canonicalize_or_normalize(&path)
1401            } else {
1402                normalize_path(&resolved_project_root.join(path))
1403            }
1404        }
1405        None => resolved_project_root.clone(),
1406    };
1407
1408    let use_index = is_within_search_root(&resolved_project_root, &root);
1409    SearchScope { root, use_index }
1410}
1411
1412pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1413    content_inspector::inspect(content).is_binary()
1414}
1415
1416pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1417    run_git(root, &["rev-parse", "HEAD"])
1418}
1419
1420pub fn project_cache_key(project_root: &Path) -> String {
1421    use sha2::{Digest, Sha256};
1422
1423    let mut hasher = Sha256::new();
1424
1425    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1426        // Git repo: root commit is the unique identity.
1427        // Same repo cloned anywhere produces the same key.
1428        hasher.update(root_commit.as_bytes());
1429    } else {
1430        // Non-git project: use the canonical filesystem path as identity.
1431        let canonical_root = canonicalize_or_normalize(project_root);
1432        hasher.update(canonical_root.to_string_lossy().as_bytes());
1433    }
1434
1435    let digest = format!("{:x}", hasher.finalize());
1436    digest[..16].to_string()
1437}
1438
1439impl PathFilters {
1440    fn matches(&self, root: &Path, path: &Path) -> bool {
1441        let relative = to_glob_path(&relative_to_root(root, path));
1442        if self
1443            .includes
1444            .as_ref()
1445            .is_some_and(|includes| !includes.is_match(&relative))
1446        {
1447            return false;
1448        }
1449        if self
1450            .excludes
1451            .as_ref()
1452            .is_some_and(|excludes| excludes.is_match(&relative))
1453        {
1454            return false;
1455        }
1456        true
1457    }
1458}
1459
1460fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1461    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1462}
1463
1464fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1465    if path.is_absolute() {
1466        path.to_path_buf()
1467    } else {
1468        project_root.join(path)
1469    }
1470}
1471
1472fn path_modified_time(path: &Path) -> Option<SystemTime> {
1473    fs::metadata(path)
1474        .and_then(|metadata| metadata.modified())
1475        .ok()
1476}
1477
1478fn normalize_path(path: &Path) -> PathBuf {
1479    let mut result = PathBuf::new();
1480    for component in path.components() {
1481        match component {
1482            Component::ParentDir => {
1483                if !result.pop() {
1484                    result.push(component);
1485                }
1486            }
1487            Component::CurDir => {}
1488            _ => result.push(component),
1489        }
1490    }
1491    result
1492}
1493
1494/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1495/// since the index was last written. Also detect new files and deleted files.
1496fn verify_file_mtimes(index: &mut SearchIndex) {
1497    // Collect stale files (mtime mismatch or deleted)
1498    let mut stale_paths = Vec::new();
1499    for entry in &index.files {
1500        if entry.path.as_os_str().is_empty() {
1501            continue; // tombstoned entry
1502        }
1503        match fs::metadata(&entry.path) {
1504            Ok(meta) => {
1505                let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1506                if current_mtime != entry.modified || meta.len() != entry.size {
1507                    stale_paths.push(entry.path.clone());
1508                }
1509            }
1510            Err(_) => {
1511                // File deleted
1512                stale_paths.push(entry.path.clone());
1513            }
1514        }
1515    }
1516
1517    // Re-index stale files
1518    for path in &stale_paths {
1519        index.update_file(path);
1520    }
1521
1522    // Detect new files not in the index
1523    let filters = PathFilters::default();
1524    for path in walk_project_files(&index.project_root, &filters) {
1525        if !index.path_to_id.contains_key(&path) {
1526            index.update_file(&path);
1527        }
1528    }
1529
1530    if !stale_paths.is_empty() {
1531        log::info!(
1532            "search index: refreshed {} stale file(s) from disk cache",
1533            stale_paths.len()
1534        );
1535    }
1536}
1537
1538fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1539    path.starts_with(search_root)
1540}
1541
1542impl QueryBuild {
1543    fn into_query(self) -> RegexQuery {
1544        let mut query = RegexQuery::default();
1545
1546        for run in self.and_runs {
1547            add_run_to_and_query(&mut query, &run);
1548        }
1549
1550        for group in self.or_groups {
1551            let mut trigrams = BTreeSet::new();
1552            let mut filters = HashMap::new();
1553            for run in group {
1554                for (trigram, filter) in trigram_filters(&run) {
1555                    trigrams.insert(trigram);
1556                    merge_filter(filters.entry(trigram).or_default(), filter);
1557                }
1558            }
1559            if !trigrams.is_empty() {
1560                query.or_groups.push(trigrams.into_iter().collect());
1561                query.or_filters.push(filters);
1562            }
1563        }
1564
1565        query
1566    }
1567}
1568
1569fn build_query(hir: &Hir) -> QueryBuild {
1570    match hir.kind() {
1571        HirKind::Literal(literal) => {
1572            if literal.0.len() >= 3 {
1573                QueryBuild {
1574                    and_runs: vec![literal.0.to_vec()],
1575                    or_groups: Vec::new(),
1576                }
1577            } else {
1578                QueryBuild::default()
1579            }
1580        }
1581        HirKind::Capture(capture) => build_query(&capture.sub),
1582        HirKind::Concat(parts) => {
1583            let mut build = QueryBuild::default();
1584            for part in parts {
1585                let part_build = build_query(part);
1586                build.and_runs.extend(part_build.and_runs);
1587                build.or_groups.extend(part_build.or_groups);
1588            }
1589            build
1590        }
1591        HirKind::Alternation(parts) => {
1592            let mut group = Vec::new();
1593            for part in parts {
1594                let Some(mut choices) = guaranteed_run_choices(part) else {
1595                    return QueryBuild::default();
1596                };
1597                group.append(&mut choices);
1598            }
1599            if group.is_empty() {
1600                QueryBuild::default()
1601            } else {
1602                QueryBuild {
1603                    and_runs: Vec::new(),
1604                    or_groups: vec![group],
1605                }
1606            }
1607        }
1608        HirKind::Repetition(repetition) => {
1609            if repetition.min == 0 {
1610                QueryBuild::default()
1611            } else {
1612                build_query(&repetition.sub)
1613            }
1614        }
1615        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1616    }
1617}
1618
1619fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1620    match hir.kind() {
1621        HirKind::Literal(literal) => {
1622            if literal.0.len() >= 3 {
1623                Some(vec![literal.0.to_vec()])
1624            } else {
1625                None
1626            }
1627        }
1628        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1629        HirKind::Concat(parts) => {
1630            let mut runs = Vec::new();
1631            for part in parts {
1632                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1633                    runs.append(&mut part_runs);
1634                }
1635            }
1636            if runs.is_empty() {
1637                None
1638            } else {
1639                Some(runs)
1640            }
1641        }
1642        HirKind::Alternation(parts) => {
1643            let mut runs = Vec::new();
1644            for part in parts {
1645                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1646                    return None;
1647                };
1648                runs.append(&mut part_runs);
1649            }
1650            if runs.is_empty() {
1651                None
1652            } else {
1653                Some(runs)
1654            }
1655        }
1656        HirKind::Repetition(repetition) => {
1657            if repetition.min == 0 {
1658                None
1659            } else {
1660                guaranteed_run_choices(&repetition.sub)
1661            }
1662        }
1663        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1664    }
1665}
1666
1667fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1668    for (trigram, filter) in trigram_filters(run) {
1669        if !query.and_trigrams.contains(&trigram) {
1670            query.and_trigrams.push(trigram);
1671        }
1672        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1673    }
1674}
1675
1676fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1677    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1678    for (trigram, next_char, position) in extract_trigrams(run) {
1679        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1680        if next_char != EOF_SENTINEL {
1681            entry.next_mask |= mask_for_next_char(next_char);
1682        }
1683        entry.loc_mask |= mask_for_position(position);
1684    }
1685    filters.into_iter().collect()
1686}
1687
1688fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1689    target.next_mask |= filter.next_mask;
1690    target.loc_mask |= filter.loc_mask;
1691}
1692
1693fn mask_for_next_char(next_char: u8) -> u8 {
1694    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1695    1u8 << bit
1696}
1697
1698fn mask_for_position(position: usize) -> u8 {
1699    1u8 << (position % 8)
1700}
1701
1702fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1703    if patterns.is_empty() {
1704        return Ok(None);
1705    }
1706
1707    let mut builder = GlobSetBuilder::new();
1708    for pattern in patterns {
1709        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1710        builder.add(glob);
1711    }
1712    builder.build().map(Some).map_err(|error| error.to_string())
1713}
1714
1715fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1716    let mut buffer = [0u8; 4];
1717    reader.read_exact(&mut buffer)?;
1718    Ok(u32::from_le_bytes(buffer))
1719}
1720
1721fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1722    let mut buffer = [0u8; 8];
1723    reader.read_exact(&mut buffer)?;
1724    Ok(u64::from_le_bytes(buffer))
1725}
1726
1727fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1728    writer.write_all(&value.to_le_bytes())
1729}
1730
1731fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1732    writer.write_all(&value.to_le_bytes())
1733}
1734
1735fn write_crc32(writer: &mut BufWriter<File>, path: &Path) -> std::io::Result<()> {
1736    writer.flush()?;
1737    let body = std::fs::read(path)?;
1738    let checksum = crc32fast::hash(&body);
1739    writer.write_all(&checksum.to_le_bytes())
1740}
1741
1742fn verify_crc32(path: &Path) -> std::io::Result<()> {
1743    let bytes = std::fs::read(path)?;
1744    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1745        return Err(std::io::Error::other("search index checksum missing"));
1746    };
1747    let expected = u32::from_le_bytes(*stored);
1748    let actual = crc32fast::hash(body);
1749    if actual != expected {
1750        return Err(std::io::Error::other("search index checksum mismatch"));
1751    }
1752    Ok(())
1753}
1754
1755fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
1756    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
1757    total_len.checked_sub(pos)
1758}
1759
1760fn run_git(root: &Path, args: &[&str]) -> Option<String> {
1761    let output = Command::new("git")
1762        .arg("-C")
1763        .arg(root)
1764        .args(args)
1765        .output()
1766        .ok()?;
1767    if !output.status.success() {
1768        return None;
1769    }
1770    let value = String::from_utf8(output.stdout).ok()?;
1771    let value = value.trim().to_string();
1772    if value.is_empty() {
1773        None
1774    } else {
1775        Some(value)
1776    }
1777}
1778
1779fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1780    let diff_range = format!("{}..{}", from, to);
1781    let output = match Command::new("git")
1782        .arg("-C")
1783        .arg(root)
1784        .args(["diff", "--name-only", &diff_range])
1785        .output()
1786    {
1787        Ok(output) => output,
1788        Err(_) => return false,
1789    };
1790
1791    if !output.status.success() {
1792        return false;
1793    }
1794
1795    let Ok(paths) = String::from_utf8(output.stdout) else {
1796        return false;
1797    };
1798
1799    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1800        let path = root.join(relative_path);
1801        if path.exists() {
1802            index.update_file(&path);
1803        } else {
1804            index.remove_file(&path);
1805        }
1806    }
1807
1808    true
1809}
1810
1811fn is_binary_path(path: &Path, size: u64) -> bool {
1812    if size == 0 {
1813        return false;
1814    }
1815
1816    let mut file = match File::open(path) {
1817        Ok(file) => file,
1818        Err(_) => return true,
1819    };
1820
1821    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1822    match file.read(&mut preview) {
1823        Ok(read) => is_binary_bytes(&preview[..read]),
1824        Err(_) => true,
1825    }
1826}
1827
1828fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
1829    let mut starts = vec![0usize];
1830    for (index, byte) in content.iter().copied().enumerate() {
1831        if byte == b'\n' {
1832            starts.push(index + 1);
1833        }
1834    }
1835    starts
1836}
1837
1838fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
1839    let line_index = match line_starts.binary_search(&offset) {
1840        Ok(index) => index,
1841        Err(index) => index.saturating_sub(1),
1842    };
1843    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
1844    let line_end = content[line_start..]
1845        .iter()
1846        .position(|byte| *byte == b'\n')
1847        .map(|length| line_start + length)
1848        .unwrap_or(content.len());
1849    let mut line_slice = &content[line_start..line_end];
1850    if line_slice.ends_with(b"\r") {
1851        line_slice = &line_slice[..line_slice.len() - 1];
1852    }
1853    let line_text = String::from_utf8_lossy(line_slice).into_owned();
1854    let column = String::from_utf8_lossy(&content[line_start..offset])
1855        .chars()
1856        .count() as u32
1857        + 1;
1858    (line_index as u32 + 1, column, line_text)
1859}
1860
1861fn to_glob_path(path: &Path) -> String {
1862    path.to_string_lossy().replace('\\', "/")
1863}
1864
1865#[cfg(test)]
1866mod tests {
1867    use std::process::Command;
1868
1869    use super::*;
1870
1871    #[test]
1872    fn extract_trigrams_tracks_next_char_and_position() {
1873        let trigrams = extract_trigrams(b"Rust");
1874        assert_eq!(trigrams.len(), 2);
1875        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
1876        assert_eq!(
1877            trigrams[1],
1878            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
1879        );
1880    }
1881
1882    #[test]
1883    fn decompose_regex_extracts_literals_and_alternations() {
1884        let query = decompose_regex("abc(def|ghi)xyz");
1885        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
1886        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
1887        assert_eq!(query.or_groups.len(), 1);
1888        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
1889        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
1890    }
1891
1892    #[test]
1893    fn candidates_intersect_posting_lists() {
1894        let mut index = SearchIndex::new();
1895        let dir = tempfile::tempdir().expect("create temp dir");
1896        let alpha = dir.path().join("alpha.txt");
1897        let beta = dir.path().join("beta.txt");
1898        fs::write(&alpha, "abcdef").expect("write alpha");
1899        fs::write(&beta, "abcxyz").expect("write beta");
1900        index.project_root = dir.path().to_path_buf();
1901        index.index_file(&alpha, b"abcdef");
1902        index.index_file(&beta, b"abcxyz");
1903
1904        let query = RegexQuery {
1905            and_trigrams: vec![
1906                pack_trigram(b'a', b'b', b'c'),
1907                pack_trigram(b'd', b'e', b'f'),
1908            ],
1909            ..RegexQuery::default()
1910        };
1911
1912        let candidates = index.candidates(&query);
1913        assert_eq!(candidates.len(), 1);
1914        assert_eq!(index.files[candidates[0] as usize].path, alpha);
1915    }
1916
1917    #[test]
1918    fn candidates_apply_bloom_filters() {
1919        let mut index = SearchIndex::new();
1920        let dir = tempfile::tempdir().expect("create temp dir");
1921        let file = dir.path().join("sample.txt");
1922        fs::write(&file, "abcd efgh").expect("write sample");
1923        index.project_root = dir.path().to_path_buf();
1924        index.index_file(&file, b"abcd efgh");
1925
1926        let trigram = pack_trigram(b'a', b'b', b'c');
1927        let matching_filter = PostingFilter {
1928            next_mask: mask_for_next_char(b'd'),
1929            loc_mask: mask_for_position(0),
1930        };
1931        let non_matching_filter = PostingFilter {
1932            next_mask: mask_for_next_char(b'z'),
1933            loc_mask: mask_for_position(0),
1934        };
1935
1936        assert_eq!(
1937            index
1938                .postings_for_trigram(trigram, Some(matching_filter))
1939                .len(),
1940            1
1941        );
1942        assert!(index
1943            .postings_for_trigram(trigram, Some(non_matching_filter))
1944            .is_empty());
1945    }
1946
1947    #[test]
1948    fn disk_round_trip_preserves_postings_and_files() {
1949        let dir = tempfile::tempdir().expect("create temp dir");
1950        let project = dir.path().join("project");
1951        fs::create_dir_all(&project).expect("create project dir");
1952        let file = project.join("src.txt");
1953        fs::write(&file, "abcdef").expect("write source");
1954
1955        let mut index = SearchIndex::build(&project);
1956        index.git_head = Some("deadbeef".to_string());
1957        let cache_dir = dir.path().join("cache");
1958        index.write_to_disk(&cache_dir, index.git_head.as_deref());
1959
1960        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
1961        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
1962        assert_eq!(loaded.files.len(), 1);
1963        assert_eq!(
1964            relative_to_root(&loaded.project_root, &loaded.files[0].path),
1965            PathBuf::from("src.txt")
1966        );
1967        assert_eq!(loaded.postings.len(), index.postings.len());
1968        assert!(loaded
1969            .postings
1970            .contains_key(&pack_trigram(b'a', b'b', b'c')));
1971    }
1972
1973    #[test]
1974    fn read_from_disk_rejects_corrupt_postings_checksum() {
1975        let dir = tempfile::tempdir().expect("create temp dir");
1976        let project = dir.path().join("project");
1977        fs::create_dir_all(&project).expect("create project dir");
1978        fs::write(project.join("src.txt"), "abcdef").expect("write source");
1979
1980        let index = SearchIndex::build(&project);
1981        let cache_dir = dir.path().join("cache");
1982        index.write_to_disk(&cache_dir, None);
1983
1984        let postings_path = cache_dir.join("postings.bin");
1985        let mut bytes = fs::read(&postings_path).expect("read postings");
1986        let middle = bytes.len() / 2;
1987        bytes[middle] ^= 0xff;
1988        fs::write(&postings_path, bytes).expect("write corrupted postings");
1989
1990        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
1991    }
1992
1993    #[test]
1994    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
1995        let dir = tempfile::tempdir().expect("create temp dir");
1996        let project = dir.path().join("project");
1997        fs::create_dir_all(&project).expect("create project dir");
1998        fs::write(project.join("src.txt"), "abcdef").expect("write source");
1999
2000        let index = SearchIndex::build(&project);
2001        let cache_dir = dir.path().join("cache");
2002        index.write_to_disk(&cache_dir, None);
2003
2004        assert!(cache_dir.join("postings.bin").is_file());
2005        assert!(cache_dir.join("lookup.bin").is_file());
2006        assert!(!cache_dir.join("postings.bin.tmp").exists());
2007        assert!(!cache_dir.join("lookup.bin.tmp").exists());
2008    }
2009
2010    #[test]
2011    fn project_cache_key_includes_checkout_path() {
2012        let dir = tempfile::tempdir().expect("create temp dir");
2013        let source = dir.path().join("source");
2014        fs::create_dir_all(&source).expect("create source repo dir");
2015        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2016
2017        assert!(Command::new("git")
2018            .current_dir(&source)
2019            .args(["init"])
2020            .status()
2021            .expect("init git repo")
2022            .success());
2023        assert!(Command::new("git")
2024            .current_dir(&source)
2025            .args(["add", "."])
2026            .status()
2027            .expect("git add")
2028            .success());
2029        assert!(Command::new("git")
2030            .current_dir(&source)
2031            .args([
2032                "-c",
2033                "user.name=AFT Tests",
2034                "-c",
2035                "user.email=aft-tests@example.com",
2036                "commit",
2037                "-m",
2038                "initial",
2039            ])
2040            .status()
2041            .expect("git commit")
2042            .success());
2043
2044        let clone = dir.path().join("clone");
2045        assert!(Command::new("git")
2046            .args(["clone", "--quiet"])
2047            .arg(&source)
2048            .arg(&clone)
2049            .status()
2050            .expect("git clone")
2051            .success());
2052
2053        let source_key = project_cache_key(&source);
2054        let clone_key = project_cache_key(&clone);
2055
2056        assert_eq!(source_key.len(), 16);
2057        assert_eq!(clone_key.len(), 16);
2058        // Same repo (same root commit) → same cache key regardless of clone path
2059        assert_eq!(source_key, clone_key);
2060    }
2061
2062    #[test]
2063    fn resolve_search_scope_disables_index_for_external_path() {
2064        let dir = tempfile::tempdir().expect("create temp dir");
2065        let project = dir.path().join("project");
2066        let outside = dir.path().join("outside");
2067        fs::create_dir_all(&project).expect("create project dir");
2068        fs::create_dir_all(&outside).expect("create outside dir");
2069
2070        let scope = resolve_search_scope(&project, outside.to_str());
2071
2072        assert_eq!(
2073            scope.root,
2074            fs::canonicalize(&outside).expect("canonicalize outside")
2075        );
2076        assert!(!scope.use_index);
2077    }
2078
2079    #[test]
2080    fn grep_filters_matches_to_search_root() {
2081        let dir = tempfile::tempdir().expect("create temp dir");
2082        let project = dir.path().join("project");
2083        let src = project.join("src");
2084        let docs = project.join("docs");
2085        fs::create_dir_all(&src).expect("create src dir");
2086        fs::create_dir_all(&docs).expect("create docs dir");
2087        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2088        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2089
2090        let index = SearchIndex::build(&project);
2091        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2092
2093        assert_eq!(result.files_searched, 1);
2094        assert_eq!(result.files_with_matches, 1);
2095        assert_eq!(result.matches.len(), 1);
2096        // Index stores canonicalized paths; on macOS /var → /private/var
2097        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2098        assert_eq!(result.matches[0].file, expected);
2099    }
2100
2101    #[test]
2102    fn grep_deduplicates_multiple_matches_on_same_line() {
2103        let dir = tempfile::tempdir().expect("create temp dir");
2104        let project = dir.path().join("project");
2105        let src = project.join("src");
2106        fs::create_dir_all(&src).expect("create src dir");
2107        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2108
2109        let index = SearchIndex::build(&project);
2110        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2111
2112        assert_eq!(result.total_matches, 1);
2113        assert_eq!(result.matches.len(), 1);
2114    }
2115
2116    #[test]
2117    fn grep_reports_total_matches_before_truncation() {
2118        let dir = tempfile::tempdir().expect("create temp dir");
2119        let project = dir.path().join("project");
2120        let src = project.join("src");
2121        fs::create_dir_all(&src).expect("create src dir");
2122        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2123
2124        let index = SearchIndex::build(&project);
2125        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2126
2127        assert_eq!(result.total_matches, 2);
2128        assert_eq!(result.matches.len(), 1);
2129        assert!(result.truncated);
2130    }
2131
2132    #[test]
2133    fn glob_filters_results_to_search_root() {
2134        let dir = tempfile::tempdir().expect("create temp dir");
2135        let project = dir.path().join("project");
2136        let src = project.join("src");
2137        let scripts = project.join("scripts");
2138        fs::create_dir_all(&src).expect("create src dir");
2139        fs::create_dir_all(&scripts).expect("create scripts dir");
2140        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2141        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2142
2143        let index = SearchIndex::build(&project);
2144        let files = index.glob("**/*.rs", &src);
2145
2146        assert_eq!(
2147            files,
2148            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2149        );
2150    }
2151
2152    #[test]
2153    fn glob_includes_hidden_and_binary_files() {
2154        let dir = tempfile::tempdir().expect("create temp dir");
2155        let project = dir.path().join("project");
2156        let hidden_dir = project.join(".hidden");
2157        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2158        let hidden_file = hidden_dir.join("data.bin");
2159        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2160
2161        let index = SearchIndex::build(&project);
2162        let files = index.glob("**/*.bin", &project);
2163
2164        assert_eq!(
2165            files,
2166            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2167        );
2168    }
2169
2170    #[test]
2171    fn read_from_disk_rejects_invalid_nanos() {
2172        let dir = tempfile::tempdir().expect("create temp dir");
2173        let cache_dir = dir.path().join("cache");
2174        fs::create_dir_all(&cache_dir).expect("create cache dir");
2175
2176        let mut postings = Vec::new();
2177        postings.extend_from_slice(INDEX_MAGIC);
2178        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2179        postings.extend_from_slice(&0u32.to_le_bytes());
2180        postings.extend_from_slice(&1u32.to_le_bytes());
2181        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2182        postings.extend_from_slice(&1u32.to_le_bytes());
2183        postings.extend_from_slice(b"/");
2184        postings.push(0u8);
2185        postings.extend_from_slice(&1u32.to_le_bytes());
2186        postings.extend_from_slice(&0u64.to_le_bytes());
2187        postings.extend_from_slice(&0u64.to_le_bytes());
2188        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2189        postings.extend_from_slice(b"a");
2190        postings.extend_from_slice(&0u64.to_le_bytes());
2191
2192        let mut lookup = Vec::new();
2193        lookup.extend_from_slice(LOOKUP_MAGIC);
2194        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2195        lookup.extend_from_slice(&0u32.to_le_bytes());
2196
2197        fs::write(cache_dir.join("postings.bin"), postings).expect("write postings");
2198        fs::write(cache_dir.join("lookup.bin"), lookup).expect("write lookup");
2199
2200        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2201    }
2202
2203    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2204    /// changed between cmp() calls.
2205    ///
2206    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2207    /// which does a `stat()` syscall. If the file was deleted, modified, or
2208    /// touched mid-sort, the comparator returned different values for the
2209    /// same input pair on different invocations. Rust's slice::sort detects
2210    /// this and panics with "user-provided comparison function does not
2211    /// correctly implement a total order".
2212    ///
2213    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2214    /// bridge invalidated files in parallel with grep's sort path. This
2215    /// test simulates the worst case: most paths don't exist (Err from
2216    /// fs::metadata) and sort still completes successfully.
2217    #[test]
2218    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2219        // Mix of existing and non-existing paths in deliberately
2220        // non-monotonic order — pre-fix, the sort would call stat() at
2221        // least N log N times and any flakiness would trigger the panic.
2222        let dir = tempfile::tempdir().expect("create tempdir");
2223        let mut paths: Vec<PathBuf> = Vec::new();
2224        for i in 0..30 {
2225            // Half exist, half don't.
2226            let path = if i % 2 == 0 {
2227                let p = dir.path().join(format!("real-{i}.rs"));
2228                fs::write(&p, format!("// {i}\n")).expect("write");
2229                p
2230            } else {
2231                dir.path().join(format!("missing-{i}.rs"))
2232            };
2233            paths.push(path);
2234        }
2235
2236        // Run the sort many times to maximise the chance of catching any
2237        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2238        for _ in 0..50 {
2239            let mut copy = paths.clone();
2240            sort_paths_by_mtime_desc(&mut copy);
2241            assert_eq!(copy.len(), paths.len());
2242        }
2243    }
2244
2245    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2246    /// the same conditions as sort_paths_by_mtime_desc. See the
2247    /// sort_paths_... test above for the full rationale.
2248    #[test]
2249    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2250        let dir = tempfile::tempdir().expect("create tempdir");
2251        let mut matches: Vec<GrepMatch> = Vec::new();
2252        for i in 0..30 {
2253            let file = if i % 2 == 0 {
2254                let p = dir.path().join(format!("real-{i}.rs"));
2255                fs::write(&p, format!("// {i}\n")).expect("write");
2256                p
2257            } else {
2258                dir.path().join(format!("missing-{i}.rs"))
2259            };
2260            matches.push(GrepMatch {
2261                file,
2262                line: u32::try_from(i).unwrap_or(0),
2263                column: 0,
2264                line_text: format!("match {i}"),
2265                match_text: format!("match {i}"),
2266            });
2267        }
2268
2269        for _ in 0..50 {
2270            let mut copy = matches.clone();
2271            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2272            assert_eq!(copy.len(), matches.len());
2273        }
2274    }
2275}