aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const INDEX_VERSION: u32 = 3;
27const PREVIEW_BYTES: usize = 8 * 1024;
28const EOF_SENTINEL: u8 = 0;
29const MAX_ENTRIES: usize = 10_000_000;
30const MIN_FILE_ENTRY_BYTES: usize = 57;
31const LOOKUP_ENTRY_BYTES: usize = 16;
32const POSTING_BYTES: usize = 6;
33static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
35pub struct CacheLock {
36    _guard: fs_lock::LockGuard,
37}
38
39impl CacheLock {
40    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41        fs::create_dir_all(cache_dir)?;
42        let path = cache_dir.join("cache.lock");
43        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44            .lock()
45            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46        fs_lock::try_acquire(&path, Duration::from_secs(2))
47            .map(|guard| Self { _guard: guard })
48            .map_err(|error| match error {
49                fs_lock::AcquireError::Timeout => {
50                    std::io::Error::other("timed out acquiring search cache lock")
51                }
52                fs_lock::AcquireError::Io(error) => error,
53            })
54    }
55}
56
57#[derive(Clone, Debug)]
58pub struct SearchIndex {
59    pub postings: HashMap<u32, Vec<Posting>>,
60    pub files: Vec<FileEntry>,
61    pub path_to_id: HashMap<PathBuf, u32>,
62    pub ready: bool,
63    project_root: PathBuf,
64    git_head: Option<String>,
65    max_file_size: u64,
66    pub file_trigrams: HashMap<u32, Vec<u32>>,
67    unindexed_files: HashSet<u32>,
68}
69
70#[derive(Clone, Debug, Default)]
71pub struct LexicalRankResult {
72    pub files: Vec<(PathBuf, f32)>,
73    pub engine_capped: bool,
74}
75
76impl SearchIndex {
77    /// Number of indexed files.
78    pub fn file_count(&self) -> usize {
79        self.files.len()
80    }
81
82    /// Number of unique trigrams in the index.
83    pub fn trigram_count(&self) -> usize {
84        self.postings.len()
85    }
86
87    /// Compute distinct query trigrams from literal tokens.
88    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
89        query_trigrams_from_tokens(tokens)
90    }
91
92    /// Score-rank file candidates by lexical relevance to query trigrams.
93    pub fn lexical_rank(
94        &self,
95        query_trigrams: &[u32],
96        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
97        max_files: usize,
98    ) -> Vec<(PathBuf, f32)> {
99        self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
100            .files
101    }
102
103    /// Score-rank file candidates and report whether pre-filter candidate
104    /// enumeration hit the internal 200/500 cap before ranking.
105    pub fn lexical_rank_with_stats(
106        &self,
107        query_trigrams: &[u32],
108        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
109        max_files: usize,
110    ) -> LexicalRankResult {
111        if query_trigrams.is_empty() || max_files == 0 {
112            return LexicalRankResult::default();
113        }
114
115        let mut non_zero: Vec<(u32, usize)> = query_trigrams
116            .iter()
117            .filter_map(|trigram| {
118                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
119                (posting_count > 0).then_some((*trigram, posting_count))
120            })
121            .collect();
122        if non_zero.is_empty() {
123            return LexicalRankResult::default();
124        }
125
126        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
127        let selected_count = non_zero.len().min(3);
128        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
129
130        let mut candidate_ids = BTreeSet::new();
131        for (trigram, _) in non_zero.iter().take(selected_count) {
132            if let Some(postings) = self.postings.get(trigram) {
133                for posting in postings {
134                    if self.is_active_file(posting.file_id) {
135                        candidate_ids.insert(posting.file_id);
136                    }
137                }
138            }
139        }
140        let pre_filter_candidate_count = candidate_ids.len();
141        let engine_capped = pre_filter_candidate_count > candidate_cap;
142        let filtered_candidates = candidate_ids
143            .into_iter()
144            .filter_map(|file_id| {
145                self.files
146                    .get(file_id as usize)
147                    .map(|entry| (file_id, entry))
148            })
149            .filter(|(_, entry)| {
150                if let Some(filter) = candidate_filter {
151                    filter(&entry.path)
152                } else {
153                    true
154                }
155            })
156            .collect::<Vec<_>>();
157
158        let mut ranked = Vec::new();
159        for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
160            let score = lexical_score(self, query_trigrams, file_id);
161            if score > 0.0 {
162                ranked.push((entry.path.clone(), score));
163            }
164        }
165
166        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
167        ranked.truncate(max_files);
168        LexicalRankResult {
169            files: ranked,
170            engine_capped,
171        }
172    }
173}
174
175#[derive(Clone, Debug, PartialEq, Eq)]
176pub struct Posting {
177    pub file_id: u32,
178    pub next_mask: u8,
179    pub loc_mask: u8,
180}
181
182#[derive(Clone, Debug)]
183pub struct FileEntry {
184    pub path: PathBuf,
185    pub size: u64,
186    pub modified: SystemTime,
187    pub content_hash: blake3::Hash,
188}
189
190#[derive(Clone, Debug, PartialEq, Eq)]
191pub struct GrepMatch {
192    pub file: PathBuf,
193    pub line: u32,
194    pub column: u32,
195    pub line_text: String,
196    pub match_text: String,
197}
198
199#[derive(Clone, Debug)]
200pub struct GrepResult {
201    pub matches: Vec<GrepMatch>,
202    pub total_matches: usize,
203    pub files_searched: usize,
204    pub files_with_matches: usize,
205    pub index_status: IndexStatus,
206    pub truncated: bool,
207    pub fully_degraded: bool,
208    pub engine_capped: bool,
209}
210
211#[derive(Clone, Copy, Debug, PartialEq, Eq)]
212pub enum IndexStatus {
213    Ready,
214    Building,
215    Fallback,
216    Disabled,
217}
218
219impl IndexStatus {
220    pub fn as_str(&self) -> &'static str {
221        match self {
222            IndexStatus::Ready => "Ready",
223            IndexStatus::Building => "Building",
224            IndexStatus::Fallback => "Fallback",
225            IndexStatus::Disabled => "Disabled",
226        }
227    }
228}
229
230#[derive(Clone, Debug, Default)]
231pub struct RegexQuery {
232    pub and_trigrams: Vec<u32>,
233    pub or_groups: Vec<Vec<u32>>,
234    pub(crate) and_filters: HashMap<u32, PostingFilter>,
235    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
236}
237
238#[derive(Clone, Copy, Debug, Default)]
239pub(crate) struct PostingFilter {
240    next_mask: u8,
241    loc_mask: u8,
242}
243
244#[derive(Clone, Debug, Default)]
245struct QueryBuild {
246    and_runs: Vec<Vec<u8>>,
247    or_groups: Vec<Vec<Vec<u8>>>,
248}
249
250#[derive(Clone, Debug, Default)]
251pub(crate) struct PathFilters {
252    includes: Option<GlobSet>,
253    excludes: Option<GlobSet>,
254}
255
256#[derive(Clone, Debug)]
257pub(crate) struct SearchScope {
258    pub root: PathBuf,
259    pub use_index: bool,
260}
261
262#[derive(Clone, Debug)]
263struct SharedGrepMatch {
264    file: Arc<PathBuf>,
265    line: u32,
266    column: u32,
267    line_text: String,
268    match_text: String,
269}
270
271#[derive(Clone, Debug)]
272enum SearchMatcher {
273    Literal(LiteralSearch),
274    Regex(Regex),
275}
276
277impl SearchIndex {
278    pub fn new() -> Self {
279        SearchIndex {
280            postings: HashMap::new(),
281            files: Vec::new(),
282            path_to_id: HashMap::new(),
283            ready: false,
284            project_root: PathBuf::new(),
285            git_head: None,
286            max_file_size: DEFAULT_MAX_FILE_SIZE,
287            file_trigrams: HashMap::new(),
288            unindexed_files: HashSet::new(),
289        }
290    }
291
292    pub fn build(root: &Path) -> Self {
293        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
294    }
295
296    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
297        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
298        let mut index = SearchIndex {
299            project_root: project_root.clone(),
300            max_file_size,
301            ..SearchIndex::new()
302        };
303
304        let filters = PathFilters::default();
305        for path in walk_project_files(&project_root, &filters) {
306            index.update_file(&path);
307        }
308
309        index.git_head = current_git_head(&project_root);
310        index.ready = true;
311        index
312    }
313
314    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
315        self.remove_file(path);
316
317        let file_id = match self.allocate_file_id(path, content.len() as u64) {
318            Some(file_id) => file_id,
319            None => return,
320        };
321        if let Some(file) = self.files.get_mut(file_id as usize) {
322            file.content_hash = cache_freshness::hash_bytes(content);
323        }
324
325        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
326        for (trigram, next_char, position) in extract_trigrams(content) {
327            let entry = trigram_map.entry(trigram).or_default();
328            entry.next_mask |= mask_for_next_char(next_char);
329            entry.loc_mask |= mask_for_position(position);
330        }
331
332        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
333        for (trigram, filter) in trigram_map {
334            let postings = self.postings.entry(trigram).or_default();
335            postings.push(Posting {
336                file_id,
337                next_mask: filter.next_mask,
338                loc_mask: filter.loc_mask,
339            });
340            // Posting lists are kept sorted by file_id for binary search during
341            // intersection. Since file_ids are allocated incrementally, the new
342            // entry is usually already in order. Only sort when needed.
343            if postings.len() > 1
344                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
345            {
346                postings.sort_unstable_by_key(|p| p.file_id);
347            }
348            file_trigrams.push(trigram);
349        }
350
351        self.file_trigrams.insert(file_id, file_trigrams);
352        self.unindexed_files.remove(&file_id);
353    }
354
355    pub fn remove_file(&mut self, path: &Path) {
356        let canonical_path = canonicalize_existing_or_deleted_path(path);
357        let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
358            file_id
359        } else if canonical_path.as_path() != path {
360            let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
361                return;
362            };
363            file_id
364        } else {
365            return;
366        };
367
368        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
369            for trigram in trigrams {
370                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
371                    postings.retain(|posting| posting.file_id != file_id);
372                    postings.is_empty()
373                } else {
374                    false
375                };
376
377                if should_remove {
378                    self.postings.remove(&trigram);
379                }
380            }
381        }
382
383        self.unindexed_files.remove(&file_id);
384        if let Some(file) = self.files.get_mut(file_id as usize) {
385            file.path = PathBuf::new();
386            file.size = 0;
387            file.modified = UNIX_EPOCH;
388            file.content_hash = cache_freshness::zero_hash();
389        }
390    }
391
392    pub fn update_file(&mut self, path: &Path) {
393        self.remove_file(path);
394
395        let metadata = match fs::metadata(path) {
396            Ok(metadata) if metadata.is_file() => metadata,
397            _ => return,
398        };
399
400        if is_binary_path(path, metadata.len()) {
401            self.track_unindexed_file(path, &metadata);
402            return;
403        }
404
405        if metadata.len() > self.max_file_size {
406            self.track_unindexed_file(path, &metadata);
407            return;
408        }
409
410        let content = match fs::read(path) {
411            Ok(content) => content,
412            Err(_) => return,
413        };
414
415        if is_binary_bytes(&content) {
416            self.track_unindexed_file(path, &metadata);
417            return;
418        }
419
420        self.index_file(path, &content);
421    }
422
423    pub fn grep(
424        &self,
425        pattern: &str,
426        case_sensitive: bool,
427        include: &[String],
428        exclude: &[String],
429        search_root: &Path,
430        max_results: usize,
431    ) -> GrepResult {
432        match pattern_compile::compile(
433            pattern,
434            CompileOpts {
435                case_insensitive: !case_sensitive,
436                ..CompileOpts::default()
437            },
438        ) {
439            CompileResult::Ok(compiled) => {
440                self.search_grep(&compiled, include, exclude, search_root, max_results)
441            }
442            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
443                self.empty_grep_result()
444            }
445        }
446    }
447
448    pub fn search_grep(
449        &self,
450        pattern: &CompiledPattern,
451        include: &[String],
452        exclude: &[String],
453        search_root: &Path,
454        max_results: usize,
455    ) -> GrepResult {
456        let matcher = match pattern {
457            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
458            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
459        };
460
461        let filters = match build_path_filters(include, exclude) {
462            Ok(filters) => filters,
463            Err(_) => PathFilters::default(),
464        };
465        let search_root = canonicalize_or_normalize(search_root);
466
467        let raw_pattern = pattern.raw_pattern_for_trigrams();
468        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
469            RegexQuery::default()
470        } else {
471            decompose_regex(&raw_pattern)
472        };
473        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
474        let candidate_ids = self.candidates(&query);
475
476        let candidate_files: Vec<&FileEntry> = candidate_ids
477            .into_iter()
478            .filter_map(|file_id| self.files.get(file_id as usize))
479            .filter(|file| !file.path.as_os_str().is_empty())
480            .filter(|file| is_within_search_root(&search_root, &file.path))
481            .filter(|file| filters.matches(&self.project_root, &file.path))
482            .collect();
483
484        let total_matches = AtomicUsize::new(0);
485        let files_searched = AtomicUsize::new(0);
486        let files_with_matches = AtomicUsize::new(0);
487        let truncated = AtomicBool::new(false);
488        let engine_capped = AtomicBool::new(false);
489        let stop_after = max_results.saturating_mul(2);
490
491        let mut matches = if candidate_files.len() > 10 {
492            candidate_files
493                .par_iter()
494                .map(|file| {
495                    search_candidate_file(
496                        file,
497                        &matcher,
498                        max_results,
499                        stop_after,
500                        &total_matches,
501                        &files_searched,
502                        &files_with_matches,
503                        &truncated,
504                        &engine_capped,
505                    )
506                })
507                .reduce(Vec::new, |mut left, mut right| {
508                    left.append(&mut right);
509                    left
510                })
511        } else {
512            let mut matches = Vec::new();
513            for file in candidate_files {
514                matches.extend(search_candidate_file(
515                    file,
516                    &matcher,
517                    max_results,
518                    stop_after,
519                    &total_matches,
520                    &files_searched,
521                    &files_with_matches,
522                    &truncated,
523                    &engine_capped,
524                ));
525
526                if should_stop_search(&truncated, &total_matches, stop_after) {
527                    engine_capped.store(true, Ordering::Relaxed);
528                    break;
529                }
530            }
531            matches
532        };
533
534        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
535            self.path_to_id
536                .get(path)
537                .and_then(|file_id| self.files.get(*file_id as usize))
538                .map(|file| file.modified)
539        });
540
541        let matches = matches
542            .into_iter()
543            .map(|matched| GrepMatch {
544                file: matched.file.as_ref().clone(),
545                line: matched.line,
546                column: matched.column,
547                line_text: matched.line_text,
548                match_text: matched.match_text,
549            })
550            .collect();
551
552        GrepResult {
553            total_matches: total_matches.load(Ordering::Relaxed),
554            matches,
555            files_searched: files_searched.load(Ordering::Relaxed),
556            files_with_matches: files_with_matches.load(Ordering::Relaxed),
557            index_status: if self.ready {
558                IndexStatus::Ready
559            } else {
560                IndexStatus::Building
561            },
562            truncated: truncated.load(Ordering::Relaxed),
563            fully_degraded,
564            engine_capped: engine_capped.load(Ordering::Relaxed),
565        }
566    }
567
568    fn empty_grep_result(&self) -> GrepResult {
569        GrepResult {
570            matches: Vec::new(),
571            total_matches: 0,
572            files_searched: 0,
573            files_with_matches: 0,
574            index_status: if self.ready {
575                IndexStatus::Ready
576            } else {
577                IndexStatus::Building
578            },
579            truncated: false,
580            fully_degraded: false,
581            engine_capped: false,
582        }
583    }
584
585    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
586        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
587            Ok(filters) => filters,
588            Err(_) => return Vec::new(),
589        };
590        let search_root = canonicalize_or_normalize(search_root);
591        let mut entries = self
592            .files
593            .iter()
594            .filter(|file| !file.path.as_os_str().is_empty())
595            .filter(|file| is_within_search_root(&search_root, &file.path))
596            .filter(|file| filters.matches(&self.project_root, &file.path))
597            .map(|file| (file.path.clone(), file.modified))
598            .collect::<Vec<_>>();
599
600        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
601            right_mtime
602                .cmp(left_mtime)
603                .then_with(|| left_path.cmp(right_path))
604        });
605
606        entries.into_iter().map(|(path, _)| path).collect()
607    }
608
609    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
610        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
611            return self.active_file_ids();
612        }
613
614        let mut and_trigrams = query.and_trigrams.clone();
615        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
616
617        let mut current: Option<Vec<u32>> = None;
618
619        for trigram in and_trigrams {
620            let filter = query.and_filters.get(&trigram).copied();
621            let matches = self.postings_for_trigram(trigram, filter);
622            current = Some(match current.take() {
623                Some(existing) => intersect_sorted_ids(&existing, &matches),
624                None => matches,
625            });
626
627            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
628                break;
629            }
630        }
631
632        let mut current = current.unwrap_or_else(|| self.active_file_ids());
633
634        for (index, group) in query.or_groups.iter().enumerate() {
635            let mut group_matches = Vec::new();
636            let filters = query.or_filters.get(index);
637
638            for trigram in group {
639                let filter = filters.and_then(|filters| filters.get(trigram).copied());
640                let matches = self.postings_for_trigram(*trigram, filter);
641                if group_matches.is_empty() {
642                    group_matches = matches;
643                } else {
644                    group_matches = union_sorted_ids(&group_matches, &matches);
645                }
646            }
647
648            current = intersect_sorted_ids(&current, &group_matches);
649            if current.is_empty() {
650                break;
651            }
652        }
653
654        let mut unindexed = self
655            .unindexed_files
656            .iter()
657            .copied()
658            .filter(|file_id| self.is_active_file(*file_id))
659            .collect::<Vec<_>>();
660        if !unindexed.is_empty() {
661            unindexed.sort_unstable();
662            current = union_sorted_ids(&current, &unindexed);
663        }
664
665        current
666    }
667
668    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
669        if fs::create_dir_all(cache_dir).is_err() {
670            return;
671        }
672
673        let cache_path = cache_dir.join("cache.bin");
674        let tmp_cache = cache_dir.join(format!(
675            "cache.bin.tmp.{}.{}",
676            std::process::id(),
677            SystemTime::now()
678                .duration_since(UNIX_EPOCH)
679                .unwrap_or(Duration::ZERO)
680                .as_nanos()
681        ));
682
683        let active_ids = self.active_file_ids();
684        let mut id_map = HashMap::new();
685        for (new_id, old_id) in active_ids.iter().enumerate() {
686            let Ok(new_id_u32) = u32::try_from(new_id) else {
687                return;
688            };
689            id_map.insert(*old_id, new_id_u32);
690        }
691
692        let write_result = (|| -> std::io::Result<()> {
693            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
694
695            postings_writer.write_all(INDEX_MAGIC)?;
696            write_u32(&mut postings_writer, INDEX_VERSION)?;
697
698            let head = git_head.unwrap_or_default();
699            let root = self.project_root.to_string_lossy();
700            let head_len = u32::try_from(head.len())
701                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
702            let root_len = u32::try_from(root.len())
703                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
704            let file_count = u32::try_from(active_ids.len())
705                .map_err(|_| std::io::Error::other("too many files to cache"))?;
706
707            write_u32(&mut postings_writer, head_len)?;
708            write_u32(&mut postings_writer, root_len)?;
709            write_u64(&mut postings_writer, self.max_file_size)?;
710            write_u32(&mut postings_writer, file_count)?;
711            postings_writer.write_all(head.as_bytes())?;
712            postings_writer.write_all(root.as_bytes())?;
713
714            for old_id in &active_ids {
715                let Some(file) = self.files.get(*old_id as usize) else {
716                    return Err(std::io::Error::other("missing file entry for cache write"));
717                };
718                let path =
719                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
720                        std::io::Error::other(format!(
721                            "refusing to cache path outside project root: {}",
722                            file.path.display()
723                        ))
724                    })?;
725                let path = path.to_string_lossy();
726                let path_len = u32::try_from(path.len())
727                    .map_err(|_| std::io::Error::other("cached path too large"))?;
728                let modified = file
729                    .modified
730                    .duration_since(UNIX_EPOCH)
731                    .unwrap_or(Duration::ZERO);
732                let unindexed = if self.unindexed_files.contains(old_id) {
733                    1u8
734                } else {
735                    0u8
736                };
737
738                postings_writer.write_all(&[unindexed])?;
739                write_u32(&mut postings_writer, path_len)?;
740                write_u64(&mut postings_writer, file.size)?;
741                write_u64(&mut postings_writer, modified.as_secs())?;
742                write_u32(&mut postings_writer, modified.subsec_nanos())?;
743                postings_writer.write_all(file.content_hash.as_bytes())?;
744                postings_writer.write_all(path.as_bytes())?;
745            }
746
747            let mut lookup_entries = Vec::new();
748            let mut postings_blob = Vec::new();
749            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
750            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
751
752            for (trigram, postings) in sorted_postings {
753                let offset = u64::try_from(postings_blob.len())
754                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
755                let mut count = 0u32;
756
757                for posting in postings {
758                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
759                        continue;
760                    };
761
762                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
763                    postings_blob.push(posting.next_mask);
764                    postings_blob.push(posting.loc_mask);
765                    count = count.saturating_add(1);
766                }
767
768                if count > 0 {
769                    lookup_entries.push((*trigram, offset, count));
770                }
771            }
772
773            write_u64(
774                &mut postings_writer,
775                u64::try_from(postings_blob.len())
776                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
777            )?;
778            postings_writer.write_all(&postings_blob)?;
779            postings_writer.flush()?;
780            let mut postings_blob_file = postings_writer
781                .into_inner()
782                .map_err(|error| std::io::Error::other(error.to_string()))?
783                .into_inner();
784            let checksum = crc32fast::hash(&postings_blob_file);
785            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
786
787            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
788            let entry_count = u32::try_from(lookup_entries.len())
789                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
790
791            lookup_writer.write_all(LOOKUP_MAGIC)?;
792            write_u32(&mut lookup_writer, INDEX_VERSION)?;
793            write_u32(&mut lookup_writer, entry_count)?;
794
795            for (trigram, offset, count) in lookup_entries {
796                write_u32(&mut lookup_writer, trigram)?;
797                write_u64(&mut lookup_writer, offset)?;
798                write_u32(&mut lookup_writer, count)?;
799            }
800
801            lookup_writer.flush()?;
802            let mut lookup_blob_file = lookup_writer
803                .into_inner()
804                .map_err(|error| std::io::Error::other(error.to_string()))?
805                .into_inner();
806            let checksum = crc32fast::hash(&lookup_blob_file);
807            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
808
809            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
810            write_u32(&mut cache_writer, CACHE_MAGIC)?;
811            write_u32(&mut cache_writer, INDEX_VERSION)?;
812            write_u64(
813                &mut cache_writer,
814                u64::try_from(postings_blob_file.len())
815                    .map_err(|_| std::io::Error::other("postings section too large"))?,
816            )?;
817            cache_writer.write_all(&postings_blob_file)?;
818            cache_writer.write_all(&lookup_blob_file)?;
819            cache_writer.flush()?;
820            cache_writer.get_ref().sync_all()?;
821            drop(cache_writer);
822            fs::rename(&tmp_cache, &cache_path)?;
823
824            Ok(())
825        })();
826
827        if write_result.is_err() {
828            let _ = fs::remove_file(&tmp_cache);
829        }
830    }
831
832    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
833        debug_assert!(current_canonical_root.is_absolute());
834        let cache_path = cache_dir.join("cache.bin");
835        let cache_bytes = fs::read(&cache_path).ok()?;
836        if cache_bytes.len() < 16 {
837            return None;
838        }
839        let mut header = Cursor::new(&cache_bytes);
840        if read_u32(&mut header).ok()? != CACHE_MAGIC {
841            return None;
842        }
843        if read_u32(&mut header).ok()? != INDEX_VERSION {
844            return None;
845        }
846        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
847        let start = usize::try_from(header.position()).ok()?;
848        let postings_end = start.checked_add(postings_len_total)?;
849        if postings_end > cache_bytes.len() {
850            return None;
851        }
852        let postings_bytes = &cache_bytes[start..postings_end];
853        let lookup_bytes = &cache_bytes[postings_end..];
854        let lookup_len_total = lookup_bytes.len();
855        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
856        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
857        if postings_len_total < 4 || lookup_len_total < 4 {
858            return None;
859        }
860        verify_crc32_bytes_slice(postings_bytes).ok()?;
861        verify_crc32_bytes_slice(lookup_bytes).ok()?;
862
863        let mut magic = [0u8; 8];
864        postings_reader.read_exact(&mut magic).ok()?;
865        if &magic != INDEX_MAGIC {
866            return None;
867        }
868        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
869            return None;
870        }
871
872        let head_len = read_u32(&mut postings_reader).ok()? as usize;
873        let root_len = read_u32(&mut postings_reader).ok()? as usize;
874        let max_file_size = read_u64(&mut postings_reader).ok()?;
875        let file_count = read_u32(&mut postings_reader).ok()? as usize;
876        if file_count > MAX_ENTRIES {
877            return None;
878        }
879        let postings_body_len = postings_len_total.checked_sub(4)?;
880        let lookup_body_len = lookup_len_total.checked_sub(4)?;
881
882        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
883        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
884        if minimum_file_bytes > remaining_postings {
885            return None;
886        }
887
888        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
889            return None;
890        }
891        let mut head_bytes = vec![0u8; head_len];
892        postings_reader.read_exact(&mut head_bytes).ok()?;
893        let git_head = String::from_utf8(head_bytes)
894            .ok()
895            .filter(|head| !head.is_empty());
896
897        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
898            return None;
899        }
900        let mut root_bytes = vec![0u8; root_len];
901        postings_reader.read_exact(&mut root_bytes).ok()?;
902        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
903        let project_root = current_canonical_root.to_path_buf();
904
905        let mut files = Vec::with_capacity(file_count);
906        let mut path_to_id = HashMap::new();
907        let mut unindexed_files = HashSet::new();
908
909        for file_id in 0..file_count {
910            let mut unindexed = [0u8; 1];
911            postings_reader.read_exact(&mut unindexed).ok()?;
912            let path_len = read_u32(&mut postings_reader).ok()? as usize;
913            let size = read_u64(&mut postings_reader).ok()?;
914            let secs = read_u64(&mut postings_reader).ok()?;
915            let nanos = read_u32(&mut postings_reader).ok()?;
916            let mut hash_bytes = [0u8; 32];
917            postings_reader.read_exact(&mut hash_bytes).ok()?;
918            let content_hash = blake3::Hash::from_bytes(hash_bytes);
919            if nanos >= 1_000_000_000 {
920                return None;
921            }
922            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
923                return None;
924            }
925            let mut path_bytes = vec![0u8; path_len];
926            postings_reader.read_exact(&mut path_bytes).ok()?;
927            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
928            let full_path = cached_path_under_root(&project_root, &relative_path)?;
929            let file_id_u32 = u32::try_from(file_id).ok()?;
930
931            files.push(FileEntry {
932                path: full_path.clone(),
933                size,
934                modified: UNIX_EPOCH + Duration::new(secs, nanos),
935                content_hash,
936            });
937            path_to_id.insert(full_path, file_id_u32);
938            if unindexed[0] == 1 {
939                unindexed_files.insert(file_id_u32);
940            }
941        }
942
943        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
944        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
945        if postings_len > max_postings_bytes {
946            return None;
947        }
948        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
949            return None;
950        }
951        let mut postings_blob = vec![0u8; postings_len];
952        postings_reader.read_exact(&mut postings_blob).ok()?;
953
954        let mut lookup_magic = [0u8; 8];
955        lookup_reader.read_exact(&mut lookup_magic).ok()?;
956        if &lookup_magic != LOOKUP_MAGIC {
957            return None;
958        }
959        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
960            return None;
961        }
962        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
963        if entry_count > MAX_ENTRIES {
964            return None;
965        }
966        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
967        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
968        if minimum_lookup_bytes > remaining_lookup {
969            return None;
970        }
971
972        let mut postings = HashMap::new();
973        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
974
975        for _ in 0..entry_count {
976            let trigram = read_u32(&mut lookup_reader).ok()?;
977            let offset = read_u64(&mut lookup_reader).ok()? as usize;
978            let count = read_u32(&mut lookup_reader).ok()? as usize;
979            if count > MAX_ENTRIES {
980                return None;
981            }
982            let bytes_len = count.checked_mul(POSTING_BYTES)?;
983            let end = offset.checked_add(bytes_len)?;
984            if end > postings_blob.len() {
985                return None;
986            }
987
988            let mut trigram_postings = Vec::with_capacity(count);
989            for chunk in postings_blob[offset..end].chunks_exact(6) {
990                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
991                let posting = Posting {
992                    file_id,
993                    next_mask: chunk[4],
994                    loc_mask: chunk[5],
995                };
996                trigram_postings.push(posting.clone());
997                file_trigrams.entry(file_id).or_default().push(trigram);
998            }
999            postings.insert(trigram, trigram_postings);
1000        }
1001
1002        Some(SearchIndex {
1003            postings,
1004            files,
1005            path_to_id,
1006            ready: false,
1007            project_root,
1008            git_head,
1009            max_file_size,
1010            file_trigrams,
1011            unindexed_files,
1012        })
1013    }
1014
1015    pub(crate) fn stored_git_head(&self) -> Option<&str> {
1016        self.git_head.as_deref()
1017    }
1018
1019    pub(crate) fn set_ready(&mut self, ready: bool) {
1020        self.ready = ready;
1021    }
1022
1023    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1024        self.git_head = current_head;
1025        verify_file_mtimes(self);
1026        self.ready = true;
1027    }
1028
1029    #[cfg(debug_assertions)]
1030    #[doc(hidden)]
1031    pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1032        self.verify_against_disk(current_head);
1033    }
1034
1035    pub(crate) fn rebuild_or_refresh(
1036        root: &Path,
1037        max_file_size: u64,
1038        current_head: Option<String>,
1039        baseline: Option<SearchIndex>,
1040    ) -> Self {
1041        if let Some(mut baseline) = baseline {
1042            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1043            baseline.max_file_size = max_file_size;
1044
1045            if baseline.git_head == current_head || current_head.is_none() {
1046                // HEAD matches, but files may have changed on disk since the index was
1047                // last written (e.g., uncommitted edits, stash pop, manual file changes
1048                // while OpenCode was closed). Verify mtimes and re-index stale files.
1049                // Non-git projects also use this per-file (path, mtime, size)
1050                // fingerprint so unchanged trees reuse the disk cache instead of
1051                // rebuilding every configure.
1052                baseline.git_head = current_head;
1053                verify_file_mtimes(&mut baseline);
1054                baseline.ready = true;
1055                return baseline;
1056            }
1057
1058            if let (Some(previous), Some(current)) =
1059                (baseline.git_head.clone(), current_head.clone())
1060            {
1061                let project_root = baseline.project_root.clone();
1062                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1063                    baseline.git_head = Some(current);
1064                    verify_file_mtimes(&mut baseline);
1065                    baseline.ready = true;
1066                    return baseline;
1067                }
1068            }
1069        }
1070
1071        SearchIndex::build_with_limit(root, max_file_size)
1072    }
1073
1074    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1075        let file_id = u32::try_from(self.files.len()).ok()?;
1076        let metadata = fs::metadata(path).ok();
1077        let size = metadata
1078            .as_ref()
1079            .map_or(size_hint, |metadata| metadata.len());
1080        let modified = metadata
1081            .and_then(|metadata| metadata.modified().ok())
1082            .unwrap_or(UNIX_EPOCH);
1083
1084        self.files.push(FileEntry {
1085            path: path.to_path_buf(),
1086            size,
1087            modified,
1088            content_hash: cache_freshness::zero_hash(),
1089        });
1090        self.path_to_id.insert(path.to_path_buf(), file_id);
1091        Some(file_id)
1092    }
1093
1094    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1095        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1096            return;
1097        };
1098        self.unindexed_files.insert(file_id);
1099        self.file_trigrams.insert(file_id, Vec::new());
1100    }
1101
1102    fn active_file_ids(&self) -> Vec<u32> {
1103        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1104        ids.sort_unstable();
1105        ids
1106    }
1107
1108    fn is_active_file(&self, file_id: u32) -> bool {
1109        self.files
1110            .get(file_id as usize)
1111            .map(|file| !file.path.as_os_str().is_empty())
1112            .unwrap_or(false)
1113    }
1114
1115    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1116        let Some(postings) = self.postings.get(&trigram) else {
1117            return Vec::new();
1118        };
1119
1120        let mut matches = Vec::with_capacity(postings.len());
1121
1122        for posting in postings {
1123            if let Some(filter) = filter {
1124                // next_mask: bloom filter check — the character following this trigram in the
1125                // query must also appear after this trigram somewhere in the file.
1126                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1127                    continue;
1128                }
1129                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1130                // between consecutive trigram pairs, but is NOT used as a single-trigram
1131                // filter because the position in the query string has no relationship to
1132                // the position in the file. Using it here causes false negatives.
1133            }
1134            if self.is_active_file(posting.file_id) {
1135                matches.push(posting.file_id);
1136            }
1137        }
1138
1139        matches
1140    }
1141}
1142
1143fn search_candidate_file(
1144    file: &FileEntry,
1145    matcher: &SearchMatcher,
1146    max_results: usize,
1147    stop_after: usize,
1148    total_matches: &AtomicUsize,
1149    files_searched: &AtomicUsize,
1150    files_with_matches: &AtomicUsize,
1151    truncated: &AtomicBool,
1152    engine_capped: &AtomicBool,
1153) -> Vec<SharedGrepMatch> {
1154    if should_stop_search(truncated, total_matches, stop_after) {
1155        engine_capped.store(true, Ordering::Relaxed);
1156        return Vec::new();
1157    }
1158
1159    let content = match read_indexed_file_bytes(&file.path) {
1160        Some(content) => content,
1161        None => return Vec::new(),
1162    };
1163    // Defense in depth: even though indexing tries to filter binaries via
1164    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1165    // query time. content_inspector is fast (~bytes-per-cycle on a small
1166    // preview) and this guarantees we never surface matches inside binary
1167    // files even if the indexer somehow let one through (e.g. file changed
1168    // between indexing and query).
1169    if is_binary_bytes(&content) {
1170        return Vec::new();
1171    }
1172    files_searched.fetch_add(1, Ordering::Relaxed);
1173
1174    let shared_path = Arc::new(file.path.clone());
1175    let mut matches = Vec::new();
1176    let mut line_starts = None;
1177    let mut seen_lines = HashSet::new();
1178    let mut matched_this_file = false;
1179
1180    match matcher {
1181        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1182            let needle = &literal.needle;
1183            let finder = memchr::memmem::Finder::new(needle);
1184            let mut start = 0;
1185
1186            while let Some(position) = finder.find(&content[start..]) {
1187                if should_stop_search(truncated, total_matches, stop_after) {
1188                    engine_capped.store(true, Ordering::Relaxed);
1189                    break;
1190                }
1191
1192                let offset = start + position;
1193                start = offset + 1;
1194
1195                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1196                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1197                if !seen_lines.insert(line) {
1198                    continue;
1199                }
1200
1201                matched_this_file = true;
1202                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1203                if match_number > max_results {
1204                    truncated.store(true, Ordering::Relaxed);
1205                    break;
1206                }
1207
1208                let end = offset + needle.len();
1209                matches.push(SharedGrepMatch {
1210                    file: shared_path.clone(),
1211                    line,
1212                    column,
1213                    line_text,
1214                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1215                });
1216            }
1217        }
1218        SearchMatcher::Literal(literal) => {
1219            let needle = &literal.needle;
1220            let search_content = content.to_ascii_lowercase();
1221            let finder = memchr::memmem::Finder::new(needle);
1222            let mut start = 0;
1223
1224            while let Some(position) = finder.find(&search_content[start..]) {
1225                if should_stop_search(truncated, total_matches, stop_after) {
1226                    engine_capped.store(true, Ordering::Relaxed);
1227                    break;
1228                }
1229
1230                let offset = start + position;
1231                start = offset + 1;
1232
1233                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1234                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1235                if !seen_lines.insert(line) {
1236                    continue;
1237                }
1238
1239                matched_this_file = true;
1240                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1241                if match_number > max_results {
1242                    truncated.store(true, Ordering::Relaxed);
1243                    break;
1244                }
1245
1246                let end = offset + needle.len();
1247                matches.push(SharedGrepMatch {
1248                    file: shared_path.clone(),
1249                    line,
1250                    column,
1251                    line_text,
1252                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1253                });
1254            }
1255        }
1256        SearchMatcher::Regex(regex) => {
1257            for matched in regex.find_iter(&content) {
1258                if should_stop_search(truncated, total_matches, stop_after) {
1259                    engine_capped.store(true, Ordering::Relaxed);
1260                    break;
1261                }
1262
1263                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1264                let (line, column, line_text) =
1265                    line_details_bytes(&content, line_starts, matched.start());
1266                if !seen_lines.insert(line) {
1267                    continue;
1268                }
1269
1270                matched_this_file = true;
1271                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1272                if match_number > max_results {
1273                    truncated.store(true, Ordering::Relaxed);
1274                    break;
1275                }
1276
1277                matches.push(SharedGrepMatch {
1278                    file: shared_path.clone(),
1279                    line,
1280                    column,
1281                    line_text,
1282                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1283                });
1284            }
1285        }
1286    }
1287
1288    if matched_this_file {
1289        files_with_matches.fetch_add(1, Ordering::Relaxed);
1290    }
1291
1292    matches
1293}
1294
1295fn should_stop_search(
1296    truncated: &AtomicBool,
1297    total_matches: &AtomicUsize,
1298    stop_after: usize,
1299) -> bool {
1300    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1301}
1302
1303fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1304    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1305    let mut left_index = 0;
1306    let mut right_index = 0;
1307
1308    while left_index < left.len() && right_index < right.len() {
1309        match left[left_index].cmp(&right[right_index]) {
1310            std::cmp::Ordering::Less => left_index += 1,
1311            std::cmp::Ordering::Greater => right_index += 1,
1312            std::cmp::Ordering::Equal => {
1313                merged.push(left[left_index]);
1314                left_index += 1;
1315                right_index += 1;
1316            }
1317        }
1318    }
1319
1320    merged
1321}
1322
1323fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1324    let mut merged = Vec::with_capacity(left.len() + right.len());
1325    let mut left_index = 0;
1326    let mut right_index = 0;
1327
1328    while left_index < left.len() && right_index < right.len() {
1329        match left[left_index].cmp(&right[right_index]) {
1330            std::cmp::Ordering::Less => {
1331                merged.push(left[left_index]);
1332                left_index += 1;
1333            }
1334            std::cmp::Ordering::Greater => {
1335                merged.push(right[right_index]);
1336                right_index += 1;
1337            }
1338            std::cmp::Ordering::Equal => {
1339                merged.push(left[left_index]);
1340                left_index += 1;
1341                right_index += 1;
1342            }
1343        }
1344    }
1345
1346    merged.extend_from_slice(&left[left_index..]);
1347    merged.extend_from_slice(&right[right_index..]);
1348    merged
1349}
1350
1351pub fn decompose_regex(pattern: &str) -> RegexQuery {
1352    let hir = match regex_syntax::parse(pattern) {
1353        Ok(hir) => hir,
1354        Err(_) => return RegexQuery::default(),
1355    };
1356
1357    let build = build_query(&hir);
1358    build.into_query()
1359}
1360
1361pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1362    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1363}
1364
1365pub fn normalize_char(c: u8) -> u8 {
1366    c.to_ascii_lowercase()
1367}
1368
1369pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1370    if content.len() < 3 {
1371        return Vec::new();
1372    }
1373
1374    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1375    for start in 0..=content.len() - 3 {
1376        let trigram = pack_trigram(
1377            normalize_char(content[start]),
1378            normalize_char(content[start + 1]),
1379            normalize_char(content[start + 2]),
1380        );
1381        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1382        trigrams.push((trigram, next_char, start));
1383    }
1384    trigrams
1385}
1386
1387pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1388    let mut seen = HashSet::new();
1389    let mut out = Vec::new();
1390    for token in tokens {
1391        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1392            if seen.insert(trigram) {
1393                out.push(trigram);
1394            }
1395        }
1396    }
1397    out
1398}
1399
1400pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1401    if query_trigrams.is_empty() {
1402        return 0.0;
1403    }
1404
1405    let mut hits = 0u32;
1406    for &trigram in query_trigrams {
1407        if let Some(postings) = index.postings.get(&trigram) {
1408            if postings
1409                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1410                .is_ok()
1411            {
1412                hits += 1;
1413            }
1414        }
1415    }
1416
1417    if hits == 0 {
1418        return 0.0;
1419    }
1420
1421    let file_trigram_count = index
1422        .file_trigrams
1423        .get(&file_id)
1424        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1425    (hits as f32) / (1.0 + file_trigram_count.ln())
1426}
1427
1428pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1429    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1430    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1431        return PathBuf::from(override_dir)
1432            .join("index")
1433            .join(project_cache_key(project_root));
1434    }
1435    // Use configured storage dir (from plugin, XDG-compliant)
1436    if let Some(dir) = storage_dir {
1437        return dir.join("index").join(project_cache_key(project_root));
1438    }
1439    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1440    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1441    // If neither is set, fall back to a temp directory rather than `"."`
1442    // because the search-index code reads/writes absolute paths.
1443    let home = std::env::var_os("HOME")
1444        .or_else(|| std::env::var_os("USERPROFILE"))
1445        .map(PathBuf::from)
1446        .unwrap_or_else(std::env::temp_dir);
1447    home.join(".cache")
1448        .join("aft")
1449        .join("index")
1450        .join(project_cache_key(project_root))
1451}
1452
1453pub(crate) fn build_path_filters(
1454    include: &[String],
1455    exclude: &[String],
1456) -> Result<PathFilters, String> {
1457    Ok(PathFilters {
1458        includes: build_globset(include)?,
1459        excludes: build_globset(exclude)?,
1460    })
1461}
1462
1463pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1464    walk_project_files_from(root, root, filters)
1465}
1466
1467pub(crate) fn walk_project_files_from(
1468    filter_root: &Path,
1469    search_root: &Path,
1470    filters: &PathFilters,
1471) -> Vec<PathBuf> {
1472    let mut builder = WalkBuilder::new(search_root);
1473    builder
1474        .hidden(false)
1475        .git_ignore(true)
1476        .git_global(true)
1477        .git_exclude(true)
1478        .filter_entry(|entry| {
1479            let name = entry.file_name().to_string_lossy();
1480            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1481                return !matches!(
1482                    name.as_ref(),
1483                    "node_modules"
1484                        | "target"
1485                        | "venv"
1486                        | ".venv"
1487                        | ".git"
1488                        | "__pycache__"
1489                        | ".tox"
1490                        | "dist"
1491                        | "build"
1492                );
1493            }
1494            true
1495        });
1496
1497    let mut files = Vec::new();
1498    for entry in builder.build().filter_map(|entry| entry.ok()) {
1499        if !entry
1500            .file_type()
1501            .map_or(false, |file_type| file_type.is_file())
1502        {
1503            continue;
1504        }
1505        let path = entry.into_path();
1506        if filters.matches(filter_root, &path) {
1507            files.push(path);
1508        }
1509    }
1510
1511    sort_paths_by_mtime_desc(&mut files);
1512    files
1513}
1514
1515pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1516    let bytes = fs::read(path).ok()?;
1517    if is_binary_bytes(&bytes) {
1518        return None;
1519    }
1520    String::from_utf8(bytes).ok()
1521}
1522
1523fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1524    fs::read(path).ok()
1525}
1526
1527pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1528    path.strip_prefix(root)
1529        .map(PathBuf::from)
1530        .unwrap_or_else(|_| path.to_path_buf())
1531}
1532
1533pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1534    let normalized_root = normalize_path(root);
1535    let normalized_path = normalize_path(path);
1536    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1537    validate_cached_relative_path(relative)
1538}
1539
1540pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1541    let relative = validate_cached_relative_path(relative_path)?;
1542    let normalized_root = normalize_path(root);
1543    let full_path = normalize_path(&normalized_root.join(relative));
1544
1545    match fs::canonicalize(&full_path) {
1546        Ok(canonical_path) => {
1547            if canonical_path.starts_with(&normalized_root) {
1548                return Some(full_path);
1549            }
1550
1551            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1552            canonical_path
1553                .starts_with(&canonical_root)
1554                .then_some(full_path)
1555        }
1556        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1557    }
1558}
1559
1560pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1561    if path.is_absolute() {
1562        return None;
1563    }
1564
1565    let mut normalized = PathBuf::new();
1566    for component in path.components() {
1567        match component {
1568            Component::Normal(part) => normalized.push(part),
1569            Component::CurDir => {}
1570            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1571        }
1572    }
1573    (!normalized.as_os_str().is_empty()).then_some(normalized)
1574}
1575
1576/// Sort paths newest-first by mtime, falling back to lexicographic order.
1577///
1578/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1579/// `sort_by()` closure. That made the comparator non-deterministic — a
1580/// `stat()` syscall for the same path can return different values across
1581/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1582/// concurrent file-watcher activity), and Rust's slice::sort panics at
1583/// runtime when it detects a non-total-order comparator. CI hit this on
1584/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1585///
1586/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1587/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1588pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1589    use std::collections::HashMap;
1590    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1591    for path in paths.iter() {
1592        mtimes
1593            .entry(path.clone())
1594            .or_insert_with(|| path_modified_time(path));
1595    }
1596    paths.sort_by(|left, right| {
1597        let left_mtime = mtimes.get(left).and_then(|v| *v);
1598        let right_mtime = mtimes.get(right).and_then(|v| *v);
1599        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1600    });
1601}
1602
1603/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1604/// the sort. Same fix, applied to grep matches that share files.
1605pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1606    use std::collections::HashMap;
1607    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1608    for m in matches.iter() {
1609        mtimes.entry(m.file.clone()).or_insert_with(|| {
1610            let resolved = resolve_match_path(project_root, &m.file);
1611            path_modified_time(&resolved)
1612        });
1613    }
1614    matches.sort_by(|left, right| {
1615        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1616        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1617        right_mtime
1618            .cmp(&left_mtime)
1619            .then_with(|| left.file.cmp(&right.file))
1620            .then_with(|| left.line.cmp(&right.line))
1621            .then_with(|| left.column.cmp(&right.column))
1622    });
1623}
1624
1625/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1626/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1627/// table from the search index), but it can still return different values if
1628/// the file is modified mid-sort. Snapshot once.
1629fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1630    matches: &mut [SharedGrepMatch],
1631    modified_for_path: F,
1632) where
1633    F: Fn(&Path) -> Option<SystemTime>,
1634{
1635    use std::collections::HashMap;
1636    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1637    for m in matches.iter() {
1638        let path = m.file.as_path().to_path_buf();
1639        mtimes
1640            .entry(path.clone())
1641            .or_insert_with(|| modified_for_path(&path));
1642    }
1643    matches.sort_by(|left, right| {
1644        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1645        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1646        right_mtime
1647            .cmp(&left_mtime)
1648            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1649            .then_with(|| left.line.cmp(&right.line))
1650            .then_with(|| left.column.cmp(&right.column))
1651    });
1652}
1653
1654pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1655    let resolved_project_root = canonicalize_or_normalize(project_root);
1656    let root = match path {
1657        Some(path) => {
1658            let path = PathBuf::from(path);
1659            if path.is_absolute() {
1660                canonicalize_or_normalize(&path)
1661            } else {
1662                normalize_path(&resolved_project_root.join(path))
1663            }
1664        }
1665        None => resolved_project_root.clone(),
1666    };
1667
1668    let use_index = is_within_search_root(&resolved_project_root, &root);
1669    SearchScope { root, use_index }
1670}
1671
1672pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1673    content_inspector::inspect(content).is_binary()
1674}
1675
1676pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1677    run_git(root, &["rev-parse", "HEAD"])
1678}
1679
1680pub fn project_cache_key(project_root: &Path) -> String {
1681    use sha2::{Digest, Sha256};
1682
1683    let mut hasher = Sha256::new();
1684
1685    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1686        // Git repo: root commit is the unique identity.
1687        // Same repo cloned anywhere produces the same key.
1688        hasher.update(root_commit.as_bytes());
1689    } else {
1690        // Non-git project: use the canonical filesystem path as identity.
1691        let canonical_root = canonicalize_or_normalize(project_root);
1692        hasher.update(canonical_root.to_string_lossy().as_bytes());
1693    }
1694
1695    let digest = format!("{:x}", hasher.finalize());
1696    digest[..16].to_string()
1697}
1698
1699impl PathFilters {
1700    fn matches(&self, root: &Path, path: &Path) -> bool {
1701        let relative = to_glob_path(&relative_to_root(root, path));
1702        if self
1703            .includes
1704            .as_ref()
1705            .is_some_and(|includes| !includes.is_match(&relative))
1706        {
1707            return false;
1708        }
1709        if self
1710            .excludes
1711            .as_ref()
1712            .is_some_and(|excludes| excludes.is_match(&relative))
1713        {
1714            return false;
1715        }
1716        true
1717    }
1718}
1719
1720fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1721    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1722}
1723
1724fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1725    if path.is_absolute() {
1726        path.to_path_buf()
1727    } else {
1728        project_root.join(path)
1729    }
1730}
1731
1732fn path_modified_time(path: &Path) -> Option<SystemTime> {
1733    fs::metadata(path)
1734        .and_then(|metadata| metadata.modified())
1735        .ok()
1736}
1737
1738fn normalize_path(path: &Path) -> PathBuf {
1739    let mut result = PathBuf::new();
1740    for component in path.components() {
1741        match component {
1742            Component::ParentDir => {
1743                if !result.pop() {
1744                    result.push(component);
1745                }
1746            }
1747            Component::CurDir => {}
1748            _ => result.push(component),
1749        }
1750    }
1751    result
1752}
1753
1754fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
1755    if let Ok(canonical) = fs::canonicalize(path) {
1756        return canonical;
1757    }
1758
1759    let Some(parent) = path.parent() else {
1760        return path.to_path_buf();
1761    };
1762    let Some(file_name) = path.file_name() else {
1763        return path.to_path_buf();
1764    };
1765
1766    fs::canonicalize(parent)
1767        .map(|canonical_parent| canonical_parent.join(file_name))
1768        .unwrap_or_else(|_| path.to_path_buf())
1769}
1770
1771/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1772/// since the index was last written. Also detect new files and deleted files.
1773fn verify_file_mtimes(index: &mut SearchIndex) {
1774    let mut stale_paths = Vec::new();
1775
1776    for entry in &mut index.files {
1777        if entry.path.as_os_str().is_empty() {
1778            continue; // tombstoned entry
1779        }
1780        let cached = FileFreshness {
1781            mtime: entry.modified,
1782            size: entry.size,
1783            content_hash: entry.content_hash,
1784        };
1785        match cache_freshness::verify_file_strict(&entry.path, &cached) {
1786            FreshnessVerdict::HotFresh => {}
1787            FreshnessVerdict::ContentFresh {
1788                new_mtime,
1789                new_size,
1790            } => {
1791                entry.modified = new_mtime;
1792                entry.size = new_size;
1793            }
1794            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
1795                stale_paths.push(entry.path.clone())
1796            }
1797        }
1798    }
1799
1800    // Re-index stale files
1801    for path in &stale_paths {
1802        index.update_file(path);
1803    }
1804
1805    // Detect new files not in the index
1806    let filters = PathFilters::default();
1807    for path in walk_project_files(&index.project_root, &filters) {
1808        if !index.path_to_id.contains_key(&path) {
1809            index.update_file(&path);
1810        }
1811    }
1812
1813    if !stale_paths.is_empty() {
1814        crate::slog_info!(
1815            "search index: refreshed {} stale file(s) from disk cache",
1816            stale_paths.len()
1817        );
1818    }
1819}
1820
1821fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1822    normalize_path(path).starts_with(normalize_path(search_root))
1823}
1824
1825impl QueryBuild {
1826    fn into_query(self) -> RegexQuery {
1827        let mut query = RegexQuery::default();
1828
1829        for run in self.and_runs {
1830            add_run_to_and_query(&mut query, &run);
1831        }
1832
1833        for group in self.or_groups {
1834            let mut trigrams = BTreeSet::new();
1835            let mut filters = HashMap::new();
1836            for run in group {
1837                for (trigram, filter) in trigram_filters(&run) {
1838                    trigrams.insert(trigram);
1839                    merge_filter(filters.entry(trigram).or_default(), filter);
1840                }
1841            }
1842            if !trigrams.is_empty() {
1843                query.or_groups.push(trigrams.into_iter().collect());
1844                query.or_filters.push(filters);
1845            }
1846        }
1847
1848        query
1849    }
1850}
1851
1852fn build_query(hir: &Hir) -> QueryBuild {
1853    match hir.kind() {
1854        HirKind::Literal(literal) => {
1855            if literal.0.len() >= 3 {
1856                QueryBuild {
1857                    and_runs: vec![literal.0.to_vec()],
1858                    or_groups: Vec::new(),
1859                }
1860            } else {
1861                QueryBuild::default()
1862            }
1863        }
1864        HirKind::Capture(capture) => build_query(&capture.sub),
1865        HirKind::Concat(parts) => {
1866            let mut build = QueryBuild::default();
1867            for part in parts {
1868                let part_build = build_query(part);
1869                build.and_runs.extend(part_build.and_runs);
1870                build.or_groups.extend(part_build.or_groups);
1871            }
1872            build
1873        }
1874        HirKind::Alternation(parts) => {
1875            let mut group = Vec::new();
1876            for part in parts {
1877                let Some(mut choices) = guaranteed_run_choices(part) else {
1878                    return QueryBuild::default();
1879                };
1880                group.append(&mut choices);
1881            }
1882            if group.is_empty() {
1883                QueryBuild::default()
1884            } else {
1885                QueryBuild {
1886                    and_runs: Vec::new(),
1887                    or_groups: vec![group],
1888                }
1889            }
1890        }
1891        HirKind::Repetition(repetition) => {
1892            if repetition.min == 0 {
1893                QueryBuild::default()
1894            } else {
1895                build_query(&repetition.sub)
1896            }
1897        }
1898        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1899    }
1900}
1901
1902fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1903    match hir.kind() {
1904        HirKind::Literal(literal) => {
1905            if literal.0.len() >= 3 {
1906                Some(vec![literal.0.to_vec()])
1907            } else {
1908                None
1909            }
1910        }
1911        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1912        HirKind::Concat(parts) => {
1913            let mut runs = Vec::new();
1914            for part in parts {
1915                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1916                    runs.append(&mut part_runs);
1917                }
1918            }
1919            if runs.is_empty() {
1920                None
1921            } else {
1922                Some(runs)
1923            }
1924        }
1925        HirKind::Alternation(parts) => {
1926            let mut runs = Vec::new();
1927            for part in parts {
1928                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1929                    return None;
1930                };
1931                runs.append(&mut part_runs);
1932            }
1933            if runs.is_empty() {
1934                None
1935            } else {
1936                Some(runs)
1937            }
1938        }
1939        HirKind::Repetition(repetition) => {
1940            if repetition.min == 0 {
1941                None
1942            } else {
1943                guaranteed_run_choices(&repetition.sub)
1944            }
1945        }
1946        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1947    }
1948}
1949
1950fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1951    for (trigram, filter) in trigram_filters(run) {
1952        if !query.and_trigrams.contains(&trigram) {
1953            query.and_trigrams.push(trigram);
1954        }
1955        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1956    }
1957}
1958
1959fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1960    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1961    for (trigram, next_char, position) in extract_trigrams(run) {
1962        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1963        if next_char != EOF_SENTINEL {
1964            entry.next_mask |= mask_for_next_char(next_char);
1965        }
1966        entry.loc_mask |= mask_for_position(position);
1967    }
1968    filters.into_iter().collect()
1969}
1970
1971fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1972    target.next_mask |= filter.next_mask;
1973    target.loc_mask |= filter.loc_mask;
1974}
1975
1976fn mask_for_next_char(next_char: u8) -> u8 {
1977    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1978    1u8 << bit
1979}
1980
1981fn mask_for_position(position: usize) -> u8 {
1982    1u8 << (position % 8)
1983}
1984
1985fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1986    if patterns.is_empty() {
1987        return Ok(None);
1988    }
1989
1990    let mut builder = GlobSetBuilder::new();
1991    for pattern in patterns {
1992        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1993        builder.add(glob);
1994    }
1995    builder.build().map(Some).map_err(|error| error.to_string())
1996}
1997
1998fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1999    let mut buffer = [0u8; 4];
2000    reader.read_exact(&mut buffer)?;
2001    Ok(u32::from_le_bytes(buffer))
2002}
2003
2004fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
2005    let mut buffer = [0u8; 8];
2006    reader.read_exact(&mut buffer)?;
2007    Ok(u64::from_le_bytes(buffer))
2008}
2009
2010fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
2011    writer.write_all(&value.to_le_bytes())
2012}
2013
2014fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
2015    writer.write_all(&value.to_le_bytes())
2016}
2017
2018fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2019    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2020        return Err(std::io::Error::other("search index checksum missing"));
2021    };
2022    let expected = u32::from_le_bytes(*stored);
2023    let actual = crc32fast::hash(body);
2024    if actual != expected {
2025        return Err(std::io::Error::other("search index checksum mismatch"));
2026    }
2027    Ok(())
2028}
2029
2030fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2031    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2032    total_len.checked_sub(pos)
2033}
2034
2035fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2036    let output = Command::new("git")
2037        .arg("-C")
2038        .arg(root)
2039        .args(args)
2040        .output()
2041        .ok()?;
2042    if !output.status.success() {
2043        return None;
2044    }
2045    let value = String::from_utf8(output.stdout).ok()?;
2046    let value = value.trim().to_string();
2047    if value.is_empty() {
2048        None
2049    } else {
2050        Some(value)
2051    }
2052}
2053
2054fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2055    let diff_range = format!("{}..{}", from, to);
2056    let output = match Command::new("git")
2057        .arg("-C")
2058        .arg(root)
2059        .args(["diff", "--name-status", "-M", &diff_range])
2060        .output()
2061    {
2062        Ok(output) => output,
2063        Err(_) => return false,
2064    };
2065
2066    if !output.status.success() {
2067        return false;
2068    }
2069
2070    let Ok(diff) = String::from_utf8(output.stdout) else {
2071        return false;
2072    };
2073
2074    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2075        let mut fields = line.split('\t');
2076        let Some(status) = fields.next() else {
2077            continue;
2078        };
2079
2080        if status.starts_with('R') {
2081            let Some(old_path) = fields
2082                .next()
2083                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2084            else {
2085                continue;
2086            };
2087            let Some(new_path) = fields
2088                .next()
2089                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2090            else {
2091                continue;
2092            };
2093            index.remove_file(&old_path);
2094            index.update_file(&new_path);
2095            continue;
2096        }
2097
2098        let Some(path) = fields
2099            .next()
2100            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2101        else {
2102            continue;
2103        };
2104        if status.starts_with('D') || !path.exists() {
2105            index.remove_file(&path);
2106        } else {
2107            index.update_file(&path);
2108        }
2109    }
2110
2111    true
2112}
2113
2114fn is_binary_path(path: &Path, size: u64) -> bool {
2115    if size == 0 {
2116        return false;
2117    }
2118
2119    let mut file = match File::open(path) {
2120        Ok(file) => file,
2121        Err(_) => return true,
2122    };
2123
2124    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2125    match file.read(&mut preview) {
2126        Ok(read) => is_binary_bytes(&preview[..read]),
2127        Err(_) => true,
2128    }
2129}
2130
2131fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2132    let mut starts = vec![0usize];
2133    for (index, byte) in content.iter().copied().enumerate() {
2134        if byte == b'\n' {
2135            starts.push(index + 1);
2136        }
2137    }
2138    starts
2139}
2140
2141fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2142    let line_index = match line_starts.binary_search(&offset) {
2143        Ok(index) => index,
2144        Err(index) => index.saturating_sub(1),
2145    };
2146    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2147    let line_end = content[line_start..]
2148        .iter()
2149        .position(|byte| *byte == b'\n')
2150        .map(|length| line_start + length)
2151        .unwrap_or(content.len());
2152    let mut line_slice = &content[line_start..line_end];
2153    if line_slice.ends_with(b"\r") {
2154        line_slice = &line_slice[..line_slice.len() - 1];
2155    }
2156    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2157    let column = String::from_utf8_lossy(&content[line_start..offset])
2158        .chars()
2159        .count() as u32
2160        + 1;
2161    (line_index as u32 + 1, column, line_text)
2162}
2163
2164fn to_glob_path(path: &Path) -> String {
2165    path.to_string_lossy().replace('\\', "/")
2166}
2167
2168#[cfg(test)]
2169mod tests {
2170    use std::process::Command;
2171
2172    use super::*;
2173
2174    #[test]
2175    fn cached_path_under_root_allows_missing_lexical_child() {
2176        let dir = tempfile::tempdir().expect("create temp dir");
2177        let project = dir.path().join("project");
2178        fs::create_dir_all(&project).expect("create project dir");
2179        let root = fs::canonicalize(&project).expect("canonicalize project");
2180
2181        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2182            .expect("missing child should fall back to lexical validation");
2183
2184        assert_eq!(path, root.join("future/file.rs"));
2185    }
2186
2187    #[cfg(unix)]
2188    #[test]
2189    fn cached_path_under_root_rejects_symlink_escape() {
2190        let dir = tempfile::tempdir().expect("create temp dir");
2191        let project = dir.path().join("project");
2192        let outside = dir.path().join("outside");
2193        fs::create_dir_all(&project).expect("create project dir");
2194        fs::create_dir_all(&outside).expect("create outside dir");
2195        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2196        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2197        let root = fs::canonicalize(&project).expect("canonicalize project");
2198
2199        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2200    }
2201
2202    #[test]
2203    fn extract_trigrams_tracks_next_char_and_position() {
2204        let trigrams = extract_trigrams(b"Rust");
2205        assert_eq!(trigrams.len(), 2);
2206        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2207        assert_eq!(
2208            trigrams[1],
2209            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2210        );
2211    }
2212
2213    #[test]
2214    fn decompose_regex_extracts_literals_and_alternations() {
2215        let query = decompose_regex("abc(def|ghi)xyz");
2216        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2217        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2218        assert_eq!(query.or_groups.len(), 1);
2219        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2220        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2221    }
2222
2223    #[test]
2224    fn candidates_intersect_posting_lists() {
2225        let mut index = SearchIndex::new();
2226        let dir = tempfile::tempdir().expect("create temp dir");
2227        let alpha = dir.path().join("alpha.txt");
2228        let beta = dir.path().join("beta.txt");
2229        fs::write(&alpha, "abcdef").expect("write alpha");
2230        fs::write(&beta, "abcxyz").expect("write beta");
2231        index.project_root = dir.path().to_path_buf();
2232        index.index_file(&alpha, b"abcdef");
2233        index.index_file(&beta, b"abcxyz");
2234
2235        let query = RegexQuery {
2236            and_trigrams: vec![
2237                pack_trigram(b'a', b'b', b'c'),
2238                pack_trigram(b'd', b'e', b'f'),
2239            ],
2240            ..RegexQuery::default()
2241        };
2242
2243        let candidates = index.candidates(&query);
2244        assert_eq!(candidates.len(), 1);
2245        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2246    }
2247
2248    #[test]
2249    fn candidates_apply_bloom_filters() {
2250        let mut index = SearchIndex::new();
2251        let dir = tempfile::tempdir().expect("create temp dir");
2252        let file = dir.path().join("sample.txt");
2253        fs::write(&file, "abcd efgh").expect("write sample");
2254        index.project_root = dir.path().to_path_buf();
2255        index.index_file(&file, b"abcd efgh");
2256
2257        let trigram = pack_trigram(b'a', b'b', b'c');
2258        let matching_filter = PostingFilter {
2259            next_mask: mask_for_next_char(b'd'),
2260            loc_mask: mask_for_position(0),
2261        };
2262        let non_matching_filter = PostingFilter {
2263            next_mask: mask_for_next_char(b'z'),
2264            loc_mask: mask_for_position(0),
2265        };
2266
2267        assert_eq!(
2268            index
2269                .postings_for_trigram(trigram, Some(matching_filter))
2270                .len(),
2271            1
2272        );
2273        assert!(index
2274            .postings_for_trigram(trigram, Some(non_matching_filter))
2275            .is_empty());
2276    }
2277
2278    #[test]
2279    fn disk_round_trip_preserves_postings_and_files() {
2280        let dir = tempfile::tempdir().expect("create temp dir");
2281        let project = dir.path().join("project");
2282        fs::create_dir_all(&project).expect("create project dir");
2283        let file = project.join("src.txt");
2284        fs::write(&file, "abcdef").expect("write source");
2285
2286        let mut index = SearchIndex::build(&project);
2287        index.git_head = Some("deadbeef".to_string());
2288        let cache_dir = dir.path().join("cache");
2289        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2290
2291        let loaded =
2292            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2293        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2294        assert_eq!(loaded.files.len(), 1);
2295        assert_eq!(
2296            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2297            PathBuf::from("src.txt")
2298        );
2299        assert_eq!(loaded.postings.len(), index.postings.len());
2300        assert!(loaded
2301            .postings
2302            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2303    }
2304
2305    #[test]
2306    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2307        let root = PathBuf::from("/tmp/aft-project");
2308
2309        assert_eq!(
2310            cache_relative_path(&root, &root.join("src/lib.rs")),
2311            Some(PathBuf::from("src/lib.rs"))
2312        );
2313        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2314        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2315        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2316        assert_eq!(
2317            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2318            Some(root.join("src/lib.rs"))
2319        );
2320    }
2321
2322    #[test]
2323    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2324        let dir = tempfile::tempdir().expect("create temp dir");
2325        let project = dir.path().join("project");
2326        fs::create_dir_all(&project).expect("create project dir");
2327        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2328        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2329        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2330
2331        Command::new("git")
2332            .arg("init")
2333            .arg(&project)
2334            .status()
2335            .expect("git init");
2336        for args in [
2337            ["config", "user.email", "aft@example.invalid"],
2338            ["config", "user.name", "AFT Test"],
2339        ] {
2340            Command::new("git")
2341                .arg("-C")
2342                .arg(&project)
2343                .args(args)
2344                .status()
2345                .expect("git config");
2346        }
2347        Command::new("git")
2348            .arg("-C")
2349            .arg(&project)
2350            .args(["add", "."])
2351            .status()
2352            .expect("git add initial");
2353        Command::new("git")
2354            .arg("-C")
2355            .arg(&project)
2356            .args(["commit", "-m", "initial"])
2357            .status()
2358            .expect("git commit initial");
2359        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2360        let mut baseline = SearchIndex::build(&project);
2361        baseline.git_head = Some(previous.clone());
2362
2363        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2364        Command::new("git")
2365            .arg("-C")
2366            .arg(&project)
2367            .args(["add", "-A"])
2368            .status()
2369            .expect("git add rename");
2370        Command::new("git")
2371            .arg("-C")
2372            .arg(&project)
2373            .args(["commit", "-m", "rename"])
2374            .status()
2375            .expect("git commit rename");
2376        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2377
2378        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2379        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2380
2381        let refreshed = SearchIndex::rebuild_or_refresh(
2382            &project,
2383            DEFAULT_MAX_FILE_SIZE,
2384            Some(current),
2385            Some(baseline),
2386        );
2387
2388        assert!(!refreshed
2389            .path_to_id
2390            .contains_key(&canonical_project.join("old.txt")));
2391        assert!(refreshed
2392            .path_to_id
2393            .contains_key(&canonical_project.join("new.txt")));
2394        assert!(refreshed
2395            .path_to_id
2396            .contains_key(&canonical_project.join("untracked.txt")));
2397        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2398        assert_eq!(matches.matches.len(), 1);
2399    }
2400
2401    #[test]
2402    fn read_from_disk_rejects_corrupt_postings_checksum() {
2403        let dir = tempfile::tempdir().expect("create temp dir");
2404        let project = dir.path().join("project");
2405        fs::create_dir_all(&project).expect("create project dir");
2406        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2407
2408        let index = SearchIndex::build(&project);
2409        let cache_dir = dir.path().join("cache");
2410        index.write_to_disk(&cache_dir, None);
2411
2412        let cache_path = cache_dir.join("cache.bin");
2413        let mut bytes = fs::read(&cache_path).expect("read cache");
2414        let middle = bytes.len() / 2;
2415        bytes[middle] ^= 0xff;
2416        fs::write(&cache_path, bytes).expect("write corrupted cache");
2417
2418        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2419    }
2420
2421    #[test]
2422    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2423        let dir = tempfile::tempdir().expect("create temp dir");
2424        let project = dir.path().join("project");
2425        fs::create_dir_all(&project).expect("create project dir");
2426        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2427
2428        let index = SearchIndex::build(&project);
2429        let cache_dir = dir.path().join("cache");
2430        index.write_to_disk(&cache_dir, None);
2431
2432        assert!(cache_dir.join("cache.bin").is_file());
2433        assert!(fs::read_dir(&cache_dir)
2434            .expect("read cache dir")
2435            .all(|entry| !entry
2436                .expect("cache entry")
2437                .file_name()
2438                .to_string_lossy()
2439                .contains(".tmp.")));
2440    }
2441
2442    #[test]
2443    fn concurrent_search_index_writes_do_not_corrupt() {
2444        let dir = tempfile::tempdir().expect("create temp dir");
2445        let project = dir.path().join("project");
2446        fs::create_dir_all(&project).expect("create project dir");
2447        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2448        let cache_dir = dir.path().join("cache");
2449
2450        let a_project = project.clone();
2451        let a_cache = cache_dir.clone();
2452        let a = std::thread::spawn(move || {
2453            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2454            let index = SearchIndex::build(&a_project);
2455            index.write_to_disk(&a_cache, None);
2456        });
2457        let b_project = project.clone();
2458        let b_cache = cache_dir.clone();
2459        let b = std::thread::spawn(move || {
2460            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2461            let index = SearchIndex::build(&b_project);
2462            index.write_to_disk(&b_cache, None);
2463        });
2464        a.join().expect("writer a");
2465        b.join().expect("writer b");
2466
2467        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2468    }
2469
2470    #[test]
2471    fn search_index_atomic_rename_survives_partial_write() {
2472        let dir = tempfile::tempdir().expect("create temp dir");
2473        let cache_dir = dir.path().join("cache");
2474        fs::create_dir_all(&cache_dir).expect("create cache dir");
2475        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2476
2477        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2478    }
2479
2480    #[test]
2481    fn project_cache_key_includes_checkout_path() {
2482        let dir = tempfile::tempdir().expect("create temp dir");
2483        let source = dir.path().join("source");
2484        fs::create_dir_all(&source).expect("create source repo dir");
2485        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2486
2487        assert!(Command::new("git")
2488            .current_dir(&source)
2489            .args(["init"])
2490            .status()
2491            .expect("init git repo")
2492            .success());
2493        assert!(Command::new("git")
2494            .current_dir(&source)
2495            .args(["add", "."])
2496            .status()
2497            .expect("git add")
2498            .success());
2499        assert!(Command::new("git")
2500            .current_dir(&source)
2501            .args([
2502                "-c",
2503                "user.name=AFT Tests",
2504                "-c",
2505                "user.email=aft-tests@example.com",
2506                "commit",
2507                "-m",
2508                "initial",
2509            ])
2510            .status()
2511            .expect("git commit")
2512            .success());
2513
2514        let clone = dir.path().join("clone");
2515        assert!(Command::new("git")
2516            .args(["clone", "--quiet"])
2517            .arg(&source)
2518            .arg(&clone)
2519            .status()
2520            .expect("git clone")
2521            .success());
2522
2523        let source_key = project_cache_key(&source);
2524        let clone_key = project_cache_key(&clone);
2525
2526        assert_eq!(source_key.len(), 16);
2527        assert_eq!(clone_key.len(), 16);
2528        // Same repo (same root commit) → same cache key regardless of clone path
2529        assert_eq!(source_key, clone_key);
2530    }
2531
2532    #[test]
2533    fn git_head_unchanged_picks_up_local_edits() {
2534        let dir = tempfile::tempdir().expect("create temp dir");
2535        let project = dir.path().join("repo");
2536        fs::create_dir_all(&project).expect("create repo dir");
2537        let file = project.join("tracked.txt");
2538        fs::write(&file, "oldtoken\n").expect("write file");
2539        assert!(Command::new("git")
2540            .current_dir(&project)
2541            .arg("init")
2542            .status()
2543            .unwrap()
2544            .success());
2545        assert!(Command::new("git")
2546            .current_dir(&project)
2547            .args(["add", "."])
2548            .status()
2549            .unwrap()
2550            .success());
2551        assert!(Command::new("git")
2552            .current_dir(&project)
2553            .args([
2554                "-c",
2555                "user.name=AFT Tests",
2556                "-c",
2557                "user.email=aft-tests@example.com",
2558                "commit",
2559                "-m",
2560                "initial"
2561            ])
2562            .status()
2563            .unwrap()
2564            .success());
2565        let head = current_git_head(&project);
2566        let mut baseline = SearchIndex::build(&project);
2567        baseline.git_head = head.clone();
2568        fs::write(&file, "newtoken\n").expect("edit tracked file");
2569
2570        let refreshed =
2571            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2572        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
2573
2574        assert_eq!(result.total_matches, 1);
2575    }
2576
2577    #[test]
2578    fn non_git_project_reuses_cache_when_files_unchanged() {
2579        let dir = tempfile::tempdir().expect("create temp dir");
2580        let project = dir.path().join("project");
2581        fs::create_dir_all(&project).expect("create project dir");
2582        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2583        let baseline = SearchIndex::build(&project);
2584        let baseline_file_count = baseline.file_count();
2585
2586        let refreshed =
2587            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2588
2589        assert_eq!(refreshed.file_count(), baseline_file_count);
2590        assert_eq!(
2591            refreshed
2592                .grep("unchangedtoken", true, &[], &[], &project, 10)
2593                .total_matches,
2594            1
2595        );
2596    }
2597
2598    #[test]
2599    fn resolve_search_scope_disables_index_for_external_path() {
2600        let dir = tempfile::tempdir().expect("create temp dir");
2601        let project = dir.path().join("project");
2602        let outside = dir.path().join("outside");
2603        fs::create_dir_all(&project).expect("create project dir");
2604        fs::create_dir_all(&outside).expect("create outside dir");
2605
2606        let scope = resolve_search_scope(&project, outside.to_str());
2607
2608        assert_eq!(
2609            scope.root,
2610            fs::canonicalize(&outside).expect("canonicalize outside")
2611        );
2612        assert!(!scope.use_index);
2613    }
2614
2615    #[test]
2616    fn grep_filters_matches_to_search_root() {
2617        let dir = tempfile::tempdir().expect("create temp dir");
2618        let project = dir.path().join("project");
2619        let src = project.join("src");
2620        let docs = project.join("docs");
2621        fs::create_dir_all(&src).expect("create src dir");
2622        fs::create_dir_all(&docs).expect("create docs dir");
2623        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2624        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2625
2626        let index = SearchIndex::build(&project);
2627        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2628
2629        assert_eq!(result.files_searched, 1);
2630        assert_eq!(result.files_with_matches, 1);
2631        assert_eq!(result.matches.len(), 1);
2632        // Index stores canonicalized paths; on macOS /var → /private/var
2633        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2634        assert_eq!(result.matches[0].file, expected);
2635    }
2636
2637    #[test]
2638    fn grep_deduplicates_multiple_matches_on_same_line() {
2639        let dir = tempfile::tempdir().expect("create temp dir");
2640        let project = dir.path().join("project");
2641        let src = project.join("src");
2642        fs::create_dir_all(&src).expect("create src dir");
2643        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2644
2645        let index = SearchIndex::build(&project);
2646        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2647
2648        assert_eq!(result.total_matches, 1);
2649        assert_eq!(result.matches.len(), 1);
2650    }
2651
2652    #[test]
2653    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
2654        let dir = tempfile::tempdir().expect("create temp dir");
2655        let project = dir.path().join("project");
2656        fs::create_dir_all(&project).expect("create project dir");
2657        let file = project.join("unicode.txt");
2658        fs::write(&file, "äbc\n").expect("write unicode file");
2659
2660        let index = SearchIndex::build(&project);
2661        let result = index.grep("Äbc", false, &[], &[], &project, 10);
2662
2663        assert_eq!(result.total_matches, 1);
2664        assert_eq!(result.matches.len(), 1);
2665        assert_eq!(
2666            result.matches[0].file,
2667            fs::canonicalize(file).expect("canonicalize unicode file")
2668        );
2669    }
2670
2671    #[test]
2672    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
2673        let dir = tempfile::tempdir().expect("create temp dir");
2674        let project = dir.path().join("project");
2675        fs::create_dir_all(&project).expect("create project dir");
2676        let file = project.join("tokens.txt");
2677        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
2678        fs::write(&file, "alpha").expect("write original file");
2679        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
2680
2681        let baseline = SearchIndex::build(&project);
2682        fs::write(&file, "bravo").expect("write same-size edit");
2683        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
2684
2685        let refreshed =
2686            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2687        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
2688        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
2689        let refreshed_id = *refreshed
2690            .path_to_id
2691            .get(&canonical_file)
2692            .expect("file remains indexed");
2693
2694        assert_eq!(result.total_matches, 1);
2695        assert!(refreshed
2696            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
2697            .contains(&refreshed_id));
2698        assert!(!refreshed
2699            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
2700            .contains(&refreshed_id));
2701    }
2702
2703    #[test]
2704    fn grep_reports_total_matches_before_truncation() {
2705        let dir = tempfile::tempdir().expect("create temp dir");
2706        let project = dir.path().join("project");
2707        let src = project.join("src");
2708        fs::create_dir_all(&src).expect("create src dir");
2709        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2710
2711        let index = SearchIndex::build(&project);
2712        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
2713
2714        assert_eq!(result.total_matches, 2);
2715        assert_eq!(result.matches.len(), 1);
2716        assert!(result.truncated);
2717    }
2718
2719    #[test]
2720    fn glob_filters_results_to_search_root() {
2721        let dir = tempfile::tempdir().expect("create temp dir");
2722        let project = dir.path().join("project");
2723        let src = project.join("src");
2724        let scripts = project.join("scripts");
2725        fs::create_dir_all(&src).expect("create src dir");
2726        fs::create_dir_all(&scripts).expect("create scripts dir");
2727        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2728        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2729
2730        let index = SearchIndex::build(&project);
2731        let files = index.glob("**/*.rs", &src);
2732
2733        assert_eq!(
2734            files,
2735            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2736        );
2737    }
2738
2739    #[test]
2740    fn glob_includes_hidden_and_binary_files() {
2741        let dir = tempfile::tempdir().expect("create temp dir");
2742        let project = dir.path().join("project");
2743        let hidden_dir = project.join(".hidden");
2744        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2745        let hidden_file = hidden_dir.join("data.bin");
2746        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2747
2748        let index = SearchIndex::build(&project);
2749        let files = index.glob("**/*.bin", &project);
2750
2751        assert_eq!(
2752            files,
2753            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2754        );
2755    }
2756
2757    #[test]
2758    fn read_from_disk_rejects_invalid_nanos() {
2759        let dir = tempfile::tempdir().expect("create temp dir");
2760        let cache_dir = dir.path().join("cache");
2761        fs::create_dir_all(&cache_dir).expect("create cache dir");
2762
2763        let mut postings = Vec::new();
2764        postings.extend_from_slice(INDEX_MAGIC);
2765        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2766        postings.extend_from_slice(&0u32.to_le_bytes());
2767        postings.extend_from_slice(&1u32.to_le_bytes());
2768        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2769        postings.extend_from_slice(&1u32.to_le_bytes());
2770        postings.extend_from_slice(b"/");
2771        postings.push(0u8);
2772        postings.extend_from_slice(&1u32.to_le_bytes());
2773        postings.extend_from_slice(&0u64.to_le_bytes());
2774        postings.extend_from_slice(&0u64.to_le_bytes());
2775        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2776        postings.extend_from_slice(b"a");
2777        postings.extend_from_slice(&0u64.to_le_bytes());
2778
2779        let mut lookup = Vec::new();
2780        lookup.extend_from_slice(LOOKUP_MAGIC);
2781        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2782        lookup.extend_from_slice(&0u32.to_le_bytes());
2783
2784        let postings_checksum = crc32fast::hash(&postings);
2785        postings.extend_from_slice(&postings_checksum.to_le_bytes());
2786        let lookup_checksum = crc32fast::hash(&lookup);
2787        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2788        let mut cache = Vec::new();
2789        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2790        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2791        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2792        cache.extend_from_slice(&postings);
2793        cache.extend_from_slice(&lookup);
2794        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2795
2796        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2797    }
2798
2799    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2800    /// changed between cmp() calls.
2801    ///
2802    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2803    /// which does a `stat()` syscall. If the file was deleted, modified, or
2804    /// touched mid-sort, the comparator returned different values for the
2805    /// same input pair on different invocations. Rust's slice::sort detects
2806    /// this and panics with "user-provided comparison function does not
2807    /// correctly implement a total order".
2808    ///
2809    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2810    /// bridge invalidated files in parallel with grep's sort path. This
2811    /// test simulates the worst case: most paths don't exist (Err from
2812    /// fs::metadata) and sort still completes successfully.
2813    #[test]
2814    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2815        // Mix of existing and non-existing paths in deliberately
2816        // non-monotonic order — pre-fix, the sort would call stat() at
2817        // least N log N times and any flakiness would trigger the panic.
2818        let dir = tempfile::tempdir().expect("create tempdir");
2819        let mut paths: Vec<PathBuf> = Vec::new();
2820        for i in 0..30 {
2821            // Half exist, half don't.
2822            let path = if i % 2 == 0 {
2823                let p = dir.path().join(format!("real-{i}.rs"));
2824                fs::write(&p, format!("// {i}\n")).expect("write");
2825                p
2826            } else {
2827                dir.path().join(format!("missing-{i}.rs"))
2828            };
2829            paths.push(path);
2830        }
2831
2832        // Run the sort many times to maximise the chance of catching any
2833        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2834        for _ in 0..50 {
2835            let mut copy = paths.clone();
2836            sort_paths_by_mtime_desc(&mut copy);
2837            assert_eq!(copy.len(), paths.len());
2838        }
2839    }
2840
2841    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2842    /// the same conditions as sort_paths_by_mtime_desc. See the
2843    /// sort_paths_... test above for the full rationale.
2844    #[test]
2845    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2846        let dir = tempfile::tempdir().expect("create tempdir");
2847        let mut matches: Vec<GrepMatch> = Vec::new();
2848        for i in 0..30 {
2849            let file = if i % 2 == 0 {
2850                let p = dir.path().join(format!("real-{i}.rs"));
2851                fs::write(&p, format!("// {i}\n")).expect("write");
2852                p
2853            } else {
2854                dir.path().join(format!("missing-{i}.rs"))
2855            };
2856            matches.push(GrepMatch {
2857                file,
2858                line: u32::try_from(i).unwrap_or(0),
2859                column: 0,
2860                line_text: format!("match {i}"),
2861                match_text: format!("match {i}"),
2862            });
2863        }
2864
2865        for _ in 0..50 {
2866            let mut copy = matches.clone();
2867            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2868            assert_eq!(copy.len(), matches.len());
2869        }
2870    }
2871}
aft/search_index.rs

aft/
search_index.rs