aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
24const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const INDEX_VERSION: u32 = 3;
27const PREVIEW_BYTES: usize = 8 * 1024;
28const EOF_SENTINEL: u8 = 0;
29const MAX_ENTRIES: usize = 10_000_000;
30const MIN_FILE_ENTRY_BYTES: usize = 57;
31const LOOKUP_ENTRY_BYTES: usize = 16;
32const POSTING_BYTES: usize = 6;
33static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
35pub struct CacheLock {
36    _guard: fs_lock::LockGuard,
37}
38
39impl CacheLock {
40    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41        fs::create_dir_all(cache_dir)?;
42        let path = cache_dir.join("cache.lock");
43        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44            .lock()
45            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46        fs_lock::try_acquire(&path, Duration::from_secs(2))
47            .map(|guard| Self { _guard: guard })
48            .map_err(|error| match error {
49                fs_lock::AcquireError::Timeout => {
50                    std::io::Error::other("timed out acquiring search cache lock")
51                }
52                fs_lock::AcquireError::Io(error) => error,
53            })
54    }
55}
56
57#[derive(Clone, Debug)]
58pub struct SearchIndex {
59    pub postings: HashMap<u32, Vec<Posting>>,
60    pub files: Vec<FileEntry>,
61    pub path_to_id: HashMap<PathBuf, u32>,
62    pub ready: bool,
63    project_root: PathBuf,
64    git_head: Option<String>,
65    max_file_size: u64,
66    pub file_trigrams: HashMap<u32, Vec<u32>>,
67    unindexed_files: HashSet<u32>,
68}
69
70#[derive(Clone, Debug, Default)]
71pub struct LexicalRankResult {
72    pub files: Vec<(PathBuf, f32)>,
73    pub engine_capped: bool,
74}
75
76impl SearchIndex {
77    /// Number of indexed files.
78    pub fn file_count(&self) -> usize {
79        self.files.len()
80    }
81
82    /// Number of unique trigrams in the index.
83    pub fn trigram_count(&self) -> usize {
84        self.postings.len()
85    }
86
87    /// Compute distinct query trigrams from literal tokens.
88    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
89        query_trigrams_from_tokens(tokens)
90    }
91
92    /// Score-rank file candidates by lexical relevance to query trigrams.
93    pub fn lexical_rank(
94        &self,
95        query_trigrams: &[u32],
96        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
97        max_files: usize,
98    ) -> Vec<(PathBuf, f32)> {
99        self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
100            .files
101    }
102
103    /// Score-rank file candidates and report whether candidate enumeration hit
104    /// the internal 200/500 cap before ranking.
105    pub fn lexical_rank_with_stats(
106        &self,
107        query_trigrams: &[u32],
108        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
109        max_files: usize,
110    ) -> LexicalRankResult {
111        if query_trigrams.is_empty() || max_files == 0 {
112            return LexicalRankResult::default();
113        }
114
115        let mut non_zero: Vec<(u32, usize)> = query_trigrams
116            .iter()
117            .filter_map(|trigram| {
118                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
119                (posting_count > 0).then_some((*trigram, posting_count))
120            })
121            .collect();
122        if non_zero.is_empty() {
123            return LexicalRankResult::default();
124        }
125
126        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
127        let selected_count = non_zero.len().min(3);
128        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
129
130        let mut candidate_ids = BTreeSet::new();
131        for (trigram, _) in non_zero.iter().take(selected_count) {
132            if let Some(postings) = self.postings.get(trigram) {
133                for posting in postings {
134                    if self.is_active_file(posting.file_id) {
135                        candidate_ids.insert(posting.file_id);
136                    }
137                }
138            }
139        }
140        let engine_capped = candidate_ids.len() > candidate_cap;
141
142        let mut ranked = Vec::new();
143        for file_id in candidate_ids.into_iter().take(candidate_cap) {
144            let Some(entry) = self.files.get(file_id as usize) else {
145                continue;
146            };
147            if let Some(filter) = candidate_filter {
148                if !filter(&entry.path) {
149                    continue;
150                }
151            }
152            let score = lexical_score(self, query_trigrams, file_id);
153            if score > 0.0 {
154                ranked.push((entry.path.clone(), score));
155            }
156        }
157
158        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
159        ranked.truncate(max_files);
160        LexicalRankResult {
161            files: ranked,
162            engine_capped,
163        }
164    }
165}
166
167#[derive(Clone, Debug, PartialEq, Eq)]
168pub struct Posting {
169    pub file_id: u32,
170    pub next_mask: u8,
171    pub loc_mask: u8,
172}
173
174#[derive(Clone, Debug)]
175pub struct FileEntry {
176    pub path: PathBuf,
177    pub size: u64,
178    pub modified: SystemTime,
179    pub content_hash: blake3::Hash,
180}
181
182#[derive(Clone, Debug, PartialEq, Eq)]
183pub struct GrepMatch {
184    pub file: PathBuf,
185    pub line: u32,
186    pub column: u32,
187    pub line_text: String,
188    pub match_text: String,
189}
190
191#[derive(Clone, Debug)]
192pub struct GrepResult {
193    pub matches: Vec<GrepMatch>,
194    pub total_matches: usize,
195    pub files_searched: usize,
196    pub files_with_matches: usize,
197    pub index_status: IndexStatus,
198    pub truncated: bool,
199    pub fully_degraded: bool,
200    pub engine_capped: bool,
201}
202
203#[derive(Clone, Copy, Debug, PartialEq, Eq)]
204pub enum IndexStatus {
205    Ready,
206    Building,
207    Fallback,
208    Disabled,
209}
210
211impl IndexStatus {
212    pub fn as_str(&self) -> &'static str {
213        match self {
214            IndexStatus::Ready => "Ready",
215            IndexStatus::Building => "Building",
216            IndexStatus::Fallback => "Fallback",
217            IndexStatus::Disabled => "Disabled",
218        }
219    }
220}
221
222#[derive(Clone, Debug, Default)]
223pub struct RegexQuery {
224    pub and_trigrams: Vec<u32>,
225    pub or_groups: Vec<Vec<u32>>,
226    pub(crate) and_filters: HashMap<u32, PostingFilter>,
227    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
228}
229
230#[derive(Clone, Copy, Debug, Default)]
231pub(crate) struct PostingFilter {
232    next_mask: u8,
233    loc_mask: u8,
234}
235
236#[derive(Clone, Debug, Default)]
237struct QueryBuild {
238    and_runs: Vec<Vec<u8>>,
239    or_groups: Vec<Vec<Vec<u8>>>,
240}
241
242#[derive(Clone, Debug, Default)]
243pub(crate) struct PathFilters {
244    includes: Option<GlobSet>,
245    excludes: Option<GlobSet>,
246}
247
248#[derive(Clone, Debug)]
249pub(crate) struct SearchScope {
250    pub root: PathBuf,
251    pub use_index: bool,
252}
253
254#[derive(Clone, Debug)]
255struct SharedGrepMatch {
256    file: Arc<PathBuf>,
257    line: u32,
258    column: u32,
259    line_text: String,
260    match_text: String,
261}
262
263#[derive(Clone, Debug)]
264enum SearchMatcher {
265    Literal(LiteralSearch),
266    Regex(Regex),
267}
268
269impl SearchIndex {
270    pub fn new() -> Self {
271        SearchIndex {
272            postings: HashMap::new(),
273            files: Vec::new(),
274            path_to_id: HashMap::new(),
275            ready: false,
276            project_root: PathBuf::new(),
277            git_head: None,
278            max_file_size: DEFAULT_MAX_FILE_SIZE,
279            file_trigrams: HashMap::new(),
280            unindexed_files: HashSet::new(),
281        }
282    }
283
284    pub fn build(root: &Path) -> Self {
285        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
286    }
287
288    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
289        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
290        let mut index = SearchIndex {
291            project_root: project_root.clone(),
292            max_file_size,
293            ..SearchIndex::new()
294        };
295
296        let filters = PathFilters::default();
297        for path in walk_project_files(&project_root, &filters) {
298            index.update_file(&path);
299        }
300
301        index.git_head = current_git_head(&project_root);
302        index.ready = true;
303        index
304    }
305
306    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
307        self.remove_file(path);
308
309        let file_id = match self.allocate_file_id(path, content.len() as u64) {
310            Some(file_id) => file_id,
311            None => return,
312        };
313        if let Some(file) = self.files.get_mut(file_id as usize) {
314            file.content_hash = cache_freshness::hash_bytes(content);
315        }
316
317        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
318        for (trigram, next_char, position) in extract_trigrams(content) {
319            let entry = trigram_map.entry(trigram).or_default();
320            entry.next_mask |= mask_for_next_char(next_char);
321            entry.loc_mask |= mask_for_position(position);
322        }
323
324        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
325        for (trigram, filter) in trigram_map {
326            let postings = self.postings.entry(trigram).or_default();
327            postings.push(Posting {
328                file_id,
329                next_mask: filter.next_mask,
330                loc_mask: filter.loc_mask,
331            });
332            // Posting lists are kept sorted by file_id for binary search during
333            // intersection. Since file_ids are allocated incrementally, the new
334            // entry is usually already in order. Only sort when needed.
335            if postings.len() > 1
336                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
337            {
338                postings.sort_unstable_by_key(|p| p.file_id);
339            }
340            file_trigrams.push(trigram);
341        }
342
343        self.file_trigrams.insert(file_id, file_trigrams);
344        self.unindexed_files.remove(&file_id);
345    }
346
347    pub fn remove_file(&mut self, path: &Path) {
348        let Some(file_id) = self.path_to_id.remove(path) else {
349            return;
350        };
351
352        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
353            for trigram in trigrams {
354                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
355                    postings.retain(|posting| posting.file_id != file_id);
356                    postings.is_empty()
357                } else {
358                    false
359                };
360
361                if should_remove {
362                    self.postings.remove(&trigram);
363                }
364            }
365        }
366
367        self.unindexed_files.remove(&file_id);
368        if let Some(file) = self.files.get_mut(file_id as usize) {
369            file.path = PathBuf::new();
370            file.size = 0;
371            file.modified = UNIX_EPOCH;
372            file.content_hash = cache_freshness::zero_hash();
373        }
374    }
375
376    pub fn update_file(&mut self, path: &Path) {
377        self.remove_file(path);
378
379        let metadata = match fs::metadata(path) {
380            Ok(metadata) if metadata.is_file() => metadata,
381            _ => return,
382        };
383
384        if is_binary_path(path, metadata.len()) {
385            self.track_unindexed_file(path, &metadata);
386            return;
387        }
388
389        if metadata.len() > self.max_file_size {
390            self.track_unindexed_file(path, &metadata);
391            return;
392        }
393
394        let content = match fs::read(path) {
395            Ok(content) => content,
396            Err(_) => return,
397        };
398
399        if is_binary_bytes(&content) {
400            self.track_unindexed_file(path, &metadata);
401            return;
402        }
403
404        self.index_file(path, &content);
405    }
406
407    pub fn grep(
408        &self,
409        pattern: &str,
410        case_sensitive: bool,
411        include: &[String],
412        exclude: &[String],
413        search_root: &Path,
414        max_results: usize,
415    ) -> GrepResult {
416        match pattern_compile::compile(
417            pattern,
418            CompileOpts {
419                case_insensitive: !case_sensitive,
420                ..CompileOpts::default()
421            },
422        ) {
423            CompileResult::Ok(compiled) => {
424                self.search_grep(&compiled, include, exclude, search_root, max_results)
425            }
426            CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
427                self.empty_grep_result()
428            }
429        }
430    }
431
432    pub fn search_grep(
433        &self,
434        pattern: &CompiledPattern,
435        include: &[String],
436        exclude: &[String],
437        search_root: &Path,
438        max_results: usize,
439    ) -> GrepResult {
440        let matcher = match pattern {
441            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
442            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
443        };
444
445        let filters = match build_path_filters(include, exclude) {
446            Ok(filters) => filters,
447            Err(_) => PathFilters::default(),
448        };
449        let search_root = canonicalize_or_normalize(search_root);
450
451        let raw_pattern = pattern.raw_pattern_for_trigrams();
452        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
453            RegexQuery::default()
454        } else {
455            decompose_regex(&raw_pattern)
456        };
457        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
458        let candidate_ids = self.candidates(&query);
459
460        let candidate_files: Vec<&FileEntry> = candidate_ids
461            .into_iter()
462            .filter_map(|file_id| self.files.get(file_id as usize))
463            .filter(|file| !file.path.as_os_str().is_empty())
464            .filter(|file| is_within_search_root(&search_root, &file.path))
465            .filter(|file| filters.matches(&self.project_root, &file.path))
466            .collect();
467
468        let total_matches = AtomicUsize::new(0);
469        let files_searched = AtomicUsize::new(0);
470        let files_with_matches = AtomicUsize::new(0);
471        let truncated = AtomicBool::new(false);
472        let engine_capped = AtomicBool::new(false);
473        let stop_after = max_results.saturating_mul(2);
474
475        let mut matches = if candidate_files.len() > 10 {
476            candidate_files
477                .par_iter()
478                .map(|file| {
479                    search_candidate_file(
480                        file,
481                        &matcher,
482                        max_results,
483                        stop_after,
484                        &total_matches,
485                        &files_searched,
486                        &files_with_matches,
487                        &truncated,
488                        &engine_capped,
489                    )
490                })
491                .reduce(Vec::new, |mut left, mut right| {
492                    left.append(&mut right);
493                    left
494                })
495        } else {
496            let mut matches = Vec::new();
497            for file in candidate_files {
498                matches.extend(search_candidate_file(
499                    file,
500                    &matcher,
501                    max_results,
502                    stop_after,
503                    &total_matches,
504                    &files_searched,
505                    &files_with_matches,
506                    &truncated,
507                    &engine_capped,
508                ));
509
510                if should_stop_search(&truncated, &total_matches, stop_after) {
511                    engine_capped.store(true, Ordering::Relaxed);
512                    break;
513                }
514            }
515            matches
516        };
517
518        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
519            self.path_to_id
520                .get(path)
521                .and_then(|file_id| self.files.get(*file_id as usize))
522                .map(|file| file.modified)
523        });
524
525        let matches = matches
526            .into_iter()
527            .map(|matched| GrepMatch {
528                file: matched.file.as_ref().clone(),
529                line: matched.line,
530                column: matched.column,
531                line_text: matched.line_text,
532                match_text: matched.match_text,
533            })
534            .collect();
535
536        GrepResult {
537            total_matches: total_matches.load(Ordering::Relaxed),
538            matches,
539            files_searched: files_searched.load(Ordering::Relaxed),
540            files_with_matches: files_with_matches.load(Ordering::Relaxed),
541            index_status: if self.ready {
542                IndexStatus::Ready
543            } else {
544                IndexStatus::Building
545            },
546            truncated: truncated.load(Ordering::Relaxed),
547            fully_degraded,
548            engine_capped: engine_capped.load(Ordering::Relaxed),
549        }
550    }
551
552    fn empty_grep_result(&self) -> GrepResult {
553        GrepResult {
554            matches: Vec::new(),
555            total_matches: 0,
556            files_searched: 0,
557            files_with_matches: 0,
558            index_status: if self.ready {
559                IndexStatus::Ready
560            } else {
561                IndexStatus::Building
562            },
563            truncated: false,
564            fully_degraded: false,
565            engine_capped: false,
566        }
567    }
568
569    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
570        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
571            Ok(filters) => filters,
572            Err(_) => return Vec::new(),
573        };
574        let search_root = canonicalize_or_normalize(search_root);
575        let mut entries = self
576            .files
577            .iter()
578            .filter(|file| !file.path.as_os_str().is_empty())
579            .filter(|file| is_within_search_root(&search_root, &file.path))
580            .filter(|file| filters.matches(&self.project_root, &file.path))
581            .map(|file| (file.path.clone(), file.modified))
582            .collect::<Vec<_>>();
583
584        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
585            right_mtime
586                .cmp(left_mtime)
587                .then_with(|| left_path.cmp(right_path))
588        });
589
590        entries.into_iter().map(|(path, _)| path).collect()
591    }
592
593    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
594        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
595            return self.active_file_ids();
596        }
597
598        let mut and_trigrams = query.and_trigrams.clone();
599        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
600
601        let mut current: Option<Vec<u32>> = None;
602
603        for trigram in and_trigrams {
604            let filter = query.and_filters.get(&trigram).copied();
605            let matches = self.postings_for_trigram(trigram, filter);
606            current = Some(match current.take() {
607                Some(existing) => intersect_sorted_ids(&existing, &matches),
608                None => matches,
609            });
610
611            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
612                break;
613            }
614        }
615
616        let mut current = current.unwrap_or_else(|| self.active_file_ids());
617
618        for (index, group) in query.or_groups.iter().enumerate() {
619            let mut group_matches = Vec::new();
620            let filters = query.or_filters.get(index);
621
622            for trigram in group {
623                let filter = filters.and_then(|filters| filters.get(trigram).copied());
624                let matches = self.postings_for_trigram(*trigram, filter);
625                if group_matches.is_empty() {
626                    group_matches = matches;
627                } else {
628                    group_matches = union_sorted_ids(&group_matches, &matches);
629                }
630            }
631
632            current = intersect_sorted_ids(&current, &group_matches);
633            if current.is_empty() {
634                break;
635            }
636        }
637
638        let mut unindexed = self
639            .unindexed_files
640            .iter()
641            .copied()
642            .filter(|file_id| self.is_active_file(*file_id))
643            .collect::<Vec<_>>();
644        if !unindexed.is_empty() {
645            unindexed.sort_unstable();
646            current = union_sorted_ids(&current, &unindexed);
647        }
648
649        current
650    }
651
652    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
653        if fs::create_dir_all(cache_dir).is_err() {
654            return;
655        }
656
657        let cache_path = cache_dir.join("cache.bin");
658        let tmp_cache = cache_dir.join(format!(
659            "cache.bin.tmp.{}.{}",
660            std::process::id(),
661            SystemTime::now()
662                .duration_since(UNIX_EPOCH)
663                .unwrap_or(Duration::ZERO)
664                .as_nanos()
665        ));
666
667        let active_ids = self.active_file_ids();
668        let mut id_map = HashMap::new();
669        for (new_id, old_id) in active_ids.iter().enumerate() {
670            let Ok(new_id_u32) = u32::try_from(new_id) else {
671                return;
672            };
673            id_map.insert(*old_id, new_id_u32);
674        }
675
676        let write_result = (|| -> std::io::Result<()> {
677            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
678
679            postings_writer.write_all(INDEX_MAGIC)?;
680            write_u32(&mut postings_writer, INDEX_VERSION)?;
681
682            let head = git_head.unwrap_or_default();
683            let root = self.project_root.to_string_lossy();
684            let head_len = u32::try_from(head.len())
685                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
686            let root_len = u32::try_from(root.len())
687                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
688            let file_count = u32::try_from(active_ids.len())
689                .map_err(|_| std::io::Error::other("too many files to cache"))?;
690
691            write_u32(&mut postings_writer, head_len)?;
692            write_u32(&mut postings_writer, root_len)?;
693            write_u64(&mut postings_writer, self.max_file_size)?;
694            write_u32(&mut postings_writer, file_count)?;
695            postings_writer.write_all(head.as_bytes())?;
696            postings_writer.write_all(root.as_bytes())?;
697
698            for old_id in &active_ids {
699                let Some(file) = self.files.get(*old_id as usize) else {
700                    return Err(std::io::Error::other("missing file entry for cache write"));
701                };
702                let path =
703                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
704                        std::io::Error::other(format!(
705                            "refusing to cache path outside project root: {}",
706                            file.path.display()
707                        ))
708                    })?;
709                let path = path.to_string_lossy();
710                let path_len = u32::try_from(path.len())
711                    .map_err(|_| std::io::Error::other("cached path too large"))?;
712                let modified = file
713                    .modified
714                    .duration_since(UNIX_EPOCH)
715                    .unwrap_or(Duration::ZERO);
716                let unindexed = if self.unindexed_files.contains(old_id) {
717                    1u8
718                } else {
719                    0u8
720                };
721
722                postings_writer.write_all(&[unindexed])?;
723                write_u32(&mut postings_writer, path_len)?;
724                write_u64(&mut postings_writer, file.size)?;
725                write_u64(&mut postings_writer, modified.as_secs())?;
726                write_u32(&mut postings_writer, modified.subsec_nanos())?;
727                postings_writer.write_all(file.content_hash.as_bytes())?;
728                postings_writer.write_all(path.as_bytes())?;
729            }
730
731            let mut lookup_entries = Vec::new();
732            let mut postings_blob = Vec::new();
733            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
734            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
735
736            for (trigram, postings) in sorted_postings {
737                let offset = u64::try_from(postings_blob.len())
738                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
739                let mut count = 0u32;
740
741                for posting in postings {
742                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
743                        continue;
744                    };
745
746                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
747                    postings_blob.push(posting.next_mask);
748                    postings_blob.push(posting.loc_mask);
749                    count = count.saturating_add(1);
750                }
751
752                if count > 0 {
753                    lookup_entries.push((*trigram, offset, count));
754                }
755            }
756
757            write_u64(
758                &mut postings_writer,
759                u64::try_from(postings_blob.len())
760                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
761            )?;
762            postings_writer.write_all(&postings_blob)?;
763            postings_writer.flush()?;
764            let mut postings_blob_file = postings_writer
765                .into_inner()
766                .map_err(|error| std::io::Error::other(error.to_string()))?
767                .into_inner();
768            let checksum = crc32fast::hash(&postings_blob_file);
769            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
770
771            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
772            let entry_count = u32::try_from(lookup_entries.len())
773                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
774
775            lookup_writer.write_all(LOOKUP_MAGIC)?;
776            write_u32(&mut lookup_writer, INDEX_VERSION)?;
777            write_u32(&mut lookup_writer, entry_count)?;
778
779            for (trigram, offset, count) in lookup_entries {
780                write_u32(&mut lookup_writer, trigram)?;
781                write_u64(&mut lookup_writer, offset)?;
782                write_u32(&mut lookup_writer, count)?;
783            }
784
785            lookup_writer.flush()?;
786            let mut lookup_blob_file = lookup_writer
787                .into_inner()
788                .map_err(|error| std::io::Error::other(error.to_string()))?
789                .into_inner();
790            let checksum = crc32fast::hash(&lookup_blob_file);
791            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
792
793            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
794            write_u32(&mut cache_writer, CACHE_MAGIC)?;
795            write_u32(&mut cache_writer, INDEX_VERSION)?;
796            write_u64(
797                &mut cache_writer,
798                u64::try_from(postings_blob_file.len())
799                    .map_err(|_| std::io::Error::other("postings section too large"))?,
800            )?;
801            cache_writer.write_all(&postings_blob_file)?;
802            cache_writer.write_all(&lookup_blob_file)?;
803            cache_writer.flush()?;
804            cache_writer.get_ref().sync_all()?;
805            drop(cache_writer);
806            fs::rename(&tmp_cache, &cache_path)?;
807
808            Ok(())
809        })();
810
811        if write_result.is_err() {
812            let _ = fs::remove_file(&tmp_cache);
813        }
814    }
815
816    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
817        debug_assert!(current_canonical_root.is_absolute());
818        let cache_path = cache_dir.join("cache.bin");
819        let cache_bytes = fs::read(&cache_path).ok()?;
820        if cache_bytes.len() < 16 {
821            return None;
822        }
823        let mut header = Cursor::new(&cache_bytes);
824        if read_u32(&mut header).ok()? != CACHE_MAGIC {
825            return None;
826        }
827        if read_u32(&mut header).ok()? != INDEX_VERSION {
828            return None;
829        }
830        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
831        let start = usize::try_from(header.position()).ok()?;
832        let postings_end = start.checked_add(postings_len_total)?;
833        if postings_end > cache_bytes.len() {
834            return None;
835        }
836        let postings_bytes = &cache_bytes[start..postings_end];
837        let lookup_bytes = &cache_bytes[postings_end..];
838        let lookup_len_total = lookup_bytes.len();
839        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
840        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
841        if postings_len_total < 4 || lookup_len_total < 4 {
842            return None;
843        }
844        verify_crc32_bytes_slice(postings_bytes).ok()?;
845        verify_crc32_bytes_slice(lookup_bytes).ok()?;
846
847        let mut magic = [0u8; 8];
848        postings_reader.read_exact(&mut magic).ok()?;
849        if &magic != INDEX_MAGIC {
850            return None;
851        }
852        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
853            return None;
854        }
855
856        let head_len = read_u32(&mut postings_reader).ok()? as usize;
857        let root_len = read_u32(&mut postings_reader).ok()? as usize;
858        let max_file_size = read_u64(&mut postings_reader).ok()?;
859        let file_count = read_u32(&mut postings_reader).ok()? as usize;
860        if file_count > MAX_ENTRIES {
861            return None;
862        }
863        let postings_body_len = postings_len_total.checked_sub(4)?;
864        let lookup_body_len = lookup_len_total.checked_sub(4)?;
865
866        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
867        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
868        if minimum_file_bytes > remaining_postings {
869            return None;
870        }
871
872        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
873            return None;
874        }
875        let mut head_bytes = vec![0u8; head_len];
876        postings_reader.read_exact(&mut head_bytes).ok()?;
877        let git_head = String::from_utf8(head_bytes)
878            .ok()
879            .filter(|head| !head.is_empty());
880
881        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
882            return None;
883        }
884        let mut root_bytes = vec![0u8; root_len];
885        postings_reader.read_exact(&mut root_bytes).ok()?;
886        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
887        let project_root = current_canonical_root.to_path_buf();
888
889        let mut files = Vec::with_capacity(file_count);
890        let mut path_to_id = HashMap::new();
891        let mut unindexed_files = HashSet::new();
892
893        for file_id in 0..file_count {
894            let mut unindexed = [0u8; 1];
895            postings_reader.read_exact(&mut unindexed).ok()?;
896            let path_len = read_u32(&mut postings_reader).ok()? as usize;
897            let size = read_u64(&mut postings_reader).ok()?;
898            let secs = read_u64(&mut postings_reader).ok()?;
899            let nanos = read_u32(&mut postings_reader).ok()?;
900            let mut hash_bytes = [0u8; 32];
901            postings_reader.read_exact(&mut hash_bytes).ok()?;
902            let content_hash = blake3::Hash::from_bytes(hash_bytes);
903            if nanos >= 1_000_000_000 {
904                return None;
905            }
906            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
907                return None;
908            }
909            let mut path_bytes = vec![0u8; path_len];
910            postings_reader.read_exact(&mut path_bytes).ok()?;
911            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
912            let full_path = cached_path_under_root(&project_root, &relative_path)?;
913            let file_id_u32 = u32::try_from(file_id).ok()?;
914
915            files.push(FileEntry {
916                path: full_path.clone(),
917                size,
918                modified: UNIX_EPOCH + Duration::new(secs, nanos),
919                content_hash,
920            });
921            path_to_id.insert(full_path, file_id_u32);
922            if unindexed[0] == 1 {
923                unindexed_files.insert(file_id_u32);
924            }
925        }
926
927        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
928        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
929        if postings_len > max_postings_bytes {
930            return None;
931        }
932        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
933            return None;
934        }
935        let mut postings_blob = vec![0u8; postings_len];
936        postings_reader.read_exact(&mut postings_blob).ok()?;
937
938        let mut lookup_magic = [0u8; 8];
939        lookup_reader.read_exact(&mut lookup_magic).ok()?;
940        if &lookup_magic != LOOKUP_MAGIC {
941            return None;
942        }
943        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
944            return None;
945        }
946        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
947        if entry_count > MAX_ENTRIES {
948            return None;
949        }
950        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
951        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
952        if minimum_lookup_bytes > remaining_lookup {
953            return None;
954        }
955
956        let mut postings = HashMap::new();
957        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
958
959        for _ in 0..entry_count {
960            let trigram = read_u32(&mut lookup_reader).ok()?;
961            let offset = read_u64(&mut lookup_reader).ok()? as usize;
962            let count = read_u32(&mut lookup_reader).ok()? as usize;
963            if count > MAX_ENTRIES {
964                return None;
965            }
966            let bytes_len = count.checked_mul(POSTING_BYTES)?;
967            let end = offset.checked_add(bytes_len)?;
968            if end > postings_blob.len() {
969                return None;
970            }
971
972            let mut trigram_postings = Vec::with_capacity(count);
973            for chunk in postings_blob[offset..end].chunks_exact(6) {
974                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
975                let posting = Posting {
976                    file_id,
977                    next_mask: chunk[4],
978                    loc_mask: chunk[5],
979                };
980                trigram_postings.push(posting.clone());
981                file_trigrams.entry(file_id).or_default().push(trigram);
982            }
983            postings.insert(trigram, trigram_postings);
984        }
985
986        Some(SearchIndex {
987            postings,
988            files,
989            path_to_id,
990            ready: false,
991            project_root,
992            git_head,
993            max_file_size,
994            file_trigrams,
995            unindexed_files,
996        })
997    }
998
999    pub(crate) fn stored_git_head(&self) -> Option<&str> {
1000        self.git_head.as_deref()
1001    }
1002
1003    pub(crate) fn set_ready(&mut self, ready: bool) {
1004        self.ready = ready;
1005    }
1006
1007    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1008        self.git_head = current_head;
1009        verify_file_mtimes(self);
1010        self.ready = true;
1011    }
1012
1013    pub(crate) fn rebuild_or_refresh(
1014        root: &Path,
1015        max_file_size: u64,
1016        current_head: Option<String>,
1017        baseline: Option<SearchIndex>,
1018    ) -> Self {
1019        if let Some(mut baseline) = baseline {
1020            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1021            baseline.max_file_size = max_file_size;
1022
1023            if baseline.git_head == current_head || current_head.is_none() {
1024                // HEAD matches, but files may have changed on disk since the index was
1025                // last written (e.g., uncommitted edits, stash pop, manual file changes
1026                // while OpenCode was closed). Verify mtimes and re-index stale files.
1027                // Non-git projects also use this per-file (path, mtime, size)
1028                // fingerprint so unchanged trees reuse the disk cache instead of
1029                // rebuilding every configure.
1030                baseline.git_head = current_head;
1031                verify_file_mtimes(&mut baseline);
1032                baseline.ready = true;
1033                return baseline;
1034            }
1035
1036            if let (Some(previous), Some(current)) =
1037                (baseline.git_head.clone(), current_head.clone())
1038            {
1039                let project_root = baseline.project_root.clone();
1040                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1041                    baseline.git_head = Some(current);
1042                    verify_file_mtimes(&mut baseline);
1043                    baseline.ready = true;
1044                    return baseline;
1045                }
1046            }
1047        }
1048
1049        SearchIndex::build_with_limit(root, max_file_size)
1050    }
1051
1052    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1053        let file_id = u32::try_from(self.files.len()).ok()?;
1054        let metadata = fs::metadata(path).ok();
1055        let size = metadata
1056            .as_ref()
1057            .map_or(size_hint, |metadata| metadata.len());
1058        let modified = metadata
1059            .and_then(|metadata| metadata.modified().ok())
1060            .unwrap_or(UNIX_EPOCH);
1061
1062        self.files.push(FileEntry {
1063            path: path.to_path_buf(),
1064            size,
1065            modified,
1066            content_hash: cache_freshness::zero_hash(),
1067        });
1068        self.path_to_id.insert(path.to_path_buf(), file_id);
1069        Some(file_id)
1070    }
1071
1072    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1073        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1074            return;
1075        };
1076        self.unindexed_files.insert(file_id);
1077        self.file_trigrams.insert(file_id, Vec::new());
1078    }
1079
1080    fn active_file_ids(&self) -> Vec<u32> {
1081        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1082        ids.sort_unstable();
1083        ids
1084    }
1085
1086    fn is_active_file(&self, file_id: u32) -> bool {
1087        self.files
1088            .get(file_id as usize)
1089            .map(|file| !file.path.as_os_str().is_empty())
1090            .unwrap_or(false)
1091    }
1092
1093    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1094        let Some(postings) = self.postings.get(&trigram) else {
1095            return Vec::new();
1096        };
1097
1098        let mut matches = Vec::with_capacity(postings.len());
1099
1100        for posting in postings {
1101            if let Some(filter) = filter {
1102                // next_mask: bloom filter check — the character following this trigram in the
1103                // query must also appear after this trigram somewhere in the file.
1104                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1105                    continue;
1106                }
1107                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1108                // between consecutive trigram pairs, but is NOT used as a single-trigram
1109                // filter because the position in the query string has no relationship to
1110                // the position in the file. Using it here causes false negatives.
1111            }
1112            if self.is_active_file(posting.file_id) {
1113                matches.push(posting.file_id);
1114            }
1115        }
1116
1117        matches
1118    }
1119}
1120
1121fn search_candidate_file(
1122    file: &FileEntry,
1123    matcher: &SearchMatcher,
1124    max_results: usize,
1125    stop_after: usize,
1126    total_matches: &AtomicUsize,
1127    files_searched: &AtomicUsize,
1128    files_with_matches: &AtomicUsize,
1129    truncated: &AtomicBool,
1130    engine_capped: &AtomicBool,
1131) -> Vec<SharedGrepMatch> {
1132    if should_stop_search(truncated, total_matches, stop_after) {
1133        engine_capped.store(true, Ordering::Relaxed);
1134        return Vec::new();
1135    }
1136
1137    let content = match read_indexed_file_bytes(&file.path) {
1138        Some(content) => content,
1139        None => return Vec::new(),
1140    };
1141    // Defense in depth: even though indexing tries to filter binaries via
1142    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1143    // query time. content_inspector is fast (~bytes-per-cycle on a small
1144    // preview) and this guarantees we never surface matches inside binary
1145    // files even if the indexer somehow let one through (e.g. file changed
1146    // between indexing and query).
1147    if is_binary_bytes(&content) {
1148        return Vec::new();
1149    }
1150    files_searched.fetch_add(1, Ordering::Relaxed);
1151
1152    let shared_path = Arc::new(file.path.clone());
1153    let mut matches = Vec::new();
1154    let mut line_starts = None;
1155    let mut seen_lines = HashSet::new();
1156    let mut matched_this_file = false;
1157
1158    match matcher {
1159        SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1160            let needle = &literal.needle;
1161            let finder = memchr::memmem::Finder::new(needle);
1162            let mut start = 0;
1163
1164            while let Some(position) = finder.find(&content[start..]) {
1165                if should_stop_search(truncated, total_matches, stop_after) {
1166                    engine_capped.store(true, Ordering::Relaxed);
1167                    break;
1168                }
1169
1170                let offset = start + position;
1171                start = offset + 1;
1172
1173                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1174                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1175                if !seen_lines.insert(line) {
1176                    continue;
1177                }
1178
1179                matched_this_file = true;
1180                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1181                if match_number > max_results {
1182                    truncated.store(true, Ordering::Relaxed);
1183                    break;
1184                }
1185
1186                let end = offset + needle.len();
1187                matches.push(SharedGrepMatch {
1188                    file: shared_path.clone(),
1189                    line,
1190                    column,
1191                    line_text,
1192                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1193                });
1194            }
1195        }
1196        SearchMatcher::Literal(literal) => {
1197            let needle = &literal.needle;
1198            let search_content = content.to_ascii_lowercase();
1199            let finder = memchr::memmem::Finder::new(needle);
1200            let mut start = 0;
1201
1202            while let Some(position) = finder.find(&search_content[start..]) {
1203                if should_stop_search(truncated, total_matches, stop_after) {
1204                    engine_capped.store(true, Ordering::Relaxed);
1205                    break;
1206                }
1207
1208                let offset = start + position;
1209                start = offset + 1;
1210
1211                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1212                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1213                if !seen_lines.insert(line) {
1214                    continue;
1215                }
1216
1217                matched_this_file = true;
1218                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1219                if match_number > max_results {
1220                    truncated.store(true, Ordering::Relaxed);
1221                    break;
1222                }
1223
1224                let end = offset + needle.len();
1225                matches.push(SharedGrepMatch {
1226                    file: shared_path.clone(),
1227                    line,
1228                    column,
1229                    line_text,
1230                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1231                });
1232            }
1233        }
1234        SearchMatcher::Regex(regex) => {
1235            for matched in regex.find_iter(&content) {
1236                if should_stop_search(truncated, total_matches, stop_after) {
1237                    engine_capped.store(true, Ordering::Relaxed);
1238                    break;
1239                }
1240
1241                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1242                let (line, column, line_text) =
1243                    line_details_bytes(&content, line_starts, matched.start());
1244                if !seen_lines.insert(line) {
1245                    continue;
1246                }
1247
1248                matched_this_file = true;
1249                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1250                if match_number > max_results {
1251                    truncated.store(true, Ordering::Relaxed);
1252                    break;
1253                }
1254
1255                matches.push(SharedGrepMatch {
1256                    file: shared_path.clone(),
1257                    line,
1258                    column,
1259                    line_text,
1260                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1261                });
1262            }
1263        }
1264    }
1265
1266    if matched_this_file {
1267        files_with_matches.fetch_add(1, Ordering::Relaxed);
1268    }
1269
1270    matches
1271}
1272
1273fn should_stop_search(
1274    truncated: &AtomicBool,
1275    total_matches: &AtomicUsize,
1276    stop_after: usize,
1277) -> bool {
1278    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1279}
1280
1281fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1282    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1283    let mut left_index = 0;
1284    let mut right_index = 0;
1285
1286    while left_index < left.len() && right_index < right.len() {
1287        match left[left_index].cmp(&right[right_index]) {
1288            std::cmp::Ordering::Less => left_index += 1,
1289            std::cmp::Ordering::Greater => right_index += 1,
1290            std::cmp::Ordering::Equal => {
1291                merged.push(left[left_index]);
1292                left_index += 1;
1293                right_index += 1;
1294            }
1295        }
1296    }
1297
1298    merged
1299}
1300
1301fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1302    let mut merged = Vec::with_capacity(left.len() + right.len());
1303    let mut left_index = 0;
1304    let mut right_index = 0;
1305
1306    while left_index < left.len() && right_index < right.len() {
1307        match left[left_index].cmp(&right[right_index]) {
1308            std::cmp::Ordering::Less => {
1309                merged.push(left[left_index]);
1310                left_index += 1;
1311            }
1312            std::cmp::Ordering::Greater => {
1313                merged.push(right[right_index]);
1314                right_index += 1;
1315            }
1316            std::cmp::Ordering::Equal => {
1317                merged.push(left[left_index]);
1318                left_index += 1;
1319                right_index += 1;
1320            }
1321        }
1322    }
1323
1324    merged.extend_from_slice(&left[left_index..]);
1325    merged.extend_from_slice(&right[right_index..]);
1326    merged
1327}
1328
1329pub fn decompose_regex(pattern: &str) -> RegexQuery {
1330    let hir = match regex_syntax::parse(pattern) {
1331        Ok(hir) => hir,
1332        Err(_) => return RegexQuery::default(),
1333    };
1334
1335    let build = build_query(&hir);
1336    build.into_query()
1337}
1338
1339pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1340    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1341}
1342
1343pub fn normalize_char(c: u8) -> u8 {
1344    c.to_ascii_lowercase()
1345}
1346
1347pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1348    if content.len() < 3 {
1349        return Vec::new();
1350    }
1351
1352    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1353    for start in 0..=content.len() - 3 {
1354        let trigram = pack_trigram(
1355            normalize_char(content[start]),
1356            normalize_char(content[start + 1]),
1357            normalize_char(content[start + 2]),
1358        );
1359        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1360        trigrams.push((trigram, next_char, start));
1361    }
1362    trigrams
1363}
1364
1365pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1366    let mut seen = HashSet::new();
1367    let mut out = Vec::new();
1368    for token in tokens {
1369        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1370            if seen.insert(trigram) {
1371                out.push(trigram);
1372            }
1373        }
1374    }
1375    out
1376}
1377
1378pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1379    if query_trigrams.is_empty() {
1380        return 0.0;
1381    }
1382
1383    let mut hits = 0u32;
1384    for &trigram in query_trigrams {
1385        if let Some(postings) = index.postings.get(&trigram) {
1386            if postings
1387                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1388                .is_ok()
1389            {
1390                hits += 1;
1391            }
1392        }
1393    }
1394
1395    if hits == 0 {
1396        return 0.0;
1397    }
1398
1399    let file_trigram_count = index
1400        .file_trigrams
1401        .get(&file_id)
1402        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1403    (hits as f32) / (1.0 + file_trigram_count.ln())
1404}
1405
1406pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1407    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1408    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1409        return PathBuf::from(override_dir)
1410            .join("index")
1411            .join(project_cache_key(project_root));
1412    }
1413    // Use configured storage dir (from plugin, XDG-compliant)
1414    if let Some(dir) = storage_dir {
1415        return dir.join("index").join(project_cache_key(project_root));
1416    }
1417    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1418    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1419    // If neither is set, fall back to a temp directory rather than `"."`
1420    // because the search-index code reads/writes absolute paths.
1421    let home = std::env::var_os("HOME")
1422        .or_else(|| std::env::var_os("USERPROFILE"))
1423        .map(PathBuf::from)
1424        .unwrap_or_else(std::env::temp_dir);
1425    home.join(".cache")
1426        .join("aft")
1427        .join("index")
1428        .join(project_cache_key(project_root))
1429}
1430
1431pub(crate) fn build_path_filters(
1432    include: &[String],
1433    exclude: &[String],
1434) -> Result<PathFilters, String> {
1435    Ok(PathFilters {
1436        includes: build_globset(include)?,
1437        excludes: build_globset(exclude)?,
1438    })
1439}
1440
1441pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1442    walk_project_files_from(root, root, filters)
1443}
1444
1445pub(crate) fn walk_project_files_from(
1446    filter_root: &Path,
1447    search_root: &Path,
1448    filters: &PathFilters,
1449) -> Vec<PathBuf> {
1450    let mut builder = WalkBuilder::new(search_root);
1451    builder
1452        .hidden(false)
1453        .git_ignore(true)
1454        .git_global(true)
1455        .git_exclude(true)
1456        .filter_entry(|entry| {
1457            let name = entry.file_name().to_string_lossy();
1458            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1459                return !matches!(
1460                    name.as_ref(),
1461                    "node_modules"
1462                        | "target"
1463                        | "venv"
1464                        | ".venv"
1465                        | ".git"
1466                        | "__pycache__"
1467                        | ".tox"
1468                        | "dist"
1469                        | "build"
1470                );
1471            }
1472            true
1473        });
1474
1475    let mut files = Vec::new();
1476    for entry in builder.build().filter_map(|entry| entry.ok()) {
1477        if !entry
1478            .file_type()
1479            .map_or(false, |file_type| file_type.is_file())
1480        {
1481            continue;
1482        }
1483        let path = entry.into_path();
1484        if filters.matches(filter_root, &path) {
1485            files.push(path);
1486        }
1487    }
1488
1489    sort_paths_by_mtime_desc(&mut files);
1490    files
1491}
1492
1493pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1494    let bytes = fs::read(path).ok()?;
1495    if is_binary_bytes(&bytes) {
1496        return None;
1497    }
1498    String::from_utf8(bytes).ok()
1499}
1500
1501fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1502    fs::read(path).ok()
1503}
1504
1505pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1506    path.strip_prefix(root)
1507        .map(PathBuf::from)
1508        .unwrap_or_else(|_| path.to_path_buf())
1509}
1510
1511pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1512    let normalized_root = normalize_path(root);
1513    let normalized_path = normalize_path(path);
1514    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1515    validate_cached_relative_path(relative)
1516}
1517
1518pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1519    let relative = validate_cached_relative_path(relative_path)?;
1520    let normalized_root = normalize_path(root);
1521    let full_path = normalize_path(&normalized_root.join(relative));
1522
1523    match fs::canonicalize(&full_path) {
1524        Ok(canonical_path) => {
1525            if canonical_path.starts_with(&normalized_root) {
1526                return Some(full_path);
1527            }
1528
1529            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1530            canonical_path
1531                .starts_with(&canonical_root)
1532                .then_some(full_path)
1533        }
1534        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1535    }
1536}
1537
1538pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1539    if path.is_absolute() {
1540        return None;
1541    }
1542
1543    let mut normalized = PathBuf::new();
1544    for component in path.components() {
1545        match component {
1546            Component::Normal(part) => normalized.push(part),
1547            Component::CurDir => {}
1548            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1549        }
1550    }
1551    (!normalized.as_os_str().is_empty()).then_some(normalized)
1552}
1553
1554/// Sort paths newest-first by mtime, falling back to lexicographic order.
1555///
1556/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1557/// `sort_by()` closure. That made the comparator non-deterministic — a
1558/// `stat()` syscall for the same path can return different values across
1559/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1560/// concurrent file-watcher activity), and Rust's slice::sort panics at
1561/// runtime when it detects a non-total-order comparator. CI hit this on
1562/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1563///
1564/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1565/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1566pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1567    use std::collections::HashMap;
1568    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1569    for path in paths.iter() {
1570        mtimes
1571            .entry(path.clone())
1572            .or_insert_with(|| path_modified_time(path));
1573    }
1574    paths.sort_by(|left, right| {
1575        let left_mtime = mtimes.get(left).and_then(|v| *v);
1576        let right_mtime = mtimes.get(right).and_then(|v| *v);
1577        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1578    });
1579}
1580
1581/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1582/// the sort. Same fix, applied to grep matches that share files.
1583pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1584    use std::collections::HashMap;
1585    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1586    for m in matches.iter() {
1587        mtimes.entry(m.file.clone()).or_insert_with(|| {
1588            let resolved = resolve_match_path(project_root, &m.file);
1589            path_modified_time(&resolved)
1590        });
1591    }
1592    matches.sort_by(|left, right| {
1593        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1594        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1595        right_mtime
1596            .cmp(&left_mtime)
1597            .then_with(|| left.file.cmp(&right.file))
1598            .then_with(|| left.line.cmp(&right.line))
1599            .then_with(|| left.column.cmp(&right.column))
1600    });
1601}
1602
1603/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1604/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1605/// table from the search index), but it can still return different values if
1606/// the file is modified mid-sort. Snapshot once.
1607fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1608    matches: &mut [SharedGrepMatch],
1609    modified_for_path: F,
1610) where
1611    F: Fn(&Path) -> Option<SystemTime>,
1612{
1613    use std::collections::HashMap;
1614    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1615    for m in matches.iter() {
1616        let path = m.file.as_path().to_path_buf();
1617        mtimes
1618            .entry(path.clone())
1619            .or_insert_with(|| modified_for_path(&path));
1620    }
1621    matches.sort_by(|left, right| {
1622        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1623        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1624        right_mtime
1625            .cmp(&left_mtime)
1626            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1627            .then_with(|| left.line.cmp(&right.line))
1628            .then_with(|| left.column.cmp(&right.column))
1629    });
1630}
1631
1632pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1633    let resolved_project_root = canonicalize_or_normalize(project_root);
1634    let root = match path {
1635        Some(path) => {
1636            let path = PathBuf::from(path);
1637            if path.is_absolute() {
1638                canonicalize_or_normalize(&path)
1639            } else {
1640                normalize_path(&resolved_project_root.join(path))
1641            }
1642        }
1643        None => resolved_project_root.clone(),
1644    };
1645
1646    let use_index = is_within_search_root(&resolved_project_root, &root);
1647    SearchScope { root, use_index }
1648}
1649
1650pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1651    content_inspector::inspect(content).is_binary()
1652}
1653
1654pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1655    run_git(root, &["rev-parse", "HEAD"])
1656}
1657
1658pub fn project_cache_key(project_root: &Path) -> String {
1659    use sha2::{Digest, Sha256};
1660
1661    let mut hasher = Sha256::new();
1662
1663    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1664        // Git repo: root commit is the unique identity.
1665        // Same repo cloned anywhere produces the same key.
1666        hasher.update(root_commit.as_bytes());
1667    } else {
1668        // Non-git project: use the canonical filesystem path as identity.
1669        let canonical_root = canonicalize_or_normalize(project_root);
1670        hasher.update(canonical_root.to_string_lossy().as_bytes());
1671    }
1672
1673    let digest = format!("{:x}", hasher.finalize());
1674    digest[..16].to_string()
1675}
1676
1677impl PathFilters {
1678    fn matches(&self, root: &Path, path: &Path) -> bool {
1679        let relative = to_glob_path(&relative_to_root(root, path));
1680        if self
1681            .includes
1682            .as_ref()
1683            .is_some_and(|includes| !includes.is_match(&relative))
1684        {
1685            return false;
1686        }
1687        if self
1688            .excludes
1689            .as_ref()
1690            .is_some_and(|excludes| excludes.is_match(&relative))
1691        {
1692            return false;
1693        }
1694        true
1695    }
1696}
1697
1698fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1699    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1700}
1701
1702fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1703    if path.is_absolute() {
1704        path.to_path_buf()
1705    } else {
1706        project_root.join(path)
1707    }
1708}
1709
1710fn path_modified_time(path: &Path) -> Option<SystemTime> {
1711    fs::metadata(path)
1712        .and_then(|metadata| metadata.modified())
1713        .ok()
1714}
1715
1716fn normalize_path(path: &Path) -> PathBuf {
1717    let mut result = PathBuf::new();
1718    for component in path.components() {
1719        match component {
1720            Component::ParentDir => {
1721                if !result.pop() {
1722                    result.push(component);
1723                }
1724            }
1725            Component::CurDir => {}
1726            _ => result.push(component),
1727        }
1728    }
1729    result
1730}
1731
1732/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1733/// since the index was last written. Also detect new files and deleted files.
1734fn verify_file_mtimes(index: &mut SearchIndex) {
1735    // Collect stale files (mtime mismatch or deleted)
1736    let mut stale_paths = Vec::new();
1737    for entry in &index.files {
1738        if entry.path.as_os_str().is_empty() {
1739            continue; // tombstoned entry
1740        }
1741        let cached = FileFreshness {
1742            mtime: entry.modified,
1743            size: entry.size,
1744            content_hash: entry.content_hash,
1745        };
1746        match cache_freshness::verify_file_strict(&entry.path, &cached) {
1747            FreshnessVerdict::HotFresh | FreshnessVerdict::ContentFresh { .. } => {}
1748            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
1749                stale_paths.push(entry.path.clone())
1750            }
1751        }
1752    }
1753
1754    for entry in &mut index.files {
1755        if entry.path.as_os_str().is_empty() {
1756            continue;
1757        }
1758        let cached = FileFreshness {
1759            mtime: entry.modified,
1760            size: entry.size,
1761            content_hash: entry.content_hash,
1762        };
1763        if let FreshnessVerdict::ContentFresh {
1764            new_mtime,
1765            new_size,
1766        } = cache_freshness::verify_file_strict(&entry.path, &cached)
1767        {
1768            entry.modified = new_mtime;
1769            entry.size = new_size;
1770        }
1771    }
1772
1773    // Re-index stale files
1774    for path in &stale_paths {
1775        index.update_file(path);
1776    }
1777
1778    // Detect new files not in the index
1779    let filters = PathFilters::default();
1780    for path in walk_project_files(&index.project_root, &filters) {
1781        if !index.path_to_id.contains_key(&path) {
1782            index.update_file(&path);
1783        }
1784    }
1785
1786    if !stale_paths.is_empty() {
1787        crate::slog_info!(
1788            "search index: refreshed {} stale file(s) from disk cache",
1789            stale_paths.len()
1790        );
1791    }
1792}
1793
1794fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1795    normalize_path(path).starts_with(normalize_path(search_root))
1796}
1797
1798impl QueryBuild {
1799    fn into_query(self) -> RegexQuery {
1800        let mut query = RegexQuery::default();
1801
1802        for run in self.and_runs {
1803            add_run_to_and_query(&mut query, &run);
1804        }
1805
1806        for group in self.or_groups {
1807            let mut trigrams = BTreeSet::new();
1808            let mut filters = HashMap::new();
1809            for run in group {
1810                for (trigram, filter) in trigram_filters(&run) {
1811                    trigrams.insert(trigram);
1812                    merge_filter(filters.entry(trigram).or_default(), filter);
1813                }
1814            }
1815            if !trigrams.is_empty() {
1816                query.or_groups.push(trigrams.into_iter().collect());
1817                query.or_filters.push(filters);
1818            }
1819        }
1820
1821        query
1822    }
1823}
1824
1825fn build_query(hir: &Hir) -> QueryBuild {
1826    match hir.kind() {
1827        HirKind::Literal(literal) => {
1828            if literal.0.len() >= 3 {
1829                QueryBuild {
1830                    and_runs: vec![literal.0.to_vec()],
1831                    or_groups: Vec::new(),
1832                }
1833            } else {
1834                QueryBuild::default()
1835            }
1836        }
1837        HirKind::Capture(capture) => build_query(&capture.sub),
1838        HirKind::Concat(parts) => {
1839            let mut build = QueryBuild::default();
1840            for part in parts {
1841                let part_build = build_query(part);
1842                build.and_runs.extend(part_build.and_runs);
1843                build.or_groups.extend(part_build.or_groups);
1844            }
1845            build
1846        }
1847        HirKind::Alternation(parts) => {
1848            let mut group = Vec::new();
1849            for part in parts {
1850                let Some(mut choices) = guaranteed_run_choices(part) else {
1851                    return QueryBuild::default();
1852                };
1853                group.append(&mut choices);
1854            }
1855            if group.is_empty() {
1856                QueryBuild::default()
1857            } else {
1858                QueryBuild {
1859                    and_runs: Vec::new(),
1860                    or_groups: vec![group],
1861                }
1862            }
1863        }
1864        HirKind::Repetition(repetition) => {
1865            if repetition.min == 0 {
1866                QueryBuild::default()
1867            } else {
1868                build_query(&repetition.sub)
1869            }
1870        }
1871        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1872    }
1873}
1874
1875fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1876    match hir.kind() {
1877        HirKind::Literal(literal) => {
1878            if literal.0.len() >= 3 {
1879                Some(vec![literal.0.to_vec()])
1880            } else {
1881                None
1882            }
1883        }
1884        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1885        HirKind::Concat(parts) => {
1886            let mut runs = Vec::new();
1887            for part in parts {
1888                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1889                    runs.append(&mut part_runs);
1890                }
1891            }
1892            if runs.is_empty() {
1893                None
1894            } else {
1895                Some(runs)
1896            }
1897        }
1898        HirKind::Alternation(parts) => {
1899            let mut runs = Vec::new();
1900            for part in parts {
1901                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1902                    return None;
1903                };
1904                runs.append(&mut part_runs);
1905            }
1906            if runs.is_empty() {
1907                None
1908            } else {
1909                Some(runs)
1910            }
1911        }
1912        HirKind::Repetition(repetition) => {
1913            if repetition.min == 0 {
1914                None
1915            } else {
1916                guaranteed_run_choices(&repetition.sub)
1917            }
1918        }
1919        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1920    }
1921}
1922
1923fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1924    for (trigram, filter) in trigram_filters(run) {
1925        if !query.and_trigrams.contains(&trigram) {
1926            query.and_trigrams.push(trigram);
1927        }
1928        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1929    }
1930}
1931
1932fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1933    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1934    for (trigram, next_char, position) in extract_trigrams(run) {
1935        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1936        if next_char != EOF_SENTINEL {
1937            entry.next_mask |= mask_for_next_char(next_char);
1938        }
1939        entry.loc_mask |= mask_for_position(position);
1940    }
1941    filters.into_iter().collect()
1942}
1943
1944fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1945    target.next_mask |= filter.next_mask;
1946    target.loc_mask |= filter.loc_mask;
1947}
1948
1949fn mask_for_next_char(next_char: u8) -> u8 {
1950    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1951    1u8 << bit
1952}
1953
1954fn mask_for_position(position: usize) -> u8 {
1955    1u8 << (position % 8)
1956}
1957
1958fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1959    if patterns.is_empty() {
1960        return Ok(None);
1961    }
1962
1963    let mut builder = GlobSetBuilder::new();
1964    for pattern in patterns {
1965        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1966        builder.add(glob);
1967    }
1968    builder.build().map(Some).map_err(|error| error.to_string())
1969}
1970
1971fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1972    let mut buffer = [0u8; 4];
1973    reader.read_exact(&mut buffer)?;
1974    Ok(u32::from_le_bytes(buffer))
1975}
1976
1977fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1978    let mut buffer = [0u8; 8];
1979    reader.read_exact(&mut buffer)?;
1980    Ok(u64::from_le_bytes(buffer))
1981}
1982
1983fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1984    writer.write_all(&value.to_le_bytes())
1985}
1986
1987fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1988    writer.write_all(&value.to_le_bytes())
1989}
1990
1991fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1992    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1993        return Err(std::io::Error::other("search index checksum missing"));
1994    };
1995    let expected = u32::from_le_bytes(*stored);
1996    let actual = crc32fast::hash(body);
1997    if actual != expected {
1998        return Err(std::io::Error::other("search index checksum mismatch"));
1999    }
2000    Ok(())
2001}
2002
2003fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2004    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2005    total_len.checked_sub(pos)
2006}
2007
2008fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2009    let output = Command::new("git")
2010        .arg("-C")
2011        .arg(root)
2012        .args(args)
2013        .output()
2014        .ok()?;
2015    if !output.status.success() {
2016        return None;
2017    }
2018    let value = String::from_utf8(output.stdout).ok()?;
2019    let value = value.trim().to_string();
2020    if value.is_empty() {
2021        None
2022    } else {
2023        Some(value)
2024    }
2025}
2026
2027fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2028    let diff_range = format!("{}..{}", from, to);
2029    let output = match Command::new("git")
2030        .arg("-C")
2031        .arg(root)
2032        .args(["diff", "--name-status", "-M", &diff_range])
2033        .output()
2034    {
2035        Ok(output) => output,
2036        Err(_) => return false,
2037    };
2038
2039    if !output.status.success() {
2040        return false;
2041    }
2042
2043    let Ok(diff) = String::from_utf8(output.stdout) else {
2044        return false;
2045    };
2046
2047    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2048        let mut fields = line.split('\t');
2049        let Some(status) = fields.next() else {
2050            continue;
2051        };
2052
2053        if status.starts_with('R') {
2054            let Some(old_path) = fields
2055                .next()
2056                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2057            else {
2058                continue;
2059            };
2060            let Some(new_path) = fields
2061                .next()
2062                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2063            else {
2064                continue;
2065            };
2066            index.remove_file(&old_path);
2067            index.update_file(&new_path);
2068            continue;
2069        }
2070
2071        let Some(path) = fields
2072            .next()
2073            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2074        else {
2075            continue;
2076        };
2077        if status.starts_with('D') || !path.exists() {
2078            index.remove_file(&path);
2079        } else {
2080            index.update_file(&path);
2081        }
2082    }
2083
2084    true
2085}
2086
2087fn is_binary_path(path: &Path, size: u64) -> bool {
2088    if size == 0 {
2089        return false;
2090    }
2091
2092    let mut file = match File::open(path) {
2093        Ok(file) => file,
2094        Err(_) => return true,
2095    };
2096
2097    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2098    match file.read(&mut preview) {
2099        Ok(read) => is_binary_bytes(&preview[..read]),
2100        Err(_) => true,
2101    }
2102}
2103
2104fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2105    let mut starts = vec![0usize];
2106    for (index, byte) in content.iter().copied().enumerate() {
2107        if byte == b'\n' {
2108            starts.push(index + 1);
2109        }
2110    }
2111    starts
2112}
2113
2114fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2115    let line_index = match line_starts.binary_search(&offset) {
2116        Ok(index) => index,
2117        Err(index) => index.saturating_sub(1),
2118    };
2119    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2120    let line_end = content[line_start..]
2121        .iter()
2122        .position(|byte| *byte == b'\n')
2123        .map(|length| line_start + length)
2124        .unwrap_or(content.len());
2125    let mut line_slice = &content[line_start..line_end];
2126    if line_slice.ends_with(b"\r") {
2127        line_slice = &line_slice[..line_slice.len() - 1];
2128    }
2129    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2130    let column = String::from_utf8_lossy(&content[line_start..offset])
2131        .chars()
2132        .count() as u32
2133        + 1;
2134    (line_index as u32 + 1, column, line_text)
2135}
2136
2137fn to_glob_path(path: &Path) -> String {
2138    path.to_string_lossy().replace('\\', "/")
2139}
2140
2141#[cfg(test)]
2142mod tests {
2143    use std::process::Command;
2144
2145    use super::*;
2146
2147    #[test]
2148    fn cached_path_under_root_allows_missing_lexical_child() {
2149        let dir = tempfile::tempdir().expect("create temp dir");
2150        let project = dir.path().join("project");
2151        fs::create_dir_all(&project).expect("create project dir");
2152        let root = fs::canonicalize(&project).expect("canonicalize project");
2153
2154        let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2155            .expect("missing child should fall back to lexical validation");
2156
2157        assert_eq!(path, root.join("future/file.rs"));
2158    }
2159
2160    #[cfg(unix)]
2161    #[test]
2162    fn cached_path_under_root_rejects_symlink_escape() {
2163        let dir = tempfile::tempdir().expect("create temp dir");
2164        let project = dir.path().join("project");
2165        let outside = dir.path().join("outside");
2166        fs::create_dir_all(&project).expect("create project dir");
2167        fs::create_dir_all(&outside).expect("create outside dir");
2168        fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2169        std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2170        let root = fs::canonicalize(&project).expect("canonicalize project");
2171
2172        assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2173    }
2174
2175    #[test]
2176    fn extract_trigrams_tracks_next_char_and_position() {
2177        let trigrams = extract_trigrams(b"Rust");
2178        assert_eq!(trigrams.len(), 2);
2179        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2180        assert_eq!(
2181            trigrams[1],
2182            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2183        );
2184    }
2185
2186    #[test]
2187    fn decompose_regex_extracts_literals_and_alternations() {
2188        let query = decompose_regex("abc(def|ghi)xyz");
2189        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2190        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2191        assert_eq!(query.or_groups.len(), 1);
2192        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2193        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2194    }
2195
2196    #[test]
2197    fn candidates_intersect_posting_lists() {
2198        let mut index = SearchIndex::new();
2199        let dir = tempfile::tempdir().expect("create temp dir");
2200        let alpha = dir.path().join("alpha.txt");
2201        let beta = dir.path().join("beta.txt");
2202        fs::write(&alpha, "abcdef").expect("write alpha");
2203        fs::write(&beta, "abcxyz").expect("write beta");
2204        index.project_root = dir.path().to_path_buf();
2205        index.index_file(&alpha, b"abcdef");
2206        index.index_file(&beta, b"abcxyz");
2207
2208        let query = RegexQuery {
2209            and_trigrams: vec![
2210                pack_trigram(b'a', b'b', b'c'),
2211                pack_trigram(b'd', b'e', b'f'),
2212            ],
2213            ..RegexQuery::default()
2214        };
2215
2216        let candidates = index.candidates(&query);
2217        assert_eq!(candidates.len(), 1);
2218        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2219    }
2220
2221    #[test]
2222    fn candidates_apply_bloom_filters() {
2223        let mut index = SearchIndex::new();
2224        let dir = tempfile::tempdir().expect("create temp dir");
2225        let file = dir.path().join("sample.txt");
2226        fs::write(&file, "abcd efgh").expect("write sample");
2227        index.project_root = dir.path().to_path_buf();
2228        index.index_file(&file, b"abcd efgh");
2229
2230        let trigram = pack_trigram(b'a', b'b', b'c');
2231        let matching_filter = PostingFilter {
2232            next_mask: mask_for_next_char(b'd'),
2233            loc_mask: mask_for_position(0),
2234        };
2235        let non_matching_filter = PostingFilter {
2236            next_mask: mask_for_next_char(b'z'),
2237            loc_mask: mask_for_position(0),
2238        };
2239
2240        assert_eq!(
2241            index
2242                .postings_for_trigram(trigram, Some(matching_filter))
2243                .len(),
2244            1
2245        );
2246        assert!(index
2247            .postings_for_trigram(trigram, Some(non_matching_filter))
2248            .is_empty());
2249    }
2250
2251    #[test]
2252    fn disk_round_trip_preserves_postings_and_files() {
2253        let dir = tempfile::tempdir().expect("create temp dir");
2254        let project = dir.path().join("project");
2255        fs::create_dir_all(&project).expect("create project dir");
2256        let file = project.join("src.txt");
2257        fs::write(&file, "abcdef").expect("write source");
2258
2259        let mut index = SearchIndex::build(&project);
2260        index.git_head = Some("deadbeef".to_string());
2261        let cache_dir = dir.path().join("cache");
2262        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2263
2264        let loaded =
2265            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2266        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2267        assert_eq!(loaded.files.len(), 1);
2268        assert_eq!(
2269            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2270            PathBuf::from("src.txt")
2271        );
2272        assert_eq!(loaded.postings.len(), index.postings.len());
2273        assert!(loaded
2274            .postings
2275            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2276    }
2277
2278    #[test]
2279    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2280        let root = PathBuf::from("/tmp/aft-project");
2281
2282        assert_eq!(
2283            cache_relative_path(&root, &root.join("src/lib.rs")),
2284            Some(PathBuf::from("src/lib.rs"))
2285        );
2286        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2287        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2288        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2289        assert_eq!(
2290            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2291            Some(root.join("src/lib.rs"))
2292        );
2293    }
2294
2295    #[test]
2296    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2297        let dir = tempfile::tempdir().expect("create temp dir");
2298        let project = dir.path().join("project");
2299        fs::create_dir_all(&project).expect("create project dir");
2300        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2301        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2302        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2303
2304        Command::new("git")
2305            .arg("init")
2306            .arg(&project)
2307            .status()
2308            .expect("git init");
2309        for args in [
2310            ["config", "user.email", "aft@example.invalid"],
2311            ["config", "user.name", "AFT Test"],
2312        ] {
2313            Command::new("git")
2314                .arg("-C")
2315                .arg(&project)
2316                .args(args)
2317                .status()
2318                .expect("git config");
2319        }
2320        Command::new("git")
2321            .arg("-C")
2322            .arg(&project)
2323            .args(["add", "."])
2324            .status()
2325            .expect("git add initial");
2326        Command::new("git")
2327            .arg("-C")
2328            .arg(&project)
2329            .args(["commit", "-m", "initial"])
2330            .status()
2331            .expect("git commit initial");
2332        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2333        let mut baseline = SearchIndex::build(&project);
2334        baseline.git_head = Some(previous.clone());
2335
2336        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2337        Command::new("git")
2338            .arg("-C")
2339            .arg(&project)
2340            .args(["add", "-A"])
2341            .status()
2342            .expect("git add rename");
2343        Command::new("git")
2344            .arg("-C")
2345            .arg(&project)
2346            .args(["commit", "-m", "rename"])
2347            .status()
2348            .expect("git commit rename");
2349        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2350
2351        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2352        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2353
2354        let refreshed = SearchIndex::rebuild_or_refresh(
2355            &project,
2356            DEFAULT_MAX_FILE_SIZE,
2357            Some(current),
2358            Some(baseline),
2359        );
2360
2361        assert!(!refreshed
2362            .path_to_id
2363            .contains_key(&canonical_project.join("old.txt")));
2364        assert!(refreshed
2365            .path_to_id
2366            .contains_key(&canonical_project.join("new.txt")));
2367        assert!(refreshed
2368            .path_to_id
2369            .contains_key(&canonical_project.join("untracked.txt")));
2370        let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2371        assert_eq!(matches.matches.len(), 1);
2372    }
2373
2374    #[test]
2375    fn read_from_disk_rejects_corrupt_postings_checksum() {
2376        let dir = tempfile::tempdir().expect("create temp dir");
2377        let project = dir.path().join("project");
2378        fs::create_dir_all(&project).expect("create project dir");
2379        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2380
2381        let index = SearchIndex::build(&project);
2382        let cache_dir = dir.path().join("cache");
2383        index.write_to_disk(&cache_dir, None);
2384
2385        let cache_path = cache_dir.join("cache.bin");
2386        let mut bytes = fs::read(&cache_path).expect("read cache");
2387        let middle = bytes.len() / 2;
2388        bytes[middle] ^= 0xff;
2389        fs::write(&cache_path, bytes).expect("write corrupted cache");
2390
2391        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2392    }
2393
2394    #[test]
2395    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2396        let dir = tempfile::tempdir().expect("create temp dir");
2397        let project = dir.path().join("project");
2398        fs::create_dir_all(&project).expect("create project dir");
2399        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2400
2401        let index = SearchIndex::build(&project);
2402        let cache_dir = dir.path().join("cache");
2403        index.write_to_disk(&cache_dir, None);
2404
2405        assert!(cache_dir.join("cache.bin").is_file());
2406        assert!(fs::read_dir(&cache_dir)
2407            .expect("read cache dir")
2408            .all(|entry| !entry
2409                .expect("cache entry")
2410                .file_name()
2411                .to_string_lossy()
2412                .contains(".tmp.")));
2413    }
2414
2415    #[test]
2416    fn concurrent_search_index_writes_do_not_corrupt() {
2417        let dir = tempfile::tempdir().expect("create temp dir");
2418        let project = dir.path().join("project");
2419        fs::create_dir_all(&project).expect("create project dir");
2420        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2421        let cache_dir = dir.path().join("cache");
2422
2423        let a_project = project.clone();
2424        let a_cache = cache_dir.clone();
2425        let a = std::thread::spawn(move || {
2426            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2427            let index = SearchIndex::build(&a_project);
2428            index.write_to_disk(&a_cache, None);
2429        });
2430        let b_project = project.clone();
2431        let b_cache = cache_dir.clone();
2432        let b = std::thread::spawn(move || {
2433            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2434            let index = SearchIndex::build(&b_project);
2435            index.write_to_disk(&b_cache, None);
2436        });
2437        a.join().expect("writer a");
2438        b.join().expect("writer b");
2439
2440        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2441    }
2442
2443    #[test]
2444    fn search_index_atomic_rename_survives_partial_write() {
2445        let dir = tempfile::tempdir().expect("create temp dir");
2446        let cache_dir = dir.path().join("cache");
2447        fs::create_dir_all(&cache_dir).expect("create cache dir");
2448        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2449
2450        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2451    }
2452
2453    #[test]
2454    fn project_cache_key_includes_checkout_path() {
2455        let dir = tempfile::tempdir().expect("create temp dir");
2456        let source = dir.path().join("source");
2457        fs::create_dir_all(&source).expect("create source repo dir");
2458        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2459
2460        assert!(Command::new("git")
2461            .current_dir(&source)
2462            .args(["init"])
2463            .status()
2464            .expect("init git repo")
2465            .success());
2466        assert!(Command::new("git")
2467            .current_dir(&source)
2468            .args(["add", "."])
2469            .status()
2470            .expect("git add")
2471            .success());
2472        assert!(Command::new("git")
2473            .current_dir(&source)
2474            .args([
2475                "-c",
2476                "user.name=AFT Tests",
2477                "-c",
2478                "user.email=aft-tests@example.com",
2479                "commit",
2480                "-m",
2481                "initial",
2482            ])
2483            .status()
2484            .expect("git commit")
2485            .success());
2486
2487        let clone = dir.path().join("clone");
2488        assert!(Command::new("git")
2489            .args(["clone", "--quiet"])
2490            .arg(&source)
2491            .arg(&clone)
2492            .status()
2493            .expect("git clone")
2494            .success());
2495
2496        let source_key = project_cache_key(&source);
2497        let clone_key = project_cache_key(&clone);
2498
2499        assert_eq!(source_key.len(), 16);
2500        assert_eq!(clone_key.len(), 16);
2501        // Same repo (same root commit) → same cache key regardless of clone path
2502        assert_eq!(source_key, clone_key);
2503    }
2504
2505    #[test]
2506    fn git_head_unchanged_picks_up_local_edits() {
2507        let dir = tempfile::tempdir().expect("create temp dir");
2508        let project = dir.path().join("repo");
2509        fs::create_dir_all(&project).expect("create repo dir");
2510        let file = project.join("tracked.txt");
2511        fs::write(&file, "oldtoken\n").expect("write file");
2512        assert!(Command::new("git")
2513            .current_dir(&project)
2514            .arg("init")
2515            .status()
2516            .unwrap()
2517            .success());
2518        assert!(Command::new("git")
2519            .current_dir(&project)
2520            .args(["add", "."])
2521            .status()
2522            .unwrap()
2523            .success());
2524        assert!(Command::new("git")
2525            .current_dir(&project)
2526            .args([
2527                "-c",
2528                "user.name=AFT Tests",
2529                "-c",
2530                "user.email=aft-tests@example.com",
2531                "commit",
2532                "-m",
2533                "initial"
2534            ])
2535            .status()
2536            .unwrap()
2537            .success());
2538        let head = current_git_head(&project);
2539        let mut baseline = SearchIndex::build(&project);
2540        baseline.git_head = head.clone();
2541        fs::write(&file, "newtoken\n").expect("edit tracked file");
2542
2543        let refreshed =
2544            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2545        let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
2546
2547        assert_eq!(result.total_matches, 1);
2548    }
2549
2550    #[test]
2551    fn non_git_project_reuses_cache_when_files_unchanged() {
2552        let dir = tempfile::tempdir().expect("create temp dir");
2553        let project = dir.path().join("project");
2554        fs::create_dir_all(&project).expect("create project dir");
2555        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2556        let baseline = SearchIndex::build(&project);
2557        let baseline_file_count = baseline.file_count();
2558
2559        let refreshed =
2560            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2561
2562        assert_eq!(refreshed.file_count(), baseline_file_count);
2563        assert_eq!(
2564            refreshed
2565                .grep("unchangedtoken", true, &[], &[], &project, 10)
2566                .total_matches,
2567            1
2568        );
2569    }
2570
2571    #[test]
2572    fn resolve_search_scope_disables_index_for_external_path() {
2573        let dir = tempfile::tempdir().expect("create temp dir");
2574        let project = dir.path().join("project");
2575        let outside = dir.path().join("outside");
2576        fs::create_dir_all(&project).expect("create project dir");
2577        fs::create_dir_all(&outside).expect("create outside dir");
2578
2579        let scope = resolve_search_scope(&project, outside.to_str());
2580
2581        assert_eq!(
2582            scope.root,
2583            fs::canonicalize(&outside).expect("canonicalize outside")
2584        );
2585        assert!(!scope.use_index);
2586    }
2587
2588    #[test]
2589    fn grep_filters_matches_to_search_root() {
2590        let dir = tempfile::tempdir().expect("create temp dir");
2591        let project = dir.path().join("project");
2592        let src = project.join("src");
2593        let docs = project.join("docs");
2594        fs::create_dir_all(&src).expect("create src dir");
2595        fs::create_dir_all(&docs).expect("create docs dir");
2596        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2597        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2598
2599        let index = SearchIndex::build(&project);
2600        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2601
2602        assert_eq!(result.files_searched, 1);
2603        assert_eq!(result.files_with_matches, 1);
2604        assert_eq!(result.matches.len(), 1);
2605        // Index stores canonicalized paths; on macOS /var → /private/var
2606        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2607        assert_eq!(result.matches[0].file, expected);
2608    }
2609
2610    #[test]
2611    fn grep_deduplicates_multiple_matches_on_same_line() {
2612        let dir = tempfile::tempdir().expect("create temp dir");
2613        let project = dir.path().join("project");
2614        let src = project.join("src");
2615        fs::create_dir_all(&src).expect("create src dir");
2616        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2617
2618        let index = SearchIndex::build(&project);
2619        let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2620
2621        assert_eq!(result.total_matches, 1);
2622        assert_eq!(result.matches.len(), 1);
2623    }
2624
2625    #[test]
2626    fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
2627        let dir = tempfile::tempdir().expect("create temp dir");
2628        let project = dir.path().join("project");
2629        fs::create_dir_all(&project).expect("create project dir");
2630        let file = project.join("unicode.txt");
2631        fs::write(&file, "äbc\n").expect("write unicode file");
2632
2633        let index = SearchIndex::build(&project);
2634        let result = index.grep("Äbc", false, &[], &[], &project, 10);
2635
2636        assert_eq!(result.total_matches, 1);
2637        assert_eq!(result.matches.len(), 1);
2638        assert_eq!(
2639            result.matches[0].file,
2640            fs::canonicalize(file).expect("canonicalize unicode file")
2641        );
2642    }
2643
2644    #[test]
2645    fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
2646        let dir = tempfile::tempdir().expect("create temp dir");
2647        let project = dir.path().join("project");
2648        fs::create_dir_all(&project).expect("create project dir");
2649        let file = project.join("tokens.txt");
2650        let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
2651        fs::write(&file, "alpha").expect("write original file");
2652        filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
2653
2654        let baseline = SearchIndex::build(&project);
2655        fs::write(&file, "bravo").expect("write same-size edit");
2656        filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
2657
2658        let refreshed =
2659            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2660        let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
2661        let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
2662        let refreshed_id = *refreshed
2663            .path_to_id
2664            .get(&canonical_file)
2665            .expect("file remains indexed");
2666
2667        assert_eq!(result.total_matches, 1);
2668        assert!(refreshed
2669            .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
2670            .contains(&refreshed_id));
2671        assert!(!refreshed
2672            .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
2673            .contains(&refreshed_id));
2674    }
2675
2676    #[test]
2677    fn grep_reports_total_matches_before_truncation() {
2678        let dir = tempfile::tempdir().expect("create temp dir");
2679        let project = dir.path().join("project");
2680        let src = project.join("src");
2681        fs::create_dir_all(&src).expect("create src dir");
2682        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2683
2684        let index = SearchIndex::build(&project);
2685        let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
2686
2687        assert_eq!(result.total_matches, 2);
2688        assert_eq!(result.matches.len(), 1);
2689        assert!(result.truncated);
2690    }
2691
2692    #[test]
2693    fn glob_filters_results_to_search_root() {
2694        let dir = tempfile::tempdir().expect("create temp dir");
2695        let project = dir.path().join("project");
2696        let src = project.join("src");
2697        let scripts = project.join("scripts");
2698        fs::create_dir_all(&src).expect("create src dir");
2699        fs::create_dir_all(&scripts).expect("create scripts dir");
2700        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2701        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2702
2703        let index = SearchIndex::build(&project);
2704        let files = index.glob("**/*.rs", &src);
2705
2706        assert_eq!(
2707            files,
2708            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2709        );
2710    }
2711
2712    #[test]
2713    fn glob_includes_hidden_and_binary_files() {
2714        let dir = tempfile::tempdir().expect("create temp dir");
2715        let project = dir.path().join("project");
2716        let hidden_dir = project.join(".hidden");
2717        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2718        let hidden_file = hidden_dir.join("data.bin");
2719        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2720
2721        let index = SearchIndex::build(&project);
2722        let files = index.glob("**/*.bin", &project);
2723
2724        assert_eq!(
2725            files,
2726            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2727        );
2728    }
2729
2730    #[test]
2731    fn read_from_disk_rejects_invalid_nanos() {
2732        let dir = tempfile::tempdir().expect("create temp dir");
2733        let cache_dir = dir.path().join("cache");
2734        fs::create_dir_all(&cache_dir).expect("create cache dir");
2735
2736        let mut postings = Vec::new();
2737        postings.extend_from_slice(INDEX_MAGIC);
2738        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2739        postings.extend_from_slice(&0u32.to_le_bytes());
2740        postings.extend_from_slice(&1u32.to_le_bytes());
2741        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2742        postings.extend_from_slice(&1u32.to_le_bytes());
2743        postings.extend_from_slice(b"/");
2744        postings.push(0u8);
2745        postings.extend_from_slice(&1u32.to_le_bytes());
2746        postings.extend_from_slice(&0u64.to_le_bytes());
2747        postings.extend_from_slice(&0u64.to_le_bytes());
2748        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2749        postings.extend_from_slice(b"a");
2750        postings.extend_from_slice(&0u64.to_le_bytes());
2751
2752        let mut lookup = Vec::new();
2753        lookup.extend_from_slice(LOOKUP_MAGIC);
2754        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2755        lookup.extend_from_slice(&0u32.to_le_bytes());
2756
2757        let postings_checksum = crc32fast::hash(&postings);
2758        postings.extend_from_slice(&postings_checksum.to_le_bytes());
2759        let lookup_checksum = crc32fast::hash(&lookup);
2760        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2761        let mut cache = Vec::new();
2762        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2763        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2764        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2765        cache.extend_from_slice(&postings);
2766        cache.extend_from_slice(&lookup);
2767        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2768
2769        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2770    }
2771
2772    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2773    /// changed between cmp() calls.
2774    ///
2775    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2776    /// which does a `stat()` syscall. If the file was deleted, modified, or
2777    /// touched mid-sort, the comparator returned different values for the
2778    /// same input pair on different invocations. Rust's slice::sort detects
2779    /// this and panics with "user-provided comparison function does not
2780    /// correctly implement a total order".
2781    ///
2782    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2783    /// bridge invalidated files in parallel with grep's sort path. This
2784    /// test simulates the worst case: most paths don't exist (Err from
2785    /// fs::metadata) and sort still completes successfully.
2786    #[test]
2787    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2788        // Mix of existing and non-existing paths in deliberately
2789        // non-monotonic order — pre-fix, the sort would call stat() at
2790        // least N log N times and any flakiness would trigger the panic.
2791        let dir = tempfile::tempdir().expect("create tempdir");
2792        let mut paths: Vec<PathBuf> = Vec::new();
2793        for i in 0..30 {
2794            // Half exist, half don't.
2795            let path = if i % 2 == 0 {
2796                let p = dir.path().join(format!("real-{i}.rs"));
2797                fs::write(&p, format!("// {i}\n")).expect("write");
2798                p
2799            } else {
2800                dir.path().join(format!("missing-{i}.rs"))
2801            };
2802            paths.push(path);
2803        }
2804
2805        // Run the sort many times to maximise the chance of catching any
2806        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2807        for _ in 0..50 {
2808            let mut copy = paths.clone();
2809            sort_paths_by_mtime_desc(&mut copy);
2810            assert_eq!(copy.len(), paths.len());
2811        }
2812    }
2813
2814    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2815    /// the same conditions as sort_paths_by_mtime_desc. See the
2816    /// sort_paths_... test above for the full rationale.
2817    #[test]
2818    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2819        let dir = tempfile::tempdir().expect("create tempdir");
2820        let mut matches: Vec<GrepMatch> = Vec::new();
2821        for i in 0..30 {
2822            let file = if i % 2 == 0 {
2823                let p = dir.path().join(format!("real-{i}.rs"));
2824                fs::write(&p, format!("// {i}\n")).expect("write");
2825                p
2826            } else {
2827                dir.path().join(format!("missing-{i}.rs"))
2828            };
2829            matches.push(GrepMatch {
2830                file,
2831                line: u32::try_from(i).unwrap_or(0),
2832                column: 0,
2833                line_text: format!("match {i}"),
2834                match_text: format!("match {i}"),
2835            });
2836        }
2837
2838        for _ in 0..50 {
2839            let mut copy = matches.clone();
2840            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2841            assert_eq!(copy.len(), matches.len());
2842        }
2843    }
2844}
aft/search_index.rs

aft/
search_index.rs