aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20
21const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
22const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
23const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
24const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
25const INDEX_VERSION: u32 = 3;
26const PREVIEW_BYTES: usize = 8 * 1024;
27const EOF_SENTINEL: u8 = 0;
28const MAX_ENTRIES: usize = 10_000_000;
29const MIN_FILE_ENTRY_BYTES: usize = 57;
30const LOOKUP_ENTRY_BYTES: usize = 16;
31const POSTING_BYTES: usize = 6;
32static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
33
34pub struct CacheLock {
35    _guard: fs_lock::LockGuard,
36}
37
38impl CacheLock {
39    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
40        fs::create_dir_all(cache_dir)?;
41        let path = cache_dir.join("cache.lock");
42        let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
43            .lock()
44            .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
45        fs_lock::try_acquire(&path, Duration::from_secs(2))
46            .map(|guard| Self { _guard: guard })
47            .map_err(|error| match error {
48                fs_lock::AcquireError::Timeout => {
49                    std::io::Error::other("timed out acquiring search cache lock")
50                }
51                fs_lock::AcquireError::Io(error) => error,
52            })
53    }
54}
55
56#[derive(Clone, Debug)]
57pub struct SearchIndex {
58    pub postings: HashMap<u32, Vec<Posting>>,
59    pub files: Vec<FileEntry>,
60    pub path_to_id: HashMap<PathBuf, u32>,
61    pub ready: bool,
62    project_root: PathBuf,
63    git_head: Option<String>,
64    max_file_size: u64,
65    pub file_trigrams: HashMap<u32, Vec<u32>>,
66    unindexed_files: HashSet<u32>,
67}
68
69impl SearchIndex {
70    /// Number of indexed files.
71    pub fn file_count(&self) -> usize {
72        self.files.len()
73    }
74
75    /// Number of unique trigrams in the index.
76    pub fn trigram_count(&self) -> usize {
77        self.postings.len()
78    }
79
80    /// Compute distinct query trigrams from literal tokens.
81    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
82        query_trigrams_from_tokens(tokens)
83    }
84
85    /// Score-rank file candidates by lexical relevance to query trigrams.
86    pub fn lexical_rank(
87        &self,
88        query_trigrams: &[u32],
89        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
90        max_files: usize,
91    ) -> Vec<(PathBuf, f32)> {
92        if query_trigrams.is_empty() || max_files == 0 {
93            return Vec::new();
94        }
95
96        let mut non_zero: Vec<(u32, usize)> = query_trigrams
97            .iter()
98            .filter_map(|trigram| {
99                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
100                (posting_count > 0).then_some((*trigram, posting_count))
101            })
102            .collect();
103        if non_zero.is_empty() {
104            return Vec::new();
105        }
106
107        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
108        let selected_count = non_zero.len().min(3);
109        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
110
111        let mut candidate_ids = BTreeSet::new();
112        for (trigram, _) in non_zero.iter().take(selected_count) {
113            if let Some(postings) = self.postings.get(trigram) {
114                for posting in postings {
115                    if self.is_active_file(posting.file_id) {
116                        candidate_ids.insert(posting.file_id);
117                    }
118                }
119            }
120        }
121
122        let mut ranked = Vec::new();
123        for file_id in candidate_ids.into_iter().take(candidate_cap) {
124            let Some(entry) = self.files.get(file_id as usize) else {
125                continue;
126            };
127            if let Some(filter) = candidate_filter {
128                if !filter(&entry.path) {
129                    continue;
130                }
131            }
132            let score = lexical_score(self, query_trigrams, file_id);
133            if score > 0.0 {
134                ranked.push((entry.path.clone(), score));
135            }
136        }
137
138        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
139        ranked.truncate(max_files);
140        ranked
141    }
142}
143
144#[derive(Clone, Debug, PartialEq, Eq)]
145pub struct Posting {
146    pub file_id: u32,
147    pub next_mask: u8,
148    pub loc_mask: u8,
149}
150
151#[derive(Clone, Debug)]
152pub struct FileEntry {
153    pub path: PathBuf,
154    pub size: u64,
155    pub modified: SystemTime,
156    pub content_hash: blake3::Hash,
157}
158
159#[derive(Clone, Debug, PartialEq, Eq)]
160pub struct GrepMatch {
161    pub file: PathBuf,
162    pub line: u32,
163    pub column: u32,
164    pub line_text: String,
165    pub match_text: String,
166}
167
168#[derive(Clone, Debug)]
169pub struct GrepResult {
170    pub matches: Vec<GrepMatch>,
171    pub total_matches: usize,
172    pub files_searched: usize,
173    pub files_with_matches: usize,
174    pub index_status: IndexStatus,
175    pub truncated: bool,
176}
177
178#[derive(Clone, Copy, Debug, PartialEq, Eq)]
179pub enum IndexStatus {
180    Ready,
181    Building,
182    Fallback,
183}
184
185impl IndexStatus {
186    pub fn as_str(&self) -> &'static str {
187        match self {
188            IndexStatus::Ready => "Ready",
189            IndexStatus::Building => "Building",
190            IndexStatus::Fallback => "Fallback",
191        }
192    }
193}
194
195#[derive(Clone, Debug, Default)]
196pub struct RegexQuery {
197    pub and_trigrams: Vec<u32>,
198    pub or_groups: Vec<Vec<u32>>,
199    pub(crate) and_filters: HashMap<u32, PostingFilter>,
200    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
201}
202
203#[derive(Clone, Copy, Debug, Default)]
204pub(crate) struct PostingFilter {
205    next_mask: u8,
206    loc_mask: u8,
207}
208
209#[derive(Clone, Debug, Default)]
210struct QueryBuild {
211    and_runs: Vec<Vec<u8>>,
212    or_groups: Vec<Vec<Vec<u8>>>,
213}
214
215#[derive(Clone, Debug, Default)]
216pub(crate) struct PathFilters {
217    includes: Option<GlobSet>,
218    excludes: Option<GlobSet>,
219}
220
221#[derive(Clone, Debug)]
222pub(crate) struct SearchScope {
223    pub root: PathBuf,
224    pub use_index: bool,
225}
226
227#[derive(Clone, Debug)]
228struct SharedGrepMatch {
229    file: Arc<PathBuf>,
230    line: u32,
231    column: u32,
232    line_text: String,
233    match_text: String,
234}
235
236#[derive(Clone, Debug)]
237enum SearchMatcher {
238    Literal(LiteralSearch),
239    Regex(Regex),
240}
241
242#[derive(Clone, Debug)]
243enum LiteralSearch {
244    CaseSensitive(Vec<u8>),
245    AsciiCaseInsensitive(Vec<u8>),
246}
247
248impl SearchIndex {
249    pub fn new() -> Self {
250        SearchIndex {
251            postings: HashMap::new(),
252            files: Vec::new(),
253            path_to_id: HashMap::new(),
254            ready: false,
255            project_root: PathBuf::new(),
256            git_head: None,
257            max_file_size: DEFAULT_MAX_FILE_SIZE,
258            file_trigrams: HashMap::new(),
259            unindexed_files: HashSet::new(),
260        }
261    }
262
263    pub fn build(root: &Path) -> Self {
264        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
265    }
266
267    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
268        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
269        let mut index = SearchIndex {
270            project_root: project_root.clone(),
271            max_file_size,
272            ..SearchIndex::new()
273        };
274
275        let filters = PathFilters::default();
276        for path in walk_project_files(&project_root, &filters) {
277            index.update_file(&path);
278        }
279
280        index.git_head = current_git_head(&project_root);
281        index.ready = true;
282        index
283    }
284
285    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
286        self.remove_file(path);
287
288        let file_id = match self.allocate_file_id(path, content.len() as u64) {
289            Some(file_id) => file_id,
290            None => return,
291        };
292        if let Some(file) = self.files.get_mut(file_id as usize) {
293            file.content_hash = cache_freshness::hash_bytes(content);
294        }
295
296        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
297        for (trigram, next_char, position) in extract_trigrams(content) {
298            let entry = trigram_map.entry(trigram).or_default();
299            entry.next_mask |= mask_for_next_char(next_char);
300            entry.loc_mask |= mask_for_position(position);
301        }
302
303        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
304        for (trigram, filter) in trigram_map {
305            let postings = self.postings.entry(trigram).or_default();
306            postings.push(Posting {
307                file_id,
308                next_mask: filter.next_mask,
309                loc_mask: filter.loc_mask,
310            });
311            // Posting lists are kept sorted by file_id for binary search during
312            // intersection. Since file_ids are allocated incrementally, the new
313            // entry is usually already in order. Only sort when needed.
314            if postings.len() > 1
315                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
316            {
317                postings.sort_unstable_by_key(|p| p.file_id);
318            }
319            file_trigrams.push(trigram);
320        }
321
322        self.file_trigrams.insert(file_id, file_trigrams);
323        self.unindexed_files.remove(&file_id);
324    }
325
326    pub fn remove_file(&mut self, path: &Path) {
327        let Some(file_id) = self.path_to_id.remove(path) else {
328            return;
329        };
330
331        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
332            for trigram in trigrams {
333                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
334                    postings.retain(|posting| posting.file_id != file_id);
335                    postings.is_empty()
336                } else {
337                    false
338                };
339
340                if should_remove {
341                    self.postings.remove(&trigram);
342                }
343            }
344        }
345
346        self.unindexed_files.remove(&file_id);
347        if let Some(file) = self.files.get_mut(file_id as usize) {
348            file.path = PathBuf::new();
349            file.size = 0;
350            file.modified = UNIX_EPOCH;
351            file.content_hash = cache_freshness::zero_hash();
352        }
353    }
354
355    pub fn update_file(&mut self, path: &Path) {
356        self.remove_file(path);
357
358        let metadata = match fs::metadata(path) {
359            Ok(metadata) if metadata.is_file() => metadata,
360            _ => return,
361        };
362
363        if is_binary_path(path, metadata.len()) {
364            self.track_unindexed_file(path, &metadata);
365            return;
366        }
367
368        if metadata.len() > self.max_file_size {
369            self.track_unindexed_file(path, &metadata);
370            return;
371        }
372
373        let content = match fs::read(path) {
374            Ok(content) => content,
375            Err(_) => return,
376        };
377
378        if is_binary_bytes(&content) {
379            self.track_unindexed_file(path, &metadata);
380            return;
381        }
382
383        self.index_file(path, &content);
384    }
385
386    pub fn grep(
387        &self,
388        pattern: &str,
389        case_sensitive: bool,
390        include: &[String],
391        exclude: &[String],
392        search_root: &Path,
393        max_results: usize,
394    ) -> GrepResult {
395        self.search_grep(
396            pattern,
397            case_sensitive,
398            include,
399            exclude,
400            search_root,
401            max_results,
402        )
403    }
404
405    pub fn search_grep(
406        &self,
407        pattern: &str,
408        case_sensitive: bool,
409        include: &[String],
410        exclude: &[String],
411        search_root: &Path,
412        max_results: usize,
413    ) -> GrepResult {
414        // Detect if pattern is a plain literal (no regex metacharacters).
415        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
416        let is_literal = !pattern.chars().any(|c| {
417            matches!(
418                c,
419                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
420            )
421        });
422
423        let literal_search = if is_literal {
424            if case_sensitive {
425                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
426            } else if pattern.is_ascii() {
427                Some(LiteralSearch::AsciiCaseInsensitive(
428                    pattern
429                        .as_bytes()
430                        .iter()
431                        .map(|byte| byte.to_ascii_lowercase())
432                        .collect(),
433                ))
434            } else {
435                None
436            }
437        } else {
438            None
439        };
440
441        // Build the regex for non-literal patterns (or literal Unicode fallback).
442        let regex = if literal_search.is_some() {
443            None
444        } else {
445            let regex_pattern = if is_literal {
446                regex::escape(pattern)
447            } else {
448                pattern.to_string()
449            };
450            let mut builder = RegexBuilder::new(&regex_pattern);
451            builder.case_insensitive(!case_sensitive);
452            // Treat `^` and `$` as line anchors (grep semantics), not file anchors.
453            builder.multi_line(true);
454            match builder.build() {
455                Ok(r) => Some(r),
456                Err(_) => {
457                    return GrepResult {
458                        matches: Vec::new(),
459                        total_matches: 0,
460                        files_searched: 0,
461                        files_with_matches: 0,
462                        index_status: if self.ready {
463                            IndexStatus::Ready
464                        } else {
465                            IndexStatus::Building
466                        },
467                        truncated: false,
468                    };
469                }
470            }
471        };
472
473        let matcher = if let Some(literal_search) = literal_search {
474            SearchMatcher::Literal(literal_search)
475        } else {
476            SearchMatcher::Regex(
477                regex.expect("regex should exist when literal matcher is unavailable"),
478            )
479        };
480
481        let filters = match build_path_filters(include, exclude) {
482            Ok(filters) => filters,
483            Err(_) => PathFilters::default(),
484        };
485        let search_root = canonicalize_or_normalize(search_root);
486
487        let query = decompose_regex(pattern);
488        let candidate_ids = self.candidates(&query);
489
490        let candidate_files: Vec<&FileEntry> = candidate_ids
491            .into_iter()
492            .filter_map(|file_id| self.files.get(file_id as usize))
493            .filter(|file| !file.path.as_os_str().is_empty())
494            .filter(|file| is_within_search_root(&search_root, &file.path))
495            .filter(|file| filters.matches(&self.project_root, &file.path))
496            .collect();
497
498        let total_matches = AtomicUsize::new(0);
499        let files_searched = AtomicUsize::new(0);
500        let files_with_matches = AtomicUsize::new(0);
501        let truncated = AtomicBool::new(false);
502        let stop_after = max_results.saturating_mul(2);
503
504        let mut matches = if candidate_files.len() > 10 {
505            candidate_files
506                .par_iter()
507                .map(|file| {
508                    search_candidate_file(
509                        file,
510                        &matcher,
511                        max_results,
512                        stop_after,
513                        &total_matches,
514                        &files_searched,
515                        &files_with_matches,
516                        &truncated,
517                    )
518                })
519                .reduce(Vec::new, |mut left, mut right| {
520                    left.append(&mut right);
521                    left
522                })
523        } else {
524            let mut matches = Vec::new();
525            for file in candidate_files {
526                matches.extend(search_candidate_file(
527                    file,
528                    &matcher,
529                    max_results,
530                    stop_after,
531                    &total_matches,
532                    &files_searched,
533                    &files_with_matches,
534                    &truncated,
535                ));
536
537                if should_stop_search(&truncated, &total_matches, stop_after) {
538                    break;
539                }
540            }
541            matches
542        };
543
544        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
545            self.path_to_id
546                .get(path)
547                .and_then(|file_id| self.files.get(*file_id as usize))
548                .map(|file| file.modified)
549        });
550
551        let matches = matches
552            .into_iter()
553            .map(|matched| GrepMatch {
554                file: matched.file.as_ref().clone(),
555                line: matched.line,
556                column: matched.column,
557                line_text: matched.line_text,
558                match_text: matched.match_text,
559            })
560            .collect();
561
562        GrepResult {
563            total_matches: total_matches.load(Ordering::Relaxed),
564            matches,
565            files_searched: files_searched.load(Ordering::Relaxed),
566            files_with_matches: files_with_matches.load(Ordering::Relaxed),
567            index_status: if self.ready {
568                IndexStatus::Ready
569            } else {
570                IndexStatus::Building
571            },
572            truncated: truncated.load(Ordering::Relaxed),
573        }
574    }
575
576    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
577        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
578            Ok(filters) => filters,
579            Err(_) => return Vec::new(),
580        };
581        let search_root = canonicalize_or_normalize(search_root);
582        let mut entries = self
583            .files
584            .iter()
585            .filter(|file| !file.path.as_os_str().is_empty())
586            .filter(|file| is_within_search_root(&search_root, &file.path))
587            .filter(|file| filters.matches(&self.project_root, &file.path))
588            .map(|file| (file.path.clone(), file.modified))
589            .collect::<Vec<_>>();
590
591        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
592            right_mtime
593                .cmp(left_mtime)
594                .then_with(|| left_path.cmp(right_path))
595        });
596
597        entries.into_iter().map(|(path, _)| path).collect()
598    }
599
600    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
601        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
602            return self.active_file_ids();
603        }
604
605        let mut and_trigrams = query.and_trigrams.clone();
606        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
607
608        let mut current: Option<Vec<u32>> = None;
609
610        for trigram in and_trigrams {
611            let filter = query.and_filters.get(&trigram).copied();
612            let matches = self.postings_for_trigram(trigram, filter);
613            current = Some(match current.take() {
614                Some(existing) => intersect_sorted_ids(&existing, &matches),
615                None => matches,
616            });
617
618            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
619                break;
620            }
621        }
622
623        let mut current = current.unwrap_or_else(|| self.active_file_ids());
624
625        for (index, group) in query.or_groups.iter().enumerate() {
626            let mut group_matches = Vec::new();
627            let filters = query.or_filters.get(index);
628
629            for trigram in group {
630                let filter = filters.and_then(|filters| filters.get(trigram).copied());
631                let matches = self.postings_for_trigram(*trigram, filter);
632                if group_matches.is_empty() {
633                    group_matches = matches;
634                } else {
635                    group_matches = union_sorted_ids(&group_matches, &matches);
636                }
637            }
638
639            current = intersect_sorted_ids(&current, &group_matches);
640            if current.is_empty() {
641                break;
642            }
643        }
644
645        let mut unindexed = self
646            .unindexed_files
647            .iter()
648            .copied()
649            .filter(|file_id| self.is_active_file(*file_id))
650            .collect::<Vec<_>>();
651        if !unindexed.is_empty() {
652            unindexed.sort_unstable();
653            current = union_sorted_ids(&current, &unindexed);
654        }
655
656        current
657    }
658
659    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
660        if fs::create_dir_all(cache_dir).is_err() {
661            return;
662        }
663
664        let cache_path = cache_dir.join("cache.bin");
665        let tmp_cache = cache_dir.join(format!(
666            "cache.bin.tmp.{}.{}",
667            std::process::id(),
668            SystemTime::now()
669                .duration_since(UNIX_EPOCH)
670                .unwrap_or(Duration::ZERO)
671                .as_nanos()
672        ));
673
674        let active_ids = self.active_file_ids();
675        let mut id_map = HashMap::new();
676        for (new_id, old_id) in active_ids.iter().enumerate() {
677            let Ok(new_id_u32) = u32::try_from(new_id) else {
678                return;
679            };
680            id_map.insert(*old_id, new_id_u32);
681        }
682
683        let write_result = (|| -> std::io::Result<()> {
684            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
685
686            postings_writer.write_all(INDEX_MAGIC)?;
687            write_u32(&mut postings_writer, INDEX_VERSION)?;
688
689            let head = git_head.unwrap_or_default();
690            let root = self.project_root.to_string_lossy();
691            let head_len = u32::try_from(head.len())
692                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
693            let root_len = u32::try_from(root.len())
694                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
695            let file_count = u32::try_from(active_ids.len())
696                .map_err(|_| std::io::Error::other("too many files to cache"))?;
697
698            write_u32(&mut postings_writer, head_len)?;
699            write_u32(&mut postings_writer, root_len)?;
700            write_u64(&mut postings_writer, self.max_file_size)?;
701            write_u32(&mut postings_writer, file_count)?;
702            postings_writer.write_all(head.as_bytes())?;
703            postings_writer.write_all(root.as_bytes())?;
704
705            for old_id in &active_ids {
706                let Some(file) = self.files.get(*old_id as usize) else {
707                    return Err(std::io::Error::other("missing file entry for cache write"));
708                };
709                let path =
710                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
711                        std::io::Error::other(format!(
712                            "refusing to cache path outside project root: {}",
713                            file.path.display()
714                        ))
715                    })?;
716                let path = path.to_string_lossy();
717                let path_len = u32::try_from(path.len())
718                    .map_err(|_| std::io::Error::other("cached path too large"))?;
719                let modified = file
720                    .modified
721                    .duration_since(UNIX_EPOCH)
722                    .unwrap_or(Duration::ZERO);
723                let unindexed = if self.unindexed_files.contains(old_id) {
724                    1u8
725                } else {
726                    0u8
727                };
728
729                postings_writer.write_all(&[unindexed])?;
730                write_u32(&mut postings_writer, path_len)?;
731                write_u64(&mut postings_writer, file.size)?;
732                write_u64(&mut postings_writer, modified.as_secs())?;
733                write_u32(&mut postings_writer, modified.subsec_nanos())?;
734                postings_writer.write_all(file.content_hash.as_bytes())?;
735                postings_writer.write_all(path.as_bytes())?;
736            }
737
738            let mut lookup_entries = Vec::new();
739            let mut postings_blob = Vec::new();
740            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
741            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
742
743            for (trigram, postings) in sorted_postings {
744                let offset = u64::try_from(postings_blob.len())
745                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
746                let mut count = 0u32;
747
748                for posting in postings {
749                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
750                        continue;
751                    };
752
753                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
754                    postings_blob.push(posting.next_mask);
755                    postings_blob.push(posting.loc_mask);
756                    count = count.saturating_add(1);
757                }
758
759                if count > 0 {
760                    lookup_entries.push((*trigram, offset, count));
761                }
762            }
763
764            write_u64(
765                &mut postings_writer,
766                u64::try_from(postings_blob.len())
767                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
768            )?;
769            postings_writer.write_all(&postings_blob)?;
770            postings_writer.flush()?;
771            let mut postings_blob_file = postings_writer
772                .into_inner()
773                .map_err(|error| std::io::Error::other(error.to_string()))?
774                .into_inner();
775            let checksum = crc32fast::hash(&postings_blob_file);
776            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
777
778            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
779            let entry_count = u32::try_from(lookup_entries.len())
780                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
781
782            lookup_writer.write_all(LOOKUP_MAGIC)?;
783            write_u32(&mut lookup_writer, INDEX_VERSION)?;
784            write_u32(&mut lookup_writer, entry_count)?;
785
786            for (trigram, offset, count) in lookup_entries {
787                write_u32(&mut lookup_writer, trigram)?;
788                write_u64(&mut lookup_writer, offset)?;
789                write_u32(&mut lookup_writer, count)?;
790            }
791
792            lookup_writer.flush()?;
793            let mut lookup_blob_file = lookup_writer
794                .into_inner()
795                .map_err(|error| std::io::Error::other(error.to_string()))?
796                .into_inner();
797            let checksum = crc32fast::hash(&lookup_blob_file);
798            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
799
800            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
801            write_u32(&mut cache_writer, CACHE_MAGIC)?;
802            write_u32(&mut cache_writer, INDEX_VERSION)?;
803            write_u64(
804                &mut cache_writer,
805                u64::try_from(postings_blob_file.len())
806                    .map_err(|_| std::io::Error::other("postings section too large"))?,
807            )?;
808            cache_writer.write_all(&postings_blob_file)?;
809            cache_writer.write_all(&lookup_blob_file)?;
810            cache_writer.flush()?;
811            cache_writer.get_ref().sync_all()?;
812            drop(cache_writer);
813            fs::rename(&tmp_cache, &cache_path)?;
814
815            Ok(())
816        })();
817
818        if write_result.is_err() {
819            let _ = fs::remove_file(&tmp_cache);
820        }
821    }
822
823    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
824        debug_assert!(current_canonical_root.is_absolute());
825        let cache_path = cache_dir.join("cache.bin");
826        let cache_bytes = fs::read(&cache_path).ok()?;
827        if cache_bytes.len() < 16 {
828            return None;
829        }
830        let mut header = Cursor::new(&cache_bytes);
831        if read_u32(&mut header).ok()? != CACHE_MAGIC {
832            return None;
833        }
834        if read_u32(&mut header).ok()? != INDEX_VERSION {
835            return None;
836        }
837        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
838        let start = usize::try_from(header.position()).ok()?;
839        let postings_end = start.checked_add(postings_len_total)?;
840        if postings_end > cache_bytes.len() {
841            return None;
842        }
843        let postings_bytes = &cache_bytes[start..postings_end];
844        let lookup_bytes = &cache_bytes[postings_end..];
845        let lookup_len_total = lookup_bytes.len();
846        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
847        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
848        if postings_len_total < 4 || lookup_len_total < 4 {
849            return None;
850        }
851        verify_crc32_bytes_slice(postings_bytes).ok()?;
852        verify_crc32_bytes_slice(lookup_bytes).ok()?;
853
854        let mut magic = [0u8; 8];
855        postings_reader.read_exact(&mut magic).ok()?;
856        if &magic != INDEX_MAGIC {
857            return None;
858        }
859        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
860            return None;
861        }
862
863        let head_len = read_u32(&mut postings_reader).ok()? as usize;
864        let root_len = read_u32(&mut postings_reader).ok()? as usize;
865        let max_file_size = read_u64(&mut postings_reader).ok()?;
866        let file_count = read_u32(&mut postings_reader).ok()? as usize;
867        if file_count > MAX_ENTRIES {
868            return None;
869        }
870        let postings_body_len = postings_len_total.checked_sub(4)?;
871        let lookup_body_len = lookup_len_total.checked_sub(4)?;
872
873        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
874        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
875        if minimum_file_bytes > remaining_postings {
876            return None;
877        }
878
879        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
880            return None;
881        }
882        let mut head_bytes = vec![0u8; head_len];
883        postings_reader.read_exact(&mut head_bytes).ok()?;
884        let git_head = String::from_utf8(head_bytes)
885            .ok()
886            .filter(|head| !head.is_empty());
887
888        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
889            return None;
890        }
891        let mut root_bytes = vec![0u8; root_len];
892        postings_reader.read_exact(&mut root_bytes).ok()?;
893        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
894        let project_root = current_canonical_root.to_path_buf();
895
896        let mut files = Vec::with_capacity(file_count);
897        let mut path_to_id = HashMap::new();
898        let mut unindexed_files = HashSet::new();
899
900        for file_id in 0..file_count {
901            let mut unindexed = [0u8; 1];
902            postings_reader.read_exact(&mut unindexed).ok()?;
903            let path_len = read_u32(&mut postings_reader).ok()? as usize;
904            let size = read_u64(&mut postings_reader).ok()?;
905            let secs = read_u64(&mut postings_reader).ok()?;
906            let nanos = read_u32(&mut postings_reader).ok()?;
907            let mut hash_bytes = [0u8; 32];
908            postings_reader.read_exact(&mut hash_bytes).ok()?;
909            let content_hash = blake3::Hash::from_bytes(hash_bytes);
910            if nanos >= 1_000_000_000 {
911                return None;
912            }
913            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
914                return None;
915            }
916            let mut path_bytes = vec![0u8; path_len];
917            postings_reader.read_exact(&mut path_bytes).ok()?;
918            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
919            let full_path = cached_path_under_root(&project_root, &relative_path)?;
920            let file_id_u32 = u32::try_from(file_id).ok()?;
921
922            files.push(FileEntry {
923                path: full_path.clone(),
924                size,
925                modified: UNIX_EPOCH + Duration::new(secs, nanos),
926                content_hash,
927            });
928            path_to_id.insert(full_path, file_id_u32);
929            if unindexed[0] == 1 {
930                unindexed_files.insert(file_id_u32);
931            }
932        }
933
934        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
935        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
936        if postings_len > max_postings_bytes {
937            return None;
938        }
939        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
940            return None;
941        }
942        let mut postings_blob = vec![0u8; postings_len];
943        postings_reader.read_exact(&mut postings_blob).ok()?;
944
945        let mut lookup_magic = [0u8; 8];
946        lookup_reader.read_exact(&mut lookup_magic).ok()?;
947        if &lookup_magic != LOOKUP_MAGIC {
948            return None;
949        }
950        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
951            return None;
952        }
953        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
954        if entry_count > MAX_ENTRIES {
955            return None;
956        }
957        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
958        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
959        if minimum_lookup_bytes > remaining_lookup {
960            return None;
961        }
962
963        let mut postings = HashMap::new();
964        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
965
966        for _ in 0..entry_count {
967            let trigram = read_u32(&mut lookup_reader).ok()?;
968            let offset = read_u64(&mut lookup_reader).ok()? as usize;
969            let count = read_u32(&mut lookup_reader).ok()? as usize;
970            if count > MAX_ENTRIES {
971                return None;
972            }
973            let bytes_len = count.checked_mul(POSTING_BYTES)?;
974            let end = offset.checked_add(bytes_len)?;
975            if end > postings_blob.len() {
976                return None;
977            }
978
979            let mut trigram_postings = Vec::with_capacity(count);
980            for chunk in postings_blob[offset..end].chunks_exact(6) {
981                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
982                let posting = Posting {
983                    file_id,
984                    next_mask: chunk[4],
985                    loc_mask: chunk[5],
986                };
987                trigram_postings.push(posting.clone());
988                file_trigrams.entry(file_id).or_default().push(trigram);
989            }
990            postings.insert(trigram, trigram_postings);
991        }
992
993        Some(SearchIndex {
994            postings,
995            files,
996            path_to_id,
997            ready: false,
998            project_root,
999            git_head,
1000            max_file_size,
1001            file_trigrams,
1002            unindexed_files,
1003        })
1004    }
1005
1006    pub(crate) fn stored_git_head(&self) -> Option<&str> {
1007        self.git_head.as_deref()
1008    }
1009
1010    pub(crate) fn set_ready(&mut self, ready: bool) {
1011        self.ready = ready;
1012    }
1013
1014    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1015        self.git_head = current_head;
1016        verify_file_mtimes(self);
1017        self.ready = true;
1018    }
1019
1020    pub(crate) fn rebuild_or_refresh(
1021        root: &Path,
1022        max_file_size: u64,
1023        current_head: Option<String>,
1024        baseline: Option<SearchIndex>,
1025    ) -> Self {
1026        if let Some(mut baseline) = baseline {
1027            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1028            baseline.max_file_size = max_file_size;
1029
1030            if baseline.git_head == current_head || current_head.is_none() {
1031                // HEAD matches, but files may have changed on disk since the index was
1032                // last written (e.g., uncommitted edits, stash pop, manual file changes
1033                // while OpenCode was closed). Verify mtimes and re-index stale files.
1034                // Non-git projects also use this per-file (path, mtime, size)
1035                // fingerprint so unchanged trees reuse the disk cache instead of
1036                // rebuilding every configure.
1037                baseline.git_head = current_head;
1038                verify_file_mtimes(&mut baseline);
1039                baseline.ready = true;
1040                return baseline;
1041            }
1042
1043            if let (Some(previous), Some(current)) =
1044                (baseline.git_head.clone(), current_head.clone())
1045            {
1046                let project_root = baseline.project_root.clone();
1047                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1048                    baseline.git_head = Some(current);
1049                    verify_file_mtimes(&mut baseline);
1050                    baseline.ready = true;
1051                    return baseline;
1052                }
1053            }
1054        }
1055
1056        SearchIndex::build_with_limit(root, max_file_size)
1057    }
1058
1059    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1060        let file_id = u32::try_from(self.files.len()).ok()?;
1061        let metadata = fs::metadata(path).ok();
1062        let size = metadata
1063            .as_ref()
1064            .map_or(size_hint, |metadata| metadata.len());
1065        let modified = metadata
1066            .and_then(|metadata| metadata.modified().ok())
1067            .unwrap_or(UNIX_EPOCH);
1068
1069        self.files.push(FileEntry {
1070            path: path.to_path_buf(),
1071            size,
1072            modified,
1073            content_hash: cache_freshness::zero_hash(),
1074        });
1075        self.path_to_id.insert(path.to_path_buf(), file_id);
1076        Some(file_id)
1077    }
1078
1079    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1080        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1081            return;
1082        };
1083        self.unindexed_files.insert(file_id);
1084        self.file_trigrams.insert(file_id, Vec::new());
1085    }
1086
1087    fn active_file_ids(&self) -> Vec<u32> {
1088        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1089        ids.sort_unstable();
1090        ids
1091    }
1092
1093    fn is_active_file(&self, file_id: u32) -> bool {
1094        self.files
1095            .get(file_id as usize)
1096            .map(|file| !file.path.as_os_str().is_empty())
1097            .unwrap_or(false)
1098    }
1099
1100    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1101        let Some(postings) = self.postings.get(&trigram) else {
1102            return Vec::new();
1103        };
1104
1105        let mut matches = Vec::with_capacity(postings.len());
1106
1107        for posting in postings {
1108            if let Some(filter) = filter {
1109                // next_mask: bloom filter check — the character following this trigram in the
1110                // query must also appear after this trigram somewhere in the file.
1111                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1112                    continue;
1113                }
1114                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1115                // between consecutive trigram pairs, but is NOT used as a single-trigram
1116                // filter because the position in the query string has no relationship to
1117                // the position in the file. Using it here causes false negatives.
1118            }
1119            if self.is_active_file(posting.file_id) {
1120                matches.push(posting.file_id);
1121            }
1122        }
1123
1124        matches
1125    }
1126}
1127
1128fn search_candidate_file(
1129    file: &FileEntry,
1130    matcher: &SearchMatcher,
1131    max_results: usize,
1132    stop_after: usize,
1133    total_matches: &AtomicUsize,
1134    files_searched: &AtomicUsize,
1135    files_with_matches: &AtomicUsize,
1136    truncated: &AtomicBool,
1137) -> Vec<SharedGrepMatch> {
1138    if should_stop_search(truncated, total_matches, stop_after) {
1139        return Vec::new();
1140    }
1141
1142    let content = match read_indexed_file_bytes(&file.path) {
1143        Some(content) => content,
1144        None => return Vec::new(),
1145    };
1146    // Defense in depth: even though indexing tries to filter binaries via
1147    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1148    // query time. content_inspector is fast (~bytes-per-cycle on a small
1149    // preview) and this guarantees we never surface matches inside binary
1150    // files even if the indexer somehow let one through (e.g. file changed
1151    // between indexing and query).
1152    if is_binary_bytes(&content) {
1153        return Vec::new();
1154    }
1155    files_searched.fetch_add(1, Ordering::Relaxed);
1156
1157    let shared_path = Arc::new(file.path.clone());
1158    let mut matches = Vec::new();
1159    let mut line_starts = None;
1160    let mut seen_lines = HashSet::new();
1161    let mut matched_this_file = false;
1162
1163    match matcher {
1164        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1165            let finder = memchr::memmem::Finder::new(needle);
1166            let mut start = 0;
1167
1168            while let Some(position) = finder.find(&content[start..]) {
1169                if should_stop_search(truncated, total_matches, stop_after) {
1170                    break;
1171                }
1172
1173                let offset = start + position;
1174                start = offset + 1;
1175
1176                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1177                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1178                if !seen_lines.insert(line) {
1179                    continue;
1180                }
1181
1182                matched_this_file = true;
1183                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1184                if match_number > max_results {
1185                    truncated.store(true, Ordering::Relaxed);
1186                    break;
1187                }
1188
1189                let end = offset + needle.len();
1190                matches.push(SharedGrepMatch {
1191                    file: shared_path.clone(),
1192                    line,
1193                    column,
1194                    line_text,
1195                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1196                });
1197            }
1198        }
1199        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1200            let search_content = content.to_ascii_lowercase();
1201            let finder = memchr::memmem::Finder::new(needle);
1202            let mut start = 0;
1203
1204            while let Some(position) = finder.find(&search_content[start..]) {
1205                if should_stop_search(truncated, total_matches, stop_after) {
1206                    break;
1207                }
1208
1209                let offset = start + position;
1210                start = offset + 1;
1211
1212                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1213                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1214                if !seen_lines.insert(line) {
1215                    continue;
1216                }
1217
1218                matched_this_file = true;
1219                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1220                if match_number > max_results {
1221                    truncated.store(true, Ordering::Relaxed);
1222                    break;
1223                }
1224
1225                let end = offset + needle.len();
1226                matches.push(SharedGrepMatch {
1227                    file: shared_path.clone(),
1228                    line,
1229                    column,
1230                    line_text,
1231                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1232                });
1233            }
1234        }
1235        SearchMatcher::Regex(regex) => {
1236            for matched in regex.find_iter(&content) {
1237                if should_stop_search(truncated, total_matches, stop_after) {
1238                    break;
1239                }
1240
1241                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1242                let (line, column, line_text) =
1243                    line_details_bytes(&content, line_starts, matched.start());
1244                if !seen_lines.insert(line) {
1245                    continue;
1246                }
1247
1248                matched_this_file = true;
1249                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1250                if match_number > max_results {
1251                    truncated.store(true, Ordering::Relaxed);
1252                    break;
1253                }
1254
1255                matches.push(SharedGrepMatch {
1256                    file: shared_path.clone(),
1257                    line,
1258                    column,
1259                    line_text,
1260                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1261                });
1262            }
1263        }
1264    }
1265
1266    if matched_this_file {
1267        files_with_matches.fetch_add(1, Ordering::Relaxed);
1268    }
1269
1270    matches
1271}
1272
1273fn should_stop_search(
1274    truncated: &AtomicBool,
1275    total_matches: &AtomicUsize,
1276    stop_after: usize,
1277) -> bool {
1278    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1279}
1280
1281fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1282    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1283    let mut left_index = 0;
1284    let mut right_index = 0;
1285
1286    while left_index < left.len() && right_index < right.len() {
1287        match left[left_index].cmp(&right[right_index]) {
1288            std::cmp::Ordering::Less => left_index += 1,
1289            std::cmp::Ordering::Greater => right_index += 1,
1290            std::cmp::Ordering::Equal => {
1291                merged.push(left[left_index]);
1292                left_index += 1;
1293                right_index += 1;
1294            }
1295        }
1296    }
1297
1298    merged
1299}
1300
1301fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1302    let mut merged = Vec::with_capacity(left.len() + right.len());
1303    let mut left_index = 0;
1304    let mut right_index = 0;
1305
1306    while left_index < left.len() && right_index < right.len() {
1307        match left[left_index].cmp(&right[right_index]) {
1308            std::cmp::Ordering::Less => {
1309                merged.push(left[left_index]);
1310                left_index += 1;
1311            }
1312            std::cmp::Ordering::Greater => {
1313                merged.push(right[right_index]);
1314                right_index += 1;
1315            }
1316            std::cmp::Ordering::Equal => {
1317                merged.push(left[left_index]);
1318                left_index += 1;
1319                right_index += 1;
1320            }
1321        }
1322    }
1323
1324    merged.extend_from_slice(&left[left_index..]);
1325    merged.extend_from_slice(&right[right_index..]);
1326    merged
1327}
1328
1329pub fn decompose_regex(pattern: &str) -> RegexQuery {
1330    let hir = match regex_syntax::parse(pattern) {
1331        Ok(hir) => hir,
1332        Err(_) => return RegexQuery::default(),
1333    };
1334
1335    let build = build_query(&hir);
1336    build.into_query()
1337}
1338
1339pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1340    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1341}
1342
1343pub fn normalize_char(c: u8) -> u8 {
1344    c.to_ascii_lowercase()
1345}
1346
1347pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1348    if content.len() < 3 {
1349        return Vec::new();
1350    }
1351
1352    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1353    for start in 0..=content.len() - 3 {
1354        let trigram = pack_trigram(
1355            normalize_char(content[start]),
1356            normalize_char(content[start + 1]),
1357            normalize_char(content[start + 2]),
1358        );
1359        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1360        trigrams.push((trigram, next_char, start));
1361    }
1362    trigrams
1363}
1364
1365pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1366    let mut seen = HashSet::new();
1367    let mut out = Vec::new();
1368    for token in tokens {
1369        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1370            if seen.insert(trigram) {
1371                out.push(trigram);
1372            }
1373        }
1374    }
1375    out
1376}
1377
1378pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1379    if query_trigrams.is_empty() {
1380        return 0.0;
1381    }
1382
1383    let mut hits = 0u32;
1384    for &trigram in query_trigrams {
1385        if let Some(postings) = index.postings.get(&trigram) {
1386            if postings
1387                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1388                .is_ok()
1389            {
1390                hits += 1;
1391            }
1392        }
1393    }
1394
1395    if hits == 0 {
1396        return 0.0;
1397    }
1398
1399    let file_trigram_count = index
1400        .file_trigrams
1401        .get(&file_id)
1402        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1403    (hits as f32) / (1.0 + file_trigram_count.ln())
1404}
1405
1406pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1407    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1408    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1409        return PathBuf::from(override_dir)
1410            .join("index")
1411            .join(project_cache_key(project_root));
1412    }
1413    // Use configured storage dir (from plugin, XDG-compliant)
1414    if let Some(dir) = storage_dir {
1415        return dir.join("index").join(project_cache_key(project_root));
1416    }
1417    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1418    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1419    // If neither is set, fall back to a temp directory rather than `"."`
1420    // because the search-index code reads/writes absolute paths.
1421    let home = std::env::var_os("HOME")
1422        .or_else(|| std::env::var_os("USERPROFILE"))
1423        .map(PathBuf::from)
1424        .unwrap_or_else(std::env::temp_dir);
1425    home.join(".cache")
1426        .join("aft")
1427        .join("index")
1428        .join(project_cache_key(project_root))
1429}
1430
1431pub(crate) fn build_path_filters(
1432    include: &[String],
1433    exclude: &[String],
1434) -> Result<PathFilters, String> {
1435    Ok(PathFilters {
1436        includes: build_globset(include)?,
1437        excludes: build_globset(exclude)?,
1438    })
1439}
1440
1441pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1442    walk_project_files_from(root, root, filters)
1443}
1444
1445pub(crate) fn walk_project_files_from(
1446    filter_root: &Path,
1447    search_root: &Path,
1448    filters: &PathFilters,
1449) -> Vec<PathBuf> {
1450    let mut builder = WalkBuilder::new(search_root);
1451    builder
1452        .hidden(false)
1453        .git_ignore(true)
1454        .git_global(true)
1455        .git_exclude(true)
1456        .filter_entry(|entry| {
1457            let name = entry.file_name().to_string_lossy();
1458            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1459                return !matches!(
1460                    name.as_ref(),
1461                    "node_modules"
1462                        | "target"
1463                        | "venv"
1464                        | ".venv"
1465                        | ".git"
1466                        | "__pycache__"
1467                        | ".tox"
1468                        | "dist"
1469                        | "build"
1470                );
1471            }
1472            true
1473        });
1474
1475    let mut files = Vec::new();
1476    for entry in builder.build().filter_map(|entry| entry.ok()) {
1477        if !entry
1478            .file_type()
1479            .map_or(false, |file_type| file_type.is_file())
1480        {
1481            continue;
1482        }
1483        let path = entry.into_path();
1484        if filters.matches(filter_root, &path) {
1485            files.push(path);
1486        }
1487    }
1488
1489    sort_paths_by_mtime_desc(&mut files);
1490    files
1491}
1492
1493pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1494    let bytes = fs::read(path).ok()?;
1495    if is_binary_bytes(&bytes) {
1496        return None;
1497    }
1498    String::from_utf8(bytes).ok()
1499}
1500
1501fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1502    fs::read(path).ok()
1503}
1504
1505pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1506    path.strip_prefix(root)
1507        .map(PathBuf::from)
1508        .unwrap_or_else(|_| path.to_path_buf())
1509}
1510
1511pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1512    let normalized_root = normalize_path(root);
1513    let normalized_path = normalize_path(path);
1514    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1515    validate_cached_relative_path(relative)
1516}
1517
1518pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1519    let relative = validate_cached_relative_path(relative_path)?;
1520    let normalized_root = normalize_path(root);
1521    let full_path = normalize_path(&normalized_root.join(relative));
1522    full_path.starts_with(&normalized_root).then_some(full_path)
1523}
1524
1525pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1526    if path.is_absolute() {
1527        return None;
1528    }
1529
1530    let mut normalized = PathBuf::new();
1531    for component in path.components() {
1532        match component {
1533            Component::Normal(part) => normalized.push(part),
1534            Component::CurDir => {}
1535            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1536        }
1537    }
1538    (!normalized.as_os_str().is_empty()).then_some(normalized)
1539}
1540
1541/// Sort paths newest-first by mtime, falling back to lexicographic order.
1542///
1543/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1544/// `sort_by()` closure. That made the comparator non-deterministic — a
1545/// `stat()` syscall for the same path can return different values across
1546/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1547/// concurrent file-watcher activity), and Rust's slice::sort panics at
1548/// runtime when it detects a non-total-order comparator. CI hit this on
1549/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1550///
1551/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1552/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1553pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1554    use std::collections::HashMap;
1555    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1556    for path in paths.iter() {
1557        mtimes
1558            .entry(path.clone())
1559            .or_insert_with(|| path_modified_time(path));
1560    }
1561    paths.sort_by(|left, right| {
1562        let left_mtime = mtimes.get(left).and_then(|v| *v);
1563        let right_mtime = mtimes.get(right).and_then(|v| *v);
1564        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1565    });
1566}
1567
1568/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1569/// the sort. Same fix, applied to grep matches that share files.
1570pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1571    use std::collections::HashMap;
1572    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1573    for m in matches.iter() {
1574        mtimes.entry(m.file.clone()).or_insert_with(|| {
1575            let resolved = resolve_match_path(project_root, &m.file);
1576            path_modified_time(&resolved)
1577        });
1578    }
1579    matches.sort_by(|left, right| {
1580        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1581        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1582        right_mtime
1583            .cmp(&left_mtime)
1584            .then_with(|| left.file.cmp(&right.file))
1585            .then_with(|| left.line.cmp(&right.line))
1586            .then_with(|| left.column.cmp(&right.column))
1587    });
1588}
1589
1590/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1591/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1592/// table from the search index), but it can still return different values if
1593/// the file is modified mid-sort. Snapshot once.
1594fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1595    matches: &mut [SharedGrepMatch],
1596    modified_for_path: F,
1597) where
1598    F: Fn(&Path) -> Option<SystemTime>,
1599{
1600    use std::collections::HashMap;
1601    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1602    for m in matches.iter() {
1603        let path = m.file.as_path().to_path_buf();
1604        mtimes
1605            .entry(path.clone())
1606            .or_insert_with(|| modified_for_path(&path));
1607    }
1608    matches.sort_by(|left, right| {
1609        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1610        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1611        right_mtime
1612            .cmp(&left_mtime)
1613            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1614            .then_with(|| left.line.cmp(&right.line))
1615            .then_with(|| left.column.cmp(&right.column))
1616    });
1617}
1618
1619pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1620    let resolved_project_root = canonicalize_or_normalize(project_root);
1621    let root = match path {
1622        Some(path) => {
1623            let path = PathBuf::from(path);
1624            if path.is_absolute() {
1625                canonicalize_or_normalize(&path)
1626            } else {
1627                normalize_path(&resolved_project_root.join(path))
1628            }
1629        }
1630        None => resolved_project_root.clone(),
1631    };
1632
1633    let use_index = is_within_search_root(&resolved_project_root, &root);
1634    SearchScope { root, use_index }
1635}
1636
1637pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1638    content_inspector::inspect(content).is_binary()
1639}
1640
1641pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1642    run_git(root, &["rev-parse", "HEAD"])
1643}
1644
1645pub fn project_cache_key(project_root: &Path) -> String {
1646    use sha2::{Digest, Sha256};
1647
1648    let mut hasher = Sha256::new();
1649
1650    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1651        // Git repo: root commit is the unique identity.
1652        // Same repo cloned anywhere produces the same key.
1653        hasher.update(root_commit.as_bytes());
1654    } else {
1655        // Non-git project: use the canonical filesystem path as identity.
1656        let canonical_root = canonicalize_or_normalize(project_root);
1657        hasher.update(canonical_root.to_string_lossy().as_bytes());
1658    }
1659
1660    let digest = format!("{:x}", hasher.finalize());
1661    digest[..16].to_string()
1662}
1663
1664impl PathFilters {
1665    fn matches(&self, root: &Path, path: &Path) -> bool {
1666        let relative = to_glob_path(&relative_to_root(root, path));
1667        if self
1668            .includes
1669            .as_ref()
1670            .is_some_and(|includes| !includes.is_match(&relative))
1671        {
1672            return false;
1673        }
1674        if self
1675            .excludes
1676            .as_ref()
1677            .is_some_and(|excludes| excludes.is_match(&relative))
1678        {
1679            return false;
1680        }
1681        true
1682    }
1683}
1684
1685fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1686    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1687}
1688
1689fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1690    if path.is_absolute() {
1691        path.to_path_buf()
1692    } else {
1693        project_root.join(path)
1694    }
1695}
1696
1697fn path_modified_time(path: &Path) -> Option<SystemTime> {
1698    fs::metadata(path)
1699        .and_then(|metadata| metadata.modified())
1700        .ok()
1701}
1702
1703fn normalize_path(path: &Path) -> PathBuf {
1704    let mut result = PathBuf::new();
1705    for component in path.components() {
1706        match component {
1707            Component::ParentDir => {
1708                if !result.pop() {
1709                    result.push(component);
1710                }
1711            }
1712            Component::CurDir => {}
1713            _ => result.push(component),
1714        }
1715    }
1716    result
1717}
1718
1719/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1720/// since the index was last written. Also detect new files and deleted files.
1721fn verify_file_mtimes(index: &mut SearchIndex) {
1722    // Collect stale files (mtime mismatch or deleted)
1723    let mut stale_paths = Vec::new();
1724    for entry in &index.files {
1725        if entry.path.as_os_str().is_empty() {
1726            continue; // tombstoned entry
1727        }
1728        let cached = FileFreshness {
1729            mtime: entry.modified,
1730            size: entry.size,
1731            content_hash: entry.content_hash,
1732        };
1733        match cache_freshness::verify_file(&entry.path, &cached) {
1734            FreshnessVerdict::HotFresh | FreshnessVerdict::ContentFresh { .. } => {}
1735            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
1736                stale_paths.push(entry.path.clone())
1737            }
1738        }
1739    }
1740
1741    for entry in &mut index.files {
1742        if entry.path.as_os_str().is_empty() {
1743            continue;
1744        }
1745        let cached = FileFreshness {
1746            mtime: entry.modified,
1747            size: entry.size,
1748            content_hash: entry.content_hash,
1749        };
1750        if let FreshnessVerdict::ContentFresh {
1751            new_mtime,
1752            new_size,
1753        } = cache_freshness::verify_file(&entry.path, &cached)
1754        {
1755            entry.modified = new_mtime;
1756            entry.size = new_size;
1757        }
1758    }
1759
1760    // Re-index stale files
1761    for path in &stale_paths {
1762        index.update_file(path);
1763    }
1764
1765    // Detect new files not in the index
1766    let filters = PathFilters::default();
1767    for path in walk_project_files(&index.project_root, &filters) {
1768        if !index.path_to_id.contains_key(&path) {
1769            index.update_file(&path);
1770        }
1771    }
1772
1773    if !stale_paths.is_empty() {
1774        crate::slog_info!(
1775            "search index: refreshed {} stale file(s) from disk cache",
1776            stale_paths.len()
1777        );
1778    }
1779}
1780
1781fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1782    normalize_path(path).starts_with(normalize_path(search_root))
1783}
1784
1785impl QueryBuild {
1786    fn into_query(self) -> RegexQuery {
1787        let mut query = RegexQuery::default();
1788
1789        for run in self.and_runs {
1790            add_run_to_and_query(&mut query, &run);
1791        }
1792
1793        for group in self.or_groups {
1794            let mut trigrams = BTreeSet::new();
1795            let mut filters = HashMap::new();
1796            for run in group {
1797                for (trigram, filter) in trigram_filters(&run) {
1798                    trigrams.insert(trigram);
1799                    merge_filter(filters.entry(trigram).or_default(), filter);
1800                }
1801            }
1802            if !trigrams.is_empty() {
1803                query.or_groups.push(trigrams.into_iter().collect());
1804                query.or_filters.push(filters);
1805            }
1806        }
1807
1808        query
1809    }
1810}
1811
1812fn build_query(hir: &Hir) -> QueryBuild {
1813    match hir.kind() {
1814        HirKind::Literal(literal) => {
1815            if literal.0.len() >= 3 {
1816                QueryBuild {
1817                    and_runs: vec![literal.0.to_vec()],
1818                    or_groups: Vec::new(),
1819                }
1820            } else {
1821                QueryBuild::default()
1822            }
1823        }
1824        HirKind::Capture(capture) => build_query(&capture.sub),
1825        HirKind::Concat(parts) => {
1826            let mut build = QueryBuild::default();
1827            for part in parts {
1828                let part_build = build_query(part);
1829                build.and_runs.extend(part_build.and_runs);
1830                build.or_groups.extend(part_build.or_groups);
1831            }
1832            build
1833        }
1834        HirKind::Alternation(parts) => {
1835            let mut group = Vec::new();
1836            for part in parts {
1837                let Some(mut choices) = guaranteed_run_choices(part) else {
1838                    return QueryBuild::default();
1839                };
1840                group.append(&mut choices);
1841            }
1842            if group.is_empty() {
1843                QueryBuild::default()
1844            } else {
1845                QueryBuild {
1846                    and_runs: Vec::new(),
1847                    or_groups: vec![group],
1848                }
1849            }
1850        }
1851        HirKind::Repetition(repetition) => {
1852            if repetition.min == 0 {
1853                QueryBuild::default()
1854            } else {
1855                build_query(&repetition.sub)
1856            }
1857        }
1858        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1859    }
1860}
1861
1862fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1863    match hir.kind() {
1864        HirKind::Literal(literal) => {
1865            if literal.0.len() >= 3 {
1866                Some(vec![literal.0.to_vec()])
1867            } else {
1868                None
1869            }
1870        }
1871        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1872        HirKind::Concat(parts) => {
1873            let mut runs = Vec::new();
1874            for part in parts {
1875                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1876                    runs.append(&mut part_runs);
1877                }
1878            }
1879            if runs.is_empty() {
1880                None
1881            } else {
1882                Some(runs)
1883            }
1884        }
1885        HirKind::Alternation(parts) => {
1886            let mut runs = Vec::new();
1887            for part in parts {
1888                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1889                    return None;
1890                };
1891                runs.append(&mut part_runs);
1892            }
1893            if runs.is_empty() {
1894                None
1895            } else {
1896                Some(runs)
1897            }
1898        }
1899        HirKind::Repetition(repetition) => {
1900            if repetition.min == 0 {
1901                None
1902            } else {
1903                guaranteed_run_choices(&repetition.sub)
1904            }
1905        }
1906        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1907    }
1908}
1909
1910fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1911    for (trigram, filter) in trigram_filters(run) {
1912        if !query.and_trigrams.contains(&trigram) {
1913            query.and_trigrams.push(trigram);
1914        }
1915        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1916    }
1917}
1918
1919fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1920    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1921    for (trigram, next_char, position) in extract_trigrams(run) {
1922        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1923        if next_char != EOF_SENTINEL {
1924            entry.next_mask |= mask_for_next_char(next_char);
1925        }
1926        entry.loc_mask |= mask_for_position(position);
1927    }
1928    filters.into_iter().collect()
1929}
1930
1931fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1932    target.next_mask |= filter.next_mask;
1933    target.loc_mask |= filter.loc_mask;
1934}
1935
1936fn mask_for_next_char(next_char: u8) -> u8 {
1937    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1938    1u8 << bit
1939}
1940
1941fn mask_for_position(position: usize) -> u8 {
1942    1u8 << (position % 8)
1943}
1944
1945fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1946    if patterns.is_empty() {
1947        return Ok(None);
1948    }
1949
1950    let mut builder = GlobSetBuilder::new();
1951    for pattern in patterns {
1952        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1953        builder.add(glob);
1954    }
1955    builder.build().map(Some).map_err(|error| error.to_string())
1956}
1957
1958fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1959    let mut buffer = [0u8; 4];
1960    reader.read_exact(&mut buffer)?;
1961    Ok(u32::from_le_bytes(buffer))
1962}
1963
1964fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1965    let mut buffer = [0u8; 8];
1966    reader.read_exact(&mut buffer)?;
1967    Ok(u64::from_le_bytes(buffer))
1968}
1969
1970fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1971    writer.write_all(&value.to_le_bytes())
1972}
1973
1974fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1975    writer.write_all(&value.to_le_bytes())
1976}
1977
1978fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1979    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1980        return Err(std::io::Error::other("search index checksum missing"));
1981    };
1982    let expected = u32::from_le_bytes(*stored);
1983    let actual = crc32fast::hash(body);
1984    if actual != expected {
1985        return Err(std::io::Error::other("search index checksum mismatch"));
1986    }
1987    Ok(())
1988}
1989
1990fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
1991    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
1992    total_len.checked_sub(pos)
1993}
1994
1995fn run_git(root: &Path, args: &[&str]) -> Option<String> {
1996    let output = Command::new("git")
1997        .arg("-C")
1998        .arg(root)
1999        .args(args)
2000        .output()
2001        .ok()?;
2002    if !output.status.success() {
2003        return None;
2004    }
2005    let value = String::from_utf8(output.stdout).ok()?;
2006    let value = value.trim().to_string();
2007    if value.is_empty() {
2008        None
2009    } else {
2010        Some(value)
2011    }
2012}
2013
2014fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2015    let diff_range = format!("{}..{}", from, to);
2016    let output = match Command::new("git")
2017        .arg("-C")
2018        .arg(root)
2019        .args(["diff", "--name-status", "-M", &diff_range])
2020        .output()
2021    {
2022        Ok(output) => output,
2023        Err(_) => return false,
2024    };
2025
2026    if !output.status.success() {
2027        return false;
2028    }
2029
2030    let Ok(diff) = String::from_utf8(output.stdout) else {
2031        return false;
2032    };
2033
2034    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2035        let mut fields = line.split('\t');
2036        let Some(status) = fields.next() else {
2037            continue;
2038        };
2039
2040        if status.starts_with('R') {
2041            let Some(old_path) = fields
2042                .next()
2043                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2044            else {
2045                continue;
2046            };
2047            let Some(new_path) = fields
2048                .next()
2049                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2050            else {
2051                continue;
2052            };
2053            index.remove_file(&old_path);
2054            index.update_file(&new_path);
2055            continue;
2056        }
2057
2058        let Some(path) = fields
2059            .next()
2060            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2061        else {
2062            continue;
2063        };
2064        if status.starts_with('D') || !path.exists() {
2065            index.remove_file(&path);
2066        } else {
2067            index.update_file(&path);
2068        }
2069    }
2070
2071    true
2072}
2073
2074fn is_binary_path(path: &Path, size: u64) -> bool {
2075    if size == 0 {
2076        return false;
2077    }
2078
2079    let mut file = match File::open(path) {
2080        Ok(file) => file,
2081        Err(_) => return true,
2082    };
2083
2084    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2085    match file.read(&mut preview) {
2086        Ok(read) => is_binary_bytes(&preview[..read]),
2087        Err(_) => true,
2088    }
2089}
2090
2091fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2092    let mut starts = vec![0usize];
2093    for (index, byte) in content.iter().copied().enumerate() {
2094        if byte == b'\n' {
2095            starts.push(index + 1);
2096        }
2097    }
2098    starts
2099}
2100
2101fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2102    let line_index = match line_starts.binary_search(&offset) {
2103        Ok(index) => index,
2104        Err(index) => index.saturating_sub(1),
2105    };
2106    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2107    let line_end = content[line_start..]
2108        .iter()
2109        .position(|byte| *byte == b'\n')
2110        .map(|length| line_start + length)
2111        .unwrap_or(content.len());
2112    let mut line_slice = &content[line_start..line_end];
2113    if line_slice.ends_with(b"\r") {
2114        line_slice = &line_slice[..line_slice.len() - 1];
2115    }
2116    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2117    let column = String::from_utf8_lossy(&content[line_start..offset])
2118        .chars()
2119        .count() as u32
2120        + 1;
2121    (line_index as u32 + 1, column, line_text)
2122}
2123
2124fn to_glob_path(path: &Path) -> String {
2125    path.to_string_lossy().replace('\\', "/")
2126}
2127
2128#[cfg(test)]
2129mod tests {
2130    use std::process::Command;
2131
2132    use super::*;
2133
2134    #[test]
2135    fn extract_trigrams_tracks_next_char_and_position() {
2136        let trigrams = extract_trigrams(b"Rust");
2137        assert_eq!(trigrams.len(), 2);
2138        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2139        assert_eq!(
2140            trigrams[1],
2141            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2142        );
2143    }
2144
2145    #[test]
2146    fn decompose_regex_extracts_literals_and_alternations() {
2147        let query = decompose_regex("abc(def|ghi)xyz");
2148        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2149        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2150        assert_eq!(query.or_groups.len(), 1);
2151        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2152        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2153    }
2154
2155    #[test]
2156    fn candidates_intersect_posting_lists() {
2157        let mut index = SearchIndex::new();
2158        let dir = tempfile::tempdir().expect("create temp dir");
2159        let alpha = dir.path().join("alpha.txt");
2160        let beta = dir.path().join("beta.txt");
2161        fs::write(&alpha, "abcdef").expect("write alpha");
2162        fs::write(&beta, "abcxyz").expect("write beta");
2163        index.project_root = dir.path().to_path_buf();
2164        index.index_file(&alpha, b"abcdef");
2165        index.index_file(&beta, b"abcxyz");
2166
2167        let query = RegexQuery {
2168            and_trigrams: vec![
2169                pack_trigram(b'a', b'b', b'c'),
2170                pack_trigram(b'd', b'e', b'f'),
2171            ],
2172            ..RegexQuery::default()
2173        };
2174
2175        let candidates = index.candidates(&query);
2176        assert_eq!(candidates.len(), 1);
2177        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2178    }
2179
2180    #[test]
2181    fn candidates_apply_bloom_filters() {
2182        let mut index = SearchIndex::new();
2183        let dir = tempfile::tempdir().expect("create temp dir");
2184        let file = dir.path().join("sample.txt");
2185        fs::write(&file, "abcd efgh").expect("write sample");
2186        index.project_root = dir.path().to_path_buf();
2187        index.index_file(&file, b"abcd efgh");
2188
2189        let trigram = pack_trigram(b'a', b'b', b'c');
2190        let matching_filter = PostingFilter {
2191            next_mask: mask_for_next_char(b'd'),
2192            loc_mask: mask_for_position(0),
2193        };
2194        let non_matching_filter = PostingFilter {
2195            next_mask: mask_for_next_char(b'z'),
2196            loc_mask: mask_for_position(0),
2197        };
2198
2199        assert_eq!(
2200            index
2201                .postings_for_trigram(trigram, Some(matching_filter))
2202                .len(),
2203            1
2204        );
2205        assert!(index
2206            .postings_for_trigram(trigram, Some(non_matching_filter))
2207            .is_empty());
2208    }
2209
2210    #[test]
2211    fn disk_round_trip_preserves_postings_and_files() {
2212        let dir = tempfile::tempdir().expect("create temp dir");
2213        let project = dir.path().join("project");
2214        fs::create_dir_all(&project).expect("create project dir");
2215        let file = project.join("src.txt");
2216        fs::write(&file, "abcdef").expect("write source");
2217
2218        let mut index = SearchIndex::build(&project);
2219        index.git_head = Some("deadbeef".to_string());
2220        let cache_dir = dir.path().join("cache");
2221        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2222
2223        let loaded =
2224            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2225        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2226        assert_eq!(loaded.files.len(), 1);
2227        assert_eq!(
2228            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2229            PathBuf::from("src.txt")
2230        );
2231        assert_eq!(loaded.postings.len(), index.postings.len());
2232        assert!(loaded
2233            .postings
2234            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2235    }
2236
2237    #[test]
2238    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2239        let root = PathBuf::from("/tmp/aft-project");
2240
2241        assert_eq!(
2242            cache_relative_path(&root, &root.join("src/lib.rs")),
2243            Some(PathBuf::from("src/lib.rs"))
2244        );
2245        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2246        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2247        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2248        assert_eq!(
2249            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2250            Some(root.join("src/lib.rs"))
2251        );
2252    }
2253
2254    #[test]
2255    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2256        let dir = tempfile::tempdir().expect("create temp dir");
2257        let project = dir.path().join("project");
2258        fs::create_dir_all(&project).expect("create project dir");
2259        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2260        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2261        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2262
2263        Command::new("git")
2264            .arg("init")
2265            .arg(&project)
2266            .status()
2267            .expect("git init");
2268        for args in [
2269            ["config", "user.email", "aft@example.invalid"],
2270            ["config", "user.name", "AFT Test"],
2271        ] {
2272            Command::new("git")
2273                .arg("-C")
2274                .arg(&project)
2275                .args(args)
2276                .status()
2277                .expect("git config");
2278        }
2279        Command::new("git")
2280            .arg("-C")
2281            .arg(&project)
2282            .args(["add", "."])
2283            .status()
2284            .expect("git add initial");
2285        Command::new("git")
2286            .arg("-C")
2287            .arg(&project)
2288            .args(["commit", "-m", "initial"])
2289            .status()
2290            .expect("git commit initial");
2291        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2292        let mut baseline = SearchIndex::build(&project);
2293        baseline.git_head = Some(previous.clone());
2294
2295        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2296        Command::new("git")
2297            .arg("-C")
2298            .arg(&project)
2299            .args(["add", "-A"])
2300            .status()
2301            .expect("git add rename");
2302        Command::new("git")
2303            .arg("-C")
2304            .arg(&project)
2305            .args(["commit", "-m", "rename"])
2306            .status()
2307            .expect("git commit rename");
2308        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2309
2310        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2311        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2312
2313        let refreshed = SearchIndex::rebuild_or_refresh(
2314            &project,
2315            DEFAULT_MAX_FILE_SIZE,
2316            Some(current),
2317            Some(baseline),
2318        );
2319
2320        assert!(!refreshed
2321            .path_to_id
2322            .contains_key(&canonical_project.join("old.txt")));
2323        assert!(refreshed
2324            .path_to_id
2325            .contains_key(&canonical_project.join("new.txt")));
2326        assert!(refreshed
2327            .path_to_id
2328            .contains_key(&canonical_project.join("untracked.txt")));
2329        let matches =
2330            refreshed.search_grep("after local edit", true, &[], &[], &canonical_project, 10);
2331        assert_eq!(matches.matches.len(), 1);
2332    }
2333
2334    #[test]
2335    fn read_from_disk_rejects_corrupt_postings_checksum() {
2336        let dir = tempfile::tempdir().expect("create temp dir");
2337        let project = dir.path().join("project");
2338        fs::create_dir_all(&project).expect("create project dir");
2339        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2340
2341        let index = SearchIndex::build(&project);
2342        let cache_dir = dir.path().join("cache");
2343        index.write_to_disk(&cache_dir, None);
2344
2345        let cache_path = cache_dir.join("cache.bin");
2346        let mut bytes = fs::read(&cache_path).expect("read cache");
2347        let middle = bytes.len() / 2;
2348        bytes[middle] ^= 0xff;
2349        fs::write(&cache_path, bytes).expect("write corrupted cache");
2350
2351        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2352    }
2353
2354    #[test]
2355    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2356        let dir = tempfile::tempdir().expect("create temp dir");
2357        let project = dir.path().join("project");
2358        fs::create_dir_all(&project).expect("create project dir");
2359        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2360
2361        let index = SearchIndex::build(&project);
2362        let cache_dir = dir.path().join("cache");
2363        index.write_to_disk(&cache_dir, None);
2364
2365        assert!(cache_dir.join("cache.bin").is_file());
2366        assert!(fs::read_dir(&cache_dir)
2367            .expect("read cache dir")
2368            .all(|entry| !entry
2369                .expect("cache entry")
2370                .file_name()
2371                .to_string_lossy()
2372                .contains(".tmp.")));
2373    }
2374
2375    #[test]
2376    fn concurrent_search_index_writes_do_not_corrupt() {
2377        let dir = tempfile::tempdir().expect("create temp dir");
2378        let project = dir.path().join("project");
2379        fs::create_dir_all(&project).expect("create project dir");
2380        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2381        let cache_dir = dir.path().join("cache");
2382
2383        let a_project = project.clone();
2384        let a_cache = cache_dir.clone();
2385        let a = std::thread::spawn(move || {
2386            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2387            let index = SearchIndex::build(&a_project);
2388            index.write_to_disk(&a_cache, None);
2389        });
2390        let b_project = project.clone();
2391        let b_cache = cache_dir.clone();
2392        let b = std::thread::spawn(move || {
2393            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2394            let index = SearchIndex::build(&b_project);
2395            index.write_to_disk(&b_cache, None);
2396        });
2397        a.join().expect("writer a");
2398        b.join().expect("writer b");
2399
2400        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2401    }
2402
2403    #[test]
2404    fn search_index_atomic_rename_survives_partial_write() {
2405        let dir = tempfile::tempdir().expect("create temp dir");
2406        let cache_dir = dir.path().join("cache");
2407        fs::create_dir_all(&cache_dir).expect("create cache dir");
2408        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2409
2410        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2411    }
2412
2413    #[test]
2414    fn project_cache_key_includes_checkout_path() {
2415        let dir = tempfile::tempdir().expect("create temp dir");
2416        let source = dir.path().join("source");
2417        fs::create_dir_all(&source).expect("create source repo dir");
2418        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2419
2420        assert!(Command::new("git")
2421            .current_dir(&source)
2422            .args(["init"])
2423            .status()
2424            .expect("init git repo")
2425            .success());
2426        assert!(Command::new("git")
2427            .current_dir(&source)
2428            .args(["add", "."])
2429            .status()
2430            .expect("git add")
2431            .success());
2432        assert!(Command::new("git")
2433            .current_dir(&source)
2434            .args([
2435                "-c",
2436                "user.name=AFT Tests",
2437                "-c",
2438                "user.email=aft-tests@example.com",
2439                "commit",
2440                "-m",
2441                "initial",
2442            ])
2443            .status()
2444            .expect("git commit")
2445            .success());
2446
2447        let clone = dir.path().join("clone");
2448        assert!(Command::new("git")
2449            .args(["clone", "--quiet"])
2450            .arg(&source)
2451            .arg(&clone)
2452            .status()
2453            .expect("git clone")
2454            .success());
2455
2456        let source_key = project_cache_key(&source);
2457        let clone_key = project_cache_key(&clone);
2458
2459        assert_eq!(source_key.len(), 16);
2460        assert_eq!(clone_key.len(), 16);
2461        // Same repo (same root commit) → same cache key regardless of clone path
2462        assert_eq!(source_key, clone_key);
2463    }
2464
2465    #[test]
2466    fn git_head_unchanged_picks_up_local_edits() {
2467        let dir = tempfile::tempdir().expect("create temp dir");
2468        let project = dir.path().join("repo");
2469        fs::create_dir_all(&project).expect("create repo dir");
2470        let file = project.join("tracked.txt");
2471        fs::write(&file, "oldtoken\n").expect("write file");
2472        assert!(Command::new("git")
2473            .current_dir(&project)
2474            .arg("init")
2475            .status()
2476            .unwrap()
2477            .success());
2478        assert!(Command::new("git")
2479            .current_dir(&project)
2480            .args(["add", "."])
2481            .status()
2482            .unwrap()
2483            .success());
2484        assert!(Command::new("git")
2485            .current_dir(&project)
2486            .args([
2487                "-c",
2488                "user.name=AFT Tests",
2489                "-c",
2490                "user.email=aft-tests@example.com",
2491                "commit",
2492                "-m",
2493                "initial"
2494            ])
2495            .status()
2496            .unwrap()
2497            .success());
2498        let head = current_git_head(&project);
2499        let mut baseline = SearchIndex::build(&project);
2500        baseline.git_head = head.clone();
2501        fs::write(&file, "newtoken\n").expect("edit tracked file");
2502
2503        let refreshed =
2504            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2505        let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2506
2507        assert_eq!(result.total_matches, 1);
2508    }
2509
2510    #[test]
2511    fn non_git_project_reuses_cache_when_files_unchanged() {
2512        let dir = tempfile::tempdir().expect("create temp dir");
2513        let project = dir.path().join("project");
2514        fs::create_dir_all(&project).expect("create project dir");
2515        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2516        let baseline = SearchIndex::build(&project);
2517        let baseline_file_count = baseline.file_count();
2518
2519        let refreshed =
2520            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2521
2522        assert_eq!(refreshed.file_count(), baseline_file_count);
2523        assert_eq!(
2524            refreshed
2525                .search_grep("unchangedtoken", true, &[], &[], &project, 10)
2526                .total_matches,
2527            1
2528        );
2529    }
2530
2531    #[test]
2532    fn resolve_search_scope_disables_index_for_external_path() {
2533        let dir = tempfile::tempdir().expect("create temp dir");
2534        let project = dir.path().join("project");
2535        let outside = dir.path().join("outside");
2536        fs::create_dir_all(&project).expect("create project dir");
2537        fs::create_dir_all(&outside).expect("create outside dir");
2538
2539        let scope = resolve_search_scope(&project, outside.to_str());
2540
2541        assert_eq!(
2542            scope.root,
2543            fs::canonicalize(&outside).expect("canonicalize outside")
2544        );
2545        assert!(!scope.use_index);
2546    }
2547
2548    #[test]
2549    fn grep_filters_matches_to_search_root() {
2550        let dir = tempfile::tempdir().expect("create temp dir");
2551        let project = dir.path().join("project");
2552        let src = project.join("src");
2553        let docs = project.join("docs");
2554        fs::create_dir_all(&src).expect("create src dir");
2555        fs::create_dir_all(&docs).expect("create docs dir");
2556        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2557        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2558
2559        let index = SearchIndex::build(&project);
2560        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2561
2562        assert_eq!(result.files_searched, 1);
2563        assert_eq!(result.files_with_matches, 1);
2564        assert_eq!(result.matches.len(), 1);
2565        // Index stores canonicalized paths; on macOS /var → /private/var
2566        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2567        assert_eq!(result.matches[0].file, expected);
2568    }
2569
2570    #[test]
2571    fn grep_deduplicates_multiple_matches_on_same_line() {
2572        let dir = tempfile::tempdir().expect("create temp dir");
2573        let project = dir.path().join("project");
2574        let src = project.join("src");
2575        fs::create_dir_all(&src).expect("create src dir");
2576        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2577
2578        let index = SearchIndex::build(&project);
2579        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2580
2581        assert_eq!(result.total_matches, 1);
2582        assert_eq!(result.matches.len(), 1);
2583    }
2584
2585    #[test]
2586    fn grep_reports_total_matches_before_truncation() {
2587        let dir = tempfile::tempdir().expect("create temp dir");
2588        let project = dir.path().join("project");
2589        let src = project.join("src");
2590        fs::create_dir_all(&src).expect("create src dir");
2591        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2592
2593        let index = SearchIndex::build(&project);
2594        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2595
2596        assert_eq!(result.total_matches, 2);
2597        assert_eq!(result.matches.len(), 1);
2598        assert!(result.truncated);
2599    }
2600
2601    #[test]
2602    fn glob_filters_results_to_search_root() {
2603        let dir = tempfile::tempdir().expect("create temp dir");
2604        let project = dir.path().join("project");
2605        let src = project.join("src");
2606        let scripts = project.join("scripts");
2607        fs::create_dir_all(&src).expect("create src dir");
2608        fs::create_dir_all(&scripts).expect("create scripts dir");
2609        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2610        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2611
2612        let index = SearchIndex::build(&project);
2613        let files = index.glob("**/*.rs", &src);
2614
2615        assert_eq!(
2616            files,
2617            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2618        );
2619    }
2620
2621    #[test]
2622    fn glob_includes_hidden_and_binary_files() {
2623        let dir = tempfile::tempdir().expect("create temp dir");
2624        let project = dir.path().join("project");
2625        let hidden_dir = project.join(".hidden");
2626        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2627        let hidden_file = hidden_dir.join("data.bin");
2628        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2629
2630        let index = SearchIndex::build(&project);
2631        let files = index.glob("**/*.bin", &project);
2632
2633        assert_eq!(
2634            files,
2635            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2636        );
2637    }
2638
2639    #[test]
2640    fn read_from_disk_rejects_invalid_nanos() {
2641        let dir = tempfile::tempdir().expect("create temp dir");
2642        let cache_dir = dir.path().join("cache");
2643        fs::create_dir_all(&cache_dir).expect("create cache dir");
2644
2645        let mut postings = Vec::new();
2646        postings.extend_from_slice(INDEX_MAGIC);
2647        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2648        postings.extend_from_slice(&0u32.to_le_bytes());
2649        postings.extend_from_slice(&1u32.to_le_bytes());
2650        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2651        postings.extend_from_slice(&1u32.to_le_bytes());
2652        postings.extend_from_slice(b"/");
2653        postings.push(0u8);
2654        postings.extend_from_slice(&1u32.to_le_bytes());
2655        postings.extend_from_slice(&0u64.to_le_bytes());
2656        postings.extend_from_slice(&0u64.to_le_bytes());
2657        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2658        postings.extend_from_slice(b"a");
2659        postings.extend_from_slice(&0u64.to_le_bytes());
2660
2661        let mut lookup = Vec::new();
2662        lookup.extend_from_slice(LOOKUP_MAGIC);
2663        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2664        lookup.extend_from_slice(&0u32.to_le_bytes());
2665
2666        let postings_checksum = crc32fast::hash(&postings);
2667        postings.extend_from_slice(&postings_checksum.to_le_bytes());
2668        let lookup_checksum = crc32fast::hash(&lookup);
2669        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2670        let mut cache = Vec::new();
2671        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2672        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2673        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2674        cache.extend_from_slice(&postings);
2675        cache.extend_from_slice(&lookup);
2676        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2677
2678        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2679    }
2680
2681    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2682    /// changed between cmp() calls.
2683    ///
2684    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2685    /// which does a `stat()` syscall. If the file was deleted, modified, or
2686    /// touched mid-sort, the comparator returned different values for the
2687    /// same input pair on different invocations. Rust's slice::sort detects
2688    /// this and panics with "user-provided comparison function does not
2689    /// correctly implement a total order".
2690    ///
2691    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2692    /// bridge invalidated files in parallel with grep's sort path. This
2693    /// test simulates the worst case: most paths don't exist (Err from
2694    /// fs::metadata) and sort still completes successfully.
2695    #[test]
2696    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2697        // Mix of existing and non-existing paths in deliberately
2698        // non-monotonic order — pre-fix, the sort would call stat() at
2699        // least N log N times and any flakiness would trigger the panic.
2700        let dir = tempfile::tempdir().expect("create tempdir");
2701        let mut paths: Vec<PathBuf> = Vec::new();
2702        for i in 0..30 {
2703            // Half exist, half don't.
2704            let path = if i % 2 == 0 {
2705                let p = dir.path().join(format!("real-{i}.rs"));
2706                fs::write(&p, format!("// {i}\n")).expect("write");
2707                p
2708            } else {
2709                dir.path().join(format!("missing-{i}.rs"))
2710            };
2711            paths.push(path);
2712        }
2713
2714        // Run the sort many times to maximise the chance of catching any
2715        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2716        for _ in 0..50 {
2717            let mut copy = paths.clone();
2718            sort_paths_by_mtime_desc(&mut copy);
2719            assert_eq!(copy.len(), paths.len());
2720        }
2721    }
2722
2723    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2724    /// the same conditions as sort_paths_by_mtime_desc. See the
2725    /// sort_paths_... test above for the full rationale.
2726    #[test]
2727    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2728        let dir = tempfile::tempdir().expect("create tempdir");
2729        let mut matches: Vec<GrepMatch> = Vec::new();
2730        for i in 0..30 {
2731            let file = if i % 2 == 0 {
2732                let p = dir.path().join(format!("real-{i}.rs"));
2733                fs::write(&p, format!("// {i}\n")).expect("write");
2734                p
2735            } else {
2736                dir.path().join(format!("missing-{i}.rs"))
2737            };
2738            matches.push(GrepMatch {
2739                file,
2740                line: u32::try_from(i).unwrap_or(0),
2741                column: 0,
2742                line_text: format!("match {i}"),
2743                match_text: format!("match {i}"),
2744            });
2745        }
2746
2747        for _ in 0..50 {
2748            let mut copy = matches.clone();
2749            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2750            assert_eq!(copy.len(), matches.len());
2751        }
2752    }
2753}
aft/search_index.rs

aft/
search_index.rs