aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19
20const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
21const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
22const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
23const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
24const INDEX_VERSION: u32 = 3;
25const PREVIEW_BYTES: usize = 8 * 1024;
26const EOF_SENTINEL: u8 = 0;
27const MAX_ENTRIES: usize = 10_000_000;
28const MIN_FILE_ENTRY_BYTES: usize = 57;
29const LOOKUP_ENTRY_BYTES: usize = 16;
30const POSTING_BYTES: usize = 6;
31
32pub struct CacheLock {
33    path: PathBuf,
34}
35
36impl CacheLock {
37    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
38        fs::create_dir_all(cache_dir)?;
39        let path = cache_dir.join("cache.lock");
40        for _ in 0..200 {
41            match fs::OpenOptions::new()
42                .write(true)
43                .create_new(true)
44                .open(&path)
45            {
46                Ok(mut file) => {
47                    let _ = writeln!(file, "{}", std::process::id());
48                    let _ = file.sync_all();
49                    return Ok(Self { path });
50                }
51                Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => {
52                    std::thread::sleep(Duration::from_millis(10));
53                }
54                Err(error) => return Err(error),
55            }
56        }
57        Err(std::io::Error::other(
58            "timed out acquiring search cache lock",
59        ))
60    }
61}
62
63impl Drop for CacheLock {
64    fn drop(&mut self) {
65        let _ = fs::remove_file(&self.path);
66    }
67}
68
69#[derive(Clone, Debug)]
70pub struct SearchIndex {
71    pub postings: HashMap<u32, Vec<Posting>>,
72    pub files: Vec<FileEntry>,
73    pub path_to_id: HashMap<PathBuf, u32>,
74    pub ready: bool,
75    project_root: PathBuf,
76    git_head: Option<String>,
77    max_file_size: u64,
78    pub file_trigrams: HashMap<u32, Vec<u32>>,
79    unindexed_files: HashSet<u32>,
80}
81
82impl SearchIndex {
83    /// Number of indexed files.
84    pub fn file_count(&self) -> usize {
85        self.files.len()
86    }
87
88    /// Number of unique trigrams in the index.
89    pub fn trigram_count(&self) -> usize {
90        self.postings.len()
91    }
92
93    /// Compute distinct query trigrams from literal tokens.
94    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
95        query_trigrams_from_tokens(tokens)
96    }
97
98    /// Score-rank file candidates by lexical relevance to query trigrams.
99    pub fn lexical_rank(
100        &self,
101        query_trigrams: &[u32],
102        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
103        max_files: usize,
104    ) -> Vec<(PathBuf, f32)> {
105        if query_trigrams.is_empty() || max_files == 0 {
106            return Vec::new();
107        }
108
109        let mut non_zero: Vec<(u32, usize)> = query_trigrams
110            .iter()
111            .filter_map(|trigram| {
112                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
113                (posting_count > 0).then_some((*trigram, posting_count))
114            })
115            .collect();
116        if non_zero.is_empty() {
117            return Vec::new();
118        }
119
120        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
121        let selected_count = non_zero.len().min(3);
122        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
123
124        let mut candidate_ids = BTreeSet::new();
125        for (trigram, _) in non_zero.iter().take(selected_count) {
126            if let Some(postings) = self.postings.get(trigram) {
127                for posting in postings {
128                    if self.is_active_file(posting.file_id) {
129                        candidate_ids.insert(posting.file_id);
130                    }
131                }
132            }
133        }
134
135        let mut ranked = Vec::new();
136        for file_id in candidate_ids.into_iter().take(candidate_cap) {
137            let Some(entry) = self.files.get(file_id as usize) else {
138                continue;
139            };
140            if let Some(filter) = candidate_filter {
141                if !filter(&entry.path) {
142                    continue;
143                }
144            }
145            let score = lexical_score(self, query_trigrams, file_id);
146            if score > 0.0 {
147                ranked.push((entry.path.clone(), score));
148            }
149        }
150
151        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
152        ranked.truncate(max_files);
153        ranked
154    }
155}
156
157#[derive(Clone, Debug, PartialEq, Eq)]
158pub struct Posting {
159    pub file_id: u32,
160    pub next_mask: u8,
161    pub loc_mask: u8,
162}
163
164#[derive(Clone, Debug)]
165pub struct FileEntry {
166    pub path: PathBuf,
167    pub size: u64,
168    pub modified: SystemTime,
169    pub content_hash: blake3::Hash,
170}
171
172#[derive(Clone, Debug, PartialEq, Eq)]
173pub struct GrepMatch {
174    pub file: PathBuf,
175    pub line: u32,
176    pub column: u32,
177    pub line_text: String,
178    pub match_text: String,
179}
180
181#[derive(Clone, Debug)]
182pub struct GrepResult {
183    pub matches: Vec<GrepMatch>,
184    pub total_matches: usize,
185    pub files_searched: usize,
186    pub files_with_matches: usize,
187    pub index_status: IndexStatus,
188    pub truncated: bool,
189}
190
191#[derive(Clone, Copy, Debug, PartialEq, Eq)]
192pub enum IndexStatus {
193    Ready,
194    Building,
195    Fallback,
196}
197
198impl IndexStatus {
199    pub fn as_str(&self) -> &'static str {
200        match self {
201            IndexStatus::Ready => "Ready",
202            IndexStatus::Building => "Building",
203            IndexStatus::Fallback => "Fallback",
204        }
205    }
206}
207
208#[derive(Clone, Debug, Default)]
209pub struct RegexQuery {
210    pub and_trigrams: Vec<u32>,
211    pub or_groups: Vec<Vec<u32>>,
212    pub(crate) and_filters: HashMap<u32, PostingFilter>,
213    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
214}
215
216#[derive(Clone, Copy, Debug, Default)]
217pub(crate) struct PostingFilter {
218    next_mask: u8,
219    loc_mask: u8,
220}
221
222#[derive(Clone, Debug, Default)]
223struct QueryBuild {
224    and_runs: Vec<Vec<u8>>,
225    or_groups: Vec<Vec<Vec<u8>>>,
226}
227
228#[derive(Clone, Debug, Default)]
229pub(crate) struct PathFilters {
230    includes: Option<GlobSet>,
231    excludes: Option<GlobSet>,
232}
233
234#[derive(Clone, Debug)]
235pub(crate) struct SearchScope {
236    pub root: PathBuf,
237    pub use_index: bool,
238}
239
240#[derive(Clone, Debug)]
241struct SharedGrepMatch {
242    file: Arc<PathBuf>,
243    line: u32,
244    column: u32,
245    line_text: String,
246    match_text: String,
247}
248
249#[derive(Clone, Debug)]
250enum SearchMatcher {
251    Literal(LiteralSearch),
252    Regex(Regex),
253}
254
255#[derive(Clone, Debug)]
256enum LiteralSearch {
257    CaseSensitive(Vec<u8>),
258    AsciiCaseInsensitive(Vec<u8>),
259}
260
261impl SearchIndex {
262    pub fn new() -> Self {
263        SearchIndex {
264            postings: HashMap::new(),
265            files: Vec::new(),
266            path_to_id: HashMap::new(),
267            ready: false,
268            project_root: PathBuf::new(),
269            git_head: None,
270            max_file_size: DEFAULT_MAX_FILE_SIZE,
271            file_trigrams: HashMap::new(),
272            unindexed_files: HashSet::new(),
273        }
274    }
275
276    pub fn build(root: &Path) -> Self {
277        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
278    }
279
280    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
281        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
282        let mut index = SearchIndex {
283            project_root: project_root.clone(),
284            max_file_size,
285            ..SearchIndex::new()
286        };
287
288        let filters = PathFilters::default();
289        for path in walk_project_files(&project_root, &filters) {
290            index.update_file(&path);
291        }
292
293        index.git_head = current_git_head(&project_root);
294        index.ready = true;
295        index
296    }
297
298    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
299        self.remove_file(path);
300
301        let file_id = match self.allocate_file_id(path, content.len() as u64) {
302            Some(file_id) => file_id,
303            None => return,
304        };
305        if let Some(file) = self.files.get_mut(file_id as usize) {
306            file.content_hash = cache_freshness::hash_bytes(content);
307        }
308
309        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
310        for (trigram, next_char, position) in extract_trigrams(content) {
311            let entry = trigram_map.entry(trigram).or_default();
312            entry.next_mask |= mask_for_next_char(next_char);
313            entry.loc_mask |= mask_for_position(position);
314        }
315
316        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
317        for (trigram, filter) in trigram_map {
318            let postings = self.postings.entry(trigram).or_default();
319            postings.push(Posting {
320                file_id,
321                next_mask: filter.next_mask,
322                loc_mask: filter.loc_mask,
323            });
324            // Posting lists are kept sorted by file_id for binary search during
325            // intersection. Since file_ids are allocated incrementally, the new
326            // entry is usually already in order. Only sort when needed.
327            if postings.len() > 1
328                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
329            {
330                postings.sort_unstable_by_key(|p| p.file_id);
331            }
332            file_trigrams.push(trigram);
333        }
334
335        self.file_trigrams.insert(file_id, file_trigrams);
336        self.unindexed_files.remove(&file_id);
337    }
338
339    pub fn remove_file(&mut self, path: &Path) {
340        let Some(file_id) = self.path_to_id.remove(path) else {
341            return;
342        };
343
344        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
345            for trigram in trigrams {
346                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
347                    postings.retain(|posting| posting.file_id != file_id);
348                    postings.is_empty()
349                } else {
350                    false
351                };
352
353                if should_remove {
354                    self.postings.remove(&trigram);
355                }
356            }
357        }
358
359        self.unindexed_files.remove(&file_id);
360        if let Some(file) = self.files.get_mut(file_id as usize) {
361            file.path = PathBuf::new();
362            file.size = 0;
363            file.modified = UNIX_EPOCH;
364            file.content_hash = cache_freshness::zero_hash();
365        }
366    }
367
368    pub fn update_file(&mut self, path: &Path) {
369        self.remove_file(path);
370
371        let metadata = match fs::metadata(path) {
372            Ok(metadata) if metadata.is_file() => metadata,
373            _ => return,
374        };
375
376        if is_binary_path(path, metadata.len()) {
377            self.track_unindexed_file(path, &metadata);
378            return;
379        }
380
381        if metadata.len() > self.max_file_size {
382            self.track_unindexed_file(path, &metadata);
383            return;
384        }
385
386        let content = match fs::read(path) {
387            Ok(content) => content,
388            Err(_) => return,
389        };
390
391        if is_binary_bytes(&content) {
392            self.track_unindexed_file(path, &metadata);
393            return;
394        }
395
396        self.index_file(path, &content);
397    }
398
399    pub fn grep(
400        &self,
401        pattern: &str,
402        case_sensitive: bool,
403        include: &[String],
404        exclude: &[String],
405        search_root: &Path,
406        max_results: usize,
407    ) -> GrepResult {
408        self.search_grep(
409            pattern,
410            case_sensitive,
411            include,
412            exclude,
413            search_root,
414            max_results,
415        )
416    }
417
418    pub fn search_grep(
419        &self,
420        pattern: &str,
421        case_sensitive: bool,
422        include: &[String],
423        exclude: &[String],
424        search_root: &Path,
425        max_results: usize,
426    ) -> GrepResult {
427        // Detect if pattern is a plain literal (no regex metacharacters).
428        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
429        let is_literal = !pattern.chars().any(|c| {
430            matches!(
431                c,
432                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
433            )
434        });
435
436        let literal_search = if is_literal {
437            if case_sensitive {
438                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
439            } else if pattern.is_ascii() {
440                Some(LiteralSearch::AsciiCaseInsensitive(
441                    pattern
442                        .as_bytes()
443                        .iter()
444                        .map(|byte| byte.to_ascii_lowercase())
445                        .collect(),
446                ))
447            } else {
448                None
449            }
450        } else {
451            None
452        };
453
454        // Build the regex for non-literal patterns (or literal Unicode fallback).
455        let regex = if literal_search.is_some() {
456            None
457        } else {
458            let regex_pattern = if is_literal {
459                regex::escape(pattern)
460            } else {
461                pattern.to_string()
462            };
463            let mut builder = RegexBuilder::new(&regex_pattern);
464            builder.case_insensitive(!case_sensitive);
465            // Treat `^` and `$` as line anchors (grep semantics), not file anchors.
466            builder.multi_line(true);
467            match builder.build() {
468                Ok(r) => Some(r),
469                Err(_) => {
470                    return GrepResult {
471                        matches: Vec::new(),
472                        total_matches: 0,
473                        files_searched: 0,
474                        files_with_matches: 0,
475                        index_status: if self.ready {
476                            IndexStatus::Ready
477                        } else {
478                            IndexStatus::Building
479                        },
480                        truncated: false,
481                    };
482                }
483            }
484        };
485
486        let matcher = if let Some(literal_search) = literal_search {
487            SearchMatcher::Literal(literal_search)
488        } else {
489            SearchMatcher::Regex(
490                regex.expect("regex should exist when literal matcher is unavailable"),
491            )
492        };
493
494        let filters = match build_path_filters(include, exclude) {
495            Ok(filters) => filters,
496            Err(_) => PathFilters::default(),
497        };
498        let search_root = canonicalize_or_normalize(search_root);
499
500        let query = decompose_regex(pattern);
501        let candidate_ids = self.candidates(&query);
502
503        let candidate_files: Vec<&FileEntry> = candidate_ids
504            .into_iter()
505            .filter_map(|file_id| self.files.get(file_id as usize))
506            .filter(|file| !file.path.as_os_str().is_empty())
507            .filter(|file| is_within_search_root(&search_root, &file.path))
508            .filter(|file| filters.matches(&self.project_root, &file.path))
509            .collect();
510
511        let total_matches = AtomicUsize::new(0);
512        let files_searched = AtomicUsize::new(0);
513        let files_with_matches = AtomicUsize::new(0);
514        let truncated = AtomicBool::new(false);
515        let stop_after = max_results.saturating_mul(2);
516
517        let mut matches = if candidate_files.len() > 10 {
518            candidate_files
519                .par_iter()
520                .map(|file| {
521                    search_candidate_file(
522                        file,
523                        &matcher,
524                        max_results,
525                        stop_after,
526                        &total_matches,
527                        &files_searched,
528                        &files_with_matches,
529                        &truncated,
530                    )
531                })
532                .reduce(Vec::new, |mut left, mut right| {
533                    left.append(&mut right);
534                    left
535                })
536        } else {
537            let mut matches = Vec::new();
538            for file in candidate_files {
539                matches.extend(search_candidate_file(
540                    file,
541                    &matcher,
542                    max_results,
543                    stop_after,
544                    &total_matches,
545                    &files_searched,
546                    &files_with_matches,
547                    &truncated,
548                ));
549
550                if should_stop_search(&truncated, &total_matches, stop_after) {
551                    break;
552                }
553            }
554            matches
555        };
556
557        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
558            self.path_to_id
559                .get(path)
560                .and_then(|file_id| self.files.get(*file_id as usize))
561                .map(|file| file.modified)
562        });
563
564        let matches = matches
565            .into_iter()
566            .map(|matched| GrepMatch {
567                file: matched.file.as_ref().clone(),
568                line: matched.line,
569                column: matched.column,
570                line_text: matched.line_text,
571                match_text: matched.match_text,
572            })
573            .collect();
574
575        GrepResult {
576            total_matches: total_matches.load(Ordering::Relaxed),
577            matches,
578            files_searched: files_searched.load(Ordering::Relaxed),
579            files_with_matches: files_with_matches.load(Ordering::Relaxed),
580            index_status: if self.ready {
581                IndexStatus::Ready
582            } else {
583                IndexStatus::Building
584            },
585            truncated: truncated.load(Ordering::Relaxed),
586        }
587    }
588
589    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
590        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
591            Ok(filters) => filters,
592            Err(_) => return Vec::new(),
593        };
594        let search_root = canonicalize_or_normalize(search_root);
595        let mut entries = self
596            .files
597            .iter()
598            .filter(|file| !file.path.as_os_str().is_empty())
599            .filter(|file| is_within_search_root(&search_root, &file.path))
600            .filter(|file| filters.matches(&self.project_root, &file.path))
601            .map(|file| (file.path.clone(), file.modified))
602            .collect::<Vec<_>>();
603
604        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
605            right_mtime
606                .cmp(left_mtime)
607                .then_with(|| left_path.cmp(right_path))
608        });
609
610        entries.into_iter().map(|(path, _)| path).collect()
611    }
612
613    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
614        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
615            return self.active_file_ids();
616        }
617
618        let mut and_trigrams = query.and_trigrams.clone();
619        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
620
621        let mut current: Option<Vec<u32>> = None;
622
623        for trigram in and_trigrams {
624            let filter = query.and_filters.get(&trigram).copied();
625            let matches = self.postings_for_trigram(trigram, filter);
626            current = Some(match current.take() {
627                Some(existing) => intersect_sorted_ids(&existing, &matches),
628                None => matches,
629            });
630
631            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
632                break;
633            }
634        }
635
636        let mut current = current.unwrap_or_else(|| self.active_file_ids());
637
638        for (index, group) in query.or_groups.iter().enumerate() {
639            let mut group_matches = Vec::new();
640            let filters = query.or_filters.get(index);
641
642            for trigram in group {
643                let filter = filters.and_then(|filters| filters.get(trigram).copied());
644                let matches = self.postings_for_trigram(*trigram, filter);
645                if group_matches.is_empty() {
646                    group_matches = matches;
647                } else {
648                    group_matches = union_sorted_ids(&group_matches, &matches);
649                }
650            }
651
652            current = intersect_sorted_ids(&current, &group_matches);
653            if current.is_empty() {
654                break;
655            }
656        }
657
658        let mut unindexed = self
659            .unindexed_files
660            .iter()
661            .copied()
662            .filter(|file_id| self.is_active_file(*file_id))
663            .collect::<Vec<_>>();
664        if !unindexed.is_empty() {
665            unindexed.sort_unstable();
666            current = union_sorted_ids(&current, &unindexed);
667        }
668
669        current
670    }
671
672    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
673        if fs::create_dir_all(cache_dir).is_err() {
674            return;
675        }
676
677        let cache_path = cache_dir.join("cache.bin");
678        let tmp_cache = cache_dir.join(format!(
679            "cache.bin.tmp.{}.{}",
680            std::process::id(),
681            SystemTime::now()
682                .duration_since(UNIX_EPOCH)
683                .unwrap_or(Duration::ZERO)
684                .as_nanos()
685        ));
686
687        let active_ids = self.active_file_ids();
688        let mut id_map = HashMap::new();
689        for (new_id, old_id) in active_ids.iter().enumerate() {
690            let Ok(new_id_u32) = u32::try_from(new_id) else {
691                return;
692            };
693            id_map.insert(*old_id, new_id_u32);
694        }
695
696        let write_result = (|| -> std::io::Result<()> {
697            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
698
699            postings_writer.write_all(INDEX_MAGIC)?;
700            write_u32(&mut postings_writer, INDEX_VERSION)?;
701
702            let head = git_head.unwrap_or_default();
703            let root = self.project_root.to_string_lossy();
704            let head_len = u32::try_from(head.len())
705                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
706            let root_len = u32::try_from(root.len())
707                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
708            let file_count = u32::try_from(active_ids.len())
709                .map_err(|_| std::io::Error::other("too many files to cache"))?;
710
711            write_u32(&mut postings_writer, head_len)?;
712            write_u32(&mut postings_writer, root_len)?;
713            write_u64(&mut postings_writer, self.max_file_size)?;
714            write_u32(&mut postings_writer, file_count)?;
715            postings_writer.write_all(head.as_bytes())?;
716            postings_writer.write_all(root.as_bytes())?;
717
718            for old_id in &active_ids {
719                let Some(file) = self.files.get(*old_id as usize) else {
720                    return Err(std::io::Error::other("missing file entry for cache write"));
721                };
722                let path =
723                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
724                        std::io::Error::other(format!(
725                            "refusing to cache path outside project root: {}",
726                            file.path.display()
727                        ))
728                    })?;
729                let path = path.to_string_lossy();
730                let path_len = u32::try_from(path.len())
731                    .map_err(|_| std::io::Error::other("cached path too large"))?;
732                let modified = file
733                    .modified
734                    .duration_since(UNIX_EPOCH)
735                    .unwrap_or(Duration::ZERO);
736                let unindexed = if self.unindexed_files.contains(old_id) {
737                    1u8
738                } else {
739                    0u8
740                };
741
742                postings_writer.write_all(&[unindexed])?;
743                write_u32(&mut postings_writer, path_len)?;
744                write_u64(&mut postings_writer, file.size)?;
745                write_u64(&mut postings_writer, modified.as_secs())?;
746                write_u32(&mut postings_writer, modified.subsec_nanos())?;
747                postings_writer.write_all(file.content_hash.as_bytes())?;
748                postings_writer.write_all(path.as_bytes())?;
749            }
750
751            let mut lookup_entries = Vec::new();
752            let mut postings_blob = Vec::new();
753            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
754            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
755
756            for (trigram, postings) in sorted_postings {
757                let offset = u64::try_from(postings_blob.len())
758                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
759                let mut count = 0u32;
760
761                for posting in postings {
762                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
763                        continue;
764                    };
765
766                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
767                    postings_blob.push(posting.next_mask);
768                    postings_blob.push(posting.loc_mask);
769                    count = count.saturating_add(1);
770                }
771
772                if count > 0 {
773                    lookup_entries.push((*trigram, offset, count));
774                }
775            }
776
777            write_u64(
778                &mut postings_writer,
779                u64::try_from(postings_blob.len())
780                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
781            )?;
782            postings_writer.write_all(&postings_blob)?;
783            postings_writer.flush()?;
784            let mut postings_blob_file = postings_writer
785                .into_inner()
786                .map_err(|error| std::io::Error::other(error.to_string()))?
787                .into_inner();
788            let checksum = crc32fast::hash(&postings_blob_file);
789            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
790
791            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
792            let entry_count = u32::try_from(lookup_entries.len())
793                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
794
795            lookup_writer.write_all(LOOKUP_MAGIC)?;
796            write_u32(&mut lookup_writer, INDEX_VERSION)?;
797            write_u32(&mut lookup_writer, entry_count)?;
798
799            for (trigram, offset, count) in lookup_entries {
800                write_u32(&mut lookup_writer, trigram)?;
801                write_u64(&mut lookup_writer, offset)?;
802                write_u32(&mut lookup_writer, count)?;
803            }
804
805            lookup_writer.flush()?;
806            let mut lookup_blob_file = lookup_writer
807                .into_inner()
808                .map_err(|error| std::io::Error::other(error.to_string()))?
809                .into_inner();
810            let checksum = crc32fast::hash(&lookup_blob_file);
811            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
812
813            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
814            write_u32(&mut cache_writer, CACHE_MAGIC)?;
815            write_u32(&mut cache_writer, INDEX_VERSION)?;
816            write_u64(
817                &mut cache_writer,
818                u64::try_from(postings_blob_file.len())
819                    .map_err(|_| std::io::Error::other("postings section too large"))?,
820            )?;
821            cache_writer.write_all(&postings_blob_file)?;
822            cache_writer.write_all(&lookup_blob_file)?;
823            cache_writer.flush()?;
824            cache_writer.get_ref().sync_all()?;
825            drop(cache_writer);
826            fs::rename(&tmp_cache, &cache_path)?;
827
828            Ok(())
829        })();
830
831        if write_result.is_err() {
832            let _ = fs::remove_file(&tmp_cache);
833        }
834    }
835
836    pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
837        debug_assert!(current_canonical_root.is_absolute());
838        let cache_path = cache_dir.join("cache.bin");
839        let cache_bytes = fs::read(&cache_path).ok()?;
840        if cache_bytes.len() < 16 {
841            return None;
842        }
843        let mut header = Cursor::new(&cache_bytes);
844        if read_u32(&mut header).ok()? != CACHE_MAGIC {
845            return None;
846        }
847        if read_u32(&mut header).ok()? != INDEX_VERSION {
848            return None;
849        }
850        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
851        let start = usize::try_from(header.position()).ok()?;
852        let postings_end = start.checked_add(postings_len_total)?;
853        if postings_end > cache_bytes.len() {
854            return None;
855        }
856        let postings_bytes = &cache_bytes[start..postings_end];
857        let lookup_bytes = &cache_bytes[postings_end..];
858        let lookup_len_total = lookup_bytes.len();
859        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
860        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
861        if postings_len_total < 4 || lookup_len_total < 4 {
862            return None;
863        }
864        verify_crc32_bytes_slice(postings_bytes).ok()?;
865        verify_crc32_bytes_slice(lookup_bytes).ok()?;
866
867        let mut magic = [0u8; 8];
868        postings_reader.read_exact(&mut magic).ok()?;
869        if &magic != INDEX_MAGIC {
870            return None;
871        }
872        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
873            return None;
874        }
875
876        let head_len = read_u32(&mut postings_reader).ok()? as usize;
877        let root_len = read_u32(&mut postings_reader).ok()? as usize;
878        let max_file_size = read_u64(&mut postings_reader).ok()?;
879        let file_count = read_u32(&mut postings_reader).ok()? as usize;
880        if file_count > MAX_ENTRIES {
881            return None;
882        }
883        let postings_body_len = postings_len_total.checked_sub(4)?;
884        let lookup_body_len = lookup_len_total.checked_sub(4)?;
885
886        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
887        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
888        if minimum_file_bytes > remaining_postings {
889            return None;
890        }
891
892        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
893            return None;
894        }
895        let mut head_bytes = vec![0u8; head_len];
896        postings_reader.read_exact(&mut head_bytes).ok()?;
897        let git_head = String::from_utf8(head_bytes)
898            .ok()
899            .filter(|head| !head.is_empty());
900
901        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
902            return None;
903        }
904        let mut root_bytes = vec![0u8; root_len];
905        postings_reader.read_exact(&mut root_bytes).ok()?;
906        let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
907        let project_root = current_canonical_root.to_path_buf();
908
909        let mut files = Vec::with_capacity(file_count);
910        let mut path_to_id = HashMap::new();
911        let mut unindexed_files = HashSet::new();
912
913        for file_id in 0..file_count {
914            let mut unindexed = [0u8; 1];
915            postings_reader.read_exact(&mut unindexed).ok()?;
916            let path_len = read_u32(&mut postings_reader).ok()? as usize;
917            let size = read_u64(&mut postings_reader).ok()?;
918            let secs = read_u64(&mut postings_reader).ok()?;
919            let nanos = read_u32(&mut postings_reader).ok()?;
920            let mut hash_bytes = [0u8; 32];
921            postings_reader.read_exact(&mut hash_bytes).ok()?;
922            let content_hash = blake3::Hash::from_bytes(hash_bytes);
923            if nanos >= 1_000_000_000 {
924                return None;
925            }
926            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
927                return None;
928            }
929            let mut path_bytes = vec![0u8; path_len];
930            postings_reader.read_exact(&mut path_bytes).ok()?;
931            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
932            let full_path = cached_path_under_root(&project_root, &relative_path)?;
933            let file_id_u32 = u32::try_from(file_id).ok()?;
934
935            files.push(FileEntry {
936                path: full_path.clone(),
937                size,
938                modified: UNIX_EPOCH + Duration::new(secs, nanos),
939                content_hash,
940            });
941            path_to_id.insert(full_path, file_id_u32);
942            if unindexed[0] == 1 {
943                unindexed_files.insert(file_id_u32);
944            }
945        }
946
947        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
948        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
949        if postings_len > max_postings_bytes {
950            return None;
951        }
952        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
953            return None;
954        }
955        let mut postings_blob = vec![0u8; postings_len];
956        postings_reader.read_exact(&mut postings_blob).ok()?;
957
958        let mut lookup_magic = [0u8; 8];
959        lookup_reader.read_exact(&mut lookup_magic).ok()?;
960        if &lookup_magic != LOOKUP_MAGIC {
961            return None;
962        }
963        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
964            return None;
965        }
966        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
967        if entry_count > MAX_ENTRIES {
968            return None;
969        }
970        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
971        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
972        if minimum_lookup_bytes > remaining_lookup {
973            return None;
974        }
975
976        let mut postings = HashMap::new();
977        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
978
979        for _ in 0..entry_count {
980            let trigram = read_u32(&mut lookup_reader).ok()?;
981            let offset = read_u64(&mut lookup_reader).ok()? as usize;
982            let count = read_u32(&mut lookup_reader).ok()? as usize;
983            if count > MAX_ENTRIES {
984                return None;
985            }
986            let bytes_len = count.checked_mul(POSTING_BYTES)?;
987            let end = offset.checked_add(bytes_len)?;
988            if end > postings_blob.len() {
989                return None;
990            }
991
992            let mut trigram_postings = Vec::with_capacity(count);
993            for chunk in postings_blob[offset..end].chunks_exact(6) {
994                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
995                let posting = Posting {
996                    file_id,
997                    next_mask: chunk[4],
998                    loc_mask: chunk[5],
999                };
1000                trigram_postings.push(posting.clone());
1001                file_trigrams.entry(file_id).or_default().push(trigram);
1002            }
1003            postings.insert(trigram, trigram_postings);
1004        }
1005
1006        Some(SearchIndex {
1007            postings,
1008            files,
1009            path_to_id,
1010            ready: false,
1011            project_root,
1012            git_head,
1013            max_file_size,
1014            file_trigrams,
1015            unindexed_files,
1016        })
1017    }
1018
1019    pub(crate) fn stored_git_head(&self) -> Option<&str> {
1020        self.git_head.as_deref()
1021    }
1022
1023    pub(crate) fn set_ready(&mut self, ready: bool) {
1024        self.ready = ready;
1025    }
1026
1027    pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1028        self.git_head = current_head;
1029        verify_file_mtimes(self);
1030        self.ready = true;
1031    }
1032
1033    pub(crate) fn rebuild_or_refresh(
1034        root: &Path,
1035        max_file_size: u64,
1036        current_head: Option<String>,
1037        baseline: Option<SearchIndex>,
1038    ) -> Self {
1039        if let Some(mut baseline) = baseline {
1040            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1041            baseline.max_file_size = max_file_size;
1042
1043            if baseline.git_head == current_head || current_head.is_none() {
1044                // HEAD matches, but files may have changed on disk since the index was
1045                // last written (e.g., uncommitted edits, stash pop, manual file changes
1046                // while OpenCode was closed). Verify mtimes and re-index stale files.
1047                // Non-git projects also use this per-file (path, mtime, size)
1048                // fingerprint so unchanged trees reuse the disk cache instead of
1049                // rebuilding every configure.
1050                baseline.git_head = current_head;
1051                verify_file_mtimes(&mut baseline);
1052                baseline.ready = true;
1053                return baseline;
1054            }
1055
1056            if let (Some(previous), Some(current)) =
1057                (baseline.git_head.clone(), current_head.clone())
1058            {
1059                let project_root = baseline.project_root.clone();
1060                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1061                    baseline.git_head = Some(current);
1062                    verify_file_mtimes(&mut baseline);
1063                    baseline.ready = true;
1064                    return baseline;
1065                }
1066            }
1067        }
1068
1069        SearchIndex::build_with_limit(root, max_file_size)
1070    }
1071
1072    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1073        let file_id = u32::try_from(self.files.len()).ok()?;
1074        let metadata = fs::metadata(path).ok();
1075        let size = metadata
1076            .as_ref()
1077            .map_or(size_hint, |metadata| metadata.len());
1078        let modified = metadata
1079            .and_then(|metadata| metadata.modified().ok())
1080            .unwrap_or(UNIX_EPOCH);
1081
1082        self.files.push(FileEntry {
1083            path: path.to_path_buf(),
1084            size,
1085            modified,
1086            content_hash: cache_freshness::zero_hash(),
1087        });
1088        self.path_to_id.insert(path.to_path_buf(), file_id);
1089        Some(file_id)
1090    }
1091
1092    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1093        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1094            return;
1095        };
1096        self.unindexed_files.insert(file_id);
1097        self.file_trigrams.insert(file_id, Vec::new());
1098    }
1099
1100    fn active_file_ids(&self) -> Vec<u32> {
1101        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1102        ids.sort_unstable();
1103        ids
1104    }
1105
1106    fn is_active_file(&self, file_id: u32) -> bool {
1107        self.files
1108            .get(file_id as usize)
1109            .map(|file| !file.path.as_os_str().is_empty())
1110            .unwrap_or(false)
1111    }
1112
1113    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1114        let Some(postings) = self.postings.get(&trigram) else {
1115            return Vec::new();
1116        };
1117
1118        let mut matches = Vec::with_capacity(postings.len());
1119
1120        for posting in postings {
1121            if let Some(filter) = filter {
1122                // next_mask: bloom filter check — the character following this trigram in the
1123                // query must also appear after this trigram somewhere in the file.
1124                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1125                    continue;
1126                }
1127                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1128                // between consecutive trigram pairs, but is NOT used as a single-trigram
1129                // filter because the position in the query string has no relationship to
1130                // the position in the file. Using it here causes false negatives.
1131            }
1132            if self.is_active_file(posting.file_id) {
1133                matches.push(posting.file_id);
1134            }
1135        }
1136
1137        matches
1138    }
1139}
1140
1141fn search_candidate_file(
1142    file: &FileEntry,
1143    matcher: &SearchMatcher,
1144    max_results: usize,
1145    stop_after: usize,
1146    total_matches: &AtomicUsize,
1147    files_searched: &AtomicUsize,
1148    files_with_matches: &AtomicUsize,
1149    truncated: &AtomicBool,
1150) -> Vec<SharedGrepMatch> {
1151    if should_stop_search(truncated, total_matches, stop_after) {
1152        return Vec::new();
1153    }
1154
1155    let content = match read_indexed_file_bytes(&file.path) {
1156        Some(content) => content,
1157        None => return Vec::new(),
1158    };
1159    // Defense in depth: even though indexing tries to filter binaries via
1160    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1161    // query time. content_inspector is fast (~bytes-per-cycle on a small
1162    // preview) and this guarantees we never surface matches inside binary
1163    // files even if the indexer somehow let one through (e.g. file changed
1164    // between indexing and query).
1165    if is_binary_bytes(&content) {
1166        return Vec::new();
1167    }
1168    files_searched.fetch_add(1, Ordering::Relaxed);
1169
1170    let shared_path = Arc::new(file.path.clone());
1171    let mut matches = Vec::new();
1172    let mut line_starts = None;
1173    let mut seen_lines = HashSet::new();
1174    let mut matched_this_file = false;
1175
1176    match matcher {
1177        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1178            let finder = memchr::memmem::Finder::new(needle);
1179            let mut start = 0;
1180
1181            while let Some(position) = finder.find(&content[start..]) {
1182                if should_stop_search(truncated, total_matches, stop_after) {
1183                    break;
1184                }
1185
1186                let offset = start + position;
1187                start = offset + 1;
1188
1189                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1190                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1191                if !seen_lines.insert(line) {
1192                    continue;
1193                }
1194
1195                matched_this_file = true;
1196                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1197                if match_number > max_results {
1198                    truncated.store(true, Ordering::Relaxed);
1199                    break;
1200                }
1201
1202                let end = offset + needle.len();
1203                matches.push(SharedGrepMatch {
1204                    file: shared_path.clone(),
1205                    line,
1206                    column,
1207                    line_text,
1208                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1209                });
1210            }
1211        }
1212        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1213            let search_content = content.to_ascii_lowercase();
1214            let finder = memchr::memmem::Finder::new(needle);
1215            let mut start = 0;
1216
1217            while let Some(position) = finder.find(&search_content[start..]) {
1218                if should_stop_search(truncated, total_matches, stop_after) {
1219                    break;
1220                }
1221
1222                let offset = start + position;
1223                start = offset + 1;
1224
1225                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1226                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1227                if !seen_lines.insert(line) {
1228                    continue;
1229                }
1230
1231                matched_this_file = true;
1232                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1233                if match_number > max_results {
1234                    truncated.store(true, Ordering::Relaxed);
1235                    break;
1236                }
1237
1238                let end = offset + needle.len();
1239                matches.push(SharedGrepMatch {
1240                    file: shared_path.clone(),
1241                    line,
1242                    column,
1243                    line_text,
1244                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1245                });
1246            }
1247        }
1248        SearchMatcher::Regex(regex) => {
1249            for matched in regex.find_iter(&content) {
1250                if should_stop_search(truncated, total_matches, stop_after) {
1251                    break;
1252                }
1253
1254                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1255                let (line, column, line_text) =
1256                    line_details_bytes(&content, line_starts, matched.start());
1257                if !seen_lines.insert(line) {
1258                    continue;
1259                }
1260
1261                matched_this_file = true;
1262                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1263                if match_number > max_results {
1264                    truncated.store(true, Ordering::Relaxed);
1265                    break;
1266                }
1267
1268                matches.push(SharedGrepMatch {
1269                    file: shared_path.clone(),
1270                    line,
1271                    column,
1272                    line_text,
1273                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1274                });
1275            }
1276        }
1277    }
1278
1279    if matched_this_file {
1280        files_with_matches.fetch_add(1, Ordering::Relaxed);
1281    }
1282
1283    matches
1284}
1285
1286fn should_stop_search(
1287    truncated: &AtomicBool,
1288    total_matches: &AtomicUsize,
1289    stop_after: usize,
1290) -> bool {
1291    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1292}
1293
1294fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1295    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1296    let mut left_index = 0;
1297    let mut right_index = 0;
1298
1299    while left_index < left.len() && right_index < right.len() {
1300        match left[left_index].cmp(&right[right_index]) {
1301            std::cmp::Ordering::Less => left_index += 1,
1302            std::cmp::Ordering::Greater => right_index += 1,
1303            std::cmp::Ordering::Equal => {
1304                merged.push(left[left_index]);
1305                left_index += 1;
1306                right_index += 1;
1307            }
1308        }
1309    }
1310
1311    merged
1312}
1313
1314fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1315    let mut merged = Vec::with_capacity(left.len() + right.len());
1316    let mut left_index = 0;
1317    let mut right_index = 0;
1318
1319    while left_index < left.len() && right_index < right.len() {
1320        match left[left_index].cmp(&right[right_index]) {
1321            std::cmp::Ordering::Less => {
1322                merged.push(left[left_index]);
1323                left_index += 1;
1324            }
1325            std::cmp::Ordering::Greater => {
1326                merged.push(right[right_index]);
1327                right_index += 1;
1328            }
1329            std::cmp::Ordering::Equal => {
1330                merged.push(left[left_index]);
1331                left_index += 1;
1332                right_index += 1;
1333            }
1334        }
1335    }
1336
1337    merged.extend_from_slice(&left[left_index..]);
1338    merged.extend_from_slice(&right[right_index..]);
1339    merged
1340}
1341
1342pub fn decompose_regex(pattern: &str) -> RegexQuery {
1343    let hir = match regex_syntax::parse(pattern) {
1344        Ok(hir) => hir,
1345        Err(_) => return RegexQuery::default(),
1346    };
1347
1348    let build = build_query(&hir);
1349    build.into_query()
1350}
1351
1352pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1353    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1354}
1355
1356pub fn normalize_char(c: u8) -> u8 {
1357    c.to_ascii_lowercase()
1358}
1359
1360pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1361    if content.len() < 3 {
1362        return Vec::new();
1363    }
1364
1365    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1366    for start in 0..=content.len() - 3 {
1367        let trigram = pack_trigram(
1368            normalize_char(content[start]),
1369            normalize_char(content[start + 1]),
1370            normalize_char(content[start + 2]),
1371        );
1372        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1373        trigrams.push((trigram, next_char, start));
1374    }
1375    trigrams
1376}
1377
1378pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1379    let mut seen = HashSet::new();
1380    let mut out = Vec::new();
1381    for token in tokens {
1382        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1383            if seen.insert(trigram) {
1384                out.push(trigram);
1385            }
1386        }
1387    }
1388    out
1389}
1390
1391pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1392    if query_trigrams.is_empty() {
1393        return 0.0;
1394    }
1395
1396    let mut hits = 0u32;
1397    for &trigram in query_trigrams {
1398        if let Some(postings) = index.postings.get(&trigram) {
1399            if postings
1400                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1401                .is_ok()
1402            {
1403                hits += 1;
1404            }
1405        }
1406    }
1407
1408    if hits == 0 {
1409        return 0.0;
1410    }
1411
1412    let file_trigram_count = index
1413        .file_trigrams
1414        .get(&file_id)
1415        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1416    (hits as f32) / (1.0 + file_trigram_count.ln())
1417}
1418
1419pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1420    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1421    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1422        return PathBuf::from(override_dir)
1423            .join("index")
1424            .join(project_cache_key(project_root));
1425    }
1426    // Use configured storage dir (from plugin, XDG-compliant)
1427    if let Some(dir) = storage_dir {
1428        return dir.join("index").join(project_cache_key(project_root));
1429    }
1430    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1431    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1432    // If neither is set, fall back to a temp directory rather than `"."`
1433    // because the search-index code reads/writes absolute paths.
1434    let home = std::env::var_os("HOME")
1435        .or_else(|| std::env::var_os("USERPROFILE"))
1436        .map(PathBuf::from)
1437        .unwrap_or_else(std::env::temp_dir);
1438    home.join(".cache")
1439        .join("aft")
1440        .join("index")
1441        .join(project_cache_key(project_root))
1442}
1443
1444pub(crate) fn build_path_filters(
1445    include: &[String],
1446    exclude: &[String],
1447) -> Result<PathFilters, String> {
1448    Ok(PathFilters {
1449        includes: build_globset(include)?,
1450        excludes: build_globset(exclude)?,
1451    })
1452}
1453
1454pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1455    walk_project_files_from(root, root, filters)
1456}
1457
1458pub(crate) fn walk_project_files_from(
1459    filter_root: &Path,
1460    search_root: &Path,
1461    filters: &PathFilters,
1462) -> Vec<PathBuf> {
1463    let mut builder = WalkBuilder::new(search_root);
1464    builder
1465        .hidden(false)
1466        .git_ignore(true)
1467        .git_global(true)
1468        .git_exclude(true)
1469        .filter_entry(|entry| {
1470            let name = entry.file_name().to_string_lossy();
1471            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1472                return !matches!(
1473                    name.as_ref(),
1474                    "node_modules"
1475                        | "target"
1476                        | "venv"
1477                        | ".venv"
1478                        | ".git"
1479                        | "__pycache__"
1480                        | ".tox"
1481                        | "dist"
1482                        | "build"
1483                );
1484            }
1485            true
1486        });
1487
1488    let mut files = Vec::new();
1489    for entry in builder.build().filter_map(|entry| entry.ok()) {
1490        if !entry
1491            .file_type()
1492            .map_or(false, |file_type| file_type.is_file())
1493        {
1494            continue;
1495        }
1496        let path = entry.into_path();
1497        if filters.matches(filter_root, &path) {
1498            files.push(path);
1499        }
1500    }
1501
1502    sort_paths_by_mtime_desc(&mut files);
1503    files
1504}
1505
1506pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1507    let bytes = fs::read(path).ok()?;
1508    if is_binary_bytes(&bytes) {
1509        return None;
1510    }
1511    String::from_utf8(bytes).ok()
1512}
1513
1514fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1515    fs::read(path).ok()
1516}
1517
1518pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1519    path.strip_prefix(root)
1520        .map(PathBuf::from)
1521        .unwrap_or_else(|_| path.to_path_buf())
1522}
1523
1524pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1525    let normalized_root = normalize_path(root);
1526    let normalized_path = normalize_path(path);
1527    let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1528    validate_cached_relative_path(relative)
1529}
1530
1531pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1532    let relative = validate_cached_relative_path(relative_path)?;
1533    let normalized_root = normalize_path(root);
1534    let full_path = normalize_path(&normalized_root.join(relative));
1535    full_path.starts_with(&normalized_root).then_some(full_path)
1536}
1537
1538pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1539    if path.is_absolute() {
1540        return None;
1541    }
1542
1543    let mut normalized = PathBuf::new();
1544    for component in path.components() {
1545        match component {
1546            Component::Normal(part) => normalized.push(part),
1547            Component::CurDir => {}
1548            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1549        }
1550    }
1551    (!normalized.as_os_str().is_empty()).then_some(normalized)
1552}
1553
1554/// Sort paths newest-first by mtime, falling back to lexicographic order.
1555///
1556/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1557/// `sort_by()` closure. That made the comparator non-deterministic — a
1558/// `stat()` syscall for the same path can return different values across
1559/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1560/// concurrent file-watcher activity), and Rust's slice::sort panics at
1561/// runtime when it detects a non-total-order comparator. CI hit this on
1562/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1563///
1564/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1565/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1566pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1567    use std::collections::HashMap;
1568    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1569    for path in paths.iter() {
1570        mtimes
1571            .entry(path.clone())
1572            .or_insert_with(|| path_modified_time(path));
1573    }
1574    paths.sort_by(|left, right| {
1575        let left_mtime = mtimes.get(left).and_then(|v| *v);
1576        let right_mtime = mtimes.get(right).and_then(|v| *v);
1577        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1578    });
1579}
1580
1581/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1582/// the sort. Same fix, applied to grep matches that share files.
1583pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1584    use std::collections::HashMap;
1585    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1586    for m in matches.iter() {
1587        mtimes.entry(m.file.clone()).or_insert_with(|| {
1588            let resolved = resolve_match_path(project_root, &m.file);
1589            path_modified_time(&resolved)
1590        });
1591    }
1592    matches.sort_by(|left, right| {
1593        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1594        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1595        right_mtime
1596            .cmp(&left_mtime)
1597            .then_with(|| left.file.cmp(&right.file))
1598            .then_with(|| left.line.cmp(&right.line))
1599            .then_with(|| left.column.cmp(&right.column))
1600    });
1601}
1602
1603/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1604/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1605/// table from the search index), but it can still return different values if
1606/// the file is modified mid-sort. Snapshot once.
1607fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1608    matches: &mut [SharedGrepMatch],
1609    modified_for_path: F,
1610) where
1611    F: Fn(&Path) -> Option<SystemTime>,
1612{
1613    use std::collections::HashMap;
1614    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1615    for m in matches.iter() {
1616        let path = m.file.as_path().to_path_buf();
1617        mtimes
1618            .entry(path.clone())
1619            .or_insert_with(|| modified_for_path(&path));
1620    }
1621    matches.sort_by(|left, right| {
1622        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1623        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1624        right_mtime
1625            .cmp(&left_mtime)
1626            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1627            .then_with(|| left.line.cmp(&right.line))
1628            .then_with(|| left.column.cmp(&right.column))
1629    });
1630}
1631
1632pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1633    let resolved_project_root = canonicalize_or_normalize(project_root);
1634    let root = match path {
1635        Some(path) => {
1636            let path = PathBuf::from(path);
1637            if path.is_absolute() {
1638                canonicalize_or_normalize(&path)
1639            } else {
1640                normalize_path(&resolved_project_root.join(path))
1641            }
1642        }
1643        None => resolved_project_root.clone(),
1644    };
1645
1646    let use_index = is_within_search_root(&resolved_project_root, &root);
1647    SearchScope { root, use_index }
1648}
1649
1650pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1651    content_inspector::inspect(content).is_binary()
1652}
1653
1654pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1655    run_git(root, &["rev-parse", "HEAD"])
1656}
1657
1658pub fn project_cache_key(project_root: &Path) -> String {
1659    use sha2::{Digest, Sha256};
1660
1661    let mut hasher = Sha256::new();
1662
1663    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1664        // Git repo: root commit is the unique identity.
1665        // Same repo cloned anywhere produces the same key.
1666        hasher.update(root_commit.as_bytes());
1667    } else {
1668        // Non-git project: use the canonical filesystem path as identity.
1669        let canonical_root = canonicalize_or_normalize(project_root);
1670        hasher.update(canonical_root.to_string_lossy().as_bytes());
1671    }
1672
1673    let digest = format!("{:x}", hasher.finalize());
1674    digest[..16].to_string()
1675}
1676
1677impl PathFilters {
1678    fn matches(&self, root: &Path, path: &Path) -> bool {
1679        let relative = to_glob_path(&relative_to_root(root, path));
1680        if self
1681            .includes
1682            .as_ref()
1683            .is_some_and(|includes| !includes.is_match(&relative))
1684        {
1685            return false;
1686        }
1687        if self
1688            .excludes
1689            .as_ref()
1690            .is_some_and(|excludes| excludes.is_match(&relative))
1691        {
1692            return false;
1693        }
1694        true
1695    }
1696}
1697
1698fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1699    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1700}
1701
1702fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1703    if path.is_absolute() {
1704        path.to_path_buf()
1705    } else {
1706        project_root.join(path)
1707    }
1708}
1709
1710fn path_modified_time(path: &Path) -> Option<SystemTime> {
1711    fs::metadata(path)
1712        .and_then(|metadata| metadata.modified())
1713        .ok()
1714}
1715
1716fn normalize_path(path: &Path) -> PathBuf {
1717    let mut result = PathBuf::new();
1718    for component in path.components() {
1719        match component {
1720            Component::ParentDir => {
1721                if !result.pop() {
1722                    result.push(component);
1723                }
1724            }
1725            Component::CurDir => {}
1726            _ => result.push(component),
1727        }
1728    }
1729    result
1730}
1731
1732/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1733/// since the index was last written. Also detect new files and deleted files.
1734fn verify_file_mtimes(index: &mut SearchIndex) {
1735    // Collect stale files (mtime mismatch or deleted)
1736    let mut stale_paths = Vec::new();
1737    for entry in &index.files {
1738        if entry.path.as_os_str().is_empty() {
1739            continue; // tombstoned entry
1740        }
1741        let cached = FileFreshness {
1742            mtime: entry.modified,
1743            size: entry.size,
1744            content_hash: entry.content_hash,
1745        };
1746        match cache_freshness::verify_file(&entry.path, &cached) {
1747            FreshnessVerdict::HotFresh | FreshnessVerdict::ContentFresh { .. } => {}
1748            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
1749                stale_paths.push(entry.path.clone())
1750            }
1751        }
1752    }
1753
1754    for entry in &mut index.files {
1755        if entry.path.as_os_str().is_empty() {
1756            continue;
1757        }
1758        let cached = FileFreshness {
1759            mtime: entry.modified,
1760            size: entry.size,
1761            content_hash: entry.content_hash,
1762        };
1763        if let FreshnessVerdict::ContentFresh {
1764            new_mtime,
1765            new_size,
1766        } = cache_freshness::verify_file(&entry.path, &cached)
1767        {
1768            entry.modified = new_mtime;
1769            entry.size = new_size;
1770        }
1771    }
1772
1773    // Re-index stale files
1774    for path in &stale_paths {
1775        index.update_file(path);
1776    }
1777
1778    // Detect new files not in the index
1779    let filters = PathFilters::default();
1780    for path in walk_project_files(&index.project_root, &filters) {
1781        if !index.path_to_id.contains_key(&path) {
1782            index.update_file(&path);
1783        }
1784    }
1785
1786    if !stale_paths.is_empty() {
1787        crate::slog_info!(
1788            "search index: refreshed {} stale file(s) from disk cache",
1789            stale_paths.len()
1790        );
1791    }
1792}
1793
1794fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1795    normalize_path(path).starts_with(normalize_path(search_root))
1796}
1797
1798impl QueryBuild {
1799    fn into_query(self) -> RegexQuery {
1800        let mut query = RegexQuery::default();
1801
1802        for run in self.and_runs {
1803            add_run_to_and_query(&mut query, &run);
1804        }
1805
1806        for group in self.or_groups {
1807            let mut trigrams = BTreeSet::new();
1808            let mut filters = HashMap::new();
1809            for run in group {
1810                for (trigram, filter) in trigram_filters(&run) {
1811                    trigrams.insert(trigram);
1812                    merge_filter(filters.entry(trigram).or_default(), filter);
1813                }
1814            }
1815            if !trigrams.is_empty() {
1816                query.or_groups.push(trigrams.into_iter().collect());
1817                query.or_filters.push(filters);
1818            }
1819        }
1820
1821        query
1822    }
1823}
1824
1825fn build_query(hir: &Hir) -> QueryBuild {
1826    match hir.kind() {
1827        HirKind::Literal(literal) => {
1828            if literal.0.len() >= 3 {
1829                QueryBuild {
1830                    and_runs: vec![literal.0.to_vec()],
1831                    or_groups: Vec::new(),
1832                }
1833            } else {
1834                QueryBuild::default()
1835            }
1836        }
1837        HirKind::Capture(capture) => build_query(&capture.sub),
1838        HirKind::Concat(parts) => {
1839            let mut build = QueryBuild::default();
1840            for part in parts {
1841                let part_build = build_query(part);
1842                build.and_runs.extend(part_build.and_runs);
1843                build.or_groups.extend(part_build.or_groups);
1844            }
1845            build
1846        }
1847        HirKind::Alternation(parts) => {
1848            let mut group = Vec::new();
1849            for part in parts {
1850                let Some(mut choices) = guaranteed_run_choices(part) else {
1851                    return QueryBuild::default();
1852                };
1853                group.append(&mut choices);
1854            }
1855            if group.is_empty() {
1856                QueryBuild::default()
1857            } else {
1858                QueryBuild {
1859                    and_runs: Vec::new(),
1860                    or_groups: vec![group],
1861                }
1862            }
1863        }
1864        HirKind::Repetition(repetition) => {
1865            if repetition.min == 0 {
1866                QueryBuild::default()
1867            } else {
1868                build_query(&repetition.sub)
1869            }
1870        }
1871        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1872    }
1873}
1874
1875fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1876    match hir.kind() {
1877        HirKind::Literal(literal) => {
1878            if literal.0.len() >= 3 {
1879                Some(vec![literal.0.to_vec()])
1880            } else {
1881                None
1882            }
1883        }
1884        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1885        HirKind::Concat(parts) => {
1886            let mut runs = Vec::new();
1887            for part in parts {
1888                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1889                    runs.append(&mut part_runs);
1890                }
1891            }
1892            if runs.is_empty() {
1893                None
1894            } else {
1895                Some(runs)
1896            }
1897        }
1898        HirKind::Alternation(parts) => {
1899            let mut runs = Vec::new();
1900            for part in parts {
1901                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1902                    return None;
1903                };
1904                runs.append(&mut part_runs);
1905            }
1906            if runs.is_empty() {
1907                None
1908            } else {
1909                Some(runs)
1910            }
1911        }
1912        HirKind::Repetition(repetition) => {
1913            if repetition.min == 0 {
1914                None
1915            } else {
1916                guaranteed_run_choices(&repetition.sub)
1917            }
1918        }
1919        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1920    }
1921}
1922
1923fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1924    for (trigram, filter) in trigram_filters(run) {
1925        if !query.and_trigrams.contains(&trigram) {
1926            query.and_trigrams.push(trigram);
1927        }
1928        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1929    }
1930}
1931
1932fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1933    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1934    for (trigram, next_char, position) in extract_trigrams(run) {
1935        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1936        if next_char != EOF_SENTINEL {
1937            entry.next_mask |= mask_for_next_char(next_char);
1938        }
1939        entry.loc_mask |= mask_for_position(position);
1940    }
1941    filters.into_iter().collect()
1942}
1943
1944fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1945    target.next_mask |= filter.next_mask;
1946    target.loc_mask |= filter.loc_mask;
1947}
1948
1949fn mask_for_next_char(next_char: u8) -> u8 {
1950    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1951    1u8 << bit
1952}
1953
1954fn mask_for_position(position: usize) -> u8 {
1955    1u8 << (position % 8)
1956}
1957
1958fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1959    if patterns.is_empty() {
1960        return Ok(None);
1961    }
1962
1963    let mut builder = GlobSetBuilder::new();
1964    for pattern in patterns {
1965        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1966        builder.add(glob);
1967    }
1968    builder.build().map(Some).map_err(|error| error.to_string())
1969}
1970
1971fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1972    let mut buffer = [0u8; 4];
1973    reader.read_exact(&mut buffer)?;
1974    Ok(u32::from_le_bytes(buffer))
1975}
1976
1977fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1978    let mut buffer = [0u8; 8];
1979    reader.read_exact(&mut buffer)?;
1980    Ok(u64::from_le_bytes(buffer))
1981}
1982
1983fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1984    writer.write_all(&value.to_le_bytes())
1985}
1986
1987fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1988    writer.write_all(&value.to_le_bytes())
1989}
1990
1991fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1992    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1993        return Err(std::io::Error::other("search index checksum missing"));
1994    };
1995    let expected = u32::from_le_bytes(*stored);
1996    let actual = crc32fast::hash(body);
1997    if actual != expected {
1998        return Err(std::io::Error::other("search index checksum mismatch"));
1999    }
2000    Ok(())
2001}
2002
2003fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2004    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2005    total_len.checked_sub(pos)
2006}
2007
2008fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2009    let output = Command::new("git")
2010        .arg("-C")
2011        .arg(root)
2012        .args(args)
2013        .output()
2014        .ok()?;
2015    if !output.status.success() {
2016        return None;
2017    }
2018    let value = String::from_utf8(output.stdout).ok()?;
2019    let value = value.trim().to_string();
2020    if value.is_empty() {
2021        None
2022    } else {
2023        Some(value)
2024    }
2025}
2026
2027fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2028    let diff_range = format!("{}..{}", from, to);
2029    let output = match Command::new("git")
2030        .arg("-C")
2031        .arg(root)
2032        .args(["diff", "--name-status", "-M", &diff_range])
2033        .output()
2034    {
2035        Ok(output) => output,
2036        Err(_) => return false,
2037    };
2038
2039    if !output.status.success() {
2040        return false;
2041    }
2042
2043    let Ok(diff) = String::from_utf8(output.stdout) else {
2044        return false;
2045    };
2046
2047    for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2048        let mut fields = line.split('\t');
2049        let Some(status) = fields.next() else {
2050            continue;
2051        };
2052
2053        if status.starts_with('R') {
2054            let Some(old_path) = fields
2055                .next()
2056                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2057            else {
2058                continue;
2059            };
2060            let Some(new_path) = fields
2061                .next()
2062                .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2063            else {
2064                continue;
2065            };
2066            index.remove_file(&old_path);
2067            index.update_file(&new_path);
2068            continue;
2069        }
2070
2071        let Some(path) = fields
2072            .next()
2073            .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2074        else {
2075            continue;
2076        };
2077        if status.starts_with('D') || !path.exists() {
2078            index.remove_file(&path);
2079        } else {
2080            index.update_file(&path);
2081        }
2082    }
2083
2084    true
2085}
2086
2087fn is_binary_path(path: &Path, size: u64) -> bool {
2088    if size == 0 {
2089        return false;
2090    }
2091
2092    let mut file = match File::open(path) {
2093        Ok(file) => file,
2094        Err(_) => return true,
2095    };
2096
2097    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2098    match file.read(&mut preview) {
2099        Ok(read) => is_binary_bytes(&preview[..read]),
2100        Err(_) => true,
2101    }
2102}
2103
2104fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2105    let mut starts = vec![0usize];
2106    for (index, byte) in content.iter().copied().enumerate() {
2107        if byte == b'\n' {
2108            starts.push(index + 1);
2109        }
2110    }
2111    starts
2112}
2113
2114fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2115    let line_index = match line_starts.binary_search(&offset) {
2116        Ok(index) => index,
2117        Err(index) => index.saturating_sub(1),
2118    };
2119    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2120    let line_end = content[line_start..]
2121        .iter()
2122        .position(|byte| *byte == b'\n')
2123        .map(|length| line_start + length)
2124        .unwrap_or(content.len());
2125    let mut line_slice = &content[line_start..line_end];
2126    if line_slice.ends_with(b"\r") {
2127        line_slice = &line_slice[..line_slice.len() - 1];
2128    }
2129    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2130    let column = String::from_utf8_lossy(&content[line_start..offset])
2131        .chars()
2132        .count() as u32
2133        + 1;
2134    (line_index as u32 + 1, column, line_text)
2135}
2136
2137fn to_glob_path(path: &Path) -> String {
2138    path.to_string_lossy().replace('\\', "/")
2139}
2140
2141#[cfg(test)]
2142mod tests {
2143    use std::process::Command;
2144
2145    use super::*;
2146
2147    #[test]
2148    fn extract_trigrams_tracks_next_char_and_position() {
2149        let trigrams = extract_trigrams(b"Rust");
2150        assert_eq!(trigrams.len(), 2);
2151        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2152        assert_eq!(
2153            trigrams[1],
2154            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2155        );
2156    }
2157
2158    #[test]
2159    fn decompose_regex_extracts_literals_and_alternations() {
2160        let query = decompose_regex("abc(def|ghi)xyz");
2161        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2162        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2163        assert_eq!(query.or_groups.len(), 1);
2164        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2165        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2166    }
2167
2168    #[test]
2169    fn candidates_intersect_posting_lists() {
2170        let mut index = SearchIndex::new();
2171        let dir = tempfile::tempdir().expect("create temp dir");
2172        let alpha = dir.path().join("alpha.txt");
2173        let beta = dir.path().join("beta.txt");
2174        fs::write(&alpha, "abcdef").expect("write alpha");
2175        fs::write(&beta, "abcxyz").expect("write beta");
2176        index.project_root = dir.path().to_path_buf();
2177        index.index_file(&alpha, b"abcdef");
2178        index.index_file(&beta, b"abcxyz");
2179
2180        let query = RegexQuery {
2181            and_trigrams: vec![
2182                pack_trigram(b'a', b'b', b'c'),
2183                pack_trigram(b'd', b'e', b'f'),
2184            ],
2185            ..RegexQuery::default()
2186        };
2187
2188        let candidates = index.candidates(&query);
2189        assert_eq!(candidates.len(), 1);
2190        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2191    }
2192
2193    #[test]
2194    fn candidates_apply_bloom_filters() {
2195        let mut index = SearchIndex::new();
2196        let dir = tempfile::tempdir().expect("create temp dir");
2197        let file = dir.path().join("sample.txt");
2198        fs::write(&file, "abcd efgh").expect("write sample");
2199        index.project_root = dir.path().to_path_buf();
2200        index.index_file(&file, b"abcd efgh");
2201
2202        let trigram = pack_trigram(b'a', b'b', b'c');
2203        let matching_filter = PostingFilter {
2204            next_mask: mask_for_next_char(b'd'),
2205            loc_mask: mask_for_position(0),
2206        };
2207        let non_matching_filter = PostingFilter {
2208            next_mask: mask_for_next_char(b'z'),
2209            loc_mask: mask_for_position(0),
2210        };
2211
2212        assert_eq!(
2213            index
2214                .postings_for_trigram(trigram, Some(matching_filter))
2215                .len(),
2216            1
2217        );
2218        assert!(index
2219            .postings_for_trigram(trigram, Some(non_matching_filter))
2220            .is_empty());
2221    }
2222
2223    #[test]
2224    fn disk_round_trip_preserves_postings_and_files() {
2225        let dir = tempfile::tempdir().expect("create temp dir");
2226        let project = dir.path().join("project");
2227        fs::create_dir_all(&project).expect("create project dir");
2228        let file = project.join("src.txt");
2229        fs::write(&file, "abcdef").expect("write source");
2230
2231        let mut index = SearchIndex::build(&project);
2232        index.git_head = Some("deadbeef".to_string());
2233        let cache_dir = dir.path().join("cache");
2234        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2235
2236        let loaded =
2237            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2238        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2239        assert_eq!(loaded.files.len(), 1);
2240        assert_eq!(
2241            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2242            PathBuf::from("src.txt")
2243        );
2244        assert_eq!(loaded.postings.len(), index.postings.len());
2245        assert!(loaded
2246            .postings
2247            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2248    }
2249
2250    #[test]
2251    fn cache_path_helpers_reject_absolute_and_parent_paths() {
2252        let root = PathBuf::from("/tmp/aft-project");
2253
2254        assert_eq!(
2255            cache_relative_path(&root, &root.join("src/lib.rs")),
2256            Some(PathBuf::from("src/lib.rs"))
2257        );
2258        assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2259        assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2260        assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2261        assert_eq!(
2262            cached_path_under_root(&root, Path::new("src/./lib.rs")),
2263            Some(root.join("src/lib.rs"))
2264        );
2265    }
2266
2267    #[test]
2268    fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2269        let dir = tempfile::tempdir().expect("create temp dir");
2270        let project = dir.path().join("project");
2271        fs::create_dir_all(&project).expect("create project dir");
2272        let canonical_project = fs::canonicalize(&project).expect("canonical project");
2273        fs::write(project.join("old.txt"), "old token\n").expect("write old");
2274        fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2275
2276        Command::new("git")
2277            .arg("init")
2278            .arg(&project)
2279            .status()
2280            .expect("git init");
2281        for args in [
2282            ["config", "user.email", "aft@example.invalid"],
2283            ["config", "user.name", "AFT Test"],
2284        ] {
2285            Command::new("git")
2286                .arg("-C")
2287                .arg(&project)
2288                .args(args)
2289                .status()
2290                .expect("git config");
2291        }
2292        Command::new("git")
2293            .arg("-C")
2294            .arg(&project)
2295            .args(["add", "."])
2296            .status()
2297            .expect("git add initial");
2298        Command::new("git")
2299            .arg("-C")
2300            .arg(&project)
2301            .args(["commit", "-m", "initial"])
2302            .status()
2303            .expect("git commit initial");
2304        let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2305        let mut baseline = SearchIndex::build(&project);
2306        baseline.git_head = Some(previous.clone());
2307
2308        fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2309        Command::new("git")
2310            .arg("-C")
2311            .arg(&project)
2312            .args(["add", "-A"])
2313            .status()
2314            .expect("git add rename");
2315        Command::new("git")
2316            .arg("-C")
2317            .arg(&project)
2318            .args(["commit", "-m", "rename"])
2319            .status()
2320            .expect("git commit rename");
2321        let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2322
2323        fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2324        fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2325
2326        let refreshed = SearchIndex::rebuild_or_refresh(
2327            &project,
2328            DEFAULT_MAX_FILE_SIZE,
2329            Some(current),
2330            Some(baseline),
2331        );
2332
2333        assert!(!refreshed
2334            .path_to_id
2335            .contains_key(&canonical_project.join("old.txt")));
2336        assert!(refreshed
2337            .path_to_id
2338            .contains_key(&canonical_project.join("new.txt")));
2339        assert!(refreshed
2340            .path_to_id
2341            .contains_key(&canonical_project.join("untracked.txt")));
2342        let matches =
2343            refreshed.search_grep("after local edit", true, &[], &[], &canonical_project, 10);
2344        assert_eq!(matches.matches.len(), 1);
2345    }
2346
2347    #[test]
2348    fn read_from_disk_rejects_corrupt_postings_checksum() {
2349        let dir = tempfile::tempdir().expect("create temp dir");
2350        let project = dir.path().join("project");
2351        fs::create_dir_all(&project).expect("create project dir");
2352        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2353
2354        let index = SearchIndex::build(&project);
2355        let cache_dir = dir.path().join("cache");
2356        index.write_to_disk(&cache_dir, None);
2357
2358        let cache_path = cache_dir.join("cache.bin");
2359        let mut bytes = fs::read(&cache_path).expect("read cache");
2360        let middle = bytes.len() / 2;
2361        bytes[middle] ^= 0xff;
2362        fs::write(&cache_path, bytes).expect("write corrupted cache");
2363
2364        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2365    }
2366
2367    #[test]
2368    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2369        let dir = tempfile::tempdir().expect("create temp dir");
2370        let project = dir.path().join("project");
2371        fs::create_dir_all(&project).expect("create project dir");
2372        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2373
2374        let index = SearchIndex::build(&project);
2375        let cache_dir = dir.path().join("cache");
2376        index.write_to_disk(&cache_dir, None);
2377
2378        assert!(cache_dir.join("cache.bin").is_file());
2379        assert!(fs::read_dir(&cache_dir)
2380            .expect("read cache dir")
2381            .all(|entry| !entry
2382                .expect("cache entry")
2383                .file_name()
2384                .to_string_lossy()
2385                .contains(".tmp.")));
2386    }
2387
2388    #[test]
2389    fn concurrent_search_index_writes_do_not_corrupt() {
2390        let dir = tempfile::tempdir().expect("create temp dir");
2391        let project = dir.path().join("project");
2392        fs::create_dir_all(&project).expect("create project dir");
2393        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2394        let cache_dir = dir.path().join("cache");
2395
2396        let a_project = project.clone();
2397        let a_cache = cache_dir.clone();
2398        let a = std::thread::spawn(move || {
2399            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2400            let index = SearchIndex::build(&a_project);
2401            index.write_to_disk(&a_cache, None);
2402        });
2403        let b_project = project.clone();
2404        let b_cache = cache_dir.clone();
2405        let b = std::thread::spawn(move || {
2406            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2407            let index = SearchIndex::build(&b_project);
2408            index.write_to_disk(&b_cache, None);
2409        });
2410        a.join().expect("writer a");
2411        b.join().expect("writer b");
2412
2413        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2414    }
2415
2416    #[test]
2417    fn search_index_atomic_rename_survives_partial_write() {
2418        let dir = tempfile::tempdir().expect("create temp dir");
2419        let cache_dir = dir.path().join("cache");
2420        fs::create_dir_all(&cache_dir).expect("create cache dir");
2421        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2422
2423        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2424    }
2425
2426    #[test]
2427    fn project_cache_key_includes_checkout_path() {
2428        let dir = tempfile::tempdir().expect("create temp dir");
2429        let source = dir.path().join("source");
2430        fs::create_dir_all(&source).expect("create source repo dir");
2431        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2432
2433        assert!(Command::new("git")
2434            .current_dir(&source)
2435            .args(["init"])
2436            .status()
2437            .expect("init git repo")
2438            .success());
2439        assert!(Command::new("git")
2440            .current_dir(&source)
2441            .args(["add", "."])
2442            .status()
2443            .expect("git add")
2444            .success());
2445        assert!(Command::new("git")
2446            .current_dir(&source)
2447            .args([
2448                "-c",
2449                "user.name=AFT Tests",
2450                "-c",
2451                "user.email=aft-tests@example.com",
2452                "commit",
2453                "-m",
2454                "initial",
2455            ])
2456            .status()
2457            .expect("git commit")
2458            .success());
2459
2460        let clone = dir.path().join("clone");
2461        assert!(Command::new("git")
2462            .args(["clone", "--quiet"])
2463            .arg(&source)
2464            .arg(&clone)
2465            .status()
2466            .expect("git clone")
2467            .success());
2468
2469        let source_key = project_cache_key(&source);
2470        let clone_key = project_cache_key(&clone);
2471
2472        assert_eq!(source_key.len(), 16);
2473        assert_eq!(clone_key.len(), 16);
2474        // Same repo (same root commit) → same cache key regardless of clone path
2475        assert_eq!(source_key, clone_key);
2476    }
2477
2478    #[test]
2479    fn git_head_unchanged_picks_up_local_edits() {
2480        let dir = tempfile::tempdir().expect("create temp dir");
2481        let project = dir.path().join("repo");
2482        fs::create_dir_all(&project).expect("create repo dir");
2483        let file = project.join("tracked.txt");
2484        fs::write(&file, "oldtoken\n").expect("write file");
2485        assert!(Command::new("git")
2486            .current_dir(&project)
2487            .arg("init")
2488            .status()
2489            .unwrap()
2490            .success());
2491        assert!(Command::new("git")
2492            .current_dir(&project)
2493            .args(["add", "."])
2494            .status()
2495            .unwrap()
2496            .success());
2497        assert!(Command::new("git")
2498            .current_dir(&project)
2499            .args([
2500                "-c",
2501                "user.name=AFT Tests",
2502                "-c",
2503                "user.email=aft-tests@example.com",
2504                "commit",
2505                "-m",
2506                "initial"
2507            ])
2508            .status()
2509            .unwrap()
2510            .success());
2511        let head = current_git_head(&project);
2512        let mut baseline = SearchIndex::build(&project);
2513        baseline.git_head = head.clone();
2514        fs::write(&file, "newtoken\n").expect("edit tracked file");
2515
2516        let refreshed =
2517            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2518        let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2519
2520        assert_eq!(result.total_matches, 1);
2521    }
2522
2523    #[test]
2524    fn non_git_project_reuses_cache_when_files_unchanged() {
2525        let dir = tempfile::tempdir().expect("create temp dir");
2526        let project = dir.path().join("project");
2527        fs::create_dir_all(&project).expect("create project dir");
2528        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2529        let baseline = SearchIndex::build(&project);
2530        let baseline_file_count = baseline.file_count();
2531
2532        let refreshed =
2533            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2534
2535        assert_eq!(refreshed.file_count(), baseline_file_count);
2536        assert_eq!(
2537            refreshed
2538                .search_grep("unchangedtoken", true, &[], &[], &project, 10)
2539                .total_matches,
2540            1
2541        );
2542    }
2543
2544    #[test]
2545    fn resolve_search_scope_disables_index_for_external_path() {
2546        let dir = tempfile::tempdir().expect("create temp dir");
2547        let project = dir.path().join("project");
2548        let outside = dir.path().join("outside");
2549        fs::create_dir_all(&project).expect("create project dir");
2550        fs::create_dir_all(&outside).expect("create outside dir");
2551
2552        let scope = resolve_search_scope(&project, outside.to_str());
2553
2554        assert_eq!(
2555            scope.root,
2556            fs::canonicalize(&outside).expect("canonicalize outside")
2557        );
2558        assert!(!scope.use_index);
2559    }
2560
2561    #[test]
2562    fn grep_filters_matches_to_search_root() {
2563        let dir = tempfile::tempdir().expect("create temp dir");
2564        let project = dir.path().join("project");
2565        let src = project.join("src");
2566        let docs = project.join("docs");
2567        fs::create_dir_all(&src).expect("create src dir");
2568        fs::create_dir_all(&docs).expect("create docs dir");
2569        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2570        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2571
2572        let index = SearchIndex::build(&project);
2573        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2574
2575        assert_eq!(result.files_searched, 1);
2576        assert_eq!(result.files_with_matches, 1);
2577        assert_eq!(result.matches.len(), 1);
2578        // Index stores canonicalized paths; on macOS /var → /private/var
2579        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2580        assert_eq!(result.matches[0].file, expected);
2581    }
2582
2583    #[test]
2584    fn grep_deduplicates_multiple_matches_on_same_line() {
2585        let dir = tempfile::tempdir().expect("create temp dir");
2586        let project = dir.path().join("project");
2587        let src = project.join("src");
2588        fs::create_dir_all(&src).expect("create src dir");
2589        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2590
2591        let index = SearchIndex::build(&project);
2592        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2593
2594        assert_eq!(result.total_matches, 1);
2595        assert_eq!(result.matches.len(), 1);
2596    }
2597
2598    #[test]
2599    fn grep_reports_total_matches_before_truncation() {
2600        let dir = tempfile::tempdir().expect("create temp dir");
2601        let project = dir.path().join("project");
2602        let src = project.join("src");
2603        fs::create_dir_all(&src).expect("create src dir");
2604        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2605
2606        let index = SearchIndex::build(&project);
2607        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2608
2609        assert_eq!(result.total_matches, 2);
2610        assert_eq!(result.matches.len(), 1);
2611        assert!(result.truncated);
2612    }
2613
2614    #[test]
2615    fn glob_filters_results_to_search_root() {
2616        let dir = tempfile::tempdir().expect("create temp dir");
2617        let project = dir.path().join("project");
2618        let src = project.join("src");
2619        let scripts = project.join("scripts");
2620        fs::create_dir_all(&src).expect("create src dir");
2621        fs::create_dir_all(&scripts).expect("create scripts dir");
2622        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2623        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2624
2625        let index = SearchIndex::build(&project);
2626        let files = index.glob("**/*.rs", &src);
2627
2628        assert_eq!(
2629            files,
2630            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2631        );
2632    }
2633
2634    #[test]
2635    fn glob_includes_hidden_and_binary_files() {
2636        let dir = tempfile::tempdir().expect("create temp dir");
2637        let project = dir.path().join("project");
2638        let hidden_dir = project.join(".hidden");
2639        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2640        let hidden_file = hidden_dir.join("data.bin");
2641        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2642
2643        let index = SearchIndex::build(&project);
2644        let files = index.glob("**/*.bin", &project);
2645
2646        assert_eq!(
2647            files,
2648            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2649        );
2650    }
2651
2652    #[test]
2653    fn read_from_disk_rejects_invalid_nanos() {
2654        let dir = tempfile::tempdir().expect("create temp dir");
2655        let cache_dir = dir.path().join("cache");
2656        fs::create_dir_all(&cache_dir).expect("create cache dir");
2657
2658        let mut postings = Vec::new();
2659        postings.extend_from_slice(INDEX_MAGIC);
2660        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2661        postings.extend_from_slice(&0u32.to_le_bytes());
2662        postings.extend_from_slice(&1u32.to_le_bytes());
2663        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2664        postings.extend_from_slice(&1u32.to_le_bytes());
2665        postings.extend_from_slice(b"/");
2666        postings.push(0u8);
2667        postings.extend_from_slice(&1u32.to_le_bytes());
2668        postings.extend_from_slice(&0u64.to_le_bytes());
2669        postings.extend_from_slice(&0u64.to_le_bytes());
2670        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2671        postings.extend_from_slice(b"a");
2672        postings.extend_from_slice(&0u64.to_le_bytes());
2673
2674        let mut lookup = Vec::new();
2675        lookup.extend_from_slice(LOOKUP_MAGIC);
2676        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2677        lookup.extend_from_slice(&0u32.to_le_bytes());
2678
2679        let postings_checksum = crc32fast::hash(&postings);
2680        postings.extend_from_slice(&postings_checksum.to_le_bytes());
2681        let lookup_checksum = crc32fast::hash(&lookup);
2682        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2683        let mut cache = Vec::new();
2684        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2685        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2686        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2687        cache.extend_from_slice(&postings);
2688        cache.extend_from_slice(&lookup);
2689        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2690
2691        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2692    }
2693
2694    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2695    /// changed between cmp() calls.
2696    ///
2697    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2698    /// which does a `stat()` syscall. If the file was deleted, modified, or
2699    /// touched mid-sort, the comparator returned different values for the
2700    /// same input pair on different invocations. Rust's slice::sort detects
2701    /// this and panics with "user-provided comparison function does not
2702    /// correctly implement a total order".
2703    ///
2704    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2705    /// bridge invalidated files in parallel with grep's sort path. This
2706    /// test simulates the worst case: most paths don't exist (Err from
2707    /// fs::metadata) and sort still completes successfully.
2708    #[test]
2709    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2710        // Mix of existing and non-existing paths in deliberately
2711        // non-monotonic order — pre-fix, the sort would call stat() at
2712        // least N log N times and any flakiness would trigger the panic.
2713        let dir = tempfile::tempdir().expect("create tempdir");
2714        let mut paths: Vec<PathBuf> = Vec::new();
2715        for i in 0..30 {
2716            // Half exist, half don't.
2717            let path = if i % 2 == 0 {
2718                let p = dir.path().join(format!("real-{i}.rs"));
2719                fs::write(&p, format!("// {i}\n")).expect("write");
2720                p
2721            } else {
2722                dir.path().join(format!("missing-{i}.rs"))
2723            };
2724            paths.push(path);
2725        }
2726
2727        // Run the sort many times to maximise the chance of catching any
2728        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2729        for _ in 0..50 {
2730            let mut copy = paths.clone();
2731            sort_paths_by_mtime_desc(&mut copy);
2732            assert_eq!(copy.len(), paths.len());
2733        }
2734    }
2735
2736    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2737    /// the same conditions as sort_paths_by_mtime_desc. See the
2738    /// sort_paths_... test above for the full rationale.
2739    #[test]
2740    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2741        let dir = tempfile::tempdir().expect("create tempdir");
2742        let mut matches: Vec<GrepMatch> = Vec::new();
2743        for i in 0..30 {
2744            let file = if i % 2 == 0 {
2745                let p = dir.path().join(format!("real-{i}.rs"));
2746                fs::write(&p, format!("// {i}\n")).expect("write");
2747                p
2748            } else {
2749                dir.path().join(format!("missing-{i}.rs"))
2750            };
2751            matches.push(GrepMatch {
2752                file,
2753                line: u32::try_from(i).unwrap_or(0),
2754                column: 0,
2755                line_text: format!("match {i}"),
2756                match_text: format!("match {i}"),
2757            });
2758        }
2759
2760        for _ in 0..50 {
2761            let mut copy = matches.clone();
2762            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2763            assert_eq!(copy.len(), matches.len());
2764        }
2765    }
2766}
aft/search_index.rs

aft/
search_index.rs