aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
19const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
20const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
21const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
22const INDEX_VERSION: u32 = 2;
23const PREVIEW_BYTES: usize = 8 * 1024;
24const EOF_SENTINEL: u8 = 0;
25const MAX_ENTRIES: usize = 10_000_000;
26const MIN_FILE_ENTRY_BYTES: usize = 25;
27const LOOKUP_ENTRY_BYTES: usize = 16;
28const POSTING_BYTES: usize = 6;
29
30pub struct CacheLock {
31    path: PathBuf,
32}
33
34impl CacheLock {
35    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
36        fs::create_dir_all(cache_dir)?;
37        let path = cache_dir.join("cache.lock");
38        for _ in 0..200 {
39            match fs::OpenOptions::new()
40                .write(true)
41                .create_new(true)
42                .open(&path)
43            {
44                Ok(mut file) => {
45                    let _ = writeln!(file, "{}", std::process::id());
46                    let _ = file.sync_all();
47                    return Ok(Self { path });
48                }
49                Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => {
50                    std::thread::sleep(Duration::from_millis(10));
51                }
52                Err(error) => return Err(error),
53            }
54        }
55        Err(std::io::Error::other(
56            "timed out acquiring search cache lock",
57        ))
58    }
59}
60
61impl Drop for CacheLock {
62    fn drop(&mut self) {
63        let _ = fs::remove_file(&self.path);
64    }
65}
66
67#[derive(Clone, Debug)]
68pub struct SearchIndex {
69    pub postings: HashMap<u32, Vec<Posting>>,
70    pub files: Vec<FileEntry>,
71    pub path_to_id: HashMap<PathBuf, u32>,
72    pub ready: bool,
73    project_root: PathBuf,
74    git_head: Option<String>,
75    max_file_size: u64,
76    file_trigrams: HashMap<u32, Vec<u32>>,
77    unindexed_files: HashSet<u32>,
78}
79
80impl SearchIndex {
81    /// Number of indexed files.
82    pub fn file_count(&self) -> usize {
83        self.files.len()
84    }
85
86    /// Number of unique trigrams in the index.
87    pub fn trigram_count(&self) -> usize {
88        self.postings.len()
89    }
90}
91
92#[derive(Clone, Debug, PartialEq, Eq)]
93pub struct Posting {
94    pub file_id: u32,
95    pub next_mask: u8,
96    pub loc_mask: u8,
97}
98
99#[derive(Clone, Debug)]
100pub struct FileEntry {
101    pub path: PathBuf,
102    pub size: u64,
103    pub modified: SystemTime,
104}
105
106#[derive(Clone, Debug, PartialEq, Eq)]
107pub struct GrepMatch {
108    pub file: PathBuf,
109    pub line: u32,
110    pub column: u32,
111    pub line_text: String,
112    pub match_text: String,
113}
114
115#[derive(Clone, Debug)]
116pub struct GrepResult {
117    pub matches: Vec<GrepMatch>,
118    pub total_matches: usize,
119    pub files_searched: usize,
120    pub files_with_matches: usize,
121    pub index_status: IndexStatus,
122    pub truncated: bool,
123}
124
125#[derive(Clone, Copy, Debug, PartialEq, Eq)]
126pub enum IndexStatus {
127    Ready,
128    Building,
129    Fallback,
130}
131
132impl IndexStatus {
133    pub fn as_str(&self) -> &'static str {
134        match self {
135            IndexStatus::Ready => "Ready",
136            IndexStatus::Building => "Building",
137            IndexStatus::Fallback => "Fallback",
138        }
139    }
140}
141
142#[derive(Clone, Debug, Default)]
143pub struct RegexQuery {
144    pub and_trigrams: Vec<u32>,
145    pub or_groups: Vec<Vec<u32>>,
146    pub(crate) and_filters: HashMap<u32, PostingFilter>,
147    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
148}
149
150#[derive(Clone, Copy, Debug, Default)]
151pub(crate) struct PostingFilter {
152    next_mask: u8,
153    loc_mask: u8,
154}
155
156#[derive(Clone, Debug, Default)]
157struct QueryBuild {
158    and_runs: Vec<Vec<u8>>,
159    or_groups: Vec<Vec<Vec<u8>>>,
160}
161
162#[derive(Clone, Debug, Default)]
163pub(crate) struct PathFilters {
164    includes: Option<GlobSet>,
165    excludes: Option<GlobSet>,
166}
167
168#[derive(Clone, Debug)]
169pub(crate) struct SearchScope {
170    pub root: PathBuf,
171    pub use_index: bool,
172}
173
174#[derive(Clone, Debug)]
175struct SharedGrepMatch {
176    file: Arc<PathBuf>,
177    line: u32,
178    column: u32,
179    line_text: String,
180    match_text: String,
181}
182
183#[derive(Clone, Debug)]
184enum SearchMatcher {
185    Literal(LiteralSearch),
186    Regex(Regex),
187}
188
189#[derive(Clone, Debug)]
190enum LiteralSearch {
191    CaseSensitive(Vec<u8>),
192    AsciiCaseInsensitive(Vec<u8>),
193}
194
195impl SearchIndex {
196    pub fn new() -> Self {
197        SearchIndex {
198            postings: HashMap::new(),
199            files: Vec::new(),
200            path_to_id: HashMap::new(),
201            ready: false,
202            project_root: PathBuf::new(),
203            git_head: None,
204            max_file_size: DEFAULT_MAX_FILE_SIZE,
205            file_trigrams: HashMap::new(),
206            unindexed_files: HashSet::new(),
207        }
208    }
209
210    pub fn build(root: &Path) -> Self {
211        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
212    }
213
214    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
215        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
216        let mut index = SearchIndex {
217            project_root: project_root.clone(),
218            max_file_size,
219            ..SearchIndex::new()
220        };
221
222        let filters = PathFilters::default();
223        for path in walk_project_files(&project_root, &filters) {
224            index.update_file(&path);
225        }
226
227        index.git_head = current_git_head(&project_root);
228        index.ready = true;
229        index
230    }
231
232    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
233        self.remove_file(path);
234
235        let file_id = match self.allocate_file_id(path, content.len() as u64) {
236            Some(file_id) => file_id,
237            None => return,
238        };
239
240        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
241        for (trigram, next_char, position) in extract_trigrams(content) {
242            let entry = trigram_map.entry(trigram).or_default();
243            entry.next_mask |= mask_for_next_char(next_char);
244            entry.loc_mask |= mask_for_position(position);
245        }
246
247        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
248        for (trigram, filter) in trigram_map {
249            let postings = self.postings.entry(trigram).or_default();
250            postings.push(Posting {
251                file_id,
252                next_mask: filter.next_mask,
253                loc_mask: filter.loc_mask,
254            });
255            // Posting lists are kept sorted by file_id for binary search during
256            // intersection. Since file_ids are allocated incrementally, the new
257            // entry is usually already in order. Only sort when needed.
258            if postings.len() > 1
259                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
260            {
261                postings.sort_unstable_by_key(|p| p.file_id);
262            }
263            file_trigrams.push(trigram);
264        }
265
266        self.file_trigrams.insert(file_id, file_trigrams);
267        self.unindexed_files.remove(&file_id);
268    }
269
270    pub fn remove_file(&mut self, path: &Path) {
271        let Some(file_id) = self.path_to_id.remove(path) else {
272            return;
273        };
274
275        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
276            for trigram in trigrams {
277                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
278                    postings.retain(|posting| posting.file_id != file_id);
279                    postings.is_empty()
280                } else {
281                    false
282                };
283
284                if should_remove {
285                    self.postings.remove(&trigram);
286                }
287            }
288        }
289
290        self.unindexed_files.remove(&file_id);
291        if let Some(file) = self.files.get_mut(file_id as usize) {
292            file.path = PathBuf::new();
293            file.size = 0;
294            file.modified = UNIX_EPOCH;
295        }
296    }
297
298    pub fn update_file(&mut self, path: &Path) {
299        self.remove_file(path);
300
301        let metadata = match fs::metadata(path) {
302            Ok(metadata) if metadata.is_file() => metadata,
303            _ => return,
304        };
305
306        if is_binary_path(path, metadata.len()) {
307            self.track_unindexed_file(path, &metadata);
308            return;
309        }
310
311        if metadata.len() > self.max_file_size {
312            self.track_unindexed_file(path, &metadata);
313            return;
314        }
315
316        let content = match fs::read(path) {
317            Ok(content) => content,
318            Err(_) => return,
319        };
320
321        if is_binary_bytes(&content) {
322            self.track_unindexed_file(path, &metadata);
323            return;
324        }
325
326        self.index_file(path, &content);
327    }
328
329    pub fn grep(
330        &self,
331        pattern: &str,
332        case_sensitive: bool,
333        include: &[String],
334        exclude: &[String],
335        search_root: &Path,
336        max_results: usize,
337    ) -> GrepResult {
338        self.search_grep(
339            pattern,
340            case_sensitive,
341            include,
342            exclude,
343            search_root,
344            max_results,
345        )
346    }
347
348    pub fn search_grep(
349        &self,
350        pattern: &str,
351        case_sensitive: bool,
352        include: &[String],
353        exclude: &[String],
354        search_root: &Path,
355        max_results: usize,
356    ) -> GrepResult {
357        // Detect if pattern is a plain literal (no regex metacharacters).
358        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
359        let is_literal = !pattern.chars().any(|c| {
360            matches!(
361                c,
362                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
363            )
364        });
365
366        let literal_search = if is_literal {
367            if case_sensitive {
368                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
369            } else if pattern.is_ascii() {
370                Some(LiteralSearch::AsciiCaseInsensitive(
371                    pattern
372                        .as_bytes()
373                        .iter()
374                        .map(|byte| byte.to_ascii_lowercase())
375                        .collect(),
376                ))
377            } else {
378                None
379            }
380        } else {
381            None
382        };
383
384        // Build the regex for non-literal patterns (or literal Unicode fallback).
385        let regex = if literal_search.is_some() {
386            None
387        } else {
388            let regex_pattern = if is_literal {
389                regex::escape(pattern)
390            } else {
391                pattern.to_string()
392            };
393            let mut builder = RegexBuilder::new(&regex_pattern);
394            builder.case_insensitive(!case_sensitive);
395            // Treat `^` and `$` as line anchors (grep semantics), not file anchors.
396            builder.multi_line(true);
397            match builder.build() {
398                Ok(r) => Some(r),
399                Err(_) => {
400                    return GrepResult {
401                        matches: Vec::new(),
402                        total_matches: 0,
403                        files_searched: 0,
404                        files_with_matches: 0,
405                        index_status: if self.ready {
406                            IndexStatus::Ready
407                        } else {
408                            IndexStatus::Building
409                        },
410                        truncated: false,
411                    };
412                }
413            }
414        };
415
416        let matcher = if let Some(literal_search) = literal_search {
417            SearchMatcher::Literal(literal_search)
418        } else {
419            SearchMatcher::Regex(
420                regex.expect("regex should exist when literal matcher is unavailable"),
421            )
422        };
423
424        let filters = match build_path_filters(include, exclude) {
425            Ok(filters) => filters,
426            Err(_) => PathFilters::default(),
427        };
428        let search_root = canonicalize_or_normalize(search_root);
429
430        let query = decompose_regex(pattern);
431        let candidate_ids = self.candidates(&query);
432
433        let candidate_files: Vec<&FileEntry> = candidate_ids
434            .into_iter()
435            .filter_map(|file_id| self.files.get(file_id as usize))
436            .filter(|file| !file.path.as_os_str().is_empty())
437            .filter(|file| is_within_search_root(&search_root, &file.path))
438            .filter(|file| filters.matches(&self.project_root, &file.path))
439            .collect();
440
441        let total_matches = AtomicUsize::new(0);
442        let files_searched = AtomicUsize::new(0);
443        let files_with_matches = AtomicUsize::new(0);
444        let truncated = AtomicBool::new(false);
445        let stop_after = max_results.saturating_mul(2);
446
447        let mut matches = if candidate_files.len() > 10 {
448            candidate_files
449                .par_iter()
450                .map(|file| {
451                    search_candidate_file(
452                        file,
453                        &matcher,
454                        max_results,
455                        stop_after,
456                        &total_matches,
457                        &files_searched,
458                        &files_with_matches,
459                        &truncated,
460                    )
461                })
462                .reduce(Vec::new, |mut left, mut right| {
463                    left.append(&mut right);
464                    left
465                })
466        } else {
467            let mut matches = Vec::new();
468            for file in candidate_files {
469                matches.extend(search_candidate_file(
470                    file,
471                    &matcher,
472                    max_results,
473                    stop_after,
474                    &total_matches,
475                    &files_searched,
476                    &files_with_matches,
477                    &truncated,
478                ));
479
480                if should_stop_search(&truncated, &total_matches, stop_after) {
481                    break;
482                }
483            }
484            matches
485        };
486
487        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
488            self.path_to_id
489                .get(path)
490                .and_then(|file_id| self.files.get(*file_id as usize))
491                .map(|file| file.modified)
492        });
493
494        let matches = matches
495            .into_iter()
496            .map(|matched| GrepMatch {
497                file: matched.file.as_ref().clone(),
498                line: matched.line,
499                column: matched.column,
500                line_text: matched.line_text,
501                match_text: matched.match_text,
502            })
503            .collect();
504
505        GrepResult {
506            total_matches: total_matches.load(Ordering::Relaxed),
507            matches,
508            files_searched: files_searched.load(Ordering::Relaxed),
509            files_with_matches: files_with_matches.load(Ordering::Relaxed),
510            index_status: if self.ready {
511                IndexStatus::Ready
512            } else {
513                IndexStatus::Building
514            },
515            truncated: truncated.load(Ordering::Relaxed),
516        }
517    }
518
519    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
520        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
521            Ok(filters) => filters,
522            Err(_) => return Vec::new(),
523        };
524        let search_root = canonicalize_or_normalize(search_root);
525        let mut entries = self
526            .files
527            .iter()
528            .filter(|file| !file.path.as_os_str().is_empty())
529            .filter(|file| is_within_search_root(&search_root, &file.path))
530            .filter(|file| filters.matches(&self.project_root, &file.path))
531            .map(|file| (file.path.clone(), file.modified))
532            .collect::<Vec<_>>();
533
534        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
535            right_mtime
536                .cmp(left_mtime)
537                .then_with(|| left_path.cmp(right_path))
538        });
539
540        entries.into_iter().map(|(path, _)| path).collect()
541    }
542
543    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
544        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
545            return self.active_file_ids();
546        }
547
548        let mut and_trigrams = query.and_trigrams.clone();
549        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
550
551        let mut current: Option<Vec<u32>> = None;
552
553        for trigram in and_trigrams {
554            let filter = query.and_filters.get(&trigram).copied();
555            let matches = self.postings_for_trigram(trigram, filter);
556            current = Some(match current.take() {
557                Some(existing) => intersect_sorted_ids(&existing, &matches),
558                None => matches,
559            });
560
561            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
562                break;
563            }
564        }
565
566        let mut current = current.unwrap_or_else(|| self.active_file_ids());
567
568        for (index, group) in query.or_groups.iter().enumerate() {
569            let mut group_matches = Vec::new();
570            let filters = query.or_filters.get(index);
571
572            for trigram in group {
573                let filter = filters.and_then(|filters| filters.get(trigram).copied());
574                let matches = self.postings_for_trigram(*trigram, filter);
575                if group_matches.is_empty() {
576                    group_matches = matches;
577                } else {
578                    group_matches = union_sorted_ids(&group_matches, &matches);
579                }
580            }
581
582            current = intersect_sorted_ids(&current, &group_matches);
583            if current.is_empty() {
584                break;
585            }
586        }
587
588        let mut unindexed = self
589            .unindexed_files
590            .iter()
591            .copied()
592            .filter(|file_id| self.is_active_file(*file_id))
593            .collect::<Vec<_>>();
594        if !unindexed.is_empty() {
595            unindexed.sort_unstable();
596            current = union_sorted_ids(&current, &unindexed);
597        }
598
599        current
600    }
601
602    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
603        if fs::create_dir_all(cache_dir).is_err() {
604            return;
605        }
606
607        let cache_path = cache_dir.join("cache.bin");
608        let tmp_cache = cache_dir.join(format!(
609            "cache.bin.tmp.{}.{}",
610            std::process::id(),
611            SystemTime::now()
612                .duration_since(UNIX_EPOCH)
613                .unwrap_or(Duration::ZERO)
614                .as_nanos()
615        ));
616
617        let active_ids = self.active_file_ids();
618        let mut id_map = HashMap::new();
619        for (new_id, old_id) in active_ids.iter().enumerate() {
620            let Ok(new_id_u32) = u32::try_from(new_id) else {
621                return;
622            };
623            id_map.insert(*old_id, new_id_u32);
624        }
625
626        let write_result = (|| -> std::io::Result<()> {
627            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
628
629            postings_writer.write_all(INDEX_MAGIC)?;
630            write_u32(&mut postings_writer, INDEX_VERSION)?;
631
632            let head = git_head.unwrap_or_default();
633            let root = self.project_root.to_string_lossy();
634            let head_len = u32::try_from(head.len())
635                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
636            let root_len = u32::try_from(root.len())
637                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
638            let file_count = u32::try_from(active_ids.len())
639                .map_err(|_| std::io::Error::other("too many files to cache"))?;
640
641            write_u32(&mut postings_writer, head_len)?;
642            write_u32(&mut postings_writer, root_len)?;
643            write_u64(&mut postings_writer, self.max_file_size)?;
644            write_u32(&mut postings_writer, file_count)?;
645            postings_writer.write_all(head.as_bytes())?;
646            postings_writer.write_all(root.as_bytes())?;
647
648            for old_id in &active_ids {
649                let Some(file) = self.files.get(*old_id as usize) else {
650                    return Err(std::io::Error::other("missing file entry for cache write"));
651                };
652                let path = relative_to_root(&self.project_root, &file.path);
653                let path = path.to_string_lossy();
654                let path_len = u32::try_from(path.len())
655                    .map_err(|_| std::io::Error::other("cached path too large"))?;
656                let modified = file
657                    .modified
658                    .duration_since(UNIX_EPOCH)
659                    .unwrap_or(Duration::ZERO);
660                let unindexed = if self.unindexed_files.contains(old_id) {
661                    1u8
662                } else {
663                    0u8
664                };
665
666                postings_writer.write_all(&[unindexed])?;
667                write_u32(&mut postings_writer, path_len)?;
668                write_u64(&mut postings_writer, file.size)?;
669                write_u64(&mut postings_writer, modified.as_secs())?;
670                write_u32(&mut postings_writer, modified.subsec_nanos())?;
671                postings_writer.write_all(path.as_bytes())?;
672            }
673
674            let mut lookup_entries = Vec::new();
675            let mut postings_blob = Vec::new();
676            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
677            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
678
679            for (trigram, postings) in sorted_postings {
680                let offset = u64::try_from(postings_blob.len())
681                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
682                let mut count = 0u32;
683
684                for posting in postings {
685                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
686                        continue;
687                    };
688
689                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
690                    postings_blob.push(posting.next_mask);
691                    postings_blob.push(posting.loc_mask);
692                    count = count.saturating_add(1);
693                }
694
695                if count > 0 {
696                    lookup_entries.push((*trigram, offset, count));
697                }
698            }
699
700            write_u64(
701                &mut postings_writer,
702                u64::try_from(postings_blob.len())
703                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
704            )?;
705            postings_writer.write_all(&postings_blob)?;
706            postings_writer.flush()?;
707            let mut postings_blob_file = postings_writer
708                .into_inner()
709                .map_err(|error| std::io::Error::other(error.to_string()))?
710                .into_inner();
711            let checksum = crc32fast::hash(&postings_blob_file);
712            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
713
714            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
715            let entry_count = u32::try_from(lookup_entries.len())
716                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
717
718            lookup_writer.write_all(LOOKUP_MAGIC)?;
719            write_u32(&mut lookup_writer, INDEX_VERSION)?;
720            write_u32(&mut lookup_writer, entry_count)?;
721
722            for (trigram, offset, count) in lookup_entries {
723                write_u32(&mut lookup_writer, trigram)?;
724                write_u64(&mut lookup_writer, offset)?;
725                write_u32(&mut lookup_writer, count)?;
726            }
727
728            lookup_writer.flush()?;
729            let mut lookup_blob_file = lookup_writer
730                .into_inner()
731                .map_err(|error| std::io::Error::other(error.to_string()))?
732                .into_inner();
733            let checksum = crc32fast::hash(&lookup_blob_file);
734            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
735
736            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
737            write_u32(&mut cache_writer, CACHE_MAGIC)?;
738            write_u32(&mut cache_writer, INDEX_VERSION)?;
739            write_u64(
740                &mut cache_writer,
741                u64::try_from(postings_blob_file.len())
742                    .map_err(|_| std::io::Error::other("postings section too large"))?,
743            )?;
744            cache_writer.write_all(&postings_blob_file)?;
745            cache_writer.write_all(&lookup_blob_file)?;
746            cache_writer.flush()?;
747            cache_writer.get_ref().sync_all()?;
748            drop(cache_writer);
749            fs::rename(&tmp_cache, &cache_path)?;
750
751            Ok(())
752        })();
753
754        if write_result.is_err() {
755            let _ = fs::remove_file(&tmp_cache);
756        }
757    }
758
759    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
760        let cache_path = cache_dir.join("cache.bin");
761        let cache_bytes = fs::read(&cache_path).ok()?;
762        if cache_bytes.len() < 16 {
763            return None;
764        }
765        let mut header = Cursor::new(&cache_bytes);
766        if read_u32(&mut header).ok()? != CACHE_MAGIC {
767            return None;
768        }
769        if read_u32(&mut header).ok()? != INDEX_VERSION {
770            return None;
771        }
772        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
773        let start = usize::try_from(header.position()).ok()?;
774        let postings_end = start.checked_add(postings_len_total)?;
775        if postings_end > cache_bytes.len() {
776            return None;
777        }
778        let postings_bytes = &cache_bytes[start..postings_end];
779        let lookup_bytes = &cache_bytes[postings_end..];
780        let lookup_len_total = lookup_bytes.len();
781        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
782        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
783        if postings_len_total < 4 || lookup_len_total < 4 {
784            return None;
785        }
786        verify_crc32_bytes_slice(postings_bytes).ok()?;
787        verify_crc32_bytes_slice(lookup_bytes).ok()?;
788
789        let mut magic = [0u8; 8];
790        postings_reader.read_exact(&mut magic).ok()?;
791        if &magic != INDEX_MAGIC {
792            return None;
793        }
794        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
795            return None;
796        }
797
798        let head_len = read_u32(&mut postings_reader).ok()? as usize;
799        let root_len = read_u32(&mut postings_reader).ok()? as usize;
800        let max_file_size = read_u64(&mut postings_reader).ok()?;
801        let file_count = read_u32(&mut postings_reader).ok()? as usize;
802        if file_count > MAX_ENTRIES {
803            return None;
804        }
805        let postings_body_len = postings_len_total.checked_sub(4)?;
806        let lookup_body_len = lookup_len_total.checked_sub(4)?;
807
808        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
809        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
810        if minimum_file_bytes > remaining_postings {
811            return None;
812        }
813
814        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
815            return None;
816        }
817        let mut head_bytes = vec![0u8; head_len];
818        postings_reader.read_exact(&mut head_bytes).ok()?;
819        let git_head = String::from_utf8(head_bytes)
820            .ok()
821            .filter(|head| !head.is_empty());
822
823        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
824            return None;
825        }
826        let mut root_bytes = vec![0u8; root_len];
827        postings_reader.read_exact(&mut root_bytes).ok()?;
828        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
829
830        let mut files = Vec::with_capacity(file_count);
831        let mut path_to_id = HashMap::new();
832        let mut unindexed_files = HashSet::new();
833
834        for file_id in 0..file_count {
835            let mut unindexed = [0u8; 1];
836            postings_reader.read_exact(&mut unindexed).ok()?;
837            let path_len = read_u32(&mut postings_reader).ok()? as usize;
838            let size = read_u64(&mut postings_reader).ok()?;
839            let secs = read_u64(&mut postings_reader).ok()?;
840            let nanos = read_u32(&mut postings_reader).ok()?;
841            if nanos >= 1_000_000_000 {
842                return None;
843            }
844            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
845                return None;
846            }
847            let mut path_bytes = vec![0u8; path_len];
848            postings_reader.read_exact(&mut path_bytes).ok()?;
849            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
850            let full_path = project_root.join(relative_path);
851            let file_id_u32 = u32::try_from(file_id).ok()?;
852
853            files.push(FileEntry {
854                path: full_path.clone(),
855                size,
856                modified: UNIX_EPOCH + Duration::new(secs, nanos),
857            });
858            path_to_id.insert(full_path, file_id_u32);
859            if unindexed[0] == 1 {
860                unindexed_files.insert(file_id_u32);
861            }
862        }
863
864        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
865        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
866        if postings_len > max_postings_bytes {
867            return None;
868        }
869        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
870            return None;
871        }
872        let mut postings_blob = vec![0u8; postings_len];
873        postings_reader.read_exact(&mut postings_blob).ok()?;
874
875        let mut lookup_magic = [0u8; 8];
876        lookup_reader.read_exact(&mut lookup_magic).ok()?;
877        if &lookup_magic != LOOKUP_MAGIC {
878            return None;
879        }
880        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
881            return None;
882        }
883        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
884        if entry_count > MAX_ENTRIES {
885            return None;
886        }
887        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
888        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
889        if minimum_lookup_bytes > remaining_lookup {
890            return None;
891        }
892
893        let mut postings = HashMap::new();
894        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
895
896        for _ in 0..entry_count {
897            let trigram = read_u32(&mut lookup_reader).ok()?;
898            let offset = read_u64(&mut lookup_reader).ok()? as usize;
899            let count = read_u32(&mut lookup_reader).ok()? as usize;
900            if count > MAX_ENTRIES {
901                return None;
902            }
903            let bytes_len = count.checked_mul(POSTING_BYTES)?;
904            let end = offset.checked_add(bytes_len)?;
905            if end > postings_blob.len() {
906                return None;
907            }
908
909            let mut trigram_postings = Vec::with_capacity(count);
910            for chunk in postings_blob[offset..end].chunks_exact(6) {
911                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
912                let posting = Posting {
913                    file_id,
914                    next_mask: chunk[4],
915                    loc_mask: chunk[5],
916                };
917                trigram_postings.push(posting.clone());
918                file_trigrams.entry(file_id).or_default().push(trigram);
919            }
920            postings.insert(trigram, trigram_postings);
921        }
922
923        Some(SearchIndex {
924            postings,
925            files,
926            path_to_id,
927            ready: true,
928            project_root,
929            git_head,
930            max_file_size,
931            file_trigrams,
932            unindexed_files,
933        })
934    }
935
936    pub(crate) fn stored_git_head(&self) -> Option<&str> {
937        self.git_head.as_deref()
938    }
939
940    pub(crate) fn set_ready(&mut self, ready: bool) {
941        self.ready = ready;
942    }
943
944    pub(crate) fn rebuild_or_refresh(
945        root: &Path,
946        max_file_size: u64,
947        current_head: Option<String>,
948        baseline: Option<SearchIndex>,
949    ) -> Self {
950        if let Some(mut baseline) = baseline {
951            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
952            baseline.max_file_size = max_file_size;
953
954            if baseline.git_head == current_head || current_head.is_none() {
955                // HEAD matches, but files may have changed on disk since the index was
956                // last written (e.g., uncommitted edits, stash pop, manual file changes
957                // while OpenCode was closed). Verify mtimes and re-index stale files.
958                // Non-git projects also use this per-file (path, mtime, size)
959                // fingerprint so unchanged trees reuse the disk cache instead of
960                // rebuilding every configure.
961                baseline.git_head = current_head;
962                verify_file_mtimes(&mut baseline);
963                baseline.ready = true;
964                return baseline;
965            }
966
967            if let (Some(previous), Some(current)) =
968                (baseline.git_head.clone(), current_head.clone())
969            {
970                let project_root = baseline.project_root.clone();
971                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
972                    baseline.git_head = Some(current);
973                    baseline.ready = true;
974                    return baseline;
975                }
976            }
977        }
978
979        SearchIndex::build_with_limit(root, max_file_size)
980    }
981
982    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
983        let file_id = u32::try_from(self.files.len()).ok()?;
984        let metadata = fs::metadata(path).ok();
985        let size = metadata
986            .as_ref()
987            .map_or(size_hint, |metadata| metadata.len());
988        let modified = metadata
989            .and_then(|metadata| metadata.modified().ok())
990            .unwrap_or(UNIX_EPOCH);
991
992        self.files.push(FileEntry {
993            path: path.to_path_buf(),
994            size,
995            modified,
996        });
997        self.path_to_id.insert(path.to_path_buf(), file_id);
998        Some(file_id)
999    }
1000
1001    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1002        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1003            return;
1004        };
1005        self.unindexed_files.insert(file_id);
1006        self.file_trigrams.insert(file_id, Vec::new());
1007    }
1008
1009    fn active_file_ids(&self) -> Vec<u32> {
1010        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1011        ids.sort_unstable();
1012        ids
1013    }
1014
1015    fn is_active_file(&self, file_id: u32) -> bool {
1016        self.files
1017            .get(file_id as usize)
1018            .map(|file| !file.path.as_os_str().is_empty())
1019            .unwrap_or(false)
1020    }
1021
1022    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1023        let Some(postings) = self.postings.get(&trigram) else {
1024            return Vec::new();
1025        };
1026
1027        let mut matches = Vec::with_capacity(postings.len());
1028
1029        for posting in postings {
1030            if let Some(filter) = filter {
1031                // next_mask: bloom filter check — the character following this trigram in the
1032                // query must also appear after this trigram somewhere in the file.
1033                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1034                    continue;
1035                }
1036                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1037                // between consecutive trigram pairs, but is NOT used as a single-trigram
1038                // filter because the position in the query string has no relationship to
1039                // the position in the file. Using it here causes false negatives.
1040            }
1041            if self.is_active_file(posting.file_id) {
1042                matches.push(posting.file_id);
1043            }
1044        }
1045
1046        matches
1047    }
1048}
1049
1050fn search_candidate_file(
1051    file: &FileEntry,
1052    matcher: &SearchMatcher,
1053    max_results: usize,
1054    stop_after: usize,
1055    total_matches: &AtomicUsize,
1056    files_searched: &AtomicUsize,
1057    files_with_matches: &AtomicUsize,
1058    truncated: &AtomicBool,
1059) -> Vec<SharedGrepMatch> {
1060    if should_stop_search(truncated, total_matches, stop_after) {
1061        return Vec::new();
1062    }
1063
1064    let content = match read_indexed_file_bytes(&file.path) {
1065        Some(content) => content,
1066        None => return Vec::new(),
1067    };
1068    // Defense in depth: even though indexing tries to filter binaries via
1069    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1070    // query time. content_inspector is fast (~bytes-per-cycle on a small
1071    // preview) and this guarantees we never surface matches inside binary
1072    // files even if the indexer somehow let one through (e.g. file changed
1073    // between indexing and query).
1074    if is_binary_bytes(&content) {
1075        return Vec::new();
1076    }
1077    files_searched.fetch_add(1, Ordering::Relaxed);
1078
1079    let shared_path = Arc::new(file.path.clone());
1080    let mut matches = Vec::new();
1081    let mut line_starts = None;
1082    let mut seen_lines = HashSet::new();
1083    let mut matched_this_file = false;
1084
1085    match matcher {
1086        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1087            let finder = memchr::memmem::Finder::new(needle);
1088            let mut start = 0;
1089
1090            while let Some(position) = finder.find(&content[start..]) {
1091                if should_stop_search(truncated, total_matches, stop_after) {
1092                    break;
1093                }
1094
1095                let offset = start + position;
1096                start = offset + 1;
1097
1098                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1099                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1100                if !seen_lines.insert(line) {
1101                    continue;
1102                }
1103
1104                matched_this_file = true;
1105                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1106                if match_number > max_results {
1107                    truncated.store(true, Ordering::Relaxed);
1108                    break;
1109                }
1110
1111                let end = offset + needle.len();
1112                matches.push(SharedGrepMatch {
1113                    file: shared_path.clone(),
1114                    line,
1115                    column,
1116                    line_text,
1117                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1118                });
1119            }
1120        }
1121        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1122            let search_content = content.to_ascii_lowercase();
1123            let finder = memchr::memmem::Finder::new(needle);
1124            let mut start = 0;
1125
1126            while let Some(position) = finder.find(&search_content[start..]) {
1127                if should_stop_search(truncated, total_matches, stop_after) {
1128                    break;
1129                }
1130
1131                let offset = start + position;
1132                start = offset + 1;
1133
1134                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1135                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1136                if !seen_lines.insert(line) {
1137                    continue;
1138                }
1139
1140                matched_this_file = true;
1141                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1142                if match_number > max_results {
1143                    truncated.store(true, Ordering::Relaxed);
1144                    break;
1145                }
1146
1147                let end = offset + needle.len();
1148                matches.push(SharedGrepMatch {
1149                    file: shared_path.clone(),
1150                    line,
1151                    column,
1152                    line_text,
1153                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1154                });
1155            }
1156        }
1157        SearchMatcher::Regex(regex) => {
1158            for matched in regex.find_iter(&content) {
1159                if should_stop_search(truncated, total_matches, stop_after) {
1160                    break;
1161                }
1162
1163                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1164                let (line, column, line_text) =
1165                    line_details_bytes(&content, line_starts, matched.start());
1166                if !seen_lines.insert(line) {
1167                    continue;
1168                }
1169
1170                matched_this_file = true;
1171                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1172                if match_number > max_results {
1173                    truncated.store(true, Ordering::Relaxed);
1174                    break;
1175                }
1176
1177                matches.push(SharedGrepMatch {
1178                    file: shared_path.clone(),
1179                    line,
1180                    column,
1181                    line_text,
1182                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1183                });
1184            }
1185        }
1186    }
1187
1188    if matched_this_file {
1189        files_with_matches.fetch_add(1, Ordering::Relaxed);
1190    }
1191
1192    matches
1193}
1194
1195fn should_stop_search(
1196    truncated: &AtomicBool,
1197    total_matches: &AtomicUsize,
1198    stop_after: usize,
1199) -> bool {
1200    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1201}
1202
1203fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1204    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1205    let mut left_index = 0;
1206    let mut right_index = 0;
1207
1208    while left_index < left.len() && right_index < right.len() {
1209        match left[left_index].cmp(&right[right_index]) {
1210            std::cmp::Ordering::Less => left_index += 1,
1211            std::cmp::Ordering::Greater => right_index += 1,
1212            std::cmp::Ordering::Equal => {
1213                merged.push(left[left_index]);
1214                left_index += 1;
1215                right_index += 1;
1216            }
1217        }
1218    }
1219
1220    merged
1221}
1222
1223fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1224    let mut merged = Vec::with_capacity(left.len() + right.len());
1225    let mut left_index = 0;
1226    let mut right_index = 0;
1227
1228    while left_index < left.len() && right_index < right.len() {
1229        match left[left_index].cmp(&right[right_index]) {
1230            std::cmp::Ordering::Less => {
1231                merged.push(left[left_index]);
1232                left_index += 1;
1233            }
1234            std::cmp::Ordering::Greater => {
1235                merged.push(right[right_index]);
1236                right_index += 1;
1237            }
1238            std::cmp::Ordering::Equal => {
1239                merged.push(left[left_index]);
1240                left_index += 1;
1241                right_index += 1;
1242            }
1243        }
1244    }
1245
1246    merged.extend_from_slice(&left[left_index..]);
1247    merged.extend_from_slice(&right[right_index..]);
1248    merged
1249}
1250
1251pub fn decompose_regex(pattern: &str) -> RegexQuery {
1252    let hir = match regex_syntax::parse(pattern) {
1253        Ok(hir) => hir,
1254        Err(_) => return RegexQuery::default(),
1255    };
1256
1257    let build = build_query(&hir);
1258    build.into_query()
1259}
1260
1261pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1262    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1263}
1264
1265pub fn normalize_char(c: u8) -> u8 {
1266    c.to_ascii_lowercase()
1267}
1268
1269pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1270    if content.len() < 3 {
1271        return Vec::new();
1272    }
1273
1274    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1275    for start in 0..=content.len() - 3 {
1276        let trigram = pack_trigram(
1277            normalize_char(content[start]),
1278            normalize_char(content[start + 1]),
1279            normalize_char(content[start + 2]),
1280        );
1281        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1282        trigrams.push((trigram, next_char, start));
1283    }
1284    trigrams
1285}
1286
1287pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1288    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1289    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1290        return PathBuf::from(override_dir)
1291            .join("index")
1292            .join(project_cache_key(project_root));
1293    }
1294    // Use configured storage dir (from plugin, XDG-compliant)
1295    if let Some(dir) = storage_dir {
1296        return dir.join("index").join(project_cache_key(project_root));
1297    }
1298    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1299    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1300    // If neither is set, fall back to a temp directory rather than `"."`
1301    // because the search-index code reads/writes absolute paths.
1302    let home = std::env::var_os("HOME")
1303        .or_else(|| std::env::var_os("USERPROFILE"))
1304        .map(PathBuf::from)
1305        .unwrap_or_else(std::env::temp_dir);
1306    home.join(".cache")
1307        .join("aft")
1308        .join("index")
1309        .join(project_cache_key(project_root))
1310}
1311
1312pub(crate) fn build_path_filters(
1313    include: &[String],
1314    exclude: &[String],
1315) -> Result<PathFilters, String> {
1316    Ok(PathFilters {
1317        includes: build_globset(include)?,
1318        excludes: build_globset(exclude)?,
1319    })
1320}
1321
1322pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1323    walk_project_files_from(root, root, filters)
1324}
1325
1326pub(crate) fn walk_project_files_from(
1327    filter_root: &Path,
1328    search_root: &Path,
1329    filters: &PathFilters,
1330) -> Vec<PathBuf> {
1331    let mut builder = WalkBuilder::new(search_root);
1332    builder
1333        .hidden(false)
1334        .git_ignore(true)
1335        .git_global(true)
1336        .git_exclude(true)
1337        .filter_entry(|entry| {
1338            let name = entry.file_name().to_string_lossy();
1339            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1340                return !matches!(
1341                    name.as_ref(),
1342                    "node_modules"
1343                        | "target"
1344                        | "venv"
1345                        | ".venv"
1346                        | ".git"
1347                        | "__pycache__"
1348                        | ".tox"
1349                        | "dist"
1350                        | "build"
1351                );
1352            }
1353            true
1354        });
1355
1356    let mut files = Vec::new();
1357    for entry in builder.build().filter_map(|entry| entry.ok()) {
1358        if !entry
1359            .file_type()
1360            .map_or(false, |file_type| file_type.is_file())
1361        {
1362            continue;
1363        }
1364        let path = entry.into_path();
1365        if filters.matches(filter_root, &path) {
1366            files.push(path);
1367        }
1368    }
1369
1370    sort_paths_by_mtime_desc(&mut files);
1371    files
1372}
1373
1374pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1375    let bytes = fs::read(path).ok()?;
1376    if is_binary_bytes(&bytes) {
1377        return None;
1378    }
1379    String::from_utf8(bytes).ok()
1380}
1381
1382fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1383    fs::read(path).ok()
1384}
1385
1386pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1387    path.strip_prefix(root)
1388        .map(PathBuf::from)
1389        .unwrap_or_else(|_| path.to_path_buf())
1390}
1391
1392/// Sort paths newest-first by mtime, falling back to lexicographic order.
1393///
1394/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1395/// `sort_by()` closure. That made the comparator non-deterministic — a
1396/// `stat()` syscall for the same path can return different values across
1397/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1398/// concurrent file-watcher activity), and Rust's slice::sort panics at
1399/// runtime when it detects a non-total-order comparator. CI hit this on
1400/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1401///
1402/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1403/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1404pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1405    use std::collections::HashMap;
1406    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1407    for path in paths.iter() {
1408        mtimes
1409            .entry(path.clone())
1410            .or_insert_with(|| path_modified_time(path));
1411    }
1412    paths.sort_by(|left, right| {
1413        let left_mtime = mtimes.get(left).and_then(|v| *v);
1414        let right_mtime = mtimes.get(right).and_then(|v| *v);
1415        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1416    });
1417}
1418
1419/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1420/// the sort. Same fix, applied to grep matches that share files.
1421pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1422    use std::collections::HashMap;
1423    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1424    for m in matches.iter() {
1425        mtimes.entry(m.file.clone()).or_insert_with(|| {
1426            let resolved = resolve_match_path(project_root, &m.file);
1427            path_modified_time(&resolved)
1428        });
1429    }
1430    matches.sort_by(|left, right| {
1431        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1432        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1433        right_mtime
1434            .cmp(&left_mtime)
1435            .then_with(|| left.file.cmp(&right.file))
1436            .then_with(|| left.line.cmp(&right.line))
1437            .then_with(|| left.column.cmp(&right.column))
1438    });
1439}
1440
1441/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1442/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1443/// table from the search index), but it can still return different values if
1444/// the file is modified mid-sort. Snapshot once.
1445fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1446    matches: &mut [SharedGrepMatch],
1447    modified_for_path: F,
1448) where
1449    F: Fn(&Path) -> Option<SystemTime>,
1450{
1451    use std::collections::HashMap;
1452    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1453    for m in matches.iter() {
1454        let path = m.file.as_path().to_path_buf();
1455        mtimes
1456            .entry(path.clone())
1457            .or_insert_with(|| modified_for_path(&path));
1458    }
1459    matches.sort_by(|left, right| {
1460        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1461        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1462        right_mtime
1463            .cmp(&left_mtime)
1464            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1465            .then_with(|| left.line.cmp(&right.line))
1466            .then_with(|| left.column.cmp(&right.column))
1467    });
1468}
1469
1470pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1471    let resolved_project_root = canonicalize_or_normalize(project_root);
1472    let root = match path {
1473        Some(path) => {
1474            let path = PathBuf::from(path);
1475            if path.is_absolute() {
1476                canonicalize_or_normalize(&path)
1477            } else {
1478                normalize_path(&resolved_project_root.join(path))
1479            }
1480        }
1481        None => resolved_project_root.clone(),
1482    };
1483
1484    let use_index = is_within_search_root(&resolved_project_root, &root);
1485    SearchScope { root, use_index }
1486}
1487
1488pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1489    content_inspector::inspect(content).is_binary()
1490}
1491
1492pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1493    run_git(root, &["rev-parse", "HEAD"])
1494}
1495
1496pub fn project_cache_key(project_root: &Path) -> String {
1497    use sha2::{Digest, Sha256};
1498
1499    let mut hasher = Sha256::new();
1500
1501    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1502        // Git repo: root commit is the unique identity.
1503        // Same repo cloned anywhere produces the same key.
1504        hasher.update(root_commit.as_bytes());
1505    } else {
1506        // Non-git project: use the canonical filesystem path as identity.
1507        let canonical_root = canonicalize_or_normalize(project_root);
1508        hasher.update(canonical_root.to_string_lossy().as_bytes());
1509    }
1510
1511    let digest = format!("{:x}", hasher.finalize());
1512    digest[..16].to_string()
1513}
1514
1515impl PathFilters {
1516    fn matches(&self, root: &Path, path: &Path) -> bool {
1517        let relative = to_glob_path(&relative_to_root(root, path));
1518        if self
1519            .includes
1520            .as_ref()
1521            .is_some_and(|includes| !includes.is_match(&relative))
1522        {
1523            return false;
1524        }
1525        if self
1526            .excludes
1527            .as_ref()
1528            .is_some_and(|excludes| excludes.is_match(&relative))
1529        {
1530            return false;
1531        }
1532        true
1533    }
1534}
1535
1536fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1537    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1538}
1539
1540fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1541    if path.is_absolute() {
1542        path.to_path_buf()
1543    } else {
1544        project_root.join(path)
1545    }
1546}
1547
1548fn path_modified_time(path: &Path) -> Option<SystemTime> {
1549    fs::metadata(path)
1550        .and_then(|metadata| metadata.modified())
1551        .ok()
1552}
1553
1554fn normalize_path(path: &Path) -> PathBuf {
1555    let mut result = PathBuf::new();
1556    for component in path.components() {
1557        match component {
1558            Component::ParentDir => {
1559                if !result.pop() {
1560                    result.push(component);
1561                }
1562            }
1563            Component::CurDir => {}
1564            _ => result.push(component),
1565        }
1566    }
1567    result
1568}
1569
1570/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1571/// since the index was last written. Also detect new files and deleted files.
1572fn verify_file_mtimes(index: &mut SearchIndex) {
1573    // Collect stale files (mtime mismatch or deleted)
1574    let mut stale_paths = Vec::new();
1575    for entry in &index.files {
1576        if entry.path.as_os_str().is_empty() {
1577            continue; // tombstoned entry
1578        }
1579        match fs::metadata(&entry.path) {
1580            Ok(meta) => {
1581                let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1582                if current_mtime != entry.modified || meta.len() != entry.size {
1583                    stale_paths.push(entry.path.clone());
1584                }
1585            }
1586            Err(_) => {
1587                // File deleted
1588                stale_paths.push(entry.path.clone());
1589            }
1590        }
1591    }
1592
1593    // Re-index stale files
1594    for path in &stale_paths {
1595        index.update_file(path);
1596    }
1597
1598    // Detect new files not in the index
1599    let filters = PathFilters::default();
1600    for path in walk_project_files(&index.project_root, &filters) {
1601        if !index.path_to_id.contains_key(&path) {
1602            index.update_file(&path);
1603        }
1604    }
1605
1606    if !stale_paths.is_empty() {
1607        log::info!(
1608            "search index: refreshed {} stale file(s) from disk cache",
1609            stale_paths.len()
1610        );
1611    }
1612}
1613
1614fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1615    path.starts_with(search_root)
1616}
1617
1618impl QueryBuild {
1619    fn into_query(self) -> RegexQuery {
1620        let mut query = RegexQuery::default();
1621
1622        for run in self.and_runs {
1623            add_run_to_and_query(&mut query, &run);
1624        }
1625
1626        for group in self.or_groups {
1627            let mut trigrams = BTreeSet::new();
1628            let mut filters = HashMap::new();
1629            for run in group {
1630                for (trigram, filter) in trigram_filters(&run) {
1631                    trigrams.insert(trigram);
1632                    merge_filter(filters.entry(trigram).or_default(), filter);
1633                }
1634            }
1635            if !trigrams.is_empty() {
1636                query.or_groups.push(trigrams.into_iter().collect());
1637                query.or_filters.push(filters);
1638            }
1639        }
1640
1641        query
1642    }
1643}
1644
1645fn build_query(hir: &Hir) -> QueryBuild {
1646    match hir.kind() {
1647        HirKind::Literal(literal) => {
1648            if literal.0.len() >= 3 {
1649                QueryBuild {
1650                    and_runs: vec![literal.0.to_vec()],
1651                    or_groups: Vec::new(),
1652                }
1653            } else {
1654                QueryBuild::default()
1655            }
1656        }
1657        HirKind::Capture(capture) => build_query(&capture.sub),
1658        HirKind::Concat(parts) => {
1659            let mut build = QueryBuild::default();
1660            for part in parts {
1661                let part_build = build_query(part);
1662                build.and_runs.extend(part_build.and_runs);
1663                build.or_groups.extend(part_build.or_groups);
1664            }
1665            build
1666        }
1667        HirKind::Alternation(parts) => {
1668            let mut group = Vec::new();
1669            for part in parts {
1670                let Some(mut choices) = guaranteed_run_choices(part) else {
1671                    return QueryBuild::default();
1672                };
1673                group.append(&mut choices);
1674            }
1675            if group.is_empty() {
1676                QueryBuild::default()
1677            } else {
1678                QueryBuild {
1679                    and_runs: Vec::new(),
1680                    or_groups: vec![group],
1681                }
1682            }
1683        }
1684        HirKind::Repetition(repetition) => {
1685            if repetition.min == 0 {
1686                QueryBuild::default()
1687            } else {
1688                build_query(&repetition.sub)
1689            }
1690        }
1691        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1692    }
1693}
1694
1695fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1696    match hir.kind() {
1697        HirKind::Literal(literal) => {
1698            if literal.0.len() >= 3 {
1699                Some(vec![literal.0.to_vec()])
1700            } else {
1701                None
1702            }
1703        }
1704        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1705        HirKind::Concat(parts) => {
1706            let mut runs = Vec::new();
1707            for part in parts {
1708                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1709                    runs.append(&mut part_runs);
1710                }
1711            }
1712            if runs.is_empty() {
1713                None
1714            } else {
1715                Some(runs)
1716            }
1717        }
1718        HirKind::Alternation(parts) => {
1719            let mut runs = Vec::new();
1720            for part in parts {
1721                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1722                    return None;
1723                };
1724                runs.append(&mut part_runs);
1725            }
1726            if runs.is_empty() {
1727                None
1728            } else {
1729                Some(runs)
1730            }
1731        }
1732        HirKind::Repetition(repetition) => {
1733            if repetition.min == 0 {
1734                None
1735            } else {
1736                guaranteed_run_choices(&repetition.sub)
1737            }
1738        }
1739        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1740    }
1741}
1742
1743fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1744    for (trigram, filter) in trigram_filters(run) {
1745        if !query.and_trigrams.contains(&trigram) {
1746            query.and_trigrams.push(trigram);
1747        }
1748        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1749    }
1750}
1751
1752fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1753    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1754    for (trigram, next_char, position) in extract_trigrams(run) {
1755        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1756        if next_char != EOF_SENTINEL {
1757            entry.next_mask |= mask_for_next_char(next_char);
1758        }
1759        entry.loc_mask |= mask_for_position(position);
1760    }
1761    filters.into_iter().collect()
1762}
1763
1764fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1765    target.next_mask |= filter.next_mask;
1766    target.loc_mask |= filter.loc_mask;
1767}
1768
1769fn mask_for_next_char(next_char: u8) -> u8 {
1770    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1771    1u8 << bit
1772}
1773
1774fn mask_for_position(position: usize) -> u8 {
1775    1u8 << (position % 8)
1776}
1777
1778fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1779    if patterns.is_empty() {
1780        return Ok(None);
1781    }
1782
1783    let mut builder = GlobSetBuilder::new();
1784    for pattern in patterns {
1785        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1786        builder.add(glob);
1787    }
1788    builder.build().map(Some).map_err(|error| error.to_string())
1789}
1790
1791fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1792    let mut buffer = [0u8; 4];
1793    reader.read_exact(&mut buffer)?;
1794    Ok(u32::from_le_bytes(buffer))
1795}
1796
1797fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1798    let mut buffer = [0u8; 8];
1799    reader.read_exact(&mut buffer)?;
1800    Ok(u64::from_le_bytes(buffer))
1801}
1802
1803fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1804    writer.write_all(&value.to_le_bytes())
1805}
1806
1807fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1808    writer.write_all(&value.to_le_bytes())
1809}
1810
1811fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1812    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1813        return Err(std::io::Error::other("search index checksum missing"));
1814    };
1815    let expected = u32::from_le_bytes(*stored);
1816    let actual = crc32fast::hash(body);
1817    if actual != expected {
1818        return Err(std::io::Error::other("search index checksum mismatch"));
1819    }
1820    Ok(())
1821}
1822
1823fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
1824    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
1825    total_len.checked_sub(pos)
1826}
1827
1828fn run_git(root: &Path, args: &[&str]) -> Option<String> {
1829    let output = Command::new("git")
1830        .arg("-C")
1831        .arg(root)
1832        .args(args)
1833        .output()
1834        .ok()?;
1835    if !output.status.success() {
1836        return None;
1837    }
1838    let value = String::from_utf8(output.stdout).ok()?;
1839    let value = value.trim().to_string();
1840    if value.is_empty() {
1841        None
1842    } else {
1843        Some(value)
1844    }
1845}
1846
1847fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1848    let diff_range = format!("{}..{}", from, to);
1849    let output = match Command::new("git")
1850        .arg("-C")
1851        .arg(root)
1852        .args(["diff", "--name-only", &diff_range])
1853        .output()
1854    {
1855        Ok(output) => output,
1856        Err(_) => return false,
1857    };
1858
1859    if !output.status.success() {
1860        return false;
1861    }
1862
1863    let Ok(paths) = String::from_utf8(output.stdout) else {
1864        return false;
1865    };
1866
1867    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1868        let path = root.join(relative_path);
1869        if path.exists() {
1870            index.update_file(&path);
1871        } else {
1872            index.remove_file(&path);
1873        }
1874    }
1875
1876    true
1877}
1878
1879fn is_binary_path(path: &Path, size: u64) -> bool {
1880    if size == 0 {
1881        return false;
1882    }
1883
1884    let mut file = match File::open(path) {
1885        Ok(file) => file,
1886        Err(_) => return true,
1887    };
1888
1889    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1890    match file.read(&mut preview) {
1891        Ok(read) => is_binary_bytes(&preview[..read]),
1892        Err(_) => true,
1893    }
1894}
1895
1896fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
1897    let mut starts = vec![0usize];
1898    for (index, byte) in content.iter().copied().enumerate() {
1899        if byte == b'\n' {
1900            starts.push(index + 1);
1901        }
1902    }
1903    starts
1904}
1905
1906fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
1907    let line_index = match line_starts.binary_search(&offset) {
1908        Ok(index) => index,
1909        Err(index) => index.saturating_sub(1),
1910    };
1911    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
1912    let line_end = content[line_start..]
1913        .iter()
1914        .position(|byte| *byte == b'\n')
1915        .map(|length| line_start + length)
1916        .unwrap_or(content.len());
1917    let mut line_slice = &content[line_start..line_end];
1918    if line_slice.ends_with(b"\r") {
1919        line_slice = &line_slice[..line_slice.len() - 1];
1920    }
1921    let line_text = String::from_utf8_lossy(line_slice).into_owned();
1922    let column = String::from_utf8_lossy(&content[line_start..offset])
1923        .chars()
1924        .count() as u32
1925        + 1;
1926    (line_index as u32 + 1, column, line_text)
1927}
1928
1929fn to_glob_path(path: &Path) -> String {
1930    path.to_string_lossy().replace('\\', "/")
1931}
1932
1933#[cfg(test)]
1934mod tests {
1935    use std::process::Command;
1936
1937    use super::*;
1938
1939    #[test]
1940    fn extract_trigrams_tracks_next_char_and_position() {
1941        let trigrams = extract_trigrams(b"Rust");
1942        assert_eq!(trigrams.len(), 2);
1943        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
1944        assert_eq!(
1945            trigrams[1],
1946            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
1947        );
1948    }
1949
1950    #[test]
1951    fn decompose_regex_extracts_literals_and_alternations() {
1952        let query = decompose_regex("abc(def|ghi)xyz");
1953        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
1954        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
1955        assert_eq!(query.or_groups.len(), 1);
1956        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
1957        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
1958    }
1959
1960    #[test]
1961    fn candidates_intersect_posting_lists() {
1962        let mut index = SearchIndex::new();
1963        let dir = tempfile::tempdir().expect("create temp dir");
1964        let alpha = dir.path().join("alpha.txt");
1965        let beta = dir.path().join("beta.txt");
1966        fs::write(&alpha, "abcdef").expect("write alpha");
1967        fs::write(&beta, "abcxyz").expect("write beta");
1968        index.project_root = dir.path().to_path_buf();
1969        index.index_file(&alpha, b"abcdef");
1970        index.index_file(&beta, b"abcxyz");
1971
1972        let query = RegexQuery {
1973            and_trigrams: vec![
1974                pack_trigram(b'a', b'b', b'c'),
1975                pack_trigram(b'd', b'e', b'f'),
1976            ],
1977            ..RegexQuery::default()
1978        };
1979
1980        let candidates = index.candidates(&query);
1981        assert_eq!(candidates.len(), 1);
1982        assert_eq!(index.files[candidates[0] as usize].path, alpha);
1983    }
1984
1985    #[test]
1986    fn candidates_apply_bloom_filters() {
1987        let mut index = SearchIndex::new();
1988        let dir = tempfile::tempdir().expect("create temp dir");
1989        let file = dir.path().join("sample.txt");
1990        fs::write(&file, "abcd efgh").expect("write sample");
1991        index.project_root = dir.path().to_path_buf();
1992        index.index_file(&file, b"abcd efgh");
1993
1994        let trigram = pack_trigram(b'a', b'b', b'c');
1995        let matching_filter = PostingFilter {
1996            next_mask: mask_for_next_char(b'd'),
1997            loc_mask: mask_for_position(0),
1998        };
1999        let non_matching_filter = PostingFilter {
2000            next_mask: mask_for_next_char(b'z'),
2001            loc_mask: mask_for_position(0),
2002        };
2003
2004        assert_eq!(
2005            index
2006                .postings_for_trigram(trigram, Some(matching_filter))
2007                .len(),
2008            1
2009        );
2010        assert!(index
2011            .postings_for_trigram(trigram, Some(non_matching_filter))
2012            .is_empty());
2013    }
2014
2015    #[test]
2016    fn disk_round_trip_preserves_postings_and_files() {
2017        let dir = tempfile::tempdir().expect("create temp dir");
2018        let project = dir.path().join("project");
2019        fs::create_dir_all(&project).expect("create project dir");
2020        let file = project.join("src.txt");
2021        fs::write(&file, "abcdef").expect("write source");
2022
2023        let mut index = SearchIndex::build(&project);
2024        index.git_head = Some("deadbeef".to_string());
2025        let cache_dir = dir.path().join("cache");
2026        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2027
2028        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
2029        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2030        assert_eq!(loaded.files.len(), 1);
2031        assert_eq!(
2032            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2033            PathBuf::from("src.txt")
2034        );
2035        assert_eq!(loaded.postings.len(), index.postings.len());
2036        assert!(loaded
2037            .postings
2038            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2039    }
2040
2041    #[test]
2042    fn read_from_disk_rejects_corrupt_postings_checksum() {
2043        let dir = tempfile::tempdir().expect("create temp dir");
2044        let project = dir.path().join("project");
2045        fs::create_dir_all(&project).expect("create project dir");
2046        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2047
2048        let index = SearchIndex::build(&project);
2049        let cache_dir = dir.path().join("cache");
2050        index.write_to_disk(&cache_dir, None);
2051
2052        let cache_path = cache_dir.join("cache.bin");
2053        let mut bytes = fs::read(&cache_path).expect("read cache");
2054        let middle = bytes.len() / 2;
2055        bytes[middle] ^= 0xff;
2056        fs::write(&cache_path, bytes).expect("write corrupted cache");
2057
2058        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2059    }
2060
2061    #[test]
2062    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2063        let dir = tempfile::tempdir().expect("create temp dir");
2064        let project = dir.path().join("project");
2065        fs::create_dir_all(&project).expect("create project dir");
2066        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2067
2068        let index = SearchIndex::build(&project);
2069        let cache_dir = dir.path().join("cache");
2070        index.write_to_disk(&cache_dir, None);
2071
2072        assert!(cache_dir.join("cache.bin").is_file());
2073        assert!(fs::read_dir(&cache_dir)
2074            .expect("read cache dir")
2075            .all(|entry| !entry
2076                .expect("cache entry")
2077                .file_name()
2078                .to_string_lossy()
2079                .contains(".tmp.")));
2080    }
2081
2082    #[test]
2083    fn concurrent_search_index_writes_do_not_corrupt() {
2084        let dir = tempfile::tempdir().expect("create temp dir");
2085        let project = dir.path().join("project");
2086        fs::create_dir_all(&project).expect("create project dir");
2087        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2088        let cache_dir = dir.path().join("cache");
2089
2090        let a_project = project.clone();
2091        let a_cache = cache_dir.clone();
2092        let a = std::thread::spawn(move || {
2093            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2094            let index = SearchIndex::build(&a_project);
2095            index.write_to_disk(&a_cache, None);
2096        });
2097        let b_project = project.clone();
2098        let b_cache = cache_dir.clone();
2099        let b = std::thread::spawn(move || {
2100            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2101            let index = SearchIndex::build(&b_project);
2102            index.write_to_disk(&b_cache, None);
2103        });
2104        a.join().expect("writer a");
2105        b.join().expect("writer b");
2106
2107        assert!(SearchIndex::read_from_disk(&cache_dir).is_some());
2108    }
2109
2110    #[test]
2111    fn search_index_atomic_rename_survives_partial_write() {
2112        let dir = tempfile::tempdir().expect("create temp dir");
2113        let cache_dir = dir.path().join("cache");
2114        fs::create_dir_all(&cache_dir).expect("create cache dir");
2115        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2116
2117        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2118    }
2119
2120    #[test]
2121    fn project_cache_key_includes_checkout_path() {
2122        let dir = tempfile::tempdir().expect("create temp dir");
2123        let source = dir.path().join("source");
2124        fs::create_dir_all(&source).expect("create source repo dir");
2125        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2126
2127        assert!(Command::new("git")
2128            .current_dir(&source)
2129            .args(["init"])
2130            .status()
2131            .expect("init git repo")
2132            .success());
2133        assert!(Command::new("git")
2134            .current_dir(&source)
2135            .args(["add", "."])
2136            .status()
2137            .expect("git add")
2138            .success());
2139        assert!(Command::new("git")
2140            .current_dir(&source)
2141            .args([
2142                "-c",
2143                "user.name=AFT Tests",
2144                "-c",
2145                "user.email=aft-tests@example.com",
2146                "commit",
2147                "-m",
2148                "initial",
2149            ])
2150            .status()
2151            .expect("git commit")
2152            .success());
2153
2154        let clone = dir.path().join("clone");
2155        assert!(Command::new("git")
2156            .args(["clone", "--quiet"])
2157            .arg(&source)
2158            .arg(&clone)
2159            .status()
2160            .expect("git clone")
2161            .success());
2162
2163        let source_key = project_cache_key(&source);
2164        let clone_key = project_cache_key(&clone);
2165
2166        assert_eq!(source_key.len(), 16);
2167        assert_eq!(clone_key.len(), 16);
2168        // Same repo (same root commit) → same cache key regardless of clone path
2169        assert_eq!(source_key, clone_key);
2170    }
2171
2172    #[test]
2173    fn git_head_unchanged_picks_up_local_edits() {
2174        let dir = tempfile::tempdir().expect("create temp dir");
2175        let project = dir.path().join("repo");
2176        fs::create_dir_all(&project).expect("create repo dir");
2177        let file = project.join("tracked.txt");
2178        fs::write(&file, "oldtoken\n").expect("write file");
2179        assert!(Command::new("git")
2180            .current_dir(&project)
2181            .arg("init")
2182            .status()
2183            .unwrap()
2184            .success());
2185        assert!(Command::new("git")
2186            .current_dir(&project)
2187            .args(["add", "."])
2188            .status()
2189            .unwrap()
2190            .success());
2191        assert!(Command::new("git")
2192            .current_dir(&project)
2193            .args([
2194                "-c",
2195                "user.name=AFT Tests",
2196                "-c",
2197                "user.email=aft-tests@example.com",
2198                "commit",
2199                "-m",
2200                "initial"
2201            ])
2202            .status()
2203            .unwrap()
2204            .success());
2205        let head = current_git_head(&project);
2206        let mut baseline = SearchIndex::build(&project);
2207        baseline.git_head = head.clone();
2208        fs::write(&file, "newtoken\n").expect("edit tracked file");
2209
2210        let refreshed =
2211            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2212        let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2213
2214        assert_eq!(result.total_matches, 1);
2215    }
2216
2217    #[test]
2218    fn non_git_project_reuses_cache_when_files_unchanged() {
2219        let dir = tempfile::tempdir().expect("create temp dir");
2220        let project = dir.path().join("project");
2221        fs::create_dir_all(&project).expect("create project dir");
2222        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2223        let baseline = SearchIndex::build(&project);
2224        let baseline_file_count = baseline.file_count();
2225
2226        let refreshed =
2227            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2228
2229        assert_eq!(refreshed.file_count(), baseline_file_count);
2230        assert_eq!(
2231            refreshed
2232                .search_grep("unchangedtoken", true, &[], &[], &project, 10)
2233                .total_matches,
2234            1
2235        );
2236    }
2237
2238    #[test]
2239    fn resolve_search_scope_disables_index_for_external_path() {
2240        let dir = tempfile::tempdir().expect("create temp dir");
2241        let project = dir.path().join("project");
2242        let outside = dir.path().join("outside");
2243        fs::create_dir_all(&project).expect("create project dir");
2244        fs::create_dir_all(&outside).expect("create outside dir");
2245
2246        let scope = resolve_search_scope(&project, outside.to_str());
2247
2248        assert_eq!(
2249            scope.root,
2250            fs::canonicalize(&outside).expect("canonicalize outside")
2251        );
2252        assert!(!scope.use_index);
2253    }
2254
2255    #[test]
2256    fn grep_filters_matches_to_search_root() {
2257        let dir = tempfile::tempdir().expect("create temp dir");
2258        let project = dir.path().join("project");
2259        let src = project.join("src");
2260        let docs = project.join("docs");
2261        fs::create_dir_all(&src).expect("create src dir");
2262        fs::create_dir_all(&docs).expect("create docs dir");
2263        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2264        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2265
2266        let index = SearchIndex::build(&project);
2267        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2268
2269        assert_eq!(result.files_searched, 1);
2270        assert_eq!(result.files_with_matches, 1);
2271        assert_eq!(result.matches.len(), 1);
2272        // Index stores canonicalized paths; on macOS /var → /private/var
2273        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2274        assert_eq!(result.matches[0].file, expected);
2275    }
2276
2277    #[test]
2278    fn grep_deduplicates_multiple_matches_on_same_line() {
2279        let dir = tempfile::tempdir().expect("create temp dir");
2280        let project = dir.path().join("project");
2281        let src = project.join("src");
2282        fs::create_dir_all(&src).expect("create src dir");
2283        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2284
2285        let index = SearchIndex::build(&project);
2286        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2287
2288        assert_eq!(result.total_matches, 1);
2289        assert_eq!(result.matches.len(), 1);
2290    }
2291
2292    #[test]
2293    fn grep_reports_total_matches_before_truncation() {
2294        let dir = tempfile::tempdir().expect("create temp dir");
2295        let project = dir.path().join("project");
2296        let src = project.join("src");
2297        fs::create_dir_all(&src).expect("create src dir");
2298        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2299
2300        let index = SearchIndex::build(&project);
2301        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2302
2303        assert_eq!(result.total_matches, 2);
2304        assert_eq!(result.matches.len(), 1);
2305        assert!(result.truncated);
2306    }
2307
2308    #[test]
2309    fn glob_filters_results_to_search_root() {
2310        let dir = tempfile::tempdir().expect("create temp dir");
2311        let project = dir.path().join("project");
2312        let src = project.join("src");
2313        let scripts = project.join("scripts");
2314        fs::create_dir_all(&src).expect("create src dir");
2315        fs::create_dir_all(&scripts).expect("create scripts dir");
2316        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2317        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2318
2319        let index = SearchIndex::build(&project);
2320        let files = index.glob("**/*.rs", &src);
2321
2322        assert_eq!(
2323            files,
2324            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2325        );
2326    }
2327
2328    #[test]
2329    fn glob_includes_hidden_and_binary_files() {
2330        let dir = tempfile::tempdir().expect("create temp dir");
2331        let project = dir.path().join("project");
2332        let hidden_dir = project.join(".hidden");
2333        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2334        let hidden_file = hidden_dir.join("data.bin");
2335        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2336
2337        let index = SearchIndex::build(&project);
2338        let files = index.glob("**/*.bin", &project);
2339
2340        assert_eq!(
2341            files,
2342            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2343        );
2344    }
2345
2346    #[test]
2347    fn read_from_disk_rejects_invalid_nanos() {
2348        let dir = tempfile::tempdir().expect("create temp dir");
2349        let cache_dir = dir.path().join("cache");
2350        fs::create_dir_all(&cache_dir).expect("create cache dir");
2351
2352        let mut postings = Vec::new();
2353        postings.extend_from_slice(INDEX_MAGIC);
2354        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2355        postings.extend_from_slice(&0u32.to_le_bytes());
2356        postings.extend_from_slice(&1u32.to_le_bytes());
2357        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2358        postings.extend_from_slice(&1u32.to_le_bytes());
2359        postings.extend_from_slice(b"/");
2360        postings.push(0u8);
2361        postings.extend_from_slice(&1u32.to_le_bytes());
2362        postings.extend_from_slice(&0u64.to_le_bytes());
2363        postings.extend_from_slice(&0u64.to_le_bytes());
2364        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2365        postings.extend_from_slice(b"a");
2366        postings.extend_from_slice(&0u64.to_le_bytes());
2367
2368        let mut lookup = Vec::new();
2369        lookup.extend_from_slice(LOOKUP_MAGIC);
2370        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2371        lookup.extend_from_slice(&0u32.to_le_bytes());
2372
2373        let postings_checksum = crc32fast::hash(&postings);
2374        postings.extend_from_slice(&postings_checksum.to_le_bytes());
2375        let lookup_checksum = crc32fast::hash(&lookup);
2376        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2377        let mut cache = Vec::new();
2378        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2379        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2380        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2381        cache.extend_from_slice(&postings);
2382        cache.extend_from_slice(&lookup);
2383        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2384
2385        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2386    }
2387
2388    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2389    /// changed between cmp() calls.
2390    ///
2391    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2392    /// which does a `stat()` syscall. If the file was deleted, modified, or
2393    /// touched mid-sort, the comparator returned different values for the
2394    /// same input pair on different invocations. Rust's slice::sort detects
2395    /// this and panics with "user-provided comparison function does not
2396    /// correctly implement a total order".
2397    ///
2398    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2399    /// bridge invalidated files in parallel with grep's sort path. This
2400    /// test simulates the worst case: most paths don't exist (Err from
2401    /// fs::metadata) and sort still completes successfully.
2402    #[test]
2403    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2404        // Mix of existing and non-existing paths in deliberately
2405        // non-monotonic order — pre-fix, the sort would call stat() at
2406        // least N log N times and any flakiness would trigger the panic.
2407        let dir = tempfile::tempdir().expect("create tempdir");
2408        let mut paths: Vec<PathBuf> = Vec::new();
2409        for i in 0..30 {
2410            // Half exist, half don't.
2411            let path = if i % 2 == 0 {
2412                let p = dir.path().join(format!("real-{i}.rs"));
2413                fs::write(&p, format!("// {i}\n")).expect("write");
2414                p
2415            } else {
2416                dir.path().join(format!("missing-{i}.rs"))
2417            };
2418            paths.push(path);
2419        }
2420
2421        // Run the sort many times to maximise the chance of catching any
2422        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2423        for _ in 0..50 {
2424            let mut copy = paths.clone();
2425            sort_paths_by_mtime_desc(&mut copy);
2426            assert_eq!(copy.len(), paths.len());
2427        }
2428    }
2429
2430    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2431    /// the same conditions as sort_paths_by_mtime_desc. See the
2432    /// sort_paths_... test above for the full rationale.
2433    #[test]
2434    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2435        let dir = tempfile::tempdir().expect("create tempdir");
2436        let mut matches: Vec<GrepMatch> = Vec::new();
2437        for i in 0..30 {
2438            let file = if i % 2 == 0 {
2439                let p = dir.path().join(format!("real-{i}.rs"));
2440                fs::write(&p, format!("// {i}\n")).expect("write");
2441                p
2442            } else {
2443                dir.path().join(format!("missing-{i}.rs"))
2444            };
2445            matches.push(GrepMatch {
2446                file,
2447                line: u32::try_from(i).unwrap_or(0),
2448                column: 0,
2449                line_text: format!("match {i}"),
2450                match_text: format!("match {i}"),
2451            });
2452        }
2453
2454        for _ in 0..50 {
2455            let mut copy = matches.clone();
2456            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2457            assert_eq!(copy.len(), matches.len());
2458        }
2459    }
2460}
aft/search_index.rs

aft/
search_index.rs