Skip to main content

aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
19const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
20const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
21const INDEX_VERSION: u32 = 1;
22const PREVIEW_BYTES: usize = 8 * 1024;
23const EOF_SENTINEL: u8 = 0;
24const MAX_ENTRIES: usize = 10_000_000;
25const MIN_FILE_ENTRY_BYTES: usize = 25;
26const LOOKUP_ENTRY_BYTES: usize = 16;
27const POSTING_BYTES: usize = 6;
28
29#[derive(Clone, Debug)]
30pub struct SearchIndex {
31    pub postings: HashMap<u32, Vec<Posting>>,
32    pub files: Vec<FileEntry>,
33    pub path_to_id: HashMap<PathBuf, u32>,
34    pub ready: bool,
35    project_root: PathBuf,
36    git_head: Option<String>,
37    max_file_size: u64,
38    file_trigrams: HashMap<u32, Vec<u32>>,
39    unindexed_files: HashSet<u32>,
40}
41
42impl SearchIndex {
43    /// Number of indexed files.
44    pub fn file_count(&self) -> usize {
45        self.files.len()
46    }
47
48    /// Number of unique trigrams in the index.
49    pub fn trigram_count(&self) -> usize {
50        self.postings.len()
51    }
52}
53
54#[derive(Clone, Debug, PartialEq, Eq)]
55pub struct Posting {
56    pub file_id: u32,
57    pub next_mask: u8,
58    pub loc_mask: u8,
59}
60
61#[derive(Clone, Debug)]
62pub struct FileEntry {
63    pub path: PathBuf,
64    pub size: u64,
65    pub modified: SystemTime,
66}
67
68#[derive(Clone, Debug, PartialEq, Eq)]
69pub struct GrepMatch {
70    pub file: PathBuf,
71    pub line: u32,
72    pub column: u32,
73    pub line_text: String,
74    pub match_text: String,
75}
76
77#[derive(Clone, Debug)]
78pub struct GrepResult {
79    pub matches: Vec<GrepMatch>,
80    pub total_matches: usize,
81    pub files_searched: usize,
82    pub files_with_matches: usize,
83    pub index_status: IndexStatus,
84    pub truncated: bool,
85}
86
87#[derive(Clone, Copy, Debug, PartialEq, Eq)]
88pub enum IndexStatus {
89    Ready,
90    Building,
91    Fallback,
92}
93
94impl IndexStatus {
95    pub fn as_str(&self) -> &'static str {
96        match self {
97            IndexStatus::Ready => "Ready",
98            IndexStatus::Building => "Building",
99            IndexStatus::Fallback => "Fallback",
100        }
101    }
102}
103
104#[derive(Clone, Debug, Default)]
105pub struct RegexQuery {
106    pub and_trigrams: Vec<u32>,
107    pub or_groups: Vec<Vec<u32>>,
108    pub(crate) and_filters: HashMap<u32, PostingFilter>,
109    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
110}
111
112#[derive(Clone, Copy, Debug, Default)]
113pub(crate) struct PostingFilter {
114    next_mask: u8,
115    loc_mask: u8,
116}
117
118#[derive(Clone, Debug, Default)]
119struct QueryBuild {
120    and_runs: Vec<Vec<u8>>,
121    or_groups: Vec<Vec<Vec<u8>>>,
122}
123
124#[derive(Clone, Debug, Default)]
125pub(crate) struct PathFilters {
126    includes: Option<GlobSet>,
127    excludes: Option<GlobSet>,
128}
129
130#[derive(Clone, Debug)]
131pub(crate) struct SearchScope {
132    pub root: PathBuf,
133    pub use_index: bool,
134}
135
136#[derive(Clone, Debug)]
137struct SharedGrepMatch {
138    file: Arc<PathBuf>,
139    line: u32,
140    column: u32,
141    line_text: String,
142    match_text: String,
143}
144
145#[derive(Clone, Debug)]
146enum SearchMatcher {
147    Literal(LiteralSearch),
148    Regex(Regex),
149}
150
151#[derive(Clone, Debug)]
152enum LiteralSearch {
153    CaseSensitive(Vec<u8>),
154    AsciiCaseInsensitive(Vec<u8>),
155}
156
157impl SearchIndex {
158    pub fn new() -> Self {
159        SearchIndex {
160            postings: HashMap::new(),
161            files: Vec::new(),
162            path_to_id: HashMap::new(),
163            ready: false,
164            project_root: PathBuf::new(),
165            git_head: None,
166            max_file_size: DEFAULT_MAX_FILE_SIZE,
167            file_trigrams: HashMap::new(),
168            unindexed_files: HashSet::new(),
169        }
170    }
171
172    pub fn build(root: &Path) -> Self {
173        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
174    }
175
176    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
177        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
178        let mut index = SearchIndex {
179            project_root: project_root.clone(),
180            max_file_size,
181            ..SearchIndex::new()
182        };
183
184        let filters = PathFilters::default();
185        for path in walk_project_files(&project_root, &filters) {
186            index.update_file(&path);
187        }
188
189        index.git_head = current_git_head(&project_root);
190        index.ready = true;
191        index
192    }
193
194    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
195        self.remove_file(path);
196
197        let file_id = match self.allocate_file_id(path, content.len() as u64) {
198            Some(file_id) => file_id,
199            None => return,
200        };
201
202        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
203        for (trigram, next_char, position) in extract_trigrams(content) {
204            let entry = trigram_map.entry(trigram).or_default();
205            entry.next_mask |= mask_for_next_char(next_char);
206            entry.loc_mask |= mask_for_position(position);
207        }
208
209        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
210        for (trigram, filter) in trigram_map {
211            let postings = self.postings.entry(trigram).or_default();
212            postings.push(Posting {
213                file_id,
214                next_mask: filter.next_mask,
215                loc_mask: filter.loc_mask,
216            });
217            // Posting lists are kept sorted by file_id for binary search during
218            // intersection. Since file_ids are allocated incrementally, the new
219            // entry is usually already in order. Only sort when needed.
220            if postings.len() > 1
221                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
222            {
223                postings.sort_unstable_by_key(|p| p.file_id);
224            }
225            file_trigrams.push(trigram);
226        }
227
228        self.file_trigrams.insert(file_id, file_trigrams);
229        self.unindexed_files.remove(&file_id);
230    }
231
232    pub fn remove_file(&mut self, path: &Path) {
233        let Some(file_id) = self.path_to_id.remove(path) else {
234            return;
235        };
236
237        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
238            for trigram in trigrams {
239                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
240                    postings.retain(|posting| posting.file_id != file_id);
241                    postings.is_empty()
242                } else {
243                    false
244                };
245
246                if should_remove {
247                    self.postings.remove(&trigram);
248                }
249            }
250        }
251
252        self.unindexed_files.remove(&file_id);
253        if let Some(file) = self.files.get_mut(file_id as usize) {
254            file.path = PathBuf::new();
255            file.size = 0;
256            file.modified = UNIX_EPOCH;
257        }
258    }
259
260    pub fn update_file(&mut self, path: &Path) {
261        self.remove_file(path);
262
263        let metadata = match fs::metadata(path) {
264            Ok(metadata) if metadata.is_file() => metadata,
265            _ => return,
266        };
267
268        if is_binary_path(path, metadata.len()) {
269            return;
270        }
271
272        if metadata.len() > self.max_file_size {
273            self.track_unindexed_file(path, &metadata);
274            return;
275        }
276
277        let content = match fs::read(path) {
278            Ok(content) => content,
279            Err(_) => return,
280        };
281
282        if is_binary_bytes(&content) {
283            return;
284        }
285
286        self.index_file(path, &content);
287    }
288
289    pub fn grep(
290        &self,
291        pattern: &str,
292        case_sensitive: bool,
293        include: &[String],
294        exclude: &[String],
295        search_root: &Path,
296        max_results: usize,
297    ) -> GrepResult {
298        self.search_grep(
299            pattern,
300            case_sensitive,
301            include,
302            exclude,
303            search_root,
304            max_results,
305        )
306    }
307
308    pub fn search_grep(
309        &self,
310        pattern: &str,
311        case_sensitive: bool,
312        include: &[String],
313        exclude: &[String],
314        search_root: &Path,
315        max_results: usize,
316    ) -> GrepResult {
317        // Detect if pattern is a plain literal (no regex metacharacters).
318        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
319        let is_literal = !pattern.chars().any(|c| {
320            matches!(
321                c,
322                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
323            )
324        });
325
326        let literal_search = if is_literal {
327            if case_sensitive {
328                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
329            } else if pattern.is_ascii() {
330                Some(LiteralSearch::AsciiCaseInsensitive(
331                    pattern
332                        .as_bytes()
333                        .iter()
334                        .map(|byte| byte.to_ascii_lowercase())
335                        .collect(),
336                ))
337            } else {
338                None
339            }
340        } else {
341            None
342        };
343
344        // Build the regex for non-literal patterns (or literal Unicode fallback).
345        let regex = if literal_search.is_some() {
346            None
347        } else {
348            let regex_pattern = if is_literal {
349                regex::escape(pattern)
350            } else {
351                pattern.to_string()
352            };
353            let mut builder = RegexBuilder::new(&regex_pattern);
354            builder.case_insensitive(!case_sensitive);
355            // Treat `^` and `$` as line anchors (grep semantics), not file anchors.
356            builder.multi_line(true);
357            match builder.build() {
358                Ok(r) => Some(r),
359                Err(_) => {
360                    return GrepResult {
361                        matches: Vec::new(),
362                        total_matches: 0,
363                        files_searched: 0,
364                        files_with_matches: 0,
365                        index_status: if self.ready {
366                            IndexStatus::Ready
367                        } else {
368                            IndexStatus::Building
369                        },
370                        truncated: false,
371                    };
372                }
373            }
374        };
375
376        let matcher = if let Some(literal_search) = literal_search {
377            SearchMatcher::Literal(literal_search)
378        } else {
379            SearchMatcher::Regex(
380                regex.expect("regex should exist when literal matcher is unavailable"),
381            )
382        };
383
384        let filters = match build_path_filters(include, exclude) {
385            Ok(filters) => filters,
386            Err(_) => PathFilters::default(),
387        };
388        let search_root = canonicalize_or_normalize(search_root);
389
390        let query = decompose_regex(pattern);
391        let candidate_ids = self.candidates(&query);
392
393        let candidate_files: Vec<&FileEntry> = candidate_ids
394            .into_iter()
395            .filter_map(|file_id| self.files.get(file_id as usize))
396            .filter(|file| !file.path.as_os_str().is_empty())
397            .filter(|file| is_within_search_root(&search_root, &file.path))
398            .filter(|file| filters.matches(&self.project_root, &file.path))
399            .collect();
400
401        let total_matches = AtomicUsize::new(0);
402        let files_searched = AtomicUsize::new(0);
403        let files_with_matches = AtomicUsize::new(0);
404        let truncated = AtomicBool::new(false);
405        let stop_after = max_results.saturating_mul(2);
406
407        let mut matches = if candidate_files.len() > 10 {
408            candidate_files
409                .par_iter()
410                .map(|file| {
411                    search_candidate_file(
412                        file,
413                        &matcher,
414                        max_results,
415                        stop_after,
416                        &total_matches,
417                        &files_searched,
418                        &files_with_matches,
419                        &truncated,
420                    )
421                })
422                .reduce(Vec::new, |mut left, mut right| {
423                    left.append(&mut right);
424                    left
425                })
426        } else {
427            let mut matches = Vec::new();
428            for file in candidate_files {
429                matches.extend(search_candidate_file(
430                    file,
431                    &matcher,
432                    max_results,
433                    stop_after,
434                    &total_matches,
435                    &files_searched,
436                    &files_with_matches,
437                    &truncated,
438                ));
439
440                if should_stop_search(&truncated, &total_matches, stop_after) {
441                    break;
442                }
443            }
444            matches
445        };
446
447        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
448            self.path_to_id
449                .get(path)
450                .and_then(|file_id| self.files.get(*file_id as usize))
451                .map(|file| file.modified)
452        });
453
454        let matches = matches
455            .into_iter()
456            .map(|matched| GrepMatch {
457                file: matched.file.as_ref().clone(),
458                line: matched.line,
459                column: matched.column,
460                line_text: matched.line_text,
461                match_text: matched.match_text,
462            })
463            .collect();
464
465        GrepResult {
466            total_matches: total_matches.load(Ordering::Relaxed),
467            matches,
468            files_searched: files_searched.load(Ordering::Relaxed),
469            files_with_matches: files_with_matches.load(Ordering::Relaxed),
470            index_status: if self.ready {
471                IndexStatus::Ready
472            } else {
473                IndexStatus::Building
474            },
475            truncated: truncated.load(Ordering::Relaxed),
476        }
477    }
478
479    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
480        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
481            Ok(filters) => filters,
482            Err(_) => return Vec::new(),
483        };
484        let search_root = canonicalize_or_normalize(search_root);
485        let filter_root = if search_root.starts_with(&self.project_root) {
486            &self.project_root
487        } else {
488            &search_root
489        };
490
491        let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
492        sort_paths_by_mtime_desc(&mut paths);
493        paths
494    }
495
496    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
497        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
498            return self.active_file_ids();
499        }
500
501        let mut and_trigrams = query.and_trigrams.clone();
502        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
503
504        let mut current: Option<Vec<u32>> = None;
505
506        for trigram in and_trigrams {
507            let filter = query.and_filters.get(&trigram).copied();
508            let matches = self.postings_for_trigram(trigram, filter);
509            current = Some(match current.take() {
510                Some(existing) => intersect_sorted_ids(&existing, &matches),
511                None => matches,
512            });
513
514            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
515                break;
516            }
517        }
518
519        let mut current = current.unwrap_or_else(|| self.active_file_ids());
520
521        for (index, group) in query.or_groups.iter().enumerate() {
522            let mut group_matches = Vec::new();
523            let filters = query.or_filters.get(index);
524
525            for trigram in group {
526                let filter = filters.and_then(|filters| filters.get(trigram).copied());
527                let matches = self.postings_for_trigram(*trigram, filter);
528                if group_matches.is_empty() {
529                    group_matches = matches;
530                } else {
531                    group_matches = union_sorted_ids(&group_matches, &matches);
532                }
533            }
534
535            current = intersect_sorted_ids(&current, &group_matches);
536            if current.is_empty() {
537                break;
538            }
539        }
540
541        let mut unindexed = self
542            .unindexed_files
543            .iter()
544            .copied()
545            .filter(|file_id| self.is_active_file(*file_id))
546            .collect::<Vec<_>>();
547        if !unindexed.is_empty() {
548            unindexed.sort_unstable();
549            current = union_sorted_ids(&current, &unindexed);
550        }
551
552        current
553    }
554
555    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
556        if fs::create_dir_all(cache_dir).is_err() {
557            return;
558        }
559
560        let postings_path = cache_dir.join("postings.bin");
561        let lookup_path = cache_dir.join("lookup.bin");
562        let tmp_postings = cache_dir.join("postings.bin.tmp");
563        let tmp_lookup = cache_dir.join("lookup.bin.tmp");
564
565        let active_ids = self.active_file_ids();
566        let mut id_map = HashMap::new();
567        for (new_id, old_id) in active_ids.iter().enumerate() {
568            let Ok(new_id_u32) = u32::try_from(new_id) else {
569                return;
570            };
571            id_map.insert(*old_id, new_id_u32);
572        }
573
574        let write_result = (|| -> std::io::Result<()> {
575            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);
576
577            postings_writer.write_all(INDEX_MAGIC)?;
578            write_u32(&mut postings_writer, INDEX_VERSION)?;
579
580            let head = git_head.unwrap_or_default();
581            let root = self.project_root.to_string_lossy();
582            let head_len = u32::try_from(head.len())
583                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
584            let root_len = u32::try_from(root.len())
585                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
586            let file_count = u32::try_from(active_ids.len())
587                .map_err(|_| std::io::Error::other("too many files to cache"))?;
588
589            write_u32(&mut postings_writer, head_len)?;
590            write_u32(&mut postings_writer, root_len)?;
591            write_u64(&mut postings_writer, self.max_file_size)?;
592            write_u32(&mut postings_writer, file_count)?;
593            postings_writer.write_all(head.as_bytes())?;
594            postings_writer.write_all(root.as_bytes())?;
595
596            for old_id in &active_ids {
597                let Some(file) = self.files.get(*old_id as usize) else {
598                    return Err(std::io::Error::other("missing file entry for cache write"));
599                };
600                let path = relative_to_root(&self.project_root, &file.path);
601                let path = path.to_string_lossy();
602                let path_len = u32::try_from(path.len())
603                    .map_err(|_| std::io::Error::other("cached path too large"))?;
604                let modified = file
605                    .modified
606                    .duration_since(UNIX_EPOCH)
607                    .unwrap_or(Duration::ZERO);
608                let unindexed = if self.unindexed_files.contains(old_id) {
609                    1u8
610                } else {
611                    0u8
612                };
613
614                postings_writer.write_all(&[unindexed])?;
615                write_u32(&mut postings_writer, path_len)?;
616                write_u64(&mut postings_writer, file.size)?;
617                write_u64(&mut postings_writer, modified.as_secs())?;
618                write_u32(&mut postings_writer, modified.subsec_nanos())?;
619                postings_writer.write_all(path.as_bytes())?;
620            }
621
622            let mut lookup_entries = Vec::new();
623            let mut postings_blob = Vec::new();
624            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
625            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
626
627            for (trigram, postings) in sorted_postings {
628                let offset = u64::try_from(postings_blob.len())
629                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
630                let mut count = 0u32;
631
632                for posting in postings {
633                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
634                        continue;
635                    };
636
637                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
638                    postings_blob.push(posting.next_mask);
639                    postings_blob.push(posting.loc_mask);
640                    count = count.saturating_add(1);
641                }
642
643                if count > 0 {
644                    lookup_entries.push((*trigram, offset, count));
645                }
646            }
647
648            write_u64(
649                &mut postings_writer,
650                u64::try_from(postings_blob.len())
651                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
652            )?;
653            postings_writer.write_all(&postings_blob)?;
654            postings_writer.flush()?;
655            drop(postings_writer);
656
657            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
658            let entry_count = u32::try_from(lookup_entries.len())
659                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
660
661            lookup_writer.write_all(LOOKUP_MAGIC)?;
662            write_u32(&mut lookup_writer, INDEX_VERSION)?;
663            write_u32(&mut lookup_writer, entry_count)?;
664
665            for (trigram, offset, count) in lookup_entries {
666                write_u32(&mut lookup_writer, trigram)?;
667                write_u64(&mut lookup_writer, offset)?;
668                write_u32(&mut lookup_writer, count)?;
669            }
670
671            lookup_writer.flush()?;
672            drop(lookup_writer);
673
674            fs::rename(&tmp_postings, &postings_path)?;
675            fs::rename(&tmp_lookup, &lookup_path)?;
676
677            Ok(())
678        })();
679
680        if write_result.is_err() {
681            let _ = fs::remove_file(&tmp_postings);
682            let _ = fs::remove_file(&tmp_lookup);
683        }
684    }
685
686    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
687        let postings_path = cache_dir.join("postings.bin");
688        let lookup_path = cache_dir.join("lookup.bin");
689
690        let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
691        let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);
692        let postings_len_total =
693            usize::try_from(postings_reader.get_ref().metadata().ok()?.len()).ok()?;
694        let lookup_len_total =
695            usize::try_from(lookup_reader.get_ref().metadata().ok()?.len()).ok()?;
696
697        let mut magic = [0u8; 8];
698        postings_reader.read_exact(&mut magic).ok()?;
699        if &magic != INDEX_MAGIC {
700            return None;
701        }
702        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
703            return None;
704        }
705
706        let head_len = read_u32(&mut postings_reader).ok()? as usize;
707        let root_len = read_u32(&mut postings_reader).ok()? as usize;
708        let max_file_size = read_u64(&mut postings_reader).ok()?;
709        let file_count = read_u32(&mut postings_reader).ok()? as usize;
710        if file_count > MAX_ENTRIES {
711            return None;
712        }
713        let remaining_postings = remaining_bytes(&mut postings_reader, postings_len_total)?;
714        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
715        if minimum_file_bytes > remaining_postings {
716            return None;
717        }
718
719        if head_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
720            return None;
721        }
722        let mut head_bytes = vec![0u8; head_len];
723        postings_reader.read_exact(&mut head_bytes).ok()?;
724        let git_head = String::from_utf8(head_bytes)
725            .ok()
726            .filter(|head| !head.is_empty());
727
728        if root_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
729            return None;
730        }
731        let mut root_bytes = vec![0u8; root_len];
732        postings_reader.read_exact(&mut root_bytes).ok()?;
733        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
734
735        let mut files = Vec::with_capacity(file_count);
736        let mut path_to_id = HashMap::new();
737        let mut unindexed_files = HashSet::new();
738
739        for file_id in 0..file_count {
740            let mut unindexed = [0u8; 1];
741            postings_reader.read_exact(&mut unindexed).ok()?;
742            let path_len = read_u32(&mut postings_reader).ok()? as usize;
743            let size = read_u64(&mut postings_reader).ok()?;
744            let secs = read_u64(&mut postings_reader).ok()?;
745            let nanos = read_u32(&mut postings_reader).ok()?;
746            if nanos >= 1_000_000_000 {
747                return None;
748            }
749            if path_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
750                return None;
751            }
752            let mut path_bytes = vec![0u8; path_len];
753            postings_reader.read_exact(&mut path_bytes).ok()?;
754            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
755            let full_path = project_root.join(relative_path);
756            let file_id_u32 = u32::try_from(file_id).ok()?;
757
758            files.push(FileEntry {
759                path: full_path.clone(),
760                size,
761                modified: UNIX_EPOCH + Duration::new(secs, nanos),
762            });
763            path_to_id.insert(full_path, file_id_u32);
764            if unindexed[0] == 1 {
765                unindexed_files.insert(file_id_u32);
766            }
767        }
768
769        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
770        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
771        if postings_len > max_postings_bytes {
772            return None;
773        }
774        if postings_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
775            return None;
776        }
777        let mut postings_blob = vec![0u8; postings_len];
778        postings_reader.read_exact(&mut postings_blob).ok()?;
779
780        let mut lookup_magic = [0u8; 8];
781        lookup_reader.read_exact(&mut lookup_magic).ok()?;
782        if &lookup_magic != LOOKUP_MAGIC {
783            return None;
784        }
785        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
786            return None;
787        }
788        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
789        if entry_count > MAX_ENTRIES {
790            return None;
791        }
792        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_len_total)?;
793        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
794        if minimum_lookup_bytes > remaining_lookup {
795            return None;
796        }
797
798        let mut postings = HashMap::new();
799        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
800
801        for _ in 0..entry_count {
802            let trigram = read_u32(&mut lookup_reader).ok()?;
803            let offset = read_u64(&mut lookup_reader).ok()? as usize;
804            let count = read_u32(&mut lookup_reader).ok()? as usize;
805            if count > MAX_ENTRIES {
806                return None;
807            }
808            let bytes_len = count.checked_mul(POSTING_BYTES)?;
809            let end = offset.checked_add(bytes_len)?;
810            if end > postings_blob.len() {
811                return None;
812            }
813
814            let mut trigram_postings = Vec::with_capacity(count);
815            for chunk in postings_blob[offset..end].chunks_exact(6) {
816                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
817                let posting = Posting {
818                    file_id,
819                    next_mask: chunk[4],
820                    loc_mask: chunk[5],
821                };
822                trigram_postings.push(posting.clone());
823                file_trigrams.entry(file_id).or_default().push(trigram);
824            }
825            postings.insert(trigram, trigram_postings);
826        }
827
828        Some(SearchIndex {
829            postings,
830            files,
831            path_to_id,
832            ready: true,
833            project_root,
834            git_head,
835            max_file_size,
836            file_trigrams,
837            unindexed_files,
838        })
839    }
840
841    pub(crate) fn stored_git_head(&self) -> Option<&str> {
842        self.git_head.as_deref()
843    }
844
845    pub(crate) fn set_ready(&mut self, ready: bool) {
846        self.ready = ready;
847    }
848
849    pub(crate) fn rebuild_or_refresh(
850        root: &Path,
851        max_file_size: u64,
852        current_head: Option<String>,
853        baseline: Option<SearchIndex>,
854    ) -> Self {
855        if current_head.is_none() {
856            return SearchIndex::build_with_limit(root, max_file_size);
857        }
858
859        if let Some(mut baseline) = baseline {
860            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
861            baseline.max_file_size = max_file_size;
862
863            if baseline.git_head == current_head {
864                // HEAD matches, but files may have changed on disk since the index was
865                // last written (e.g., uncommitted edits, stash pop, manual file changes
866                // while OpenCode was closed). Verify mtimes and re-index stale files.
867                verify_file_mtimes(&mut baseline);
868                baseline.ready = true;
869                return baseline;
870            }
871
872            if let (Some(previous), Some(current)) =
873                (baseline.git_head.clone(), current_head.clone())
874            {
875                let project_root = baseline.project_root.clone();
876                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
877                    baseline.git_head = Some(current);
878                    baseline.ready = true;
879                    return baseline;
880                }
881            }
882        }
883
884        SearchIndex::build_with_limit(root, max_file_size)
885    }
886
887    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
888        let file_id = u32::try_from(self.files.len()).ok()?;
889        let metadata = fs::metadata(path).ok();
890        let size = metadata
891            .as_ref()
892            .map_or(size_hint, |metadata| metadata.len());
893        let modified = metadata
894            .and_then(|metadata| metadata.modified().ok())
895            .unwrap_or(UNIX_EPOCH);
896
897        self.files.push(FileEntry {
898            path: path.to_path_buf(),
899            size,
900            modified,
901        });
902        self.path_to_id.insert(path.to_path_buf(), file_id);
903        Some(file_id)
904    }
905
906    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
907        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
908            return;
909        };
910        self.unindexed_files.insert(file_id);
911        self.file_trigrams.insert(file_id, Vec::new());
912    }
913
914    fn active_file_ids(&self) -> Vec<u32> {
915        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
916        ids.sort_unstable();
917        ids
918    }
919
920    fn is_active_file(&self, file_id: u32) -> bool {
921        self.files
922            .get(file_id as usize)
923            .map(|file| !file.path.as_os_str().is_empty())
924            .unwrap_or(false)
925    }
926
927    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
928        let Some(postings) = self.postings.get(&trigram) else {
929            return Vec::new();
930        };
931
932        let mut matches = Vec::with_capacity(postings.len());
933
934        for posting in postings {
935            if let Some(filter) = filter {
936                // next_mask: bloom filter check — the character following this trigram in the
937                // query must also appear after this trigram somewhere in the file.
938                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
939                    continue;
940                }
941                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
942                // between consecutive trigram pairs, but is NOT used as a single-trigram
943                // filter because the position in the query string has no relationship to
944                // the position in the file. Using it here causes false negatives.
945            }
946            if self.is_active_file(posting.file_id) {
947                matches.push(posting.file_id);
948            }
949        }
950
951        matches
952    }
953}
954
/// Scan a single candidate file for occurrences of `matcher`, recording at
/// most one match per line.
///
/// Shared atomics coordinate this worker with its siblings:
/// - `total_matches` — global match counter; incremented for every match kept.
/// - `files_searched` / `files_with_matches` — statistics counters.
/// - `truncated` — set once `max_results` is exceeded; together with
///   `stop_after` it drives `should_stop_search`, which ends scanning early.
///
/// Returns the matches found in this file (paths shared via `Arc`).
fn search_candidate_file(
    file: &FileEntry,
    matcher: &SearchMatcher,
    max_results: usize,
    stop_after: usize,
    total_matches: &AtomicUsize,
    files_searched: &AtomicUsize,
    files_with_matches: &AtomicUsize,
    truncated: &AtomicBool,
) -> Vec<SharedGrepMatch> {
    // Bail out before touching the filesystem if the search is already done.
    if should_stop_search(truncated, total_matches, stop_after) {
        return Vec::new();
    }

    let content = match read_indexed_file_bytes(&file.path) {
        Some(content) => content,
        None => return Vec::new(),
    };
    files_searched.fetch_add(1, Ordering::Relaxed);

    let shared_path = Arc::new(file.path.clone());
    let mut matches = Vec::new();
    // Line-start offsets are computed lazily — only once the first hit occurs.
    let mut line_starts = None;
    // Tracks lines already reported so each line yields at most one match.
    let mut seen_lines = HashSet::new();
    let mut matched_this_file = false;

    match matcher {
        // Case-sensitive literal: direct substring scan over the raw bytes.
        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            while let Some(position) = finder.find(&content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                // Advance one byte so overlapping occurrences are still visited.
                start = offset + 1;

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        // ASCII case-insensitive literal: scan over a lowercased copy, but
        // report text from the original bytes so casing is preserved.
        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
            let search_content = content.to_ascii_lowercase();
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            while let Some(position) = finder.find(&search_content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                start = offset + 1;

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        // Regex: iterate non-overlapping matches over the raw bytes.
        SearchMatcher::Regex(regex) => {
            for matched in regex.find_iter(&content) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) =
                    line_details_bytes(&content, line_starts, matched.start());
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
                });
            }
        }
    }

    if matched_this_file {
        files_with_matches.fetch_add(1, Ordering::Relaxed);
    }

    matches
}
1090
/// True once the search has been flagged as truncated AND the global match
/// count has reached `stop_after` — workers check this to end their scan.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1098
/// Intersection of two ascending id slices via a linear two-pointer merge.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let mut lhs = left.iter().peekable();
    let mut rhs = right.iter().peekable();

    while let (Some(&&a), Some(&&b)) = (lhs.peek(), rhs.peek()) {
        match a.cmp(&b) {
            std::cmp::Ordering::Less => {
                lhs.next();
            }
            std::cmp::Ordering::Greater => {
                rhs.next();
            }
            std::cmp::Ordering::Equal => {
                common.push(a);
                lhs.next();
                rhs.next();
            }
        }
    }

    common
}
1118
/// Union of two ascending id slices via a linear merge; ids present in both
/// inputs are emitted once.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut combined = Vec::with_capacity(left.len() + right.len());
    let mut lhs = left.iter().peekable();
    let mut rhs = right.iter().peekable();

    loop {
        match (lhs.peek(), rhs.peek()) {
            (Some(&&a), Some(&&b)) => match a.cmp(&b) {
                std::cmp::Ordering::Less => {
                    combined.push(a);
                    lhs.next();
                }
                std::cmp::Ordering::Greater => {
                    combined.push(b);
                    rhs.next();
                }
                std::cmp::Ordering::Equal => {
                    combined.push(a);
                    lhs.next();
                    rhs.next();
                }
            },
            // One side exhausted: drain the remainder of the other.
            (Some(&&a), None) => {
                combined.push(a);
                lhs.next();
            }
            (None, Some(&&b)) => {
                combined.push(b);
                rhs.next();
            }
            (None, None) => break,
        }
    }

    combined
}
1146
1147pub fn decompose_regex(pattern: &str) -> RegexQuery {
1148    let hir = match regex_syntax::parse(pattern) {
1149        Ok(hir) => hir,
1150        Err(_) => return RegexQuery::default(),
1151    };
1152
1153    let build = build_query(&hir);
1154    build.into_query()
1155}
1156
/// Pack three bytes into one `u32` big-endian-style: `a` in bits 16..24,
/// `b` in bits 8..16, `c` in bits 0..8.
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1160
/// Case-fold a byte: ASCII uppercase letters map to lowercase (by setting
/// bit 5), every other byte passes through unchanged.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        c | 0x20
    } else {
        c
    }
}
1164
1165pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1166    if content.len() < 3 {
1167        return Vec::new();
1168    }
1169
1170    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1171    for start in 0..=content.len() - 3 {
1172        let trigram = pack_trigram(
1173            normalize_char(content[start]),
1174            normalize_char(content[start + 1]),
1175            normalize_char(content[start + 2]),
1176        );
1177        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1178        trigrams.push((trigram, next_char, start));
1179    }
1180    trigrams
1181}
1182
1183pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1184    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1185    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1186        return PathBuf::from(override_dir)
1187            .join("index")
1188            .join(project_cache_key(project_root));
1189    }
1190    // Use configured storage dir (from plugin, XDG-compliant)
1191    if let Some(dir) = storage_dir {
1192        return dir.join("index").join(project_cache_key(project_root));
1193    }
1194    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage)
1195    let home = std::env::var_os("HOME")
1196        .map(PathBuf::from)
1197        .unwrap_or_else(|| PathBuf::from("."));
1198    home.join(".cache")
1199        .join("aft")
1200        .join("index")
1201        .join(project_cache_key(project_root))
1202}
1203
1204pub(crate) fn build_path_filters(
1205    include: &[String],
1206    exclude: &[String],
1207) -> Result<PathFilters, String> {
1208    Ok(PathFilters {
1209        includes: build_globset(include)?,
1210        excludes: build_globset(exclude)?,
1211    })
1212}
1213
/// Walk `root` for indexable files; convenience wrapper where the filter
/// root and the search root are the same directory.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
1217
1218pub(crate) fn walk_project_files_from(
1219    filter_root: &Path,
1220    search_root: &Path,
1221    filters: &PathFilters,
1222) -> Vec<PathBuf> {
1223    let mut builder = WalkBuilder::new(search_root);
1224    builder
1225        .hidden(false)
1226        .git_ignore(true)
1227        .git_global(true)
1228        .git_exclude(true)
1229        .filter_entry(|entry| {
1230            let name = entry.file_name().to_string_lossy();
1231            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1232                return !matches!(
1233                    name.as_ref(),
1234                    "node_modules"
1235                        | "target"
1236                        | "venv"
1237                        | ".venv"
1238                        | ".git"
1239                        | "__pycache__"
1240                        | ".tox"
1241                        | "dist"
1242                        | "build"
1243                );
1244            }
1245            true
1246        });
1247
1248    let mut files = Vec::new();
1249    for entry in builder.build().filter_map(|entry| entry.ok()) {
1250        if !entry
1251            .file_type()
1252            .map_or(false, |file_type| file_type.is_file())
1253        {
1254            continue;
1255        }
1256        let path = entry.into_path();
1257        if filters.matches(filter_root, &path) {
1258            files.push(path);
1259        }
1260    }
1261
1262    sort_paths_by_mtime_desc(&mut files);
1263    files
1264}
1265
1266pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1267    let bytes = fs::read(path).ok()?;
1268    if is_binary_bytes(&bytes) {
1269        return None;
1270    }
1271    String::from_utf8(bytes).ok()
1272}
1273
/// Read a file's raw bytes for searching; `None` on any I/O error
/// (e.g. the file was deleted after being indexed).
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    fs::read(path).ok()
}
1277
/// Express `path` relative to `root`; if `path` is not under `root`, the
/// path is returned unchanged.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1283
1284/// Sort paths newest-first by mtime, falling back to lexicographic order.
1285///
1286/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1287/// `sort_by()` closure. That made the comparator non-deterministic — a
1288/// `stat()` syscall for the same path can return different values across
1289/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1290/// concurrent file-watcher activity), and Rust's slice::sort panics at
1291/// runtime when it detects a non-total-order comparator. CI hit this on
1292/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1293///
1294/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1295/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1296pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1297    use std::collections::HashMap;
1298    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1299    for path in paths.iter() {
1300        mtimes
1301            .entry(path.clone())
1302            .or_insert_with(|| path_modified_time(path));
1303    }
1304    paths.sort_by(|left, right| {
1305        let left_mtime = mtimes.get(left).and_then(|v| *v);
1306        let right_mtime = mtimes.get(right).and_then(|v| *v);
1307        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1308    });
1309}
1310
1311/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1312/// the sort. Same fix, applied to grep matches that share files.
1313pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1314    use std::collections::HashMap;
1315    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1316    for m in matches.iter() {
1317        mtimes.entry(m.file.clone()).or_insert_with(|| {
1318            let resolved = resolve_match_path(project_root, &m.file);
1319            path_modified_time(&resolved)
1320        });
1321    }
1322    matches.sort_by(|left, right| {
1323        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1324        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1325        right_mtime
1326            .cmp(&left_mtime)
1327            .then_with(|| left.file.cmp(&right.file))
1328            .then_with(|| left.line.cmp(&right.line))
1329            .then_with(|| left.column.cmp(&right.column))
1330    });
1331}
1332
1333/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1334/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1335/// table from the search index), but it can still return different values if
1336/// the file is modified mid-sort. Snapshot once.
1337fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1338    matches: &mut [SharedGrepMatch],
1339    modified_for_path: F,
1340) where
1341    F: Fn(&Path) -> Option<SystemTime>,
1342{
1343    use std::collections::HashMap;
1344    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1345    for m in matches.iter() {
1346        let path = m.file.as_path().to_path_buf();
1347        mtimes
1348            .entry(path.clone())
1349            .or_insert_with(|| modified_for_path(&path));
1350    }
1351    matches.sort_by(|left, right| {
1352        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1353        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1354        right_mtime
1355            .cmp(&left_mtime)
1356            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1357            .then_with(|| left.line.cmp(&right.line))
1358            .then_with(|| left.column.cmp(&right.column))
1359    });
1360}
1361
1362pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1363    let resolved_project_root = canonicalize_or_normalize(project_root);
1364    let root = match path {
1365        Some(path) => {
1366            let path = PathBuf::from(path);
1367            if path.is_absolute() {
1368                canonicalize_or_normalize(&path)
1369            } else {
1370                normalize_path(&resolved_project_root.join(path))
1371            }
1372        }
1373        None => resolved_project_root.clone(),
1374    };
1375
1376    let use_index = is_within_search_root(&resolved_project_root, &root);
1377    SearchScope { root, use_index }
1378}
1379
/// Heuristic binary-content check, delegated to `content_inspector`.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
1383
/// Current git HEAD commit hash for `root`, or `None` if git fails
/// (e.g. not a git repository).
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
1387
1388pub(crate) fn project_cache_key(project_root: &Path) -> String {
1389    use sha2::{Digest, Sha256};
1390
1391    let mut hasher = Sha256::new();
1392
1393    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1394        // Git repo: root commit is the unique identity.
1395        // Same repo cloned anywhere produces the same key.
1396        hasher.update(root_commit.as_bytes());
1397    } else {
1398        // Non-git project: use the canonical filesystem path as identity.
1399        let canonical_root = canonicalize_or_normalize(project_root);
1400        hasher.update(canonical_root.to_string_lossy().as_bytes());
1401    }
1402
1403    let digest = format!("{:x}", hasher.finalize());
1404    digest[..16].to_string()
1405}
1406
1407impl PathFilters {
1408    fn matches(&self, root: &Path, path: &Path) -> bool {
1409        let relative = to_glob_path(&relative_to_root(root, path));
1410        if self
1411            .includes
1412            .as_ref()
1413            .is_some_and(|includes| !includes.is_match(&relative))
1414        {
1415            return false;
1416        }
1417        if self
1418            .excludes
1419            .as_ref()
1420            .is_some_and(|excludes| excludes.is_match(&relative))
1421        {
1422            return false;
1423        }
1424        true
1425    }
1426}
1427
/// Canonicalize `path` via the filesystem; if that fails (e.g. the path does
/// not exist), fall back to a purely lexical normalization.
fn canonicalize_or_normalize(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
}
1431
/// Make a match path absolute: relative paths are anchored at the project
/// root, absolute paths pass through unchanged.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_relative() {
        project_root.join(path)
    } else {
        path.to_path_buf()
    }
}
1439
/// Mtime of `path`, or `None` if the stat or the mtime lookup fails.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    fs::metadata(path).ok()?.modified().ok()
}
1445
/// Lexically normalize a path: drop `.` components and resolve `..` against
/// the components accumulated so far. A `..` that cannot be popped (leading
/// `..`) is kept as-is. No filesystem access, so symlinks are not resolved.
fn normalize_path(path: &Path) -> PathBuf {
    path.components().fold(PathBuf::new(), |mut acc, component| {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                if !acc.pop() {
                    acc.push(component);
                }
            }
            other => acc.push(other),
        }
        acc
    })
}
1461
1462/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1463/// since the index was last written. Also detect new files and deleted files.
1464fn verify_file_mtimes(index: &mut SearchIndex) {
1465    // Collect stale files (mtime mismatch or deleted)
1466    let mut stale_paths = Vec::new();
1467    for entry in &index.files {
1468        if entry.path.as_os_str().is_empty() {
1469            continue; // tombstoned entry
1470        }
1471        match fs::metadata(&entry.path) {
1472            Ok(meta) => {
1473                let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1474                if current_mtime != entry.modified || meta.len() != entry.size {
1475                    stale_paths.push(entry.path.clone());
1476                }
1477            }
1478            Err(_) => {
1479                // File deleted
1480                stale_paths.push(entry.path.clone());
1481            }
1482        }
1483    }
1484
1485    // Re-index stale files
1486    for path in &stale_paths {
1487        index.update_file(path);
1488    }
1489
1490    // Detect new files not in the index
1491    let filters = PathFilters::default();
1492    for path in walk_project_files(&index.project_root, &filters) {
1493        if !index.path_to_id.contains_key(&path) {
1494            index.update_file(&path);
1495        }
1496    }
1497
1498    if !stale_paths.is_empty() {
1499        log::info!(
1500            "[aft] search index: refreshed {} stale file(s) from disk cache",
1501            stale_paths.len()
1502        );
1503    }
1504}
1505
/// Whether `path` is `search_root` itself or lies beneath it
/// (component-wise prefix check, no filesystem access).
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.starts_with(search_root)
}
1509
1510impl QueryBuild {
1511    fn into_query(self) -> RegexQuery {
1512        let mut query = RegexQuery::default();
1513
1514        for run in self.and_runs {
1515            add_run_to_and_query(&mut query, &run);
1516        }
1517
1518        for group in self.or_groups {
1519            let mut trigrams = BTreeSet::new();
1520            let mut filters = HashMap::new();
1521            for run in group {
1522                for (trigram, filter) in trigram_filters(&run) {
1523                    trigrams.insert(trigram);
1524                    merge_filter(filters.entry(trigram).or_default(), filter);
1525                }
1526            }
1527            if !trigrams.is_empty() {
1528                query.or_groups.push(trigrams.into_iter().collect());
1529                query.or_filters.push(filters);
1530            }
1531        }
1532
1533        query
1534    }
1535}
1536
/// Lower a parsed regex HIR node into trigram requirements.
///
/// `and_runs` holds literal byte runs every match must contain; `or_groups`
/// holds alternations where at least one run must appear. Any construct that
/// cannot guarantee a literal run of at least 3 bytes degrades to the empty
/// `QueryBuild`, which means "cannot prune with the index".
fn build_query(hir: &Hir) -> QueryBuild {
    match hir.kind() {
        HirKind::Literal(literal) => {
            // Only runs of >= 3 bytes yield at least one trigram.
            if literal.0.len() >= 3 {
                QueryBuild {
                    and_runs: vec![literal.0.to_vec()],
                    or_groups: Vec::new(),
                }
            } else {
                QueryBuild::default()
            }
        }
        // Capture groups are transparent for matching purposes.
        HirKind::Capture(capture) => build_query(&capture.sub),
        HirKind::Concat(parts) => {
            // Concatenation: requirements from every part apply simultaneously.
            let mut build = QueryBuild::default();
            for part in parts {
                let part_build = build_query(part);
                build.and_runs.extend(part_build.and_runs);
                build.or_groups.extend(part_build.or_groups);
            }
            build
        }
        HirKind::Alternation(parts) => {
            // Every branch must yield guaranteed runs, otherwise the
            // alternation as a whole gives no usable constraint.
            let mut group = Vec::new();
            for part in parts {
                let Some(mut choices) = guaranteed_run_choices(part) else {
                    return QueryBuild::default();
                };
                group.append(&mut choices);
            }
            if group.is_empty() {
                QueryBuild::default()
            } else {
                QueryBuild {
                    and_runs: Vec::new(),
                    or_groups: vec![group],
                }
            }
        }
        HirKind::Repetition(repetition) => {
            // `min == 0` (e.g. `x*`, `x?`) guarantees nothing; otherwise at
            // least one occurrence of the body is required.
            if repetition.min == 0 {
                QueryBuild::default()
            } else {
                build_query(&repetition.sub)
            }
        }
        // Empty matches, character classes, and look-arounds provide no
        // literal runs to index on.
        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
    }
}
1586
/// Literal runs such that any string matching `hir` is guaranteed to contain
/// at least one of them; `None` when no such guarantee can be made.
///
/// Used by `build_query` for alternation branches: the result is interpreted
/// with OR semantics by the caller.
fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
    match hir.kind() {
        HirKind::Literal(literal) => {
            // A literal shorter than 3 bytes yields no trigram — no guarantee.
            if literal.0.len() >= 3 {
                Some(vec![literal.0.to_vec()])
            } else {
                None
            }
        }
        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
        HirKind::Concat(parts) => {
            // Every match contains a run from each contributing part, so
            // pooling all parts' runs keeps the "at least one is present"
            // guarantee (a conservative union).
            let mut runs = Vec::new();
            for part in parts {
                if let Some(mut part_runs) = guaranteed_run_choices(part) {
                    runs.append(&mut part_runs);
                }
            }
            if runs.is_empty() {
                None
            } else {
                Some(runs)
            }
        }
        HirKind::Alternation(parts) => {
            // Every branch must guarantee runs; one unguaranteed branch
            // invalidates the whole alternation.
            let mut runs = Vec::new();
            for part in parts {
                let Some(mut part_runs) = guaranteed_run_choices(part) else {
                    return None;
                };
                runs.append(&mut part_runs);
            }
            if runs.is_empty() {
                None
            } else {
                Some(runs)
            }
        }
        HirKind::Repetition(repetition) => {
            // `min == 0` means the body may be absent — no guarantee.
            if repetition.min == 0 {
                None
            } else {
                guaranteed_run_choices(&repetition.sub)
            }
        }
        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
    }
}
1634
1635fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1636    for (trigram, filter) in trigram_filters(run) {
1637        if !query.and_trigrams.contains(&trigram) {
1638            query.and_trigrams.push(trigram);
1639        }
1640        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1641    }
1642}
1643
1644fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1645    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1646    for (trigram, next_char, position) in extract_trigrams(run) {
1647        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1648        if next_char != EOF_SENTINEL {
1649            entry.next_mask |= mask_for_next_char(next_char);
1650        }
1651        entry.loc_mask |= mask_for_position(position);
1652    }
1653    filters.into_iter().collect()
1654}
1655
/// OR another filter's bloom masks into `target`.
fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
    target.next_mask |= filter.next_mask;
    target.loc_mask |= filter.loc_mask;
}
1660
1661fn mask_for_next_char(next_char: u8) -> u8 {
1662    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1663    1u8 << bit
1664}
1665
/// Single-bit mask for a trigram's position modulo 8 (bitwise AND with 7 is
/// equivalent to `% 8` for unsigned values).
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
1669
1670fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1671    if patterns.is_empty() {
1672        return Ok(None);
1673    }
1674
1675    let mut builder = GlobSetBuilder::new();
1676    for pattern in patterns {
1677        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1678        builder.add(glob);
1679    }
1680    builder.build().map(Some).map_err(|error| error.to_string())
1681}
1682
/// Read a little-endian `u32` from `reader`.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes).map(|_| u32::from_le_bytes(bytes))
}
1688
/// Read a little-endian `u64` from `reader`.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut bytes = [0u8; 8];
    reader.read_exact(&mut bytes).map(|_| u64::from_le_bytes(bytes))
}
1694
/// Write `value` to `writer` as little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let encoded = value.to_le_bytes();
    writer.write_all(&encoded)
}
1698
/// Write `value` to `writer` as little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let encoded = value.to_le_bytes();
    writer.write_all(&encoded)
}
1702
/// Bytes left between the reader's current position and `total_len`;
/// `None` if the position cannot be read, does not fit in `usize`, or is
/// already past `total_len`.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    total_len.checked_sub(usize::try_from(position).ok()?)
}
1707
/// Run `git -C <root> <args>` and return its trimmed stdout.
/// `None` on spawn failure, a non-zero exit, invalid UTF-8, or empty output.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
1726
1727fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1728    let diff_range = format!("{}..{}", from, to);
1729    let output = match Command::new("git")
1730        .arg("-C")
1731        .arg(root)
1732        .args(["diff", "--name-only", &diff_range])
1733        .output()
1734    {
1735        Ok(output) => output,
1736        Err(_) => return false,
1737    };
1738
1739    if !output.status.success() {
1740        return false;
1741    }
1742
1743    let Ok(paths) = String::from_utf8(output.stdout) else {
1744        return false;
1745    };
1746
1747    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1748        let path = root.join(relative_path);
1749        if path.exists() {
1750            index.update_file(&path);
1751        } else {
1752            index.remove_file(&path);
1753        }
1754    }
1755
1756    true
1757}
1758
1759fn is_binary_path(path: &Path, size: u64) -> bool {
1760    if size == 0 {
1761        return false;
1762    }
1763
1764    let mut file = match File::open(path) {
1765        Ok(file) => file,
1766        Err(_) => return true,
1767    };
1768
1769    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1770    match file.read(&mut preview) {
1771        Ok(read) => is_binary_bytes(&preview[..read]),
1772        Err(_) => true,
1773    }
1774}
1775
/// Byte offsets at which each line of `content` begins.
///
/// Offset 0 is always included, and every byte following a `\n` starts a new
/// line (so a trailing newline contributes a final, empty line).
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    let newline_starts = content
        .iter()
        .enumerate()
        .filter(|(_, byte)| **byte == b'\n')
        .map(|(index, _)| index + 1);
    std::iter::once(0).chain(newline_starts).collect()
}
1785
/// Resolves a byte `offset` into a 1-based (line, column, line text) triple.
///
/// `line_starts` must be the offsets produced by `line_starts_bytes` for the
/// same `content`. A trailing `\r` is stripped from the returned line text,
/// and the column counts decoded characters (lossy UTF-8), not bytes.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // Exact hit means the offset begins a line; otherwise take the line
    // whose start precedes the offset.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
    let line_end = content[line_start..]
        .iter()
        .position(|byte| *byte == b'\n')
        .map_or(content.len(), |length| line_start + length);
    let raw_line = &content[line_start..line_end];
    let line_slice = raw_line.strip_suffix(b"\r").unwrap_or(raw_line);
    let line_text = String::from_utf8_lossy(line_slice).into_owned();
    let prefix = String::from_utf8_lossy(&content[line_start..offset]);
    let column = prefix.chars().count() as u32 + 1;
    (line_index as u32 + 1, column, line_text)
}
1808
/// Renders `path` with forward slashes so glob matching behaves the same on
/// every platform.
fn to_glob_path(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.replace('\\', "/")
}
1812
/// Unit tests covering trigram extraction, regex decomposition, candidate
/// pruning, on-disk round-trips, cache keys, grep/glob scoping, and the
/// v0.15.2 mtime-sort regressions.
#[cfg(test)]
mod tests {
    use std::process::Command;

    use super::*;

    // "Rust" normalizes to lowercase; each trigram carries the byte that
    // follows it (EOF_SENTINEL at end of input) and its position.
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }

    // Literal runs outside the alternation become required AND trigrams;
    // the two alternation arms land together in a single OR group.
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }

    // Only alpha.txt contains both "abc" and "def", so intersecting the two
    // posting lists must yield exactly that one file.
    #[test]
    fn candidates_intersect_posting_lists() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }

    // Postings are pruned by the next-char / position bloom masks: a filter
    // built from the real next byte matches, a wrong next byte does not.
    #[test]
    fn candidates_apply_bloom_filters() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }

    // Writing the index and reading it back preserves the stored git head,
    // the file entries (relative to the project root), and the postings.
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }

    // The final artifacts must exist and the intermediate `.tmp` files must
    // be gone after a successful write.
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("postings.bin").is_file());
        assert!(cache_dir.join("lookup.bin").is_file());
        assert!(!cache_dir.join("postings.bin.tmp").exists());
        assert!(!cache_dir.join("lookup.bin.tmp").exists());
    }

    // Two checkouts of the same repository must map to the same cache key.
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        // Same repo (same root commit) → same cache key regardless of clone path
        assert_eq!(source_key, clone_key);
    }

    // A search path outside the project root resolves to a scope rooted at
    // that (canonicalized) path with the index disabled.
    #[test]
    fn resolve_search_scope_disables_index_for_external_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let outside = dir.path().join("outside");
        fs::create_dir_all(&project).expect("create project dir");
        fs::create_dir_all(&outside).expect("create outside dir");

        let scope = resolve_search_scope(&project, outside.to_str());

        assert_eq!(
            scope.root,
            fs::canonicalize(&outside).expect("canonicalize outside")
        );
        assert!(!scope.use_index);
    }

    // Matches under docs/ must be excluded when the search root is src/.
    #[test]
    fn grep_filters_matches_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let docs = project.join("docs");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&docs).expect("create docs dir");
        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.files_searched, 1);
        assert_eq!(result.files_with_matches, 1);
        assert_eq!(result.matches.len(), 1);
        // Index stores canonicalized paths; on macOS /var → /private/var
        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
        assert_eq!(result.matches[0].file, expected);
    }

    // Two hits on the same line collapse into a single reported match.
    #[test]
    fn grep_deduplicates_multiple_matches_on_same_line() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.total_matches, 1);
        assert_eq!(result.matches.len(), 1);
    }

    // total_matches counts every hit even when max_results truncates the
    // returned match list.
    #[test]
    fn grep_reports_total_matches_before_truncation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);

        assert_eq!(result.total_matches, 2);
        assert_eq!(result.matches.len(), 1);
        assert!(result.truncated);
    }

    // Glob results are limited to the requested search root (scripts/ is
    // excluded when globbing from src/).
    #[test]
    fn glob_filters_results_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let scripts = project.join("scripts");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&scripts).expect("create scripts dir");
        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.rs", &src);

        assert_eq!(
            files,
            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
        );
    }

    // Unlike grep, glob must surface files in hidden directories and files
    // with binary content.
    #[test]
    fn glob_includes_hidden_and_binary_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let hidden_dir = project.join(".hidden");
        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
        let hidden_file = hidden_dir.join("data.bin");
        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.bin", &project);

        assert_eq!(
            files,
            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
        );
    }

    // A hand-built cache whose file entry carries a nanoseconds field of
    // 1_000_000_000 (out of range) must be rejected on load.
    #[test]
    fn read_from_disk_rejects_invalid_nanos() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let cache_dir = dir.path().join("cache");
        fs::create_dir_all(&cache_dir).expect("create cache dir");

        // Hand-crafted postings file: header, root path, then one file entry
        // with an invalid nanos value.
        let mut postings = Vec::new();
        postings.extend_from_slice(INDEX_MAGIC);
        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        postings.extend_from_slice(&0u32.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(b"/");
        postings.push(0u8);
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
        postings.extend_from_slice(b"a");
        postings.extend_from_slice(&0u64.to_le_bytes());

        // Minimal valid lookup file so the failure is isolated to postings.
        let mut lookup = Vec::new();
        lookup.extend_from_slice(LOOKUP_MAGIC);
        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        lookup.extend_from_slice(&0u32.to_le_bytes());

        fs::write(cache_dir.join("postings.bin"), postings).expect("write postings");
        fs::write(cache_dir.join("lookup.bin"), lookup).expect("write lookup");

        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
    }

    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
    /// changed between cmp() calls.
    ///
    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
    /// which does a `stat()` syscall. If the file was deleted, modified, or
    /// touched mid-sort, the comparator returned different values for the
    /// same input pair on different invocations. Rust's slice::sort detects
    /// this and panics with "user-provided comparison function does not
    /// correctly implement a total order".
    ///
    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
    /// bridge invalidated files in parallel with grep's sort path. This
    /// test simulates the worst case: most paths don't exist (Err from
    /// fs::metadata) and sort still completes successfully.
    #[test]
    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
        // Mix of existing and non-existing paths in deliberately
        // non-monotonic order — pre-fix, the sort would call stat() at
        // least N log N times and any flakiness would trigger the panic.
        let dir = tempfile::tempdir().expect("create tempdir");
        let mut paths: Vec<PathBuf> = Vec::new();
        for i in 0..30 {
            // Half exist, half don't.
            let path = if i % 2 == 0 {
                let p = dir.path().join(format!("real-{i}.rs"));
                fs::write(&p, format!("// {i}\n")).expect("write");
                p
            } else {
                dir.path().join(format!("missing-{i}.rs"))
            };
            paths.push(path);
        }

        // Run the sort many times to maximise the chance of catching any
        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
        for _ in 0..50 {
            let mut copy = paths.clone();
            sort_paths_by_mtime_desc(&mut copy);
            assert_eq!(copy.len(), paths.len());
        }
    }

    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
    /// the same conditions as sort_paths_by_mtime_desc. See the
    /// sort_paths_... test above for the full rationale.
    #[test]
    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
        let dir = tempfile::tempdir().expect("create tempdir");
        let mut matches: Vec<GrepMatch> = Vec::new();
        for i in 0..30 {
            let file = if i % 2 == 0 {
                let p = dir.path().join(format!("real-{i}.rs"));
                fs::write(&p, format!("// {i}\n")).expect("write");
                p
            } else {
                dir.path().join(format!("missing-{i}.rs"))
            };
            matches.push(GrepMatch {
                file,
                line: u32::try_from(i).unwrap_or(0),
                column: 0,
                line_text: format!("match {i}"),
                match_text: format!("match {i}"),
            });
        }

        for _ in 0..50 {
            let mut copy = matches.clone();
            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
            assert_eq!(copy.len(), matches.len());
        }
    }
}
2203}