aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
19const CACHE_MAGIC: u32 = 0x3144_4958; // "XID1" little-endian
20const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
21const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
22const INDEX_VERSION: u32 = 2;
23const PREVIEW_BYTES: usize = 8 * 1024;
24const EOF_SENTINEL: u8 = 0;
25const MAX_ENTRIES: usize = 10_000_000;
26const MIN_FILE_ENTRY_BYTES: usize = 25;
27const LOOKUP_ENTRY_BYTES: usize = 16;
28const POSTING_BYTES: usize = 6;
29
30pub struct CacheLock {
31    path: PathBuf,
32}
33
34impl CacheLock {
35    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
36        fs::create_dir_all(cache_dir)?;
37        let path = cache_dir.join("cache.lock");
38        for _ in 0..200 {
39            match fs::OpenOptions::new()
40                .write(true)
41                .create_new(true)
42                .open(&path)
43            {
44                Ok(mut file) => {
45                    let _ = writeln!(file, "{}", std::process::id());
46                    let _ = file.sync_all();
47                    return Ok(Self { path });
48                }
49                Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => {
50                    std::thread::sleep(Duration::from_millis(10));
51                }
52                Err(error) => return Err(error),
53            }
54        }
55        Err(std::io::Error::other(
56            "timed out acquiring search cache lock",
57        ))
58    }
59}
60
61impl Drop for CacheLock {
62    fn drop(&mut self) {
63        let _ = fs::remove_file(&self.path);
64    }
65}
66
67#[derive(Clone, Debug)]
68pub struct SearchIndex {
69    pub postings: HashMap<u32, Vec<Posting>>,
70    pub files: Vec<FileEntry>,
71    pub path_to_id: HashMap<PathBuf, u32>,
72    pub ready: bool,
73    project_root: PathBuf,
74    git_head: Option<String>,
75    max_file_size: u64,
76    pub file_trigrams: HashMap<u32, Vec<u32>>,
77    unindexed_files: HashSet<u32>,
78}
79
80impl SearchIndex {
81    /// Number of indexed files.
82    pub fn file_count(&self) -> usize {
83        self.files.len()
84    }
85
86    /// Number of unique trigrams in the index.
87    pub fn trigram_count(&self) -> usize {
88        self.postings.len()
89    }
90
91    /// Compute distinct query trigrams from literal tokens.
92    pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
93        query_trigrams_from_tokens(tokens)
94    }
95
96    /// Score-rank file candidates by lexical relevance to query trigrams.
97    pub fn lexical_rank(
98        &self,
99        query_trigrams: &[u32],
100        candidate_filter: Option<&dyn Fn(&Path) -> bool>,
101        max_files: usize,
102    ) -> Vec<(PathBuf, f32)> {
103        if query_trigrams.is_empty() || max_files == 0 {
104            return Vec::new();
105        }
106
107        let mut non_zero: Vec<(u32, usize)> = query_trigrams
108            .iter()
109            .filter_map(|trigram| {
110                let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
111                (posting_count > 0).then_some((*trigram, posting_count))
112            })
113            .collect();
114        if non_zero.is_empty() {
115            return Vec::new();
116        }
117
118        non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
119        let selected_count = non_zero.len().min(3);
120        let candidate_cap = if selected_count == 3 { 200 } else { 500 };
121
122        let mut candidate_ids = BTreeSet::new();
123        for (trigram, _) in non_zero.iter().take(selected_count) {
124            if let Some(postings) = self.postings.get(trigram) {
125                for posting in postings {
126                    if self.is_active_file(posting.file_id) {
127                        candidate_ids.insert(posting.file_id);
128                    }
129                }
130            }
131        }
132
133        let mut ranked = Vec::new();
134        for file_id in candidate_ids.into_iter().take(candidate_cap) {
135            let Some(entry) = self.files.get(file_id as usize) else {
136                continue;
137            };
138            if let Some(filter) = candidate_filter {
139                if !filter(&entry.path) {
140                    continue;
141                }
142            }
143            let score = lexical_score(self, query_trigrams, file_id);
144            if score > 0.0 {
145                ranked.push((entry.path.clone(), score));
146            }
147        }
148
149        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
150        ranked.truncate(max_files);
151        ranked
152    }
153}
154
155#[derive(Clone, Debug, PartialEq, Eq)]
156pub struct Posting {
157    pub file_id: u32,
158    pub next_mask: u8,
159    pub loc_mask: u8,
160}
161
162#[derive(Clone, Debug)]
163pub struct FileEntry {
164    pub path: PathBuf,
165    pub size: u64,
166    pub modified: SystemTime,
167}
168
169#[derive(Clone, Debug, PartialEq, Eq)]
170pub struct GrepMatch {
171    pub file: PathBuf,
172    pub line: u32,
173    pub column: u32,
174    pub line_text: String,
175    pub match_text: String,
176}
177
178#[derive(Clone, Debug)]
179pub struct GrepResult {
180    pub matches: Vec<GrepMatch>,
181    pub total_matches: usize,
182    pub files_searched: usize,
183    pub files_with_matches: usize,
184    pub index_status: IndexStatus,
185    pub truncated: bool,
186}
187
188#[derive(Clone, Copy, Debug, PartialEq, Eq)]
189pub enum IndexStatus {
190    Ready,
191    Building,
192    Fallback,
193}
194
195impl IndexStatus {
196    pub fn as_str(&self) -> &'static str {
197        match self {
198            IndexStatus::Ready => "Ready",
199            IndexStatus::Building => "Building",
200            IndexStatus::Fallback => "Fallback",
201        }
202    }
203}
204
205#[derive(Clone, Debug, Default)]
206pub struct RegexQuery {
207    pub and_trigrams: Vec<u32>,
208    pub or_groups: Vec<Vec<u32>>,
209    pub(crate) and_filters: HashMap<u32, PostingFilter>,
210    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
211}
212
213#[derive(Clone, Copy, Debug, Default)]
214pub(crate) struct PostingFilter {
215    next_mask: u8,
216    loc_mask: u8,
217}
218
219#[derive(Clone, Debug, Default)]
220struct QueryBuild {
221    and_runs: Vec<Vec<u8>>,
222    or_groups: Vec<Vec<Vec<u8>>>,
223}
224
225#[derive(Clone, Debug, Default)]
226pub(crate) struct PathFilters {
227    includes: Option<GlobSet>,
228    excludes: Option<GlobSet>,
229}
230
231#[derive(Clone, Debug)]
232pub(crate) struct SearchScope {
233    pub root: PathBuf,
234    pub use_index: bool,
235}
236
237#[derive(Clone, Debug)]
238struct SharedGrepMatch {
239    file: Arc<PathBuf>,
240    line: u32,
241    column: u32,
242    line_text: String,
243    match_text: String,
244}
245
246#[derive(Clone, Debug)]
247enum SearchMatcher {
248    Literal(LiteralSearch),
249    Regex(Regex),
250}
251
252#[derive(Clone, Debug)]
253enum LiteralSearch {
254    CaseSensitive(Vec<u8>),
255    AsciiCaseInsensitive(Vec<u8>),
256}
257
258impl SearchIndex {
259    pub fn new() -> Self {
260        SearchIndex {
261            postings: HashMap::new(),
262            files: Vec::new(),
263            path_to_id: HashMap::new(),
264            ready: false,
265            project_root: PathBuf::new(),
266            git_head: None,
267            max_file_size: DEFAULT_MAX_FILE_SIZE,
268            file_trigrams: HashMap::new(),
269            unindexed_files: HashSet::new(),
270        }
271    }
272
273    pub fn build(root: &Path) -> Self {
274        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
275    }
276
277    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
278        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
279        let mut index = SearchIndex {
280            project_root: project_root.clone(),
281            max_file_size,
282            ..SearchIndex::new()
283        };
284
285        let filters = PathFilters::default();
286        for path in walk_project_files(&project_root, &filters) {
287            index.update_file(&path);
288        }
289
290        index.git_head = current_git_head(&project_root);
291        index.ready = true;
292        index
293    }
294
295    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
296        self.remove_file(path);
297
298        let file_id = match self.allocate_file_id(path, content.len() as u64) {
299            Some(file_id) => file_id,
300            None => return,
301        };
302
303        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
304        for (trigram, next_char, position) in extract_trigrams(content) {
305            let entry = trigram_map.entry(trigram).or_default();
306            entry.next_mask |= mask_for_next_char(next_char);
307            entry.loc_mask |= mask_for_position(position);
308        }
309
310        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
311        for (trigram, filter) in trigram_map {
312            let postings = self.postings.entry(trigram).or_default();
313            postings.push(Posting {
314                file_id,
315                next_mask: filter.next_mask,
316                loc_mask: filter.loc_mask,
317            });
318            // Posting lists are kept sorted by file_id for binary search during
319            // intersection. Since file_ids are allocated incrementally, the new
320            // entry is usually already in order. Only sort when needed.
321            if postings.len() > 1
322                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
323            {
324                postings.sort_unstable_by_key(|p| p.file_id);
325            }
326            file_trigrams.push(trigram);
327        }
328
329        self.file_trigrams.insert(file_id, file_trigrams);
330        self.unindexed_files.remove(&file_id);
331    }
332
333    pub fn remove_file(&mut self, path: &Path) {
334        let Some(file_id) = self.path_to_id.remove(path) else {
335            return;
336        };
337
338        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
339            for trigram in trigrams {
340                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
341                    postings.retain(|posting| posting.file_id != file_id);
342                    postings.is_empty()
343                } else {
344                    false
345                };
346
347                if should_remove {
348                    self.postings.remove(&trigram);
349                }
350            }
351        }
352
353        self.unindexed_files.remove(&file_id);
354        if let Some(file) = self.files.get_mut(file_id as usize) {
355            file.path = PathBuf::new();
356            file.size = 0;
357            file.modified = UNIX_EPOCH;
358        }
359    }
360
361    pub fn update_file(&mut self, path: &Path) {
362        self.remove_file(path);
363
364        let metadata = match fs::metadata(path) {
365            Ok(metadata) if metadata.is_file() => metadata,
366            _ => return,
367        };
368
369        if is_binary_path(path, metadata.len()) {
370            self.track_unindexed_file(path, &metadata);
371            return;
372        }
373
374        if metadata.len() > self.max_file_size {
375            self.track_unindexed_file(path, &metadata);
376            return;
377        }
378
379        let content = match fs::read(path) {
380            Ok(content) => content,
381            Err(_) => return,
382        };
383
384        if is_binary_bytes(&content) {
385            self.track_unindexed_file(path, &metadata);
386            return;
387        }
388
389        self.index_file(path, &content);
390    }
391
392    pub fn grep(
393        &self,
394        pattern: &str,
395        case_sensitive: bool,
396        include: &[String],
397        exclude: &[String],
398        search_root: &Path,
399        max_results: usize,
400    ) -> GrepResult {
401        self.search_grep(
402            pattern,
403            case_sensitive,
404            include,
405            exclude,
406            search_root,
407            max_results,
408        )
409    }
410
411    pub fn search_grep(
412        &self,
413        pattern: &str,
414        case_sensitive: bool,
415        include: &[String],
416        exclude: &[String],
417        search_root: &Path,
418        max_results: usize,
419    ) -> GrepResult {
420        // Detect if pattern is a plain literal (no regex metacharacters).
421        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
422        let is_literal = !pattern.chars().any(|c| {
423            matches!(
424                c,
425                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
426            )
427        });
428
429        let literal_search = if is_literal {
430            if case_sensitive {
431                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
432            } else if pattern.is_ascii() {
433                Some(LiteralSearch::AsciiCaseInsensitive(
434                    pattern
435                        .as_bytes()
436                        .iter()
437                        .map(|byte| byte.to_ascii_lowercase())
438                        .collect(),
439                ))
440            } else {
441                None
442            }
443        } else {
444            None
445        };
446
447        // Build the regex for non-literal patterns (or literal Unicode fallback).
448        let regex = if literal_search.is_some() {
449            None
450        } else {
451            let regex_pattern = if is_literal {
452                regex::escape(pattern)
453            } else {
454                pattern.to_string()
455            };
456            let mut builder = RegexBuilder::new(&regex_pattern);
457            builder.case_insensitive(!case_sensitive);
458            // Treat `^` and `$` as line anchors (grep semantics), not file anchors.
459            builder.multi_line(true);
460            match builder.build() {
461                Ok(r) => Some(r),
462                Err(_) => {
463                    return GrepResult {
464                        matches: Vec::new(),
465                        total_matches: 0,
466                        files_searched: 0,
467                        files_with_matches: 0,
468                        index_status: if self.ready {
469                            IndexStatus::Ready
470                        } else {
471                            IndexStatus::Building
472                        },
473                        truncated: false,
474                    };
475                }
476            }
477        };
478
479        let matcher = if let Some(literal_search) = literal_search {
480            SearchMatcher::Literal(literal_search)
481        } else {
482            SearchMatcher::Regex(
483                regex.expect("regex should exist when literal matcher is unavailable"),
484            )
485        };
486
487        let filters = match build_path_filters(include, exclude) {
488            Ok(filters) => filters,
489            Err(_) => PathFilters::default(),
490        };
491        let search_root = canonicalize_or_normalize(search_root);
492
493        let query = decompose_regex(pattern);
494        let candidate_ids = self.candidates(&query);
495
496        let candidate_files: Vec<&FileEntry> = candidate_ids
497            .into_iter()
498            .filter_map(|file_id| self.files.get(file_id as usize))
499            .filter(|file| !file.path.as_os_str().is_empty())
500            .filter(|file| is_within_search_root(&search_root, &file.path))
501            .filter(|file| filters.matches(&self.project_root, &file.path))
502            .collect();
503
504        let total_matches = AtomicUsize::new(0);
505        let files_searched = AtomicUsize::new(0);
506        let files_with_matches = AtomicUsize::new(0);
507        let truncated = AtomicBool::new(false);
508        let stop_after = max_results.saturating_mul(2);
509
510        let mut matches = if candidate_files.len() > 10 {
511            candidate_files
512                .par_iter()
513                .map(|file| {
514                    search_candidate_file(
515                        file,
516                        &matcher,
517                        max_results,
518                        stop_after,
519                        &total_matches,
520                        &files_searched,
521                        &files_with_matches,
522                        &truncated,
523                    )
524                })
525                .reduce(Vec::new, |mut left, mut right| {
526                    left.append(&mut right);
527                    left
528                })
529        } else {
530            let mut matches = Vec::new();
531            for file in candidate_files {
532                matches.extend(search_candidate_file(
533                    file,
534                    &matcher,
535                    max_results,
536                    stop_after,
537                    &total_matches,
538                    &files_searched,
539                    &files_with_matches,
540                    &truncated,
541                ));
542
543                if should_stop_search(&truncated, &total_matches, stop_after) {
544                    break;
545                }
546            }
547            matches
548        };
549
550        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
551            self.path_to_id
552                .get(path)
553                .and_then(|file_id| self.files.get(*file_id as usize))
554                .map(|file| file.modified)
555        });
556
557        let matches = matches
558            .into_iter()
559            .map(|matched| GrepMatch {
560                file: matched.file.as_ref().clone(),
561                line: matched.line,
562                column: matched.column,
563                line_text: matched.line_text,
564                match_text: matched.match_text,
565            })
566            .collect();
567
568        GrepResult {
569            total_matches: total_matches.load(Ordering::Relaxed),
570            matches,
571            files_searched: files_searched.load(Ordering::Relaxed),
572            files_with_matches: files_with_matches.load(Ordering::Relaxed),
573            index_status: if self.ready {
574                IndexStatus::Ready
575            } else {
576                IndexStatus::Building
577            },
578            truncated: truncated.load(Ordering::Relaxed),
579        }
580    }
581
582    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
583        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
584            Ok(filters) => filters,
585            Err(_) => return Vec::new(),
586        };
587        let search_root = canonicalize_or_normalize(search_root);
588        let mut entries = self
589            .files
590            .iter()
591            .filter(|file| !file.path.as_os_str().is_empty())
592            .filter(|file| is_within_search_root(&search_root, &file.path))
593            .filter(|file| filters.matches(&self.project_root, &file.path))
594            .map(|file| (file.path.clone(), file.modified))
595            .collect::<Vec<_>>();
596
597        entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
598            right_mtime
599                .cmp(left_mtime)
600                .then_with(|| left_path.cmp(right_path))
601        });
602
603        entries.into_iter().map(|(path, _)| path).collect()
604    }
605
606    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
607        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
608            return self.active_file_ids();
609        }
610
611        let mut and_trigrams = query.and_trigrams.clone();
612        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
613
614        let mut current: Option<Vec<u32>> = None;
615
616        for trigram in and_trigrams {
617            let filter = query.and_filters.get(&trigram).copied();
618            let matches = self.postings_for_trigram(trigram, filter);
619            current = Some(match current.take() {
620                Some(existing) => intersect_sorted_ids(&existing, &matches),
621                None => matches,
622            });
623
624            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
625                break;
626            }
627        }
628
629        let mut current = current.unwrap_or_else(|| self.active_file_ids());
630
631        for (index, group) in query.or_groups.iter().enumerate() {
632            let mut group_matches = Vec::new();
633            let filters = query.or_filters.get(index);
634
635            for trigram in group {
636                let filter = filters.and_then(|filters| filters.get(trigram).copied());
637                let matches = self.postings_for_trigram(*trigram, filter);
638                if group_matches.is_empty() {
639                    group_matches = matches;
640                } else {
641                    group_matches = union_sorted_ids(&group_matches, &matches);
642                }
643            }
644
645            current = intersect_sorted_ids(&current, &group_matches);
646            if current.is_empty() {
647                break;
648            }
649        }
650
651        let mut unindexed = self
652            .unindexed_files
653            .iter()
654            .copied()
655            .filter(|file_id| self.is_active_file(*file_id))
656            .collect::<Vec<_>>();
657        if !unindexed.is_empty() {
658            unindexed.sort_unstable();
659            current = union_sorted_ids(&current, &unindexed);
660        }
661
662        current
663    }
664
665    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
666        if fs::create_dir_all(cache_dir).is_err() {
667            return;
668        }
669
670        let cache_path = cache_dir.join("cache.bin");
671        let tmp_cache = cache_dir.join(format!(
672            "cache.bin.tmp.{}.{}",
673            std::process::id(),
674            SystemTime::now()
675                .duration_since(UNIX_EPOCH)
676                .unwrap_or(Duration::ZERO)
677                .as_nanos()
678        ));
679
680        let active_ids = self.active_file_ids();
681        let mut id_map = HashMap::new();
682        for (new_id, old_id) in active_ids.iter().enumerate() {
683            let Ok(new_id_u32) = u32::try_from(new_id) else {
684                return;
685            };
686            id_map.insert(*old_id, new_id_u32);
687        }
688
689        let write_result = (|| -> std::io::Result<()> {
690            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
691
692            postings_writer.write_all(INDEX_MAGIC)?;
693            write_u32(&mut postings_writer, INDEX_VERSION)?;
694
695            let head = git_head.unwrap_or_default();
696            let root = self.project_root.to_string_lossy();
697            let head_len = u32::try_from(head.len())
698                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
699            let root_len = u32::try_from(root.len())
700                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
701            let file_count = u32::try_from(active_ids.len())
702                .map_err(|_| std::io::Error::other("too many files to cache"))?;
703
704            write_u32(&mut postings_writer, head_len)?;
705            write_u32(&mut postings_writer, root_len)?;
706            write_u64(&mut postings_writer, self.max_file_size)?;
707            write_u32(&mut postings_writer, file_count)?;
708            postings_writer.write_all(head.as_bytes())?;
709            postings_writer.write_all(root.as_bytes())?;
710
711            for old_id in &active_ids {
712                let Some(file) = self.files.get(*old_id as usize) else {
713                    return Err(std::io::Error::other("missing file entry for cache write"));
714                };
715                let path = relative_to_root(&self.project_root, &file.path);
716                let path = path.to_string_lossy();
717                let path_len = u32::try_from(path.len())
718                    .map_err(|_| std::io::Error::other("cached path too large"))?;
719                let modified = file
720                    .modified
721                    .duration_since(UNIX_EPOCH)
722                    .unwrap_or(Duration::ZERO);
723                let unindexed = if self.unindexed_files.contains(old_id) {
724                    1u8
725                } else {
726                    0u8
727                };
728
729                postings_writer.write_all(&[unindexed])?;
730                write_u32(&mut postings_writer, path_len)?;
731                write_u64(&mut postings_writer, file.size)?;
732                write_u64(&mut postings_writer, modified.as_secs())?;
733                write_u32(&mut postings_writer, modified.subsec_nanos())?;
734                postings_writer.write_all(path.as_bytes())?;
735            }
736
737            let mut lookup_entries = Vec::new();
738            let mut postings_blob = Vec::new();
739            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
740            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
741
742            for (trigram, postings) in sorted_postings {
743                let offset = u64::try_from(postings_blob.len())
744                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
745                let mut count = 0u32;
746
747                for posting in postings {
748                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
749                        continue;
750                    };
751
752                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
753                    postings_blob.push(posting.next_mask);
754                    postings_blob.push(posting.loc_mask);
755                    count = count.saturating_add(1);
756                }
757
758                if count > 0 {
759                    lookup_entries.push((*trigram, offset, count));
760                }
761            }
762
763            write_u64(
764                &mut postings_writer,
765                u64::try_from(postings_blob.len())
766                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
767            )?;
768            postings_writer.write_all(&postings_blob)?;
769            postings_writer.flush()?;
770            let mut postings_blob_file = postings_writer
771                .into_inner()
772                .map_err(|error| std::io::Error::other(error.to_string()))?
773                .into_inner();
774            let checksum = crc32fast::hash(&postings_blob_file);
775            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
776
777            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
778            let entry_count = u32::try_from(lookup_entries.len())
779                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
780
781            lookup_writer.write_all(LOOKUP_MAGIC)?;
782            write_u32(&mut lookup_writer, INDEX_VERSION)?;
783            write_u32(&mut lookup_writer, entry_count)?;
784
785            for (trigram, offset, count) in lookup_entries {
786                write_u32(&mut lookup_writer, trigram)?;
787                write_u64(&mut lookup_writer, offset)?;
788                write_u32(&mut lookup_writer, count)?;
789            }
790
791            lookup_writer.flush()?;
792            let mut lookup_blob_file = lookup_writer
793                .into_inner()
794                .map_err(|error| std::io::Error::other(error.to_string()))?
795                .into_inner();
796            let checksum = crc32fast::hash(&lookup_blob_file);
797            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
798
799            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
800            write_u32(&mut cache_writer, CACHE_MAGIC)?;
801            write_u32(&mut cache_writer, INDEX_VERSION)?;
802            write_u64(
803                &mut cache_writer,
804                u64::try_from(postings_blob_file.len())
805                    .map_err(|_| std::io::Error::other("postings section too large"))?,
806            )?;
807            cache_writer.write_all(&postings_blob_file)?;
808            cache_writer.write_all(&lookup_blob_file)?;
809            cache_writer.flush()?;
810            cache_writer.get_ref().sync_all()?;
811            drop(cache_writer);
812            fs::rename(&tmp_cache, &cache_path)?;
813
814            Ok(())
815        })();
816
817        if write_result.is_err() {
818            let _ = fs::remove_file(&tmp_cache);
819        }
820    }
821
822    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
823        let cache_path = cache_dir.join("cache.bin");
824        let cache_bytes = fs::read(&cache_path).ok()?;
825        if cache_bytes.len() < 16 {
826            return None;
827        }
828        let mut header = Cursor::new(&cache_bytes);
829        if read_u32(&mut header).ok()? != CACHE_MAGIC {
830            return None;
831        }
832        if read_u32(&mut header).ok()? != INDEX_VERSION {
833            return None;
834        }
835        let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
836        let start = usize::try_from(header.position()).ok()?;
837        let postings_end = start.checked_add(postings_len_total)?;
838        if postings_end > cache_bytes.len() {
839            return None;
840        }
841        let postings_bytes = &cache_bytes[start..postings_end];
842        let lookup_bytes = &cache_bytes[postings_end..];
843        let lookup_len_total = lookup_bytes.len();
844        let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
845        let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
846        if postings_len_total < 4 || lookup_len_total < 4 {
847            return None;
848        }
849        verify_crc32_bytes_slice(postings_bytes).ok()?;
850        verify_crc32_bytes_slice(lookup_bytes).ok()?;
851
852        let mut magic = [0u8; 8];
853        postings_reader.read_exact(&mut magic).ok()?;
854        if &magic != INDEX_MAGIC {
855            return None;
856        }
857        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
858            return None;
859        }
860
861        let head_len = read_u32(&mut postings_reader).ok()? as usize;
862        let root_len = read_u32(&mut postings_reader).ok()? as usize;
863        let max_file_size = read_u64(&mut postings_reader).ok()?;
864        let file_count = read_u32(&mut postings_reader).ok()? as usize;
865        if file_count > MAX_ENTRIES {
866            return None;
867        }
868        let postings_body_len = postings_len_total.checked_sub(4)?;
869        let lookup_body_len = lookup_len_total.checked_sub(4)?;
870
871        let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
872        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
873        if minimum_file_bytes > remaining_postings {
874            return None;
875        }
876
877        if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
878            return None;
879        }
880        let mut head_bytes = vec![0u8; head_len];
881        postings_reader.read_exact(&mut head_bytes).ok()?;
882        let git_head = String::from_utf8(head_bytes)
883            .ok()
884            .filter(|head| !head.is_empty());
885
886        if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
887            return None;
888        }
889        let mut root_bytes = vec![0u8; root_len];
890        postings_reader.read_exact(&mut root_bytes).ok()?;
891        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
892
893        let mut files = Vec::with_capacity(file_count);
894        let mut path_to_id = HashMap::new();
895        let mut unindexed_files = HashSet::new();
896
897        for file_id in 0..file_count {
898            let mut unindexed = [0u8; 1];
899            postings_reader.read_exact(&mut unindexed).ok()?;
900            let path_len = read_u32(&mut postings_reader).ok()? as usize;
901            let size = read_u64(&mut postings_reader).ok()?;
902            let secs = read_u64(&mut postings_reader).ok()?;
903            let nanos = read_u32(&mut postings_reader).ok()?;
904            if nanos >= 1_000_000_000 {
905                return None;
906            }
907            if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
908                return None;
909            }
910            let mut path_bytes = vec![0u8; path_len];
911            postings_reader.read_exact(&mut path_bytes).ok()?;
912            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
913            let full_path = project_root.join(relative_path);
914            let file_id_u32 = u32::try_from(file_id).ok()?;
915
916            files.push(FileEntry {
917                path: full_path.clone(),
918                size,
919                modified: UNIX_EPOCH + Duration::new(secs, nanos),
920            });
921            path_to_id.insert(full_path, file_id_u32);
922            if unindexed[0] == 1 {
923                unindexed_files.insert(file_id_u32);
924            }
925        }
926
927        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
928        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
929        if postings_len > max_postings_bytes {
930            return None;
931        }
932        if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
933            return None;
934        }
935        let mut postings_blob = vec![0u8; postings_len];
936        postings_reader.read_exact(&mut postings_blob).ok()?;
937
938        let mut lookup_magic = [0u8; 8];
939        lookup_reader.read_exact(&mut lookup_magic).ok()?;
940        if &lookup_magic != LOOKUP_MAGIC {
941            return None;
942        }
943        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
944            return None;
945        }
946        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
947        if entry_count > MAX_ENTRIES {
948            return None;
949        }
950        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
951        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
952        if minimum_lookup_bytes > remaining_lookup {
953            return None;
954        }
955
956        let mut postings = HashMap::new();
957        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
958
959        for _ in 0..entry_count {
960            let trigram = read_u32(&mut lookup_reader).ok()?;
961            let offset = read_u64(&mut lookup_reader).ok()? as usize;
962            let count = read_u32(&mut lookup_reader).ok()? as usize;
963            if count > MAX_ENTRIES {
964                return None;
965            }
966            let bytes_len = count.checked_mul(POSTING_BYTES)?;
967            let end = offset.checked_add(bytes_len)?;
968            if end > postings_blob.len() {
969                return None;
970            }
971
972            let mut trigram_postings = Vec::with_capacity(count);
973            for chunk in postings_blob[offset..end].chunks_exact(6) {
974                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
975                let posting = Posting {
976                    file_id,
977                    next_mask: chunk[4],
978                    loc_mask: chunk[5],
979                };
980                trigram_postings.push(posting.clone());
981                file_trigrams.entry(file_id).or_default().push(trigram);
982            }
983            postings.insert(trigram, trigram_postings);
984        }
985
986        Some(SearchIndex {
987            postings,
988            files,
989            path_to_id,
990            ready: true,
991            project_root,
992            git_head,
993            max_file_size,
994            file_trigrams,
995            unindexed_files,
996        })
997    }
998
999    pub(crate) fn stored_git_head(&self) -> Option<&str> {
1000        self.git_head.as_deref()
1001    }
1002
1003    pub(crate) fn set_ready(&mut self, ready: bool) {
1004        self.ready = ready;
1005    }
1006
1007    pub(crate) fn rebuild_or_refresh(
1008        root: &Path,
1009        max_file_size: u64,
1010        current_head: Option<String>,
1011        baseline: Option<SearchIndex>,
1012    ) -> Self {
1013        if let Some(mut baseline) = baseline {
1014            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1015            baseline.max_file_size = max_file_size;
1016
1017            if baseline.git_head == current_head || current_head.is_none() {
1018                // HEAD matches, but files may have changed on disk since the index was
1019                // last written (e.g., uncommitted edits, stash pop, manual file changes
1020                // while OpenCode was closed). Verify mtimes and re-index stale files.
1021                // Non-git projects also use this per-file (path, mtime, size)
1022                // fingerprint so unchanged trees reuse the disk cache instead of
1023                // rebuilding every configure.
1024                baseline.git_head = current_head;
1025                verify_file_mtimes(&mut baseline);
1026                baseline.ready = true;
1027                return baseline;
1028            }
1029
1030            if let (Some(previous), Some(current)) =
1031                (baseline.git_head.clone(), current_head.clone())
1032            {
1033                let project_root = baseline.project_root.clone();
1034                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
1035                    baseline.git_head = Some(current);
1036                    baseline.ready = true;
1037                    return baseline;
1038                }
1039            }
1040        }
1041
1042        SearchIndex::build_with_limit(root, max_file_size)
1043    }
1044
1045    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1046        let file_id = u32::try_from(self.files.len()).ok()?;
1047        let metadata = fs::metadata(path).ok();
1048        let size = metadata
1049            .as_ref()
1050            .map_or(size_hint, |metadata| metadata.len());
1051        let modified = metadata
1052            .and_then(|metadata| metadata.modified().ok())
1053            .unwrap_or(UNIX_EPOCH);
1054
1055        self.files.push(FileEntry {
1056            path: path.to_path_buf(),
1057            size,
1058            modified,
1059        });
1060        self.path_to_id.insert(path.to_path_buf(), file_id);
1061        Some(file_id)
1062    }
1063
1064    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1065        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1066            return;
1067        };
1068        self.unindexed_files.insert(file_id);
1069        self.file_trigrams.insert(file_id, Vec::new());
1070    }
1071
1072    fn active_file_ids(&self) -> Vec<u32> {
1073        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1074        ids.sort_unstable();
1075        ids
1076    }
1077
1078    fn is_active_file(&self, file_id: u32) -> bool {
1079        self.files
1080            .get(file_id as usize)
1081            .map(|file| !file.path.as_os_str().is_empty())
1082            .unwrap_or(false)
1083    }
1084
1085    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1086        let Some(postings) = self.postings.get(&trigram) else {
1087            return Vec::new();
1088        };
1089
1090        let mut matches = Vec::with_capacity(postings.len());
1091
1092        for posting in postings {
1093            if let Some(filter) = filter {
1094                // next_mask: bloom filter check — the character following this trigram in the
1095                // query must also appear after this trigram somewhere in the file.
1096                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1097                    continue;
1098                }
1099                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
1100                // between consecutive trigram pairs, but is NOT used as a single-trigram
1101                // filter because the position in the query string has no relationship to
1102                // the position in the file. Using it here causes false negatives.
1103            }
1104            if self.is_active_file(posting.file_id) {
1105                matches.push(posting.file_id);
1106            }
1107        }
1108
1109        matches
1110    }
1111}
1112
1113fn search_candidate_file(
1114    file: &FileEntry,
1115    matcher: &SearchMatcher,
1116    max_results: usize,
1117    stop_after: usize,
1118    total_matches: &AtomicUsize,
1119    files_searched: &AtomicUsize,
1120    files_with_matches: &AtomicUsize,
1121    truncated: &AtomicBool,
1122) -> Vec<SharedGrepMatch> {
1123    if should_stop_search(truncated, total_matches, stop_after) {
1124        return Vec::new();
1125    }
1126
1127    let content = match read_indexed_file_bytes(&file.path) {
1128        Some(content) => content,
1129        None => return Vec::new(),
1130    };
1131    // Defense in depth: even though indexing tries to filter binaries via
1132    // `is_binary_path` + full-content `is_binary_bytes`, we double-check at
1133    // query time. content_inspector is fast (~bytes-per-cycle on a small
1134    // preview) and this guarantees we never surface matches inside binary
1135    // files even if the indexer somehow let one through (e.g. file changed
1136    // between indexing and query).
1137    if is_binary_bytes(&content) {
1138        return Vec::new();
1139    }
1140    files_searched.fetch_add(1, Ordering::Relaxed);
1141
1142    let shared_path = Arc::new(file.path.clone());
1143    let mut matches = Vec::new();
1144    let mut line_starts = None;
1145    let mut seen_lines = HashSet::new();
1146    let mut matched_this_file = false;
1147
1148    match matcher {
1149        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1150            let finder = memchr::memmem::Finder::new(needle);
1151            let mut start = 0;
1152
1153            while let Some(position) = finder.find(&content[start..]) {
1154                if should_stop_search(truncated, total_matches, stop_after) {
1155                    break;
1156                }
1157
1158                let offset = start + position;
1159                start = offset + 1;
1160
1161                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1162                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1163                if !seen_lines.insert(line) {
1164                    continue;
1165                }
1166
1167                matched_this_file = true;
1168                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1169                if match_number > max_results {
1170                    truncated.store(true, Ordering::Relaxed);
1171                    break;
1172                }
1173
1174                let end = offset + needle.len();
1175                matches.push(SharedGrepMatch {
1176                    file: shared_path.clone(),
1177                    line,
1178                    column,
1179                    line_text,
1180                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1181                });
1182            }
1183        }
1184        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1185            let search_content = content.to_ascii_lowercase();
1186            let finder = memchr::memmem::Finder::new(needle);
1187            let mut start = 0;
1188
1189            while let Some(position) = finder.find(&search_content[start..]) {
1190                if should_stop_search(truncated, total_matches, stop_after) {
1191                    break;
1192                }
1193
1194                let offset = start + position;
1195                start = offset + 1;
1196
1197                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1198                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1199                if !seen_lines.insert(line) {
1200                    continue;
1201                }
1202
1203                matched_this_file = true;
1204                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1205                if match_number > max_results {
1206                    truncated.store(true, Ordering::Relaxed);
1207                    break;
1208                }
1209
1210                let end = offset + needle.len();
1211                matches.push(SharedGrepMatch {
1212                    file: shared_path.clone(),
1213                    line,
1214                    column,
1215                    line_text,
1216                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1217                });
1218            }
1219        }
1220        SearchMatcher::Regex(regex) => {
1221            for matched in regex.find_iter(&content) {
1222                if should_stop_search(truncated, total_matches, stop_after) {
1223                    break;
1224                }
1225
1226                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1227                let (line, column, line_text) =
1228                    line_details_bytes(&content, line_starts, matched.start());
1229                if !seen_lines.insert(line) {
1230                    continue;
1231                }
1232
1233                matched_this_file = true;
1234                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1235                if match_number > max_results {
1236                    truncated.store(true, Ordering::Relaxed);
1237                    break;
1238                }
1239
1240                matches.push(SharedGrepMatch {
1241                    file: shared_path.clone(),
1242                    line,
1243                    column,
1244                    line_text,
1245                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1246                });
1247            }
1248        }
1249    }
1250
1251    if matched_this_file {
1252        files_with_matches.fetch_add(1, Ordering::Relaxed);
1253    }
1254
1255    matches
1256}
1257
1258fn should_stop_search(
1259    truncated: &AtomicBool,
1260    total_matches: &AtomicUsize,
1261    stop_after: usize,
1262) -> bool {
1263    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1264}
1265
1266fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1267    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1268    let mut left_index = 0;
1269    let mut right_index = 0;
1270
1271    while left_index < left.len() && right_index < right.len() {
1272        match left[left_index].cmp(&right[right_index]) {
1273            std::cmp::Ordering::Less => left_index += 1,
1274            std::cmp::Ordering::Greater => right_index += 1,
1275            std::cmp::Ordering::Equal => {
1276                merged.push(left[left_index]);
1277                left_index += 1;
1278                right_index += 1;
1279            }
1280        }
1281    }
1282
1283    merged
1284}
1285
1286fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1287    let mut merged = Vec::with_capacity(left.len() + right.len());
1288    let mut left_index = 0;
1289    let mut right_index = 0;
1290
1291    while left_index < left.len() && right_index < right.len() {
1292        match left[left_index].cmp(&right[right_index]) {
1293            std::cmp::Ordering::Less => {
1294                merged.push(left[left_index]);
1295                left_index += 1;
1296            }
1297            std::cmp::Ordering::Greater => {
1298                merged.push(right[right_index]);
1299                right_index += 1;
1300            }
1301            std::cmp::Ordering::Equal => {
1302                merged.push(left[left_index]);
1303                left_index += 1;
1304                right_index += 1;
1305            }
1306        }
1307    }
1308
1309    merged.extend_from_slice(&left[left_index..]);
1310    merged.extend_from_slice(&right[right_index..]);
1311    merged
1312}
1313
1314pub fn decompose_regex(pattern: &str) -> RegexQuery {
1315    let hir = match regex_syntax::parse(pattern) {
1316        Ok(hir) => hir,
1317        Err(_) => return RegexQuery::default(),
1318    };
1319
1320    let build = build_query(&hir);
1321    build.into_query()
1322}
1323
1324pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1325    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1326}
1327
1328pub fn normalize_char(c: u8) -> u8 {
1329    c.to_ascii_lowercase()
1330}
1331
1332pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1333    if content.len() < 3 {
1334        return Vec::new();
1335    }
1336
1337    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1338    for start in 0..=content.len() - 3 {
1339        let trigram = pack_trigram(
1340            normalize_char(content[start]),
1341            normalize_char(content[start + 1]),
1342            normalize_char(content[start + 2]),
1343        );
1344        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1345        trigrams.push((trigram, next_char, start));
1346    }
1347    trigrams
1348}
1349
1350pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1351    let mut seen = HashSet::new();
1352    let mut out = Vec::new();
1353    for token in tokens {
1354        for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1355            if seen.insert(trigram) {
1356                out.push(trigram);
1357            }
1358        }
1359    }
1360    out
1361}
1362
1363pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1364    if query_trigrams.is_empty() {
1365        return 0.0;
1366    }
1367
1368    let mut hits = 0u32;
1369    for &trigram in query_trigrams {
1370        if let Some(postings) = index.postings.get(&trigram) {
1371            if postings
1372                .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1373                .is_ok()
1374            {
1375                hits += 1;
1376            }
1377        }
1378    }
1379
1380    if hits == 0 {
1381        return 0.0;
1382    }
1383
1384    let file_trigram_count = index
1385        .file_trigrams
1386        .get(&file_id)
1387        .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1388    (hits as f32) / (1.0 + file_trigram_count.ln())
1389}
1390
1391pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1392    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1393    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1394        return PathBuf::from(override_dir)
1395            .join("index")
1396            .join(project_cache_key(project_root));
1397    }
1398    // Use configured storage dir (from plugin, XDG-compliant)
1399    if let Some(dir) = storage_dir {
1400        return dir.join("index").join(project_cache_key(project_root));
1401    }
1402    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage).
1403    // On Windows `HOME` is typically unset, so try `USERPROFILE` next.
1404    // If neither is set, fall back to a temp directory rather than `"."`
1405    // because the search-index code reads/writes absolute paths.
1406    let home = std::env::var_os("HOME")
1407        .or_else(|| std::env::var_os("USERPROFILE"))
1408        .map(PathBuf::from)
1409        .unwrap_or_else(std::env::temp_dir);
1410    home.join(".cache")
1411        .join("aft")
1412        .join("index")
1413        .join(project_cache_key(project_root))
1414}
1415
1416pub(crate) fn build_path_filters(
1417    include: &[String],
1418    exclude: &[String],
1419) -> Result<PathFilters, String> {
1420    Ok(PathFilters {
1421        includes: build_globset(include)?,
1422        excludes: build_globset(exclude)?,
1423    })
1424}
1425
1426pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1427    walk_project_files_from(root, root, filters)
1428}
1429
1430pub(crate) fn walk_project_files_from(
1431    filter_root: &Path,
1432    search_root: &Path,
1433    filters: &PathFilters,
1434) -> Vec<PathBuf> {
1435    let mut builder = WalkBuilder::new(search_root);
1436    builder
1437        .hidden(false)
1438        .git_ignore(true)
1439        .git_global(true)
1440        .git_exclude(true)
1441        .filter_entry(|entry| {
1442            let name = entry.file_name().to_string_lossy();
1443            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1444                return !matches!(
1445                    name.as_ref(),
1446                    "node_modules"
1447                        | "target"
1448                        | "venv"
1449                        | ".venv"
1450                        | ".git"
1451                        | "__pycache__"
1452                        | ".tox"
1453                        | "dist"
1454                        | "build"
1455                );
1456            }
1457            true
1458        });
1459
1460    let mut files = Vec::new();
1461    for entry in builder.build().filter_map(|entry| entry.ok()) {
1462        if !entry
1463            .file_type()
1464            .map_or(false, |file_type| file_type.is_file())
1465        {
1466            continue;
1467        }
1468        let path = entry.into_path();
1469        if filters.matches(filter_root, &path) {
1470            files.push(path);
1471        }
1472    }
1473
1474    sort_paths_by_mtime_desc(&mut files);
1475    files
1476}
1477
1478pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1479    let bytes = fs::read(path).ok()?;
1480    if is_binary_bytes(&bytes) {
1481        return None;
1482    }
1483    String::from_utf8(bytes).ok()
1484}
1485
1486fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1487    fs::read(path).ok()
1488}
1489
1490pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1491    path.strip_prefix(root)
1492        .map(PathBuf::from)
1493        .unwrap_or_else(|_| path.to_path_buf())
1494}
1495
1496/// Sort paths newest-first by mtime, falling back to lexicographic order.
1497///
1498/// Pre-v0.15.2 this called `path_modified_time(...)` directly inside the
1499/// `sort_by()` closure. That made the comparator non-deterministic — a
1500/// `stat()` syscall for the same path can return different values across
1501/// invocations (file edited mid-sort, file deleted, OS clock adjustments,
1502/// concurrent file-watcher activity), and Rust's slice::sort panics at
1503/// runtime when it detects a non-total-order comparator. CI hit this on
1504/// a Pi e2e test where the bridge invalidated files in parallel with grep.
1505///
1506/// Fix: snapshot mtimes ONCE into a HashMap before sorting, then look up
1507/// from the map inside the closure. Pure function ⇒ guaranteed total order.
1508pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1509    use std::collections::HashMap;
1510    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1511    for path in paths.iter() {
1512        mtimes
1513            .entry(path.clone())
1514            .or_insert_with(|| path_modified_time(path));
1515    }
1516    paths.sort_by(|left, right| {
1517        let left_mtime = mtimes.get(left).and_then(|v| *v);
1518        let right_mtime = mtimes.get(right).and_then(|v| *v);
1519        right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1520    });
1521}
1522
1523/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1524/// the sort. Same fix, applied to grep matches that share files.
1525pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1526    use std::collections::HashMap;
1527    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1528    for m in matches.iter() {
1529        mtimes.entry(m.file.clone()).or_insert_with(|| {
1530            let resolved = resolve_match_path(project_root, &m.file);
1531            path_modified_time(&resolved)
1532        });
1533    }
1534    matches.sort_by(|left, right| {
1535        let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1536        let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1537        right_mtime
1538            .cmp(&left_mtime)
1539            .then_with(|| left.file.cmp(&right.file))
1540            .then_with(|| left.line.cmp(&right.line))
1541            .then_with(|| left.column.cmp(&right.column))
1542    });
1543}
1544
1545/// See `sort_paths_by_mtime_desc` for why mtimes are snapshotted ahead of
1546/// the sort. The cached lookup function `modified_for_path` is fast (in-memory
1547/// table from the search index), but it can still return different values if
1548/// the file is modified mid-sort. Snapshot once.
1549fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1550    matches: &mut [SharedGrepMatch],
1551    modified_for_path: F,
1552) where
1553    F: Fn(&Path) -> Option<SystemTime>,
1554{
1555    use std::collections::HashMap;
1556    let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1557    for m in matches.iter() {
1558        let path = m.file.as_path().to_path_buf();
1559        mtimes
1560            .entry(path.clone())
1561            .or_insert_with(|| modified_for_path(&path));
1562    }
1563    matches.sort_by(|left, right| {
1564        let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1565        let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1566        right_mtime
1567            .cmp(&left_mtime)
1568            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1569            .then_with(|| left.line.cmp(&right.line))
1570            .then_with(|| left.column.cmp(&right.column))
1571    });
1572}
1573
1574pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1575    let resolved_project_root = canonicalize_or_normalize(project_root);
1576    let root = match path {
1577        Some(path) => {
1578            let path = PathBuf::from(path);
1579            if path.is_absolute() {
1580                canonicalize_or_normalize(&path)
1581            } else {
1582                normalize_path(&resolved_project_root.join(path))
1583            }
1584        }
1585        None => resolved_project_root.clone(),
1586    };
1587
1588    let use_index = is_within_search_root(&resolved_project_root, &root);
1589    SearchScope { root, use_index }
1590}
1591
1592pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1593    content_inspector::inspect(content).is_binary()
1594}
1595
1596pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1597    run_git(root, &["rev-parse", "HEAD"])
1598}
1599
1600pub fn project_cache_key(project_root: &Path) -> String {
1601    use sha2::{Digest, Sha256};
1602
1603    let mut hasher = Sha256::new();
1604
1605    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1606        // Git repo: root commit is the unique identity.
1607        // Same repo cloned anywhere produces the same key.
1608        hasher.update(root_commit.as_bytes());
1609    } else {
1610        // Non-git project: use the canonical filesystem path as identity.
1611        let canonical_root = canonicalize_or_normalize(project_root);
1612        hasher.update(canonical_root.to_string_lossy().as_bytes());
1613    }
1614
1615    let digest = format!("{:x}", hasher.finalize());
1616    digest[..16].to_string()
1617}
1618
1619impl PathFilters {
1620    fn matches(&self, root: &Path, path: &Path) -> bool {
1621        let relative = to_glob_path(&relative_to_root(root, path));
1622        if self
1623            .includes
1624            .as_ref()
1625            .is_some_and(|includes| !includes.is_match(&relative))
1626        {
1627            return false;
1628        }
1629        if self
1630            .excludes
1631            .as_ref()
1632            .is_some_and(|excludes| excludes.is_match(&relative))
1633        {
1634            return false;
1635        }
1636        true
1637    }
1638}
1639
1640fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1641    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1642}
1643
1644fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1645    if path.is_absolute() {
1646        path.to_path_buf()
1647    } else {
1648        project_root.join(path)
1649    }
1650}
1651
1652fn path_modified_time(path: &Path) -> Option<SystemTime> {
1653    fs::metadata(path)
1654        .and_then(|metadata| metadata.modified())
1655        .ok()
1656}
1657
1658fn normalize_path(path: &Path) -> PathBuf {
1659    let mut result = PathBuf::new();
1660    for component in path.components() {
1661        match component {
1662            Component::ParentDir => {
1663                if !result.pop() {
1664                    result.push(component);
1665                }
1666            }
1667            Component::CurDir => {}
1668            _ => result.push(component),
1669        }
1670    }
1671    result
1672}
1673
1674/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1675/// since the index was last written. Also detect new files and deleted files.
1676fn verify_file_mtimes(index: &mut SearchIndex) {
1677    // Collect stale files (mtime mismatch or deleted)
1678    let mut stale_paths = Vec::new();
1679    for entry in &index.files {
1680        if entry.path.as_os_str().is_empty() {
1681            continue; // tombstoned entry
1682        }
1683        match fs::metadata(&entry.path) {
1684            Ok(meta) => {
1685                let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1686                if current_mtime != entry.modified || meta.len() != entry.size {
1687                    stale_paths.push(entry.path.clone());
1688                }
1689            }
1690            Err(_) => {
1691                // File deleted
1692                stale_paths.push(entry.path.clone());
1693            }
1694        }
1695    }
1696
1697    // Re-index stale files
1698    for path in &stale_paths {
1699        index.update_file(path);
1700    }
1701
1702    // Detect new files not in the index
1703    let filters = PathFilters::default();
1704    for path in walk_project_files(&index.project_root, &filters) {
1705        if !index.path_to_id.contains_key(&path) {
1706            index.update_file(&path);
1707        }
1708    }
1709
1710    if !stale_paths.is_empty() {
1711        crate::slog_info!(
1712            "search index: refreshed {} stale file(s) from disk cache",
1713            stale_paths.len()
1714        );
1715    }
1716}
1717
1718fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1719    path.starts_with(search_root)
1720}
1721
1722impl QueryBuild {
1723    fn into_query(self) -> RegexQuery {
1724        let mut query = RegexQuery::default();
1725
1726        for run in self.and_runs {
1727            add_run_to_and_query(&mut query, &run);
1728        }
1729
1730        for group in self.or_groups {
1731            let mut trigrams = BTreeSet::new();
1732            let mut filters = HashMap::new();
1733            for run in group {
1734                for (trigram, filter) in trigram_filters(&run) {
1735                    trigrams.insert(trigram);
1736                    merge_filter(filters.entry(trigram).or_default(), filter);
1737                }
1738            }
1739            if !trigrams.is_empty() {
1740                query.or_groups.push(trigrams.into_iter().collect());
1741                query.or_filters.push(filters);
1742            }
1743        }
1744
1745        query
1746    }
1747}
1748
1749fn build_query(hir: &Hir) -> QueryBuild {
1750    match hir.kind() {
1751        HirKind::Literal(literal) => {
1752            if literal.0.len() >= 3 {
1753                QueryBuild {
1754                    and_runs: vec![literal.0.to_vec()],
1755                    or_groups: Vec::new(),
1756                }
1757            } else {
1758                QueryBuild::default()
1759            }
1760        }
1761        HirKind::Capture(capture) => build_query(&capture.sub),
1762        HirKind::Concat(parts) => {
1763            let mut build = QueryBuild::default();
1764            for part in parts {
1765                let part_build = build_query(part);
1766                build.and_runs.extend(part_build.and_runs);
1767                build.or_groups.extend(part_build.or_groups);
1768            }
1769            build
1770        }
1771        HirKind::Alternation(parts) => {
1772            let mut group = Vec::new();
1773            for part in parts {
1774                let Some(mut choices) = guaranteed_run_choices(part) else {
1775                    return QueryBuild::default();
1776                };
1777                group.append(&mut choices);
1778            }
1779            if group.is_empty() {
1780                QueryBuild::default()
1781            } else {
1782                QueryBuild {
1783                    and_runs: Vec::new(),
1784                    or_groups: vec![group],
1785                }
1786            }
1787        }
1788        HirKind::Repetition(repetition) => {
1789            if repetition.min == 0 {
1790                QueryBuild::default()
1791            } else {
1792                build_query(&repetition.sub)
1793            }
1794        }
1795        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1796    }
1797}
1798
1799fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1800    match hir.kind() {
1801        HirKind::Literal(literal) => {
1802            if literal.0.len() >= 3 {
1803                Some(vec![literal.0.to_vec()])
1804            } else {
1805                None
1806            }
1807        }
1808        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1809        HirKind::Concat(parts) => {
1810            let mut runs = Vec::new();
1811            for part in parts {
1812                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1813                    runs.append(&mut part_runs);
1814                }
1815            }
1816            if runs.is_empty() {
1817                None
1818            } else {
1819                Some(runs)
1820            }
1821        }
1822        HirKind::Alternation(parts) => {
1823            let mut runs = Vec::new();
1824            for part in parts {
1825                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1826                    return None;
1827                };
1828                runs.append(&mut part_runs);
1829            }
1830            if runs.is_empty() {
1831                None
1832            } else {
1833                Some(runs)
1834            }
1835        }
1836        HirKind::Repetition(repetition) => {
1837            if repetition.min == 0 {
1838                None
1839            } else {
1840                guaranteed_run_choices(&repetition.sub)
1841            }
1842        }
1843        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1844    }
1845}
1846
1847fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1848    for (trigram, filter) in trigram_filters(run) {
1849        if !query.and_trigrams.contains(&trigram) {
1850            query.and_trigrams.push(trigram);
1851        }
1852        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1853    }
1854}
1855
1856fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1857    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1858    for (trigram, next_char, position) in extract_trigrams(run) {
1859        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1860        if next_char != EOF_SENTINEL {
1861            entry.next_mask |= mask_for_next_char(next_char);
1862        }
1863        entry.loc_mask |= mask_for_position(position);
1864    }
1865    filters.into_iter().collect()
1866}
1867
1868fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1869    target.next_mask |= filter.next_mask;
1870    target.loc_mask |= filter.loc_mask;
1871}
1872
1873fn mask_for_next_char(next_char: u8) -> u8 {
1874    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1875    1u8 << bit
1876}
1877
1878fn mask_for_position(position: usize) -> u8 {
1879    1u8 << (position % 8)
1880}
1881
1882fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1883    if patterns.is_empty() {
1884        return Ok(None);
1885    }
1886
1887    let mut builder = GlobSetBuilder::new();
1888    for pattern in patterns {
1889        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1890        builder.add(glob);
1891    }
1892    builder.build().map(Some).map_err(|error| error.to_string())
1893}
1894
1895fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1896    let mut buffer = [0u8; 4];
1897    reader.read_exact(&mut buffer)?;
1898    Ok(u32::from_le_bytes(buffer))
1899}
1900
1901fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1902    let mut buffer = [0u8; 8];
1903    reader.read_exact(&mut buffer)?;
1904    Ok(u64::from_le_bytes(buffer))
1905}
1906
1907fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1908    writer.write_all(&value.to_le_bytes())
1909}
1910
1911fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1912    writer.write_all(&value.to_le_bytes())
1913}
1914
1915fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1916    let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1917        return Err(std::io::Error::other("search index checksum missing"));
1918    };
1919    let expected = u32::from_le_bytes(*stored);
1920    let actual = crc32fast::hash(body);
1921    if actual != expected {
1922        return Err(std::io::Error::other("search index checksum mismatch"));
1923    }
1924    Ok(())
1925}
1926
1927fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
1928    let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
1929    total_len.checked_sub(pos)
1930}
1931
1932fn run_git(root: &Path, args: &[&str]) -> Option<String> {
1933    let output = Command::new("git")
1934        .arg("-C")
1935        .arg(root)
1936        .args(args)
1937        .output()
1938        .ok()?;
1939    if !output.status.success() {
1940        return None;
1941    }
1942    let value = String::from_utf8(output.stdout).ok()?;
1943    let value = value.trim().to_string();
1944    if value.is_empty() {
1945        None
1946    } else {
1947        Some(value)
1948    }
1949}
1950
1951fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1952    let diff_range = format!("{}..{}", from, to);
1953    let output = match Command::new("git")
1954        .arg("-C")
1955        .arg(root)
1956        .args(["diff", "--name-only", &diff_range])
1957        .output()
1958    {
1959        Ok(output) => output,
1960        Err(_) => return false,
1961    };
1962
1963    if !output.status.success() {
1964        return false;
1965    }
1966
1967    let Ok(paths) = String::from_utf8(output.stdout) else {
1968        return false;
1969    };
1970
1971    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1972        let path = root.join(relative_path);
1973        if path.exists() {
1974            index.update_file(&path);
1975        } else {
1976            index.remove_file(&path);
1977        }
1978    }
1979
1980    true
1981}
1982
1983fn is_binary_path(path: &Path, size: u64) -> bool {
1984    if size == 0 {
1985        return false;
1986    }
1987
1988    let mut file = match File::open(path) {
1989        Ok(file) => file,
1990        Err(_) => return true,
1991    };
1992
1993    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1994    match file.read(&mut preview) {
1995        Ok(read) => is_binary_bytes(&preview[..read]),
1996        Err(_) => true,
1997    }
1998}
1999
2000fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2001    let mut starts = vec![0usize];
2002    for (index, byte) in content.iter().copied().enumerate() {
2003        if byte == b'\n' {
2004            starts.push(index + 1);
2005        }
2006    }
2007    starts
2008}
2009
2010fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2011    let line_index = match line_starts.binary_search(&offset) {
2012        Ok(index) => index,
2013        Err(index) => index.saturating_sub(1),
2014    };
2015    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2016    let line_end = content[line_start..]
2017        .iter()
2018        .position(|byte| *byte == b'\n')
2019        .map(|length| line_start + length)
2020        .unwrap_or(content.len());
2021    let mut line_slice = &content[line_start..line_end];
2022    if line_slice.ends_with(b"\r") {
2023        line_slice = &line_slice[..line_slice.len() - 1];
2024    }
2025    let line_text = String::from_utf8_lossy(line_slice).into_owned();
2026    let column = String::from_utf8_lossy(&content[line_start..offset])
2027        .chars()
2028        .count() as u32
2029        + 1;
2030    (line_index as u32 + 1, column, line_text)
2031}
2032
2033fn to_glob_path(path: &Path) -> String {
2034    path.to_string_lossy().replace('\\', "/")
2035}
2036
2037#[cfg(test)]
2038mod tests {
2039    use std::process::Command;
2040
2041    use super::*;
2042
2043    #[test]
2044    fn extract_trigrams_tracks_next_char_and_position() {
2045        let trigrams = extract_trigrams(b"Rust");
2046        assert_eq!(trigrams.len(), 2);
2047        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2048        assert_eq!(
2049            trigrams[1],
2050            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2051        );
2052    }
2053
2054    #[test]
2055    fn decompose_regex_extracts_literals_and_alternations() {
2056        let query = decompose_regex("abc(def|ghi)xyz");
2057        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2058        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2059        assert_eq!(query.or_groups.len(), 1);
2060        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2061        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2062    }
2063
2064    #[test]
2065    fn candidates_intersect_posting_lists() {
2066        let mut index = SearchIndex::new();
2067        let dir = tempfile::tempdir().expect("create temp dir");
2068        let alpha = dir.path().join("alpha.txt");
2069        let beta = dir.path().join("beta.txt");
2070        fs::write(&alpha, "abcdef").expect("write alpha");
2071        fs::write(&beta, "abcxyz").expect("write beta");
2072        index.project_root = dir.path().to_path_buf();
2073        index.index_file(&alpha, b"abcdef");
2074        index.index_file(&beta, b"abcxyz");
2075
2076        let query = RegexQuery {
2077            and_trigrams: vec![
2078                pack_trigram(b'a', b'b', b'c'),
2079                pack_trigram(b'd', b'e', b'f'),
2080            ],
2081            ..RegexQuery::default()
2082        };
2083
2084        let candidates = index.candidates(&query);
2085        assert_eq!(candidates.len(), 1);
2086        assert_eq!(index.files[candidates[0] as usize].path, alpha);
2087    }
2088
2089    #[test]
2090    fn candidates_apply_bloom_filters() {
2091        let mut index = SearchIndex::new();
2092        let dir = tempfile::tempdir().expect("create temp dir");
2093        let file = dir.path().join("sample.txt");
2094        fs::write(&file, "abcd efgh").expect("write sample");
2095        index.project_root = dir.path().to_path_buf();
2096        index.index_file(&file, b"abcd efgh");
2097
2098        let trigram = pack_trigram(b'a', b'b', b'c');
2099        let matching_filter = PostingFilter {
2100            next_mask: mask_for_next_char(b'd'),
2101            loc_mask: mask_for_position(0),
2102        };
2103        let non_matching_filter = PostingFilter {
2104            next_mask: mask_for_next_char(b'z'),
2105            loc_mask: mask_for_position(0),
2106        };
2107
2108        assert_eq!(
2109            index
2110                .postings_for_trigram(trigram, Some(matching_filter))
2111                .len(),
2112            1
2113        );
2114        assert!(index
2115            .postings_for_trigram(trigram, Some(non_matching_filter))
2116            .is_empty());
2117    }
2118
2119    #[test]
2120    fn disk_round_trip_preserves_postings_and_files() {
2121        let dir = tempfile::tempdir().expect("create temp dir");
2122        let project = dir.path().join("project");
2123        fs::create_dir_all(&project).expect("create project dir");
2124        let file = project.join("src.txt");
2125        fs::write(&file, "abcdef").expect("write source");
2126
2127        let mut index = SearchIndex::build(&project);
2128        index.git_head = Some("deadbeef".to_string());
2129        let cache_dir = dir.path().join("cache");
2130        index.write_to_disk(&cache_dir, index.git_head.as_deref());
2131
2132        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
2133        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2134        assert_eq!(loaded.files.len(), 1);
2135        assert_eq!(
2136            relative_to_root(&loaded.project_root, &loaded.files[0].path),
2137            PathBuf::from("src.txt")
2138        );
2139        assert_eq!(loaded.postings.len(), index.postings.len());
2140        assert!(loaded
2141            .postings
2142            .contains_key(&pack_trigram(b'a', b'b', b'c')));
2143    }
2144
2145    #[test]
2146    fn read_from_disk_rejects_corrupt_postings_checksum() {
2147        let dir = tempfile::tempdir().expect("create temp dir");
2148        let project = dir.path().join("project");
2149        fs::create_dir_all(&project).expect("create project dir");
2150        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2151
2152        let index = SearchIndex::build(&project);
2153        let cache_dir = dir.path().join("cache");
2154        index.write_to_disk(&cache_dir, None);
2155
2156        let cache_path = cache_dir.join("cache.bin");
2157        let mut bytes = fs::read(&cache_path).expect("read cache");
2158        let middle = bytes.len() / 2;
2159        bytes[middle] ^= 0xff;
2160        fs::write(&cache_path, bytes).expect("write corrupted cache");
2161
2162        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2163    }
2164
2165    #[test]
2166    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2167        let dir = tempfile::tempdir().expect("create temp dir");
2168        let project = dir.path().join("project");
2169        fs::create_dir_all(&project).expect("create project dir");
2170        fs::write(project.join("src.txt"), "abcdef").expect("write source");
2171
2172        let index = SearchIndex::build(&project);
2173        let cache_dir = dir.path().join("cache");
2174        index.write_to_disk(&cache_dir, None);
2175
2176        assert!(cache_dir.join("cache.bin").is_file());
2177        assert!(fs::read_dir(&cache_dir)
2178            .expect("read cache dir")
2179            .all(|entry| !entry
2180                .expect("cache entry")
2181                .file_name()
2182                .to_string_lossy()
2183                .contains(".tmp.")));
2184    }
2185
2186    #[test]
2187    fn concurrent_search_index_writes_do_not_corrupt() {
2188        let dir = tempfile::tempdir().expect("create temp dir");
2189        let project = dir.path().join("project");
2190        fs::create_dir_all(&project).expect("create project dir");
2191        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2192        let cache_dir = dir.path().join("cache");
2193
2194        let a_project = project.clone();
2195        let a_cache = cache_dir.clone();
2196        let a = std::thread::spawn(move || {
2197            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2198            let index = SearchIndex::build(&a_project);
2199            index.write_to_disk(&a_cache, None);
2200        });
2201        let b_project = project.clone();
2202        let b_cache = cache_dir.clone();
2203        let b = std::thread::spawn(move || {
2204            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2205            let index = SearchIndex::build(&b_project);
2206            index.write_to_disk(&b_cache, None);
2207        });
2208        a.join().expect("writer a");
2209        b.join().expect("writer b");
2210
2211        assert!(SearchIndex::read_from_disk(&cache_dir).is_some());
2212    }
2213
2214    #[test]
2215    fn search_index_atomic_rename_survives_partial_write() {
2216        let dir = tempfile::tempdir().expect("create temp dir");
2217        let cache_dir = dir.path().join("cache");
2218        fs::create_dir_all(&cache_dir).expect("create cache dir");
2219        fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2220
2221        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2222    }
2223
2224    #[test]
2225    fn project_cache_key_includes_checkout_path() {
2226        let dir = tempfile::tempdir().expect("create temp dir");
2227        let source = dir.path().join("source");
2228        fs::create_dir_all(&source).expect("create source repo dir");
2229        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2230
2231        assert!(Command::new("git")
2232            .current_dir(&source)
2233            .args(["init"])
2234            .status()
2235            .expect("init git repo")
2236            .success());
2237        assert!(Command::new("git")
2238            .current_dir(&source)
2239            .args(["add", "."])
2240            .status()
2241            .expect("git add")
2242            .success());
2243        assert!(Command::new("git")
2244            .current_dir(&source)
2245            .args([
2246                "-c",
2247                "user.name=AFT Tests",
2248                "-c",
2249                "user.email=aft-tests@example.com",
2250                "commit",
2251                "-m",
2252                "initial",
2253            ])
2254            .status()
2255            .expect("git commit")
2256            .success());
2257
2258        let clone = dir.path().join("clone");
2259        assert!(Command::new("git")
2260            .args(["clone", "--quiet"])
2261            .arg(&source)
2262            .arg(&clone)
2263            .status()
2264            .expect("git clone")
2265            .success());
2266
2267        let source_key = project_cache_key(&source);
2268        let clone_key = project_cache_key(&clone);
2269
2270        assert_eq!(source_key.len(), 16);
2271        assert_eq!(clone_key.len(), 16);
2272        // Same repo (same root commit) → same cache key regardless of clone path
2273        assert_eq!(source_key, clone_key);
2274    }
2275
2276    #[test]
2277    fn git_head_unchanged_picks_up_local_edits() {
2278        let dir = tempfile::tempdir().expect("create temp dir");
2279        let project = dir.path().join("repo");
2280        fs::create_dir_all(&project).expect("create repo dir");
2281        let file = project.join("tracked.txt");
2282        fs::write(&file, "oldtoken\n").expect("write file");
2283        assert!(Command::new("git")
2284            .current_dir(&project)
2285            .arg("init")
2286            .status()
2287            .unwrap()
2288            .success());
2289        assert!(Command::new("git")
2290            .current_dir(&project)
2291            .args(["add", "."])
2292            .status()
2293            .unwrap()
2294            .success());
2295        assert!(Command::new("git")
2296            .current_dir(&project)
2297            .args([
2298                "-c",
2299                "user.name=AFT Tests",
2300                "-c",
2301                "user.email=aft-tests@example.com",
2302                "commit",
2303                "-m",
2304                "initial"
2305            ])
2306            .status()
2307            .unwrap()
2308            .success());
2309        let head = current_git_head(&project);
2310        let mut baseline = SearchIndex::build(&project);
2311        baseline.git_head = head.clone();
2312        fs::write(&file, "newtoken\n").expect("edit tracked file");
2313
2314        let refreshed =
2315            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2316        let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2317
2318        assert_eq!(result.total_matches, 1);
2319    }
2320
2321    #[test]
2322    fn non_git_project_reuses_cache_when_files_unchanged() {
2323        let dir = tempfile::tempdir().expect("create temp dir");
2324        let project = dir.path().join("project");
2325        fs::create_dir_all(&project).expect("create project dir");
2326        fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2327        let baseline = SearchIndex::build(&project);
2328        let baseline_file_count = baseline.file_count();
2329
2330        let refreshed =
2331            SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2332
2333        assert_eq!(refreshed.file_count(), baseline_file_count);
2334        assert_eq!(
2335            refreshed
2336                .search_grep("unchangedtoken", true, &[], &[], &project, 10)
2337                .total_matches,
2338            1
2339        );
2340    }
2341
2342    #[test]
2343    fn resolve_search_scope_disables_index_for_external_path() {
2344        let dir = tempfile::tempdir().expect("create temp dir");
2345        let project = dir.path().join("project");
2346        let outside = dir.path().join("outside");
2347        fs::create_dir_all(&project).expect("create project dir");
2348        fs::create_dir_all(&outside).expect("create outside dir");
2349
2350        let scope = resolve_search_scope(&project, outside.to_str());
2351
2352        assert_eq!(
2353            scope.root,
2354            fs::canonicalize(&outside).expect("canonicalize outside")
2355        );
2356        assert!(!scope.use_index);
2357    }
2358
2359    #[test]
2360    fn grep_filters_matches_to_search_root() {
2361        let dir = tempfile::tempdir().expect("create temp dir");
2362        let project = dir.path().join("project");
2363        let src = project.join("src");
2364        let docs = project.join("docs");
2365        fs::create_dir_all(&src).expect("create src dir");
2366        fs::create_dir_all(&docs).expect("create docs dir");
2367        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2368        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2369
2370        let index = SearchIndex::build(&project);
2371        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2372
2373        assert_eq!(result.files_searched, 1);
2374        assert_eq!(result.files_with_matches, 1);
2375        assert_eq!(result.matches.len(), 1);
2376        // Index stores canonicalized paths; on macOS /var → /private/var
2377        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2378        assert_eq!(result.matches[0].file, expected);
2379    }
2380
2381    #[test]
2382    fn grep_deduplicates_multiple_matches_on_same_line() {
2383        let dir = tempfile::tempdir().expect("create temp dir");
2384        let project = dir.path().join("project");
2385        let src = project.join("src");
2386        fs::create_dir_all(&src).expect("create src dir");
2387        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2388
2389        let index = SearchIndex::build(&project);
2390        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2391
2392        assert_eq!(result.total_matches, 1);
2393        assert_eq!(result.matches.len(), 1);
2394    }
2395
2396    #[test]
2397    fn grep_reports_total_matches_before_truncation() {
2398        let dir = tempfile::tempdir().expect("create temp dir");
2399        let project = dir.path().join("project");
2400        let src = project.join("src");
2401        fs::create_dir_all(&src).expect("create src dir");
2402        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2403
2404        let index = SearchIndex::build(&project);
2405        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2406
2407        assert_eq!(result.total_matches, 2);
2408        assert_eq!(result.matches.len(), 1);
2409        assert!(result.truncated);
2410    }
2411
2412    #[test]
2413    fn glob_filters_results_to_search_root() {
2414        let dir = tempfile::tempdir().expect("create temp dir");
2415        let project = dir.path().join("project");
2416        let src = project.join("src");
2417        let scripts = project.join("scripts");
2418        fs::create_dir_all(&src).expect("create src dir");
2419        fs::create_dir_all(&scripts).expect("create scripts dir");
2420        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2421        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2422
2423        let index = SearchIndex::build(&project);
2424        let files = index.glob("**/*.rs", &src);
2425
2426        assert_eq!(
2427            files,
2428            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2429        );
2430    }
2431
2432    #[test]
2433    fn glob_includes_hidden_and_binary_files() {
2434        let dir = tempfile::tempdir().expect("create temp dir");
2435        let project = dir.path().join("project");
2436        let hidden_dir = project.join(".hidden");
2437        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2438        let hidden_file = hidden_dir.join("data.bin");
2439        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2440
2441        let index = SearchIndex::build(&project);
2442        let files = index.glob("**/*.bin", &project);
2443
2444        assert_eq!(
2445            files,
2446            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2447        );
2448    }
2449
2450    #[test]
2451    fn read_from_disk_rejects_invalid_nanos() {
2452        let dir = tempfile::tempdir().expect("create temp dir");
2453        let cache_dir = dir.path().join("cache");
2454        fs::create_dir_all(&cache_dir).expect("create cache dir");
2455
2456        let mut postings = Vec::new();
2457        postings.extend_from_slice(INDEX_MAGIC);
2458        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2459        postings.extend_from_slice(&0u32.to_le_bytes());
2460        postings.extend_from_slice(&1u32.to_le_bytes());
2461        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2462        postings.extend_from_slice(&1u32.to_le_bytes());
2463        postings.extend_from_slice(b"/");
2464        postings.push(0u8);
2465        postings.extend_from_slice(&1u32.to_le_bytes());
2466        postings.extend_from_slice(&0u64.to_le_bytes());
2467        postings.extend_from_slice(&0u64.to_le_bytes());
2468        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2469        postings.extend_from_slice(b"a");
2470        postings.extend_from_slice(&0u64.to_le_bytes());
2471
2472        let mut lookup = Vec::new();
2473        lookup.extend_from_slice(LOOKUP_MAGIC);
2474        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2475        lookup.extend_from_slice(&0u32.to_le_bytes());
2476
2477        let postings_checksum = crc32fast::hash(&postings);
2478        postings.extend_from_slice(&postings_checksum.to_le_bytes());
2479        let lookup_checksum = crc32fast::hash(&lookup);
2480        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2481        let mut cache = Vec::new();
2482        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2483        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2484        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2485        cache.extend_from_slice(&postings);
2486        cache.extend_from_slice(&lookup);
2487        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2488
2489        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2490    }
2491
2492    /// Regression: v0.15.2 — sort_paths_by_mtime_desc panicked when files
2493    /// changed between cmp() calls.
2494    ///
2495    /// Pre-fix, the sort closure called `path_modified_time(path)` directly,
2496    /// which does a `stat()` syscall. If the file was deleted, modified, or
2497    /// touched mid-sort, the comparator returned different values for the
2498    /// same input pair on different invocations. Rust's slice::sort detects
2499    /// this and panics with "user-provided comparison function does not
2500    /// correctly implement a total order".
2501    ///
2502    /// CI hit this on a Pi e2e test (workflow run 24887807972) where the
2503    /// bridge invalidated files in parallel with grep's sort path. This
2504    /// test simulates the worst case: most paths don't exist (Err from
2505    /// fs::metadata) and sort still completes successfully.
2506    #[test]
2507    fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2508        // Mix of existing and non-existing paths in deliberately
2509        // non-monotonic order — pre-fix, the sort would call stat() at
2510        // least N log N times and any flakiness would trigger the panic.
2511        let dir = tempfile::tempdir().expect("create tempdir");
2512        let mut paths: Vec<PathBuf> = Vec::new();
2513        for i in 0..30 {
2514            // Half exist, half don't.
2515            let path = if i % 2 == 0 {
2516                let p = dir.path().join(format!("real-{i}.rs"));
2517                fs::write(&p, format!("// {i}\n")).expect("write");
2518                p
2519            } else {
2520                dir.path().join(format!("missing-{i}.rs"))
2521            };
2522            paths.push(path);
2523        }
2524
2525        // Run the sort many times to maximise the chance of catching any
2526        // residual non-determinism. Pre-fix: panic. Post-fix: stable.
2527        for _ in 0..50 {
2528            let mut copy = paths.clone();
2529            sort_paths_by_mtime_desc(&mut copy);
2530            assert_eq!(copy.len(), paths.len());
2531        }
2532    }
2533
2534    /// Regression: v0.15.2 — sort_grep_matches_by_mtime_desc panicked under
2535    /// the same conditions as sort_paths_by_mtime_desc. See the
2536    /// sort_paths_... test above for the full rationale.
2537    #[test]
2538    fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2539        let dir = tempfile::tempdir().expect("create tempdir");
2540        let mut matches: Vec<GrepMatch> = Vec::new();
2541        for i in 0..30 {
2542            let file = if i % 2 == 0 {
2543                let p = dir.path().join(format!("real-{i}.rs"));
2544                fs::write(&p, format!("// {i}\n")).expect("write");
2545                p
2546            } else {
2547                dir.path().join(format!("missing-{i}.rs"))
2548            };
2549            matches.push(GrepMatch {
2550                file,
2551                line: u32::try_from(i).unwrap_or(0),
2552                column: 0,
2553                line_text: format!("match {i}"),
2554                match_text: format!("match {i}"),
2555            });
2556        }
2557
2558        for _ in 0..50 {
2559            let mut copy = matches.clone();
2560            sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2561            assert_eq!(copy.len(), matches.len());
2562        }
2563    }
2564}
aft/search_index.rs

aft/
search_index.rs