Skip to main content

aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
/// Default cap on the size of a file that gets trigram-indexed (1 MiB).
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
/// Magic header identifying the on-disk postings cache file.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// Magic header identifying the on-disk trigram lookup cache file.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// On-disk cache format version; readers reject anything else.
const INDEX_VERSION: u32 = 1;
/// Size of the leading content window sampled for previews.
/// NOTE(review): consumer is outside this chunk — confirm usage.
const PREVIEW_BYTES: usize = 8 * 1024;
/// Sentinel byte value. NOTE(review): not referenced in this chunk — confirm.
const EOF_SENTINEL: u8 = 0;
/// Upper bound on counts read from a cache file (corruption guard).
const MAX_ENTRIES: usize = 10_000_000;
/// Minimum serialized size of one file-table entry: 1 flag + 4 path-len
/// + 8 size + 8 mtime-secs + 4 mtime-nanos = 25 (path bytes follow).
const MIN_FILE_ENTRY_BYTES: usize = 25;
/// Serialized size of one lookup entry: 4 trigram + 8 offset + 4 count.
const LOOKUP_ENTRY_BYTES: usize = 16;
/// Serialized size of one posting: 4 file id + 1 next mask + 1 loc mask.
const POSTING_BYTES: usize = 6;
28
/// In-memory trigram search index over a project tree.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// Trigram -> postings (one per file containing it), sorted by file id.
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table indexed by file id; removed files stay as tombstones with
    /// empty paths so ids remain stable.
    pub files: Vec<FileEntry>,
    /// Reverse map from absolute path to file id (active files only).
    pub path_to_id: HashMap<PathBuf, u32>,
    /// True once the index is fully built or loaded from cache.
    pub ready: bool,
    /// Canonicalized project root; cached paths are stored relative to it.
    project_root: PathBuf,
    /// Git HEAD recorded when the index was built/saved, if known.
    git_head: Option<String>,
    /// Files larger than this are tracked but not trigram-indexed.
    max_file_size: u64,
    /// File id -> trigrams it contributed; enables fast removal.
    file_trigrams: HashMap<u32, Vec<u32>>,
    /// Ids of tracked-but-unindexed (oversized) files; these are always
    /// included as grep candidates since the index knows nothing about them.
    unindexed_files: HashSet<u32>,
}
41
impl SearchIndex {
    /// Number of indexed files.
    ///
    /// Counts slots in the file table, which includes tombstones left behind
    /// by `remove_file`.
    pub fn file_count(&self) -> usize {
        self.files.len()
    }

    /// Number of unique trigrams in the index.
    pub fn trigram_count(&self) -> usize {
        self.postings.len()
    }
}
53
/// Per-file posting for a single trigram.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Index into `SearchIndex::files`.
    pub file_id: u32,
    /// OR of `mask_for_next_char` over every character seen immediately
    /// after this trigram in the file.
    pub next_mask: u8,
    /// OR of `mask_for_position` over every offset the trigram occurred at.
    pub loc_mask: u8,
}

/// Metadata for one tracked file; a tombstone has an empty `path`.
#[derive(Clone, Debug)]
pub struct FileEntry {
    pub path: PathBuf,
    pub size: u64,
    pub modified: SystemTime,
}

/// A single grep hit returned to callers.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    pub file: PathBuf,
    // NOTE(review): line/column origin (0- or 1-based) is assigned outside
    // this chunk — confirm before documenting as fact.
    pub line: u32,
    pub column: u32,
    pub line_text: String,
    pub match_text: String,
}

/// Aggregate result of a grep, with counters collected during the scan.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Matches actually returned (may be fewer than `total_matches`).
    pub matches: Vec<GrepMatch>,
    /// All matches counted during the scan, including ones not returned.
    pub total_matches: usize,
    pub files_searched: usize,
    pub files_with_matches: usize,
    pub index_status: IndexStatus,
    /// True when the scan stopped early after hitting its match budget.
    pub truncated: bool,
}
86
/// State of the trigram index at the time a search was answered.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    /// Index is fully built and was used for the search.
    Ready,
    /// Index is still being constructed.
    Building,
    /// Index was bypassed; results came from a direct scan.
    Fallback,
}

impl IndexStatus {
    /// Human-readable label for status reporting.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Ready => "Ready",
            Self::Building => "Building",
            Self::Fallback => "Fallback",
        }
    }
}
103
/// Trigram requirements extracted from a regex pattern.
///
/// Every trigram in `and_trigrams` must appear in a candidate file; each
/// inner vector of `or_groups` requires at least one of its trigrams.
/// See `SearchIndex::candidates` for how these are combined.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    pub and_trigrams: Vec<u32>,
    pub or_groups: Vec<Vec<u32>>,
    /// Optional per-trigram mask filters refining `and_trigrams`.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Mask filters per OR group, parallel to `or_groups` by index.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}

/// Mask pair used to reject postings whose next-char/position masks cannot
/// possibly satisfy the query (compare with `Posting`'s masks).
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    next_mask: u8,
    loc_mask: u8,
}

/// Intermediate byte-run form accumulated while building a `RegexQuery`.
/// NOTE(review): the builder lives outside this chunk — confirm semantics.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    and_runs: Vec<Vec<u8>>,
    or_groups: Vec<Vec<Vec<u8>>>,
}

/// Compiled include/exclude glob sets; `None` means "no restriction".
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    includes: Option<GlobSet>,
    excludes: Option<GlobSet>,
}

/// Where a search runs and whether the trigram index may be consulted.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    pub root: PathBuf,
    pub use_index: bool,
}

/// Internal grep match whose path is shared via `Arc` so per-match path
/// copies are cheap; converted to `GrepMatch` before returning to callers.
#[derive(Clone, Debug)]
struct SharedGrepMatch {
    file: Arc<PathBuf>,
    line: u32,
    column: u32,
    line_text: String,
    match_text: String,
}

/// Strategy used to scan candidate file contents.
#[derive(Clone, Debug)]
enum SearchMatcher {
    Literal(LiteralSearch),
    Regex(Regex),
}

/// Literal needle variants; the ASCII-insensitive form stores the needle
/// pre-lowercased (see `search_grep`).
#[derive(Clone, Debug)]
enum LiteralSearch {
    CaseSensitive(Vec<u8>),
    AsciiCaseInsensitive(Vec<u8>),
}
156
157impl SearchIndex {
158    pub fn new() -> Self {
159        SearchIndex {
160            postings: HashMap::new(),
161            files: Vec::new(),
162            path_to_id: HashMap::new(),
163            ready: false,
164            project_root: PathBuf::new(),
165            git_head: None,
166            max_file_size: DEFAULT_MAX_FILE_SIZE,
167            file_trigrams: HashMap::new(),
168            unindexed_files: HashSet::new(),
169        }
170    }
171
    /// Build an index rooted at `root` using the default per-file size limit.
    pub fn build(root: &Path) -> Self {
        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
    }
175
176    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
177        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
178        let mut index = SearchIndex {
179            project_root: project_root.clone(),
180            max_file_size,
181            ..SearchIndex::new()
182        };
183
184        let filters = PathFilters::default();
185        for path in walk_project_files(&project_root, &filters) {
186            index.update_file(&path);
187        }
188
189        index.git_head = current_git_head(&project_root);
190        index.ready = true;
191        index
192    }
193
    /// (Re-)index `content` for `path`, replacing any previous entry.
    ///
    /// Extracts trigrams, accumulates per-trigram next-char and position
    /// masks, and appends a posting per trigram while keeping every posting
    /// list sorted by file id.
    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
        // Drop any stale entry for this path so postings are not duplicated.
        self.remove_file(path);

        let file_id = match self.allocate_file_id(path, content.len() as u64) {
            Some(file_id) => file_id,
            None => return,
        };

        // BTreeMap keeps trigrams ordered, so `file_trigrams` ends up sorted.
        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
        for (trigram, next_char, position) in extract_trigrams(content) {
            let entry = trigram_map.entry(trigram).or_default();
            entry.next_mask |= mask_for_next_char(next_char);
            entry.loc_mask |= mask_for_position(position);
        }

        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
        for (trigram, filter) in trigram_map {
            let postings = self.postings.entry(trigram).or_default();
            postings.push(Posting {
                file_id,
                next_mask: filter.next_mask,
                loc_mask: filter.loc_mask,
            });
            // Posting lists are kept sorted by file_id for binary search during
            // intersection. Since file_ids are allocated incrementally, the new
            // entry is usually already in order. Only sort when needed.
            if postings.len() > 1
                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
            {
                postings.sort_unstable_by_key(|p| p.file_id);
            }
            file_trigrams.push(trigram);
        }

        self.file_trigrams.insert(file_id, file_trigrams);
        self.unindexed_files.remove(&file_id);
    }
231
232    pub fn remove_file(&mut self, path: &Path) {
233        let Some(file_id) = self.path_to_id.remove(path) else {
234            return;
235        };
236
237        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
238            for trigram in trigrams {
239                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
240                    postings.retain(|posting| posting.file_id != file_id);
241                    postings.is_empty()
242                } else {
243                    false
244                };
245
246                if should_remove {
247                    self.postings.remove(&trigram);
248                }
249            }
250        }
251
252        self.unindexed_files.remove(&file_id);
253        if let Some(file) = self.files.get_mut(file_id as usize) {
254            file.path = PathBuf::new();
255            file.size = 0;
256            file.modified = UNIX_EPOCH;
257        }
258    }
259
260    pub fn update_file(&mut self, path: &Path) {
261        self.remove_file(path);
262
263        let metadata = match fs::metadata(path) {
264            Ok(metadata) if metadata.is_file() => metadata,
265            _ => return,
266        };
267
268        if is_binary_path(path, metadata.len()) {
269            return;
270        }
271
272        if metadata.len() > self.max_file_size {
273            self.track_unindexed_file(path, &metadata);
274            return;
275        }
276
277        let content = match fs::read(path) {
278            Ok(content) => content,
279            Err(_) => return,
280        };
281
282        if is_binary_bytes(&content) {
283            return;
284        }
285
286        self.index_file(path, &content);
287    }
288
    /// Public grep entry point; thin wrapper that delegates to `search_grep`.
    pub fn grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        self.search_grep(
            pattern,
            case_sensitive,
            include,
            exclude,
            search_root,
            max_results,
        )
    }
307
    /// Core grep: choose a literal or regex matcher, prune candidates via the
    /// trigram index, scan candidate files (in parallel for larger sets), and
    /// return matches ordered newest-file-first.
    ///
    /// An invalid regex yields an empty `GrepResult` rather than an error.
    pub fn search_grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        // Detect if pattern is a plain literal (no regex metacharacters).
        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
        let is_literal = !pattern.chars().any(|c| {
            matches!(
                c,
                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
            )
        });

        let literal_search = if is_literal {
            if case_sensitive {
                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
            } else if pattern.is_ascii() {
                // Lowercase the needle once up front; the haystack side is
                // handled during the scan.
                Some(LiteralSearch::AsciiCaseInsensitive(
                    pattern
                        .as_bytes()
                        .iter()
                        .map(|byte| byte.to_ascii_lowercase())
                        .collect(),
                ))
            } else {
                // Non-ASCII case-insensitive literals need the regex engine's
                // Unicode case folding.
                None
            }
        } else {
            None
        };

        // Build the regex for non-literal patterns (or literal Unicode fallback).
        let regex = if literal_search.is_some() {
            None
        } else {
            let regex_pattern = if is_literal {
                regex::escape(pattern)
            } else {
                pattern.to_string()
            };
            let mut builder = RegexBuilder::new(&regex_pattern);
            builder.case_insensitive(!case_sensitive);
            match builder.build() {
                Ok(r) => Some(r),
                Err(_) => {
                    // Invalid pattern: report an empty result instead of failing.
                    return GrepResult {
                        matches: Vec::new(),
                        total_matches: 0,
                        files_searched: 0,
                        files_with_matches: 0,
                        index_status: if self.ready {
                            IndexStatus::Ready
                        } else {
                            IndexStatus::Building
                        },
                        truncated: false,
                    };
                }
            }
        };

        let matcher = if let Some(literal_search) = literal_search {
            SearchMatcher::Literal(literal_search)
        } else {
            SearchMatcher::Regex(
                regex.expect("regex should exist when literal matcher is unavailable"),
            )
        };

        // Bad include/exclude globs degrade to "no filtering" rather than failing.
        let filters = match build_path_filters(include, exclude) {
            Ok(filters) => filters,
            Err(_) => PathFilters::default(),
        };
        let search_root = canonicalize_or_normalize(search_root);

        // Prune the candidate set with the trigram index before any file I/O.
        let query = decompose_regex(pattern);
        let candidate_ids = self.candidates(&query);

        let candidate_files: Vec<&FileEntry> = candidate_ids
            .into_iter()
            .filter_map(|file_id| self.files.get(file_id as usize))
            // An empty path is a tombstone left behind by remove_file.
            .filter(|file| !file.path.as_os_str().is_empty())
            .filter(|file| is_within_search_root(&search_root, &file.path))
            .filter(|file| filters.matches(&self.project_root, &file.path))
            .collect();

        let total_matches = AtomicUsize::new(0);
        let files_searched = AtomicUsize::new(0);
        let files_with_matches = AtomicUsize::new(0);
        let truncated = AtomicBool::new(false);
        // Scan a bit past max_results so total_matches/truncated are meaningful.
        let stop_after = max_results.saturating_mul(2);

        // Parallelize only when enough files exist to amortize rayon overhead.
        let mut matches = if candidate_files.len() > 10 {
            candidate_files
                .par_iter()
                .map(|file| {
                    search_candidate_file(
                        file,
                        &matcher,
                        max_results,
                        stop_after,
                        &total_matches,
                        &files_searched,
                        &files_with_matches,
                        &truncated,
                    )
                })
                .reduce(Vec::new, |mut left, mut right| {
                    left.append(&mut right);
                    left
                })
        } else {
            let mut matches = Vec::new();
            for file in candidate_files {
                matches.extend(search_candidate_file(
                    file,
                    &matcher,
                    max_results,
                    stop_after,
                    &total_matches,
                    &files_searched,
                    &files_with_matches,
                    &truncated,
                ));

                if should_stop_search(&truncated, &total_matches, stop_after) {
                    break;
                }
            }
            matches
        };

        // Order results newest-first using mtimes cached in the file table.
        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
            self.path_to_id
                .get(path)
                .and_then(|file_id| self.files.get(*file_id as usize))
                .map(|file| file.modified)
        });

        // Convert Arc-shared internal matches into the public owned form.
        let matches = matches
            .into_iter()
            .map(|matched| GrepMatch {
                file: matched.file.as_ref().clone(),
                line: matched.line,
                column: matched.column,
                line_text: matched.line_text,
                match_text: matched.match_text,
            })
            .collect();

        GrepResult {
            total_matches: total_matches.load(Ordering::Relaxed),
            matches,
            files_searched: files_searched.load(Ordering::Relaxed),
            files_with_matches: files_with_matches.load(Ordering::Relaxed),
            index_status: if self.ready {
                IndexStatus::Ready
            } else {
                IndexStatus::Building
            },
            truncated: truncated.load(Ordering::Relaxed),
        }
    }
476
477    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
478        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
479            Ok(filters) => filters,
480            Err(_) => return Vec::new(),
481        };
482        let search_root = canonicalize_or_normalize(search_root);
483        let filter_root = if search_root.starts_with(&self.project_root) {
484            &self.project_root
485        } else {
486            &search_root
487        };
488
489        let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
490        sort_paths_by_mtime_desc(&mut paths);
491        paths
492    }
493
    /// Resolve a decomposed regex query to a sorted set of candidate file ids.
    ///
    /// AND trigrams are intersected (rarest first, for fast shrinkage), each
    /// OR group is unioned internally then intersected with the running set,
    /// and unindexed (oversized) files are always added back since the index
    /// has no content information about them.
    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
        // No usable trigrams: every active file is a candidate.
        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
            return self.active_file_ids();
        }

        // Process the cheapest (shortest) posting lists first.
        let mut and_trigrams = query.and_trigrams.clone();
        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));

        let mut current: Option<Vec<u32>> = None;

        for trigram in and_trigrams {
            let filter = query.and_filters.get(&trigram).copied();
            let matches = self.postings_for_trigram(trigram, filter);
            current = Some(match current.take() {
                Some(existing) => intersect_sorted_ids(&existing, &matches),
                None => matches,
            });

            // Empty intersection cannot grow again; stop early.
            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
                break;
            }
        }

        // With no AND trigrams at all, start from every active file.
        let mut current = current.unwrap_or_else(|| self.active_file_ids());

        for (index, group) in query.or_groups.iter().enumerate() {
            let mut group_matches = Vec::new();
            let filters = query.or_filters.get(index);

            // Union the group's postings: any one trigram satisfies the group.
            for trigram in group {
                let filter = filters.and_then(|filters| filters.get(trigram).copied());
                let matches = self.postings_for_trigram(*trigram, filter);
                if group_matches.is_empty() {
                    group_matches = matches;
                } else {
                    group_matches = union_sorted_ids(&group_matches, &matches);
                }
            }

            current = intersect_sorted_ids(&current, &group_matches);
            if current.is_empty() {
                break;
            }
        }

        // Oversized files carry no trigrams; include them unconditionally so
        // the content scanner can still search them.
        let mut unindexed = self
            .unindexed_files
            .iter()
            .copied()
            .filter(|file_id| self.is_active_file(*file_id))
            .collect::<Vec<_>>();
        if !unindexed.is_empty() {
            unindexed.sort_unstable();
            current = union_sorted_ids(&current, &unindexed);
        }

        current
    }
552
    /// Persist the index to `cache_dir` as two files, each written to a temp
    /// path and then renamed into place: `postings.bin` (header, file table,
    /// postings blob) and `lookup.bin` (trigram -> blob offset/count table).
    ///
    /// Tombstoned files are dropped by remapping active ids to a dense 0..n
    /// range. All errors are swallowed: on failure the temp files are removed
    /// and any previous cache is left untouched.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");
        let tmp_postings = cache_dir.join("postings.bin.tmp");
        let tmp_lookup = cache_dir.join("lookup.bin.tmp");

        // Map sparse in-memory file ids to dense on-disk ids (skips tombstones).
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);

            // Header: magic, version, string lengths, size limit, file count.
            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;

            // File table: one entry per active file, paths stored relative to
            // the project root (see MIN_FILE_ENTRY_BYTES for the layout).
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                let path = relative_to_root(&self.project_root, &file.path);
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Serialize postings into one blob, recording (trigram, offset,
            // count) lookup entries in deterministic sorted-trigram order.
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Postings pointing at tombstoned files are dropped.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            drop(postings_writer);

            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            drop(lookup_writer);

            // Publish via rename so readers never see a half-written file.
            fs::rename(&tmp_postings, &postings_path)?;
            fs::rename(&tmp_lookup, &lookup_path)?;

            Ok(())
        })();

        if write_result.is_err() {
            let _ = fs::remove_file(&tmp_postings);
            let _ = fs::remove_file(&tmp_lookup);
        }
    }
683
    /// Load a previously cached index from `cache_dir`.
    ///
    /// Returns `None` on any missing file, magic/version mismatch, or
    /// structural inconsistency. Every length field is validated against the
    /// actual file size before allocating, so a corrupt cache cannot trigger
    /// huge allocations.
    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");

        let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
        let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);
        // Total file lengths used to sanity-check every length field below.
        let postings_len_total =
            usize::try_from(postings_reader.get_ref().metadata().ok()?.len()).ok()?;
        let lookup_len_total =
            usize::try_from(lookup_reader.get_ref().metadata().ok()?.len()).ok()?;

        let mut magic = [0u8; 8];
        postings_reader.read_exact(&mut magic).ok()?;
        if &magic != INDEX_MAGIC {
            return None;
        }
        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
            return None;
        }

        let head_len = read_u32(&mut postings_reader).ok()? as usize;
        let root_len = read_u32(&mut postings_reader).ok()? as usize;
        let max_file_size = read_u64(&mut postings_reader).ok()?;
        let file_count = read_u32(&mut postings_reader).ok()? as usize;
        if file_count > MAX_ENTRIES {
            return None;
        }
        let remaining_postings = remaining_bytes(&mut postings_reader, postings_len_total)?;
        let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
        if minimum_file_bytes > remaining_postings {
            return None;
        }

        if head_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
            return None;
        }
        let mut head_bytes = vec![0u8; head_len];
        postings_reader.read_exact(&mut head_bytes).ok()?;
        // An empty head string means "no HEAD was known"; map it to None.
        let git_head = String::from_utf8(head_bytes)
            .ok()
            .filter(|head| !head.is_empty());

        if root_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
            return None;
        }
        let mut root_bytes = vec![0u8; root_len];
        postings_reader.read_exact(&mut root_bytes).ok()?;
        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);

        let mut files = Vec::with_capacity(file_count);
        let mut path_to_id = HashMap::new();
        let mut unindexed_files = HashSet::new();

        // File table: flag byte, path length, size, mtime secs/nanos, path.
        for file_id in 0..file_count {
            let mut unindexed = [0u8; 1];
            postings_reader.read_exact(&mut unindexed).ok()?;
            let path_len = read_u32(&mut postings_reader).ok()? as usize;
            let size = read_u64(&mut postings_reader).ok()?;
            let secs = read_u64(&mut postings_reader).ok()?;
            let nanos = read_u32(&mut postings_reader).ok()?;
            if nanos >= 1_000_000_000 {
                return None;
            }
            if path_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
                return None;
            }
            let mut path_bytes = vec![0u8; path_len];
            postings_reader.read_exact(&mut path_bytes).ok()?;
            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
            let full_path = project_root.join(relative_path);
            let file_id_u32 = u32::try_from(file_id).ok()?;

            // NOTE(review): `UNIX_EPOCH + Duration::new(secs, nanos)` can panic
            // for absurd `secs` from a corrupt cache — consider checked_add.
            files.push(FileEntry {
                path: full_path.clone(),
                size,
                modified: UNIX_EPOCH + Duration::new(secs, nanos),
            });
            path_to_id.insert(full_path, file_id_u32);
            if unindexed[0] == 1 {
                unindexed_files.insert(file_id_u32);
            }
        }

        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
        let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
        if postings_len > max_postings_bytes {
            return None;
        }
        if postings_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
            return None;
        }
        let mut postings_blob = vec![0u8; postings_len];
        postings_reader.read_exact(&mut postings_blob).ok()?;

        let mut lookup_magic = [0u8; 8];
        lookup_reader.read_exact(&mut lookup_magic).ok()?;
        if &lookup_magic != LOOKUP_MAGIC {
            return None;
        }
        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
            return None;
        }
        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
        if entry_count > MAX_ENTRIES {
            return None;
        }
        let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_len_total)?;
        let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
        if minimum_lookup_bytes > remaining_lookup {
            return None;
        }

        let mut postings = HashMap::new();
        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();

        // Rebuild posting lists plus the reverse file -> trigrams map.
        for _ in 0..entry_count {
            let trigram = read_u32(&mut lookup_reader).ok()?;
            let offset = read_u64(&mut lookup_reader).ok()? as usize;
            let count = read_u32(&mut lookup_reader).ok()? as usize;
            if count > MAX_ENTRIES {
                return None;
            }
            let bytes_len = count.checked_mul(POSTING_BYTES)?;
            let end = offset.checked_add(bytes_len)?;
            if end > postings_blob.len() {
                return None;
            }

            let mut trigram_postings = Vec::with_capacity(count);
            for chunk in postings_blob[offset..end].chunks_exact(6) {
                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
                let posting = Posting {
                    file_id,
                    next_mask: chunk[4],
                    loc_mask: chunk[5],
                };
                // NOTE(review): this clone is redundant — `posting` could be
                // pushed by value after `file_id` has been copied out.
                trigram_postings.push(posting.clone());
                file_trigrams.entry(file_id).or_default().push(trigram);
            }
            postings.insert(trigram, trigram_postings);
        }

        Some(SearchIndex {
            postings,
            files,
            path_to_id,
            ready: true,
            project_root,
            git_head,
            max_file_size,
            file_trigrams,
            unindexed_files,
        })
    }
838
    /// Git HEAD recorded when this index was built or loaded, if any.
    pub(crate) fn stored_git_head(&self) -> Option<&str> {
        self.git_head.as_deref()
    }

    /// Mark the index as ready (or not) to serve queries.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
846
    /// Produce an up-to-date index for `root`, reusing `baseline` when possible.
    ///
    /// - No git HEAD available: full rebuild.
    /// - HEAD unchanged: keep the baseline, re-checking file mtimes.
    /// - HEAD changed: try applying a git diff between the two commits;
    ///   fall back to a full rebuild if that fails.
    pub(crate) fn rebuild_or_refresh(
        root: &Path,
        max_file_size: u64,
        current_head: Option<String>,
        baseline: Option<SearchIndex>,
    ) -> Self {
        if current_head.is_none() {
            return SearchIndex::build_with_limit(root, max_file_size);
        }

        if let Some(mut baseline) = baseline {
            // Re-anchor the baseline to the (re-canonicalized) root and limit.
            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
            baseline.max_file_size = max_file_size;

            if baseline.git_head == current_head {
                // HEAD matches, but files may have changed on disk since the index was
                // last written (e.g., uncommitted edits, stash pop, manual file changes
                // while OpenCode was closed). Verify mtimes and re-index stale files.
                verify_file_mtimes(&mut baseline);
                baseline.ready = true;
                return baseline;
            }

            if let (Some(previous), Some(current)) =
                (baseline.git_head.clone(), current_head.clone())
            {
                let project_root = baseline.project_root.clone();
                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
                    baseline.git_head = Some(current);
                    baseline.ready = true;
                    return baseline;
                }
            }
        }

        // No usable baseline or diff application failed: full rebuild.
        SearchIndex::build_with_limit(root, max_file_size)
    }
884
885    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
886        let file_id = u32::try_from(self.files.len()).ok()?;
887        let metadata = fs::metadata(path).ok();
888        let size = metadata
889            .as_ref()
890            .map_or(size_hint, |metadata| metadata.len());
891        let modified = metadata
892            .and_then(|metadata| metadata.modified().ok())
893            .unwrap_or(UNIX_EPOCH);
894
895        self.files.push(FileEntry {
896            path: path.to_path_buf(),
897            size,
898            modified,
899        });
900        self.path_to_id.insert(path.to_path_buf(), file_id);
901        Some(file_id)
902    }
903
904    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
905        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
906            return;
907        };
908        self.unindexed_files.insert(file_id);
909        self.file_trigrams.insert(file_id, Vec::new());
910    }
911
912    fn active_file_ids(&self) -> Vec<u32> {
913        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
914        ids.sort_unstable();
915        ids
916    }
917
918    fn is_active_file(&self, file_id: u32) -> bool {
919        self.files
920            .get(file_id as usize)
921            .map(|file| !file.path.as_os_str().is_empty())
922            .unwrap_or(false)
923    }
924
925    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
926        let Some(postings) = self.postings.get(&trigram) else {
927            return Vec::new();
928        };
929
930        let mut matches = Vec::with_capacity(postings.len());
931
932        for posting in postings {
933            if let Some(filter) = filter {
934                // next_mask: bloom filter check — the character following this trigram in the
935                // query must also appear after this trigram somewhere in the file.
936                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
937                    continue;
938                }
939                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
940                // between consecutive trigram pairs, but is NOT used as a single-trigram
941                // filter because the position in the query string has no relationship to
942                // the position in the file. Using it here causes false negatives.
943            }
944            if self.is_active_file(posting.file_id) {
945                matches.push(posting.file_id);
946            }
947        }
948
949        matches
950    }
951}
952
/// Search one candidate file with `matcher`, returning at most one match per
/// line, and update the shared progress counters.
///
/// Shared atomics: `total_matches` is the global match counter; `truncated` is
/// set once `max_results` is exceeded, after which `should_stop_search`
/// (truncated AND count >= `stop_after`) tells workers to bail out early.
/// `files_searched` / `files_with_matches` feed result statistics.
fn search_candidate_file(
    file: &FileEntry,
    matcher: &SearchMatcher,
    max_results: usize,
    stop_after: usize,
    total_matches: &AtomicUsize,
    files_searched: &AtomicUsize,
    files_with_matches: &AtomicUsize,
    truncated: &AtomicBool,
) -> Vec<SharedGrepMatch> {
    if should_stop_search(truncated, total_matches, stop_after) {
        return Vec::new();
    }

    let content = match read_indexed_file_bytes(&file.path) {
        Some(content) => content,
        None => return Vec::new(),
    };
    files_searched.fetch_add(1, Ordering::Relaxed);

    let shared_path = Arc::new(file.path.clone());
    let mut matches = Vec::new();
    // Built lazily on the first hit — files without matches never pay for it.
    let mut line_starts = None;
    // Dedup: report each line at most once per file.
    let mut seen_lines = HashSet::new();
    let mut matched_this_file = false;

    match matcher {
        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            // Overlapping scan: resume one byte past each hit.
            while let Some(position) = finder.find(&content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                start = offset + 1;

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
            // Lowercase the haystack once; offsets map 1:1 back into the
            // original `content`, from which the match text is extracted.
            let search_content = content.to_ascii_lowercase();
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            while let Some(position) = finder.find(&search_content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                start = offset + 1;

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        SearchMatcher::Regex(regex) => {
            for matched in regex.find_iter(&content) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) =
                    line_details_bytes(&content, line_starts, matched.start());
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
                });
            }
        }
    }

    if matched_this_file {
        files_with_matches.fetch_add(1, Ordering::Relaxed);
    }

    matches
}
1088
/// True once the search has been flagged as truncated AND at least `stop_after`
/// matches have been recorded; workers use this to bail out early.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    let flagged = truncated.load(Ordering::Relaxed);
    flagged && total_matches.load(Ordering::Relaxed) >= stop_after
}
1096
/// Intersection of two ascending id lists via a linear two-pointer merge.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut out = Vec::with_capacity(left.len().min(right.len()));
    let (mut i, mut j) = (0, 0);

    while i < left.len() && j < right.len() {
        let (a, b) = (left[i], right[j]);
        if a < b {
            i += 1;
        } else if b < a {
            j += 1;
        } else {
            out.push(a);
            i += 1;
            j += 1;
        }
    }

    out
}
1116
/// Union of two ascending id lists via a linear two-pointer merge; an id that
/// appears in both lists is emitted once.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut out = Vec::with_capacity(left.len() + right.len());
    let (mut i, mut j) = (0, 0);

    while i < left.len() && j < right.len() {
        let (a, b) = (left[i], right[j]);
        if a < b {
            out.push(a);
            i += 1;
        } else if b < a {
            out.push(b);
            j += 1;
        } else {
            out.push(a);
            i += 1;
            j += 1;
        }
    }

    // One of these is empty; the other carries the leftover tail.
    out.extend_from_slice(&left[i..]);
    out.extend_from_slice(&right[j..]);
    out
}
1144
1145pub fn decompose_regex(pattern: &str) -> RegexQuery {
1146    let hir = match regex_syntax::parse(pattern) {
1147        Ok(hir) => hir,
1148        Err(_) => return RegexQuery::default(),
1149    };
1150
1151    let build = build_query(&hir);
1152    build.into_query()
1153}
1154
/// Pack three bytes into one u32 trigram key (`a` highest, `c` lowest byte).
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1158
/// Case-fold a byte for case-insensitive trigram matching (ASCII lowercasing;
/// non-ASCII bytes pass through unchanged).
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        c + 32
    } else {
        c
    }
}
1162
1163pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1164    if content.len() < 3 {
1165        return Vec::new();
1166    }
1167
1168    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1169    for start in 0..=content.len() - 3 {
1170        let trigram = pack_trigram(
1171            normalize_char(content[start]),
1172            normalize_char(content[start + 1]),
1173            normalize_char(content[start + 2]),
1174        );
1175        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1176        trigrams.push((trigram, next_char, start));
1177    }
1178    trigrams
1179}
1180
1181pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1182    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's storage
1183    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1184        return PathBuf::from(override_dir)
1185            .join("index")
1186            .join(project_cache_key(project_root));
1187    }
1188    // Use configured storage dir (from plugin, XDG-compliant)
1189    if let Some(dir) = storage_dir {
1190        return dir.join("index").join(project_cache_key(project_root));
1191    }
1192    // Fallback to ~/.cache/aft/ (legacy, for standalone binary usage)
1193    let home = std::env::var_os("HOME")
1194        .map(PathBuf::from)
1195        .unwrap_or_else(|| PathBuf::from("."));
1196    home.join(".cache")
1197        .join("aft")
1198        .join("index")
1199        .join(project_cache_key(project_root))
1200}
1201
1202pub(crate) fn build_path_filters(
1203    include: &[String],
1204    exclude: &[String],
1205) -> Result<PathFilters, String> {
1206    Ok(PathFilters {
1207        includes: build_globset(include)?,
1208        excludes: build_globset(exclude)?,
1209    })
1210}
1211
/// Walk all indexable files under `root`, with `root` doubling as the base for
/// relative glob filtering. Thin wrapper over `walk_project_files_from`.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
1215
1216pub(crate) fn walk_project_files_from(
1217    filter_root: &Path,
1218    search_root: &Path,
1219    filters: &PathFilters,
1220) -> Vec<PathBuf> {
1221    let mut builder = WalkBuilder::new(search_root);
1222    builder
1223        .hidden(false)
1224        .git_ignore(true)
1225        .git_global(true)
1226        .git_exclude(true)
1227        .filter_entry(|entry| {
1228            let name = entry.file_name().to_string_lossy();
1229            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1230                return !matches!(
1231                    name.as_ref(),
1232                    "node_modules"
1233                        | "target"
1234                        | "venv"
1235                        | ".venv"
1236                        | ".git"
1237                        | "__pycache__"
1238                        | ".tox"
1239                        | "dist"
1240                        | "build"
1241                );
1242            }
1243            true
1244        });
1245
1246    let mut files = Vec::new();
1247    for entry in builder.build().filter_map(|entry| entry.ok()) {
1248        if !entry
1249            .file_type()
1250            .map_or(false, |file_type| file_type.is_file())
1251        {
1252            continue;
1253        }
1254        let path = entry.into_path();
1255        if filters.matches(filter_root, &path) {
1256            files.push(path);
1257        }
1258    }
1259
1260    sort_paths_by_mtime_desc(&mut files);
1261    files
1262}
1263
1264pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1265    let bytes = fs::read(path).ok()?;
1266    if is_binary_bytes(&bytes) {
1267        return None;
1268    }
1269    String::from_utf8(bytes).ok()
1270}
1271
/// Raw bytes of an indexed file; `None` when the read fails (e.g. deleted).
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    match fs::read(path) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
1275
/// Express `path` relative to `root`; paths outside `root` pass through
/// unchanged.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1281
1282pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1283    paths.sort_by(|left, right| {
1284        path_modified_time(right)
1285            .cmp(&path_modified_time(left))
1286            .then_with(|| left.cmp(right))
1287    });
1288}
1289
1290pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1291    matches.sort_by(|left, right| {
1292        let left_path = resolve_match_path(project_root, &left.file);
1293        let right_path = resolve_match_path(project_root, &right.file);
1294
1295        path_modified_time(&right_path)
1296            .cmp(&path_modified_time(&left_path))
1297            .then_with(|| left.file.cmp(&right.file))
1298            .then_with(|| left.line.cmp(&right.line))
1299            .then_with(|| left.column.cmp(&right.column))
1300    });
1301}
1302
1303fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1304    matches: &mut [SharedGrepMatch],
1305    modified_for_path: F,
1306) where
1307    F: Fn(&Path) -> Option<SystemTime>,
1308{
1309    matches.sort_by(|left, right| {
1310        modified_for_path(right.file.as_path())
1311            .cmp(&modified_for_path(left.file.as_path()))
1312            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1313            .then_with(|| left.line.cmp(&right.line))
1314            .then_with(|| left.column.cmp(&right.column))
1315    });
1316}
1317
1318pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1319    let resolved_project_root = canonicalize_or_normalize(project_root);
1320    let root = match path {
1321        Some(path) => {
1322            let path = PathBuf::from(path);
1323            if path.is_absolute() {
1324                canonicalize_or_normalize(&path)
1325            } else {
1326                normalize_path(&resolved_project_root.join(path))
1327            }
1328        }
1329        None => resolved_project_root.clone(),
1330    };
1331
1332    let use_index = is_within_search_root(&resolved_project_root, &root);
1333    SearchScope { root, use_index }
1334}
1335
/// Heuristic: does `content` look like binary data? Delegates to the
/// `content_inspector` crate's byte-level sniffing.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
1339
/// Commit hash of `root`'s current HEAD, or `None` when `root` is not a git
/// repo (or git is unavailable / produces no output).
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
1343
1344pub(crate) fn project_cache_key(project_root: &Path) -> String {
1345    use sha2::{Digest, Sha256};
1346
1347    let mut hasher = Sha256::new();
1348
1349    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1350        // Git repo: root commit is the unique identity.
1351        // Same repo cloned anywhere produces the same key.
1352        hasher.update(root_commit.as_bytes());
1353    } else {
1354        // Non-git project: use the canonical filesystem path as identity.
1355        let canonical_root = canonicalize_or_normalize(project_root);
1356        hasher.update(canonical_root.to_string_lossy().as_bytes());
1357    }
1358
1359    let digest = format!("{:x}", hasher.finalize());
1360    digest[..16].to_string()
1361}
1362
1363impl PathFilters {
1364    fn matches(&self, root: &Path, path: &Path) -> bool {
1365        let relative = to_glob_path(&relative_to_root(root, path));
1366        if self
1367            .includes
1368            .as_ref()
1369            .is_some_and(|includes| !includes.is_match(&relative))
1370        {
1371            return false;
1372        }
1373        if self
1374            .excludes
1375            .as_ref()
1376            .is_some_and(|excludes| excludes.is_match(&relative))
1377        {
1378            return false;
1379        }
1380        true
1381    }
1382}
1383
/// Canonicalize `path` via the filesystem, falling back to pure lexical
/// normalization when canonicalization fails (e.g. the path does not exist).
fn canonicalize_or_normalize(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
}
1387
/// On-disk location for a match path: absolute paths pass through, relative
/// ones are anchored at `project_root`.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    project_root.join(path)
}
1395
/// Filesystem mtime of `path`, or `None` when the file is inaccessible or the
/// platform cannot report modification times.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1401
/// Lexically normalize a path: drop `.` components and resolve each `..`
/// against the components collected so far. A `..` with nothing left to pop
/// (e.g. a leading `../`) is preserved.
fn normalize_path(path: &Path) -> PathBuf {
    path.components().fold(PathBuf::new(), |mut acc, component| {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                // `pop` fails at the start (or at the root); keep the `..`
                // in that case so e.g. `../x` survives normalization.
                if !acc.pop() {
                    acc.push(component);
                }
            }
            other => acc.push(other),
        }
        acc
    })
}
1417
1418/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1419/// since the index was last written. Also detect new files and deleted files.
1420fn verify_file_mtimes(index: &mut SearchIndex) {
1421    // Collect stale files (mtime mismatch or deleted)
1422    let mut stale_paths = Vec::new();
1423    for entry in &index.files {
1424        if entry.path.as_os_str().is_empty() {
1425            continue; // tombstoned entry
1426        }
1427        match fs::metadata(&entry.path) {
1428            Ok(meta) => {
1429                let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1430                if current_mtime != entry.modified || meta.len() != entry.size {
1431                    stale_paths.push(entry.path.clone());
1432                }
1433            }
1434            Err(_) => {
1435                // File deleted
1436                stale_paths.push(entry.path.clone());
1437            }
1438        }
1439    }
1440
1441    // Re-index stale files
1442    for path in &stale_paths {
1443        index.update_file(path);
1444    }
1445
1446    // Detect new files not in the index
1447    let filters = PathFilters::default();
1448    for path in walk_project_files(&index.project_root, &filters) {
1449        if !index.path_to_id.contains_key(&path) {
1450            index.update_file(&path);
1451        }
1452    }
1453
1454    if !stale_paths.is_empty() {
1455        log::info!(
1456            "[aft] search index: refreshed {} stale file(s) from disk cache",
1457            stale_paths.len()
1458        );
1459    }
1460}
1461
/// Whether `path` is `search_root` itself or lies beneath it
/// (component-wise prefix check, not a raw string prefix).
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.starts_with(search_root)
}
1465
1466impl QueryBuild {
1467    fn into_query(self) -> RegexQuery {
1468        let mut query = RegexQuery::default();
1469
1470        for run in self.and_runs {
1471            add_run_to_and_query(&mut query, &run);
1472        }
1473
1474        for group in self.or_groups {
1475            let mut trigrams = BTreeSet::new();
1476            let mut filters = HashMap::new();
1477            for run in group {
1478                for (trigram, filter) in trigram_filters(&run) {
1479                    trigrams.insert(trigram);
1480                    merge_filter(filters.entry(trigram).or_default(), filter);
1481                }
1482            }
1483            if !trigrams.is_empty() {
1484                query.or_groups.push(trigrams.into_iter().collect());
1485                query.or_filters.push(filters);
1486            }
1487        }
1488
1489        query
1490    }
1491}
1492
1493fn build_query(hir: &Hir) -> QueryBuild {
1494    match hir.kind() {
1495        HirKind::Literal(literal) => {
1496            if literal.0.len() >= 3 {
1497                QueryBuild {
1498                    and_runs: vec![literal.0.to_vec()],
1499                    or_groups: Vec::new(),
1500                }
1501            } else {
1502                QueryBuild::default()
1503            }
1504        }
1505        HirKind::Capture(capture) => build_query(&capture.sub),
1506        HirKind::Concat(parts) => {
1507            let mut build = QueryBuild::default();
1508            for part in parts {
1509                let part_build = build_query(part);
1510                build.and_runs.extend(part_build.and_runs);
1511                build.or_groups.extend(part_build.or_groups);
1512            }
1513            build
1514        }
1515        HirKind::Alternation(parts) => {
1516            let mut group = Vec::new();
1517            for part in parts {
1518                let Some(mut choices) = guaranteed_run_choices(part) else {
1519                    return QueryBuild::default();
1520                };
1521                group.append(&mut choices);
1522            }
1523            if group.is_empty() {
1524                QueryBuild::default()
1525            } else {
1526                QueryBuild {
1527                    and_runs: Vec::new(),
1528                    or_groups: vec![group],
1529                }
1530            }
1531        }
1532        HirKind::Repetition(repetition) => {
1533            if repetition.min == 0 {
1534                QueryBuild::default()
1535            } else {
1536                build_query(&repetition.sub)
1537            }
1538        }
1539        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1540    }
1541}
1542
1543fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1544    match hir.kind() {
1545        HirKind::Literal(literal) => {
1546            if literal.0.len() >= 3 {
1547                Some(vec![literal.0.to_vec()])
1548            } else {
1549                None
1550            }
1551        }
1552        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1553        HirKind::Concat(parts) => {
1554            let mut runs = Vec::new();
1555            for part in parts {
1556                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1557                    runs.append(&mut part_runs);
1558                }
1559            }
1560            if runs.is_empty() {
1561                None
1562            } else {
1563                Some(runs)
1564            }
1565        }
1566        HirKind::Alternation(parts) => {
1567            let mut runs = Vec::new();
1568            for part in parts {
1569                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1570                    return None;
1571                };
1572                runs.append(&mut part_runs);
1573            }
1574            if runs.is_empty() {
1575                None
1576            } else {
1577                Some(runs)
1578            }
1579        }
1580        HirKind::Repetition(repetition) => {
1581            if repetition.min == 0 {
1582                None
1583            } else {
1584                guaranteed_run_choices(&repetition.sub)
1585            }
1586        }
1587        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1588    }
1589}
1590
1591fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1592    for (trigram, filter) in trigram_filters(run) {
1593        if !query.and_trigrams.contains(&trigram) {
1594            query.and_trigrams.push(trigram);
1595        }
1596        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1597    }
1598}
1599
1600fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1601    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1602    for (trigram, next_char, position) in extract_trigrams(run) {
1603        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1604        if next_char != EOF_SENTINEL {
1605            entry.next_mask |= mask_for_next_char(next_char);
1606        }
1607        entry.loc_mask |= mask_for_position(position);
1608    }
1609    filters.into_iter().collect()
1610}
1611
/// Merge `filter` into `target` by OR-ing both bitmasks (set union of the
/// next-byte bloom bits and the position bits).
fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
    target.next_mask |= filter.next_mask;
    target.loc_mask |= filter.loc_mask;
}
1616
1617fn mask_for_next_char(next_char: u8) -> u8 {
1618    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1619    1u8 << bit
1620}
1621
/// One-hot bit for a trigram's position within its run, modulo 8.
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
1625
1626fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1627    if patterns.is_empty() {
1628        return Ok(None);
1629    }
1630
1631    let mut builder = GlobSetBuilder::new();
1632    for pattern in patterns {
1633        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1634        builder.add(glob);
1635    }
1636    builder.build().map(Some).map_err(|error| error.to_string())
1637}
1638
/// Read a little-endian `u32` from `reader`.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut bytes = [0u8; 4];
    reader
        .read_exact(&mut bytes)
        .map(|()| u32::from_le_bytes(bytes))
}
1644
/// Read a little-endian `u64` from `reader`.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut bytes = [0u8; 8];
    reader
        .read_exact(&mut bytes)
        .map(|()| u64::from_le_bytes(bytes))
}
1650
/// Write `value` as a little-endian `u32` to `writer`.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1654
/// Write `value` as a little-endian `u64` to `writer`.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1658
/// Bytes left between the reader's current position and `total_len`; `None`
/// when the position is unknown, doesn't fit `usize`, or is already past the
/// end.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    let position = usize::try_from(position).ok()?;
    total_len.checked_sub(position)
}
1663
/// Run `git -C <root> <args>` and return trimmed stdout, or `None` on any
/// failure: spawn error, non-zero exit, non-UTF-8 output, or empty output.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
1682
1683fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1684    let diff_range = format!("{}..{}", from, to);
1685    let output = match Command::new("git")
1686        .arg("-C")
1687        .arg(root)
1688        .args(["diff", "--name-only", &diff_range])
1689        .output()
1690    {
1691        Ok(output) => output,
1692        Err(_) => return false,
1693    };
1694
1695    if !output.status.success() {
1696        return false;
1697    }
1698
1699    let Ok(paths) = String::from_utf8(output.stdout) else {
1700        return false;
1701    };
1702
1703    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1704        let path = root.join(relative_path);
1705        if path.exists() {
1706            index.update_file(&path);
1707        } else {
1708            index.remove_file(&path);
1709        }
1710    }
1711
1712    true
1713}
1714
1715fn is_binary_path(path: &Path, size: u64) -> bool {
1716    if size == 0 {
1717        return false;
1718    }
1719
1720    let mut file = match File::open(path) {
1721        Ok(file) => file,
1722        Err(_) => return true,
1723    };
1724
1725    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1726    match file.read(&mut preview) {
1727        Ok(read) => is_binary_bytes(&preview[..read]),
1728        Err(_) => true,
1729    }
1730}
1731
/// Byte offsets at which each line begins: line 0 always starts at offset 0,
/// and every `\n` opens a new line at the following byte.
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .iter()
                .enumerate()
                .filter(|&(_, &byte)| byte == b'\n')
                .map(|(index, _)| index + 1),
        )
        .collect()
}
1741
/// Resolve a byte `offset` into (1-based line, 1-based char column, line text).
///
/// `line_starts` must come from `line_starts_bytes` over the same `content`.
/// A trailing `\r` is stripped from the text (CRLF files), and the column
/// counts decoded (lossy UTF-8) characters rather than bytes.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // Exact hit: the offset sits at a line start. Otherwise take the line
    // beginning just before the insertion point.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    // End of line: next '\n', or end of buffer for the final line.
    let line_end = content[line_start..]
        .iter()
        .position(|&byte| byte == b'\n')
        .map_or(content.len(), |length| line_start + length);

    let raw_line = &content[line_start..line_end];
    let trimmed_line = raw_line.strip_suffix(b"\r").unwrap_or(raw_line);
    let line_text = String::from_utf8_lossy(trimmed_line).into_owned();

    let column = String::from_utf8_lossy(&content[line_start..offset])
        .chars()
        .count() as u32
        + 1;

    (line_index as u32 + 1, column, line_text)
}
1764
/// Render `path` as a glob-friendly string, normalizing Windows-style
/// backslash separators to forward slashes.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1768
// Unit tests for the search index: trigram extraction, regex decomposition,
// candidate selection, disk round-tripping, cache keys, and the grep/glob
// search paths. Filesystem fixtures live in tempfile directories; the cache
// key tests shell out to `git`.
#[cfg(test)]
mod tests {
    use std::process::Command;

    use super::*;

    // Each trigram carries the byte that follows it plus its position; the
    // final trigram uses EOF_SENTINEL. Input is lowercased ("Rust" → "rust").
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }

    // Required literals become AND trigrams; an alternation becomes one OR
    // group containing the trigrams of each branch.
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }

    // Only the file containing ALL of the query's AND trigrams survives the
    // posting-list intersection; "abcxyz" lacks "def" and is filtered out.
    #[test]
    fn candidates_intersect_posting_lists() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }

    // The per-posting bloom masks (next-char and position) must accept a
    // filter matching the indexed text and reject a contradictory one.
    #[test]
    fn candidates_apply_bloom_filters() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        // "abc" is followed by 'd' at position 0, so this filter matches.
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        // 'z' never follows "abc" in the sample, so this filter must reject.
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }

    // Write the index to a cache dir and read it back: git head, file list
    // (relative to the project root), and posting keys must all survive.
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }

    // Atomic-write discipline: the final .bin files exist and no *.tmp
    // staging files are left behind after a successful write.
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("postings.bin").is_file());
        assert!(cache_dir.join("lookup.bin").is_file());
        assert!(!cache_dir.join("postings.bin.tmp").exists());
        assert!(!cache_dir.join("lookup.bin.tmp").exists());
    }

    // Two checkouts of the same repository (a source repo and its clone)
    // must hash to the same cache key, independent of checkout path.
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        // Build a real git repo with one commit so a root commit id exists.
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        // Identity is passed via -c so the test works without global config.
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        // Same repo (same root commit) → same cache key regardless of clone path
        assert_eq!(source_key, clone_key);
    }

    // A search path outside the project root must fall back to an unindexed
    // scan rooted at that (canonicalized) path.
    #[test]
    fn resolve_search_scope_disables_index_for_external_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let outside = dir.path().join("outside");
        fs::create_dir_all(&project).expect("create project dir");
        fs::create_dir_all(&outside).expect("create outside dir");

        let scope = resolve_search_scope(&project, outside.to_str());

        assert_eq!(
            scope.root,
            fs::canonicalize(&outside).expect("canonicalize outside")
        );
        assert!(!scope.use_index);
    }

    // Matches in docs/ must be excluded when the search root is src/, even
    // though both files contain the pattern.
    #[test]
    fn grep_filters_matches_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let docs = project.join("docs");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&docs).expect("create docs dir");
        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.files_searched, 1);
        assert_eq!(result.files_with_matches, 1);
        assert_eq!(result.matches.len(), 1);
        // Index stores canonicalized paths; on macOS /var → /private/var
        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
        assert_eq!(result.matches[0].file, expected);
    }

    // Two hits on one line collapse into a single reported match.
    #[test]
    fn grep_deduplicates_multiple_matches_on_same_line() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.total_matches, 1);
        assert_eq!(result.matches.len(), 1);
    }

    // With max_results = 1, total_matches still reflects every hit and the
    // result is flagged as truncated.
    #[test]
    fn grep_reports_total_matches_before_truncation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);

        assert_eq!(result.total_matches, 2);
        assert_eq!(result.matches.len(), 1);
        assert!(result.truncated);
    }

    // "**/*.rs" rooted at src/ must not surface scripts/tool.rs.
    #[test]
    fn glob_filters_results_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let scripts = project.join("scripts");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&scripts).expect("create scripts dir");
        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.rs", &src);

        assert_eq!(
            files,
            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
        );
    }

    // Glob works on file names only, so dotfile directories and binary
    // content (which grep would skip) are still matched.
    #[test]
    fn glob_includes_hidden_and_binary_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let hidden_dir = project.join(".hidden");
        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
        let hidden_file = hidden_dir.join("data.bin");
        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.bin", &project);

        assert_eq!(
            files,
            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
        );
    }

    // Hand-craft a postings file whose timestamp nanos field is out of range
    // (1e9 ≥ one second); the loader must refuse it rather than panic.
    #[test]
    fn read_from_disk_rejects_invalid_nanos() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let cache_dir = dir.path().join("cache");
        fs::create_dir_all(&cache_dir).expect("create cache dir");

        // NOTE(review): the byte layout below mirrors what write_to_disk
        // emits — magic, version, counts, max file size, root path, then one
        // file entry; confirm against the writer if the format changes.
        let mut postings = Vec::new();
        postings.extend_from_slice(INDEX_MAGIC);
        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        postings.extend_from_slice(&0u32.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(b"/");
        postings.push(0u8);
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        // Invalid: nanoseconds component must be < 1_000_000_000.
        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
        postings.extend_from_slice(b"a");
        postings.extend_from_slice(&0u64.to_le_bytes());

        let mut lookup = Vec::new();
        lookup.extend_from_slice(LOOKUP_MAGIC);
        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        lookup.extend_from_slice(&0u32.to_le_bytes());

        fs::write(cache_dir.join("postings.bin"), postings).expect("write postings");
        fs::write(cache_dir.join("lookup.bin"), lookup).expect("write lookup");

        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
    }
}