aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7    atomic::{AtomicBool, AtomicUsize, Ordering},
8    Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
19const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
20const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
21const INDEX_VERSION: u32 = 1;
22const PREVIEW_BYTES: usize = 8 * 1024;
23const EOF_SENTINEL: u8 = 0;
24
25#[derive(Clone, Debug)]
26pub struct SearchIndex {
27    pub postings: HashMap<u32, Vec<Posting>>,
28    pub files: Vec<FileEntry>,
29    pub path_to_id: HashMap<PathBuf, u32>,
30    pub ready: bool,
31    project_root: PathBuf,
32    git_head: Option<String>,
33    max_file_size: u64,
34    file_trigrams: HashMap<u32, Vec<u32>>,
35    unindexed_files: HashSet<u32>,
36}
37
38#[derive(Clone, Debug, PartialEq, Eq)]
39pub struct Posting {
40    pub file_id: u32,
41    pub next_mask: u8,
42    pub loc_mask: u8,
43}
44
45#[derive(Clone, Debug)]
46pub struct FileEntry {
47    pub path: PathBuf,
48    pub size: u64,
49    pub modified: SystemTime,
50}
51
52#[derive(Clone, Debug, PartialEq, Eq)]
53pub struct GrepMatch {
54    pub file: PathBuf,
55    pub line: u32,
56    pub column: u32,
57    pub line_text: String,
58    pub match_text: String,
59}
60
61#[derive(Clone, Debug)]
62pub struct GrepResult {
63    pub matches: Vec<GrepMatch>,
64    pub total_matches: usize,
65    pub files_searched: usize,
66    pub files_with_matches: usize,
67    pub index_status: IndexStatus,
68    pub truncated: bool,
69}
70
71#[derive(Clone, Copy, Debug, PartialEq, Eq)]
72pub enum IndexStatus {
73    Ready,
74    Building,
75    Fallback,
76}
77
78impl IndexStatus {
79    pub fn as_str(&self) -> &'static str {
80        match self {
81            IndexStatus::Ready => "Ready",
82            IndexStatus::Building => "Building",
83            IndexStatus::Fallback => "Fallback",
84        }
85    }
86}
87
88#[derive(Clone, Debug, Default)]
89pub struct RegexQuery {
90    pub and_trigrams: Vec<u32>,
91    pub or_groups: Vec<Vec<u32>>,
92    pub(crate) and_filters: HashMap<u32, PostingFilter>,
93    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
94}
95
96#[derive(Clone, Copy, Debug, Default)]
97pub(crate) struct PostingFilter {
98    next_mask: u8,
99    loc_mask: u8,
100}
101
102#[derive(Clone, Debug, Default)]
103struct QueryBuild {
104    and_runs: Vec<Vec<u8>>,
105    or_groups: Vec<Vec<Vec<u8>>>,
106}
107
108#[derive(Clone, Debug, Default)]
109pub(crate) struct PathFilters {
110    includes: Option<GlobSet>,
111    excludes: Option<GlobSet>,
112}
113
114#[derive(Clone, Debug)]
115pub(crate) struct SearchScope {
116    pub root: PathBuf,
117    pub use_index: bool,
118}
119
120#[derive(Clone, Debug)]
121struct SharedGrepMatch {
122    file: Arc<PathBuf>,
123    line: u32,
124    column: u32,
125    line_text: String,
126    match_text: String,
127}
128
129#[derive(Clone, Debug)]
130enum SearchMatcher {
131    Literal(LiteralSearch),
132    Regex(Regex),
133}
134
135#[derive(Clone, Debug)]
136enum LiteralSearch {
137    CaseSensitive(Vec<u8>),
138    AsciiCaseInsensitive(Vec<u8>),
139}
140
141impl SearchIndex {
142    pub fn new() -> Self {
143        SearchIndex {
144            postings: HashMap::new(),
145            files: Vec::new(),
146            path_to_id: HashMap::new(),
147            ready: false,
148            project_root: PathBuf::new(),
149            git_head: None,
150            max_file_size: DEFAULT_MAX_FILE_SIZE,
151            file_trigrams: HashMap::new(),
152            unindexed_files: HashSet::new(),
153        }
154    }
155
156    pub fn build(root: &Path) -> Self {
157        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
158    }
159
160    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
161        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
162        let mut index = SearchIndex {
163            project_root: project_root.clone(),
164            max_file_size,
165            ..SearchIndex::new()
166        };
167
168        let filters = PathFilters::default();
169        for path in walk_project_files(&project_root, &filters) {
170            index.update_file(&path);
171        }
172
173        index.git_head = current_git_head(&project_root);
174        index.ready = true;
175        index
176    }
177
178    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
179        self.remove_file(path);
180
181        let file_id = match self.allocate_file_id(path, content.len() as u64) {
182            Some(file_id) => file_id,
183            None => return,
184        };
185
186        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
187        for (trigram, next_char, position) in extract_trigrams(content) {
188            let entry = trigram_map.entry(trigram).or_default();
189            entry.next_mask |= mask_for_next_char(next_char);
190            entry.loc_mask |= mask_for_position(position);
191        }
192
193        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
194        for (trigram, filter) in trigram_map {
195            let postings = self.postings.entry(trigram).or_default();
196            postings.push(Posting {
197                file_id,
198                next_mask: filter.next_mask,
199                loc_mask: filter.loc_mask,
200            });
201            // Posting lists are kept sorted by file_id for binary search during
202            // intersection. Since file_ids are allocated incrementally, the new
203            // entry is usually already in order. Only sort when needed.
204            if postings.len() > 1
205                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
206            {
207                postings.sort_unstable_by_key(|p| p.file_id);
208            }
209            file_trigrams.push(trigram);
210        }
211
212        self.file_trigrams.insert(file_id, file_trigrams);
213        self.unindexed_files.remove(&file_id);
214    }
215
216    pub fn remove_file(&mut self, path: &Path) {
217        let Some(file_id) = self.path_to_id.remove(path) else {
218            return;
219        };
220
221        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
222            for trigram in trigrams {
223                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
224                    postings.retain(|posting| posting.file_id != file_id);
225                    postings.is_empty()
226                } else {
227                    false
228                };
229
230                if should_remove {
231                    self.postings.remove(&trigram);
232                }
233            }
234        }
235
236        self.unindexed_files.remove(&file_id);
237        if let Some(file) = self.files.get_mut(file_id as usize) {
238            file.path = PathBuf::new();
239            file.size = 0;
240            file.modified = UNIX_EPOCH;
241        }
242    }
243
244    pub fn update_file(&mut self, path: &Path) {
245        self.remove_file(path);
246
247        let metadata = match fs::metadata(path) {
248            Ok(metadata) if metadata.is_file() => metadata,
249            _ => return,
250        };
251
252        if is_binary_path(path, metadata.len()) {
253            return;
254        }
255
256        if metadata.len() > self.max_file_size {
257            self.track_unindexed_file(path, &metadata);
258            return;
259        }
260
261        let content = match fs::read(path) {
262            Ok(content) => content,
263            Err(_) => return,
264        };
265
266        if is_binary_bytes(&content) {
267            return;
268        }
269
270        self.index_file(path, &content);
271    }
272
273    pub fn grep(
274        &self,
275        pattern: &str,
276        case_sensitive: bool,
277        include: &[String],
278        exclude: &[String],
279        search_root: &Path,
280        max_results: usize,
281    ) -> GrepResult {
282        self.search_grep(
283            pattern,
284            case_sensitive,
285            include,
286            exclude,
287            search_root,
288            max_results,
289        )
290    }
291
292    pub fn search_grep(
293        &self,
294        pattern: &str,
295        case_sensitive: bool,
296        include: &[String],
297        exclude: &[String],
298        search_root: &Path,
299        max_results: usize,
300    ) -> GrepResult {
301        // Detect if pattern is a plain literal (no regex metacharacters).
302        // If so, use memchr::memmem which is 3-10x faster than regex for byte scanning.
303        let is_literal = !pattern.chars().any(|c| {
304            matches!(
305                c,
306                '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
307            )
308        });
309
310        let literal_search = if is_literal {
311            if case_sensitive {
312                Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
313            } else if pattern.is_ascii() {
314                Some(LiteralSearch::AsciiCaseInsensitive(
315                    pattern
316                        .as_bytes()
317                        .iter()
318                        .map(|byte| byte.to_ascii_lowercase())
319                        .collect(),
320                ))
321            } else {
322                None
323            }
324        } else {
325            None
326        };
327
328        // Build the regex for non-literal patterns (or literal Unicode fallback).
329        let regex = if literal_search.is_some() {
330            None
331        } else {
332            let regex_pattern = if is_literal {
333                regex::escape(pattern)
334            } else {
335                pattern.to_string()
336            };
337            let mut builder = RegexBuilder::new(&regex_pattern);
338            builder.case_insensitive(!case_sensitive);
339            match builder.build() {
340                Ok(r) => Some(r),
341                Err(_) => {
342                    return GrepResult {
343                        matches: Vec::new(),
344                        total_matches: 0,
345                        files_searched: 0,
346                        files_with_matches: 0,
347                        index_status: if self.ready {
348                            IndexStatus::Ready
349                        } else {
350                            IndexStatus::Building
351                        },
352                        truncated: false,
353                    };
354                }
355            }
356        };
357
358        let matcher = if let Some(literal_search) = literal_search {
359            SearchMatcher::Literal(literal_search)
360        } else {
361            SearchMatcher::Regex(
362                regex.expect("regex should exist when literal matcher is unavailable"),
363            )
364        };
365
366        let filters = match build_path_filters(include, exclude) {
367            Ok(filters) => filters,
368            Err(_) => PathFilters::default(),
369        };
370        let search_root = canonicalize_or_normalize(search_root);
371
372        let query = decompose_regex(pattern);
373        let candidate_ids = self.candidates(&query);
374
375        let candidate_files: Vec<&FileEntry> = candidate_ids
376            .into_iter()
377            .filter_map(|file_id| self.files.get(file_id as usize))
378            .filter(|file| !file.path.as_os_str().is_empty())
379            .filter(|file| is_within_search_root(&search_root, &file.path))
380            .filter(|file| filters.matches(&self.project_root, &file.path))
381            .collect();
382
383        let total_matches = AtomicUsize::new(0);
384        let files_searched = AtomicUsize::new(0);
385        let files_with_matches = AtomicUsize::new(0);
386        let truncated = AtomicBool::new(false);
387        let stop_after = max_results.saturating_mul(2);
388
389        let mut matches = if candidate_files.len() > 10 {
390            candidate_files
391                .par_iter()
392                .map(|file| {
393                    search_candidate_file(
394                        file,
395                        &matcher,
396                        max_results,
397                        stop_after,
398                        &total_matches,
399                        &files_searched,
400                        &files_with_matches,
401                        &truncated,
402                    )
403                })
404                .reduce(Vec::new, |mut left, mut right| {
405                    left.append(&mut right);
406                    left
407                })
408        } else {
409            let mut matches = Vec::new();
410            for file in candidate_files {
411                matches.extend(search_candidate_file(
412                    file,
413                    &matcher,
414                    max_results,
415                    stop_after,
416                    &total_matches,
417                    &files_searched,
418                    &files_with_matches,
419                    &truncated,
420                ));
421
422                if should_stop_search(&truncated, &total_matches, stop_after) {
423                    break;
424                }
425            }
426            matches
427        };
428
429        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
430            self.path_to_id
431                .get(path)
432                .and_then(|file_id| self.files.get(*file_id as usize))
433                .map(|file| file.modified)
434        });
435
436        let matches = matches
437            .into_iter()
438            .map(|matched| GrepMatch {
439                file: matched.file.as_ref().clone(),
440                line: matched.line,
441                column: matched.column,
442                line_text: matched.line_text,
443                match_text: matched.match_text,
444            })
445            .collect();
446
447        GrepResult {
448            total_matches: total_matches.load(Ordering::Relaxed),
449            matches,
450            files_searched: files_searched.load(Ordering::Relaxed),
451            files_with_matches: files_with_matches.load(Ordering::Relaxed),
452            index_status: if self.ready {
453                IndexStatus::Ready
454            } else {
455                IndexStatus::Building
456            },
457            truncated: truncated.load(Ordering::Relaxed),
458        }
459    }
460
461    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
462        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
463            Ok(filters) => filters,
464            Err(_) => return Vec::new(),
465        };
466        let search_root = canonicalize_or_normalize(search_root);
467        let filter_root = if search_root.starts_with(&self.project_root) {
468            &self.project_root
469        } else {
470            &search_root
471        };
472
473        let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
474        sort_paths_by_mtime_desc(&mut paths);
475        paths
476    }
477
478    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
479        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
480            return self.active_file_ids();
481        }
482
483        let mut and_trigrams = query.and_trigrams.clone();
484        and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
485
486        let mut current: Option<Vec<u32>> = None;
487
488        for trigram in and_trigrams {
489            let filter = query.and_filters.get(&trigram).copied();
490            let matches = self.postings_for_trigram(trigram, filter);
491            current = Some(match current.take() {
492                Some(existing) => intersect_sorted_ids(&existing, &matches),
493                None => matches,
494            });
495
496            if current.as_ref().is_some_and(|ids| ids.is_empty()) {
497                break;
498            }
499        }
500
501        let mut current = current.unwrap_or_else(|| self.active_file_ids());
502
503        for (index, group) in query.or_groups.iter().enumerate() {
504            let mut group_matches = Vec::new();
505            let filters = query.or_filters.get(index);
506
507            for trigram in group {
508                let filter = filters.and_then(|filters| filters.get(trigram).copied());
509                let matches = self.postings_for_trigram(*trigram, filter);
510                if group_matches.is_empty() {
511                    group_matches = matches;
512                } else {
513                    group_matches = union_sorted_ids(&group_matches, &matches);
514                }
515            }
516
517            current = intersect_sorted_ids(&current, &group_matches);
518            if current.is_empty() {
519                break;
520            }
521        }
522
523        let mut unindexed = self
524            .unindexed_files
525            .iter()
526            .copied()
527            .filter(|file_id| self.is_active_file(*file_id))
528            .collect::<Vec<_>>();
529        if !unindexed.is_empty() {
530            unindexed.sort_unstable();
531            current = union_sorted_ids(&current, &unindexed);
532        }
533
534        current
535    }
536
537    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
538        if fs::create_dir_all(cache_dir).is_err() {
539            return;
540        }
541
542        let postings_path = cache_dir.join("postings.bin");
543        let lookup_path = cache_dir.join("lookup.bin");
544        let tmp_postings = cache_dir.join("postings.bin.tmp");
545        let tmp_lookup = cache_dir.join("lookup.bin.tmp");
546
547        let active_ids = self.active_file_ids();
548        let mut id_map = HashMap::new();
549        for (new_id, old_id) in active_ids.iter().enumerate() {
550            let Ok(new_id_u32) = u32::try_from(new_id) else {
551                return;
552            };
553            id_map.insert(*old_id, new_id_u32);
554        }
555
556        let write_result = (|| -> std::io::Result<()> {
557            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);
558
559            postings_writer.write_all(INDEX_MAGIC)?;
560            write_u32(&mut postings_writer, INDEX_VERSION)?;
561
562            let head = git_head.unwrap_or_default();
563            let root = self.project_root.to_string_lossy();
564            let head_len = u32::try_from(head.len())
565                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
566            let root_len = u32::try_from(root.len())
567                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
568            let file_count = u32::try_from(active_ids.len())
569                .map_err(|_| std::io::Error::other("too many files to cache"))?;
570
571            write_u32(&mut postings_writer, head_len)?;
572            write_u32(&mut postings_writer, root_len)?;
573            write_u64(&mut postings_writer, self.max_file_size)?;
574            write_u32(&mut postings_writer, file_count)?;
575            postings_writer.write_all(head.as_bytes())?;
576            postings_writer.write_all(root.as_bytes())?;
577
578            for old_id in &active_ids {
579                let Some(file) = self.files.get(*old_id as usize) else {
580                    return Err(std::io::Error::other("missing file entry for cache write"));
581                };
582                let path = relative_to_root(&self.project_root, &file.path);
583                let path = path.to_string_lossy();
584                let path_len = u32::try_from(path.len())
585                    .map_err(|_| std::io::Error::other("cached path too large"))?;
586                let modified = file
587                    .modified
588                    .duration_since(UNIX_EPOCH)
589                    .unwrap_or(Duration::ZERO);
590                let unindexed = if self.unindexed_files.contains(old_id) {
591                    1u8
592                } else {
593                    0u8
594                };
595
596                postings_writer.write_all(&[unindexed])?;
597                write_u32(&mut postings_writer, path_len)?;
598                write_u64(&mut postings_writer, file.size)?;
599                write_u64(&mut postings_writer, modified.as_secs())?;
600                write_u32(&mut postings_writer, modified.subsec_nanos())?;
601                postings_writer.write_all(path.as_bytes())?;
602            }
603
604            let mut lookup_entries = Vec::new();
605            let mut postings_blob = Vec::new();
606            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
607            sorted_postings.sort_by_key(|(trigram, _)| **trigram);
608
609            for (trigram, postings) in sorted_postings {
610                let offset = u64::try_from(postings_blob.len())
611                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
612                let mut count = 0u32;
613
614                for posting in postings {
615                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
616                        continue;
617                    };
618
619                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
620                    postings_blob.push(posting.next_mask);
621                    postings_blob.push(posting.loc_mask);
622                    count = count.saturating_add(1);
623                }
624
625                if count > 0 {
626                    lookup_entries.push((*trigram, offset, count));
627                }
628            }
629
630            write_u64(
631                &mut postings_writer,
632                u64::try_from(postings_blob.len())
633                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
634            )?;
635            postings_writer.write_all(&postings_blob)?;
636            postings_writer.flush()?;
637            drop(postings_writer);
638
639            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
640            let entry_count = u32::try_from(lookup_entries.len())
641                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
642
643            lookup_writer.write_all(LOOKUP_MAGIC)?;
644            write_u32(&mut lookup_writer, INDEX_VERSION)?;
645            write_u32(&mut lookup_writer, entry_count)?;
646
647            for (trigram, offset, count) in lookup_entries {
648                write_u32(&mut lookup_writer, trigram)?;
649                write_u64(&mut lookup_writer, offset)?;
650                write_u32(&mut lookup_writer, count)?;
651            }
652
653            lookup_writer.flush()?;
654            drop(lookup_writer);
655
656            fs::rename(&tmp_postings, &postings_path)?;
657            fs::rename(&tmp_lookup, &lookup_path)?;
658
659            Ok(())
660        })();
661
662        if write_result.is_err() {
663            let _ = fs::remove_file(&tmp_postings);
664            let _ = fs::remove_file(&tmp_lookup);
665        }
666    }
667
668    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
669        let postings_path = cache_dir.join("postings.bin");
670        let lookup_path = cache_dir.join("lookup.bin");
671
672        let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
673        let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);
674
675        let mut magic = [0u8; 8];
676        postings_reader.read_exact(&mut magic).ok()?;
677        if &magic != INDEX_MAGIC {
678            return None;
679        }
680        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
681            return None;
682        }
683
684        let head_len = read_u32(&mut postings_reader).ok()? as usize;
685        let root_len = read_u32(&mut postings_reader).ok()? as usize;
686        let max_file_size = read_u64(&mut postings_reader).ok()?;
687        let file_count = read_u32(&mut postings_reader).ok()? as usize;
688
689        let mut head_bytes = vec![0u8; head_len];
690        postings_reader.read_exact(&mut head_bytes).ok()?;
691        let git_head = String::from_utf8(head_bytes)
692            .ok()
693            .filter(|head| !head.is_empty());
694
695        let mut root_bytes = vec![0u8; root_len];
696        postings_reader.read_exact(&mut root_bytes).ok()?;
697        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
698
699        let mut files = Vec::with_capacity(file_count);
700        let mut path_to_id = HashMap::new();
701        let mut unindexed_files = HashSet::new();
702
703        for file_id in 0..file_count {
704            let mut unindexed = [0u8; 1];
705            postings_reader.read_exact(&mut unindexed).ok()?;
706            let path_len = read_u32(&mut postings_reader).ok()? as usize;
707            let size = read_u64(&mut postings_reader).ok()?;
708            let secs = read_u64(&mut postings_reader).ok()?;
709            let nanos = read_u32(&mut postings_reader).ok()?;
710            let mut path_bytes = vec![0u8; path_len];
711            postings_reader.read_exact(&mut path_bytes).ok()?;
712            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
713            let full_path = project_root.join(relative_path);
714            let file_id_u32 = u32::try_from(file_id).ok()?;
715
716            files.push(FileEntry {
717                path: full_path.clone(),
718                size,
719                modified: UNIX_EPOCH + Duration::new(secs, nanos),
720            });
721            path_to_id.insert(full_path, file_id_u32);
722            if unindexed[0] == 1 {
723                unindexed_files.insert(file_id_u32);
724            }
725        }
726
727        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
728        let mut postings_blob = vec![0u8; postings_len];
729        postings_reader.read_exact(&mut postings_blob).ok()?;
730
731        let mut lookup_magic = [0u8; 8];
732        lookup_reader.read_exact(&mut lookup_magic).ok()?;
733        if &lookup_magic != LOOKUP_MAGIC {
734            return None;
735        }
736        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
737            return None;
738        }
739        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
740
741        let mut postings = HashMap::new();
742        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
743
744        for _ in 0..entry_count {
745            let trigram = read_u32(&mut lookup_reader).ok()?;
746            let offset = read_u64(&mut lookup_reader).ok()? as usize;
747            let count = read_u32(&mut lookup_reader).ok()? as usize;
748            let bytes_len = count.checked_mul(6)?;
749            let end = offset.checked_add(bytes_len)?;
750            if end > postings_blob.len() {
751                return None;
752            }
753
754            let mut trigram_postings = Vec::with_capacity(count);
755            for chunk in postings_blob[offset..end].chunks_exact(6) {
756                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
757                let posting = Posting {
758                    file_id,
759                    next_mask: chunk[4],
760                    loc_mask: chunk[5],
761                };
762                trigram_postings.push(posting.clone());
763                file_trigrams.entry(file_id).or_default().push(trigram);
764            }
765            postings.insert(trigram, trigram_postings);
766        }
767
768        Some(SearchIndex {
769            postings,
770            files,
771            path_to_id,
772            ready: true,
773            project_root,
774            git_head,
775            max_file_size,
776            file_trigrams,
777            unindexed_files,
778        })
779    }
780
781    pub(crate) fn stored_git_head(&self) -> Option<&str> {
782        self.git_head.as_deref()
783    }
784
785    pub(crate) fn set_ready(&mut self, ready: bool) {
786        self.ready = ready;
787    }
788
789    pub(crate) fn rebuild_or_refresh(
790        root: &Path,
791        max_file_size: u64,
792        current_head: Option<String>,
793        baseline: Option<SearchIndex>,
794    ) -> Self {
795        if current_head.is_none() {
796            return SearchIndex::build_with_limit(root, max_file_size);
797        }
798
799        if let Some(mut baseline) = baseline {
800            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
801            baseline.max_file_size = max_file_size;
802
803            if baseline.git_head == current_head {
804                // HEAD matches, but files may have changed on disk since the index was
805                // last written (e.g., uncommitted edits, stash pop, manual file changes
806                // while OpenCode was closed). Verify mtimes and re-index stale files.
807                verify_file_mtimes(&mut baseline);
808                baseline.ready = true;
809                return baseline;
810            }
811
812            if let (Some(previous), Some(current)) =
813                (baseline.git_head.clone(), current_head.clone())
814            {
815                let project_root = baseline.project_root.clone();
816                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
817                    baseline.git_head = Some(current);
818                    baseline.ready = true;
819                    return baseline;
820                }
821            }
822        }
823
824        SearchIndex::build_with_limit(root, max_file_size)
825    }
826
827    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
828        let file_id = u32::try_from(self.files.len()).ok()?;
829        let metadata = fs::metadata(path).ok();
830        let size = metadata
831            .as_ref()
832            .map_or(size_hint, |metadata| metadata.len());
833        let modified = metadata
834            .and_then(|metadata| metadata.modified().ok())
835            .unwrap_or(UNIX_EPOCH);
836
837        self.files.push(FileEntry {
838            path: path.to_path_buf(),
839            size,
840            modified,
841        });
842        self.path_to_id.insert(path.to_path_buf(), file_id);
843        Some(file_id)
844    }
845
846    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
847        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
848            return;
849        };
850        self.unindexed_files.insert(file_id);
851        self.file_trigrams.insert(file_id, Vec::new());
852    }
853
854    fn active_file_ids(&self) -> Vec<u32> {
855        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
856        ids.sort_unstable();
857        ids
858    }
859
860    fn is_active_file(&self, file_id: u32) -> bool {
861        self.files
862            .get(file_id as usize)
863            .map(|file| !file.path.as_os_str().is_empty())
864            .unwrap_or(false)
865    }
866
867    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
868        let Some(postings) = self.postings.get(&trigram) else {
869            return Vec::new();
870        };
871
872        let mut matches = Vec::with_capacity(postings.len());
873
874        for posting in postings {
875            if let Some(filter) = filter {
876                // next_mask: bloom filter check — the character following this trigram in the
877                // query must also appear after this trigram somewhere in the file.
878                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
879                    continue;
880                }
881                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
882                // between consecutive trigram pairs, but is NOT used as a single-trigram
883                // filter because the position in the query string has no relationship to
884                // the position in the file. Using it here causes false negatives.
885            }
886            if self.is_active_file(posting.file_id) {
887                matches.push(posting.file_id);
888            }
889        }
890
891        matches
892    }
893}
894
895fn search_candidate_file(
896    file: &FileEntry,
897    matcher: &SearchMatcher,
898    max_results: usize,
899    stop_after: usize,
900    total_matches: &AtomicUsize,
901    files_searched: &AtomicUsize,
902    files_with_matches: &AtomicUsize,
903    truncated: &AtomicBool,
904) -> Vec<SharedGrepMatch> {
905    if should_stop_search(truncated, total_matches, stop_after) {
906        return Vec::new();
907    }
908
909    let content = match read_indexed_file_bytes(&file.path) {
910        Some(content) => content,
911        None => return Vec::new(),
912    };
913    files_searched.fetch_add(1, Ordering::Relaxed);
914
915    let shared_path = Arc::new(file.path.clone());
916    let mut matches = Vec::new();
917    let mut line_starts = None;
918    let mut seen_lines = HashSet::new();
919    let mut matched_this_file = false;
920
921    match matcher {
922        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
923            let finder = memchr::memmem::Finder::new(needle);
924            let mut start = 0;
925
926            while let Some(position) = finder.find(&content[start..]) {
927                if should_stop_search(truncated, total_matches, stop_after) {
928                    break;
929                }
930
931                let offset = start + position;
932                start = offset + 1;
933
934                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
935                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
936                if !seen_lines.insert(line) {
937                    continue;
938                }
939
940                matched_this_file = true;
941                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
942                if match_number > max_results {
943                    truncated.store(true, Ordering::Relaxed);
944                    break;
945                }
946
947                let end = offset + needle.len();
948                matches.push(SharedGrepMatch {
949                    file: shared_path.clone(),
950                    line,
951                    column,
952                    line_text,
953                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
954                });
955            }
956        }
957        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
958            let search_content = content.to_ascii_lowercase();
959            let finder = memchr::memmem::Finder::new(needle);
960            let mut start = 0;
961
962            while let Some(position) = finder.find(&search_content[start..]) {
963                if should_stop_search(truncated, total_matches, stop_after) {
964                    break;
965                }
966
967                let offset = start + position;
968                start = offset + 1;
969
970                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
971                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
972                if !seen_lines.insert(line) {
973                    continue;
974                }
975
976                matched_this_file = true;
977                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
978                if match_number > max_results {
979                    truncated.store(true, Ordering::Relaxed);
980                    break;
981                }
982
983                let end = offset + needle.len();
984                matches.push(SharedGrepMatch {
985                    file: shared_path.clone(),
986                    line,
987                    column,
988                    line_text,
989                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
990                });
991            }
992        }
993        SearchMatcher::Regex(regex) => {
994            for matched in regex.find_iter(&content) {
995                if should_stop_search(truncated, total_matches, stop_after) {
996                    break;
997                }
998
999                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1000                let (line, column, line_text) =
1001                    line_details_bytes(&content, line_starts, matched.start());
1002                if !seen_lines.insert(line) {
1003                    continue;
1004                }
1005
1006                matched_this_file = true;
1007                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1008                if match_number > max_results {
1009                    truncated.store(true, Ordering::Relaxed);
1010                    break;
1011                }
1012
1013                matches.push(SharedGrepMatch {
1014                    file: shared_path.clone(),
1015                    line,
1016                    column,
1017                    line_text,
1018                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1019                });
1020            }
1021        }
1022    }
1023
1024    if matched_this_file {
1025        files_with_matches.fetch_add(1, Ordering::Relaxed);
1026    }
1027
1028    matches
1029}
1030
1031fn should_stop_search(
1032    truncated: &AtomicBool,
1033    total_matches: &AtomicUsize,
1034    stop_after: usize,
1035) -> bool {
1036    truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1037}
1038
1039fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1040    let mut merged = Vec::with_capacity(left.len().min(right.len()));
1041    let mut left_index = 0;
1042    let mut right_index = 0;
1043
1044    while left_index < left.len() && right_index < right.len() {
1045        match left[left_index].cmp(&right[right_index]) {
1046            std::cmp::Ordering::Less => left_index += 1,
1047            std::cmp::Ordering::Greater => right_index += 1,
1048            std::cmp::Ordering::Equal => {
1049                merged.push(left[left_index]);
1050                left_index += 1;
1051                right_index += 1;
1052            }
1053        }
1054    }
1055
1056    merged
1057}
1058
1059fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1060    let mut merged = Vec::with_capacity(left.len() + right.len());
1061    let mut left_index = 0;
1062    let mut right_index = 0;
1063
1064    while left_index < left.len() && right_index < right.len() {
1065        match left[left_index].cmp(&right[right_index]) {
1066            std::cmp::Ordering::Less => {
1067                merged.push(left[left_index]);
1068                left_index += 1;
1069            }
1070            std::cmp::Ordering::Greater => {
1071                merged.push(right[right_index]);
1072                right_index += 1;
1073            }
1074            std::cmp::Ordering::Equal => {
1075                merged.push(left[left_index]);
1076                left_index += 1;
1077                right_index += 1;
1078            }
1079        }
1080    }
1081
1082    merged.extend_from_slice(&left[left_index..]);
1083    merged.extend_from_slice(&right[right_index..]);
1084    merged
1085}
1086
1087pub fn decompose_regex(pattern: &str) -> RegexQuery {
1088    let hir = match regex_syntax::parse(pattern) {
1089        Ok(hir) => hir,
1090        Err(_) => return RegexQuery::default(),
1091    };
1092
1093    let build = build_query(&hir);
1094    build.into_query()
1095}
1096
1097pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1098    ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1099}
1100
1101pub fn normalize_char(c: u8) -> u8 {
1102    c.to_ascii_lowercase()
1103}
1104
1105pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1106    if content.len() < 3 {
1107        return Vec::new();
1108    }
1109
1110    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1111    for start in 0..=content.len() - 3 {
1112        let trigram = pack_trigram(
1113            normalize_char(content[start]),
1114            normalize_char(content[start + 1]),
1115            normalize_char(content[start + 2]),
1116        );
1117        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1118        trigrams.push((trigram, next_char, start));
1119    }
1120    trigrams
1121}
1122
1123pub fn resolve_cache_dir(project_root: &Path) -> PathBuf {
1124    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's cache
1125    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1126        return PathBuf::from(override_dir)
1127            .join("index")
1128            .join(project_cache_key(project_root));
1129    }
1130    let home = std::env::var_os("HOME")
1131        .map(PathBuf::from)
1132        .unwrap_or_else(|| PathBuf::from("."));
1133    home.join(".cache")
1134        .join("aft")
1135        .join("index")
1136        .join(project_cache_key(project_root))
1137}
1138
1139pub(crate) fn build_path_filters(
1140    include: &[String],
1141    exclude: &[String],
1142) -> Result<PathFilters, String> {
1143    Ok(PathFilters {
1144        includes: build_globset(include)?,
1145        excludes: build_globset(exclude)?,
1146    })
1147}
1148
1149pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1150    walk_project_files_from(root, root, filters)
1151}
1152
1153pub(crate) fn walk_project_files_from(
1154    filter_root: &Path,
1155    search_root: &Path,
1156    filters: &PathFilters,
1157) -> Vec<PathBuf> {
1158    let mut builder = WalkBuilder::new(search_root);
1159    builder
1160        .hidden(false)
1161        .git_ignore(true)
1162        .git_global(true)
1163        .git_exclude(true)
1164        .filter_entry(|entry| {
1165            let name = entry.file_name().to_string_lossy();
1166            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1167                return !matches!(
1168                    name.as_ref(),
1169                    "node_modules"
1170                        | "target"
1171                        | "venv"
1172                        | ".venv"
1173                        | ".git"
1174                        | "__pycache__"
1175                        | ".tox"
1176                        | "dist"
1177                        | "build"
1178                );
1179            }
1180            true
1181        });
1182
1183    let mut files = Vec::new();
1184    for entry in builder.build().filter_map(|entry| entry.ok()) {
1185        if !entry
1186            .file_type()
1187            .map_or(false, |file_type| file_type.is_file())
1188        {
1189            continue;
1190        }
1191        let path = entry.into_path();
1192        if filters.matches(filter_root, &path) {
1193            files.push(path);
1194        }
1195    }
1196
1197    sort_paths_by_mtime_desc(&mut files);
1198    files
1199}
1200
1201pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1202    let bytes = fs::read(path).ok()?;
1203    if is_binary_bytes(&bytes) {
1204        return None;
1205    }
1206    String::from_utf8(bytes).ok()
1207}
1208
1209fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1210    fs::read(path).ok()
1211}
1212
1213pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1214    path.strip_prefix(root)
1215        .map(PathBuf::from)
1216        .unwrap_or_else(|_| path.to_path_buf())
1217}
1218
1219pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1220    paths.sort_by(|left, right| {
1221        path_modified_time(right)
1222            .cmp(&path_modified_time(left))
1223            .then_with(|| left.cmp(right))
1224    });
1225}
1226
1227pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1228    matches.sort_by(|left, right| {
1229        let left_path = resolve_match_path(project_root, &left.file);
1230        let right_path = resolve_match_path(project_root, &right.file);
1231
1232        path_modified_time(&right_path)
1233            .cmp(&path_modified_time(&left_path))
1234            .then_with(|| left.file.cmp(&right.file))
1235            .then_with(|| left.line.cmp(&right.line))
1236            .then_with(|| left.column.cmp(&right.column))
1237    });
1238}
1239
1240fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1241    matches: &mut [SharedGrepMatch],
1242    modified_for_path: F,
1243) where
1244    F: Fn(&Path) -> Option<SystemTime>,
1245{
1246    matches.sort_by(|left, right| {
1247        modified_for_path(right.file.as_path())
1248            .cmp(&modified_for_path(left.file.as_path()))
1249            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1250            .then_with(|| left.line.cmp(&right.line))
1251            .then_with(|| left.column.cmp(&right.column))
1252    });
1253}
1254
1255pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1256    let resolved_project_root = canonicalize_or_normalize(project_root);
1257    let root = match path {
1258        Some(path) => {
1259            let path = PathBuf::from(path);
1260            if path.is_absolute() {
1261                canonicalize_or_normalize(&path)
1262            } else {
1263                normalize_path(&resolved_project_root.join(path))
1264            }
1265        }
1266        None => resolved_project_root.clone(),
1267    };
1268
1269    let use_index = is_within_search_root(&resolved_project_root, &root);
1270    SearchScope { root, use_index }
1271}
1272
1273pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1274    content_inspector::inspect(content).is_binary()
1275}
1276
1277pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1278    run_git(root, &["rev-parse", "HEAD"])
1279}
1280
1281pub(crate) fn project_cache_key(project_root: &Path) -> String {
1282    use sha2::{Digest, Sha256};
1283
1284    let mut hasher = Sha256::new();
1285
1286    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1287        // Git repo: root commit is the unique identity.
1288        // Same repo cloned anywhere produces the same key.
1289        hasher.update(root_commit.as_bytes());
1290    } else {
1291        // Non-git project: use the canonical filesystem path as identity.
1292        let canonical_root = canonicalize_or_normalize(project_root);
1293        hasher.update(canonical_root.to_string_lossy().as_bytes());
1294    }
1295
1296    let digest = format!("{:x}", hasher.finalize());
1297    digest[..16].to_string()
1298}
1299
1300impl PathFilters {
1301    fn matches(&self, root: &Path, path: &Path) -> bool {
1302        let relative = to_glob_path(&relative_to_root(root, path));
1303        if self
1304            .includes
1305            .as_ref()
1306            .is_some_and(|includes| !includes.is_match(&relative))
1307        {
1308            return false;
1309        }
1310        if self
1311            .excludes
1312            .as_ref()
1313            .is_some_and(|excludes| excludes.is_match(&relative))
1314        {
1315            return false;
1316        }
1317        true
1318    }
1319}
1320
1321fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1322    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1323}
1324
1325fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1326    if path.is_absolute() {
1327        path.to_path_buf()
1328    } else {
1329        project_root.join(path)
1330    }
1331}
1332
1333fn path_modified_time(path: &Path) -> Option<SystemTime> {
1334    fs::metadata(path)
1335        .and_then(|metadata| metadata.modified())
1336        .ok()
1337}
1338
1339fn normalize_path(path: &Path) -> PathBuf {
1340    let mut result = PathBuf::new();
1341    for component in path.components() {
1342        match component {
1343            Component::ParentDir => {
1344                if !result.pop() {
1345                    result.push(component);
1346                }
1347            }
1348            Component::CurDir => {}
1349            _ => result.push(component),
1350        }
1351    }
1352    result
1353}
1354
1355/// Verify stored file mtimes against disk. Re-index any files whose mtime changed
1356/// since the index was last written. Also detect new files and deleted files.
1357fn verify_file_mtimes(index: &mut SearchIndex) {
1358    // Collect stale files (mtime mismatch or deleted)
1359    let mut stale_paths = Vec::new();
1360    for entry in &index.files {
1361        if entry.path.as_os_str().is_empty() {
1362            continue; // tombstoned entry
1363        }
1364        match fs::metadata(&entry.path) {
1365            Ok(meta) => {
1366                let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1367                if current_mtime != entry.modified || meta.len() != entry.size {
1368                    stale_paths.push(entry.path.clone());
1369                }
1370            }
1371            Err(_) => {
1372                // File deleted
1373                stale_paths.push(entry.path.clone());
1374            }
1375        }
1376    }
1377
1378    // Re-index stale files
1379    for path in &stale_paths {
1380        index.update_file(path);
1381    }
1382
1383    // Detect new files not in the index
1384    let filters = PathFilters::default();
1385    for path in walk_project_files(&index.project_root, &filters) {
1386        if !index.path_to_id.contains_key(&path) {
1387            index.update_file(&path);
1388        }
1389    }
1390
1391    if !stale_paths.is_empty() {
1392        log::info!(
1393            "[aft] search index: refreshed {} stale file(s) from disk cache",
1394            stale_paths.len()
1395        );
1396    }
1397}
1398
1399fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1400    path.starts_with(search_root)
1401}
1402
1403impl QueryBuild {
1404    fn into_query(self) -> RegexQuery {
1405        let mut query = RegexQuery::default();
1406
1407        for run in self.and_runs {
1408            add_run_to_and_query(&mut query, &run);
1409        }
1410
1411        for group in self.or_groups {
1412            let mut trigrams = BTreeSet::new();
1413            let mut filters = HashMap::new();
1414            for run in group {
1415                for (trigram, filter) in trigram_filters(&run) {
1416                    trigrams.insert(trigram);
1417                    merge_filter(filters.entry(trigram).or_default(), filter);
1418                }
1419            }
1420            if !trigrams.is_empty() {
1421                query.or_groups.push(trigrams.into_iter().collect());
1422                query.or_filters.push(filters);
1423            }
1424        }
1425
1426        query
1427    }
1428}
1429
1430fn build_query(hir: &Hir) -> QueryBuild {
1431    match hir.kind() {
1432        HirKind::Literal(literal) => {
1433            if literal.0.len() >= 3 {
1434                QueryBuild {
1435                    and_runs: vec![literal.0.to_vec()],
1436                    or_groups: Vec::new(),
1437                }
1438            } else {
1439                QueryBuild::default()
1440            }
1441        }
1442        HirKind::Capture(capture) => build_query(&capture.sub),
1443        HirKind::Concat(parts) => {
1444            let mut build = QueryBuild::default();
1445            for part in parts {
1446                let part_build = build_query(part);
1447                build.and_runs.extend(part_build.and_runs);
1448                build.or_groups.extend(part_build.or_groups);
1449            }
1450            build
1451        }
1452        HirKind::Alternation(parts) => {
1453            let mut group = Vec::new();
1454            for part in parts {
1455                let Some(mut choices) = guaranteed_run_choices(part) else {
1456                    return QueryBuild::default();
1457                };
1458                group.append(&mut choices);
1459            }
1460            if group.is_empty() {
1461                QueryBuild::default()
1462            } else {
1463                QueryBuild {
1464                    and_runs: Vec::new(),
1465                    or_groups: vec![group],
1466                }
1467            }
1468        }
1469        HirKind::Repetition(repetition) => {
1470            if repetition.min == 0 {
1471                QueryBuild::default()
1472            } else {
1473                build_query(&repetition.sub)
1474            }
1475        }
1476        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1477    }
1478}
1479
1480fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1481    match hir.kind() {
1482        HirKind::Literal(literal) => {
1483            if literal.0.len() >= 3 {
1484                Some(vec![literal.0.to_vec()])
1485            } else {
1486                None
1487            }
1488        }
1489        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1490        HirKind::Concat(parts) => {
1491            let mut runs = Vec::new();
1492            for part in parts {
1493                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1494                    runs.append(&mut part_runs);
1495                }
1496            }
1497            if runs.is_empty() {
1498                None
1499            } else {
1500                Some(runs)
1501            }
1502        }
1503        HirKind::Alternation(parts) => {
1504            let mut runs = Vec::new();
1505            for part in parts {
1506                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1507                    return None;
1508                };
1509                runs.append(&mut part_runs);
1510            }
1511            if runs.is_empty() {
1512                None
1513            } else {
1514                Some(runs)
1515            }
1516        }
1517        HirKind::Repetition(repetition) => {
1518            if repetition.min == 0 {
1519                None
1520            } else {
1521                guaranteed_run_choices(&repetition.sub)
1522            }
1523        }
1524        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1525    }
1526}
1527
1528fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1529    for (trigram, filter) in trigram_filters(run) {
1530        if !query.and_trigrams.contains(&trigram) {
1531            query.and_trigrams.push(trigram);
1532        }
1533        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1534    }
1535}
1536
1537fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1538    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1539    for (trigram, next_char, position) in extract_trigrams(run) {
1540        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1541        if next_char != EOF_SENTINEL {
1542            entry.next_mask |= mask_for_next_char(next_char);
1543        }
1544        entry.loc_mask |= mask_for_position(position);
1545    }
1546    filters.into_iter().collect()
1547}
1548
1549fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1550    target.next_mask |= filter.next_mask;
1551    target.loc_mask |= filter.loc_mask;
1552}
1553
1554fn mask_for_next_char(next_char: u8) -> u8 {
1555    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1556    1u8 << bit
1557}
1558
1559fn mask_for_position(position: usize) -> u8 {
1560    1u8 << (position % 8)
1561}
1562
1563fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1564    if patterns.is_empty() {
1565        return Ok(None);
1566    }
1567
1568    let mut builder = GlobSetBuilder::new();
1569    for pattern in patterns {
1570        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1571        builder.add(glob);
1572    }
1573    builder.build().map(Some).map_err(|error| error.to_string())
1574}
1575
1576fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1577    let mut buffer = [0u8; 4];
1578    reader.read_exact(&mut buffer)?;
1579    Ok(u32::from_le_bytes(buffer))
1580}
1581
1582fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1583    let mut buffer = [0u8; 8];
1584    reader.read_exact(&mut buffer)?;
1585    Ok(u64::from_le_bytes(buffer))
1586}
1587
1588fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1589    writer.write_all(&value.to_le_bytes())
1590}
1591
1592fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1593    writer.write_all(&value.to_le_bytes())
1594}
1595
1596fn run_git(root: &Path, args: &[&str]) -> Option<String> {
1597    let output = Command::new("git")
1598        .arg("-C")
1599        .arg(root)
1600        .args(args)
1601        .output()
1602        .ok()?;
1603    if !output.status.success() {
1604        return None;
1605    }
1606    let value = String::from_utf8(output.stdout).ok()?;
1607    let value = value.trim().to_string();
1608    if value.is_empty() {
1609        None
1610    } else {
1611        Some(value)
1612    }
1613}
1614
1615fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1616    let diff_range = format!("{}..{}", from, to);
1617    let output = match Command::new("git")
1618        .arg("-C")
1619        .arg(root)
1620        .args(["diff", "--name-only", &diff_range])
1621        .output()
1622    {
1623        Ok(output) => output,
1624        Err(_) => return false,
1625    };
1626
1627    if !output.status.success() {
1628        return false;
1629    }
1630
1631    let Ok(paths) = String::from_utf8(output.stdout) else {
1632        return false;
1633    };
1634
1635    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1636        let path = root.join(relative_path);
1637        if path.exists() {
1638            index.update_file(&path);
1639        } else {
1640            index.remove_file(&path);
1641        }
1642    }
1643
1644    true
1645}
1646
1647fn is_binary_path(path: &Path, size: u64) -> bool {
1648    if size == 0 {
1649        return false;
1650    }
1651
1652    let mut file = match File::open(path) {
1653        Ok(file) => file,
1654        Err(_) => return true,
1655    };
1656
1657    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1658    match file.read(&mut preview) {
1659        Ok(read) => is_binary_bytes(&preview[..read]),
1660        Err(_) => true,
1661    }
1662}
1663
1664fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
1665    let mut starts = vec![0usize];
1666    for (index, byte) in content.iter().copied().enumerate() {
1667        if byte == b'\n' {
1668            starts.push(index + 1);
1669        }
1670    }
1671    starts
1672}
1673
1674fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
1675    let line_index = match line_starts.binary_search(&offset) {
1676        Ok(index) => index,
1677        Err(index) => index.saturating_sub(1),
1678    };
1679    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
1680    let line_end = content[line_start..]
1681        .iter()
1682        .position(|byte| *byte == b'\n')
1683        .map(|length| line_start + length)
1684        .unwrap_or(content.len());
1685    let mut line_slice = &content[line_start..line_end];
1686    if line_slice.ends_with(b"\r") {
1687        line_slice = &line_slice[..line_slice.len() - 1];
1688    }
1689    let line_text = String::from_utf8_lossy(line_slice).into_owned();
1690    let column = String::from_utf8_lossy(&content[line_start..offset])
1691        .chars()
1692        .count() as u32
1693        + 1;
1694    (line_index as u32 + 1, column, line_text)
1695}
1696
1697fn to_glob_path(path: &Path) -> String {
1698    path.to_string_lossy().replace('\\', "/")
1699}
1700
1701#[cfg(test)]
1702mod tests {
1703    use std::process::Command;
1704
1705    use super::*;
1706
1707    #[test]
1708    fn extract_trigrams_tracks_next_char_and_position() {
1709        let trigrams = extract_trigrams(b"Rust");
1710        assert_eq!(trigrams.len(), 2);
1711        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
1712        assert_eq!(
1713            trigrams[1],
1714            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
1715        );
1716    }
1717
1718    #[test]
1719    fn decompose_regex_extracts_literals_and_alternations() {
1720        let query = decompose_regex("abc(def|ghi)xyz");
1721        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
1722        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
1723        assert_eq!(query.or_groups.len(), 1);
1724        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
1725        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
1726    }
1727
1728    #[test]
1729    fn candidates_intersect_posting_lists() {
1730        let mut index = SearchIndex::new();
1731        let dir = tempfile::tempdir().expect("create temp dir");
1732        let alpha = dir.path().join("alpha.txt");
1733        let beta = dir.path().join("beta.txt");
1734        fs::write(&alpha, "abcdef").expect("write alpha");
1735        fs::write(&beta, "abcxyz").expect("write beta");
1736        index.project_root = dir.path().to_path_buf();
1737        index.index_file(&alpha, b"abcdef");
1738        index.index_file(&beta, b"abcxyz");
1739
1740        let query = RegexQuery {
1741            and_trigrams: vec![
1742                pack_trigram(b'a', b'b', b'c'),
1743                pack_trigram(b'd', b'e', b'f'),
1744            ],
1745            ..RegexQuery::default()
1746        };
1747
1748        let candidates = index.candidates(&query);
1749        assert_eq!(candidates.len(), 1);
1750        assert_eq!(index.files[candidates[0] as usize].path, alpha);
1751    }
1752
1753    #[test]
1754    fn candidates_apply_bloom_filters() {
1755        let mut index = SearchIndex::new();
1756        let dir = tempfile::tempdir().expect("create temp dir");
1757        let file = dir.path().join("sample.txt");
1758        fs::write(&file, "abcd efgh").expect("write sample");
1759        index.project_root = dir.path().to_path_buf();
1760        index.index_file(&file, b"abcd efgh");
1761
1762        let trigram = pack_trigram(b'a', b'b', b'c');
1763        let matching_filter = PostingFilter {
1764            next_mask: mask_for_next_char(b'd'),
1765            loc_mask: mask_for_position(0),
1766        };
1767        let non_matching_filter = PostingFilter {
1768            next_mask: mask_for_next_char(b'z'),
1769            loc_mask: mask_for_position(0),
1770        };
1771
1772        assert_eq!(
1773            index
1774                .postings_for_trigram(trigram, Some(matching_filter))
1775                .len(),
1776            1
1777        );
1778        assert!(index
1779            .postings_for_trigram(trigram, Some(non_matching_filter))
1780            .is_empty());
1781    }
1782
1783    #[test]
1784    fn disk_round_trip_preserves_postings_and_files() {
1785        let dir = tempfile::tempdir().expect("create temp dir");
1786        let project = dir.path().join("project");
1787        fs::create_dir_all(&project).expect("create project dir");
1788        let file = project.join("src.txt");
1789        fs::write(&file, "abcdef").expect("write source");
1790
1791        let mut index = SearchIndex::build(&project);
1792        index.git_head = Some("deadbeef".to_string());
1793        let cache_dir = dir.path().join("cache");
1794        index.write_to_disk(&cache_dir, index.git_head.as_deref());
1795
1796        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
1797        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
1798        assert_eq!(loaded.files.len(), 1);
1799        assert_eq!(
1800            relative_to_root(&loaded.project_root, &loaded.files[0].path),
1801            PathBuf::from("src.txt")
1802        );
1803        assert_eq!(loaded.postings.len(), index.postings.len());
1804        assert!(loaded
1805            .postings
1806            .contains_key(&pack_trigram(b'a', b'b', b'c')));
1807    }
1808
1809    #[test]
1810    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
1811        let dir = tempfile::tempdir().expect("create temp dir");
1812        let project = dir.path().join("project");
1813        fs::create_dir_all(&project).expect("create project dir");
1814        fs::write(project.join("src.txt"), "abcdef").expect("write source");
1815
1816        let index = SearchIndex::build(&project);
1817        let cache_dir = dir.path().join("cache");
1818        index.write_to_disk(&cache_dir, None);
1819
1820        assert!(cache_dir.join("postings.bin").is_file());
1821        assert!(cache_dir.join("lookup.bin").is_file());
1822        assert!(!cache_dir.join("postings.bin.tmp").exists());
1823        assert!(!cache_dir.join("lookup.bin.tmp").exists());
1824    }
1825
1826    #[test]
1827    fn project_cache_key_includes_checkout_path() {
1828        let dir = tempfile::tempdir().expect("create temp dir");
1829        let source = dir.path().join("source");
1830        fs::create_dir_all(&source).expect("create source repo dir");
1831        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
1832
1833        assert!(Command::new("git")
1834            .current_dir(&source)
1835            .args(["init"])
1836            .status()
1837            .expect("init git repo")
1838            .success());
1839        assert!(Command::new("git")
1840            .current_dir(&source)
1841            .args(["add", "."])
1842            .status()
1843            .expect("git add")
1844            .success());
1845        assert!(Command::new("git")
1846            .current_dir(&source)
1847            .args([
1848                "-c",
1849                "user.name=AFT Tests",
1850                "-c",
1851                "user.email=aft-tests@example.com",
1852                "commit",
1853                "-m",
1854                "initial",
1855            ])
1856            .status()
1857            .expect("git commit")
1858            .success());
1859
1860        let clone = dir.path().join("clone");
1861        assert!(Command::new("git")
1862            .args(["clone", "--quiet"])
1863            .arg(&source)
1864            .arg(&clone)
1865            .status()
1866            .expect("git clone")
1867            .success());
1868
1869        let source_key = project_cache_key(&source);
1870        let clone_key = project_cache_key(&clone);
1871
1872        assert_eq!(source_key.len(), 16);
1873        assert_eq!(clone_key.len(), 16);
1874        // Same repo (same root commit) → same cache key regardless of clone path
1875        assert_eq!(source_key, clone_key);
1876    }
1877
1878    #[test]
1879    fn resolve_search_scope_disables_index_for_external_path() {
1880        let dir = tempfile::tempdir().expect("create temp dir");
1881        let project = dir.path().join("project");
1882        let outside = dir.path().join("outside");
1883        fs::create_dir_all(&project).expect("create project dir");
1884        fs::create_dir_all(&outside).expect("create outside dir");
1885
1886        let scope = resolve_search_scope(&project, outside.to_str());
1887
1888        assert_eq!(
1889            scope.root,
1890            fs::canonicalize(&outside).expect("canonicalize outside")
1891        );
1892        assert!(!scope.use_index);
1893    }
1894
1895    #[test]
1896    fn grep_filters_matches_to_search_root() {
1897        let dir = tempfile::tempdir().expect("create temp dir");
1898        let project = dir.path().join("project");
1899        let src = project.join("src");
1900        let docs = project.join("docs");
1901        fs::create_dir_all(&src).expect("create src dir");
1902        fs::create_dir_all(&docs).expect("create docs dir");
1903        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
1904        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
1905
1906        let index = SearchIndex::build(&project);
1907        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
1908
1909        assert_eq!(result.files_searched, 1);
1910        assert_eq!(result.files_with_matches, 1);
1911        assert_eq!(result.matches.len(), 1);
1912        // Index stores canonicalized paths; on macOS /var → /private/var
1913        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
1914        assert_eq!(result.matches[0].file, expected);
1915    }
1916
1917    #[test]
1918    fn grep_deduplicates_multiple_matches_on_same_line() {
1919        let dir = tempfile::tempdir().expect("create temp dir");
1920        let project = dir.path().join("project");
1921        let src = project.join("src");
1922        fs::create_dir_all(&src).expect("create src dir");
1923        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
1924
1925        let index = SearchIndex::build(&project);
1926        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
1927
1928        assert_eq!(result.total_matches, 1);
1929        assert_eq!(result.matches.len(), 1);
1930    }
1931
1932    #[test]
1933    fn grep_reports_total_matches_before_truncation() {
1934        let dir = tempfile::tempdir().expect("create temp dir");
1935        let project = dir.path().join("project");
1936        let src = project.join("src");
1937        fs::create_dir_all(&src).expect("create src dir");
1938        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
1939
1940        let index = SearchIndex::build(&project);
1941        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
1942
1943        assert_eq!(result.total_matches, 2);
1944        assert_eq!(result.matches.len(), 1);
1945        assert!(result.truncated);
1946    }
1947
1948    #[test]
1949    fn glob_filters_results_to_search_root() {
1950        let dir = tempfile::tempdir().expect("create temp dir");
1951        let project = dir.path().join("project");
1952        let src = project.join("src");
1953        let scripts = project.join("scripts");
1954        fs::create_dir_all(&src).expect("create src dir");
1955        fs::create_dir_all(&scripts).expect("create scripts dir");
1956        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
1957        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
1958
1959        let index = SearchIndex::build(&project);
1960        let files = index.glob("**/*.rs", &src);
1961
1962        assert_eq!(
1963            files,
1964            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
1965        );
1966    }
1967
1968    #[test]
1969    fn glob_includes_hidden_and_binary_files() {
1970        let dir = tempfile::tempdir().expect("create temp dir");
1971        let project = dir.path().join("project");
1972        let hidden_dir = project.join(".hidden");
1973        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
1974        let hidden_file = hidden_dir.join("data.bin");
1975        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
1976
1977        let index = SearchIndex::build(&project);
1978        let files = index.glob("**/*.bin", &project);
1979
1980        assert_eq!(
1981            files,
1982            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
1983        );
1984    }
1985}
aft/search_index.rs

aft/
search_index.rs