//! aft/search_index.rs — trigram-based search index.

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::time::{Duration, SystemTime, UNIX_EPOCH};
7
8use globset::{Glob, GlobSet, GlobSetBuilder};
9use ignore::WalkBuilder;
10use regex::RegexBuilder;
11use regex_syntax::hir::{Hir, HirKind};
12
/// Files larger than this (1 MiB) are tracked but not trigram-indexed.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
/// Magic header identifying the postings cache file.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// Magic header identifying the trigram lookup cache file.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// Bump when the on-disk cache layout changes.
const INDEX_VERSION: u32 = 1;
// NOTE(review): PREVIEW_BYTES is not referenced in this chunk — presumably
// used elsewhere in the file.
const PREVIEW_BYTES: usize = 8 * 1024;
/// Stand-in "next character" for a trigram at end of input.
const EOF_SENTINEL: u8 = 0;
/// In-memory trigram index over a project's text files.
///
/// Maps each trigram to the files containing it (`postings`) so regex
/// searches can narrow the candidate file set before running the regex.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// trigram -> postings, each list sorted by `file_id`.
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table indexed by file id; removed files keep their slot with an
    /// emptied path so remaining ids stay valid.
    pub files: Vec<FileEntry>,
    /// Reverse lookup from path to file id (active files only).
    pub path_to_id: HashMap<PathBuf, u32>,
    /// True once a full build or cache load has completed.
    pub ready: bool,
    project_root: PathBuf,
    /// Git HEAD recorded when the index was built, if known.
    git_head: Option<String>,
    /// Files larger than this are tracked but not trigram-indexed.
    max_file_size: u64,
    /// file id -> trigrams it contributed (needed for removal).
    file_trigrams: HashMap<u32, Vec<u32>>,
    /// Tracked-but-unindexed files (e.g. oversized); always grep candidates.
    unindexed_files: HashSet<u32>,
}
32
/// One file's entry in a trigram's posting list.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    pub file_id: u32,
    /// Bloom mask of characters observed immediately after this trigram.
    pub next_mask: u8,
    /// Bloom mask of occurrence positions (mod 8); reserved for future
    /// adjacency filtering (see `postings_for_trigram`).
    pub loc_mask: u8,
}
39
/// Metadata for a tracked file. A removed file keeps its slot in
/// `SearchIndex::files` with an empty `path`.
#[derive(Clone, Debug)]
pub struct FileEntry {
    pub path: PathBuf,
    pub size: u64,
    pub modified: SystemTime,
}
46
/// A single reported match: the first regex hit on a given line of a file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    pub file: PathBuf,
    // Line/column are whatever `line_details` computes; its base (0 or 1)
    // is not visible in this chunk — TODO confirm before documenting.
    pub line: u32,
    pub column: u32,
    /// Full text of the line containing the match.
    pub line_text: String,
    /// The exact substring the regex matched.
    pub match_text: String,
}
55
/// Aggregate results of one grep invocation.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Up to `max_results` matches, ordered newest file first.
    pub matches: Vec<GrepMatch>,
    /// All matching lines found, including any dropped past the cap.
    pub total_matches: usize,
    pub files_searched: usize,
    pub files_with_matches: usize,
    pub index_status: IndexStatus,
    /// True when `matches` was capped below `total_matches`.
    pub truncated: bool,
}
65
/// State of the index backing a search: fully built, still building, or a
/// non-index fallback path.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    Ready,
    Building,
    Fallback,
}
72
73impl IndexStatus {
74    pub fn as_str(&self) -> &'static str {
75        match self {
76            IndexStatus::Ready => "Ready",
77            IndexStatus::Building => "Building",
78            IndexStatus::Fallback => "Fallback",
79        }
80    }
81}
82
/// Trigram requirements derived from a regex pattern.
///
/// A candidate file must contain every trigram in `and_trigrams` and, for
/// each group in `or_groups`, at least one of that group's trigrams. An
/// empty query matches every file.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    pub and_trigrams: Vec<u32>,
    pub or_groups: Vec<Vec<u32>>,
    /// Per-trigram posting filters for the AND set.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Posting filters for each OR group, parallel to `or_groups`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
90
/// Masks a query requires of a posting (applied in `postings_for_trigram`).
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    /// Required follow-character bloom bits; 0 disables the check.
    next_mask: u8,
    /// Position bits; deliberately unused as a single-trigram filter
    /// (see the note in `postings_for_trigram`).
    loc_mask: u8,
}
96
/// Intermediate state collected while decomposing a regex; converted into a
/// `RegexQuery` via `into_query` (defined elsewhere in this file).
// NOTE(review): presumably these hold byte runs of required literals —
// confirm against `build_query`/`into_query`, which are not in this chunk.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    and_runs: Vec<Vec<u8>>,
    or_groups: Vec<Vec<Vec<u8>>>,
}
102
/// Optional include/exclude glob sets; `None` means "no constraint".
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    includes: Option<GlobSet>,
    excludes: Option<GlobSet>,
}
108
/// A resolved search root plus whether the trigram index applies to it.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    pub root: PathBuf,
    pub use_index: bool,
}
114
115impl SearchIndex {
    /// Creates an empty index that is not yet marked ready.
    pub fn new() -> Self {
        SearchIndex {
            postings: HashMap::new(),
            files: Vec::new(),
            path_to_id: HashMap::new(),
            ready: false,
            project_root: PathBuf::new(),
            git_head: None,
            max_file_size: DEFAULT_MAX_FILE_SIZE,
            file_trigrams: HashMap::new(),
            unindexed_files: HashSet::new(),
        }
    }
129
    /// Builds a full index of `root` using the default file-size limit.
    pub fn build(root: &Path) -> Self {
        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
    }
133
134    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
135        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
136        let mut index = SearchIndex {
137            project_root: project_root.clone(),
138            max_file_size,
139            ..SearchIndex::new()
140        };
141
142        let filters = PathFilters::default();
143        for path in walk_project_files(&project_root, &filters) {
144            index.update_file(&path);
145        }
146
147        index.git_head = current_git_head(&project_root);
148        index.ready = true;
149        index
150    }
151
152    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
153        self.remove_file(path);
154
155        let file_id = match self.allocate_file_id(path, content.len() as u64) {
156            Some(file_id) => file_id,
157            None => return,
158        };
159
160        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
161        for (trigram, next_char, position) in extract_trigrams(content) {
162            let entry = trigram_map.entry(trigram).or_default();
163            entry.next_mask |= mask_for_next_char(next_char);
164            entry.loc_mask |= mask_for_position(position);
165        }
166
167        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
168        for (trigram, filter) in trigram_map {
169            self.postings.entry(trigram).or_default().push(Posting {
170                file_id,
171                next_mask: filter.next_mask,
172                loc_mask: filter.loc_mask,
173            });
174            file_trigrams.push(trigram);
175        }
176
177        for postings in self.postings.values_mut() {
178            postings.sort_by_key(|posting| posting.file_id);
179        }
180
181        self.file_trigrams.insert(file_id, file_trigrams);
182        self.unindexed_files.remove(&file_id);
183    }
184
185    pub fn remove_file(&mut self, path: &Path) {
186        let Some(file_id) = self.path_to_id.remove(path) else {
187            return;
188        };
189
190        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
191            for trigram in trigrams {
192                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
193                    postings.retain(|posting| posting.file_id != file_id);
194                    postings.is_empty()
195                } else {
196                    false
197                };
198
199                if should_remove {
200                    self.postings.remove(&trigram);
201                }
202            }
203        }
204
205        self.unindexed_files.remove(&file_id);
206        if let Some(file) = self.files.get_mut(file_id as usize) {
207            file.path = PathBuf::new();
208            file.size = 0;
209            file.modified = UNIX_EPOCH;
210        }
211    }
212
213    pub fn update_file(&mut self, path: &Path) {
214        self.remove_file(path);
215
216        let metadata = match fs::metadata(path) {
217            Ok(metadata) if metadata.is_file() => metadata,
218            _ => return,
219        };
220
221        if is_binary_path(path, metadata.len()) {
222            return;
223        }
224
225        if metadata.len() > self.max_file_size {
226            self.track_unindexed_file(path, &metadata);
227            return;
228        }
229
230        let content = match fs::read(path) {
231            Ok(content) => content,
232            Err(_) => return,
233        };
234
235        if is_binary_bytes(&content) {
236            return;
237        }
238
239        self.index_file(path, &content);
240    }
241
    /// Thin wrapper delegating to [`Self::search_grep`] with the same
    /// arguments.
    pub fn grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        self.search_grep(
            pattern,
            case_sensitive,
            include,
            exclude,
            search_root,
            max_results,
        )
    }
260
    /// Runs a regex search over candidate files and collects line matches.
    ///
    /// The trigram index narrows the candidate set first; each candidate is
    /// then re-read from disk and scanned with the compiled regex. At most
    /// one match per line is recorded. `total_matches` counts all recorded
    /// lines even when `matches` was capped at `max_results` (in which case
    /// `truncated` is set). Matches are ordered newest file first.
    pub fn search_grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        let mut regex_builder = RegexBuilder::new(pattern);
        regex_builder.case_insensitive(!case_sensitive);
        let regex = match regex_builder.build() {
            Ok(regex) => regex,
            Err(_) => {
                // Invalid patterns yield an empty result rather than an error.
                return GrepResult {
                    matches: Vec::new(),
                    total_matches: 0,
                    files_searched: 0,
                    files_with_matches: 0,
                    index_status: if self.ready {
                        IndexStatus::Ready
                    } else {
                        IndexStatus::Building
                    },
                    truncated: false,
                };
            }
        };

        // Bad include/exclude globs degrade to "no filtering".
        let filters = match build_path_filters(include, exclude) {
            Ok(filters) => filters,
            Err(_) => PathFilters::default(),
        };
        let search_root = canonicalize_or_normalize(search_root);

        // Trigram decomposition prunes files that cannot possibly match.
        let query = decompose_regex(pattern);
        let candidate_ids = self.candidates(&query);

        let mut matches = Vec::new();
        let mut total_matches = 0usize;
        let mut files_searched = 0usize;
        let mut files_with_matches = 0usize;
        let mut truncated = false;

        for file_id in candidate_ids {
            let Some(file) = self.files.get(file_id as usize) else {
                continue;
            };
            // An empty path marks a removed file whose id slot is retained.
            if file.path.as_os_str().is_empty() {
                continue;
            }
            if !is_within_search_root(&search_root, &file.path) {
                continue;
            }
            if !filters.matches(&self.project_root, &file.path) {
                continue;
            }

            let content = match read_searchable_text(&file.path) {
                Some(content) => content,
                None => continue,
            };

            files_searched += 1;
            let line_starts = line_starts(&content);
            let mut seen_lines = HashSet::new();
            let mut matched_this_file = false;

            for matched in regex.find_iter(&content) {
                let (line, column, line_text) =
                    line_details(&content, &line_starts, matched.start());
                // Report only the first match on any given line.
                if !seen_lines.insert(line) {
                    continue;
                }

                total_matches += 1;
                if matches.len() < max_results {
                    matches.push(GrepMatch {
                        file: file.path.clone(),
                        line,
                        column,
                        line_text,
                        match_text: matched.as_str().to_string(),
                    });
                } else {
                    // Keep counting, but stop collecting.
                    truncated = true;
                }
                matched_this_file = true;
            }

            if matched_this_file {
                files_with_matches += 1;
            }
        }

        sort_grep_matches_by_mtime_desc(&mut matches, &self.project_root);

        GrepResult {
            total_matches,
            matches,
            files_searched,
            files_with_matches,
            index_status: if self.ready {
                IndexStatus::Ready
            } else {
                IndexStatus::Building
            },
            truncated,
        }
    }
371
372    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
373        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
374            Ok(filters) => filters,
375            Err(_) => return Vec::new(),
376        };
377        let search_root = canonicalize_or_normalize(search_root);
378        let filter_root = if search_root.starts_with(&self.project_root) {
379            &self.project_root
380        } else {
381            &search_root
382        };
383
384        let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
385        sort_paths_by_mtime_desc(&mut paths);
386        paths
387    }
388
    /// Resolves a trigram query to the set of file ids that may match.
    ///
    /// AND trigrams are intersected; each OR group contributes the union of
    /// its trigrams and is then intersected in. Files tracked but not
    /// trigram-indexed (e.g. oversized) are always included, since the index
    /// can say nothing about their contents. An empty query matches all
    /// active files.
    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
            return self.active_file_ids();
        }

        let mut current: Option<BTreeSet<u32>> = None;

        for trigram in &query.and_trigrams {
            let filter = query.and_filters.get(trigram).copied();
            let matches = self.postings_for_trigram(*trigram, filter);
            current = Some(match current.take() {
                Some(existing) => existing.intersection(&matches).copied().collect(),
                None => matches,
            });

            // Intersections only shrink; stop early once empty.
            if current.as_ref().is_some_and(|set| set.is_empty()) {
                break;
            }
        }

        // No AND trigrams at all: start from every active file.
        let mut current = current.unwrap_or_else(|| self.active_file_ids().into_iter().collect());

        for (index, group) in query.or_groups.iter().enumerate() {
            let mut group_matches = BTreeSet::new();
            let filters = query.or_filters.get(index);

            for trigram in group {
                let filter = filters.and_then(|filters| filters.get(trigram).copied());
                group_matches.extend(self.postings_for_trigram(*trigram, filter));
            }

            current = current.intersection(&group_matches).copied().collect();
            if current.is_empty() {
                break;
            }
        }

        // Unindexed files could contain anything; always scan them.
        for file_id in &self.unindexed_files {
            if self.is_active_file(*file_id) {
                current.insert(*file_id);
            }
        }

        current.into_iter().collect()
    }
434
    /// Persists the index as `postings.bin` + `lookup.bin` in `cache_dir`.
    ///
    /// File ids are compacted (removed-file slots dropped) before writing.
    /// Both files are written to `*.tmp` and renamed into place; any error
    /// aborts silently and removes the temp files.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");
        let tmp_postings = cache_dir.join("postings.bin.tmp");
        let tmp_lookup = cache_dir.join("lookup.bin.tmp");

        // Remap sparse in-memory file ids onto a dense 0..n range.
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);

            // Header: magic, version, string lengths, size limit, file count.
            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;

            // File table, in dense-id order; paths stored relative to root.
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                let path = relative_to_root(&self.project_root, &file.path);
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Postings blob: 6 bytes per posting (LE file id, next_mask,
            // loc_mask), grouped by trigram in ascending trigram order.
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Postings for removed files are absent from id_map.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            drop(postings_writer);

            // Lookup file: (trigram, blob offset, posting count) triples.
            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            drop(lookup_writer);

            // Publish only after both temp files are complete.
            fs::rename(&tmp_postings, &postings_path)?;
            fs::rename(&tmp_lookup, &lookup_path)?;

            Ok(())
        })();

        if write_result.is_err() {
            let _ = fs::remove_file(&tmp_postings);
            let _ = fs::remove_file(&tmp_lookup);
        }
    }
565
566    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
567        let postings_path = cache_dir.join("postings.bin");
568        let lookup_path = cache_dir.join("lookup.bin");
569
570        let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
571        let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);
572
573        let mut magic = [0u8; 8];
574        postings_reader.read_exact(&mut magic).ok()?;
575        if &magic != INDEX_MAGIC {
576            return None;
577        }
578        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
579            return None;
580        }
581
582        let head_len = read_u32(&mut postings_reader).ok()? as usize;
583        let root_len = read_u32(&mut postings_reader).ok()? as usize;
584        let max_file_size = read_u64(&mut postings_reader).ok()?;
585        let file_count = read_u32(&mut postings_reader).ok()? as usize;
586
587        let mut head_bytes = vec![0u8; head_len];
588        postings_reader.read_exact(&mut head_bytes).ok()?;
589        let git_head = String::from_utf8(head_bytes)
590            .ok()
591            .filter(|head| !head.is_empty());
592
593        let mut root_bytes = vec![0u8; root_len];
594        postings_reader.read_exact(&mut root_bytes).ok()?;
595        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
596
597        let mut files = Vec::with_capacity(file_count);
598        let mut path_to_id = HashMap::new();
599        let mut unindexed_files = HashSet::new();
600
601        for file_id in 0..file_count {
602            let mut unindexed = [0u8; 1];
603            postings_reader.read_exact(&mut unindexed).ok()?;
604            let path_len = read_u32(&mut postings_reader).ok()? as usize;
605            let size = read_u64(&mut postings_reader).ok()?;
606            let secs = read_u64(&mut postings_reader).ok()?;
607            let nanos = read_u32(&mut postings_reader).ok()?;
608            let mut path_bytes = vec![0u8; path_len];
609            postings_reader.read_exact(&mut path_bytes).ok()?;
610            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
611            let full_path = project_root.join(relative_path);
612            let file_id_u32 = u32::try_from(file_id).ok()?;
613
614            files.push(FileEntry {
615                path: full_path.clone(),
616                size,
617                modified: UNIX_EPOCH + Duration::new(secs, nanos),
618            });
619            path_to_id.insert(full_path, file_id_u32);
620            if unindexed[0] == 1 {
621                unindexed_files.insert(file_id_u32);
622            }
623        }
624
625        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
626        let mut postings_blob = vec![0u8; postings_len];
627        postings_reader.read_exact(&mut postings_blob).ok()?;
628
629        let mut lookup_magic = [0u8; 8];
630        lookup_reader.read_exact(&mut lookup_magic).ok()?;
631        if &lookup_magic != LOOKUP_MAGIC {
632            return None;
633        }
634        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
635            return None;
636        }
637        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
638
639        let mut postings = HashMap::new();
640        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
641
642        for _ in 0..entry_count {
643            let trigram = read_u32(&mut lookup_reader).ok()?;
644            let offset = read_u64(&mut lookup_reader).ok()? as usize;
645            let count = read_u32(&mut lookup_reader).ok()? as usize;
646            let bytes_len = count.checked_mul(6)?;
647            let end = offset.checked_add(bytes_len)?;
648            if end > postings_blob.len() {
649                return None;
650            }
651
652            let mut trigram_postings = Vec::with_capacity(count);
653            for chunk in postings_blob[offset..end].chunks_exact(6) {
654                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
655                let posting = Posting {
656                    file_id,
657                    next_mask: chunk[4],
658                    loc_mask: chunk[5],
659                };
660                trigram_postings.push(posting.clone());
661                file_trigrams.entry(file_id).or_default().push(trigram);
662            }
663            postings.insert(trigram, trigram_postings);
664        }
665
666        Some(SearchIndex {
667            postings,
668            files,
669            path_to_id,
670            ready: true,
671            project_root,
672            git_head,
673            max_file_size,
674            file_trigrams,
675            unindexed_files,
676        })
677    }
678
    /// Git HEAD recorded when this index was built or loaded, if any.
    pub(crate) fn stored_git_head(&self) -> Option<&str> {
        self.git_head.as_deref()
    }
682
    /// Sets the ready flag, which determines the `IndexStatus` that grep
    /// results report.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
686
687    pub(crate) fn rebuild_or_refresh(
688        root: &Path,
689        max_file_size: u64,
690        current_head: Option<String>,
691        baseline: Option<SearchIndex>,
692    ) -> Self {
693        if current_head.is_none() {
694            return SearchIndex::build_with_limit(root, max_file_size);
695        }
696
697        if let Some(mut baseline) = baseline {
698            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
699            baseline.max_file_size = max_file_size;
700
701            if baseline.git_head == current_head {
702                baseline.ready = true;
703                return baseline;
704            }
705
706            if let (Some(previous), Some(current)) =
707                (baseline.git_head.clone(), current_head.clone())
708            {
709                let project_root = baseline.project_root.clone();
710                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
711                    baseline.git_head = Some(current);
712                    baseline.ready = true;
713                    return baseline;
714                }
715            }
716        }
717
718        SearchIndex::build_with_limit(root, max_file_size)
719    }
720
721    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
722        let file_id = u32::try_from(self.files.len()).ok()?;
723        let metadata = fs::metadata(path).ok();
724        let size = metadata
725            .as_ref()
726            .map_or(size_hint, |metadata| metadata.len());
727        let modified = metadata
728            .and_then(|metadata| metadata.modified().ok())
729            .unwrap_or(UNIX_EPOCH);
730
731        self.files.push(FileEntry {
732            path: path.to_path_buf(),
733            size,
734            modified,
735        });
736        self.path_to_id.insert(path.to_path_buf(), file_id);
737        Some(file_id)
738    }
739
740    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
741        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
742            return;
743        };
744        self.unindexed_files.insert(file_id);
745        self.file_trigrams.insert(file_id, Vec::new());
746    }
747
748    fn active_file_ids(&self) -> Vec<u32> {
749        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
750        ids.sort_unstable();
751        ids
752    }
753
754    fn is_active_file(&self, file_id: u32) -> bool {
755        self.files
756            .get(file_id as usize)
757            .map(|file| !file.path.as_os_str().is_empty())
758            .unwrap_or(false)
759    }
760
    /// Ids of active files whose posting for `trigram` passes `filter`.
    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> BTreeSet<u32> {
        let mut matches = BTreeSet::new();
        let Some(postings) = self.postings.get(&trigram) else {
            return matches;
        };

        for posting in postings {
            if let Some(filter) = filter {
                // next_mask: bloom filter check — the character following this trigram in the
                // query must also appear after this trigram somewhere in the file.
                // A zero query mask means "no constraint".
                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
                    continue;
                }
                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
                // between consecutive trigram pairs, but is NOT used as a single-trigram
                // filter because the position in the query string has no relationship to
                // the position in the file. Using it here causes false negatives.
            }
            if self.is_active_file(posting.file_id) {
                matches.insert(posting.file_id);
            }
        }

        matches
    }
786}
787
788pub fn decompose_regex(pattern: &str) -> RegexQuery {
789    let hir = match regex_syntax::parse(pattern) {
790        Ok(hir) => hir,
791        Err(_) => return RegexQuery::default(),
792    };
793
794    let build = build_query(&hir);
795    build.into_query()
796}
797
/// Packs three bytes into one trigram id, `a` in the most significant slot.
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
801
/// Case-folds an ASCII byte so indexing and lookup are case-insensitive.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() { c + 32 } else { c }
}
805
806pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
807    if content.len() < 3 {
808        return Vec::new();
809    }
810
811    let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
812    for start in 0..=content.len() - 3 {
813        let trigram = pack_trigram(
814            normalize_char(content[start]),
815            normalize_char(content[start + 1]),
816            normalize_char(content[start + 2]),
817        );
818        let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
819        trigrams.push((trigram, next_char, start));
820    }
821    trigrams
822}
823
824pub fn resolve_cache_dir(project_root: &Path) -> PathBuf {
825    // Respect AFT_CACHE_DIR for testing — prevents tests from polluting the user's cache
826    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
827        return PathBuf::from(override_dir)
828            .join("index")
829            .join(project_cache_key(project_root));
830    }
831    let home = std::env::var_os("HOME")
832        .map(PathBuf::from)
833        .unwrap_or_else(|| PathBuf::from("."));
834    home.join(".cache")
835        .join("aft")
836        .join("index")
837        .join(project_cache_key(project_root))
838}
839
/// Compiles include/exclude patterns into glob sets; an invalid glob in
/// either list fails the whole build.
pub(crate) fn build_path_filters(
    include: &[String],
    exclude: &[String],
) -> Result<PathFilters, String> {
    Ok(PathFilters {
        includes: build_globset(include)?,
        excludes: build_globset(exclude)?,
    })
}
849
/// Walks `root` with `filters`, using `root` as both the search base and the
/// glob-resolution base. See [`walk_project_files_from`].
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
853
854pub(crate) fn walk_project_files_from(
855    filter_root: &Path,
856    search_root: &Path,
857    filters: &PathFilters,
858) -> Vec<PathBuf> {
859    let mut builder = WalkBuilder::new(search_root);
860    builder
861        .hidden(false)
862        .git_ignore(true)
863        .git_global(true)
864        .git_exclude(true)
865        .filter_entry(|entry| {
866            let name = entry.file_name().to_string_lossy();
867            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
868                return !matches!(
869                    name.as_ref(),
870                    "node_modules"
871                        | "target"
872                        | "venv"
873                        | ".venv"
874                        | ".git"
875                        | "__pycache__"
876                        | ".tox"
877                        | "dist"
878                        | "build"
879                );
880            }
881            true
882        });
883
884    let mut files = Vec::new();
885    for entry in builder.build().filter_map(|entry| entry.ok()) {
886        if !entry
887            .file_type()
888            .map_or(false, |file_type| file_type.is_file())
889        {
890            continue;
891        }
892        let path = entry.into_path();
893        if filters.matches(filter_root, &path) {
894            files.push(path);
895        }
896    }
897
898    sort_paths_by_mtime_desc(&mut files);
899    files
900}
901
902pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
903    let bytes = fs::read(path).ok()?;
904    if is_binary_bytes(&bytes) {
905        return None;
906    }
907    String::from_utf8(bytes).ok()
908}
909
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    // Prefer the root-relative form; paths outside `root` come back unchanged.
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
915
916pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
917    paths.sort_by(|left, right| {
918        path_modified_time(right)
919            .cmp(&path_modified_time(left))
920            .then_with(|| left.cmp(right))
921    });
922}
923
924pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
925    matches.sort_by(|left, right| {
926        let left_path = resolve_match_path(project_root, &left.file);
927        let right_path = resolve_match_path(project_root, &right.file);
928
929        path_modified_time(&right_path)
930            .cmp(&path_modified_time(&left_path))
931            .then_with(|| left.file.cmp(&right.file))
932            .then_with(|| left.line.cmp(&right.line))
933            .then_with(|| left.column.cmp(&right.column))
934    });
935}
936
937pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
938    let resolved_project_root = canonicalize_or_normalize(project_root);
939    let root = match path {
940        Some(path) => {
941            let path = PathBuf::from(path);
942            if path.is_absolute() {
943                canonicalize_or_normalize(&path)
944            } else {
945                normalize_path(&resolved_project_root.join(path))
946            }
947        }
948        None => resolved_project_root.clone(),
949    };
950
951    let use_index = is_within_search_root(&resolved_project_root, &root);
952    SearchScope { root, use_index }
953}
954
/// True when `content` looks like binary data per the `content_inspector`
/// crate's heuristic; binary content is excluded from text search.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
958
/// The commit hash `HEAD` currently resolves to, or `None` when `root` is not
/// a git repository (or git itself is unavailable — see `run_git`).
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
962
963pub(crate) fn project_cache_key(project_root: &Path) -> String {
964    use sha2::{Digest, Sha256};
965
966    let mut hasher = Sha256::new();
967
968    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
969        // Git repo: root commit is the unique identity.
970        // Same repo cloned anywhere produces the same key.
971        hasher.update(root_commit.as_bytes());
972    } else {
973        // Non-git project: use the canonical filesystem path as identity.
974        let canonical_root = canonicalize_or_normalize(project_root);
975        hasher.update(canonical_root.to_string_lossy().as_bytes());
976    }
977
978    let digest = format!("{:x}", hasher.finalize());
979    digest[..16].to_string()
980}
981
982impl PathFilters {
983    fn matches(&self, root: &Path, path: &Path) -> bool {
984        let relative = to_glob_path(&relative_to_root(root, path));
985        if self
986            .includes
987            .as_ref()
988            .is_some_and(|includes| !includes.is_match(&relative))
989        {
990            return false;
991        }
992        if self
993            .excludes
994            .as_ref()
995            .is_some_and(|excludes| excludes.is_match(&relative))
996        {
997            return false;
998        }
999        true
1000    }
1001}
1002
1003fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1004    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1005}
1006
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    // Absolute match paths are taken verbatim; relative ones are anchored at
    // the project root.
    if path.is_absolute() {
        return path.to_path_buf();
    }
    project_root.join(path)
}
1014
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    // Missing files, permission errors, or filesystems without mtimes all
    // collapse to None.
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1020
fn normalize_path(path: &Path) -> PathBuf {
    // Lexically normalize a path: drop `.` components and let `..` cancel the
    // preceding *normal* component. Used when `fs::canonicalize` fails (the
    // path may not exist), so this never touches the filesystem.
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::ParentDir => {
                // Only a normal trailing component can be cancelled. Popping
                // unconditionally was a bug: "../../x" collapsed to "x"
                // because the second ".." popped the first one instead of
                // accumulating alongside it.
                match result.components().next_back() {
                    Some(Component::Normal(_)) => {
                        result.pop();
                    }
                    // Empty result, root/prefix, or a retained leading ".." —
                    // keep this ".." literal. ("/.." stays "/..", matching the
                    // previous behavior for absolute paths.)
                    _ => result.push(component),
                }
            }
            Component::CurDir => {}
            _ => result.push(component),
        }
    }
    result
}
1036
/// True when `path` is `search_root` itself or lies beneath it.
/// `Path::starts_with` compares whole components, so "/a/bc" is not inside "/a/b".
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.starts_with(search_root)
}
1040
1041impl QueryBuild {
1042    fn into_query(self) -> RegexQuery {
1043        let mut query = RegexQuery::default();
1044
1045        for run in self.and_runs {
1046            add_run_to_and_query(&mut query, &run);
1047        }
1048
1049        for group in self.or_groups {
1050            let mut trigrams = BTreeSet::new();
1051            let mut filters = HashMap::new();
1052            for run in group {
1053                for (trigram, filter) in trigram_filters(&run) {
1054                    trigrams.insert(trigram);
1055                    merge_filter(filters.entry(trigram).or_default(), filter);
1056                }
1057            }
1058            if !trigrams.is_empty() {
1059                query.or_groups.push(trigrams.into_iter().collect());
1060                query.or_filters.push(filters);
1061            }
1062        }
1063
1064        query
1065    }
1066}
1067
1068fn build_query(hir: &Hir) -> QueryBuild {
1069    match hir.kind() {
1070        HirKind::Literal(literal) => {
1071            if literal.0.len() >= 3 {
1072                QueryBuild {
1073                    and_runs: vec![literal.0.to_vec()],
1074                    or_groups: Vec::new(),
1075                }
1076            } else {
1077                QueryBuild::default()
1078            }
1079        }
1080        HirKind::Capture(capture) => build_query(&capture.sub),
1081        HirKind::Concat(parts) => {
1082            let mut build = QueryBuild::default();
1083            for part in parts {
1084                let part_build = build_query(part);
1085                build.and_runs.extend(part_build.and_runs);
1086                build.or_groups.extend(part_build.or_groups);
1087            }
1088            build
1089        }
1090        HirKind::Alternation(parts) => {
1091            let mut group = Vec::new();
1092            for part in parts {
1093                let Some(mut choices) = guaranteed_run_choices(part) else {
1094                    return QueryBuild::default();
1095                };
1096                group.append(&mut choices);
1097            }
1098            if group.is_empty() {
1099                QueryBuild::default()
1100            } else {
1101                QueryBuild {
1102                    and_runs: Vec::new(),
1103                    or_groups: vec![group],
1104                }
1105            }
1106        }
1107        HirKind::Repetition(repetition) => {
1108            if repetition.min == 0 {
1109                QueryBuild::default()
1110            } else {
1111                build_query(&repetition.sub)
1112            }
1113        }
1114        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1115    }
1116}
1117
1118fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1119    match hir.kind() {
1120        HirKind::Literal(literal) => {
1121            if literal.0.len() >= 3 {
1122                Some(vec![literal.0.to_vec()])
1123            } else {
1124                None
1125            }
1126        }
1127        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1128        HirKind::Concat(parts) => {
1129            let mut runs = Vec::new();
1130            for part in parts {
1131                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1132                    runs.append(&mut part_runs);
1133                }
1134            }
1135            if runs.is_empty() {
1136                None
1137            } else {
1138                Some(runs)
1139            }
1140        }
1141        HirKind::Alternation(parts) => {
1142            let mut runs = Vec::new();
1143            for part in parts {
1144                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1145                    return None;
1146                };
1147                runs.append(&mut part_runs);
1148            }
1149            if runs.is_empty() {
1150                None
1151            } else {
1152                Some(runs)
1153            }
1154        }
1155        HirKind::Repetition(repetition) => {
1156            if repetition.min == 0 {
1157                None
1158            } else {
1159                guaranteed_run_choices(&repetition.sub)
1160            }
1161        }
1162        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1163    }
1164}
1165
1166fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1167    for (trigram, filter) in trigram_filters(run) {
1168        if !query.and_trigrams.contains(&trigram) {
1169            query.and_trigrams.push(trigram);
1170        }
1171        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1172    }
1173}
1174
1175fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1176    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1177    for (trigram, next_char, position) in extract_trigrams(run) {
1178        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1179        if next_char != EOF_SENTINEL {
1180            entry.next_mask |= mask_for_next_char(next_char);
1181        }
1182        entry.loc_mask |= mask_for_position(position);
1183    }
1184    filters.into_iter().collect()
1185}
1186
/// Unions two bloom filters for the same trigram: the merged filter accepts
/// anything either input filter would have accepted (OR of the bit masks).
fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
    target.next_mask |= filter.next_mask;
    target.loc_mask |= filter.loc_mask;
}
1191
1192fn mask_for_next_char(next_char: u8) -> u8 {
1193    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1194    1u8 << bit
1195}
1196
fn mask_for_position(position: usize) -> u8 {
    // One bloom bit per position modulo 8.
    let bit = position & 7;
    1u8 << bit
}
1200
1201fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1202    if patterns.is_empty() {
1203        return Ok(None);
1204    }
1205
1206    let mut builder = GlobSetBuilder::new();
1207    for pattern in patterns {
1208        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1209        builder.add(glob);
1210    }
1211    builder.build().map(Some).map_err(|error| error.to_string())
1212}
1213
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    // Fixed-width little-endian decode, mirroring write_u32.
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes).map(|_| u32::from_le_bytes(bytes))
}
1219
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    // Fixed-width little-endian decode, mirroring write_u64.
    let mut bytes = [0u8; 8];
    reader.read_exact(&mut bytes).map(|_| u64::from_le_bytes(bytes))
}
1225
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    // Fixed-width little-endian encode, mirroring read_u32.
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1229
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    // Fixed-width little-endian encode, mirroring read_u64.
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1233
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    // Any failure mode — git missing, non-zero exit, non-UTF-8 output, or an
    // empty result — collapses to None; callers treat that as "no answer".
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
1252
1253fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1254    let diff_range = format!("{}..{}", from, to);
1255    let output = match Command::new("git")
1256        .arg("-C")
1257        .arg(root)
1258        .args(["diff", "--name-only", &diff_range])
1259        .output()
1260    {
1261        Ok(output) => output,
1262        Err(_) => return false,
1263    };
1264
1265    if !output.status.success() {
1266        return false;
1267    }
1268
1269    let Ok(paths) = String::from_utf8(output.stdout) else {
1270        return false;
1271    };
1272
1273    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1274        let path = root.join(relative_path);
1275        if path.exists() {
1276            index.update_file(&path);
1277        } else {
1278            index.remove_file(&path);
1279        }
1280    }
1281
1282    true
1283}
1284
1285fn is_binary_path(path: &Path, size: u64) -> bool {
1286    if size == 0 {
1287        return false;
1288    }
1289
1290    let mut file = match File::open(path) {
1291        Ok(file) => file,
1292        Err(_) => return true,
1293    };
1294
1295    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1296    match file.read(&mut preview) {
1297        Ok(read) => is_binary_bytes(&preview[..read]),
1298        Err(_) => true,
1299    }
1300}
1301
fn line_starts(content: &str) -> Vec<usize> {
    // Byte offsets where each line begins: offset 0, plus one entry just past
    // every '\n'.
    std::iter::once(0)
        .chain(
            content
                .bytes()
                .enumerate()
                .filter(|&(_, byte)| byte == b'\n')
                .map(|(index, _)| index + 1),
        )
        .collect()
}
1311
fn line_details(content: &str, line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // Map a byte offset to (1-based line, 1-based char column, line text).
    // An exact hit on a line start is that line; otherwise take the line
    // whose start precedes the offset.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    let rest = &content[line_start..];
    let line_end = rest.find('\n').map_or(content.len(), |len| line_start + len);
    // Drop a trailing '\r' so CRLF files report clean line text.
    let line_text = content[line_start..line_end].trim_end_matches('\r').to_owned();

    // Column counts characters, not bytes, so multi-byte UTF-8 is correct.
    let column = content[line_start..offset].chars().count() as u32 + 1;
    (line_index as u32 + 1, column, line_text)
}
1328
fn to_glob_path(path: &Path) -> String {
    // Glob matching expects forward slashes; normalize Windows separators.
    let text = path.to_string_lossy();
    text.replace('\\', "/")
}
1332
#[cfg(test)]
mod tests {
    //! Unit tests for the trigram index: extraction, regex decomposition,
    //! candidate filtering, disk round-trips, cache keys, and the grep/glob
    //! search entry points.

    use std::process::Command;

    use super::*;

    // One (trigram, next-byte, position) tuple per 3-byte window; the last
    // window carries EOF_SENTINEL. The assertions also show input bytes are
    // case-normalized ('R' packs as 'r').
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }

    // Required literals become AND trigrams; the alternation becomes one OR
    // group containing both branches' trigrams.
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }

    // AND trigrams intersect posting lists: only the file containing both
    // "abc" and "def" survives as a candidate.
    #[test]
    fn candidates_intersect_posting_lists() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }

    // Posting bloom filters: a filter whose next-char bits match the indexed
    // content keeps the posting; a mismatched next-char bit drops it.
    #[test]
    fn candidates_apply_bloom_filters() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }

    // Serialization round trip: the stored git head, the file table, and the
    // posting lists all survive write_to_disk → read_from_disk.
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }

    // Atomic write protocol: the final postings/lookup files exist and the
    // *.tmp staging files are cleaned up afterwards.
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("postings.bin").is_file());
        assert!(cache_dir.join("lookup.bin").is_file());
        assert!(!cache_dir.join("postings.bin.tmp").exists());
        assert!(!cache_dir.join("lookup.bin.tmp").exists());
    }

    // NOTE(review): the test name is stale — the assertions verify the key
    // does NOT depend on the checkout path: two clones sharing a root commit
    // produce identical 16-char keys.
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        // Same repo (same root commit) → same cache key regardless of clone path
        assert_eq!(source_key, clone_key);
    }

    // A search path outside the project root canonicalizes correctly but
    // turns off index use (use_index == false).
    #[test]
    fn resolve_search_scope_disables_index_for_external_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let outside = dir.path().join("outside");
        fs::create_dir_all(&project).expect("create project dir");
        fs::create_dir_all(&outside).expect("create outside dir");

        let scope = resolve_search_scope(&project, outside.to_str());

        assert_eq!(
            scope.root,
            fs::canonicalize(&outside).expect("canonicalize outside")
        );
        assert!(!scope.use_index);
    }

    // Grep restricted to src/ must not report the match in docs/.
    #[test]
    fn grep_filters_matches_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let docs = project.join("docs");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&docs).expect("create docs dir");
        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.files_searched, 1);
        assert_eq!(result.files_with_matches, 1);
        assert_eq!(result.matches.len(), 1);
        // Index stores canonicalized paths; on macOS /var → /private/var
        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
        assert_eq!(result.matches[0].file, expected);
    }

    // Two hits on the same line collapse into a single reported match.
    #[test]
    fn grep_deduplicates_multiple_matches_on_same_line() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.total_matches, 1);
        assert_eq!(result.matches.len(), 1);
    }

    // With max_results = 1, total_matches still counts all hits and the
    // truncated flag is set.
    #[test]
    fn grep_reports_total_matches_before_truncation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);

        assert_eq!(result.total_matches, 2);
        assert_eq!(result.matches.len(), 1);
        assert!(result.truncated);
    }

    // Glob restricted to src/ must not report the matching file in scripts/.
    #[test]
    fn glob_filters_results_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let scripts = project.join("scripts");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&scripts).expect("create scripts dir");
        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.rs", &src);

        assert_eq!(
            files,
            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
        );
    }

    // Unlike grep, glob must surface files in hidden directories and binary
    // files (content is irrelevant to name matching).
    #[test]
    fn glob_includes_hidden_and_binary_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let hidden_dir = project.join(".hidden");
        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
        let hidden_file = hidden_dir.join("data.bin");
        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.bin", &project);

        assert_eq!(
            files,
            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
        );
    }
}