Skip to main content

aft/
search_index.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::time::{Duration, SystemTime, UNIX_EPOCH};
7
8use globset::{Glob, GlobSet, GlobSetBuilder};
9use ignore::WalkBuilder;
10use regex::RegexBuilder;
11use regex_syntax::hir::{Hir, HirKind};
12
/// Files larger than this (1 MiB) are tracked but not trigram-indexed.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
/// Magic header identifying the on-disk postings cache file.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// Magic header identifying the on-disk trigram-lookup cache file.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// Format version written to (and required from) both cache files.
const INDEX_VERSION: u32 = 1;
// NOTE(review): not referenced in this chunk — presumably used by preview
// code elsewhere in the file; confirm before removing.
const PREVIEW_BYTES: usize = 8 * 1024;
/// Byte recorded as the "next char" of a file's final trigram (no byte follows).
const EOF_SENTINEL: u8 = 0;
19
/// In-memory trigram search index over a project tree.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// trigram -> postings list (one `Posting` per file containing it).
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table; a file's position in this Vec is its stable `file_id`.
    pub files: Vec<FileEntry>,
    /// Absolute path -> file_id for currently-active (non-tombstoned) files.
    pub path_to_id: HashMap<PathBuf, u32>,
    /// True once a full build or cache load has completed.
    pub ready: bool,
    // Canonicalized project root; cached paths are stored relative to it.
    project_root: PathBuf,
    // Git HEAD observed when the index was built/loaded, if available.
    git_head: Option<String>,
    // Files larger than this are tracked as "unindexed" instead of indexed.
    max_file_size: u64,
    // Reverse map: file_id -> trigrams it contributed (used for removal).
    file_trigrams: HashMap<u32, Vec<u32>>,
    // Files present but not trigram-indexed (e.g. oversized); always added
    // back into candidate sets so grep still scans them.
    unindexed_files: HashSet<u32>,
}
32
/// One file's entry in a trigram's postings list.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Index into `SearchIndex::files`.
    pub file_id: u32,
    /// Bloom mask over bytes seen immediately after this trigram in the file.
    pub next_mask: u8,
    /// Bloom mask over coarse byte positions (position mod 8) of this trigram.
    pub loc_mask: u8,
}
39
/// Metadata for one tracked file; its slot index in `SearchIndex::files`
/// is its id. A removed file leaves a tombstone with an empty path.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Absolute path; empty once the file has been removed (tombstone).
    pub path: PathBuf,
    /// Size in bytes observed at index time.
    pub size: u64,
    /// Last-modified time observed at index time.
    pub modified: SystemTime,
}
46
/// One reported grep hit (at most one per line of a file).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// Path relative to the project root (absolute if the file lies outside it).
    pub file: PathBuf,
    /// Line number of the hit — produced by `line_details`, which is outside
    /// this chunk; presumably 1-based, confirm there.
    pub line: u32,
    /// Column of the match start, also from `line_details`.
    pub column: u32,
    /// Text of the matched line (as returned by `line_details`).
    pub line_text: String,
    /// The exact substring the regex matched.
    pub match_text: String,
}
55
/// Aggregate outcome of one grep run.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Reported matches (capped at the caller's `max_results`).
    pub matches: Vec<GrepMatch>,
    /// Total matched lines found, including any dropped by the cap.
    pub total_matches: usize,
    /// Number of files whose contents were actually scanned.
    pub files_searched: usize,
    /// Number of scanned files with at least one match.
    pub files_with_matches: usize,
    /// State of the index when this search ran.
    pub index_status: IndexStatus,
    /// True if `matches` was truncated by the result cap.
    pub truncated: bool,
}
65
/// State of the trigram index when a search was answered.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    /// Index fully built; candidate pruning is trustworthy.
    Ready,
    /// Index still being built.
    Building,
    // NOTE(review): not constructed in this chunk — presumably used when the
    // search bypasses the index entirely; confirm at call sites.
    Fallback,
}
72
73impl IndexStatus {
74    pub fn as_str(&self) -> &'static str {
75        match self {
76            IndexStatus::Ready => "Ready",
77            IndexStatus::Building => "Building",
78            IndexStatus::Fallback => "Fallback",
79        }
80    }
81}
82
/// Trigram requirements derived from a regex (see `decompose_regex`);
/// consumed by `SearchIndex::candidates`.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Trigrams that must ALL be present in a candidate file.
    pub and_trigrams: Vec<u32>,
    /// Alternative groups: at least one trigram per group must be present.
    pub or_groups: Vec<Vec<u32>>,
    /// Optional per-trigram posting filters for the AND trigrams.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Per-group posting filters, parallel to `or_groups`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
90
/// Mask constraints applied while scanning a trigram's postings list
/// (see `postings_for_trigram`).
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    // Required "next byte" bloom bits; 0 means no constraint.
    next_mask: u8,
    // Position bits; deliberately unused during lookup (see the note in
    // `postings_for_trigram`).
    loc_mask: u8,
}
96
/// Intermediate state while decomposing a regex into trigram requirements.
// NOTE(review): consumed by `build_query`/`into_query`, which are outside this
// chunk; field meanings inferred from names — runs of literal bytes that will
// be cut into trigrams. Confirm against those helpers.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    and_runs: Vec<Vec<u8>>,
    or_groups: Vec<Vec<Vec<u8>>>,
}
102
/// Compiled include/exclude glob sets; `None` means "no constraint".
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    includes: Option<GlobSet>,
    excludes: Option<GlobSet>,
}
108
/// Resolved root directory for one search, plus whether the trigram index
/// may be consulted for it.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    pub root: PathBuf,
    pub use_index: bool,
}
114
115impl SearchIndex {
116    pub fn new() -> Self {
117        SearchIndex {
118            postings: HashMap::new(),
119            files: Vec::new(),
120            path_to_id: HashMap::new(),
121            ready: false,
122            project_root: PathBuf::new(),
123            git_head: None,
124            max_file_size: DEFAULT_MAX_FILE_SIZE,
125            file_trigrams: HashMap::new(),
126            unindexed_files: HashSet::new(),
127        }
128    }
129
    /// Builds an index over `root` using the default per-file size limit.
    pub fn build(root: &Path) -> Self {
        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
    }
133
134    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
135        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
136        let mut index = SearchIndex {
137            project_root: project_root.clone(),
138            max_file_size,
139            ..SearchIndex::new()
140        };
141
142        let filters = PathFilters::default();
143        for path in walk_project_files(&project_root, &filters) {
144            index.update_file(&path);
145        }
146
147        index.git_head = current_git_head(&project_root);
148        index.ready = true;
149        index
150    }
151
152    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
153        self.remove_file(path);
154
155        let file_id = match self.allocate_file_id(path, content.len() as u64) {
156            Some(file_id) => file_id,
157            None => return,
158        };
159
160        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
161        for (trigram, next_char, position) in extract_trigrams(content) {
162            let entry = trigram_map.entry(trigram).or_default();
163            entry.next_mask |= mask_for_next_char(next_char);
164            entry.loc_mask |= mask_for_position(position);
165        }
166
167        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
168        for (trigram, filter) in trigram_map {
169            self.postings.entry(trigram).or_default().push(Posting {
170                file_id,
171                next_mask: filter.next_mask,
172                loc_mask: filter.loc_mask,
173            });
174            file_trigrams.push(trigram);
175        }
176
177        for postings in self.postings.values_mut() {
178            postings.sort_by_key(|posting| posting.file_id);
179        }
180
181        self.file_trigrams.insert(file_id, file_trigrams);
182        self.unindexed_files.remove(&file_id);
183    }
184
    /// Removes `path` from the index, leaving a tombstone in `files`.
    ///
    /// The `FileEntry` slot is cleared rather than deleted so existing
    /// `file_id`s (indices into `files`) stay valid for other files.
    pub fn remove_file(&mut self, path: &Path) {
        // Unknown path: nothing to do.
        let Some(file_id) = self.path_to_id.remove(path) else {
            return;
        };

        // Strip this file's postings from every trigram it contributed to,
        // dropping trigram entries that become empty.
        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
            for trigram in trigrams {
                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
                    postings.retain(|posting| posting.file_id != file_id);
                    postings.is_empty()
                } else {
                    false
                };

                if should_remove {
                    self.postings.remove(&trigram);
                }
            }
        }

        self.unindexed_files.remove(&file_id);
        // Tombstone: an empty path marks the slot inactive (see is_active_file).
        if let Some(file) = self.files.get_mut(file_id as usize) {
            file.path = PathBuf::new();
            file.size = 0;
            file.modified = UNIX_EPOCH;
        }
    }
212
213    pub fn update_file(&mut self, path: &Path) {
214        self.remove_file(path);
215
216        let metadata = match fs::metadata(path) {
217            Ok(metadata) if metadata.is_file() => metadata,
218            _ => return,
219        };
220
221        if is_binary_path(path, metadata.len()) {
222            return;
223        }
224
225        if metadata.len() > self.max_file_size {
226            self.track_unindexed_file(path, &metadata);
227            return;
228        }
229
230        let content = match fs::read(path) {
231            Ok(content) => content,
232            Err(_) => return,
233        };
234
235        if is_binary_bytes(&content) {
236            return;
237        }
238
239        self.index_file(path, &content);
240    }
241
    /// Public entry point for regex search; delegates to [`Self::search_grep`]
    /// unchanged.
    pub fn grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        self.search_grep(
            pattern,
            case_sensitive,
            include,
            exclude,
            search_root,
            max_results,
        )
    }
260
    /// Runs a regex search over the project, using trigram candidates to
    /// prune the set of files that must actually be read.
    ///
    /// At most one match per line per file is reported, capped at
    /// `max_results` total; `total_matches` still counts everything found.
    /// An invalid regex yields an empty result rather than an error.
    pub fn search_grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        let mut regex_builder = RegexBuilder::new(pattern);
        regex_builder.case_insensitive(!case_sensitive);
        let regex = match regex_builder.build() {
            Ok(regex) => regex,
            Err(_) => {
                // Unparsable pattern: report an empty (not failed) search.
                return GrepResult {
                    matches: Vec::new(),
                    total_matches: 0,
                    files_searched: 0,
                    files_with_matches: 0,
                    index_status: if self.ready {
                        IndexStatus::Ready
                    } else {
                        IndexStatus::Building
                    },
                    truncated: false,
                };
            }
        };

        // Invalid globs degrade to "no path filtering" rather than failing.
        let filters = match build_path_filters(include, exclude) {
            Ok(filters) => filters,
            Err(_) => PathFilters::default(),
        };
        let search_root = canonicalize_or_normalize(search_root);

        // Trigram pre-filter: only files that can possibly match the pattern.
        let query = decompose_regex(pattern);
        let candidate_ids = self.candidates(&query);

        let mut matches = Vec::new();
        let mut total_matches = 0usize;
        let mut files_searched = 0usize;
        let mut files_with_matches = 0usize;
        let mut truncated = false;

        for file_id in candidate_ids {
            let Some(file) = self.files.get(file_id as usize) else {
                continue;
            };
            // An empty path is a tombstone for a removed file.
            if file.path.as_os_str().is_empty() {
                continue;
            }
            if !is_within_search_root(&search_root, &file.path) {
                continue;
            }
            if !filters.matches(&self.project_root, &file.path) {
                continue;
            }

            // Unreadable, binary, or non-UTF-8 files are skipped silently.
            let content = match read_searchable_text(&file.path) {
                Some(content) => content,
                None => continue,
            };

            files_searched += 1;
            let line_starts = line_starts(&content);
            let mut seen_lines = HashSet::new();
            let mut matched_this_file = false;

            for matched in regex.find_iter(&content) {
                let (line, column, line_text) =
                    line_details(&content, &line_starts, matched.start());
                // Report at most one match per line.
                if !seen_lines.insert(line) {
                    continue;
                }

                total_matches += 1;
                if matches.len() < max_results {
                    matches.push(GrepMatch {
                        file: relative_to_root(&self.project_root, &file.path),
                        line,
                        column,
                        line_text,
                        match_text: matched.as_str().to_string(),
                    });
                } else {
                    // Keep counting, but stop collecting.
                    truncated = true;
                }
                matched_this_file = true;
            }

            if matched_this_file {
                files_with_matches += 1;
            }
        }

        // Present most-recently-modified files first.
        sort_grep_matches_by_mtime_desc(&mut matches, &self.project_root);

        GrepResult {
            total_matches,
            matches,
            files_searched,
            files_with_matches,
            index_status: if self.ready {
                IndexStatus::Ready
            } else {
                IndexStatus::Building
            },
            truncated,
        }
    }
371
372    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
373        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
374            Ok(filters) => filters,
375            Err(_) => return Vec::new(),
376        };
377        let search_root = canonicalize_or_normalize(search_root);
378        let filter_root = if search_root.starts_with(&self.project_root) {
379            &self.project_root
380        } else {
381            &search_root
382        };
383
384        let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
385        sort_paths_by_mtime_desc(&mut paths);
386        paths
387    }
388
    /// Computes the set of file ids that could possibly match `query`.
    ///
    /// AND trigrams are intersected; each OR group is unioned internally and
    /// then intersected with the running set. A query with no trigrams at all
    /// falls back to every active file. Unindexed (e.g. oversized) files are
    /// always added back in, since the index knows nothing about their content.
    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
            return self.active_file_ids();
        }

        let mut current: Option<BTreeSet<u32>> = None;

        // Intersect postings over every required trigram, bailing early once
        // the running set is empty.
        for trigram in &query.and_trigrams {
            let filter = query.and_filters.get(trigram).copied();
            let matches = self.postings_for_trigram(*trigram, filter);
            current = Some(match current.take() {
                Some(existing) => existing.intersection(&matches).copied().collect(),
                None => matches,
            });

            if current.as_ref().is_some_and(|set| set.is_empty()) {
                break;
            }
        }

        // No AND trigrams at all: start from every active file.
        let mut current = current.unwrap_or_else(|| self.active_file_ids().into_iter().collect());

        // Each OR group is a union of alternatives; the group as a whole is
        // required, so intersect it with the running candidate set.
        for (index, group) in query.or_groups.iter().enumerate() {
            let mut group_matches = BTreeSet::new();
            let filters = query.or_filters.get(index);

            for trigram in group {
                let filter = filters.and_then(|filters| filters.get(trigram).copied());
                group_matches.extend(self.postings_for_trigram(*trigram, filter));
            }

            current = current.intersection(&group_matches).copied().collect();
            if current.is_empty() {
                break;
            }
        }

        // Unindexed files have no trigrams recorded; always scan them.
        for file_id in &self.unindexed_files {
            if self.is_active_file(*file_id) {
                current.insert(*file_id);
            }
        }

        current.into_iter().collect()
    }
434
    /// Persists the index to `cache_dir` as two companion files:
    /// `postings.bin` (header + file table + raw postings blob) and
    /// `lookup.bin` (per-trigram offset/count into that blob).
    ///
    /// File ids are compacted onto a dense `0..n` range so tombstoned slots
    /// are not written. Both files are written to `.tmp` paths first and
    /// committed with `rename`, so readers never see a partial cache. All
    /// failures are silent (the cache is best-effort); leftover temp files
    /// are removed on error.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");
        let tmp_postings = cache_dir.join("postings.bin.tmp");
        let tmp_lookup = cache_dir.join("lookup.bin.tmp");

        // Map sparse in-memory ids onto dense on-disk ids.
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);

            // Header: magic, version, git head, project root, limit, file count.
            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;

            // File table, in dense-id order: flag, path length, size, mtime, path.
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                // Paths are stored relative to the project root so the cache
                // survives the tree being relocated.
                let path = relative_to_root(&self.project_root, &file.path);
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Serialize postings as one blob (6 bytes per posting) and record
            // each trigram's (offset, count) for the lookup file. Trigrams are
            // emitted in sorted order for deterministic output.
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Postings for tombstoned files are absent from id_map; skip.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            drop(postings_writer);

            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            drop(lookup_writer);

            // Atomic commit: only now do the real cache files change.
            fs::rename(&tmp_postings, &postings_path)?;
            fs::rename(&tmp_lookup, &lookup_path)?;

            Ok(())
        })();

        if write_result.is_err() {
            let _ = fs::remove_file(&tmp_postings);
            let _ = fs::remove_file(&tmp_lookup);
        }
    }
565
566    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
567        let postings_path = cache_dir.join("postings.bin");
568        let lookup_path = cache_dir.join("lookup.bin");
569
570        let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
571        let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);
572
573        let mut magic = [0u8; 8];
574        postings_reader.read_exact(&mut magic).ok()?;
575        if &magic != INDEX_MAGIC {
576            return None;
577        }
578        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
579            return None;
580        }
581
582        let head_len = read_u32(&mut postings_reader).ok()? as usize;
583        let root_len = read_u32(&mut postings_reader).ok()? as usize;
584        let max_file_size = read_u64(&mut postings_reader).ok()?;
585        let file_count = read_u32(&mut postings_reader).ok()? as usize;
586
587        let mut head_bytes = vec![0u8; head_len];
588        postings_reader.read_exact(&mut head_bytes).ok()?;
589        let git_head = String::from_utf8(head_bytes)
590            .ok()
591            .filter(|head| !head.is_empty());
592
593        let mut root_bytes = vec![0u8; root_len];
594        postings_reader.read_exact(&mut root_bytes).ok()?;
595        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
596
597        let mut files = Vec::with_capacity(file_count);
598        let mut path_to_id = HashMap::new();
599        let mut unindexed_files = HashSet::new();
600
601        for file_id in 0..file_count {
602            let mut unindexed = [0u8; 1];
603            postings_reader.read_exact(&mut unindexed).ok()?;
604            let path_len = read_u32(&mut postings_reader).ok()? as usize;
605            let size = read_u64(&mut postings_reader).ok()?;
606            let secs = read_u64(&mut postings_reader).ok()?;
607            let nanos = read_u32(&mut postings_reader).ok()?;
608            let mut path_bytes = vec![0u8; path_len];
609            postings_reader.read_exact(&mut path_bytes).ok()?;
610            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
611            let full_path = project_root.join(relative_path);
612            let file_id_u32 = u32::try_from(file_id).ok()?;
613
614            files.push(FileEntry {
615                path: full_path.clone(),
616                size,
617                modified: UNIX_EPOCH + Duration::new(secs, nanos),
618            });
619            path_to_id.insert(full_path, file_id_u32);
620            if unindexed[0] == 1 {
621                unindexed_files.insert(file_id_u32);
622            }
623        }
624
625        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
626        let mut postings_blob = vec![0u8; postings_len];
627        postings_reader.read_exact(&mut postings_blob).ok()?;
628
629        let mut lookup_magic = [0u8; 8];
630        lookup_reader.read_exact(&mut lookup_magic).ok()?;
631        if &lookup_magic != LOOKUP_MAGIC {
632            return None;
633        }
634        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
635            return None;
636        }
637        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
638
639        let mut postings = HashMap::new();
640        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
641
642        for _ in 0..entry_count {
643            let trigram = read_u32(&mut lookup_reader).ok()?;
644            let offset = read_u64(&mut lookup_reader).ok()? as usize;
645            let count = read_u32(&mut lookup_reader).ok()? as usize;
646            let bytes_len = count.checked_mul(6)?;
647            let end = offset.checked_add(bytes_len)?;
648            if end > postings_blob.len() {
649                return None;
650            }
651
652            let mut trigram_postings = Vec::with_capacity(count);
653            for chunk in postings_blob[offset..end].chunks_exact(6) {
654                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
655                let posting = Posting {
656                    file_id,
657                    next_mask: chunk[4],
658                    loc_mask: chunk[5],
659                };
660                trigram_postings.push(posting.clone());
661                file_trigrams.entry(file_id).or_default().push(trigram);
662            }
663            postings.insert(trigram, trigram_postings);
664        }
665
666        Some(SearchIndex {
667            postings,
668            files,
669            path_to_id,
670            ready: true,
671            project_root,
672            git_head,
673            max_file_size,
674            file_trigrams,
675            unindexed_files,
676        })
677    }
678
    /// Git HEAD recorded when this index was built or loaded, if any.
    pub(crate) fn stored_git_head(&self) -> Option<&str> {
        self.git_head.as_deref()
    }
682
    /// Marks the index as ready (or not); reflected in reported `IndexStatus`.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
686
    /// Produces an up-to-date index for `root`, reusing `baseline` when possible.
    ///
    /// Reuse rules:
    /// * no git HEAD available -> full rebuild (changes can't be detected cheaply);
    /// * baseline HEAD equals `current_head` -> reuse the baseline as-is;
    /// * HEADs differ -> try an incremental refresh via `apply_git_diff_updates`,
    ///   falling back to a full rebuild if that fails.
    pub(crate) fn rebuild_or_refresh(
        root: &Path,
        max_file_size: u64,
        current_head: Option<String>,
        baseline: Option<SearchIndex>,
    ) -> Self {
        if current_head.is_none() {
            return SearchIndex::build_with_limit(root, max_file_size);
        }

        if let Some(mut baseline) = baseline {
            // Re-anchor the baseline to the (re-)canonicalized root and the
            // caller's current size limit before deciding whether to reuse it.
            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
            baseline.max_file_size = max_file_size;

            if baseline.git_head == current_head {
                baseline.ready = true;
                return baseline;
            }

            if let (Some(previous), Some(current)) =
                (baseline.git_head.clone(), current_head.clone())
            {
                let project_root = baseline.project_root.clone();
                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
                    baseline.git_head = Some(current);
                    baseline.ready = true;
                    return baseline;
                }
            }
        }

        SearchIndex::build_with_limit(root, max_file_size)
    }
720
721    fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
722        let file_id = u32::try_from(self.files.len()).ok()?;
723        let metadata = fs::metadata(path).ok();
724        let size = metadata
725            .as_ref()
726            .map_or(size_hint, |metadata| metadata.len());
727        let modified = metadata
728            .and_then(|metadata| metadata.modified().ok())
729            .unwrap_or(UNIX_EPOCH);
730
731        self.files.push(FileEntry {
732            path: path.to_path_buf(),
733            size,
734            modified,
735        });
736        self.path_to_id.insert(path.to_path_buf(), file_id);
737        Some(file_id)
738    }
739
740    fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
741        let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
742            return;
743        };
744        self.unindexed_files.insert(file_id);
745        self.file_trigrams.insert(file_id, Vec::new());
746    }
747
748    fn active_file_ids(&self) -> Vec<u32> {
749        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
750        ids.sort_unstable();
751        ids
752    }
753
754    fn is_active_file(&self, file_id: u32) -> bool {
755        self.files
756            .get(file_id as usize)
757            .map(|file| !file.path.as_os_str().is_empty())
758            .unwrap_or(false)
759    }
760
    /// Returns the ids of active files whose postings for `trigram` pass
    /// `filter` (when one is supplied). An unknown trigram yields an empty set.
    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> BTreeSet<u32> {
        let mut matches = BTreeSet::new();
        let Some(postings) = self.postings.get(&trigram) else {
            return matches;
        };

        for posting in postings {
            if let Some(filter) = filter {
                // next_mask: bloom filter check — the character following this trigram in the
                // query must also appear after this trigram somewhere in the file.
                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
                    continue;
                }
                // NOTE: loc_mask (position mod 8) is stored for future adjacency checks
                // between consecutive trigram pairs, but is NOT used as a single-trigram
                // filter because the position in the query string has no relationship to
                // the position in the file. Using it here causes false negatives.
            }
            // Skip tombstoned files whose postings haven't been pruned yet.
            if self.is_active_file(posting.file_id) {
                matches.insert(posting.file_id);
            }
        }

        matches
    }
786}
787
788pub fn decompose_regex(pattern: &str) -> RegexQuery {
789    let hir = match regex_syntax::parse(pattern) {
790        Ok(hir) => hir,
791        Err(_) => return RegexQuery::default(),
792    };
793
794    let build = build_query(&hir);
795    build.into_query()
796}
797
/// Packs three bytes into one u32 trigram key, `a` in the most significant
/// of the low three bytes (i.e. `a<<16 | b<<8 | c`).
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
801
/// Case-folds an ASCII byte so indexing and lookup are case-insensitive;
/// non-ASCII-uppercase bytes pass through unchanged.
pub fn normalize_char(c: u8) -> u8 {
    match c {
        b'A'..=b'Z' => c + (b'a' - b'A'),
        other => other,
    }
}
805
/// Returns every `(trigram, following-byte, byte-offset)` triple in `content`.
///
/// The three trigram bytes are ASCII-lowercased before packing; the byte
/// after the trigram is reported raw, with `0` standing in for end-of-input
/// after the final trigram. Inputs shorter than three bytes yield nothing.
pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
    if content.len() < 3 {
        return Vec::new();
    }

    let mut out = Vec::with_capacity(content.len() - 2);
    for (start, window) in content.windows(3).enumerate() {
        // Inlined pack_trigram(normalize_char(..), ..): lowercase then pack.
        let packed = (u32::from(window[0].to_ascii_lowercase()) << 16)
            | (u32::from(window[1].to_ascii_lowercase()) << 8)
            | u32::from(window[2].to_ascii_lowercase());
        // Byte immediately after the trigram; 0 marks end-of-input.
        let next = content.get(start + 3).copied().unwrap_or(0);
        out.push((packed, next, start));
    }
    out
}
823
824pub fn resolve_cache_dir(project_root: &Path) -> PathBuf {
825    let home = std::env::var_os("HOME")
826        .map(PathBuf::from)
827        .unwrap_or_else(|| PathBuf::from("."));
828    home.join(".cache")
829        .join("aft")
830        .join("index")
831        .join(project_cache_key(project_root))
832}
833
/// Compiles include/exclude glob lists into reusable `PathFilters`.
///
/// # Errors
/// Propagates whatever error `build_globset` reports for an invalid pattern.
// NOTE(review): `build_globset` is outside this chunk; it evidently returns
// `Result<Option<GlobSet>, String>` — confirm there.
pub(crate) fn build_path_filters(
    include: &[String],
    exclude: &[String],
) -> Result<PathFilters, String> {
    Ok(PathFilters {
        includes: build_globset(include)?,
        excludes: build_globset(exclude)?,
    })
}
843
/// Walks the whole tree under `root`, matching filters relative to `root` itself.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
847
/// Walks regular files under `search_root`, honoring gitignore rules and
/// skipping well-known dependency/build directories, then keeps only paths
/// that pass `filters` (matched relative to `filter_root`). Results are
/// sorted newest-modified first.
pub(crate) fn walk_project_files_from(
    filter_root: &Path,
    search_root: &Path,
    filters: &PathFilters,
) -> Vec<PathBuf> {
    let mut builder = WalkBuilder::new(search_root);
    builder
        .hidden(false)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .filter_entry(|entry| {
            // Prune heavyweight directories the walker should never descend into.
            let name = entry.file_name().to_string_lossy();
            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
                return !matches!(
                    name.as_ref(),
                    "node_modules"
                        | "target"
                        | "venv"
                        | ".venv"
                        | ".git"
                        | "__pycache__"
                        | ".tox"
                        | "dist"
                        | "build"
                );
            }
            true
        });

    let mut files = Vec::new();
    for entry in builder.build().filter_map(|entry| entry.ok()) {
        // Only regular files; directories (and entries with no file type,
        // e.g. broken symlinks) are skipped.
        if !entry
            .file_type()
            .map_or(false, |file_type| file_type.is_file())
        {
            continue;
        }
        let path = entry.into_path();
        if filters.matches(filter_root, &path) {
            files.push(path);
        }
    }

    sort_paths_by_mtime_desc(&mut files);
    files
}
895
896pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
897    let bytes = fs::read(path).ok()?;
898    if is_binary_bytes(&bytes) {
899        return None;
900    }
901    String::from_utf8(bytes).ok()
902}
903
/// Return `path` relative to `root`, or `path` unchanged when it is not
/// under `root`.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
909
910pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
911    paths.sort_by(|left, right| {
912        path_modified_time(right)
913            .cmp(&path_modified_time(left))
914            .then_with(|| left.cmp(right))
915    });
916}
917
918pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
919    matches.sort_by(|left, right| {
920        let left_path = resolve_match_path(project_root, &left.file);
921        let right_path = resolve_match_path(project_root, &right.file);
922
923        path_modified_time(&right_path)
924            .cmp(&path_modified_time(&left_path))
925            .then_with(|| left.file.cmp(&right.file))
926            .then_with(|| left.line.cmp(&right.line))
927            .then_with(|| left.column.cmp(&right.column))
928    });
929}
930
931pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
932    let resolved_project_root = canonicalize_or_normalize(project_root);
933    let root = match path {
934        Some(path) => {
935            let path = PathBuf::from(path);
936            if path.is_absolute() {
937                canonicalize_or_normalize(&path)
938            } else {
939                normalize_path(&resolved_project_root.join(path))
940            }
941        }
942        None => resolved_project_root.clone(),
943    };
944
945    let use_index = is_within_search_root(&resolved_project_root, &root);
946    SearchScope { root, use_index }
947}
948
/// Heuristically classify `content` as binary.
///
/// Delegates entirely to the `content_inspector` crate's detection; text
/// encodings it recognizes are treated as non-binary.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
952
/// Commit hash of `HEAD` in the repository at `root`, or `None` when `git`
/// is unavailable, `root` is not a repository, or the command fails.
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
956
957pub(crate) fn project_cache_key(project_root: &Path) -> String {
958    use sha2::{Digest, Sha256};
959
960    let canonical_root = canonicalize_or_normalize(project_root);
961    let mut hasher = Sha256::new();
962    hasher.update(canonical_root.to_string_lossy().as_bytes());
963    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
964        hasher.update(root_commit.as_bytes());
965    }
966    let digest = format!("{:x}", hasher.finalize());
967    digest[..16].to_string()
968}
969
970impl PathFilters {
971    fn matches(&self, root: &Path, path: &Path) -> bool {
972        let relative = to_glob_path(&relative_to_root(root, path));
973        if self
974            .includes
975            .as_ref()
976            .is_some_and(|includes| !includes.is_match(&relative))
977        {
978            return false;
979        }
980        if self
981            .excludes
982            .as_ref()
983            .is_some_and(|excludes| excludes.is_match(&relative))
984        {
985            return false;
986        }
987        true
988    }
989}
990
/// Canonicalize `path` through the filesystem; when that fails (e.g. the
/// path does not exist), fall back to a purely lexical normalization.
fn canonicalize_or_normalize(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
}
994
/// Absolute path for a grep match: relative match paths are anchored at the
/// project root; absolute ones pass through unchanged.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    project_root.join(path)
}
1002
/// Filesystem mtime for `path`, or `None` when the metadata cannot be read
/// (missing file, permissions, platform without mtime support).
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1008
/// Lexically normalize `path`: drop `.` components and let `..` cancel the
/// preceding component when possible (a leading `..` with nothing to cancel
/// is preserved). No filesystem access, so symlinks are not resolved.
fn normalize_path(path: &Path) -> PathBuf {
    path.components().fold(PathBuf::new(), |mut acc, component| {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                // Keep the `..` only when there is nothing left to pop.
                if !acc.pop() {
                    acc.push(component);
                }
            }
            other => acc.push(other),
        }
        acc
    })
}
1024
/// True when `path` is `search_root` itself or lies underneath it
/// (component-wise prefix test, no filesystem access).
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.strip_prefix(search_root).is_ok()
}
1028
1029impl QueryBuild {
1030    fn into_query(self) -> RegexQuery {
1031        let mut query = RegexQuery::default();
1032
1033        for run in self.and_runs {
1034            add_run_to_and_query(&mut query, &run);
1035        }
1036
1037        for group in self.or_groups {
1038            let mut trigrams = BTreeSet::new();
1039            let mut filters = HashMap::new();
1040            for run in group {
1041                for (trigram, filter) in trigram_filters(&run) {
1042                    trigrams.insert(trigram);
1043                    merge_filter(filters.entry(trigram).or_default(), filter);
1044                }
1045            }
1046            if !trigrams.is_empty() {
1047                query.or_groups.push(trigrams.into_iter().collect());
1048                query.or_filters.push(filters);
1049            }
1050        }
1051
1052        query
1053    }
1054}
1055
1056fn build_query(hir: &Hir) -> QueryBuild {
1057    match hir.kind() {
1058        HirKind::Literal(literal) => {
1059            if literal.0.len() >= 3 {
1060                QueryBuild {
1061                    and_runs: vec![literal.0.to_vec()],
1062                    or_groups: Vec::new(),
1063                }
1064            } else {
1065                QueryBuild::default()
1066            }
1067        }
1068        HirKind::Capture(capture) => build_query(&capture.sub),
1069        HirKind::Concat(parts) => {
1070            let mut build = QueryBuild::default();
1071            for part in parts {
1072                let part_build = build_query(part);
1073                build.and_runs.extend(part_build.and_runs);
1074                build.or_groups.extend(part_build.or_groups);
1075            }
1076            build
1077        }
1078        HirKind::Alternation(parts) => {
1079            let mut group = Vec::new();
1080            for part in parts {
1081                let Some(mut choices) = guaranteed_run_choices(part) else {
1082                    return QueryBuild::default();
1083                };
1084                group.append(&mut choices);
1085            }
1086            if group.is_empty() {
1087                QueryBuild::default()
1088            } else {
1089                QueryBuild {
1090                    and_runs: Vec::new(),
1091                    or_groups: vec![group],
1092                }
1093            }
1094        }
1095        HirKind::Repetition(repetition) => {
1096            if repetition.min == 0 {
1097                QueryBuild::default()
1098            } else {
1099                build_query(&repetition.sub)
1100            }
1101        }
1102        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1103    }
1104}
1105
1106fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1107    match hir.kind() {
1108        HirKind::Literal(literal) => {
1109            if literal.0.len() >= 3 {
1110                Some(vec![literal.0.to_vec()])
1111            } else {
1112                None
1113            }
1114        }
1115        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1116        HirKind::Concat(parts) => {
1117            let mut runs = Vec::new();
1118            for part in parts {
1119                if let Some(mut part_runs) = guaranteed_run_choices(part) {
1120                    runs.append(&mut part_runs);
1121                }
1122            }
1123            if runs.is_empty() {
1124                None
1125            } else {
1126                Some(runs)
1127            }
1128        }
1129        HirKind::Alternation(parts) => {
1130            let mut runs = Vec::new();
1131            for part in parts {
1132                let Some(mut part_runs) = guaranteed_run_choices(part) else {
1133                    return None;
1134                };
1135                runs.append(&mut part_runs);
1136            }
1137            if runs.is_empty() {
1138                None
1139            } else {
1140                Some(runs)
1141            }
1142        }
1143        HirKind::Repetition(repetition) => {
1144            if repetition.min == 0 {
1145                None
1146            } else {
1147                guaranteed_run_choices(&repetition.sub)
1148            }
1149        }
1150        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1151    }
1152}
1153
1154fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1155    for (trigram, filter) in trigram_filters(run) {
1156        if !query.and_trigrams.contains(&trigram) {
1157            query.and_trigrams.push(trigram);
1158        }
1159        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1160    }
1161}
1162
1163fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1164    let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1165    for (trigram, next_char, position) in extract_trigrams(run) {
1166        let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1167        if next_char != EOF_SENTINEL {
1168            entry.next_mask |= mask_for_next_char(next_char);
1169        }
1170        entry.loc_mask |= mask_for_position(position);
1171    }
1172    filters.into_iter().collect()
1173}
1174
1175fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1176    target.next_mask |= filter.next_mask;
1177    target.loc_mask |= filter.loc_mask;
1178}
1179
1180fn mask_for_next_char(next_char: u8) -> u8 {
1181    let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1182    1u8 << bit
1183}
1184
/// Fold a trigram's byte offset into one of the 8 position bloom bits.
fn mask_for_position(position: usize) -> u8 {
    // position & 7 == position % 8 for unsigned values.
    1u8 << (position & 7)
}
1188
1189fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1190    if patterns.is_empty() {
1191        return Ok(None);
1192    }
1193
1194    let mut builder = GlobSetBuilder::new();
1195    for pattern in patterns {
1196        let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1197        builder.add(glob);
1198    }
1199    builder.build().map(Some).map_err(|error| error.to_string())
1200}
1201
/// Read a little-endian `u32`; errors on short input (`read_exact`).
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut bytes = [0u8; 4];
    reader
        .read_exact(&mut bytes)
        .map(|_| u32::from_le_bytes(bytes))
}
1207
/// Read a little-endian `u64`; errors on short input (`read_exact`).
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut bytes = [0u8; 8];
    reader
        .read_exact(&mut bytes)
        .map(|_| u64::from_le_bytes(bytes))
}
1213
/// Write `value` as 4 little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1217
/// Write `value` as 8 little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1221
/// Run `git -C <root> <args>` and return its trimmed stdout.
///
/// `None` when git cannot be spawned, exits non-zero, prints non-UTF-8
/// output, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
1240
1241fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1242    let diff_range = format!("{}..{}", from, to);
1243    let output = match Command::new("git")
1244        .arg("-C")
1245        .arg(root)
1246        .args(["diff", "--name-only", &diff_range])
1247        .output()
1248    {
1249        Ok(output) => output,
1250        Err(_) => return false,
1251    };
1252
1253    if !output.status.success() {
1254        return false;
1255    }
1256
1257    let Ok(paths) = String::from_utf8(output.stdout) else {
1258        return false;
1259    };
1260
1261    for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1262        let path = root.join(relative_path);
1263        if path.exists() {
1264            index.update_file(&path);
1265        } else {
1266            index.remove_file(&path);
1267        }
1268    }
1269
1270    true
1271}
1272
1273fn is_binary_path(path: &Path, size: u64) -> bool {
1274    if size == 0 {
1275        return false;
1276    }
1277
1278    let mut file = match File::open(path) {
1279        Ok(file) => file,
1280        Err(_) => return true,
1281    };
1282
1283    let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1284    match file.read(&mut preview) {
1285        Ok(read) => is_binary_bytes(&preview[..read]),
1286        Err(_) => true,
1287    }
1288}
1289
/// Byte offsets where each line begins: always offset 0, then the byte
/// after every `\n`.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .bytes()
                .enumerate()
                .filter(|&(_, byte)| byte == b'\n')
                .map(|(index, _)| index + 1),
        )
        .collect()
}
1299
/// Resolve a byte `offset` into (1-based line, 1-based char column, line
/// text without trailing `\r`), using the precomputed `line_starts` table.
///
/// NOTE(review): assumes `offset` lies on a UTF-8 char boundary (it comes
/// from regex match positions) — slicing would panic otherwise.
fn line_details(content: &str, line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // The containing line is the last start <= offset.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);
    let line_end = content[line_start..]
        .find('\n')
        .map_or(content.len(), |length| line_start + length);
    let line_text = content[line_start..line_end]
        .trim_end_matches('\r')
        .to_string();
    // Column counts characters, not bytes, so multibyte text reports sanely.
    let column = 1 + content[line_start..offset].chars().count() as u32;
    (line_index as u32 + 1, column, line_text)
}
1316
/// Render `path` with forward slashes so glob matching behaves the same on
/// Windows and Unix.
fn to_glob_path(path: &Path) -> String {
    let rendered = path.to_string_lossy();
    rendered.replace('\\', "/")
}
1320
// Unit tests covering trigram extraction, regex decomposition, candidate
// selection, on-disk round-trips, cache keys, and search-scope behavior.
#[cfg(test)]
mod tests {
    use std::process::Command;

    use super::*;

    // Each trigram records the byte that follows it (EOF_SENTINEL for the
    // last one) and its start offset; input bytes are lowercased first,
    // as the expected packed values show.
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }

    // Required literals become AND trigrams; the alternation's branches all
    // land in a single OR group.
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }

    // Only files containing every AND trigram survive candidate selection.
    #[test]
    fn candidates_intersect_posting_lists() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }

    // A posting survives filtering only when its next-char/position bloom
    // masks are compatible with the query's filter.
    #[test]
    fn candidates_apply_bloom_filters() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }

    // Writing then reading the index must preserve the stored git head, the
    // file table (relative to the project root), and the posting lists.
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }

    // The atomic-write scheme must leave the final files in place and no
    // leftover `.tmp` staging files.
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("postings.bin").is_file());
        assert!(cache_dir.join("lookup.bin").is_file());
        assert!(!cache_dir.join("postings.bin.tmp").exists());
        assert!(!cache_dir.join("lookup.bin.tmp").exists());
    }

    // Two checkouts of the same repository at different paths must get
    // distinct cache keys (the key hashes the canonical checkout path).
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        assert_ne!(source_key, clone_key);
    }

    // A search path outside the project root must disable the index and
    // canonicalize the external root for the fallback walk.
    #[test]
    fn resolve_search_scope_disables_index_for_external_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let outside = dir.path().join("outside");
        fs::create_dir_all(&project).expect("create project dir");
        fs::create_dir_all(&outside).expect("create outside dir");

        let scope = resolve_search_scope(&project, outside.to_str());

        assert_eq!(
            scope.root,
            fs::canonicalize(&outside).expect("canonicalize outside")
        );
        assert!(!scope.use_index);
    }

    // Searching under a subdirectory must not report matches from sibling
    // directories, and returned paths are root-relative.
    #[test]
    fn grep_filters_matches_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let docs = project.join("docs");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&docs).expect("create docs dir");
        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.files_searched, 1);
        assert_eq!(result.files_with_matches, 1);
        assert_eq!(result.matches.len(), 1);
        assert_eq!(result.matches[0].file, PathBuf::from("src/main.rs"));
    }

    // Multiple hits on one line collapse into a single reported match.
    #[test]
    fn grep_deduplicates_multiple_matches_on_same_line() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.total_matches, 1);
        assert_eq!(result.matches.len(), 1);
    }

    // `total_matches` counts all hits even when the returned list is capped,
    // and the truncation flag is raised.
    #[test]
    fn grep_reports_total_matches_before_truncation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);

        assert_eq!(result.total_matches, 2);
        assert_eq!(result.matches.len(), 1);
        assert!(result.truncated);
    }

    // Glob results are restricted to the search root and canonicalized.
    #[test]
    fn glob_filters_results_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let scripts = project.join("scripts");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&scripts).expect("create scripts dir");
        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.rs", &src);

        assert_eq!(
            files,
            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
        );
    }

    // Unlike grep, glob must surface hidden directories and binary files.
    #[test]
    fn glob_includes_hidden_and_binary_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let hidden_dir = project.join(".hidden");
        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
        let hidden_file = hidden_dir.join("data.bin");
        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.bin", &project);

        assert_eq!(
            files,
            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
        );
    }
}