1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
/// Default per-file size limit (1 MiB) used by `SearchIndex::new` and `SearchIndex::build`.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
// CACHE_MAGIC is the first u32 written to `cache.bin`; INDEX_MAGIC is the 8-byte tag that
// opens the postings section inside it.
const CACHE_MAGIC: u32 = 0x3144_4958; const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// 8-byte tag that opens the lookup section of `cache.bin`.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// Format version written to (and required from) the cache header and both sections.
const INDEX_VERSION: u32 = 4;
// NOTE(review): not referenced in the part of the file visible here — presumably the number
// of leading bytes inspected when sniffing for binary content; confirm at the use site.
const PREVIEW_BYTES: usize = 8 * 1024;
// NOTE(review): not referenced in the part of the file visible here; confirm at the use site.
const EOF_SENTINEL: u8 = 0;
/// Sanity cap applied when reading a cache: file count, lookup entry count and
/// per-trigram posting count above this make the cache be rejected.
const MAX_ENTRIES: usize = 10_000_000;
/// Fixed part of one cached file record: 1 (unindexed flag) + 4 (path length) + 8 (size)
/// + 8 (mtime seconds) + 4 (mtime nanoseconds) + 32 (content hash).
const MIN_FILE_ENTRY_BYTES: usize = 57;
/// One cached lookup entry: 4 (trigram) + 8 (blob offset) + 4 (posting count).
const LOOKUP_ENTRY_BYTES: usize = 16;
/// One cached posting: 4 (file id) + 1 (`next_mask`) + 1 (`loc_mask`).
const POSTING_BYTES: usize = 6;
/// Held by `CacheLock::acquire` while it waits on the lock file, so lock acquisition
/// attempts within this process run one at a time.
static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
/// Handle on the search cache's lock file (`cache.lock` inside the cache directory).
///
/// The wrapped `fs_lock::LockGuard` is never read; it is stored only so that it lives
/// as long as this value does.
/// NOTE(review): presumably dropping the guard releases the lock — confirm in `fs_lock`.
pub struct CacheLock {
    _guard: fs_lock::LockGuard,
}
38
39impl CacheLock {
40 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41 fs::create_dir_all(cache_dir)?;
42 let path = cache_dir.join("cache.lock");
43 let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44 .lock()
45 .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46 fs_lock::try_acquire(&path, Duration::from_secs(2))
47 .map(|guard| Self { _guard: guard })
48 .map_err(|error| match error {
49 fs_lock::AcquireError::Timeout => {
50 std::io::Error::other("timed out acquiring search cache lock")
51 }
52 fs_lock::AcquireError::Io(error) => error,
53 })
54 }
55}
56
/// In-memory trigram index over the files of one project.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// Trigram -> posting list. `index_file` re-sorts a list by file id whenever a push
    /// lands out of order.
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table indexed by file id. A removed file keeps its slot, with an empty path.
    pub files: Vec<FileEntry>,
    /// Path -> file id for the files currently tracked.
    pub path_to_id: HashMap<PathBuf, u32>,
    /// Set to `true` at the end of a build; an index loaded from the disk cache starts
    /// out `false`. Reported by `grep` as `Ready` / `Building`.
    pub ready: bool,
    /// Project root; builds store the canonicalized root when canonicalization succeeds.
    project_root: PathBuf,
    /// Git head recorded by the build or read back from the cache, if any.
    git_head: Option<String>,
    /// Files larger than this many bytes are tracked but not trigram-indexed.
    max_file_size: u64,
    /// Fingerprint of the ignore rules; a cache whose stored fingerprint differs from
    /// the current one is rejected on load.
    ignore_rules_fingerprint: String,
    /// File id -> trigrams that have a posting for it; lets `remove_file` find the
    /// posting lists to prune.
    pub file_trigrams: HashMap<u32, Vec<u32>>,
    /// Ids of files that are tracked without trigram postings (binary or oversized).
    /// `candidates` always includes the active ones.
    unindexed_files: HashSet<u32>,
}
70
/// Output of `SearchIndex::lexical_rank_with_stats`.
#[derive(Clone, Debug, Default)]
pub struct LexicalRankResult {
    /// Ranked `(path, score)` pairs, highest score first, at most `max_files` long.
    pub files: Vec<(PathBuf, f32)>,
    /// `true` when the number of candidates gathered before filtering exceeded the
    /// engine's candidate cap.
    pub engine_capped: bool,
}
76
77impl SearchIndex {
78 pub fn file_count(&self) -> usize {
80 self.files.len()
81 }
82
83 pub fn trigram_count(&self) -> usize {
85 self.postings.len()
86 }
87
88 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
90 query_trigrams_from_tokens(tokens)
91 }
92
93 pub fn lexical_rank(
95 &self,
96 query_trigrams: &[u32],
97 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
98 max_files: usize,
99 ) -> Vec<(PathBuf, f32)> {
100 self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
101 .files
102 }
103
104 pub fn lexical_rank_with_stats(
107 &self,
108 query_trigrams: &[u32],
109 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
110 max_files: usize,
111 ) -> LexicalRankResult {
112 if query_trigrams.is_empty() || max_files == 0 {
113 return LexicalRankResult::default();
114 }
115
116 let mut non_zero: Vec<(u32, usize)> = query_trigrams
117 .iter()
118 .filter_map(|trigram| {
119 let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
120 (posting_count > 0).then_some((*trigram, posting_count))
121 })
122 .collect();
123 if non_zero.is_empty() {
124 return LexicalRankResult::default();
125 }
126
127 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
128 let selected_count = non_zero.len().min(3);
129 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
130
131 let mut candidate_ids = BTreeSet::new();
132 for (trigram, _) in non_zero.iter().take(selected_count) {
133 if let Some(postings) = self.postings.get(trigram) {
134 for posting in postings {
135 if self.is_active_file(posting.file_id) {
136 candidate_ids.insert(posting.file_id);
137 }
138 }
139 }
140 }
141 let pre_filter_candidate_count = candidate_ids.len();
142 let engine_capped = pre_filter_candidate_count > candidate_cap;
143 let filtered_candidates = candidate_ids
144 .into_iter()
145 .filter_map(|file_id| {
146 self.files
147 .get(file_id as usize)
148 .map(|entry| (file_id, entry))
149 })
150 .filter(|(_, entry)| {
151 if let Some(filter) = candidate_filter {
152 filter(&entry.path)
153 } else {
154 true
155 }
156 })
157 .collect::<Vec<_>>();
158
159 let mut ranked = Vec::new();
160 for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
161 let score = lexical_score(self, query_trigrams, file_id);
162 if score > 0.0 {
163 ranked.push((entry.path.clone(), score));
164 }
165 }
166
167 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
168 ranked.truncate(max_files);
169 LexicalRankResult {
170 files: ranked,
171 engine_capped,
172 }
173 }
174}
175
/// One entry of a trigram's posting list.
///
/// Serialized by `write_to_disk` as six bytes: the little-endian `file_id`, then
/// `next_mask`, then `loc_mask`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Id of the file the trigram occurs in (index into `SearchIndex::files`).
    pub file_id: u32,
    /// Copied from the per-trigram filter produced by `trigram_filter_map`.
    /// NOTE(review): bit meaning is defined there, not in this part of the file.
    pub next_mask: u8,
    /// Copied from the per-trigram filter produced by `trigram_filter_map`.
    /// NOTE(review): bit meaning is defined there, not in this part of the file.
    pub loc_mask: u8,
}
182
/// Metadata kept for one slot of the file table.
///
/// `remove_file` blanks a slot instead of deleting it: empty path, size 0,
/// `UNIX_EPOCH` mtime and the zero hash.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Path of the file; empty for a removed slot.
    pub path: PathBuf,
    /// File size in bytes.
    pub size: u64,
    /// Cached modification time; used to order `glob` results and grep matches.
    pub modified: SystemTime,
    /// Hash of the file content; `index_file` sets it via `cache_freshness::hash_bytes`.
    pub content_hash: blake3::Hash,
}
190
/// A single match reported by a grep search.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// File the match was found in.
    pub file: PathBuf,
    /// Line of the match. NOTE(review): base (0 or 1) is set by the scanner, not visible here.
    pub line: u32,
    /// Column of the match. NOTE(review): base and unit are set by the scanner, not visible here.
    pub column: u32,
    /// Text of the line containing the match.
    pub line_text: String,
    /// Text of the match itself.
    pub match_text: String,
}
199
/// Matches plus bookkeeping returned by `SearchIndex::grep` / `SearchIndex::search_grep`.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// The matches that were collected.
    pub matches: Vec<GrepMatch>,
    /// Value of the shared match counter when the search finished.
    pub total_matches: usize,
    /// Value of the shared "files searched" counter when the search finished.
    pub files_searched: usize,
    /// Value of the shared "files with matches" counter when the search finished.
    pub files_with_matches: usize,
    /// `Ready` when the index's `ready` flag was set, `Building` otherwise.
    pub index_status: IndexStatus,
    /// Value of the shared "truncated" flag when the search finished.
    pub truncated: bool,
    /// `true` when the pattern yielded no trigram constraints at all, so the index
    /// could not narrow the candidate set.
    pub fully_degraded: bool,
    /// `true` when the scan stopped early because the stop threshold was reached.
    pub engine_capped: bool,
}
211
/// State of the search index as reported alongside search results.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    /// The index finished building.
    Ready,
    /// The index exists but its `ready` flag is not set yet.
    Building,
    Fallback,
    Disabled,
}

impl IndexStatus {
    /// Returns the variant's name as a static string (`"Ready"`, `"Building"`,
    /// `"Fallback"` or `"Disabled"`).
    pub fn as_str(&self) -> &'static str {
        use IndexStatus::{Building, Disabled, Fallback, Ready};

        match *self {
            Ready => "Ready",
            Building => "Building",
            Fallback => "Fallback",
            Disabled => "Disabled",
        }
    }
}
230
/// Trigram constraints derived from a search pattern.
///
/// `SearchIndex::candidates` keeps a file only if it appears under every trigram in
/// `and_trigrams` and, for each group in `or_groups`, under at least one trigram of
/// that group. A query with both lists empty places no constraint.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Trigrams that must all be present.
    pub and_trigrams: Vec<u32>,
    /// Groups of alternatives; one trigram per group must be present.
    pub or_groups: Vec<Vec<u32>>,
    /// Optional posting filter for each trigram of `and_trigrams`.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Optional posting filters for `or_groups`, looked up by the group's index.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
238
/// Pair of masks attached to a trigram, with the same two fields a `Posting` stores.
///
/// `index_file` copies these values into each `Posting`; queries pass one to
/// `postings_for_trigram` to narrow a posting list.
/// NOTE(review): the bit semantics are defined by `trigram_filter_map` /
/// `postings_for_trigram`, which are not visible in this part of the file.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    next_mask: u8,
    loc_mask: u8,
}
244
/// Byte-string form of a query: runs that must all match and groups of alternatives.
///
/// NOTE(review): not used in the part of the file visible here — presumably the
/// intermediate result of pattern decomposition before trigrams are extracted;
/// confirm against `decompose_regex`.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    and_runs: Vec<Vec<u8>>,
    or_groups: Vec<Vec<Vec<u8>>>,
}
250
/// Optional include / exclude glob sets applied to file paths.
///
/// The default value has neither set. Searches evaluate it through
/// `filters.matches(project_root, path)`.
/// NOTE(review): how a missing set is interpreted is decided in `matches`, which is
/// not visible in this part of the file.
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    includes: Option<GlobSet>,
    excludes: Option<GlobSet>,
}
256
/// A search root paired with a flag saying whether the index should be used for it.
///
/// NOTE(review): not constructed in the part of the file visible here; confirm the
/// exact meaning of `use_index` at the call sites.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    pub root: PathBuf,
    pub use_index: bool,
}
262
/// Internal counterpart of `GrepMatch` whose file path sits behind an `Arc`.
///
/// `search_grep` collects these while scanning and converts each one into a
/// `GrepMatch` at the end by cloning the path out of the `Arc`.
#[derive(Clone, Debug)]
struct SharedGrepMatch {
    file: Arc<PathBuf>,
    line: u32,
    column: u32,
    line_text: String,
    match_text: String,
}
271
/// Matcher used while scanning candidate files.
///
/// `search_grep` builds it from a `CompiledPattern`: a literal pattern becomes
/// `Literal`, a compiled regular expression becomes `Regex`.
#[derive(Clone, Debug)]
enum SearchMatcher {
    /// Literal search, as produced by `pattern_compile`.
    Literal(LiteralSearch),
    /// Byte-oriented regular expression (`regex::bytes::Regex`).
    Regex(Regex),
}
277
278impl SearchIndex {
279 pub fn new() -> Self {
280 SearchIndex {
281 postings: HashMap::new(),
282 files: Vec::new(),
283 path_to_id: HashMap::new(),
284 ready: false,
285 project_root: PathBuf::new(),
286 git_head: None,
287 max_file_size: DEFAULT_MAX_FILE_SIZE,
288 ignore_rules_fingerprint: String::new(),
289 file_trigrams: HashMap::new(),
290 unindexed_files: HashSet::new(),
291 }
292 }
293
294 pub fn build(root: &Path) -> Self {
295 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
296 }
297
298 pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
299 let started = std::time::Instant::now();
300 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
301 let mut index = SearchIndex {
302 project_root: project_root.clone(),
303 max_file_size,
304 ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
305 ..SearchIndex::new()
306 };
307
308 let filters = PathFilters::default();
309 let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
310 let indexed = index.ingest_paths_parallel(&paths);
311
312 index.git_head = current_git_head(&project_root);
313 index.ready = true;
314 crate::slog_info!(
315 "search index cold build: {} files, {} trigrams, {} ms (pool={})",
316 indexed,
317 index.postings.len(),
318 started.elapsed().as_millis(),
319 search_index_build_pool_size()
320 );
321 index
322 }
323
324 #[cfg(test)]
326 pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
327 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
328 let mut index = SearchIndex {
329 project_root: project_root.clone(),
330 max_file_size,
331 ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
332 ..SearchIndex::new()
333 };
334 let filters = PathFilters::default();
335 for path in walk_project_files(&project_root, &filters) {
336 index.update_file(&path);
337 }
338 index.git_head = current_git_head(&project_root);
339 index.ready = true;
340 index
341 }
342
343 fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
344 let max_file_size = self.max_file_size;
345
346 let parallel_read = || -> Vec<(PathBuf, Vec<u8>)> {
347 paths
348 .par_iter()
349 .filter_map(|path| {
350 let metadata = fs::metadata(path).ok()?;
351 if !metadata.is_file() {
352 return None;
353 }
354 if is_binary_path(path, metadata.len()) || metadata.len() > max_file_size {
355 return None;
356 }
357 let content = fs::read(path).ok()?;
358 if is_binary_bytes(&content) {
359 return None;
360 }
361 Some((path.clone(), content))
362 })
363 .collect()
364 };
365
366 let readable: Vec<(PathBuf, Vec<u8>)> = match rayon::ThreadPoolBuilder::new()
367 .num_threads(search_index_build_pool_size())
368 .thread_name(|index| format!("aft-search-build-{index}"))
369 .stack_size(8 * 1024 * 1024)
370 .build()
371 {
372 Ok(pool) => pool.install(parallel_read),
373 Err(error) => {
374 log::warn!(
375 "search index: bounded build pool unavailable ({error}); using global pool"
376 );
377 parallel_read()
378 }
379 };
380
381 let mut readable_by_path: HashMap<PathBuf, Vec<u8>> = readable.into_iter().collect();
382 let mut indexed = 0usize;
383 for path in paths {
384 if let Some(content) = readable_by_path.remove(path) {
385 self.index_file(path, &content);
386 indexed += 1;
387 } else {
388 self.update_file(path);
389 if self.path_to_id.contains_key(path) {
390 indexed += 1;
391 }
392 }
393 }
394 indexed
395 }
396
397 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
398 self.remove_file(path);
399
400 let file_id = match self.allocate_file_id(path, content.len() as u64) {
401 Some(file_id) => file_id,
402 None => return,
403 };
404 if let Some(file) = self.files.get_mut(file_id as usize) {
405 file.content_hash = cache_freshness::hash_bytes(content);
406 }
407
408 let trigram_map = trigram_filter_map(content, true);
409
410 let mut file_trigrams = Vec::with_capacity(trigram_map.len());
411 for (trigram, filter) in trigram_map {
412 let postings = self.postings.entry(trigram).or_default();
413 postings.push(Posting {
414 file_id,
415 next_mask: filter.next_mask,
416 loc_mask: filter.loc_mask,
417 });
418 if postings.len() > 1
422 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
423 {
424 postings.sort_unstable_by_key(|p| p.file_id);
425 }
426 file_trigrams.push(trigram);
427 }
428
429 self.file_trigrams.insert(file_id, file_trigrams);
430 self.unindexed_files.remove(&file_id);
431 }
432
433 pub fn remove_file(&mut self, path: &Path) {
434 let canonical_path = canonicalize_existing_or_deleted_path(path);
435 let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
436 file_id
437 } else if canonical_path.as_path() != path {
438 let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
439 return;
440 };
441 file_id
442 } else {
443 return;
444 };
445
446 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
447 for trigram in trigrams {
448 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
449 postings.retain(|posting| posting.file_id != file_id);
450 postings.is_empty()
451 } else {
452 false
453 };
454
455 if should_remove {
456 self.postings.remove(&trigram);
457 }
458 }
459 }
460
461 self.unindexed_files.remove(&file_id);
462 if let Some(file) = self.files.get_mut(file_id as usize) {
463 file.path = PathBuf::new();
464 file.size = 0;
465 file.modified = UNIX_EPOCH;
466 file.content_hash = cache_freshness::zero_hash();
467 }
468 }
469
470 pub fn update_file(&mut self, path: &Path) {
471 self.remove_file(path);
472
473 let metadata = match fs::metadata(path) {
474 Ok(metadata) if metadata.is_file() => metadata,
475 _ => return,
476 };
477
478 if is_binary_path(path, metadata.len()) {
479 self.track_unindexed_file(path, &metadata);
480 return;
481 }
482
483 if metadata.len() > self.max_file_size {
484 self.track_unindexed_file(path, &metadata);
485 return;
486 }
487
488 let content = match fs::read(path) {
489 Ok(content) => content,
490 Err(_) => return,
491 };
492
493 if is_binary_bytes(&content) {
494 self.track_unindexed_file(path, &metadata);
495 return;
496 }
497
498 self.index_file(path, &content);
499 }
500
501 pub fn grep(
502 &self,
503 pattern: &str,
504 case_sensitive: bool,
505 include: &[String],
506 exclude: &[String],
507 search_root: &Path,
508 max_results: usize,
509 ) -> GrepResult {
510 match pattern_compile::compile(
511 pattern,
512 CompileOpts {
513 case_insensitive: !case_sensitive,
514 ..CompileOpts::default()
515 },
516 ) {
517 CompileResult::Ok(compiled) => {
518 self.search_grep(&compiled, include, exclude, search_root, max_results)
519 }
520 CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
521 self.empty_grep_result()
522 }
523 }
524 }
525
    /// Runs an already-compiled pattern over the indexed files.
    ///
    /// The pattern is decomposed into trigram constraints to pick candidate files,
    /// which are then narrowed to live entries under `search_root` that pass the
    /// include / exclude globs. Candidates are scanned in parallel when there are
    /// more than ten of them and serially otherwise. Matches are finally handed to
    /// `sort_shared_grep_matches_by_cached_mtime_desc` and converted to `GrepMatch`.
    ///
    /// The counters and flags of the returned `GrepResult` are read from atomics
    /// shared with the per-file scanner (`search_candidate_file`).
    pub fn search_grep(
        &self,
        pattern: &CompiledPattern,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        let matcher = match pattern {
            CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
            CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
        };

        // Globs that fail to build are not reported: the search proceeds with the
        // default (empty) filters instead.
        let filters = match build_path_filters(include, exclude) {
            Ok(filters) => filters,
            Err(_) => PathFilters::default(),
        };
        let search_root = canonicalize_or_normalize(search_root);

        // A case-insensitive pattern containing non-ASCII text gets an empty query,
        // which makes every active file a candidate.
        let raw_pattern = pattern.raw_pattern_for_trigrams();
        let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
            RegexQuery::default()
        } else {
            decompose_regex(&raw_pattern)
        };
        let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
        let candidate_ids = self.candidates(&query);

        // Resolve ids to file entries, dropping blanked slots (empty path), files
        // outside the search root, and files rejected by the globs.
        let candidate_files: Vec<&FileEntry> = candidate_ids
            .into_iter()
            .filter_map(|file_id| self.files.get(file_id as usize))
            .filter(|file| !file.path.as_os_str().is_empty())
            .filter(|file| is_within_search_root(&search_root, &file.path))
            .filter(|file| filters.matches(&self.project_root, &file.path))
            .collect();

        let total_matches = AtomicUsize::new(0);
        let files_searched = AtomicUsize::new(0);
        let files_with_matches = AtomicUsize::new(0);
        let truncated = AtomicBool::new(false);
        let engine_capped = AtomicBool::new(false);
        // Stop threshold handed to the scanners: twice the requested result count.
        let stop_after = max_results.saturating_mul(2);
        let stop_scan = Arc::new(AtomicBool::new(false));

        let mut matches = if candidate_files.len() > 10 {
            candidate_files
                .par_iter()
                .map(|file| {
                    // Once the stop condition holds, remaining files are skipped and
                    // the result is flagged as engine-capped.
                    if grep_scan_should_stop(
                        Some(&stop_scan),
                        &truncated,
                        &total_matches,
                        stop_after,
                    ) {
                        engine_capped.store(true, Ordering::Relaxed);
                        return Vec::new();
                    }
                    search_candidate_file(
                        file,
                        &matcher,
                        max_results,
                        stop_after,
                        &total_matches,
                        &files_searched,
                        &files_with_matches,
                        &truncated,
                        &engine_capped,
                        Some(&stop_scan),
                    )
                })
                .reduce(Vec::new, |mut left, mut right| {
                    left.append(&mut right);
                    left
                })
        } else {
            // Serial path: no shared stop flag is passed to the scanner.
            let mut matches = Vec::new();
            for file in candidate_files {
                matches.extend(search_candidate_file(
                    file,
                    &matcher,
                    max_results,
                    stop_after,
                    &total_matches,
                    &files_searched,
                    &files_with_matches,
                    &truncated,
                    &engine_capped,
                    None,
                ));

                if should_stop_search(&truncated, &total_matches, stop_after) {
                    engine_capped.store(true, Ordering::Relaxed);
                    break;
                }
            }
            matches
        };

        // The closure supplies each file's cached mtime from the file table.
        sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
            self.path_to_id
                .get(path)
                .and_then(|file_id| self.files.get(*file_id as usize))
                .map(|file| file.modified)
        });

        let matches = matches
            .into_iter()
            .map(|matched| GrepMatch {
                file: matched.file.as_ref().clone(),
                line: matched.line,
                column: matched.column,
                line_text: matched.line_text,
                match_text: matched.match_text,
            })
            .collect();

        GrepResult {
            total_matches: total_matches.load(Ordering::Relaxed),
            matches,
            files_searched: files_searched.load(Ordering::Relaxed),
            files_with_matches: files_with_matches.load(Ordering::Relaxed),
            index_status: if self.ready {
                IndexStatus::Ready
            } else {
                IndexStatus::Building
            },
            truncated: truncated.load(Ordering::Relaxed),
            fully_degraded,
            engine_capped: engine_capped.load(Ordering::Relaxed),
        }
    }
664
665 fn empty_grep_result(&self) -> GrepResult {
666 GrepResult {
667 matches: Vec::new(),
668 total_matches: 0,
669 files_searched: 0,
670 files_with_matches: 0,
671 index_status: if self.ready {
672 IndexStatus::Ready
673 } else {
674 IndexStatus::Building
675 },
676 truncated: false,
677 fully_degraded: false,
678 engine_capped: false,
679 }
680 }
681
682 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
683 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
684 Ok(filters) => filters,
685 Err(_) => return Vec::new(),
686 };
687 let search_root = canonicalize_or_normalize(search_root);
688 let mut entries = self
689 .files
690 .iter()
691 .filter(|file| !file.path.as_os_str().is_empty())
692 .filter(|file| is_within_search_root(&search_root, &file.path))
693 .filter(|file| filters.matches(&self.project_root, &file.path))
694 .map(|file| (file.path.clone(), file.modified))
695 .collect::<Vec<_>>();
696
697 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
698 right_mtime
699 .cmp(left_mtime)
700 .then_with(|| left_path.cmp(right_path))
701 });
702
703 entries.into_iter().map(|(path, _)| path).collect()
704 }
705
706 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
707 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
708 return self.active_file_ids();
709 }
710
711 let mut and_trigrams = query.and_trigrams.clone();
712 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
713
714 let mut current: Option<Vec<u32>> = None;
715
716 for trigram in and_trigrams {
717 let filter = query.and_filters.get(&trigram).copied();
718 let matches = self.postings_for_trigram(trigram, filter);
719 current = Some(match current.take() {
720 Some(existing) => intersect_sorted_ids(&existing, &matches),
721 None => matches,
722 });
723
724 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
725 break;
726 }
727 }
728
729 let mut current = current.unwrap_or_else(|| self.active_file_ids());
730
731 for (index, group) in query.or_groups.iter().enumerate() {
732 let mut group_matches = Vec::new();
733 let filters = query.or_filters.get(index);
734
735 for trigram in group {
736 let filter = filters.and_then(|filters| filters.get(trigram).copied());
737 let matches = self.postings_for_trigram(*trigram, filter);
738 if group_matches.is_empty() {
739 group_matches = matches;
740 } else {
741 group_matches = union_sorted_ids(&group_matches, &matches);
742 }
743 }
744
745 current = intersect_sorted_ids(¤t, &group_matches);
746 if current.is_empty() {
747 break;
748 }
749 }
750
751 let mut unindexed = self
752 .unindexed_files
753 .iter()
754 .copied()
755 .filter(|file_id| self.is_active_file(*file_id))
756 .collect::<Vec<_>>();
757 if !unindexed.is_empty() {
758 unindexed.sort_unstable();
759 current = union_sorted_ids(¤t, &unindexed);
760 }
761
762 current
763 }
764
    /// Serializes the index to `cache.bin` inside `cache_dir`.
    ///
    /// Best effort: the function returns nothing and every failure is swallowed. The
    /// data is written to a uniquely named temporary file in the same directory,
    /// synced, and then renamed over `cache.bin`; on any error the temporary file is
    /// removed and an existing `cache.bin` is left untouched.
    ///
    /// File layout, in write order:
    /// * header: `CACHE_MAGIC`, `INDEX_VERSION`, byte length of the postings section;
    /// * postings section: `INDEX_MAGIC`, version, lengths of git head / project root /
    ///   ignore fingerprint, `max_file_size`, file count, those three strings, one
    ///   record per active file, the postings blob length and blob, then a CRC32 of
    ///   everything before it in the section;
    /// * lookup section: `LOOKUP_MAGIC`, version, entry count, one
    ///   `(trigram, blob offset, posting count)` entry per trigram, then a CRC32.
    ///
    /// Only active files are written, renumbered densely in `active_file_ids` order.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let cache_path = cache_dir.join("cache.bin");
        // Process id plus a nanosecond timestamp keep the temporary name unique.
        let tmp_cache = cache_dir.join(format!(
            "cache.bin.tmp.{}.{}",
            std::process::id(),
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap_or(Duration::ZERO)
                .as_nanos()
        ));

        // In-memory file id -> dense id used inside the cache.
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            // Both sections are assembled in memory so they can be checksummed
            // before anything touches the disk.
            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));

            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            // Fall back to computing the fingerprint when the index carries none.
            let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
                ignore_rules_fingerprint(&self.project_root)
            } else {
                self.ignore_rules_fingerprint.clone()
            };
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
                .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u32(&mut postings_writer, ignore_fingerprint_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;
            postings_writer.write_all(ignore_fingerprint.as_bytes())?;

            // One record per active file; paths are stored relative to the project
            // root, and a path outside the root aborts the write.
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                let path =
                    cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
                        std::io::Error::other(format!(
                            "refusing to cache path outside project root: {}",
                            file.path.display()
                        ))
                    })?;
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                // An mtime before the epoch is stored as zero.
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(file.content_hash.as_bytes())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Trigrams are written in ascending order so the output is deterministic.
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Postings of files that are not active have no dense id; skip them.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                // A trigram left with no postings gets no lookup entry.
                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            let mut postings_blob_file = postings_writer
                .into_inner()
                .map_err(|error| std::io::Error::other(error.to_string()))?
                .into_inner();
            // CRC32 over the whole section, appended as four little-endian bytes.
            let checksum = crc32fast::hash(&postings_blob_file);
            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());

            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            let mut lookup_blob_file = lookup_writer
                .into_inner()
                .map_err(|error| std::io::Error::other(error.to_string()))?
                .into_inner();
            let checksum = crc32fast::hash(&lookup_blob_file);
            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());

            // Header, postings section, then the lookup section up to end of file.
            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
            write_u32(&mut cache_writer, CACHE_MAGIC)?;
            write_u32(&mut cache_writer, INDEX_VERSION)?;
            write_u64(
                &mut cache_writer,
                u64::try_from(postings_blob_file.len())
                    .map_err(|_| std::io::Error::other("postings section too large"))?,
            )?;
            cache_writer.write_all(&postings_blob_file)?;
            cache_writer.write_all(&lookup_blob_file)?;
            cache_writer.flush()?;
            // Sync the data to disk before the rename makes the new cache visible.
            cache_writer.get_ref().sync_all()?;
            drop(cache_writer);
            fs::rename(&tmp_cache, &cache_path)?;

            Ok(())
        })();

        if write_result.is_err() {
            let _ = fs::remove_file(&tmp_cache);
        }
    }
937
938 pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
939 debug_assert!(current_canonical_root.is_absolute());
940 let cache_path = cache_dir.join("cache.bin");
941 let cache_bytes = fs::read(&cache_path).ok()?;
942 if cache_bytes.len() < 16 {
943 return None;
944 }
945 let mut header = Cursor::new(&cache_bytes);
946 if read_u32(&mut header).ok()? != CACHE_MAGIC {
947 return None;
948 }
949 if read_u32(&mut header).ok()? != INDEX_VERSION {
950 return None;
951 }
952 let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
953 let start = usize::try_from(header.position()).ok()?;
954 let postings_end = start.checked_add(postings_len_total)?;
955 if postings_end > cache_bytes.len() {
956 return None;
957 }
958 let postings_bytes = &cache_bytes[start..postings_end];
959 let lookup_bytes = &cache_bytes[postings_end..];
960 let lookup_len_total = lookup_bytes.len();
961 let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
962 let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
963 if postings_len_total < 4 || lookup_len_total < 4 {
964 return None;
965 }
966 verify_crc32_bytes_slice(postings_bytes).ok()?;
967 verify_crc32_bytes_slice(lookup_bytes).ok()?;
968
969 let mut magic = [0u8; 8];
970 postings_reader.read_exact(&mut magic).ok()?;
971 if &magic != INDEX_MAGIC {
972 return None;
973 }
974 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
975 return None;
976 }
977
978 let head_len = read_u32(&mut postings_reader).ok()? as usize;
979 let root_len = read_u32(&mut postings_reader).ok()? as usize;
980 let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
981 let max_file_size = read_u64(&mut postings_reader).ok()?;
982 let file_count = read_u32(&mut postings_reader).ok()? as usize;
983 if file_count > MAX_ENTRIES {
984 return None;
985 }
986 let postings_body_len = postings_len_total.checked_sub(4)?;
987 let lookup_body_len = lookup_len_total.checked_sub(4)?;
988
989 let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
990 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
991 if minimum_file_bytes > remaining_postings {
992 return None;
993 }
994
995 if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
996 return None;
997 }
998 let mut head_bytes = vec![0u8; head_len];
999 postings_reader.read_exact(&mut head_bytes).ok()?;
1000 let git_head = String::from_utf8(head_bytes)
1001 .ok()
1002 .filter(|head| !head.is_empty());
1003
1004 if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1005 return None;
1006 }
1007 let mut root_bytes = vec![0u8; root_len];
1008 postings_reader.read_exact(&mut root_bytes).ok()?;
1009 let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
1010 let project_root = current_canonical_root.to_path_buf();
1011
1012 if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1013 return None;
1014 }
1015 let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
1016 postings_reader
1017 .read_exact(&mut ignore_fingerprint_bytes)
1018 .ok()?;
1019 let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
1020 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
1021 if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1022 return None;
1023 }
1024
1025 let mut files = Vec::with_capacity(file_count);
1026 let mut path_to_id = HashMap::new();
1027 let mut unindexed_files = HashSet::new();
1028
1029 for file_id in 0..file_count {
1030 let mut unindexed = [0u8; 1];
1031 postings_reader.read_exact(&mut unindexed).ok()?;
1032 let path_len = read_u32(&mut postings_reader).ok()? as usize;
1033 let size = read_u64(&mut postings_reader).ok()?;
1034 let secs = read_u64(&mut postings_reader).ok()?;
1035 let nanos = read_u32(&mut postings_reader).ok()?;
1036 let mut hash_bytes = [0u8; 32];
1037 postings_reader.read_exact(&mut hash_bytes).ok()?;
1038 let content_hash = blake3::Hash::from_bytes(hash_bytes);
1039 if nanos >= 1_000_000_000 {
1040 return None;
1041 }
1042 if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1043 return None;
1044 }
1045 let mut path_bytes = vec![0u8; path_len];
1046 postings_reader.read_exact(&mut path_bytes).ok()?;
1047 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
1048 let full_path = cached_path_under_root(&project_root, &relative_path)?;
1049 let file_id_u32 = u32::try_from(file_id).ok()?;
1050
1051 files.push(FileEntry {
1052 path: full_path.clone(),
1053 size,
1054 modified: UNIX_EPOCH + Duration::new(secs, nanos),
1055 content_hash,
1056 });
1057 path_to_id.insert(full_path, file_id_u32);
1058 if unindexed[0] == 1 {
1059 unindexed_files.insert(file_id_u32);
1060 }
1061 }
1062
1063 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
1064 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
1065 if postings_len > max_postings_bytes {
1066 return None;
1067 }
1068 if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1069 return None;
1070 }
1071 let mut postings_blob = vec![0u8; postings_len];
1072 postings_reader.read_exact(&mut postings_blob).ok()?;
1073
1074 let mut lookup_magic = [0u8; 8];
1075 lookup_reader.read_exact(&mut lookup_magic).ok()?;
1076 if &lookup_magic != LOOKUP_MAGIC {
1077 return None;
1078 }
1079 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
1080 return None;
1081 }
1082 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
1083 if entry_count > MAX_ENTRIES {
1084 return None;
1085 }
1086 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
1087 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
1088 if minimum_lookup_bytes > remaining_lookup {
1089 return None;
1090 }
1091
1092 let mut postings = HashMap::new();
1093 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1094
1095 for _ in 0..entry_count {
1096 let trigram = read_u32(&mut lookup_reader).ok()?;
1097 let offset = read_u64(&mut lookup_reader).ok()? as usize;
1098 let count = read_u32(&mut lookup_reader).ok()? as usize;
1099 if count > MAX_ENTRIES {
1100 return None;
1101 }
1102 let bytes_len = count.checked_mul(POSTING_BYTES)?;
1103 let end = offset.checked_add(bytes_len)?;
1104 if end > postings_blob.len() {
1105 return None;
1106 }
1107
1108 let mut trigram_postings = Vec::with_capacity(count);
1109 for chunk in postings_blob[offset..end].chunks_exact(6) {
1110 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1111 let posting = Posting {
1112 file_id,
1113 next_mask: chunk[4],
1114 loc_mask: chunk[5],
1115 };
1116 trigram_postings.push(posting.clone());
1117 file_trigrams.entry(file_id).or_default().push(trigram);
1118 }
1119 postings.insert(trigram, trigram_postings);
1120 }
1121
1122 Some(SearchIndex {
1123 postings,
1124 files,
1125 path_to_id,
1126 ready: false,
1127 project_root,
1128 git_head,
1129 max_file_size,
1130 ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1131 file_trigrams,
1132 unindexed_files,
1133 })
1134 }
1135
1136 pub fn stored_git_head(&self) -> Option<&str> {
1137 self.git_head.as_deref()
1138 }
1139
/// Overwrites the index's `ready` flag with `ready`.
pub(crate) fn set_ready(&mut self, ready: bool) {
    self.ready = ready;
}
1143
/// Records `current_head` as the index's git HEAD, reconciles the index with the
/// files currently on disk via `verify_file_mtimes`, and then marks the index ready.
pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
    self.git_head = current_head;
    verify_file_mtimes(self);
    // Only flip the flag once the reconciliation above has finished.
    self.ready = true;
}
1149
/// Public forwarder to `verify_against_disk`, compiled only when debug assertions
/// are enabled and hidden from generated documentation.
#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
    self.verify_against_disk(current_head);
}
1155
1156 pub(crate) fn rebuild_or_refresh(
1157 root: &Path,
1158 max_file_size: u64,
1159 current_head: Option<String>,
1160 baseline: Option<SearchIndex>,
1161 ) -> Self {
1162 if let Some(mut baseline) = baseline {
1163 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1164 baseline.max_file_size = max_file_size;
1165 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1166 if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1167 return SearchIndex::build_with_limit(root, max_file_size);
1168 }
1169 baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1170
1171 if baseline.git_head == current_head || current_head.is_none() {
1172 baseline.git_head = current_head;
1179 verify_file_mtimes(&mut baseline);
1180 baseline.ready = true;
1181 return baseline;
1182 }
1183
1184 if let (Some(previous), Some(current)) =
1185 (baseline.git_head.clone(), current_head.clone())
1186 {
1187 let project_root = baseline.project_root.clone();
1188 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1189 baseline.git_head = Some(current);
1190 verify_file_mtimes(&mut baseline);
1191 baseline.ready = true;
1192 return baseline;
1193 }
1194 }
1195 }
1196
1197 SearchIndex::build_with_limit(root, max_file_size)
1198 }
1199
1200 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1201 let file_id = u32::try_from(self.files.len()).ok()?;
1202 let metadata = fs::metadata(path).ok();
1203 let size = metadata
1204 .as_ref()
1205 .map_or(size_hint, |metadata| metadata.len());
1206 let modified = metadata
1207 .and_then(|metadata| metadata.modified().ok())
1208 .unwrap_or(UNIX_EPOCH);
1209
1210 self.files.push(FileEntry {
1211 path: path.to_path_buf(),
1212 size,
1213 modified,
1214 content_hash: cache_freshness::zero_hash(),
1215 });
1216 self.path_to_id.insert(path.to_path_buf(), file_id);
1217 Some(file_id)
1218 }
1219
1220 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1221 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1222 return;
1223 };
1224 self.unindexed_files.insert(file_id);
1225 self.file_trigrams.insert(file_id, Vec::new());
1226 }
1227
1228 fn active_file_ids(&self) -> Vec<u32> {
1229 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1230 ids.sort_unstable();
1231 ids
1232 }
1233
1234 fn is_active_file(&self, file_id: u32) -> bool {
1235 self.files
1236 .get(file_id as usize)
1237 .map(|file| !file.path.as_os_str().is_empty())
1238 .unwrap_or(false)
1239 }
1240
1241 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1242 let Some(postings) = self.postings.get(&trigram) else {
1243 return Vec::new();
1244 };
1245
1246 let mut matches = Vec::with_capacity(postings.len());
1247
1248 for posting in postings {
1249 if let Some(filter) = filter {
1250 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1253 continue;
1254 }
1255 }
1260 if self.is_active_file(posting.file_id) {
1261 matches.push(posting.file_id);
1262 }
1263 }
1264
1265 matches
1266 }
1267}
1268
1269fn search_candidate_file(
1270 file: &FileEntry,
1271 matcher: &SearchMatcher,
1272 max_results: usize,
1273 stop_after: usize,
1274 total_matches: &AtomicUsize,
1275 files_searched: &AtomicUsize,
1276 files_with_matches: &AtomicUsize,
1277 truncated: &AtomicBool,
1278 engine_capped: &AtomicBool,
1279 stop_scan: Option<&Arc<AtomicBool>>,
1280) -> Vec<SharedGrepMatch> {
1281 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1282 engine_capped.store(true, Ordering::Relaxed);
1283 return Vec::new();
1284 }
1285
1286 let content = match read_indexed_file_bytes(&file.path) {
1287 Some(content) => content,
1288 None => return Vec::new(),
1289 };
1290 if is_binary_bytes(&content) {
1297 return Vec::new();
1298 }
1299 files_searched.fetch_add(1, Ordering::Relaxed);
1300
1301 let shared_path = Arc::new(file.path.clone());
1302 let mut matches = Vec::new();
1303 let mut line_starts = None;
1304 let mut seen_lines = HashSet::new();
1305 let mut matched_this_file = false;
1306
1307 match matcher {
1308 SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1309 let needle = &literal.needle;
1310 let finder = memchr::memmem::Finder::new(needle);
1311 let mut start = 0;
1312
1313 while let Some(position) = finder.find(&content[start..]) {
1314 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1315 engine_capped.store(true, Ordering::Relaxed);
1316 break;
1317 }
1318
1319 let offset = start + position;
1320 start = offset + 1;
1321
1322 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1323 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1324 if !seen_lines.insert(line) {
1325 continue;
1326 }
1327
1328 matched_this_file = true;
1329 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1330 if match_number > max_results {
1331 truncated.store(true, Ordering::Relaxed);
1332 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1333 break;
1334 }
1335
1336 let end = offset + needle.len();
1337 matches.push(SharedGrepMatch {
1338 file: shared_path.clone(),
1339 line,
1340 column,
1341 line_text,
1342 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1343 });
1344 }
1345 }
1346 SearchMatcher::Literal(literal) => {
1347 let needle = &literal.needle;
1348 let search_content = content.to_ascii_lowercase();
1349 let finder = memchr::memmem::Finder::new(needle);
1350 let mut start = 0;
1351
1352 while let Some(position) = finder.find(&search_content[start..]) {
1353 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1354 engine_capped.store(true, Ordering::Relaxed);
1355 break;
1356 }
1357
1358 let offset = start + position;
1359 start = offset + 1;
1360
1361 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1362 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1363 if !seen_lines.insert(line) {
1364 continue;
1365 }
1366
1367 matched_this_file = true;
1368 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1369 if match_number > max_results {
1370 truncated.store(true, Ordering::Relaxed);
1371 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1372 break;
1373 }
1374
1375 let end = offset + needle.len();
1376 matches.push(SharedGrepMatch {
1377 file: shared_path.clone(),
1378 line,
1379 column,
1380 line_text,
1381 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1382 });
1383 }
1384 }
1385 SearchMatcher::Regex(regex) => {
1386 for matched in regex.find_iter(&content) {
1387 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1388 engine_capped.store(true, Ordering::Relaxed);
1389 break;
1390 }
1391
1392 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1393 let (line, column, line_text) =
1394 line_details_bytes(&content, line_starts, matched.start());
1395 if !seen_lines.insert(line) {
1396 continue;
1397 }
1398
1399 matched_this_file = true;
1400 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1401 if match_number > max_results {
1402 truncated.store(true, Ordering::Relaxed);
1403 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1404 break;
1405 }
1406
1407 matches.push(SharedGrepMatch {
1408 file: shared_path.clone(),
1409 line,
1410 column,
1411 line_text,
1412 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1413 });
1414 }
1415 }
1416 }
1417
1418 if matched_this_file {
1419 files_with_matches.fetch_add(1, Ordering::Relaxed);
1420 }
1421
1422 matches
1423}
1424
/// Returns `true` when the search has been truncated and at least `stop_after`
/// matches have been counted.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1432
1433fn grep_scan_should_stop(
1434 stop_scan: Option<&Arc<AtomicBool>>,
1435 truncated: &AtomicBool,
1436 total_matches: &AtomicUsize,
1437 stop_after: usize,
1438) -> bool {
1439 stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1440 || should_stop_search(truncated, total_matches, stop_after)
1441}
1442
/// Sets the `stop_scan` flag, when one is provided, if `total_matches` has reached
/// `stop_after`. Does nothing without a flag.
fn signal_grep_scan_cap(
    stop_scan: Option<&Arc<AtomicBool>>,
    total_matches: &AtomicUsize,
    stop_after: usize,
) {
    let Some(flag) = stop_scan else {
        return;
    };
    let cap_reached = total_matches.load(Ordering::Relaxed) >= stop_after;
    if cap_reached {
        flag.store(true, Ordering::Relaxed);
    }
}
1454
/// Picks the worker count for index builds: half of the available parallelism,
/// rounded up and limited to the range 1..=8. Falls back to a parallelism of one when
/// it cannot be queried.
fn search_index_build_pool_size() -> usize {
    let available = match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get(),
        Err(_) => 1,
    };
    available.div_ceil(2).clamp(1, 8)
}
1463
/// Merges two ascending id slices and returns the ids present in both.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let (mut i, mut j) = (0, 0);

    // Advance whichever side holds the smaller id; emit when both agree.
    while let (Some(&a), Some(&b)) = (left.get(i), right.get(j)) {
        if a < b {
            i += 1;
        } else if a > b {
            j += 1;
        } else {
            common.push(a);
            i += 1;
            j += 1;
        }
    }

    common
}
1483
/// Merges two ascending id slices into one ascending list; an id that both sides
/// reach at the same time is emitted once.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut combined = Vec::with_capacity(left.len() + right.len());
    let (mut i, mut j) = (0, 0);

    while let (Some(&a), Some(&b)) = (left.get(i), right.get(j)) {
        if a < b {
            combined.push(a);
            i += 1;
        } else if a > b {
            combined.push(b);
            j += 1;
        } else {
            combined.push(a);
            i += 1;
            j += 1;
        }
    }

    // At most one of these tails is non-empty.
    combined.extend_from_slice(&left[i..]);
    combined.extend_from_slice(&right[j..]);
    combined
}
1511
1512pub fn decompose_regex(pattern: &str) -> RegexQuery {
1513 let hir = match regex_syntax::parse(pattern) {
1514 Ok(hir) => hir,
1515 Err(_) => return RegexQuery::default(),
1516 };
1517
1518 let build = build_query(&hir);
1519 build.into_query()
1520}
1521
/// Packs three bytes into the low 24 bits of a `u32`, with `a` as the most
/// significant of the three and `c` as the least.
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1525
/// Normalizes a byte for trigram purposes by ASCII-lowercasing it; bytes that are
/// not ASCII uppercase letters are returned unchanged.
pub fn normalize_char(c: u8) -> u8 {
    c.to_ascii_lowercase()
}
1529
1530fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
1531 if content.len() < 3 {
1532 return;
1533 }
1534
1535 for start in 0..=content.len() - 3 {
1536 let trigram = pack_trigram(
1537 normalize_char(content[start]),
1538 normalize_char(content[start + 1]),
1539 normalize_char(content[start + 2]),
1540 );
1541 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1542 visit(trigram, next_char, start);
1543 }
1544}
1545
1546pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1547 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1548 scan_trigrams(content, |trigram, next_char, position| {
1549 trigrams.push((trigram, next_char, position));
1550 });
1551 trigrams
1552}
1553
1554fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
1555 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1556 scan_trigrams(content, |trigram, next_char, position| {
1557 let entry = filters.entry(trigram).or_default();
1558 if include_eof_next_char || next_char != EOF_SENTINEL {
1559 entry.next_mask |= mask_for_next_char(next_char);
1560 }
1561 entry.loc_mask |= mask_for_position(position);
1562 });
1563 filters
1564}
1565
1566pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1567 let mut seen = HashSet::new();
1568 let mut out = Vec::new();
1569 for token in tokens {
1570 scan_trigrams(token.as_bytes(), |trigram, _, _| {
1571 if seen.insert(trigram) {
1572 out.push(trigram);
1573 }
1574 });
1575 }
1576 out
1577}
1578
1579pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1580 if query_trigrams.is_empty() {
1581 return 0.0;
1582 }
1583
1584 let mut hits = 0u32;
1585 for &trigram in query_trigrams {
1586 if let Some(postings) = index.postings.get(&trigram) {
1587 if postings
1588 .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1589 .is_ok()
1590 {
1591 hits += 1;
1592 }
1593 }
1594 }
1595
1596 if hits == 0 {
1597 return 0.0;
1598 }
1599
1600 let file_trigram_count = index
1601 .file_trigrams
1602 .get(&file_id)
1603 .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1604 (hits as f32) / (1.0 + file_trigram_count.ln())
1605}
1606
1607pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1608 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1610 return PathBuf::from(override_dir)
1611 .join("index")
1612 .join(project_cache_key(project_root));
1613 }
1614 if let Some(dir) = storage_dir {
1616 return dir.join("index").join(project_cache_key(project_root));
1617 }
1618 let home = std::env::var_os("HOME")
1623 .or_else(|| std::env::var_os("USERPROFILE"))
1624 .map(PathBuf::from)
1625 .unwrap_or_else(std::env::temp_dir);
1626 home.join(".cache")
1627 .join("aft")
1628 .join("index")
1629 .join(project_cache_key(project_root))
1630}
1631
1632pub(crate) fn build_path_filters(
1633 include: &[String],
1634 exclude: &[String],
1635) -> Result<PathFilters, String> {
1636 Ok(PathFilters {
1637 includes: build_globset(include)?,
1638 excludes: build_globset(exclude)?,
1639 })
1640}
1641
/// Walks `root` and returns the project files accepted by `filters`, using `root`
/// both as the directory to search and as the base for filter matching.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
1645
/// Walks `root` with default (empty) path filters, giving up once more than
/// `max_files` files have been collected.
///
/// # Errors
/// Returns `Err(count)` with the number of files collected at the moment the limit
/// was exceeded.
pub fn walk_project_files_bounded_default(
    root: &Path,
    max_files: usize,
) -> Result<Vec<PathBuf>, usize> {
    // The trailing `true` asks for the result to be sorted by modification time.
    walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
}
1652
/// Walks `root` and returns the files accepted by both `filters` and the
/// `matches_file` predicate, giving up once more than `max_files` were collected.
///
/// # Errors
/// Returns `Err(count)` with the number of files collected at the moment the limit
/// was exceeded.
pub(crate) fn walk_project_files_bounded_matching<F>(
    root: &Path,
    filters: &PathFilters,
    max_files: usize,
    matches_file: F,
) -> Result<Vec<PathBuf>, usize>
where
    F: Fn(&Path) -> bool,
{
    // `root` doubles as filter base and search root; `true` requests mtime ordering.
    walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
}
1664
1665pub fn walk_project_files_bounded_default_matching<F>(
1666 root: &Path,
1667 max_files: usize,
1668 matches_file: F,
1669) -> Result<Vec<PathBuf>, usize>
1670where
1671 F: Fn(&Path) -> bool,
1672{
1673 walk_project_files_from_inner_matching(
1674 root,
1675 root,
1676 &PathFilters::default(),
1677 Some(max_files),
1678 matches_file,
1679 true,
1680 )
1681}
1682
/// Walks `search_root` without a file limit and returns the files accepted by
/// `filters`, matching filter patterns relative to `filter_root`.
///
/// # Panics
/// Only if the underlying walk reports a limit error, which cannot happen because no
/// limit is passed.
pub(crate) fn walk_project_files_from(
    filter_root: &Path,
    search_root: &Path,
    filters: &PathFilters,
) -> Vec<PathBuf> {
    walk_project_files_from_inner(filter_root, search_root, filters, None, true)
        .expect("unbounded project walk cannot exceed a file limit")
}
1691
1692pub(crate) fn for_each_walk_project_file_from<F>(
1693 filter_root: &Path,
1694 search_root: &Path,
1695 filters: &PathFilters,
1696 mut on_file: F,
1697) where
1698 F: FnMut(&PathBuf),
1699{
1700 let builder = project_walk_builder(search_root);
1701 for entry in builder.build().filter_map(|entry| entry.ok()) {
1702 if !entry
1703 .file_type()
1704 .map_or(false, |file_type| file_type.is_file())
1705 {
1706 continue;
1707 }
1708 let path = entry.into_path();
1709 if filters.matches(filter_root, &path) {
1710 on_file(&path);
1711 }
1712 }
1713}
1714
/// Reports whether at least one file under `search_root` passes `filters`.
pub(crate) fn has_any_project_file_from(
    filter_root: &Path,
    search_root: &Path,
    filters: &PathFilters,
) -> bool {
    // With a limit of zero the walk returns `Err` as soon as it collects its first
    // file, so an error here means "a matching file exists".
    walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
}
1722
/// Forwards to `walk_project_files_from_inner_matching` with a predicate that
/// accepts every file, so only `filters` and `max_files` restrict the result.
fn walk_project_files_from_inner(
    filter_root: &Path,
    search_root: &Path,
    filters: &PathFilters,
    max_files: Option<usize>,
    sort_by_mtime: bool,
) -> Result<Vec<PathBuf>, usize> {
    walk_project_files_from_inner_matching(
        filter_root,
        search_root,
        filters,
        max_files,
        |_| true,
        sort_by_mtime,
    )
}
1739
1740fn project_walk_builder(search_root: &Path) -> WalkBuilder {
1741 let mut builder = WalkBuilder::new(search_root);
1742 builder
1743 .hidden(false)
1744 .git_ignore(true)
1745 .git_global(true)
1746 .git_exclude(true)
1747 .add_custom_ignore_filename(".aftignore")
1748 .filter_entry(|entry| {
1749 let name = entry.file_name().to_string_lossy();
1750 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1751 return !matches!(
1752 name.as_ref(),
1753 "node_modules"
1754 | "target"
1755 | "venv"
1756 | ".venv"
1757 | ".git"
1758 | "__pycache__"
1759 | ".tox"
1760 | "dist"
1761 | "build"
1762 );
1763 }
1764 true
1765 });
1766 builder
1767}
1768
1769fn walk_project_files_from_inner_matching<F>(
1770 filter_root: &Path,
1771 search_root: &Path,
1772 filters: &PathFilters,
1773 max_files: Option<usize>,
1774 matches_file: F,
1775 sort_by_mtime: bool,
1776) -> Result<Vec<PathBuf>, usize>
1777where
1778 F: Fn(&Path) -> bool,
1779{
1780 let builder = project_walk_builder(search_root);
1781
1782 let mut files = Vec::new();
1783 for entry in builder.build().filter_map(|entry| entry.ok()) {
1784 if !entry
1785 .file_type()
1786 .map_or(false, |file_type| file_type.is_file())
1787 {
1788 continue;
1789 }
1790 let path = entry.into_path();
1791 if filters.matches(filter_root, &path) && matches_file(&path) {
1792 files.push(path);
1793 if max_files.is_some_and(|limit| files.len() > limit) {
1794 return Err(files.len());
1795 }
1796 }
1797 }
1798
1799 if sort_by_mtime {
1800 sort_paths_by_mtime_desc(&mut files);
1801 }
1802 Ok(files)
1803}
1804
1805pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1806 let bytes = fs::read(path).ok()?;
1807 if is_binary_bytes(&bytes) {
1808 return None;
1809 }
1810 String::from_utf8(bytes).ok()
1811}
1812
/// Reads the whole file at `path`, returning `None` on any I/O error.
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    fs::read(path).ok()
}
1816
/// Returns `path` relative to `root`, or `path` unchanged when it does not start
/// with `root`.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1822
1823pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1824 let normalized_root = normalize_path(root);
1825 let normalized_path = normalize_path(path);
1826 let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1827 validate_cached_relative_path(relative)
1828}
1829
1830pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1831 let relative = validate_cached_relative_path(relative_path)?;
1832 let normalized_root = normalize_path(root);
1833 let full_path = normalize_path(&normalized_root.join(relative));
1834
1835 match fs::canonicalize(&full_path) {
1836 Ok(canonical_path) => {
1837 if canonical_path.starts_with(&normalized_root) {
1838 return Some(full_path);
1839 }
1840
1841 let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1842 canonical_path
1843 .starts_with(&canonical_root)
1844 .then_some(full_path)
1845 }
1846 Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1847 }
1848}
1849
/// Validates and cleans a relative path for cache use.
///
/// Returns `None` for absolute paths, for paths containing a `..`, root or prefix
/// component, and for paths that are empty once `.` components are removed. Otherwise
/// returns the path rebuilt from its normal components.
pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
    if path.is_absolute() {
        return None;
    }

    let mut cleaned = PathBuf::new();
    for part in path.components() {
        if let Component::Normal(segment) = part {
            cleaned.push(segment);
        } else if !matches!(part, Component::CurDir) {
            return None;
        }
    }

    if cleaned.as_os_str().is_empty() {
        None
    } else {
        Some(cleaned)
    }
}
1865
1866pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1879 use std::collections::HashMap;
1880 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1881 for path in paths.iter() {
1882 mtimes
1883 .entry(path.clone())
1884 .or_insert_with(|| path_modified_time(path));
1885 }
1886 paths.sort_by(|left, right| {
1887 let left_mtime = mtimes.get(left).and_then(|v| *v);
1888 let right_mtime = mtimes.get(right).and_then(|v| *v);
1889 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1890 });
1891}
1892
1893pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1896 use std::collections::HashMap;
1897 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1898 for m in matches.iter() {
1899 mtimes.entry(m.file.clone()).or_insert_with(|| {
1900 let resolved = resolve_match_path(project_root, &m.file);
1901 path_modified_time(&resolved)
1902 });
1903 }
1904 matches.sort_by(|left, right| {
1905 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1906 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1907 right_mtime
1908 .cmp(&left_mtime)
1909 .then_with(|| left.file.cmp(&right.file))
1910 .then_with(|| left.line.cmp(&right.line))
1911 .then_with(|| left.column.cmp(&right.column))
1912 });
1913}
1914
1915fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1920 matches: &mut [SharedGrepMatch],
1921 modified_for_path: F,
1922) where
1923 F: Fn(&Path) -> Option<SystemTime>,
1924{
1925 use std::collections::HashMap;
1926 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1927 for m in matches.iter() {
1928 let path = m.file.as_path().to_path_buf();
1929 mtimes
1930 .entry(path.clone())
1931 .or_insert_with(|| modified_for_path(&path));
1932 }
1933 matches.sort_by(|left, right| {
1934 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1935 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1936 right_mtime
1937 .cmp(&left_mtime)
1938 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1939 .then_with(|| left.line.cmp(&right.line))
1940 .then_with(|| left.column.cmp(&right.column))
1941 });
1942}
1943
1944pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1945 let resolved_project_root = canonicalize_or_normalize(project_root);
1946 let root = match path {
1947 Some(path) => {
1948 let path = PathBuf::from(path);
1949 if path.is_absolute() {
1950 canonicalize_or_normalize(&path)
1951 } else {
1952 normalize_path(&resolved_project_root.join(path))
1953 }
1954 }
1955 None => resolved_project_root.clone(),
1956 };
1957
1958 let use_index = is_within_search_root(&resolved_project_root, &root);
1959 SearchScope { root, use_index }
1960}
1961
/// Reports whether `content_inspector` classifies `content` as binary.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
1965
/// Returns the output of `git rev-parse HEAD` run for `root` via `run_git`, or `None`
/// when `run_git` yields nothing.
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-list", "--max-parents=0", "HEAD"][..0].iter().chain(["rev-parse", "HEAD"].iter()).copied().collect::<Vec<_>>().as_slice())
}
1969
1970pub fn project_cache_key(project_root: &Path) -> String {
1971 use sha2::{Digest, Sha256};
1972
1973 let mut hasher = Sha256::new();
1974
1975 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1976 hasher.update(root_commit.as_bytes());
1979 } else {
1980 let canonical_root = canonicalize_or_normalize(project_root);
1982 hasher.update(canonical_root.to_string_lossy().as_bytes());
1983 }
1984
1985 let digest = format!("{:x}", hasher.finalize());
1986 digest[..16].to_string()
1987}
1988
1989pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
1997 use sha2::{Digest, Sha256};
1998
1999 let root = canonicalize_or_normalize(project_root);
2000 let mut files = Vec::new();
2001 collect_ignore_rule_files(&root, &mut files);
2002 if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
2003 if global_ignore.is_file() {
2004 files.push(global_ignore);
2005 }
2006 }
2007 let info_exclude = git_info_exclude_path(&root);
2008 if info_exclude.is_file() {
2009 files.push(info_exclude);
2010 }
2011 files.sort();
2012 files.dedup();
2013
2014 let mut hasher = Sha256::new();
2015 hasher.update(b"aft-ignore-rules-v1\0");
2016 for path in files {
2017 if let Some(relative) = cache_relative_path(&root, &path) {
2018 hasher.update(relative.to_string_lossy().as_bytes());
2019 } else {
2020 hasher.update(path.to_string_lossy().as_bytes());
2021 }
2022 hasher.update(b"\0");
2023 match fs::read(&path) {
2024 Ok(bytes) => hasher.update(&bytes),
2025 Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
2026 }
2027 hasher.update(b"\0");
2028 }
2029
2030 format!("{:x}", hasher.finalize())
2031}
2032
2033fn git_info_exclude_path(root: &Path) -> PathBuf {
2034 run_git(
2035 root,
2036 &["rev-parse", "--path-format=absolute", "--git-common-dir"],
2037 )
2038 .map(PathBuf::from)
2039 .unwrap_or_else(|| root.join(".git"))
2040 .join("info")
2041 .join("exclude")
2042}
2043
2044fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
2045 let mut builder = WalkBuilder::new(root);
2046 builder
2047 .hidden(false)
2048 .git_ignore(true)
2049 .git_global(true)
2050 .git_exclude(true)
2051 .add_custom_ignore_filename(".aftignore")
2052 .filter_entry(|entry| {
2053 let name = entry.file_name().to_string_lossy();
2054 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2055 return !matches!(
2056 name.as_ref(),
2057 ".git"
2058 | "node_modules"
2059 | "target"
2060 | "venv"
2061 | ".venv"
2062 | "__pycache__"
2063 | ".tox"
2064 | "dist"
2065 | "build"
2066 );
2067 }
2068 true
2069 });
2070
2071 for entry in builder.build().filter_map(|entry| entry.ok()) {
2072 if !entry
2073 .file_type()
2074 .map_or(false, |file_type| file_type.is_file())
2075 {
2076 continue;
2077 }
2078 let file_name = entry.file_name();
2079 if file_name == ".gitignore" || file_name == ".aftignore" {
2080 files.push(entry.into_path());
2081 }
2082 }
2083}
2084
/// Test helper: counts the directories yielded by an ignore-aware walk of `root`
/// (hidden entries included, git ignore files and `.aftignore` honoured).
#[cfg(test)]
pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
    let mut builder = WalkBuilder::new(root);
    builder
        .hidden(false)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .add_custom_ignore_filename(".aftignore");

    builder
        .build()
        .filter_map(Result::ok)
        .filter(|entry| entry.file_type().is_some_and(|kind| kind.is_dir()))
        .count()
}
2103
/// Test helper: counts the directories visited by a plain depth-first traversal of
/// `root` that ignores no rule files.
///
/// The root is always counted. Symlinks are not followed, and directories named
/// `.git`, `node_modules`, `target`, `venv`, `.venv`, `__pycache__`, `.tox`, `dist` or
/// `build` are not entered. Unreadable directories are counted but contribute no
/// children.
#[cfg(test)]
pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
    const SKIPPED_DIRS: [&str; 9] = [
        ".git",
        "node_modules",
        "target",
        "venv",
        ".venv",
        "__pycache__",
        ".tox",
        "dist",
        "build",
    ];

    let mut pending = vec![root.to_path_buf()];
    let mut visited = 0usize;

    while let Some(current) = pending.pop() {
        visited += 1;
        let Ok(children) = fs::read_dir(&current) else {
            continue;
        };

        for child in children.flatten() {
            let name = child.file_name();
            if name == ".gitignore" || name == ".aftignore" {
                continue;
            }

            let is_real_dir = child
                .file_type()
                .is_ok_and(|kind| kind.is_dir() && !kind.is_symlink());
            if !is_real_dir {
                continue;
            }

            if SKIPPED_DIRS.contains(&name.to_str().unwrap_or("")) {
                continue;
            }
            pending.push(child.path());
        }
    }

    visited
}
2145
2146impl PathFilters {
2147 fn matches(&self, root: &Path, path: &Path) -> bool {
2148 let relative = to_glob_path(&relative_to_root(root, path));
2149 if self
2150 .includes
2151 .as_ref()
2152 .is_some_and(|includes| !includes.is_match(&relative))
2153 {
2154 return false;
2155 }
2156 if self
2157 .excludes
2158 .as_ref()
2159 .is_some_and(|excludes| excludes.is_match(&relative))
2160 {
2161 return false;
2162 }
2163 true
2164 }
2165}
2166
2167fn canonicalize_or_normalize(path: &Path) -> PathBuf {
2168 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
2169}
2170
/// Returns `path` itself when it is absolute, otherwise `path` joined onto
/// `project_root`.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if !path.is_absolute() {
        return project_root.join(path);
    }
    path.to_path_buf()
}
2178
/// Returns the modification time of `path`, or `None` when its metadata or mtime
/// cannot be read.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
2184
/// Normalizes `path` lexically, without touching the filesystem.
///
/// `.` components are dropped and each `..` removes the preceding normal component.
/// A `..` that has nothing to cancel is kept, so `"../.."` stays `"../.."` rather
/// than collapsing; directly after the root it is dropped, since the parent of the
/// root is the root. Symlinks are not resolved.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                // `PathBuf::pop` would also succeed on a trailing `..`, cancelling one
                // parent reference with another, so inspect the last component.
                let last = result.components().next_back();
                let cancels_normal = matches!(last, Some(Component::Normal(_)));
                let at_root = matches!(last, Some(Component::RootDir));

                if cancels_normal {
                    result.pop();
                } else if !at_root {
                    result.push(component);
                }
            }
            _ => result.push(component),
        }
    }
    result
}
2200
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// When the path itself cannot be canonicalized, its parent is canonicalized and the
/// file name re-attached. If that fails too, or the path has no parent or file name,
/// `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
            Ok(canonical_parent) => canonical_parent.join(file_name),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
2217
/// Reconciles `index` with the files currently present under its project root.
///
/// Steps, in order:
/// 1. Walk the project with default filters to get the current file set.
/// 2. Classify every non-empty index entry: entries missing from the walk are queued
///    for removal; the rest are checked with `cache_freshness::verify_file_strict`.
/// 3. Remove the missing entries, re-index the stale ones, and index files that the
///    walk found but the index does not know yet.
/// 4. Log how many stale files were refreshed, if any.
fn verify_file_mtimes(index: &mut SearchIndex) {
    let filters = PathFilters::default();
    let current_files = walk_project_files(&index.project_root, &filters);
    let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
    let mut stale_paths = Vec::new();
    let mut removed_paths = Vec::new();

    for entry in &mut index.files {
        // Entries with an empty path are skipped; `is_active_file` likewise treats
        // them as inactive.
        if entry.path.as_os_str().is_empty() {
            continue;
        }
        // No longer part of the walk (deleted, or now ignored): drop it later, once
        // the borrow of `index.files` has ended.
        if !current_file_set.contains(&entry.path) {
            removed_paths.push(entry.path.clone());
            continue;
        }
        let cached = FileFreshness {
            mtime: entry.modified,
            size: entry.size,
            content_hash: entry.content_hash,
        };
        match cache_freshness::verify_file_strict(&entry.path, &cached) {
            // Nothing to update for this verdict.
            FreshnessVerdict::HotFresh => {}
            // Only the recorded mtime/size are refreshed for this verdict; the entry
            // is not re-indexed.
            FreshnessVerdict::ContentFresh {
                new_mtime,
                new_size,
            } => {
                entry.modified = new_mtime;
                entry.size = new_size;
            }
            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
                stale_paths.push(entry.path.clone())
            }
        }
    }

    for path in &removed_paths {
        index.remove_file(path);
    }

    for path in &stale_paths {
        // Stale paths were all found in the walk above, so the `else` arm is only a
        // safeguard.
        if current_file_set.contains(path) {
            index.update_file(path);
        } else {
            index.remove_file(path);
        }
    }

    // Files present on disk but unknown to the index are added.
    for path in current_files {
        if !index.path_to_id.contains_key(&path) {
            index.update_file(&path);
        }
    }

    if !stale_paths.is_empty() {
        crate::slog_info!(
            "search index: refreshed {} stale file(s) from disk cache",
            stale_paths.len()
        );
    }
}
2284
2285fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2286 normalize_path(path).starts_with(normalize_path(search_root))
2287}
2288
2289impl QueryBuild {
2290 fn into_query(self) -> RegexQuery {
2291 let mut query = RegexQuery::default();
2292
2293 for run in self.and_runs {
2294 add_run_to_and_query(&mut query, &run);
2295 }
2296
2297 for group in self.or_groups {
2298 let mut trigrams = BTreeSet::new();
2299 let mut filters = HashMap::new();
2300 for run in group {
2301 for (trigram, filter) in trigram_filters(&run) {
2302 trigrams.insert(trigram);
2303 merge_filter(filters.entry(trigram).or_default(), filter);
2304 }
2305 }
2306 if !trigrams.is_empty() {
2307 query.or_groups.push(trigrams.into_iter().collect());
2308 query.or_filters.push(filters);
2309 }
2310 }
2311
2312 query
2313 }
2314}
2315
2316fn build_query(hir: &Hir) -> QueryBuild {
2317 match hir.kind() {
2318 HirKind::Literal(literal) => {
2319 if literal.0.len() >= 3 {
2320 QueryBuild {
2321 and_runs: vec![literal.0.to_vec()],
2322 or_groups: Vec::new(),
2323 }
2324 } else {
2325 QueryBuild::default()
2326 }
2327 }
2328 HirKind::Capture(capture) => build_query(&capture.sub),
2329 HirKind::Concat(parts) => {
2330 let mut build = QueryBuild::default();
2331 for part in parts {
2332 let part_build = build_query(part);
2333 build.and_runs.extend(part_build.and_runs);
2334 build.or_groups.extend(part_build.or_groups);
2335 }
2336 build
2337 }
2338 HirKind::Alternation(parts) => {
2339 let mut group = Vec::new();
2340 for part in parts {
2341 let Some(mut choices) = guaranteed_run_choices(part) else {
2342 return QueryBuild::default();
2343 };
2344 group.append(&mut choices);
2345 }
2346 if group.is_empty() {
2347 QueryBuild::default()
2348 } else {
2349 QueryBuild {
2350 and_runs: Vec::new(),
2351 or_groups: vec![group],
2352 }
2353 }
2354 }
2355 HirKind::Repetition(repetition) => {
2356 if repetition.min == 0 {
2357 QueryBuild::default()
2358 } else {
2359 build_query(&repetition.sub)
2360 }
2361 }
2362 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2363 }
2364}
2365
2366fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2367 match hir.kind() {
2368 HirKind::Literal(literal) => {
2369 if literal.0.len() >= 3 {
2370 Some(vec![literal.0.to_vec()])
2371 } else {
2372 None
2373 }
2374 }
2375 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2376 HirKind::Concat(parts) => {
2377 let mut runs = Vec::new();
2378 for part in parts {
2379 if let Some(mut part_runs) = guaranteed_run_choices(part) {
2380 runs.append(&mut part_runs);
2381 }
2382 }
2383 if runs.is_empty() {
2384 None
2385 } else {
2386 Some(runs)
2387 }
2388 }
2389 HirKind::Alternation(parts) => {
2390 let mut runs = Vec::new();
2391 for part in parts {
2392 let Some(mut part_runs) = guaranteed_run_choices(part) else {
2393 return None;
2394 };
2395 runs.append(&mut part_runs);
2396 }
2397 if runs.is_empty() {
2398 None
2399 } else {
2400 Some(runs)
2401 }
2402 }
2403 HirKind::Repetition(repetition) => {
2404 if repetition.min == 0 {
2405 None
2406 } else {
2407 guaranteed_run_choices(&repetition.sub)
2408 }
2409 }
2410 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2411 }
2412}
2413
2414fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2415 for (trigram, filter) in trigram_filters(run) {
2416 if !query.and_trigrams.contains(&trigram) {
2417 query.and_trigrams.push(trigram);
2418 }
2419 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2420 }
2421}
2422
2423fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2424 trigram_filter_map(run, false).into_iter().collect()
2425}
2426
2427fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2428 target.next_mask |= filter.next_mask;
2429 target.loc_mask |= filter.loc_mask;
2430}
2431
2432fn mask_for_next_char(next_char: u8) -> u8 {
2433 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2434 1u8 << bit
2435}
2436
/// Maps a position to a single-bit mask selected by the position modulo 8.
fn mask_for_position(position: usize) -> u8 {
    let bit = position & 7;
    1u8 << bit
}
2440
2441fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2442 if patterns.is_empty() {
2443 return Ok(None);
2444 }
2445
2446 let mut builder = GlobSetBuilder::new();
2447 for pattern in patterns {
2448 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2449 builder.add(glob);
2450 }
2451 builder.build().map(Some).map_err(|error| error.to_string())
2452}
2453
/// Reads exactly four bytes from `reader` and decodes them as a
/// little-endian `u32`.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut raw = [0u8; 4];
    match reader.read_exact(&mut raw) {
        Ok(()) => Ok(u32::from_le_bytes(raw)),
        Err(error) => Err(error),
    }
}
2459
/// Reads exactly eight bytes from `reader` and decodes them as a
/// little-endian `u64`.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut raw = [0u8; 8];
    match reader.read_exact(&mut raw) {
        Ok(()) => Ok(u64::from_le_bytes(raw)),
        Err(error) => Err(error),
    }
}
2465
/// Writes `value` to `writer` as four little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let encoded: [u8; 4] = value.to_le_bytes();
    writer.write_all(&encoded)
}
2469
/// Writes `value` to `writer` as eight little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let encoded: [u8; 8] = value.to_le_bytes();
    writer.write_all(&encoded)
}
2473
2474fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2475 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2476 return Err(std::io::Error::other("search index checksum missing"));
2477 };
2478 let expected = u32::from_le_bytes(*stored);
2479 let actual = crc32fast::hash(body);
2480 if actual != expected {
2481 return Err(std::io::Error::other("search index checksum mismatch"));
2482 }
2483 Ok(())
2484}
2485
/// Computes how many bytes lie between the reader's current position and
/// `total_len`.
///
/// Returns `None` when the position cannot be queried, does not fit into
/// `usize`, or already exceeds `total_len`.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = match reader.stream_position() {
        Ok(position) => position,
        Err(_) => return None,
    };
    match usize::try_from(position) {
        Ok(consumed) => total_len.checked_sub(consumed),
        Err(_) => None,
    }
}
2490
/// Runs `git -C <root> <args…>` and returns its trimmed standard output.
///
/// Yields `None` when the process cannot be started, exits unsuccessfully,
/// prints non-UTF-8 output, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let mut command = Command::new("git");
    command.arg("-C").arg(root).args(args);

    let output = command.output().ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        return None;
    }
    Some(trimmed.to_string())
}
2509
2510fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2511 let diff_range = format!("{}..{}", from, to);
2512 let output = match Command::new("git")
2513 .arg("-C")
2514 .arg(root)
2515 .args(["diff", "--name-status", "-M", &diff_range])
2516 .output()
2517 {
2518 Ok(output) => output,
2519 Err(_) => return false,
2520 };
2521
2522 if !output.status.success() {
2523 return false;
2524 }
2525
2526 let Ok(diff) = String::from_utf8(output.stdout) else {
2527 return false;
2528 };
2529
2530 for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2531 let mut fields = line.split('\t');
2532 let Some(status) = fields.next() else {
2533 continue;
2534 };
2535
2536 if status.starts_with('R') {
2537 let Some(old_path) = fields
2538 .next()
2539 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2540 else {
2541 continue;
2542 };
2543 let Some(new_path) = fields
2544 .next()
2545 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2546 else {
2547 continue;
2548 };
2549 index.remove_file(&old_path);
2550 index.update_file(&new_path);
2551 continue;
2552 }
2553
2554 let Some(path) = fields
2555 .next()
2556 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2557 else {
2558 continue;
2559 };
2560 if status.starts_with('D') || !path.exists() {
2561 index.remove_file(&path);
2562 } else {
2563 index.update_file(&path);
2564 }
2565 }
2566
2567 true
2568}
2569
2570fn is_binary_path(path: &Path, size: u64) -> bool {
2571 if size == 0 {
2572 return false;
2573 }
2574
2575 let mut file = match File::open(path) {
2576 Ok(file) => file,
2577 Err(_) => return true,
2578 };
2579
2580 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2581 match file.read(&mut preview) {
2582 Ok(read) => is_binary_bytes(&preview[..read]),
2583 Err(_) => true,
2584 }
2585}
2586
/// Lists the byte offsets at which lines begin: offset 0, followed by the
/// offset right after every `\n` in `content`.
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    let after_newlines = content
        .iter()
        .enumerate()
        .filter(|(_, byte)| **byte == b'\n')
        .map(|(index, _)| index + 1);
    std::iter::once(0usize).chain(after_newlines).collect()
}
2596
/// Describes the line containing byte `offset`.
///
/// Returns `(line, column, text)` where `line` is 1-based, `column` is the
/// 1-based count of characters in the lossily UTF-8-decoded bytes between
/// the line start and `offset`, and `text` is the lossily decoded line
/// without its trailing `\n` and without one trailing `\r`.
///
/// `line_starts` is searched with a binary search, so it is expected to be
/// sorted ascending. `offset` must lie within the line's bounds in
/// `content`; slicing panics otherwise.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // A miss reports the insertion point; the containing line is the one before.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    let rest = &content[line_start..];
    let line_len = rest
        .iter()
        .position(|byte| *byte == b'\n')
        .unwrap_or(rest.len());
    let raw_line = &rest[..line_len];
    let trimmed = if raw_line.ends_with(b"\r") {
        &raw_line[..raw_line.len() - 1]
    } else {
        raw_line
    };
    let line_text = String::from_utf8_lossy(trimmed).into_owned();

    let prefix = String::from_utf8_lossy(&content[line_start..offset]);
    let column = prefix.chars().count() as u32 + 1;

    (line_index as u32 + 1, column, line_text)
}
2619
/// Renders `path` as a (lossily decoded) string with every backslash
/// replaced by a forward slash, the form the glob matchers are fed.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
2623
2624#[cfg(test)]
2625mod tests {
2626 use std::process::Command;
2627
2628 use super::*;
2629
/// A relative path whose target does not exist yet is still accepted and
/// resolved beneath the canonical project root.
#[test]
fn cached_path_under_root_allows_missing_lexical_child() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project_dir = workspace.path().join("project");
    fs::create_dir_all(&project_dir).expect("create project dir");
    let root = fs::canonicalize(&project_dir).expect("canonicalize project");
    let missing_child = Path::new("future/file.rs");

    let resolved = cached_path_under_root(&root, missing_child)
        .expect("missing child should fall back to lexical validation");

    assert_eq!(resolved, root.join("future/file.rs"));
}
2642
/// A path that leaves the project through a symlinked directory must be
/// rejected even though it looks like a child of the root.
#[cfg(unix)]
#[test]
fn cached_path_under_root_rejects_symlink_escape() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project_dir = workspace.path().join("project");
    let outside_dir = workspace.path().join("outside");
    fs::create_dir_all(&project_dir).expect("create project dir");
    fs::create_dir_all(&outside_dir).expect("create outside dir");
    fs::write(outside_dir.join("secret.txt"), "secret").expect("write outside file");
    // `project/link` points at the sibling directory outside the project.
    std::os::unix::fs::symlink(&outside_dir, project_dir.join("link")).expect("create symlink");
    let root = fs::canonicalize(&project_dir).expect("canonicalize project");

    let escaped = cached_path_under_root(&root, Path::new("link/secret.txt"));

    assert!(escaped.is_none());
}
2657
/// `extract_trigrams` lower-cases its input and reports, per trigram, the
/// following byte (or `EOF_SENTINEL` at the end) and the trigram position.
#[test]
fn extract_trigrams_tracks_next_char_and_position() {
    let extracted = extract_trigrams(b"Rust");
    let first = (pack_trigram(b'r', b'u', b's'), b't', 0);
    let second = (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1);

    assert_eq!(extracted.len(), 2);
    assert_eq!(extracted[0], first);
    assert_eq!(extracted[1], second);
}
2668
/// Indexing a file must record, per trigram, exactly the masks obtained by
/// folding the output of `extract_trigrams` by hand.
#[test]
fn index_file_trigram_filters_match_legacy_extraction() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let sample_path = workspace.path().join("sample.txt");
    let content = b"Rust rust RUST\nxy";
    fs::write(&sample_path, content).expect("write sample");

    // Reference masks, keyed by trigram in ascending order.
    let mut expected = BTreeMap::new();
    for (trigram, next_char, position) in extract_trigrams(content) {
        let masks: &mut PostingFilter = expected.entry(trigram).or_default();
        masks.next_mask |= mask_for_next_char(next_char);
        masks.loc_mask |= mask_for_position(position);
    }

    let mut index = SearchIndex::new();
    index.project_root = workspace.path().to_path_buf();
    index.index_file(&sample_path, content);

    let file_id = *index.path_to_id.get(&sample_path).expect("file indexed");
    let indexed_trigrams = index.file_trigrams.get(&file_id).expect("file trigrams");
    let expected_trigrams = expected.keys().copied().collect::<Vec<_>>();
    assert_eq!(indexed_trigrams, &expected_trigrams);

    for (trigram, masks) in expected {
        let postings = index.postings.get(&trigram).expect("posting list");
        assert_eq!(postings.len(), 1);
        let posting = &postings[0];
        assert_eq!(posting.file_id, file_id);
        assert_eq!(posting.next_mask, masks.next_mask);
        assert_eq!(posting.loc_mask, masks.loc_mask);
    }
}
2698
/// Literals around an alternation end up in the AND trigrams, while the
/// alternation branches share a single OR group.
#[test]
fn decompose_regex_extracts_literals_and_alternations() {
    let query = decompose_regex("abc(def|ghi)xyz");

    for required in [
        pack_trigram(b'a', b'b', b'c'),
        pack_trigram(b'x', b'y', b'z'),
    ] {
        assert!(query.and_trigrams.contains(&required));
    }

    assert_eq!(query.or_groups.len(), 1);
    for alternative in [
        pack_trigram(b'd', b'e', b'f'),
        pack_trigram(b'g', b'h', b'i'),
    ] {
        assert!(query.or_groups[0].contains(&alternative));
    }
}
2708
/// Only files containing every AND trigram are returned as candidates.
#[test]
fn candidates_intersect_posting_lists() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let alpha = workspace.path().join("alpha.txt");
    let beta = workspace.path().join("beta.txt");
    fs::write(&alpha, "abcdef").expect("write alpha");
    fs::write(&beta, "abcxyz").expect("write beta");

    let mut index = SearchIndex::new();
    index.project_root = workspace.path().to_path_buf();
    index.index_file(&alpha, b"abcdef");
    index.index_file(&beta, b"abcxyz");

    // `abc` is in both files, `def` only in alpha.
    let shared = pack_trigram(b'a', b'b', b'c');
    let alpha_only = pack_trigram(b'd', b'e', b'f');
    let query = RegexQuery {
        and_trigrams: vec![shared, alpha_only],
        ..RegexQuery::default()
    };

    let candidates = index.candidates(&query);
    assert_eq!(candidates.len(), 1);
    let winner = candidates[0] as usize;
    assert_eq!(index.files[winner].path, alpha);
}
2733
/// A posting is only returned when the requested next-char/position masks
/// are compatible with what was recorded for the file.
#[test]
fn candidates_apply_bloom_filters() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let sample = workspace.path().join("sample.txt");
    fs::write(&sample, "abcd efgh").expect("write sample");

    let mut index = SearchIndex::new();
    index.project_root = workspace.path().to_path_buf();
    index.index_file(&sample, b"abcd efgh");

    let trigram = pack_trigram(b'a', b'b', b'c');
    // In the file, `abc` sits at position 0 and is followed by `d`.
    let compatible = PostingFilter {
        next_mask: mask_for_next_char(b'd'),
        loc_mask: mask_for_position(0),
    };
    let incompatible = PostingFilter {
        next_mask: mask_for_next_char(b'z'),
        loc_mask: mask_for_position(0),
    };

    let hits = index.postings_for_trigram(trigram, Some(compatible));
    assert_eq!(hits.len(), 1);

    let misses = index.postings_for_trigram(trigram, Some(incompatible));
    assert!(misses.is_empty());
}
2763
/// Writing an index to the cache directory and reading it back keeps the
/// stored git head, the file list and the postings.
#[test]
fn disk_round_trip_preserves_postings_and_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    let source_file = project.join("src.txt");
    fs::write(&source_file, "abcdef").expect("write source");

    let mut index = SearchIndex::build(&project);
    index.git_head = Some("deadbeef".to_string());
    let cache_dir = workspace.path().join("cache");
    index.write_to_disk(&cache_dir, index.git_head.as_deref());

    let loaded =
        SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");

    assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
    assert_eq!(loaded.files.len(), 1);
    let relative = relative_to_root(&loaded.project_root, &loaded.files[0].path);
    assert_eq!(relative, PathBuf::from("src.txt"));
    assert_eq!(loaded.postings.len(), index.postings.len());
    let abc = pack_trigram(b'a', b'b', b'c');
    assert!(loaded.postings.contains_key(&abc));
}
2790
/// The cache path helpers only accept paths that stay below the project
/// root, and they drop `.` components while resolving.
#[test]
fn cache_path_helpers_reject_absolute_and_parent_paths() {
    let root = PathBuf::from("/tmp/aft-project");

    let inside = cache_relative_path(&root, &root.join("src/lib.rs"));
    assert_eq!(inside, Some(PathBuf::from("src/lib.rs")));

    assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
    assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
    assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());

    let with_cur_dir = cached_path_under_root(&root, Path::new("src/./lib.rs"));
    assert_eq!(with_cur_dir, Some(root.join("src/lib.rs")));
}
2807
/// After HEAD moves (a committed rename), a refresh must drop the old path,
/// pick up the new one, and also notice uncommitted edits and untracked
/// files in the working tree.
#[test]
fn refresh_after_head_change_removes_renames_and_detects_local_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    let canonical_project = fs::canonicalize(&project).expect("canonical project");
    fs::write(project.join("old.txt"), "old token\n").expect("write old");
    fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");

    // Runs `git -C <project> <args…>`; only a failure to spawn is fatal.
    let git = |args: &[&str], what: &str| {
        Command::new("git")
            .arg("-C")
            .arg(&project)
            .args(args)
            .status()
            .expect(what);
    };

    Command::new("git")
        .arg("init")
        .arg(&project)
        .status()
        .expect("git init");
    for args in [
        ["config", "user.email", "aft@example.invalid"],
        ["config", "user.name", "AFT Test"],
    ] {
        git(&args, "git config");
    }
    git(&["add", "."], "git add initial");
    git(&["commit", "-m", "initial"], "git commit initial");
    let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
    let mut baseline = SearchIndex::build(&project);
    baseline.git_head = Some(previous.clone());

    // Commit a rename so that HEAD differs from the baseline.
    fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
    git(&["add", "-A"], "git add rename");
    git(&["commit", "-m", "rename"], "git commit rename");
    let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");

    // Working-tree changes that no commit knows about.
    fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
    fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");

    let refreshed = SearchIndex::rebuild_or_refresh(
        &project,
        DEFAULT_MAX_FILE_SIZE,
        Some(current),
        Some(baseline),
    );

    let indexed = |name: &str| {
        refreshed
            .path_to_id
            .contains_key(&canonical_project.join(name))
    };
    assert!(!indexed("old.txt"));
    assert!(indexed("new.txt"));
    assert!(indexed("untracked.txt"));
    let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
    assert_eq!(matches.matches.len(), 1);
}
2886
/// Flipping a byte in the middle of the cache file must make loading fail.
#[test]
fn read_from_disk_rejects_corrupt_postings_checksum() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("src.txt"), "abcdef").expect("write source");

    let cache_dir = workspace.path().join("cache");
    let index = SearchIndex::build(&project);
    index.write_to_disk(&cache_dir, None);

    let cache_path = cache_dir.join("cache.bin");
    let mut corrupted = fs::read(&cache_path).expect("read cache");
    let midpoint = corrupted.len() / 2;
    corrupted[midpoint] ^= 0xff;
    fs::write(&cache_path, corrupted).expect("write corrupted cache");

    let reloaded = SearchIndex::read_from_disk(&cache_dir, &project);
    assert!(reloaded.is_none());
}
2906
/// After a successful write the cache directory contains `cache.bin` and no
/// leftover entry whose name contains `.tmp.`.
#[test]
fn write_to_disk_uses_temp_files_and_cleans_them_up() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("src.txt"), "abcdef").expect("write source");

    let cache_dir = workspace.path().join("cache");
    let index = SearchIndex::build(&project);
    index.write_to_disk(&cache_dir, None);

    assert!(cache_dir.join("cache.bin").is_file());

    let leftover_temp = fs::read_dir(&cache_dir)
        .expect("read cache dir")
        .any(|entry| {
            entry
                .expect("cache entry")
                .file_name()
                .to_string_lossy()
                .contains(".tmp.")
        });
    assert!(!leftover_temp);
}
2927
/// Two threads that each take the cache lock, build the index and write it
/// must leave a cache that can still be loaded.
#[test]
fn concurrent_search_index_writes_do_not_corrupt() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
    let cache_dir = workspace.path().join("cache");

    // Spawns one writer; `lock_failure` is the panic message used when the
    // cache lock cannot be acquired.
    let spawn_writer = |lock_failure: &'static str| {
        let writer_project = project.clone();
        let writer_cache = cache_dir.clone();
        std::thread::spawn(move || {
            let _lock = CacheLock::acquire(&writer_cache).expect(lock_failure);
            let index = SearchIndex::build(&writer_project);
            index.write_to_disk(&writer_cache, None);
        })
    };

    let writer_a = spawn_writer("acquire cache lock a");
    let writer_b = spawn_writer("acquire cache lock b");
    writer_a.join().expect("writer a");
    writer_b.join().expect("writer b");

    let reloaded = SearchIndex::read_from_disk(&cache_dir, &project);
    assert!(reloaded.is_some());
}
2955
/// A stray temp file without a finished `cache.bin` must not be mistaken
/// for a loadable cache.
#[test]
fn search_index_atomic_rename_survives_partial_write() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let cache_dir = workspace.path().join("cache");
    fs::create_dir_all(&cache_dir).expect("create cache dir");
    let partial = cache_dir.join("cache.bin.tmp.1.1");
    fs::write(partial, b"partial").expect("write partial tmp");

    let loaded = SearchIndex::read_from_disk(&cache_dir, workspace.path());

    assert!(loaded.is_none());
}
2965
/// Two checkouts of the same repository must map to different cache keys:
/// as the test name states, the key is expected to include the checkout
/// path, so a clone living elsewhere must not share the source's key.
#[test]
fn project_cache_key_includes_checkout_path() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let source = dir.path().join("source");
    fs::create_dir_all(&source).expect("create source repo dir");
    fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

    assert!(Command::new("git")
        .current_dir(&source)
        .args(["init"])
        .status()
        .expect("init git repo")
        .success());
    assert!(Command::new("git")
        .current_dir(&source)
        .args(["add", "."])
        .status()
        .expect("git add")
        .success());
    assert!(Command::new("git")
        .current_dir(&source)
        .args([
            "-c",
            "user.name=AFT Tests",
            "-c",
            "user.email=aft-tests@example.com",
            "commit",
            "-m",
            "initial",
        ])
        .status()
        .expect("git commit")
        .success());

    let clone = dir.path().join("clone");
    assert!(Command::new("git")
        .args(["clone", "--quiet"])
        .arg(&source)
        .arg(&clone)
        .status()
        .expect("git clone")
        .success());

    let source_key = project_cache_key(&source);
    let clone_key = project_cache_key(&clone);

    assert_eq!(source_key.len(), 16);
    assert_eq!(clone_key.len(), 16);
    // Same history, different checkout locations: the keys must not collide.
    // (The previous `assert_eq!` contradicted the behavior this test names.)
    assert_ne!(source_key, clone_key);
}
3017
/// Even when HEAD is identical to the baseline's, a refresh must notice an
/// uncommitted edit to a tracked file.
#[test]
fn git_head_unchanged_picks_up_local_edits() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("repo");
    fs::create_dir_all(&project).expect("create repo dir");
    let tracked = project.join("tracked.txt");
    fs::write(&tracked, "oldtoken\n").expect("write file");

    // Runs git inside the repository and reports whether it succeeded.
    let git_ok = |args: &[&str]| -> bool {
        Command::new("git")
            .current_dir(&project)
            .args(args)
            .status()
            .unwrap()
            .success()
    };
    assert!(git_ok(&["init"]));
    assert!(git_ok(&["add", "."]));
    assert!(git_ok(&[
        "-c",
        "user.name=AFT Tests",
        "-c",
        "user.email=aft-tests@example.com",
        "commit",
        "-m",
        "initial",
    ]));

    let head = current_git_head(&project);
    let mut baseline = SearchIndex::build(&project);
    baseline.git_head = head.clone();
    fs::write(&tracked, "newtoken\n").expect("edit tracked file");

    let refreshed =
        SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
    let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);

    assert_eq!(result.total_matches, 1);
}
3062
/// Without git metadata and without file changes, a refresh keeps the file
/// count and the indexed content searchable.
#[test]
fn non_git_project_reuses_cache_when_files_unchanged() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");

    let baseline = SearchIndex::build(&project);
    let expected_file_count = baseline.file_count();

    let refreshed =
        SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));

    assert_eq!(refreshed.file_count(), expected_file_count);
    let result = refreshed.grep("unchangedtoken", true, &[], &[], &project, 10);
    assert_eq!(result.total_matches, 1);
}
3083
/// Searching a directory outside the project resolves to that directory and
/// turns index usage off.
#[test]
fn resolve_search_scope_disables_index_for_external_path() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    let outside = workspace.path().join("outside");
    fs::create_dir_all(&project).expect("create project dir");
    fs::create_dir_all(&outside).expect("create outside dir");

    let scope = resolve_search_scope(&project, outside.to_str());

    let canonical_outside = fs::canonicalize(&outside).expect("canonicalize outside");
    assert_eq!(scope.root, canonical_outside);
    assert!(!scope.use_index);
}
3100
/// A grep rooted at `src` must ignore matching files elsewhere in the
/// project.
#[test]
fn grep_filters_matches_to_search_root() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    let src = project.join("src");
    let docs = project.join("docs");
    fs::create_dir_all(&src).expect("create src dir");
    fs::create_dir_all(&docs).expect("create docs dir");
    fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
    fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

    let index = SearchIndex::build(&project);
    let result = index.grep("SearchIndex", true, &[], &[], &src, 10);

    assert_eq!(result.files_searched, 1);
    assert_eq!(result.files_with_matches, 1);
    assert_eq!(result.matches.len(), 1);
    // Reported paths are compared against the canonical form of the file.
    let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
    assert_eq!(result.matches[0].file, expected);
}
3122
/// Two hits on the same line count as a single match.
#[test]
fn grep_deduplicates_multiple_matches_on_same_line() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    let src = project.join("src");
    fs::create_dir_all(&src).expect("create src dir");
    fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

    let result = SearchIndex::build(&project).grep("SearchIndex", true, &[], &[], &src, 10);

    assert_eq!(result.total_matches, 1);
    assert_eq!(result.matches.len(), 1);
}
3137
/// A case-insensitive search for `Äbc` must find the indexed file that
/// contains `äbc`.
#[test]
fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    let unicode_file = project.join("unicode.txt");
    fs::write(&unicode_file, "äbc\n").expect("write unicode file");

    let index = SearchIndex::build(&project);
    let result = index.grep("Äbc", false, &[], &[], &project, 10);

    assert_eq!(result.total_matches, 1);
    assert_eq!(result.matches.len(), 1);
    let expected = fs::canonicalize(unicode_file).expect("canonicalize unicode file");
    assert_eq!(result.matches[0].file, expected);
}
3156
/// An edit that keeps both file size and mtime must still be detected on
/// refresh: the new content is searchable and the old trigrams are gone.
#[test]
fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    let tokens = project.join("tokens.txt");
    let pinned_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
    fs::write(&tokens, "alpha").expect("write original file");
    filetime::set_file_mtime(&tokens, pinned_mtime).expect("set original mtime");

    let baseline = SearchIndex::build(&project);
    // Same length, same mtime, different bytes.
    fs::write(&tokens, "bravo").expect("write same-size edit");
    filetime::set_file_mtime(&tokens, pinned_mtime).expect("restore original mtime");

    let refreshed =
        SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
    let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
    let canonical_file = fs::canonicalize(&tokens).expect("canonicalize edited file");
    let refreshed_id = *refreshed
        .path_to_id
        .get(&canonical_file)
        .expect("file remains indexed");

    assert_eq!(result.total_matches, 1);
    let new_trigram = pack_trigram(b'b', b'r', b'a');
    assert!(refreshed
        .postings_for_trigram(new_trigram, None)
        .contains(&refreshed_id));
    let old_trigram = pack_trigram(b'a', b'l', b'p');
    assert!(!refreshed
        .postings_for_trigram(old_trigram, None)
        .contains(&refreshed_id));
}
3188
/// With a result limit of one, the total still counts both matching lines
/// and the result is flagged as truncated.
#[test]
fn grep_reports_total_matches_before_truncation() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    let src = project.join("src");
    fs::create_dir_all(&src).expect("create src dir");
    fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

    let result = SearchIndex::build(&project).grep("SearchIndex", true, &[], &[], &src, 1);

    assert_eq!(result.total_matches, 2);
    assert_eq!(result.matches.len(), 1);
    assert!(result.truncated);
}
3204
/// A glob rooted at `src` must not return matching files from sibling
/// directories.
#[test]
fn glob_filters_results_to_search_root() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");
    let src = project.join("src");
    let scripts = project.join("scripts");
    fs::create_dir_all(&src).expect("create src dir");
    fs::create_dir_all(&scripts).expect("create scripts dir");
    fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
    fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

    let index = SearchIndex::build(&project);
    let files = index.glob("**/*.rs", &src);

    let only_src = fs::canonicalize(src.join("main.rs")).expect("canonicalize src file");
    assert_eq!(files, vec![only_src]);
}
3224
/// Globbing must return a file that lives in a dot-directory and whose
/// contents are not valid text.
#[test]
fn glob_includes_hidden_and_binary_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");

    let dot_dir = project.join(".hidden");
    fs::create_dir_all(&dot_dir).expect("create hidden dir");

    // Payload starts with a NUL byte followed by non-ASCII bytes.
    let binary_path = dot_dir.join("data.bin");
    fs::write(&binary_path, [0u8, 159, 146, 150]).expect("write binary file");

    let index = SearchIndex::build(&project);
    let found = index.glob("**/*.bin", &project);

    let expected = vec![fs::canonicalize(&binary_path).expect("canonicalize binary file")];
    assert_eq!(found, expected);
}
3242
/// A hand-assembled cache file carrying `1_000_000_000` in a `u32` field must be
/// rejected by `SearchIndex::read_from_disk` even though both checksums are valid.
#[test]
fn read_from_disk_rejects_invalid_nanos() {
    /// Returns `bytes` with the little-endian CRC32 of `bytes` appended.
    fn with_crc32_trailer(mut bytes: Vec<u8>) -> Vec<u8> {
        let crc = crc32fast::hash(&bytes);
        bytes.extend_from_slice(&crc.to_le_bytes());
        bytes
    }

    let workspace = tempfile::tempdir().expect("create temp dir");
    let cache_dir = workspace.path().join("cache");
    fs::create_dir_all(&cache_dir).expect("create cache dir");

    // Postings section, laid out field by field in on-disk order.
    // NOTE(review): the meaning of the individual counters/lengths is defined by
    // the reader elsewhere in this file — confirm against it before editing.
    let postings = with_crc32_trailer(
        [
            &INDEX_MAGIC[..],
            &INDEX_VERSION.to_le_bytes()[..],
            &0u32.to_le_bytes()[..],
            &1u32.to_le_bytes()[..],
            &DEFAULT_MAX_FILE_SIZE.to_le_bytes()[..],
            &1u32.to_le_bytes()[..],
            &b"/"[..],
            &[0u8][..],
            &1u32.to_le_bytes()[..],
            &0u64.to_le_bytes()[..],
            &0u64.to_le_bytes()[..],
            // The value under test: presumably the nanoseconds field, where
            // 1_000_000_000 is one past the largest valid sub-second value.
            &1_000_000_000u32.to_le_bytes()[..],
            &b"a"[..],
            &0u64.to_le_bytes()[..],
        ]
        .concat(),
    );

    // Lookup section: magic, version and a single zero `u32`.
    let lookup = with_crc32_trailer(
        [
            &LOOKUP_MAGIC[..],
            &INDEX_VERSION.to_le_bytes()[..],
            &0u32.to_le_bytes()[..],
        ]
        .concat(),
    );

    // Container: magic, version, postings length (checksum included), then both sections.
    let cache = [
        &CACHE_MAGIC.to_le_bytes()[..],
        &INDEX_VERSION.to_le_bytes()[..],
        &(postings.len() as u64).to_le_bytes()[..],
        &postings[..],
        &lookup[..],
    ]
    .concat();
    fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");

    assert!(SearchIndex::read_from_disk(&cache_dir, workspace.path()).is_none());
}
3284
/// `build_with_limit` and `build_with_limit_serial` must agree on file count,
/// trigram count and the id assigned to every path.
#[test]
fn parallel_cold_build_matches_serial_index() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = workspace.path().join("project");

    // Eighty packages, each holding one `lib.rs` with a distinct marker.
    for index in 0..80 {
        let package_dir = project.join(format!("pkg_{index:03}"));
        fs::create_dir_all(&package_dir).expect("create subdir");

        let contents = format!(
            "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
        );
        fs::write(package_dir.join("lib.rs"), contents).expect("write lib");
    }

    let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
    let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);

    // Aggregate sizes first, then the per-path id assignment.
    assert_eq!(serial.file_count(), parallel.file_count());
    assert_eq!(serial.trigram_count(), parallel.trigram_count());
    assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
    for (indexed_path, serial_id) in &serial.path_to_id {
        assert_eq!(parallel.path_to_id.get(indexed_path), Some(serial_id));
    }
}
3311
/// The ignore-aware rule discovery must skip a gitignored subtree that the
/// legacy stack-based discovery still descends into.
///
/// The fixture is a real git repository: `src/.gitignore` ignores `data/`,
/// which contains 200 nested directories. Every `git` setup step is checked
/// for a successful exit so a broken fixture fails at the step that broke.
#[test]
fn ignore_rule_discovery_respects_gitignore() {
    // `Command::status` yields `Ok` whenever the child could be spawned, even
    // if it then exits non-zero, so the exit status is asserted explicitly.
    let run_git = |step: &str, command: &mut Command| {
        let status = command.status().expect(step);
        assert!(status.success(), "{step} failed: {status}");
    };

    let dir = tempfile::tempdir().expect("create temp dir");
    let project = dir.path().join("project");
    fs::create_dir_all(project.join("src")).expect("mkdir src");
    fs::write(project.join("src/.gitignore"), "data/\n").expect("write gitignore");
    let data = project.join("src/data");
    fs::create_dir_all(&data).expect("mkdir data");
    for index in 0..200 {
        fs::create_dir_all(data.join(format!("d{index}"))).expect("mkdir nested");
        fs::write(data.join(format!("d{index}/f.rs")), "fn ignored() {}\n")
            .expect("write ignored file");
    }

    run_git("git init", Command::new("git").arg("init").arg(&project));
    // A committer identity is configured locally before committing.
    for args in [
        ["config", "user.email", "aft@example.invalid"],
        ["config", "user.name", "AFT Test"],
    ] {
        run_git(
            "git config",
            Command::new("git").arg("-C").arg(&project).args(args),
        );
    }
    run_git(
        "git add",
        Command::new("git").arg("-C").arg(&project).args(["add", "."]),
    );
    run_git(
        "git commit",
        Command::new("git")
            .arg("-C")
            .arg(&project)
            .args(["commit", "-m", "initial"]),
    );

    let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project);
    let walker_dirs = count_ignore_rule_discovery_dirs(&project);
    assert!(
        legacy_dirs > walker_dirs,
        "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
    );
    // 200 ignored directories exist, so staying under 50 shows they were skipped.
    assert!(
        walker_dirs < 50,
        "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
    );
}
3366
/// Sorting a path list in which every other entry does not exist on disk must
/// complete without panicking and must keep every entry.
#[test]
fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
    let workspace = tempfile::tempdir().expect("create tempdir");

    // Even indices are written to disk; odd indices are paths that were never created.
    let paths: Vec<PathBuf> = (0..30)
        .map(|i| {
            if i % 2 != 0 {
                return workspace.path().join(format!("missing-{i}.rs"));
            }
            let existing = workspace.path().join(format!("real-{i}.rs"));
            fs::write(&existing, format!("// {i}\n")).expect("write");
            existing
        })
        .collect();

    // NOTE(review): the sort is repeated 50 times on fresh copies, presumably to
    // give an intermittent comparator failure more chances to surface — confirm.
    for _ in 0..50 {
        let mut working = paths.clone();
        sort_paths_by_mtime_desc(&mut working);
        assert_eq!(working.len(), paths.len());
    }
}
3408
/// A grep over 40 matching files with a limit of 1000 must return a match for
/// each file and report neither `engine_capped` nor `truncated`.
#[test]
fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
    let workspace = tempfile::tempdir().expect("create tempdir");
    let root = workspace.path();

    // Forty files, each containing the needle exactly once.
    for i in 0..40 {
        let contents = format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n");
        fs::write(root.join(format!("file-{i}.rs")), contents).expect("write");
    }

    let index = SearchIndex::build_with_limit(root, DEFAULT_MAX_FILE_SIZE);
    let outcome = index.grep("needle_token", false, &[], &[], root, 1000);

    assert!(
        outcome.matches.len() >= 40,
        "expected a match per file, got {}",
        outcome.matches.len()
    );
    assert!(
        !outcome.engine_capped,
        "an uncapped grep over >10 files must not report engine_capped"
    );
    assert!(!outcome.truncated, "uncapped grep must not be truncated");
}
3439
/// Sorting grep matches whose files only partly exist on disk must complete
/// without panicking and must keep every match.
#[test]
fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
    let workspace = tempfile::tempdir().expect("create tempdir");
    let mut matches: Vec<GrepMatch> = Vec::new();

    for i in 0..30 {
        // Only even-numbered entries are backed by a real file.
        let on_disk = i % 2 == 0;
        let file_name = if on_disk {
            format!("real-{i}.rs")
        } else {
            format!("missing-{i}.rs")
        };
        let file = workspace.path().join(file_name);
        if on_disk {
            fs::write(&file, format!("// {i}\n")).expect("write");
        }

        // The same text is used for both the line and the matched span.
        let text = format!("match {i}");
        matches.push(GrepMatch {
            file,
            line: u32::try_from(i).unwrap_or(0),
            column: 0,
            line_text: text.clone(),
            match_text: text,
        });
    }

    // NOTE(review): repeated on fresh copies, presumably to give an intermittent
    // comparator failure more chances to surface — confirm.
    for _ in 0..50 {
        let mut working = matches.clone();
        sort_grep_matches_by_mtime_desc(&mut working, workspace.path());
        assert_eq!(working.len(), matches.len());
    }
}
3470}