1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
/// Default size limit (1 MiB) above which a file is tracked but its contents are not indexed.
const DEFAULT_MAX_FILE_SIZE: u64 = 1024 * 1024;
/// Magic number written as the first `u32` of the combined `cache.bin` file.
const CACHE_MAGIC: u32 = 0x3144_4958;
/// Magic bytes that open the postings section of the cache.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// Magic bytes that open the lookup section of the cache.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// Cache format version; a cache carrying any other value is rejected on load.
const INDEX_VERSION: u32 = 4;
// NOTE(review): not referenced in the visible part of the file — presumably the number of
// leading bytes sampled from a file; confirm against its users.
const PREVIEW_BYTES: usize = 8_192;
/// Number of paths prepared in parallel before their results are merged into the index.
const PARALLEL_INGEST_CHUNK_SIZE: usize = 256;
// NOTE(review): not referenced in the visible part of the file — confirm intended meaning.
const EOF_SENTINEL: u8 = 0;
/// Sanity cap applied to entry counts read from an on-disk cache.
const MAX_ENTRIES: usize = 10_000_000;
/// Smallest serialized file record: flag (1) + path length (4) + size (8) + mtime seconds (8)
/// + mtime nanoseconds (4) + content hash (32), followed by a possibly empty path.
const MIN_FILE_ENTRY_BYTES: usize = 1 + 4 + 8 + 8 + 4 + 32;
/// Serialized lookup record: trigram (4) + blob offset (8) + posting count (4).
const LOOKUP_ENTRY_BYTES: usize = 4 + 8 + 4;
/// Serialized posting: file id (4) + next mask (1) + location mask (1).
const POSTING_BYTES: usize = 4 + 1 + 1;
/// Process-wide mutex held while `CacheLock::acquire` obtains the on-disk lock.
static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
35
36pub struct CacheLock {
37 _guard: fs_lock::LockGuard,
38}
39
40impl CacheLock {
41 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
42 fs::create_dir_all(cache_dir)?;
43 let path = cache_dir.join("cache.lock");
44 let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
45 .lock()
46 .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
47 fs_lock::try_acquire(&path, Duration::from_secs(2))
48 .map(|guard| Self { _guard: guard })
49 .map_err(|error| match error {
50 fs_lock::AcquireError::Timeout => {
51 std::io::Error::other("timed out acquiring search cache lock")
52 }
53 fs_lock::AcquireError::Io(error) => error,
54 })
55 }
56}
57
58#[derive(Clone, Debug)]
59pub struct SearchIndex {
60 pub postings: HashMap<u32, Vec<Posting>>,
61 pub files: Vec<FileEntry>,
62 pub path_to_id: HashMap<PathBuf, u32>,
63 pub ready: bool,
64 project_root: PathBuf,
65 git_head: Option<String>,
66 max_file_size: u64,
67 ignore_rules_fingerprint: String,
68 pub file_trigrams: HashMap<u32, Vec<u32>>,
69 unindexed_files: HashSet<u32>,
70}
71
/// Outcome of a lexical ranking pass.
#[derive(Debug, Default, Clone)]
pub struct LexicalRankResult {
    /// Ranked `(path, score)` pairs, highest score first.
    pub files: Vec<(PathBuf, f32)>,
    /// `true` when more candidates were gathered than the engine is willing to score.
    pub engine_capped: bool,
}
77
78impl SearchIndex {
79 pub fn file_count(&self) -> usize {
81 self.files.len()
82 }
83
84 pub fn trigram_count(&self) -> usize {
86 self.postings.len()
87 }
88
89 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
91 query_trigrams_from_tokens(tokens)
92 }
93
94 pub fn lexical_rank(
96 &self,
97 query_trigrams: &[u32],
98 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
99 max_files: usize,
100 ) -> Vec<(PathBuf, f32)> {
101 self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
102 .files
103 }
104
105 pub fn lexical_rank_with_stats(
108 &self,
109 query_trigrams: &[u32],
110 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
111 max_files: usize,
112 ) -> LexicalRankResult {
113 if query_trigrams.is_empty() || max_files == 0 {
114 return LexicalRankResult::default();
115 }
116
117 let mut non_zero: Vec<(u32, usize)> = query_trigrams
118 .iter()
119 .filter_map(|trigram| {
120 let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
121 (posting_count > 0).then_some((*trigram, posting_count))
122 })
123 .collect();
124 if non_zero.is_empty() {
125 return LexicalRankResult::default();
126 }
127
128 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
129 let selected_count = non_zero.len().min(3);
130 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
131
132 let mut candidate_ids = BTreeSet::new();
133 for (trigram, _) in non_zero.iter().take(selected_count) {
134 if let Some(postings) = self.postings.get(trigram) {
135 for posting in postings {
136 if self.is_active_file(posting.file_id) {
137 candidate_ids.insert(posting.file_id);
138 }
139 }
140 }
141 }
142 let pre_filter_candidate_count = candidate_ids.len();
143 let engine_capped = pre_filter_candidate_count > candidate_cap;
144 let filtered_candidates = candidate_ids
145 .into_iter()
146 .filter_map(|file_id| {
147 self.files
148 .get(file_id as usize)
149 .map(|entry| (file_id, entry))
150 })
151 .filter(|(_, entry)| {
152 if let Some(filter) = candidate_filter {
153 filter(&entry.path)
154 } else {
155 true
156 }
157 })
158 .collect::<Vec<_>>();
159
160 let mut ranked = Vec::new();
161 for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
162 let score = lexical_score(self, query_trigrams, file_id);
163 if score > 0.0 {
164 ranked.push((entry.path.clone(), score));
165 }
166 }
167
168 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
169 ranked.truncate(max_files);
170 LexicalRankResult {
171 files: ranked,
172 engine_capped,
173 }
174 }
175}
176
/// One occurrence record of a trigram: the file it appears in plus two filter masks.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Posting {
    /// Id of the file (index into the file table).
    pub file_id: u32,
    // NOTE(review): presumably a bit mask summarizing the bytes that follow the trigram — confirm.
    pub next_mask: u8,
    // NOTE(review): presumably a bit mask summarizing where the trigram occurs — confirm.
    pub loc_mask: u8,
}
183
184#[derive(Clone, Debug)]
185pub struct FileEntry {
186 pub path: PathBuf,
187 pub size: u64,
188 pub modified: SystemTime,
189 pub content_hash: blake3::Hash,
190}
191
/// A single grep hit.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct GrepMatch {
    /// File containing the match.
    pub file: PathBuf,
    /// Line number of the match.
    pub line: u32,
    /// Column of the match within the line.
    pub column: u32,
    /// Text of the line containing the match.
    pub line_text: String,
    /// The matched text itself.
    pub match_text: String,
}
200
201#[derive(Clone, Debug)]
202pub struct GrepResult {
203 pub matches: Vec<GrepMatch>,
204 pub total_matches: usize,
205 pub files_searched: usize,
206 pub files_with_matches: usize,
207 pub index_status: IndexStatus,
208 pub truncated: bool,
209 pub fully_degraded: bool,
210 pub engine_capped: bool,
211 pub walk_truncated: bool,
213}
214
/// State of the search index as reported alongside search results.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum IndexStatus {
    Ready,
    Building,
    Fallback,
    Disabled,
}

impl IndexStatus {
    /// Returns the variant's name as a static string (`"Ready"`, `"Building"`, ...).
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Disabled => "Disabled",
            Self::Fallback => "Fallback",
            Self::Building => "Building",
            Self::Ready => "Ready",
        }
    }
}
233
234#[derive(Clone, Debug, Default)]
235pub struct RegexQuery {
236 pub and_trigrams: Vec<u32>,
237 pub or_groups: Vec<Vec<u32>>,
238 pub(crate) and_filters: HashMap<u32, PostingFilter>,
239 pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
240}
241
/// Pair of masks attached to a trigram, either computed for a file's contents (and copied
/// into its `Posting`) or supplied by a query to narrow posting lookups.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct PostingFilter {
    next_mask: u8,
    loc_mask: u8,
}
247
/// The subset of filesystem metadata the index records for a file.
#[derive(Copy, Clone)]
struct SearchFileMetadata {
    /// File size in bytes.
    size: u64,
    /// Last modification time.
    modified: SystemTime,
}
253
254struct PreparedIndexedFile {
255 metadata: SearchFileMetadata,
256 content_hash: blake3::Hash,
257 trigram_map: BTreeMap<u32, PostingFilter>,
258}
259
260enum PreparedSearchPath {
261 Indexed(PreparedIndexedFile),
262 Unindexed(SearchFileMetadata),
263 Skipped,
264}
265
/// Intermediate query representation made of raw byte runs.
// NOTE(review): presumably the literal runs extracted from a pattern before they are turned
// into trigrams — the consumer is outside the visible part of the file; confirm.
#[derive(Debug, Default, Clone)]
struct QueryBuild {
    and_runs: Vec<Vec<u8>>,
    or_groups: Vec<Vec<Vec<u8>>>,
}
271
272#[derive(Clone, Debug, Default)]
273pub(crate) struct PathFilters {
274 includes: Option<GlobSet>,
275 excludes: Option<GlobSet>,
276}
277
/// Where a search runs and whether it may consult the index.
#[derive(Debug, Clone)]
pub(crate) struct SearchScope {
    /// Root directory of the search.
    pub root: PathBuf,
    /// Whether the index should be used for this scope.
    pub use_index: bool,
}
283
/// Internal form of a grep hit whose file path is reference-counted, so several matches in
/// the same file can share one `PathBuf`.
#[derive(Debug, Clone)]
struct SharedGrepMatch {
    file: Arc<PathBuf>,
    line: u32,
    column: u32,
    line_text: String,
    match_text: String,
}
292
293#[derive(Clone, Debug)]
294enum SearchMatcher {
295 Literal(LiteralSearch),
296 Regex(Regex),
297}
298
299impl SearchIndex {
300 pub fn new() -> Self {
301 SearchIndex {
302 postings: HashMap::new(),
303 files: Vec::new(),
304 path_to_id: HashMap::new(),
305 ready: false,
306 project_root: PathBuf::new(),
307 git_head: None,
308 max_file_size: DEFAULT_MAX_FILE_SIZE,
309 ignore_rules_fingerprint: String::new(),
310 file_trigrams: HashMap::new(),
311 unindexed_files: HashSet::new(),
312 }
313 }
314
315 pub fn build(root: &Path) -> Self {
316 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
317 }
318
319 pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
320 let started = std::time::Instant::now();
321 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
322 let mut index = SearchIndex {
323 project_root: project_root.clone(),
324 max_file_size,
325 ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
326 ..SearchIndex::new()
327 };
328
329 let filters = PathFilters::default();
330 let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
331 let indexed = index.ingest_paths_parallel(&paths);
332
333 index.git_head = current_git_head(&project_root);
334 index.ready = true;
335 crate::slog_info!(
336 "search index cold build: {} files, {} trigrams, {} ms (pool={})",
337 indexed,
338 index.postings.len(),
339 started.elapsed().as_millis(),
340 search_index_build_pool_size()
341 );
342 index
343 }
344
/// Test-only variant of `build_with_limit` that ingests files one at a time through
/// `update_file` instead of the parallel ingestion path.
#[cfg(test)]
pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
    let project_root = match fs::canonicalize(root) {
        Ok(canonical) => canonical,
        Err(_) => root.to_path_buf(),
    };

    let mut index = SearchIndex::new();
    index.ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
    index.max_file_size = max_file_size;
    index.project_root = project_root.clone();

    let no_filters = PathFilters::default();
    walk_project_files(&project_root, &no_filters)
        .into_iter()
        .for_each(|path| index.update_file(&path));

    index.git_head = current_git_head(&project_root);
    index.ready = true;
    index
}
363
364 fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
365 let max_file_size = self.max_file_size;
366 let pool = match rayon::ThreadPoolBuilder::new()
367 .num_threads(search_index_build_pool_size())
368 .thread_name(|index| format!("aft-search-build-{index}"))
369 .stack_size(8 * 1024 * 1024)
370 .build()
371 {
372 Ok(pool) => Some(pool),
373 Err(error) => {
374 log::warn!(
375 "search index: bounded build pool unavailable ({error}); using global pool"
376 );
377 None
378 }
379 };
380
381 let mut indexed = 0usize;
382 for chunk in paths.chunks(PARALLEL_INGEST_CHUNK_SIZE) {
383 let prepare_chunk = || -> Vec<PreparedSearchPath> {
384 chunk
385 .par_iter()
386 .map(|path| prepare_search_path(path, max_file_size))
387 .collect()
388 };
389 let prepared = match &pool {
390 Some(pool) => pool.install(prepare_chunk),
391 None => prepare_chunk(),
392 };
393
394 for (path, prepared) in chunk.iter().zip(prepared) {
395 let inserted = match prepared {
396 PreparedSearchPath::Indexed(file) => self.index_prepared_new_file(path, file),
397 PreparedSearchPath::Unindexed(metadata) => {
398 self.track_unindexed_file_with_metadata(path, metadata)
399 }
400 PreparedSearchPath::Skipped => false,
401 };
402 if inserted {
403 indexed += 1;
404 }
405 }
406 }
407
408 indexed
409 }
410
411 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
412 self.remove_file(path);
413 let metadata = metadata_for_indexed_content(path, content.len() as u64);
414 self.index_file_with_metadata(path, content, metadata);
415 }
416
417 fn index_file_with_metadata(
418 &mut self,
419 path: &Path,
420 content: &[u8],
421 metadata: SearchFileMetadata,
422 ) -> bool {
423 self.index_prepared_new_file(
424 path,
425 PreparedIndexedFile {
426 metadata,
427 content_hash: cache_freshness::hash_bytes(content),
428 trigram_map: trigram_filter_map(content, true),
429 },
430 )
431 }
432
433 fn index_prepared_new_file(&mut self, path: &Path, file: PreparedIndexedFile) -> bool {
434 let file_id = match self.allocate_file_id_with_metadata(path, file.metadata) {
435 Some(file_id) => file_id,
436 None => return false,
437 };
438 if let Some(entry) = self.files.get_mut(file_id as usize) {
439 entry.content_hash = file.content_hash;
440 }
441
442 let mut file_trigrams = Vec::with_capacity(file.trigram_map.len());
443 for (trigram, filter) in file.trigram_map {
444 let postings = self.postings.entry(trigram).or_default();
445 postings.push(Posting {
446 file_id,
447 next_mask: filter.next_mask,
448 loc_mask: filter.loc_mask,
449 });
450 if postings.len() > 1
454 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
455 {
456 postings.sort_unstable_by_key(|p| p.file_id);
457 }
458 file_trigrams.push(trigram);
459 }
460
461 self.file_trigrams.insert(file_id, file_trigrams);
462 self.unindexed_files.remove(&file_id);
463 true
464 }
465
466 pub fn remove_file(&mut self, path: &Path) {
467 let canonical_path = canonicalize_existing_or_deleted_path(path);
468 let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
469 file_id
470 } else if canonical_path.as_path() != path {
471 let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
472 return;
473 };
474 file_id
475 } else {
476 return;
477 };
478
479 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
480 for trigram in trigrams {
481 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
482 postings.retain(|posting| posting.file_id != file_id);
483 postings.is_empty()
484 } else {
485 false
486 };
487
488 if should_remove {
489 self.postings.remove(&trigram);
490 }
491 }
492 }
493
494 self.unindexed_files.remove(&file_id);
495 if let Some(file) = self.files.get_mut(file_id as usize) {
496 file.path = PathBuf::new();
497 file.size = 0;
498 file.modified = UNIX_EPOCH;
499 file.content_hash = cache_freshness::zero_hash();
500 }
501 }
502
503 pub fn update_file(&mut self, path: &Path) {
504 self.remove_file(path);
505
506 let metadata = match fs::metadata(path) {
507 Ok(metadata) if metadata.is_file() => metadata,
508 _ => return,
509 };
510
511 let metadata = search_file_metadata(&metadata);
512
513 if is_binary_path(path, metadata.size) {
514 self.track_unindexed_file_with_metadata(path, metadata);
515 return;
516 }
517
518 if metadata.size > self.max_file_size {
519 self.track_unindexed_file_with_metadata(path, metadata);
520 return;
521 }
522
523 let content = match fs::read(path) {
524 Ok(content) => content,
525 Err(_) => return,
526 };
527
528 if is_binary_bytes(&content) {
529 self.track_unindexed_file_with_metadata(path, metadata);
530 return;
531 }
532
533 self.index_file_with_metadata(path, &content, metadata);
534 }
535
536 pub fn grep(
537 &self,
538 pattern: &str,
539 case_sensitive: bool,
540 include: &[String],
541 exclude: &[String],
542 search_root: &Path,
543 max_results: usize,
544 ) -> GrepResult {
545 match pattern_compile::compile(
546 pattern,
547 CompileOpts {
548 case_insensitive: !case_sensitive,
549 ..CompileOpts::default()
550 },
551 ) {
552 CompileResult::Ok(compiled) => {
553 self.search_grep(&compiled, include, exclude, search_root, max_results)
554 }
555 CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
556 self.empty_grep_result()
557 }
558 }
559 }
560
561 pub fn search_grep(
562 &self,
563 pattern: &CompiledPattern,
564 include: &[String],
565 exclude: &[String],
566 search_root: &Path,
567 max_results: usize,
568 ) -> GrepResult {
569 let matcher = match pattern {
570 CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
571 CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
572 };
573
574 let filters = match build_path_filters(include, exclude) {
575 Ok(filters) => filters,
576 Err(_) => PathFilters::default(),
577 };
578 let search_root = canonicalize_or_normalize(search_root);
579
580 let raw_pattern = pattern.raw_pattern_for_trigrams();
581 let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
582 RegexQuery::default()
583 } else {
584 decompose_regex(&raw_pattern)
585 };
586 let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
587 let candidate_ids = self.candidates(&query);
588
589 let candidate_files: Vec<&FileEntry> = candidate_ids
590 .into_iter()
591 .filter_map(|file_id| self.files.get(file_id as usize))
592 .filter(|file| !file.path.as_os_str().is_empty())
593 .filter(|file| is_within_search_root(&search_root, &file.path))
594 .filter(|file| filters.matches(&self.project_root, &file.path))
595 .collect();
596
597 let total_matches = AtomicUsize::new(0);
598 let files_searched = AtomicUsize::new(0);
599 let files_with_matches = AtomicUsize::new(0);
600 let truncated = AtomicBool::new(false);
601 let engine_capped = AtomicBool::new(false);
602 let stop_after = max_results.saturating_mul(2);
603 let stop_scan = Arc::new(AtomicBool::new(false));
604
605 let mut matches = if candidate_files.len() > 10 {
606 candidate_files
607 .par_iter()
608 .map(|file| {
609 if grep_scan_should_stop(
610 Some(&stop_scan),
611 &truncated,
612 &total_matches,
613 stop_after,
614 ) {
615 engine_capped.store(true, Ordering::Relaxed);
616 return Vec::new();
617 }
618 search_candidate_file(
619 file,
620 &matcher,
621 max_results,
622 stop_after,
623 &total_matches,
624 &files_searched,
625 &files_with_matches,
626 &truncated,
627 &engine_capped,
628 Some(&stop_scan),
629 )
630 })
631 .reduce(Vec::new, |mut left, mut right| {
632 left.append(&mut right);
640 left
641 })
642 } else {
643 let mut matches = Vec::new();
644 for file in candidate_files {
645 matches.extend(search_candidate_file(
646 file,
647 &matcher,
648 max_results,
649 stop_after,
650 &total_matches,
651 &files_searched,
652 &files_with_matches,
653 &truncated,
654 &engine_capped,
655 None,
656 ));
657
658 if should_stop_search(&truncated, &total_matches, stop_after) {
659 engine_capped.store(true, Ordering::Relaxed);
660 break;
661 }
662 }
663 matches
664 };
665
666 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
667 self.path_to_id
668 .get(path)
669 .and_then(|file_id| self.files.get(*file_id as usize))
670 .map(|file| file.modified)
671 });
672
673 let matches = matches
674 .into_iter()
675 .map(|matched| GrepMatch {
676 file: matched.file.as_ref().clone(),
677 line: matched.line,
678 column: matched.column,
679 line_text: matched.line_text,
680 match_text: matched.match_text,
681 })
682 .collect();
683
684 GrepResult {
685 total_matches: total_matches.load(Ordering::Relaxed),
686 matches,
687 files_searched: files_searched.load(Ordering::Relaxed),
688 files_with_matches: files_with_matches.load(Ordering::Relaxed),
689 index_status: if self.ready {
690 IndexStatus::Ready
691 } else {
692 IndexStatus::Building
693 },
694 truncated: truncated.load(Ordering::Relaxed),
695 fully_degraded,
696 engine_capped: engine_capped.load(Ordering::Relaxed),
697 walk_truncated: false,
698 }
699 }
700
701 fn empty_grep_result(&self) -> GrepResult {
702 GrepResult {
703 matches: Vec::new(),
704 total_matches: 0,
705 files_searched: 0,
706 files_with_matches: 0,
707 index_status: if self.ready {
708 IndexStatus::Ready
709 } else {
710 IndexStatus::Building
711 },
712 truncated: false,
713 fully_degraded: false,
714 engine_capped: false,
715 walk_truncated: false,
716 }
717 }
718
719 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
720 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
721 Ok(filters) => filters,
722 Err(_) => return Vec::new(),
723 };
724 let search_root = canonicalize_or_normalize(search_root);
725 let mut entries = self
726 .files
727 .iter()
728 .filter(|file| !file.path.as_os_str().is_empty())
729 .filter(|file| is_within_search_root(&search_root, &file.path))
730 .filter(|file| filters.matches(&self.project_root, &file.path))
731 .map(|file| (file.path.clone(), file.modified))
732 .collect::<Vec<_>>();
733
734 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
735 right_mtime
736 .cmp(left_mtime)
737 .then_with(|| left_path.cmp(right_path))
738 });
739
740 entries.into_iter().map(|(path, _)| path).collect()
741 }
742
743 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
744 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
745 return self.active_file_ids();
746 }
747
748 let mut and_trigrams = query.and_trigrams.clone();
749 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
750
751 let mut current: Option<Vec<u32>> = None;
752
753 for trigram in and_trigrams {
754 let filter = query.and_filters.get(&trigram).copied();
755 let matches = self.postings_for_trigram(trigram, filter);
756 current = Some(match current.take() {
757 Some(existing) => intersect_sorted_ids(&existing, &matches),
758 None => matches,
759 });
760
761 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
762 break;
763 }
764 }
765
766 let mut current = current.unwrap_or_else(|| self.active_file_ids());
767
768 for (index, group) in query.or_groups.iter().enumerate() {
769 let mut group_matches = Vec::new();
770 let filters = query.or_filters.get(index);
771
772 for trigram in group {
773 let filter = filters.and_then(|filters| filters.get(trigram).copied());
774 let matches = self.postings_for_trigram(*trigram, filter);
775 if group_matches.is_empty() {
776 group_matches = matches;
777 } else {
778 group_matches = union_sorted_ids(&group_matches, &matches);
779 }
780 }
781
782 current = intersect_sorted_ids(¤t, &group_matches);
783 if current.is_empty() {
784 break;
785 }
786 }
787
788 let mut unindexed = self
789 .unindexed_files
790 .iter()
791 .copied()
792 .filter(|file_id| self.is_active_file(*file_id))
793 .collect::<Vec<_>>();
794 if !unindexed.is_empty() {
795 unindexed.sort_unstable();
796 current = union_sorted_ids(¤t, &unindexed);
797 }
798
799 current
800 }
801
802 pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
803 if fs::create_dir_all(cache_dir).is_err() {
804 return;
805 }
806
807 let cache_path = cache_dir.join("cache.bin");
808 let tmp_cache = cache_dir.join(format!(
809 "cache.bin.tmp.{}.{}",
810 std::process::id(),
811 SystemTime::now()
812 .duration_since(UNIX_EPOCH)
813 .unwrap_or(Duration::ZERO)
814 .as_nanos()
815 ));
816
817 let active_ids = self.active_file_ids();
818 let mut id_map = HashMap::new();
819 for (new_id, old_id) in active_ids.iter().enumerate() {
820 let Ok(new_id_u32) = u32::try_from(new_id) else {
821 return;
822 };
823 id_map.insert(*old_id, new_id_u32);
824 }
825
826 let write_result = (|| -> std::io::Result<()> {
827 let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
828
829 postings_writer.write_all(INDEX_MAGIC)?;
830 write_u32(&mut postings_writer, INDEX_VERSION)?;
831
832 let head = git_head.unwrap_or_default();
833 let root = self.project_root.to_string_lossy();
834 let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
835 ignore_rules_fingerprint(&self.project_root)
836 } else {
837 self.ignore_rules_fingerprint.clone()
838 };
839 let head_len = u32::try_from(head.len())
840 .map_err(|_| std::io::Error::other("git head too large to cache"))?;
841 let root_len = u32::try_from(root.len())
842 .map_err(|_| std::io::Error::other("project root too large to cache"))?;
843 let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
844 .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
845 let file_count = u32::try_from(active_ids.len())
846 .map_err(|_| std::io::Error::other("too many files to cache"))?;
847
848 write_u32(&mut postings_writer, head_len)?;
849 write_u32(&mut postings_writer, root_len)?;
850 write_u32(&mut postings_writer, ignore_fingerprint_len)?;
851 write_u64(&mut postings_writer, self.max_file_size)?;
852 write_u32(&mut postings_writer, file_count)?;
853 postings_writer.write_all(head.as_bytes())?;
854 postings_writer.write_all(root.as_bytes())?;
855 postings_writer.write_all(ignore_fingerprint.as_bytes())?;
856
857 for old_id in &active_ids {
858 let Some(file) = self.files.get(*old_id as usize) else {
859 return Err(std::io::Error::other("missing file entry for cache write"));
860 };
861 let path =
862 cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
863 std::io::Error::other(format!(
864 "refusing to cache path outside project root: {}",
865 file.path.display()
866 ))
867 })?;
868 let path = path.to_string_lossy();
869 let path_len = u32::try_from(path.len())
870 .map_err(|_| std::io::Error::other("cached path too large"))?;
871 let modified = file
872 .modified
873 .duration_since(UNIX_EPOCH)
874 .unwrap_or(Duration::ZERO);
875 let unindexed = if self.unindexed_files.contains(old_id) {
876 1u8
877 } else {
878 0u8
879 };
880
881 postings_writer.write_all(&[unindexed])?;
882 write_u32(&mut postings_writer, path_len)?;
883 write_u64(&mut postings_writer, file.size)?;
884 write_u64(&mut postings_writer, modified.as_secs())?;
885 write_u32(&mut postings_writer, modified.subsec_nanos())?;
886 postings_writer.write_all(file.content_hash.as_bytes())?;
887 postings_writer.write_all(path.as_bytes())?;
888 }
889
890 let mut lookup_entries = Vec::new();
891 let mut postings_blob = Vec::new();
892 let mut sorted_postings: Vec<_> = self.postings.iter().collect();
893 sorted_postings.sort_by_key(|(trigram, _)| **trigram);
894
895 for (trigram, postings) in sorted_postings {
896 let offset = u64::try_from(postings_blob.len())
897 .map_err(|_| std::io::Error::other("postings blob too large"))?;
898 let mut count = 0u32;
899
900 for posting in postings {
901 let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
902 continue;
903 };
904
905 postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
906 postings_blob.push(posting.next_mask);
907 postings_blob.push(posting.loc_mask);
908 count = count.saturating_add(1);
909 }
910
911 if count > 0 {
912 lookup_entries.push((*trigram, offset, count));
913 }
914 }
915
916 write_u64(
917 &mut postings_writer,
918 u64::try_from(postings_blob.len())
919 .map_err(|_| std::io::Error::other("postings blob too large"))?,
920 )?;
921 postings_writer.write_all(&postings_blob)?;
922 postings_writer.flush()?;
923 let mut postings_blob_file = postings_writer
924 .into_inner()
925 .map_err(|error| std::io::Error::other(error.to_string()))?
926 .into_inner();
927 let checksum = crc32fast::hash(&postings_blob_file);
928 postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
929
930 let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
931 let entry_count = u32::try_from(lookup_entries.len())
932 .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
933
934 lookup_writer.write_all(LOOKUP_MAGIC)?;
935 write_u32(&mut lookup_writer, INDEX_VERSION)?;
936 write_u32(&mut lookup_writer, entry_count)?;
937
938 for (trigram, offset, count) in lookup_entries {
939 write_u32(&mut lookup_writer, trigram)?;
940 write_u64(&mut lookup_writer, offset)?;
941 write_u32(&mut lookup_writer, count)?;
942 }
943
944 lookup_writer.flush()?;
945 let mut lookup_blob_file = lookup_writer
946 .into_inner()
947 .map_err(|error| std::io::Error::other(error.to_string()))?
948 .into_inner();
949 let checksum = crc32fast::hash(&lookup_blob_file);
950 lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
951
952 let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
953 write_u32(&mut cache_writer, CACHE_MAGIC)?;
954 write_u32(&mut cache_writer, INDEX_VERSION)?;
955 write_u64(
956 &mut cache_writer,
957 u64::try_from(postings_blob_file.len())
958 .map_err(|_| std::io::Error::other("postings section too large"))?,
959 )?;
960 cache_writer.write_all(&postings_blob_file)?;
961 cache_writer.write_all(&lookup_blob_file)?;
962 cache_writer.flush()?;
963 cache_writer.get_ref().sync_all()?;
964 drop(cache_writer);
965 fs::rename(&tmp_cache, &cache_path)?;
966
967 Ok(())
968 })();
969
970 if write_result.is_err() {
971 let _ = fs::remove_file(&tmp_cache);
972 }
973 }
974
975 pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
976 debug_assert!(current_canonical_root.is_absolute());
977 let cache_path = cache_dir.join("cache.bin");
978 let cache_bytes = fs::read(&cache_path).ok()?;
979 if cache_bytes.len() < 16 {
980 return None;
981 }
982 let mut header = Cursor::new(&cache_bytes);
983 if read_u32(&mut header).ok()? != CACHE_MAGIC {
984 return None;
985 }
986 if read_u32(&mut header).ok()? != INDEX_VERSION {
987 return None;
988 }
989 let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
990 let start = usize::try_from(header.position()).ok()?;
991 let postings_end = start.checked_add(postings_len_total)?;
992 if postings_end > cache_bytes.len() {
993 return None;
994 }
995 let postings_bytes = &cache_bytes[start..postings_end];
996 let lookup_bytes = &cache_bytes[postings_end..];
997 let lookup_len_total = lookup_bytes.len();
998 let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
999 let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
1000 if postings_len_total < 4 || lookup_len_total < 4 {
1001 return None;
1002 }
1003 verify_crc32_bytes_slice(postings_bytes).ok()?;
1004 verify_crc32_bytes_slice(lookup_bytes).ok()?;
1005
1006 let mut magic = [0u8; 8];
1007 postings_reader.read_exact(&mut magic).ok()?;
1008 if &magic != INDEX_MAGIC {
1009 return None;
1010 }
1011 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
1012 return None;
1013 }
1014
1015 let head_len = read_u32(&mut postings_reader).ok()? as usize;
1016 let root_len = read_u32(&mut postings_reader).ok()? as usize;
1017 let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
1018 let max_file_size = read_u64(&mut postings_reader).ok()?;
1019 let file_count = read_u32(&mut postings_reader).ok()? as usize;
1020 if file_count > MAX_ENTRIES {
1021 return None;
1022 }
1023 let postings_body_len = postings_len_total.checked_sub(4)?;
1024 let lookup_body_len = lookup_len_total.checked_sub(4)?;
1025
1026 let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
1027 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
1028 if minimum_file_bytes > remaining_postings {
1029 return None;
1030 }
1031
1032 if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1033 return None;
1034 }
1035 let mut head_bytes = vec![0u8; head_len];
1036 postings_reader.read_exact(&mut head_bytes).ok()?;
1037 let git_head = String::from_utf8(head_bytes)
1038 .ok()
1039 .filter(|head| !head.is_empty());
1040
1041 if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1042 return None;
1043 }
1044 let mut root_bytes = vec![0u8; root_len];
1045 postings_reader.read_exact(&mut root_bytes).ok()?;
1046 let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
1047 let project_root = current_canonical_root.to_path_buf();
1048
1049 if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1050 return None;
1051 }
1052 let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
1053 postings_reader
1054 .read_exact(&mut ignore_fingerprint_bytes)
1055 .ok()?;
1056 let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
1057 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
1058 if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1059 return None;
1060 }
1061
1062 let mut files = Vec::with_capacity(file_count);
1063 let mut path_to_id = HashMap::new();
1064 let mut unindexed_files = HashSet::new();
1065
1066 for file_id in 0..file_count {
1067 let mut unindexed = [0u8; 1];
1068 postings_reader.read_exact(&mut unindexed).ok()?;
1069 let path_len = read_u32(&mut postings_reader).ok()? as usize;
1070 let size = read_u64(&mut postings_reader).ok()?;
1071 let secs = read_u64(&mut postings_reader).ok()?;
1072 let nanos = read_u32(&mut postings_reader).ok()?;
1073 let mut hash_bytes = [0u8; 32];
1074 postings_reader.read_exact(&mut hash_bytes).ok()?;
1075 let content_hash = blake3::Hash::from_bytes(hash_bytes);
1076 if nanos >= 1_000_000_000 {
1077 return None;
1078 }
1079 if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1080 return None;
1081 }
1082 let mut path_bytes = vec![0u8; path_len];
1083 postings_reader.read_exact(&mut path_bytes).ok()?;
1084 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
1085 let full_path = cached_path_under_root(&project_root, &relative_path)?;
1086 let file_id_u32 = u32::try_from(file_id).ok()?;
1087
1088 files.push(FileEntry {
1089 path: full_path.clone(),
1090 size,
1091 modified: UNIX_EPOCH + Duration::new(secs, nanos),
1092 content_hash,
1093 });
1094 path_to_id.insert(full_path, file_id_u32);
1095 if unindexed[0] == 1 {
1096 unindexed_files.insert(file_id_u32);
1097 }
1098 }
1099
1100 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
1101 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
1102 if postings_len > max_postings_bytes {
1103 return None;
1104 }
1105 if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
1106 return None;
1107 }
1108 let mut postings_blob = vec![0u8; postings_len];
1109 postings_reader.read_exact(&mut postings_blob).ok()?;
1110
1111 let mut lookup_magic = [0u8; 8];
1112 lookup_reader.read_exact(&mut lookup_magic).ok()?;
1113 if &lookup_magic != LOOKUP_MAGIC {
1114 return None;
1115 }
1116 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
1117 return None;
1118 }
1119 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
1120 if entry_count > MAX_ENTRIES {
1121 return None;
1122 }
1123 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
1124 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
1125 if minimum_lookup_bytes > remaining_lookup {
1126 return None;
1127 }
1128
1129 let mut postings = HashMap::new();
1130 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1131
1132 for _ in 0..entry_count {
1133 let trigram = read_u32(&mut lookup_reader).ok()?;
1134 let offset = read_u64(&mut lookup_reader).ok()? as usize;
1135 let count = read_u32(&mut lookup_reader).ok()? as usize;
1136 if count > MAX_ENTRIES {
1137 return None;
1138 }
1139 let bytes_len = count.checked_mul(POSTING_BYTES)?;
1140 let end = offset.checked_add(bytes_len)?;
1141 if end > postings_blob.len() {
1142 return None;
1143 }
1144
1145 let mut trigram_postings = Vec::with_capacity(count);
1146 for chunk in postings_blob[offset..end].chunks_exact(6) {
1147 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1148 let posting = Posting {
1149 file_id,
1150 next_mask: chunk[4],
1151 loc_mask: chunk[5],
1152 };
1153 trigram_postings.push(posting.clone());
1154 file_trigrams.entry(file_id).or_default().push(trigram);
1155 }
1156 postings.insert(trigram, trigram_postings);
1157 }
1158
1159 Some(SearchIndex {
1160 postings,
1161 files,
1162 path_to_id,
1163 ready: false,
1164 project_root,
1165 git_head,
1166 max_file_size,
1167 ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1168 file_trigrams,
1169 unindexed_files,
1170 })
1171 }
1172
1173 pub fn stored_git_head(&self) -> Option<&str> {
1174 self.git_head.as_deref()
1175 }
1176
1177 pub(crate) fn set_ready(&mut self, ready: bool) {
1178 self.ready = ready;
1179 }
1180
1181 pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1182 self.git_head = current_head;
1183 verify_file_mtimes(self);
1184 self.ready = true;
1185 }
1186
1187 #[cfg(debug_assertions)]
1188 #[doc(hidden)]
1189 pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1190 self.verify_against_disk(current_head);
1191 }
1192
1193 pub(crate) fn rebuild_or_refresh(
1194 root: &Path,
1195 max_file_size: u64,
1196 current_head: Option<String>,
1197 baseline: Option<SearchIndex>,
1198 ) -> Self {
1199 if let Some(mut baseline) = baseline {
1200 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1201 baseline.max_file_size = max_file_size;
1202 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1203 if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1204 return SearchIndex::build_with_limit(root, max_file_size);
1205 }
1206 baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1207
1208 if baseline.git_head == current_head || current_head.is_none() {
1209 baseline.git_head = current_head;
1216 verify_file_mtimes(&mut baseline);
1217 baseline.ready = true;
1218 return baseline;
1219 }
1220
1221 if let (Some(previous), Some(current)) =
1222 (baseline.git_head.clone(), current_head.clone())
1223 {
1224 let project_root = baseline.project_root.clone();
1225 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1226 baseline.git_head = Some(current);
1227 verify_file_mtimes(&mut baseline);
1228 baseline.ready = true;
1229 return baseline;
1230 }
1231 }
1232 }
1233
1234 SearchIndex::build_with_limit(root, max_file_size)
1235 }
1236
1237 fn allocate_file_id_with_metadata(
1238 &mut self,
1239 path: &Path,
1240 metadata: SearchFileMetadata,
1241 ) -> Option<u32> {
1242 let file_id = u32::try_from(self.files.len()).ok()?;
1243 self.files.push(FileEntry {
1244 path: path.to_path_buf(),
1245 size: metadata.size,
1246 modified: metadata.modified,
1247 content_hash: cache_freshness::zero_hash(),
1248 });
1249 self.path_to_id.insert(path.to_path_buf(), file_id);
1250 Some(file_id)
1251 }
1252
1253 fn track_unindexed_file_with_metadata(
1254 &mut self,
1255 path: &Path,
1256 metadata: SearchFileMetadata,
1257 ) -> bool {
1258 let Some(file_id) = self.allocate_file_id_with_metadata(path, metadata) else {
1259 return false;
1260 };
1261 self.unindexed_files.insert(file_id);
1262 self.file_trigrams.insert(file_id, Vec::new());
1263 true
1264 }
1265
1266 fn active_file_ids(&self) -> Vec<u32> {
1267 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1268 ids.sort_unstable();
1269 ids
1270 }
1271
1272 fn is_active_file(&self, file_id: u32) -> bool {
1273 self.files
1274 .get(file_id as usize)
1275 .map(|file| !file.path.as_os_str().is_empty())
1276 .unwrap_or(false)
1277 }
1278
1279 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1280 let Some(postings) = self.postings.get(&trigram) else {
1281 return Vec::new();
1282 };
1283
1284 let mut matches = Vec::with_capacity(postings.len());
1285
1286 for posting in postings {
1287 if let Some(filter) = filter {
1288 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1291 continue;
1292 }
1293 }
1298 if self.is_active_file(posting.file_id) {
1299 matches.push(posting.file_id);
1300 }
1301 }
1302
1303 matches
1304 }
1305}
1306
1307fn search_candidate_file(
1308 file: &FileEntry,
1309 matcher: &SearchMatcher,
1310 max_results: usize,
1311 stop_after: usize,
1312 total_matches: &AtomicUsize,
1313 files_searched: &AtomicUsize,
1314 files_with_matches: &AtomicUsize,
1315 truncated: &AtomicBool,
1316 engine_capped: &AtomicBool,
1317 stop_scan: Option<&Arc<AtomicBool>>,
1318) -> Vec<SharedGrepMatch> {
1319 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1320 engine_capped.store(true, Ordering::Relaxed);
1321 return Vec::new();
1322 }
1323
1324 let content = match read_indexed_file_bytes(&file.path) {
1325 Some(content) => content,
1326 None => return Vec::new(),
1327 };
1328 if is_binary_bytes(&content) {
1335 return Vec::new();
1336 }
1337 files_searched.fetch_add(1, Ordering::Relaxed);
1338
1339 let shared_path = Arc::new(file.path.clone());
1340 let mut matches = Vec::new();
1341 let mut line_starts = None;
1342 let mut seen_lines = HashSet::new();
1343 let mut matched_this_file = false;
1344
1345 match matcher {
1346 SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1347 let needle = &literal.needle;
1348 let finder = memchr::memmem::Finder::new(needle);
1349 let mut start = 0;
1350
1351 while let Some(position) = finder.find(&content[start..]) {
1352 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1353 engine_capped.store(true, Ordering::Relaxed);
1354 break;
1355 }
1356
1357 let offset = start + position;
1358 start = offset + 1;
1359
1360 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1361 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1362 if !seen_lines.insert(line) {
1363 continue;
1364 }
1365
1366 matched_this_file = true;
1367 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1368 if match_number > max_results {
1369 truncated.store(true, Ordering::Relaxed);
1370 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1371 break;
1372 }
1373
1374 let end = offset + needle.len();
1375 matches.push(SharedGrepMatch {
1376 file: shared_path.clone(),
1377 line,
1378 column,
1379 line_text,
1380 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1381 });
1382 }
1383 }
1384 SearchMatcher::Literal(literal) => {
1385 let needle = &literal.needle;
1386 let search_content = content.to_ascii_lowercase();
1387 let finder = memchr::memmem::Finder::new(needle);
1388 let mut start = 0;
1389
1390 while let Some(position) = finder.find(&search_content[start..]) {
1391 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1392 engine_capped.store(true, Ordering::Relaxed);
1393 break;
1394 }
1395
1396 let offset = start + position;
1397 start = offset + 1;
1398
1399 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1400 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1401 if !seen_lines.insert(line) {
1402 continue;
1403 }
1404
1405 matched_this_file = true;
1406 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1407 if match_number > max_results {
1408 truncated.store(true, Ordering::Relaxed);
1409 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1410 break;
1411 }
1412
1413 let end = offset + needle.len();
1414 matches.push(SharedGrepMatch {
1415 file: shared_path.clone(),
1416 line,
1417 column,
1418 line_text,
1419 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1420 });
1421 }
1422 }
1423 SearchMatcher::Regex(regex) => {
1424 for matched in regex.find_iter(&content) {
1425 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1426 engine_capped.store(true, Ordering::Relaxed);
1427 break;
1428 }
1429
1430 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1431 let (line, column, line_text) =
1432 line_details_bytes(&content, line_starts, matched.start());
1433 if !seen_lines.insert(line) {
1434 continue;
1435 }
1436
1437 matched_this_file = true;
1438 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1439 if match_number > max_results {
1440 truncated.store(true, Ordering::Relaxed);
1441 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1442 break;
1443 }
1444
1445 matches.push(SharedGrepMatch {
1446 file: shared_path.clone(),
1447 line,
1448 column,
1449 line_text,
1450 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1451 });
1452 }
1453 }
1454 }
1455
1456 if matched_this_file {
1457 files_with_matches.fetch_add(1, Ordering::Relaxed);
1458 }
1459
1460 matches
1461}
1462
/// Returns `true` only when `truncated` is set and `total_matches` has
/// reached `stop_after`. The counter is not read while `truncated` is clear.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1470
1471fn grep_scan_should_stop(
1472 stop_scan: Option<&Arc<AtomicBool>>,
1473 truncated: &AtomicBool,
1474 total_matches: &AtomicUsize,
1475 stop_after: usize,
1476) -> bool {
1477 stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1478 || should_stop_search(truncated, total_matches, stop_after)
1479}
1480
/// Sets the `stop_scan` flag, when one is supplied, once `total_matches` has
/// reached `stop_after`. Does nothing when no flag is supplied.
fn signal_grep_scan_cap(
    stop_scan: Option<&Arc<AtomicBool>>,
    total_matches: &AtomicUsize,
    stop_after: usize,
) {
    let Some(flag) = stop_scan else {
        return;
    };
    if total_matches.load(Ordering::Relaxed) < stop_after {
        return;
    }
    flag.store(true, Ordering::Relaxed);
}
1492
1493fn search_file_metadata(metadata: &fs::Metadata) -> SearchFileMetadata {
1494 SearchFileMetadata {
1495 size: metadata.len(),
1496 modified: metadata.modified().unwrap_or(UNIX_EPOCH),
1497 }
1498}
1499
1500fn metadata_for_indexed_content(path: &Path, size_hint: u64) -> SearchFileMetadata {
1501 fs::metadata(path)
1502 .ok()
1503 .map(|metadata| search_file_metadata(&metadata))
1504 .unwrap_or(SearchFileMetadata {
1505 size: size_hint,
1506 modified: UNIX_EPOCH,
1507 })
1508}
1509
1510fn prepare_search_path(path: &Path, max_file_size: u64) -> PreparedSearchPath {
1511 let metadata = match fs::metadata(path) {
1512 Ok(metadata) if metadata.is_file() => search_file_metadata(&metadata),
1513 _ => return PreparedSearchPath::Skipped,
1514 };
1515
1516 if is_binary_path(path, metadata.size) || metadata.size > max_file_size {
1517 return PreparedSearchPath::Unindexed(metadata);
1518 }
1519
1520 let content = match fs::read(path) {
1521 Ok(content) => content,
1522 Err(_) => return PreparedSearchPath::Skipped,
1523 };
1524
1525 if is_binary_bytes(&content) {
1526 return PreparedSearchPath::Unindexed(metadata);
1527 }
1528
1529 PreparedSearchPath::Indexed(PreparedIndexedFile {
1530 metadata,
1531 content_hash: cache_freshness::hash_bytes(&content),
1532 trigram_map: trigram_filter_map(&content, true),
1533 })
1534}
1535
/// Returns half of the available parallelism, rounded up and clamped to
/// `1..=8`. A parallelism of 1 is assumed when it cannot be queried.
fn search_index_build_pool_size() -> usize {
    let cores = match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get(),
        Err(_) => 1,
    };
    let half_rounded_up = cores.div_ceil(2);
    half_rounded_up.clamp(1, 8)
}
1544
/// Merge-intersects two ascending id slices, returning the ids present in
/// both. Each equal pair consumes one element from each side.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let mut rest_left = left;
    let mut rest_right = right;

    while let (Some((&l, tail_left)), Some((&r, tail_right))) =
        (rest_left.split_first(), rest_right.split_first())
    {
        if l < r {
            rest_left = tail_left;
        } else if l > r {
            rest_right = tail_right;
        } else {
            common.push(l);
            rest_left = tail_left;
            rest_right = tail_right;
        }
    }

    common
}
1564
/// Merge-unions two ascending id slices. An id at the head of both sides is
/// emitted once; whatever remains on either side is appended unchanged.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut combined = Vec::with_capacity(left.len() + right.len());
    let mut rest_left = left;
    let mut rest_right = right;

    while let (Some((&l, tail_left)), Some((&r, tail_right))) =
        (rest_left.split_first(), rest_right.split_first())
    {
        if l < r {
            combined.push(l);
            rest_left = tail_left;
        } else if l > r {
            combined.push(r);
            rest_right = tail_right;
        } else {
            combined.push(l);
            rest_left = tail_left;
            rest_right = tail_right;
        }
    }

    // At most one of these is non-empty.
    combined.extend_from_slice(rest_left);
    combined.extend_from_slice(rest_right);
    combined
}
1592
1593pub fn decompose_regex(pattern: &str) -> RegexQuery {
1594 let hir = match regex_syntax::parse(pattern) {
1595 Ok(hir) => hir,
1596 Err(_) => return RegexQuery::default(),
1597 };
1598
1599 let build = build_query(&hir);
1600 build.into_query()
1601}
1602
/// Packs three bytes into the low 24 bits of a `u32`: `a` is the most
/// significant of the three and `c` the least.
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1606
/// Maps ASCII uppercase letters to lowercase; every other byte is returned
/// unchanged.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        // Setting bit 0x20 turns 'A'..='Z' into 'a'..='z'.
        c | 0x20
    } else {
        c
    }
}
1610
1611fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
1612 if content.len() < 3 {
1613 return;
1614 }
1615
1616 for start in 0..=content.len() - 3 {
1617 let trigram = pack_trigram(
1618 normalize_char(content[start]),
1619 normalize_char(content[start + 1]),
1620 normalize_char(content[start + 2]),
1621 );
1622 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1623 visit(trigram, next_char, start);
1624 }
1625}
1626
1627pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1628 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1629 scan_trigrams(content, |trigram, next_char, position| {
1630 trigrams.push((trigram, next_char, position));
1631 });
1632 trigrams
1633}
1634
1635fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
1636 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1637 scan_trigrams(content, |trigram, next_char, position| {
1638 let entry = filters.entry(trigram).or_default();
1639 if include_eof_next_char || next_char != EOF_SENTINEL {
1640 entry.next_mask |= mask_for_next_char(next_char);
1641 }
1642 entry.loc_mask |= mask_for_position(position);
1643 });
1644 filters
1645}
1646
1647pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1648 let mut seen = HashSet::new();
1649 let mut out = Vec::new();
1650 for token in tokens {
1651 scan_trigrams(token.as_bytes(), |trigram, _, _| {
1652 if seen.insert(trigram) {
1653 out.push(trigram);
1654 }
1655 });
1656 }
1657 out
1658}
1659
1660pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1661 if query_trigrams.is_empty() {
1662 return 0.0;
1663 }
1664
1665 let mut hits = 0u32;
1666 for &trigram in query_trigrams {
1667 if let Some(postings) = index.postings.get(&trigram) {
1668 if postings
1669 .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1670 .is_ok()
1671 {
1672 hits += 1;
1673 }
1674 }
1675 }
1676
1677 if hits == 0 {
1678 return 0.0;
1679 }
1680
1681 let file_trigram_count = index
1682 .file_trigrams
1683 .get(&file_id)
1684 .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1685 (hits as f32) / (1.0 + file_trigram_count.ln())
1686}
1687
1688pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1689 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1691 return PathBuf::from(override_dir)
1692 .join("index")
1693 .join(artifact_cache_key(project_root));
1694 }
1695 if let Some(dir) = storage_dir {
1697 return dir.join("index").join(artifact_cache_key(project_root));
1698 }
1699 let home = std::env::var_os("HOME")
1704 .or_else(|| std::env::var_os("USERPROFILE"))
1705 .map(PathBuf::from)
1706 .unwrap_or_else(std::env::temp_dir);
1707 home.join(".cache")
1708 .join("aft")
1709 .join("index")
1710 .join(artifact_cache_key(project_root))
1711}
1712
1713pub(crate) fn build_path_filters(
1714 include: &[String],
1715 exclude: &[String],
1716) -> Result<PathFilters, String> {
1717 Ok(PathFilters {
1718 includes: build_globset(include)?,
1719 excludes: build_globset(exclude)?,
1720 })
1721}
1722
1723pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1724 walk_project_files_from(root, root, filters)
1725}
1726
1727pub fn walk_project_files_bounded_default(
1728 root: &Path,
1729 max_files: usize,
1730) -> Result<Vec<PathBuf>, usize> {
1731 walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
1732}
1733
1734pub(crate) fn walk_project_files_bounded_matching<F>(
1735 root: &Path,
1736 filters: &PathFilters,
1737 max_files: usize,
1738 matches_file: F,
1739) -> Result<Vec<PathBuf>, usize>
1740where
1741 F: Fn(&Path) -> bool,
1742{
1743 walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
1744}
1745
1746pub fn walk_project_files_bounded_default_matching<F>(
1747 root: &Path,
1748 max_files: usize,
1749 matches_file: F,
1750) -> Result<Vec<PathBuf>, usize>
1751where
1752 F: Fn(&Path) -> bool,
1753{
1754 walk_project_files_from_inner_matching(
1755 root,
1756 root,
1757 &PathFilters::default(),
1758 Some(max_files),
1759 matches_file,
1760 true,
1761 )
1762}
1763
1764pub(crate) fn walk_project_files_from(
1765 filter_root: &Path,
1766 search_root: &Path,
1767 filters: &PathFilters,
1768) -> Vec<PathBuf> {
1769 walk_project_files_from_inner(filter_root, search_root, filters, None, true)
1770 .expect("unbounded project walk cannot exceed a file limit")
1771}
1772
1773pub(crate) fn has_any_project_file_from(
1774 filter_root: &Path,
1775 search_root: &Path,
1776 filters: &PathFilters,
1777) -> bool {
1778 walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
1779}
1780
1781fn walk_project_files_from_inner(
1782 filter_root: &Path,
1783 search_root: &Path,
1784 filters: &PathFilters,
1785 max_files: Option<usize>,
1786 sort_by_mtime: bool,
1787) -> Result<Vec<PathBuf>, usize> {
1788 walk_project_files_from_inner_matching(
1789 filter_root,
1790 search_root,
1791 filters,
1792 max_files,
1793 |_| true,
1794 sort_by_mtime,
1795 )
1796}
1797
1798fn project_walk_builder(search_root: &Path) -> WalkBuilder {
1799 let mut builder = WalkBuilder::new(search_root);
1800 builder
1801 .hidden(false)
1802 .git_ignore(true)
1803 .git_global(true)
1804 .git_exclude(true)
1805 .add_custom_ignore_filename(".aftignore")
1806 .filter_entry(|entry| {
1807 let name = entry.file_name().to_string_lossy();
1808 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1809 return !matches!(
1810 name.as_ref(),
1811 "node_modules"
1812 | "target"
1813 | "venv"
1814 | ".venv"
1815 | ".git"
1816 | "__pycache__"
1817 | ".tox"
1818 | "dist"
1819 | "build"
1820 );
1821 }
1822 true
1823 });
1824 builder
1825}
1826
1827fn walk_project_files_from_inner_matching<F>(
1828 filter_root: &Path,
1829 search_root: &Path,
1830 filters: &PathFilters,
1831 max_files: Option<usize>,
1832 matches_file: F,
1833 sort_by_mtime: bool,
1834) -> Result<Vec<PathBuf>, usize>
1835where
1836 F: Fn(&Path) -> bool,
1837{
1838 let builder = project_walk_builder(search_root);
1839
1840 let mut files = Vec::new();
1841 for entry in builder.build().filter_map(|entry| entry.ok()) {
1842 if !entry
1843 .file_type()
1844 .map_or(false, |file_type| file_type.is_file())
1845 {
1846 continue;
1847 }
1848 let path = entry.into_path();
1849 if filters.matches(filter_root, &path) && matches_file(&path) {
1850 files.push(path);
1851 if max_files.is_some_and(|limit| files.len() > limit) {
1852 return Err(files.len());
1853 }
1854 }
1855 }
1856
1857 if sort_by_mtime {
1858 sort_paths_by_mtime_desc(&mut files);
1859 }
1860 Ok(files)
1861}
1862
1863pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1864 let bytes = fs::read(path).ok()?;
1865 if is_binary_bytes(&bytes) {
1866 return None;
1867 }
1868 String::from_utf8(bytes).ok()
1869}
1870
/// Reads the whole file at `path`, returning `None` on any I/O error.
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    match fs::read(path) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
1874
/// Returns `path` with the `root` prefix removed, or `path` unchanged when it
/// does not start with `root`.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1880
1881pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1882 let normalized_root = normalize_path(root);
1883 let normalized_path = normalize_path(path);
1884 let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1885 validate_cached_relative_path(relative)
1886}
1887
1888pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1889 let relative = validate_cached_relative_path(relative_path)?;
1890 let normalized_root = normalize_path(root);
1891 let full_path = normalize_path(&normalized_root.join(relative));
1892
1893 match fs::canonicalize(&full_path) {
1894 Ok(canonical_path) => {
1895 if canonical_path.starts_with(&normalized_root) {
1896 return Some(full_path);
1897 }
1898
1899 let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1900 canonical_path
1901 .starts_with(&canonical_root)
1902 .then_some(full_path)
1903 }
1904 Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1905 }
1906}
1907
/// Validates a relative path read from the cache and returns it with `.`
/// components removed.
///
/// Returns `None` for absolute paths, for paths containing `..`, a root or a
/// prefix component, and for paths that end up empty.
pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
    if path.is_absolute() {
        return None;
    }

    let cleaned = path
        .components()
        .try_fold(PathBuf::new(), |mut acc, component| match component {
            Component::Normal(part) => {
                acc.push(part);
                Some(acc)
            }
            Component::CurDir => Some(acc),
            Component::ParentDir | Component::RootDir | Component::Prefix(_) => None,
        })?;

    if cleaned.as_os_str().is_empty() {
        None
    } else {
        Some(cleaned)
    }
}
1923
1924pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1937 use std::collections::HashMap;
1938 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1939 for path in paths.iter() {
1940 mtimes
1941 .entry(path.clone())
1942 .or_insert_with(|| path_modified_time(path));
1943 }
1944 paths.sort_by(|left, right| {
1945 let left_mtime = mtimes.get(left).and_then(|v| *v);
1946 let right_mtime = mtimes.get(right).and_then(|v| *v);
1947 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1948 });
1949}
1950
1951pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1954 use std::collections::HashMap;
1955 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1956 for m in matches.iter() {
1957 mtimes.entry(m.file.clone()).or_insert_with(|| {
1958 let resolved = resolve_match_path(project_root, &m.file);
1959 path_modified_time(&resolved)
1960 });
1961 }
1962 matches.sort_by(|left, right| {
1963 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1964 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1965 right_mtime
1966 .cmp(&left_mtime)
1967 .then_with(|| left.file.cmp(&right.file))
1968 .then_with(|| left.line.cmp(&right.line))
1969 .then_with(|| left.column.cmp(&right.column))
1970 });
1971}
1972
1973fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1978 matches: &mut [SharedGrepMatch],
1979 modified_for_path: F,
1980) where
1981 F: Fn(&Path) -> Option<SystemTime>,
1982{
1983 use std::collections::HashMap;
1984 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1985 for m in matches.iter() {
1986 let path = m.file.as_path().to_path_buf();
1987 mtimes
1988 .entry(path.clone())
1989 .or_insert_with(|| modified_for_path(&path));
1990 }
1991 matches.sort_by(|left, right| {
1992 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1993 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1994 right_mtime
1995 .cmp(&left_mtime)
1996 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1997 .then_with(|| left.line.cmp(&right.line))
1998 .then_with(|| left.column.cmp(&right.column))
1999 });
2000}
2001
2002pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
2003 let resolved_project_root = canonicalize_or_normalize(project_root);
2004 let root = match path {
2005 Some(path) => {
2006 let path = PathBuf::from(path);
2007 if path.is_absolute() {
2008 canonicalize_or_normalize(&path)
2009 } else {
2010 normalize_path(&resolved_project_root.join(path))
2011 }
2012 }
2013 None => resolved_project_root.clone(),
2014 };
2015
2016 let use_index = is_within_search_root(&resolved_project_root, &root);
2017 SearchScope { root, use_index }
2018}
2019
2020pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
2021 content_inspector::inspect(content).is_binary()
2022}
2023
2024pub(crate) fn current_git_head(root: &Path) -> Option<String> {
2025 run_git(root, &["rev-parse", "HEAD"])
2026}
2027
2028pub fn artifact_cache_key(project_root: &Path) -> String {
2040 use sha2::{Digest, Sha256};
2041
2042 let mut hasher = Sha256::new();
2043
2044 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
2045 hasher.update(root_commit.as_bytes());
2048 } else {
2049 let canonical_root = canonicalize_or_normalize(project_root);
2051 hasher.update(canonical_root.to_string_lossy().as_bytes());
2052 }
2053
2054 let digest = format!("{:x}", hasher.finalize());
2055 digest[..16].to_string()
2056}
2057
2058pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
2066 use sha2::{Digest, Sha256};
2067
2068 let root = canonicalize_or_normalize(project_root);
2069 let mut files = Vec::new();
2070 collect_ignore_rule_files(&root, &mut files);
2071 if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
2072 if global_ignore.is_file() {
2073 files.push(global_ignore);
2074 }
2075 }
2076 let info_exclude = git_info_exclude_path(&root);
2077 if info_exclude.is_file() {
2078 files.push(info_exclude);
2079 }
2080 files.sort();
2081 files.dedup();
2082
2083 let mut hasher = Sha256::new();
2084 hasher.update(b"aft-ignore-rules-v1\0");
2085 for path in files {
2086 if let Some(relative) = cache_relative_path(&root, &path) {
2087 hasher.update(relative.to_string_lossy().as_bytes());
2088 } else {
2089 hasher.update(path.to_string_lossy().as_bytes());
2090 }
2091 hasher.update(b"\0");
2092 match fs::read(&path) {
2093 Ok(bytes) => hasher.update(&bytes),
2094 Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
2095 }
2096 hasher.update(b"\0");
2097 }
2098
2099 format!("{:x}", hasher.finalize())
2100}
2101
2102fn git_info_exclude_path(root: &Path) -> PathBuf {
2103 run_git(
2104 root,
2105 &["rev-parse", "--path-format=absolute", "--git-common-dir"],
2106 )
2107 .map(PathBuf::from)
2108 .unwrap_or_else(|| root.join(".git"))
2109 .join("info")
2110 .join("exclude")
2111}
2112
2113fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
2114 let mut builder = WalkBuilder::new(root);
2115 builder
2116 .hidden(false)
2117 .git_ignore(true)
2118 .git_global(true)
2119 .git_exclude(true)
2120 .add_custom_ignore_filename(".aftignore")
2121 .filter_entry(|entry| {
2122 let name = entry.file_name().to_string_lossy();
2123 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2124 return !matches!(
2125 name.as_ref(),
2126 ".git"
2127 | "node_modules"
2128 | "target"
2129 | "venv"
2130 | ".venv"
2131 | "__pycache__"
2132 | ".tox"
2133 | "dist"
2134 | "build"
2135 );
2136 }
2137 true
2138 });
2139
2140 for entry in builder.build().filter_map(|entry| entry.ok()) {
2141 if !entry
2142 .file_type()
2143 .map_or(false, |file_type| file_type.is_file())
2144 {
2145 continue;
2146 }
2147 let file_name = entry.file_name();
2148 if file_name == ".gitignore" || file_name == ".aftignore" {
2149 files.push(entry.into_path());
2150 }
2151 }
2152}
2153
/// Test-only helper: counts the directory entries yielded by an
/// ignore-aware walk of `root`.
///
/// The walker enables gitignore, global and exclude rules plus `.aftignore`,
/// and does not skip hidden entries. It installs no entry filter.
#[cfg(test)]
pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
    let mut builder = WalkBuilder::new(root);
    builder
        .hidden(false)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .add_custom_ignore_filename(".aftignore");
    builder
        .build()
        .filter_map(|entry| entry.ok())
        .filter(|entry| entry.file_type().is_some_and(|kind| kind.is_dir()))
        .count()
}
2172
/// Test-only helper: counts the directories visited by a manual stack-based
/// traversal starting at `root`.
///
/// `root` itself is always counted, even when it cannot be read. Below it,
/// only real (non-symlink) directories are descended into, and directories
/// with one of the names listed below are skipped. Ignore files are not
/// interpreted by this traversal.
#[cfg(test)]
pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
    const SKIPPED_DIR_NAMES: [&str; 9] = [
        ".git",
        "node_modules",
        "target",
        "venv",
        ".venv",
        "__pycache__",
        ".tox",
        "dist",
        "build",
    ];

    let mut pending = vec![root.to_path_buf()];
    let mut visited = 0usize;
    while let Some(dir) = pending.pop() {
        visited += 1;
        let Ok(entries) = fs::read_dir(&dir) else {
            continue;
        };
        for entry in entries.flatten() {
            let file_name = entry.file_name();
            if file_name == ".gitignore" || file_name == ".aftignore" {
                continue;
            }
            let Ok(file_type) = entry.file_type() else {
                continue;
            };
            let is_real_dir = file_type.is_dir() && !file_type.is_symlink();
            if !is_real_dir {
                continue;
            }
            // Names that are not valid UTF-8 compare as "" and are never skipped.
            let name = file_name.to_str().unwrap_or("");
            if SKIPPED_DIR_NAMES.contains(&name) {
                continue;
            }
            pending.push(entry.path());
        }
    }
    visited
}
2214
2215impl PathFilters {
2216 pub(crate) fn matches(&self, root: &Path, path: &Path) -> bool {
2217 let relative = to_glob_path(&relative_to_root(root, path));
2218 if self
2219 .includes
2220 .as_ref()
2221 .is_some_and(|includes| !includes.is_match(&relative))
2222 {
2223 return false;
2224 }
2225 if self
2226 .excludes
2227 .as_ref()
2228 .is_some_and(|excludes| excludes.is_match(&relative))
2229 {
2230 return false;
2231 }
2232 true
2233 }
2234}
2235
2236fn canonicalize_or_normalize(path: &Path) -> PathBuf {
2237 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
2238}
2239
/// Resolves `path` against `project_root`: absolute paths are returned as-is,
/// relative paths are joined onto the project root.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if !path.is_absolute() {
        return project_root.join(path);
    }
    path.to_path_buf()
}
2247
/// Returns the last-modified timestamp of `path`, or `None` when the metadata
/// cannot be read or carries no modification time.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
2253
/// Lexically normalizes `path` without touching the filesystem.
///
/// * `.` components are dropped.
/// * `..` removes the preceding normal component when there is one.
/// * `..` directly below the root is dropped (the root has no parent).
/// * `..` that cannot be resolved (empty result, or a result that already
///   ends in `..`) is preserved, so `../../b` stays `../../b`.
///
/// Symlinks are not resolved; use `fs::canonicalize` when that matters.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {}
            Component::ParentDir => match result.components().next_back() {
                // Only a real directory name can be cancelled by `..`.
                Some(Component::Normal(_)) => {
                    result.pop();
                }
                // `/..` is `/`: there is nothing above the root.
                Some(Component::RootDir) => {}
                // Empty result, a trailing `..`, or a bare prefix: `PathBuf::pop`
                // would wrongly swallow an earlier `..`, so keep this one.
                _ => result.push(component),
            },
            _ => result.push(component),
        }
    }
    result
}
2269
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// If the full path cannot be canonicalized, the parent directory is
/// canonicalized instead and the original file name is re-attached. When the
/// path has no parent or file name, or the parent cannot be canonicalized
/// either, the input is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    match fs::canonicalize(path) {
        Ok(canonical) => canonical,
        Err(_) => match (path.parent(), path.file_name()) {
            (Some(parent), Some(leaf)) => match fs::canonicalize(parent) {
                Ok(canonical_parent) => canonical_parent.join(leaf),
                Err(_) => path.to_path_buf(),
            },
            _ => path.to_path_buf(),
        },
    }
}
2286
2287fn verify_file_mtimes(index: &mut SearchIndex) {
2290 let filters = PathFilters::default();
2291 let current_files = walk_project_files(&index.project_root, &filters);
2292 let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
2293 let mut stale_paths = Vec::new();
2294 let mut removed_paths = Vec::new();
2295
2296 for entry in &mut index.files {
2297 if entry.path.as_os_str().is_empty() {
2298 continue; }
2300 if !current_file_set.contains(&entry.path) {
2301 removed_paths.push(entry.path.clone());
2302 continue;
2303 }
2304 let cached = FileFreshness {
2305 mtime: entry.modified,
2306 size: entry.size,
2307 content_hash: entry.content_hash,
2308 };
2309 match cache_freshness::verify_file_strict(&entry.path, &cached) {
2310 FreshnessVerdict::HotFresh => {}
2311 FreshnessVerdict::ContentFresh {
2312 new_mtime,
2313 new_size,
2314 } => {
2315 entry.modified = new_mtime;
2316 entry.size = new_size;
2317 }
2318 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
2319 stale_paths.push(entry.path.clone())
2320 }
2321 }
2322 }
2323
2324 for path in &removed_paths {
2325 index.remove_file(path);
2326 }
2327
2328 for path in &stale_paths {
2332 if current_file_set.contains(path) {
2333 index.update_file(path);
2334 } else {
2335 index.remove_file(path);
2336 }
2337 }
2338
2339 for path in current_files {
2341 if !index.path_to_id.contains_key(&path) {
2342 index.update_file(&path);
2343 }
2344 }
2345
2346 if !stale_paths.is_empty() {
2347 crate::slog_info!(
2348 "search index: refreshed {} stale file(s) from disk cache",
2349 stale_paths.len()
2350 );
2351 }
2352}
2353
2354fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2355 normalize_path(path).starts_with(normalize_path(search_root))
2356}
2357
2358impl QueryBuild {
2359 fn into_query(self) -> RegexQuery {
2360 let mut query = RegexQuery::default();
2361
2362 for run in self.and_runs {
2363 add_run_to_and_query(&mut query, &run);
2364 }
2365
2366 for group in self.or_groups {
2367 let mut trigrams = BTreeSet::new();
2368 let mut filters = HashMap::new();
2369 for run in group {
2370 for (trigram, filter) in trigram_filters(&run) {
2371 trigrams.insert(trigram);
2372 merge_filter(filters.entry(trigram).or_default(), filter);
2373 }
2374 }
2375 if !trigrams.is_empty() {
2376 query.or_groups.push(trigrams.into_iter().collect());
2377 query.or_filters.push(filters);
2378 }
2379 }
2380
2381 query
2382 }
2383}
2384
2385fn build_query(hir: &Hir) -> QueryBuild {
2386 match hir.kind() {
2387 HirKind::Literal(literal) => {
2388 if literal.0.len() >= 3 {
2389 QueryBuild {
2390 and_runs: vec![literal.0.to_vec()],
2391 or_groups: Vec::new(),
2392 }
2393 } else {
2394 QueryBuild::default()
2395 }
2396 }
2397 HirKind::Capture(capture) => build_query(&capture.sub),
2398 HirKind::Concat(parts) => {
2399 let mut build = QueryBuild::default();
2400 for part in parts {
2401 let part_build = build_query(part);
2402 build.and_runs.extend(part_build.and_runs);
2403 build.or_groups.extend(part_build.or_groups);
2404 }
2405 build
2406 }
2407 HirKind::Alternation(parts) => {
2408 let mut group = Vec::new();
2409 for part in parts {
2410 let Some(mut choices) = guaranteed_run_choices(part) else {
2411 return QueryBuild::default();
2412 };
2413 group.append(&mut choices);
2414 }
2415 if group.is_empty() {
2416 QueryBuild::default()
2417 } else {
2418 QueryBuild {
2419 and_runs: Vec::new(),
2420 or_groups: vec![group],
2421 }
2422 }
2423 }
2424 HirKind::Repetition(repetition) => {
2425 if repetition.min == 0 {
2426 QueryBuild::default()
2427 } else {
2428 build_query(&repetition.sub)
2429 }
2430 }
2431 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2432 }
2433}
2434
2435fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2436 match hir.kind() {
2437 HirKind::Literal(literal) => {
2438 if literal.0.len() >= 3 {
2439 Some(vec![literal.0.to_vec()])
2440 } else {
2441 None
2442 }
2443 }
2444 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2445 HirKind::Concat(parts) => {
2446 let mut runs = Vec::new();
2447 for part in parts {
2448 if let Some(mut part_runs) = guaranteed_run_choices(part) {
2449 runs.append(&mut part_runs);
2450 }
2451 }
2452 if runs.is_empty() {
2453 None
2454 } else {
2455 Some(runs)
2456 }
2457 }
2458 HirKind::Alternation(parts) => {
2459 let mut runs = Vec::new();
2460 for part in parts {
2461 let Some(mut part_runs) = guaranteed_run_choices(part) else {
2462 return None;
2463 };
2464 runs.append(&mut part_runs);
2465 }
2466 if runs.is_empty() {
2467 None
2468 } else {
2469 Some(runs)
2470 }
2471 }
2472 HirKind::Repetition(repetition) => {
2473 if repetition.min == 0 {
2474 None
2475 } else {
2476 guaranteed_run_choices(&repetition.sub)
2477 }
2478 }
2479 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2480 }
2481}
2482
2483fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2484 for (trigram, filter) in trigram_filters(run) {
2485 if !query.and_trigrams.contains(&trigram) {
2486 query.and_trigrams.push(trigram);
2487 }
2488 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2489 }
2490}
2491
2492fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2493 trigram_filter_map(run, false).into_iter().collect()
2494}
2495
2496fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2497 target.next_mask |= filter.next_mask;
2498 target.loc_mask |= filter.loc_mask;
2499}
2500
2501fn mask_for_next_char(next_char: u8) -> u8 {
2502 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2503 1u8 << bit
2504}
2505
/// Maps a position to a single-bit mask: bit `position mod 8` is set.
fn mask_for_position(position: usize) -> u8 {
    let bit = position & 0b111;
    0b0000_0001u8 << bit
}
2509
2510fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2511 if patterns.is_empty() {
2512 return Ok(None);
2513 }
2514
2515 let mut builder = GlobSetBuilder::new();
2516 for pattern in patterns {
2517 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2518 builder.add(glob);
2519 }
2520 builder.build().map(Some).map_err(|error| error.to_string())
2521}
2522
/// Reads exactly four bytes from `reader` and decodes them as a
/// little-endian `u32`. Fails if the reader cannot supply four bytes.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut raw = [0u8; std::mem::size_of::<u32>()];
    reader
        .read_exact(&mut raw)
        .map(|()| u32::from_le_bytes(raw))
}
2528
/// Reads exactly eight bytes from `reader` and decodes them as a
/// little-endian `u64`. Fails if the reader cannot supply eight bytes.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut raw = [0u8; std::mem::size_of::<u64>()];
    reader
        .read_exact(&mut raw)
        .map(|()| u64::from_le_bytes(raw))
}
2534
/// Writes `value` to `writer` as four little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let encoded = value.to_le_bytes();
    writer.write_all(&encoded)
}
2538
/// Writes `value` to `writer` as eight little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let encoded = value.to_le_bytes();
    writer.write_all(&encoded)
}
2542
2543fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2544 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2545 return Err(std::io::Error::other("search index checksum missing"));
2546 };
2547 let expected = u32::from_le_bytes(*stored);
2548 let actual = crc32fast::hash(body);
2549 if actual != expected {
2550 return Err(std::io::Error::other("search index checksum mismatch"));
2551 }
2552 Ok(())
2553}
2554
/// Returns how many bytes lie between the reader's current position and
/// `total_len`.
///
/// Yields `None` when the position cannot be queried, does not fit in
/// `usize`, or is already past `total_len`.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    let consumed = usize::try_from(position).ok()?;
    total_len.checked_sub(consumed)
}
2559
/// Runs `git -C <root> <args...>` and returns its trimmed standard output.
///
/// Returns `None` when git cannot be spawned, exits unsuccessfully, prints
/// non-UTF-8 output, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let mut command = Command::new("git");
    command.arg("-C").arg(root).args(args);

    let output = command
        .output()
        .ok()
        .filter(|output| output.status.success())?;
    let stdout = String::from_utf8(output.stdout).ok()?;

    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
2578
2579fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2580 let diff_range = format!("{}..{}", from, to);
2581 let output = match Command::new("git")
2582 .arg("-C")
2583 .arg(root)
2584 .args(["diff", "--name-status", "-M", &diff_range])
2585 .output()
2586 {
2587 Ok(output) => output,
2588 Err(_) => return false,
2589 };
2590
2591 if !output.status.success() {
2592 return false;
2593 }
2594
2595 let Ok(diff) = String::from_utf8(output.stdout) else {
2596 return false;
2597 };
2598
2599 for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2600 let mut fields = line.split('\t');
2601 let Some(status) = fields.next() else {
2602 continue;
2603 };
2604
2605 if status.starts_with('R') {
2606 let Some(old_path) = fields
2607 .next()
2608 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2609 else {
2610 continue;
2611 };
2612 let Some(new_path) = fields
2613 .next()
2614 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2615 else {
2616 continue;
2617 };
2618 index.remove_file(&old_path);
2619 index.update_file(&new_path);
2620 continue;
2621 }
2622
2623 let Some(path) = fields
2624 .next()
2625 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2626 else {
2627 continue;
2628 };
2629 if status.starts_with('D') || !path.exists() {
2630 index.remove_file(&path);
2631 } else {
2632 index.update_file(&path);
2633 }
2634 }
2635
2636 true
2637}
2638
2639fn is_binary_path(path: &Path, size: u64) -> bool {
2640 if size == 0 {
2641 return false;
2642 }
2643
2644 let mut file = match File::open(path) {
2645 Ok(file) => file,
2646 Err(_) => return true,
2647 };
2648
2649 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2650 match file.read(&mut preview) {
2651 Ok(read) => is_binary_bytes(&preview[..read]),
2652 Err(_) => true,
2653 }
2654}
2655
/// Returns the byte offset at which each line of `content` begins.
///
/// The first entry is always `0`; every `\n` adds the offset of the byte
/// that follows it (which may equal `content.len()`).
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    let after_newlines = content
        .iter()
        .enumerate()
        .filter(|(_, byte)| **byte == b'\n')
        .map(|(position, _)| position + 1);
    std::iter::once(0usize).chain(after_newlines).collect()
}
2665
/// Describes the line containing byte `offset` of `content`.
///
/// `line_starts` holds the byte offset at which each line begins, in
/// ascending order. Returns `(line, column, text)` where:
/// * `line` is the 1-based line number;
/// * `column` is 1-based and counts characters of the lossily UTF-8 decoded
///   text between the line start and `offset`;
/// * `text` is the lossily decoded line without its trailing `\n` / `\r\n`.
///
/// An `offset` past the end of `content` is clamped to `content.len()` and a
/// line start beyond the offset is clamped to it, so inconsistent input yields
/// a best-effort answer instead of a slice-bounds panic. Line and column
/// numbers saturate at `u32::MAX`.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    let offset = offset.min(content.len());
    let line_index = match line_starts.binary_search(&offset) {
        Ok(index) => index,
        // `index` is the insertion point; the containing line starts just before it.
        Err(index) => index.saturating_sub(1),
    };
    let line_start = line_starts
        .get(line_index)
        .copied()
        .unwrap_or(0)
        .min(offset);
    let line_end = content[line_start..]
        .iter()
        .position(|byte| *byte == b'\n')
        .map_or(content.len(), |length| line_start + length);

    let line_slice = &content[line_start..line_end];
    let line_slice = line_slice.strip_suffix(b"\r").unwrap_or(line_slice);
    let line_text = String::from_utf8_lossy(line_slice).into_owned();

    let chars_before = String::from_utf8_lossy(&content[line_start..offset])
        .chars()
        .count();
    let line_number = u32::try_from(line_index).map_or(u32::MAX, |index| index.saturating_add(1));
    let column = u32::try_from(chars_before).map_or(u32::MAX, |count| count.saturating_add(1));

    (line_number, column, line_text)
}
2688
/// Renders `path` as a lossily decoded string with every backslash replaced
/// by a forward slash.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
2692
2693#[cfg(test)]
2694mod tests {
2695 use std::process::Command;
2696
2697 use super::*;
2698
2699 #[test]
2700 fn cached_path_under_root_allows_missing_lexical_child() {
2701 let dir = tempfile::tempdir().expect("create temp dir");
2702 let project = dir.path().join("project");
2703 fs::create_dir_all(&project).expect("create project dir");
2704 let root = fs::canonicalize(&project).expect("canonicalize project");
2705
2706 let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2707 .expect("missing child should fall back to lexical validation");
2708
2709 assert_eq!(path, root.join("future/file.rs"));
2710 }
2711
2712 #[cfg(unix)]
2713 #[test]
2714 fn cached_path_under_root_rejects_symlink_escape() {
2715 let dir = tempfile::tempdir().expect("create temp dir");
2716 let project = dir.path().join("project");
2717 let outside = dir.path().join("outside");
2718 fs::create_dir_all(&project).expect("create project dir");
2719 fs::create_dir_all(&outside).expect("create outside dir");
2720 fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2721 std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2722 let root = fs::canonicalize(&project).expect("canonicalize project");
2723
2724 assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2725 }
2726
2727 #[test]
2728 fn extract_trigrams_tracks_next_char_and_position() {
2729 let trigrams = extract_trigrams(b"Rust");
2730 assert_eq!(trigrams.len(), 2);
2731 assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2732 assert_eq!(
2733 trigrams[1],
2734 (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2735 );
2736 }
2737
2738 #[test]
2739 fn index_file_trigram_filters_match_legacy_extraction() {
2740 let dir = tempfile::tempdir().expect("create temp dir");
2741 let path = dir.path().join("sample.txt");
2742 let content = b"Rust rust RUST\nxy";
2743 fs::write(&path, content).expect("write sample");
2744
2745 let mut expected = BTreeMap::new();
2746 for (trigram, next_char, position) in extract_trigrams(content) {
2747 let entry: &mut PostingFilter = expected.entry(trigram).or_default();
2748 entry.next_mask |= mask_for_next_char(next_char);
2749 entry.loc_mask |= mask_for_position(position);
2750 }
2751
2752 let mut index = SearchIndex::new();
2753 index.project_root = dir.path().to_path_buf();
2754 index.index_file(&path, content);
2755
2756 let file_id = *index.path_to_id.get(&path).expect("file indexed");
2757 let file_trigrams = index.file_trigrams.get(&file_id).expect("file trigrams");
2758 assert_eq!(file_trigrams, &expected.keys().copied().collect::<Vec<_>>());
2759 for (trigram, filter) in expected {
2760 let postings = index.postings.get(&trigram).expect("posting list");
2761 assert_eq!(postings.len(), 1);
2762 assert_eq!(postings[0].file_id, file_id);
2763 assert_eq!(postings[0].next_mask, filter.next_mask);
2764 assert_eq!(postings[0].loc_mask, filter.loc_mask);
2765 }
2766 }
2767
2768 #[test]
2769 fn decompose_regex_extracts_literals_and_alternations() {
2770 let query = decompose_regex("abc(def|ghi)xyz");
2771 assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2772 assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2773 assert_eq!(query.or_groups.len(), 1);
2774 assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2775 assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2776 }
2777
2778 #[test]
2779 fn candidates_intersect_posting_lists() {
2780 let mut index = SearchIndex::new();
2781 let dir = tempfile::tempdir().expect("create temp dir");
2782 let alpha = dir.path().join("alpha.txt");
2783 let beta = dir.path().join("beta.txt");
2784 fs::write(&alpha, "abcdef").expect("write alpha");
2785 fs::write(&beta, "abcxyz").expect("write beta");
2786 index.project_root = dir.path().to_path_buf();
2787 index.index_file(&alpha, b"abcdef");
2788 index.index_file(&beta, b"abcxyz");
2789
2790 let query = RegexQuery {
2791 and_trigrams: vec![
2792 pack_trigram(b'a', b'b', b'c'),
2793 pack_trigram(b'd', b'e', b'f'),
2794 ],
2795 ..RegexQuery::default()
2796 };
2797
2798 let candidates = index.candidates(&query);
2799 assert_eq!(candidates.len(), 1);
2800 assert_eq!(index.files[candidates[0] as usize].path, alpha);
2801 }
2802
2803 #[test]
2804 fn candidates_apply_bloom_filters() {
2805 let mut index = SearchIndex::new();
2806 let dir = tempfile::tempdir().expect("create temp dir");
2807 let file = dir.path().join("sample.txt");
2808 fs::write(&file, "abcd efgh").expect("write sample");
2809 index.project_root = dir.path().to_path_buf();
2810 index.index_file(&file, b"abcd efgh");
2811
2812 let trigram = pack_trigram(b'a', b'b', b'c');
2813 let matching_filter = PostingFilter {
2814 next_mask: mask_for_next_char(b'd'),
2815 loc_mask: mask_for_position(0),
2816 };
2817 let non_matching_filter = PostingFilter {
2818 next_mask: mask_for_next_char(b'z'),
2819 loc_mask: mask_for_position(0),
2820 };
2821
2822 assert_eq!(
2823 index
2824 .postings_for_trigram(trigram, Some(matching_filter))
2825 .len(),
2826 1
2827 );
2828 assert!(index
2829 .postings_for_trigram(trigram, Some(non_matching_filter))
2830 .is_empty());
2831 }
2832
2833 #[test]
2834 fn disk_round_trip_preserves_postings_and_files() {
2835 let dir = tempfile::tempdir().expect("create temp dir");
2836 let project = dir.path().join("project");
2837 fs::create_dir_all(&project).expect("create project dir");
2838 let file = project.join("src.txt");
2839 fs::write(&file, "abcdef").expect("write source");
2840
2841 let mut index = SearchIndex::build(&project);
2842 index.git_head = Some("deadbeef".to_string());
2843 let cache_dir = dir.path().join("cache");
2844 index.write_to_disk(&cache_dir, index.git_head.as_deref());
2845
2846 let loaded =
2847 SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2848 assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2849 assert_eq!(loaded.files.len(), 1);
2850 assert_eq!(
2851 relative_to_root(&loaded.project_root, &loaded.files[0].path),
2852 PathBuf::from("src.txt")
2853 );
2854 assert_eq!(loaded.postings.len(), index.postings.len());
2855 assert!(loaded
2856 .postings
2857 .contains_key(&pack_trigram(b'a', b'b', b'c')));
2858 }
2859
2860 #[test]
2861 fn cache_path_helpers_reject_absolute_and_parent_paths() {
2862 let root = PathBuf::from("/tmp/aft-project");
2863
2864 assert_eq!(
2865 cache_relative_path(&root, &root.join("src/lib.rs")),
2866 Some(PathBuf::from("src/lib.rs"))
2867 );
2868 assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2869 assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2870 assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2871 assert_eq!(
2872 cached_path_under_root(&root, Path::new("src/./lib.rs")),
2873 Some(root.join("src/lib.rs"))
2874 );
2875 }
2876
2877 #[test]
2878 fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2879 let dir = tempfile::tempdir().expect("create temp dir");
2880 let project = dir.path().join("project");
2881 fs::create_dir_all(&project).expect("create project dir");
2882 let canonical_project = fs::canonicalize(&project).expect("canonical project");
2883 fs::write(project.join("old.txt"), "old token\n").expect("write old");
2884 fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2885
2886 Command::new("git")
2887 .arg("init")
2888 .arg(&project)
2889 .status()
2890 .expect("git init");
2891 for args in [
2892 ["config", "user.email", "aft@example.invalid"],
2893 ["config", "user.name", "AFT Test"],
2894 ] {
2895 Command::new("git")
2896 .arg("-C")
2897 .arg(&project)
2898 .args(args)
2899 .status()
2900 .expect("git config");
2901 }
2902 Command::new("git")
2903 .arg("-C")
2904 .arg(&project)
2905 .args(["add", "."])
2906 .status()
2907 .expect("git add initial");
2908 Command::new("git")
2909 .arg("-C")
2910 .arg(&project)
2911 .args(["commit", "-m", "initial"])
2912 .status()
2913 .expect("git commit initial");
2914 let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2915 let mut baseline = SearchIndex::build(&project);
2916 baseline.git_head = Some(previous.clone());
2917
2918 fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2919 Command::new("git")
2920 .arg("-C")
2921 .arg(&project)
2922 .args(["add", "-A"])
2923 .status()
2924 .expect("git add rename");
2925 Command::new("git")
2926 .arg("-C")
2927 .arg(&project)
2928 .args(["commit", "-m", "rename"])
2929 .status()
2930 .expect("git commit rename");
2931 let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2932
2933 fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2934 fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2935
2936 let refreshed = SearchIndex::rebuild_or_refresh(
2937 &project,
2938 DEFAULT_MAX_FILE_SIZE,
2939 Some(current),
2940 Some(baseline),
2941 );
2942
2943 assert!(!refreshed
2944 .path_to_id
2945 .contains_key(&canonical_project.join("old.txt")));
2946 assert!(refreshed
2947 .path_to_id
2948 .contains_key(&canonical_project.join("new.txt")));
2949 assert!(refreshed
2950 .path_to_id
2951 .contains_key(&canonical_project.join("untracked.txt")));
2952 let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2953 assert_eq!(matches.matches.len(), 1);
2954 }
2955
2956 #[test]
2957 fn read_from_disk_rejects_corrupt_postings_checksum() {
2958 let dir = tempfile::tempdir().expect("create temp dir");
2959 let project = dir.path().join("project");
2960 fs::create_dir_all(&project).expect("create project dir");
2961 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2962
2963 let index = SearchIndex::build(&project);
2964 let cache_dir = dir.path().join("cache");
2965 index.write_to_disk(&cache_dir, None);
2966
2967 let cache_path = cache_dir.join("cache.bin");
2968 let mut bytes = fs::read(&cache_path).expect("read cache");
2969 let middle = bytes.len() / 2;
2970 bytes[middle] ^= 0xff;
2971 fs::write(&cache_path, bytes).expect("write corrupted cache");
2972
2973 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2974 }
2975
2976 #[test]
2977 fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2978 let dir = tempfile::tempdir().expect("create temp dir");
2979 let project = dir.path().join("project");
2980 fs::create_dir_all(&project).expect("create project dir");
2981 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2982
2983 let index = SearchIndex::build(&project);
2984 let cache_dir = dir.path().join("cache");
2985 index.write_to_disk(&cache_dir, None);
2986
2987 assert!(cache_dir.join("cache.bin").is_file());
2988 assert!(fs::read_dir(&cache_dir)
2989 .expect("read cache dir")
2990 .all(|entry| !entry
2991 .expect("cache entry")
2992 .file_name()
2993 .to_string_lossy()
2994 .contains(".tmp.")));
2995 }
2996
2997 #[test]
2998 fn concurrent_search_index_writes_do_not_corrupt() {
2999 let dir = tempfile::tempdir().expect("create temp dir");
3000 let project = dir.path().join("project");
3001 fs::create_dir_all(&project).expect("create project dir");
3002 fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
3003 let cache_dir = dir.path().join("cache");
3004
3005 let a_project = project.clone();
3006 let a_cache = cache_dir.clone();
3007 let a = std::thread::spawn(move || {
3008 let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
3009 let index = SearchIndex::build(&a_project);
3010 index.write_to_disk(&a_cache, None);
3011 });
3012 let b_project = project.clone();
3013 let b_cache = cache_dir.clone();
3014 let b = std::thread::spawn(move || {
3015 let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
3016 let index = SearchIndex::build(&b_project);
3017 index.write_to_disk(&b_cache, None);
3018 });
3019 a.join().expect("writer a");
3020 b.join().expect("writer b");
3021
3022 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
3023 }
3024
3025 #[test]
3026 fn search_index_atomic_rename_survives_partial_write() {
3027 let dir = tempfile::tempdir().expect("create temp dir");
3028 let cache_dir = dir.path().join("cache");
3029 fs::create_dir_all(&cache_dir).expect("create cache dir");
3030 fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
3031
3032 assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
3033 }
3034
3035 #[test]
3036 fn artifact_cache_key_shared_across_clones_of_same_repo() {
3037 let dir = tempfile::tempdir().expect("create temp dir");
3038 let source = dir.path().join("source");
3039 fs::create_dir_all(&source).expect("create source repo dir");
3040 fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
3041
3042 assert!(Command::new("git")
3043 .current_dir(&source)
3044 .args(["init"])
3045 .status()
3046 .expect("init git repo")
3047 .success());
3048 assert!(Command::new("git")
3049 .current_dir(&source)
3050 .args(["add", "."])
3051 .status()
3052 .expect("git add")
3053 .success());
3054 assert!(Command::new("git")
3055 .current_dir(&source)
3056 .args([
3057 "-c",
3058 "user.name=AFT Tests",
3059 "-c",
3060 "user.email=aft-tests@example.com",
3061 "commit",
3062 "-m",
3063 "initial",
3064 ])
3065 .status()
3066 .expect("git commit")
3067 .success());
3068
3069 let clone = dir.path().join("clone");
3070 assert!(Command::new("git")
3071 .args(["clone", "--quiet"])
3072 .arg(&source)
3073 .arg(&clone)
3074 .status()
3075 .expect("git clone")
3076 .success());
3077
3078 let source_key = artifact_cache_key(&source);
3079 let clone_key = artifact_cache_key(&clone);
3080
3081 assert_eq!(source_key.len(), 16);
3082 assert_eq!(clone_key.len(), 16);
3083 assert_eq!(source_key, clone_key);
3085 }
3086
3087 #[test]
3088 fn git_head_unchanged_picks_up_local_edits() {
3089 let dir = tempfile::tempdir().expect("create temp dir");
3090 let project = dir.path().join("repo");
3091 fs::create_dir_all(&project).expect("create repo dir");
3092 let file = project.join("tracked.txt");
3093 fs::write(&file, "oldtoken\n").expect("write file");
3094 assert!(Command::new("git")
3095 .current_dir(&project)
3096 .arg("init")
3097 .status()
3098 .unwrap()
3099 .success());
3100 assert!(Command::new("git")
3101 .current_dir(&project)
3102 .args(["add", "."])
3103 .status()
3104 .unwrap()
3105 .success());
3106 assert!(Command::new("git")
3107 .current_dir(&project)
3108 .args([
3109 "-c",
3110 "user.name=AFT Tests",
3111 "-c",
3112 "user.email=aft-tests@example.com",
3113 "commit",
3114 "-m",
3115 "initial"
3116 ])
3117 .status()
3118 .unwrap()
3119 .success());
3120 let head = current_git_head(&project);
3121 let mut baseline = SearchIndex::build(&project);
3122 baseline.git_head = head.clone();
3123 fs::write(&file, "newtoken\n").expect("edit tracked file");
3124
3125 let refreshed =
3126 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
3127 let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
3128
3129 assert_eq!(result.total_matches, 1);
3130 }
3131
3132 #[test]
3133 fn non_git_project_reuses_cache_when_files_unchanged() {
3134 let dir = tempfile::tempdir().expect("create temp dir");
3135 let project = dir.path().join("project");
3136 fs::create_dir_all(&project).expect("create project dir");
3137 fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
3138 let baseline = SearchIndex::build(&project);
3139 let baseline_file_count = baseline.file_count();
3140
3141 let refreshed =
3142 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
3143
3144 assert_eq!(refreshed.file_count(), baseline_file_count);
3145 assert_eq!(
3146 refreshed
3147 .grep("unchangedtoken", true, &[], &[], &project, 10)
3148 .total_matches,
3149 1
3150 );
3151 }
3152
3153 #[test]
3154 fn resolve_search_scope_disables_index_for_external_path() {
3155 let dir = tempfile::tempdir().expect("create temp dir");
3156 let project = dir.path().join("project");
3157 let outside = dir.path().join("outside");
3158 fs::create_dir_all(&project).expect("create project dir");
3159 fs::create_dir_all(&outside).expect("create outside dir");
3160
3161 let scope = resolve_search_scope(&project, outside.to_str());
3162
3163 assert_eq!(
3164 scope.root,
3165 fs::canonicalize(&outside).expect("canonicalize outside")
3166 );
3167 assert!(!scope.use_index);
3168 }
3169
3170 #[test]
3171 fn grep_filters_matches_to_search_root() {
3172 let dir = tempfile::tempdir().expect("create temp dir");
3173 let project = dir.path().join("project");
3174 let src = project.join("src");
3175 let docs = project.join("docs");
3176 fs::create_dir_all(&src).expect("create src dir");
3177 fs::create_dir_all(&docs).expect("create docs dir");
3178 fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
3179 fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
3180
3181 let index = SearchIndex::build(&project);
3182 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
3183
3184 assert_eq!(result.files_searched, 1);
3185 assert_eq!(result.files_with_matches, 1);
3186 assert_eq!(result.matches.len(), 1);
3187 let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
3189 assert_eq!(result.matches[0].file, expected);
3190 }
3191
3192 #[test]
3193 fn grep_deduplicates_multiple_matches_on_same_line() {
3194 let dir = tempfile::tempdir().expect("create temp dir");
3195 let project = dir.path().join("project");
3196 let src = project.join("src");
3197 fs::create_dir_all(&src).expect("create src dir");
3198 fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
3199
3200 let index = SearchIndex::build(&project);
3201 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
3202
3203 assert_eq!(result.total_matches, 1);
3204 assert_eq!(result.matches.len(), 1);
3205 }
3206
3207 #[test]
3208 fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
3209 let dir = tempfile::tempdir().expect("create temp dir");
3210 let project = dir.path().join("project");
3211 fs::create_dir_all(&project).expect("create project dir");
3212 let file = project.join("unicode.txt");
3213 fs::write(&file, "äbc\n").expect("write unicode file");
3214
3215 let index = SearchIndex::build(&project);
3216 let result = index.grep("Äbc", false, &[], &[], &project, 10);
3217
3218 assert_eq!(result.total_matches, 1);
3219 assert_eq!(result.matches.len(), 1);
3220 assert_eq!(
3221 result.matches[0].file,
3222 fs::canonicalize(file).expect("canonicalize unicode file")
3223 );
3224 }
3225
/// Rewrites a file with different content of the same byte length, restores
/// its mtime, and checks that a refresh from the old index still picks up the
/// new content: the new trigram is posted for the file and the old one is not.
#[test]
fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project_root = workspace.path().join("project");
    fs::create_dir_all(&project_root).expect("create project dir");
    let token_file = project_root.join("tokens.txt");
    let pinned_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);

    // Baseline: "alpha" with a pinned mtime.
    fs::write(&token_file, "alpha").expect("write original file");
    filetime::set_file_mtime(&token_file, pinned_mtime).expect("set original mtime");
    let stale_index = SearchIndex::build(&project_root);

    // Edit: five bytes again, and the mtime is put back to the pinned value.
    fs::write(&token_file, "bravo").expect("write same-size edit");
    filetime::set_file_mtime(&token_file, pinned_mtime).expect("restore original mtime");

    let refreshed = SearchIndex::rebuild_or_refresh(
        &project_root,
        DEFAULT_MAX_FILE_SIZE,
        None,
        Some(stale_index),
    );
    let outcome = refreshed.grep("bravo", true, &[], &[], &project_root, 10);
    let indexed_path = fs::canonicalize(&token_file).expect("canonicalize edited file");
    let file_id = *refreshed
        .path_to_id
        .get(&indexed_path)
        .expect("file remains indexed");

    assert_eq!(outcome.total_matches, 1);

    // "bra" comes from the new content, "alp" only from the old content.
    let fresh_trigram = pack_trigram(b'b', b'r', b'a');
    assert!(refreshed
        .postings_for_trigram(fresh_trigram, None)
        .contains(&file_id));
    let stale_trigram = pack_trigram(b'a', b'l', b'p');
    assert!(!refreshed
        .postings_for_trigram(stale_trigram, None)
        .contains(&file_id));
}
3257
/// With a result limit of 1 and two matching lines, `total_matches` must
/// still report 2 while only one match is returned and `truncated` is set.
#[test]
fn grep_reports_total_matches_before_truncation() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project_root = workspace.path().join("project");
    let source_dir = project_root.join("src");
    fs::create_dir_all(&source_dir).expect("create src dir");

    // Two separate lines, each containing the needle once.
    let target = source_dir.join("main.rs");
    fs::write(target, "SearchIndex\nSearchIndex\n").expect("write src file");

    let index = SearchIndex::build(&project_root);
    let outcome = index.grep("SearchIndex", true, &[], &[], &source_dir, 1);

    assert_eq!(outcome.total_matches, 2);
    assert_eq!(outcome.matches.len(), 1);
    assert!(outcome.truncated);
}
3273
/// A glob rooted at `src/` must not return matching files that live in a
/// sibling directory of the same project.
#[test]
fn glob_filters_results_to_search_root() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project_root = workspace.path().join("project");
    let source_dir = project_root.join("src");
    let scripts_dir = project_root.join("scripts");
    fs::create_dir_all(&source_dir).expect("create src dir");
    fs::create_dir_all(&scripts_dir).expect("create scripts dir");

    let inside_root = source_dir.join("main.rs");
    fs::write(&inside_root, "pub fn main() {}\n").expect("write src file");
    fs::write(scripts_dir.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

    let index = SearchIndex::build(&project_root);
    let found = index.glob("**/*.rs", &source_dir);

    // Only the file under the search root is expected, as a canonical path.
    let expected = vec![fs::canonicalize(inside_root).expect("canonicalize src file")];
    assert_eq!(found, expected);
}
3293
/// A file with non-text bytes inside a dot-directory must still be returned
/// by `glob`.
#[test]
fn glob_includes_hidden_and_binary_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project_root = workspace.path().join("project");
    let dot_dir = project_root.join(".hidden");
    fs::create_dir_all(&dot_dir).expect("create hidden dir");

    // Payload starts with a NUL byte and is not valid UTF-8.
    let payload: [u8; 4] = [0, 159, 146, 150];
    let binary_path = dot_dir.join("data.bin");
    fs::write(&binary_path, payload).expect("write binary file");

    let index = SearchIndex::build(&project_root);
    let found = index.glob("**/*.bin", &project_root);

    let expected = vec![fs::canonicalize(binary_path).expect("canonicalize binary file")];
    assert_eq!(found, expected);
}
3311
/// Hand-assembles a `cache.bin` whose checksums are valid but which carries
/// the value 1_000_000_000 in one u32 field, and expects `read_from_disk` to
/// return `None`.
///
/// NOTE(review): the on-disk layout is defined elsewhere in this file; the
/// per-field remarks below are inferred from the test name and should be
/// confirmed against the reader.
#[test]
fn read_from_disk_rejects_invalid_nanos() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let cache_dir = workspace.path().join("cache");
    fs::create_dir_all(&cache_dir).expect("create cache dir");

    // Postings section: magic, then three u32 words, then a u64 size limit.
    let mut postings_section: Vec<u8> = INDEX_MAGIC.to_vec();
    for word in [INDEX_VERSION, 0, 1] {
        postings_section.extend(word.to_le_bytes());
    }
    postings_section.extend(DEFAULT_MAX_FILE_SIZE.to_le_bytes());
    // A u32 of 1 followed by the single byte "/" (presumably a length-prefixed root path).
    postings_section.extend(1u32.to_le_bytes());
    postings_section.push(b'/');
    postings_section.push(0u8);
    postings_section.extend(1u32.to_le_bytes());
    for _ in 0..2 {
        postings_section.extend(0u64.to_le_bytes());
    }
    // Presumably the sub-second nanoseconds field: one past the largest valid value.
    postings_section.extend(1_000_000_000u32.to_le_bytes());
    postings_section.push(b'a');
    postings_section.extend(0u64.to_le_bytes());

    // Lookup section: magic, version, and a zero u32.
    let mut lookup_section: Vec<u8> = LOOKUP_MAGIC.to_vec();
    lookup_section.extend(INDEX_VERSION.to_le_bytes());
    lookup_section.extend(0u32.to_le_bytes());

    // Each section is followed by the CRC32 of its own bytes, so only the
    // out-of-range value can be the reason for rejection.
    let postings_crc = crc32fast::hash(postings_section.as_slice());
    postings_section.extend(postings_crc.to_le_bytes());
    let lookup_crc = crc32fast::hash(lookup_section.as_slice());
    lookup_section.extend(lookup_crc.to_le_bytes());

    // Container: magic, version, length of the postings section (checksum
    // included), then both sections back to back.
    let mut cache_bytes: Vec<u8> = Vec::new();
    cache_bytes.extend(CACHE_MAGIC.to_le_bytes());
    cache_bytes.extend(INDEX_VERSION.to_le_bytes());
    cache_bytes.extend((postings_section.len() as u64).to_le_bytes());
    cache_bytes.extend(&postings_section);
    cache_bytes.extend(&lookup_section);
    fs::write(cache_dir.join("cache.bin"), cache_bytes).expect("write cache");

    assert!(SearchIndex::read_from_disk(&cache_dir, workspace.path()).is_none());
}
3353
/// Builds the same 80-package project once with the serial builder and once
/// with the default builder, then checks that both indexes agree on counts,
/// postings, per-file trigrams, path ids, per-file metadata, and the result
/// of a representative grep.
#[test]
fn parallel_cold_build_matches_serial_index() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let project = dir.path().join("project");
    for index in 0..80 {
        let sub = project.join(format!("pkg_{index:03}"));
        fs::create_dir_all(&sub).expect("create subdir");
        fs::write(
            sub.join("lib.rs"),
            format!(
                "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
            ),
        )
        .expect("write lib");
    }

    let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
    let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);

    assert_eq!(serial.file_count(), parallel.file_count());
    assert_eq!(serial.trigram_count(), parallel.trigram_count());
    assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
    assert_eq!(serial.postings, parallel.postings);
    assert_eq!(serial.file_trigrams, parallel.file_trigrams);
    for (path, id) in &serial.path_to_id {
        assert_eq!(parallel.path_to_id.get(path), Some(id));
    }

    // `zip` stops at the shorter side, so a length mismatch would otherwise
    // go unnoticed by the pairwise comparison below.
    assert_eq!(serial.files.len(), parallel.files.len());
    for (serial_file, parallel_file) in serial.files.iter().zip(&parallel.files) {
        assert_eq!(serial_file.path, parallel_file.path);
        assert_eq!(serial_file.size, parallel_file.size);
        assert_eq!(serial_file.modified, parallel_file.modified);
        assert_eq!(serial_file.content_hash, parallel_file.content_hash);
    }

    let serial_grep = serial.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
    let parallel_grep = parallel.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
    assert_eq!(serial_grep.matches, parallel_grep.matches);
    assert_eq!(serial_grep.total_matches, parallel_grep.total_matches);
    assert_eq!(serial_grep.files_searched, parallel_grep.files_searched);
    assert_eq!(
        serial_grep.files_with_matches,
        parallel_grep.files_with_matches
    );
}
3398
/// Creates a git repository whose `src/.gitignore` excludes a `data/` tree of
/// 200 nested directories, then compares how many directories the legacy
/// stack-based discovery and the current discovery visit. The current one
/// must visit fewer directories than the legacy one and fewer than 50 overall.
#[test]
fn ignore_rule_discovery_respects_gitignore() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project_root = workspace.path().join("project");
    fs::create_dir_all(project_root.join("src")).expect("mkdir src");
    fs::write(project_root.join("src/.gitignore"), "data/\n").expect("write gitignore");

    // Populate the ignored tree: src/data/d0 … src/data/d199, one file each.
    let ignored_root = project_root.join("src/data");
    fs::create_dir_all(&ignored_root).expect("mkdir data");
    (0..200).for_each(|index| {
        fs::create_dir_all(ignored_root.join(format!("d{index}"))).expect("mkdir nested");
        fs::write(
            ignored_root.join(format!("d{index}/f.rs")),
            "fn ignored() {}\n",
        )
        .expect("write ignored file");
    });

    // Only a failure to launch git panics; its exit status is not inspected.
    Command::new("git")
        .arg("init")
        .arg(&project_root)
        .status()
        .expect("git init");
    let git_in_project = |args: &[&str], label: &str| {
        Command::new("git")
            .arg("-C")
            .arg(&project_root)
            .args(args)
            .status()
            .expect(label);
    };
    git_in_project(&["config", "user.email", "aft@example.invalid"], "git config");
    git_in_project(&["config", "user.name", "AFT Test"], "git config");
    git_in_project(&["add", "."], "git add");
    git_in_project(&["commit", "-m", "initial"], "git commit");

    let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project_root);
    let walker_dirs = count_ignore_rule_discovery_dirs(&project_root);
    assert!(
        legacy_dirs > walker_dirs,
        "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
    );
    assert!(
        walker_dirs < 50,
        "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
    );
}
3453
/// Sorts a list in which every other path does not exist on disk, 50 times
/// over, and checks that sorting neither panics nor changes the list length.
#[test]
fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
    let scratch = tempfile::tempdir().expect("create tempdir");

    // Even indices are written to disk; odd indices are never created.
    let paths: Vec<PathBuf> = (0..30)
        .map(|i| {
            if i % 2 != 0 {
                return scratch.path().join(format!("missing-{i}.rs"));
            }
            let real = scratch.path().join(format!("real-{i}.rs"));
            fs::write(&real, format!("// {i}\n")).expect("write");
            real
        })
        .collect();

    for _ in 0..50 {
        let mut working = paths.clone();
        sort_paths_by_mtime_desc(&mut working);
        assert_eq!(working.len(), paths.len());
    }
}
3495
/// Greps 40 files that each contain the needle, with a result limit of 1000,
/// and checks that every file yields a match and that neither `engine_capped`
/// nor `truncated` is set.
#[test]
fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
    let scratch = tempfile::tempdir().expect("create tempdir");
    (0..40).for_each(|i| {
        let target = scratch.path().join(format!("file-{i}.rs"));
        let body = format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n");
        fs::write(target, body).expect("write");
    });

    let index = SearchIndex::build_with_limit(scratch.path(), DEFAULT_MAX_FILE_SIZE);
    let outcome = index.grep("needle_token", false, &[], &[], scratch.path(), 1000);

    let hit_count = outcome.matches.len();
    assert!(
        hit_count >= 40,
        "expected a match per file, got {}",
        hit_count
    );
    assert!(
        !outcome.engine_capped,
        "an uncapped grep over >10 files must not report engine_capped"
    );
    assert!(!outcome.truncated, "uncapped grep must not be truncated");
}
3526
/// Sorts grep matches where every other match points at a file that does not
/// exist, 50 times over, and checks that sorting neither panics nor changes
/// the number of matches.
#[test]
fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
    let scratch = tempfile::tempdir().expect("create tempdir");

    // Even indices refer to files written to disk; odd indices to absent ones.
    let matches: Vec<GrepMatch> = (0..30)
        .map(|i| {
            let file = if i % 2 != 0 {
                scratch.path().join(format!("missing-{i}.rs"))
            } else {
                let real = scratch.path().join(format!("real-{i}.rs"));
                fs::write(&real, format!("// {i}\n")).expect("write");
                real
            };
            GrepMatch {
                file,
                line: u32::try_from(i).unwrap_or(0),
                column: 0,
                line_text: format!("match {i}"),
                match_text: format!("match {i}"),
            }
        })
        .collect();

    for _ in 0..50 {
        let mut working = matches.clone();
        sort_grep_matches_by_mtime_desc(&mut working, scratch.path());
        assert_eq!(working.len(), matches.len());
    }
}
3557}