1use std::collections::{BTreeMap, BTreeSet, BinaryHeap, HashMap, HashSet};
2use std::fs::{self, File, OpenOptions};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, SeekFrom, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
// Default per-file size limit (1 MiB) used by `SearchIndex::new` and
// `SearchIndex::build`; `update_file` tracks files above the configured limit
// as unindexed instead of reading them.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
// First `u32` of `cache.bin`; `read_from_disk` rejects the file when it differs.
const CACHE_MAGIC: u32 = 0x3144_4958;
// Magic that opens the postings section of `cache.bin`.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
// Magic that opens the trailing lookup section of `cache.bin`.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
// NOTE(review): presumably the magic of temporary spill files — used outside this view.
const SPILL_MAGIC: &[u8; 8] = b"AFTSPI01";
// NOTE(review): presumably marks the optional per-file trigram-count extension
// that `read_from_disk` looks for — confirm against its reader/writer.
const FILE_TRIGRAM_COUNT_MAGIC: &[u8; 8] = b"AFTFTC01";
// On-disk format version; checked in the cache header, the postings section and
// the lookup section.
const INDEX_VERSION: u32 = 4;
// NOTE(review): used outside this view.
const PREVIEW_BYTES: usize = 8 * 1024;
// NOTE(review): soft/hard in-memory limits, presumably for the streaming
// (spill-based) build — used outside this view.
const SPIMI_SOFT_LIMIT_BYTES: usize = 128 * 1024 * 1024;
const SPIMI_HARD_LIMIT_BYTES: usize = 256 * 1024 * 1024;
// NOTE(review): used outside this view.
const SPILL_RECORD_ESTIMATED_BYTES: usize = 16;
// Delta-size thresholds consulted by `update_compaction_flags`: the number of
// files with delta trigrams and the estimated packed size of the delta postings.
const DELTA_COMPACT_SOFT_FILES: usize = 1_000;
const DELTA_COMPACT_HARD_FILES: usize = 5_000;
const DELTA_COMPACT_SOFT_BYTES: usize = 32 * 1024 * 1024;
const DELTA_COMPACT_HARD_BYTES: usize = 128 * 1024 * 1024;
// NOTE(review): used outside this view.
const EOF_SENTINEL: u8 = 0;
// Upper bound accepted by `read_from_disk` for the file count, the lookup entry
// count and any single posting-list length; larger values mark the cache as corrupt.
const MAX_ENTRIES: usize = 10_000_000;
// Fixed-size part of one serialized file entry as read by `read_from_disk`:
// 1 (unindexed flag) + 4 (path length) + 8 (size) + 8 (mtime seconds)
// + 4 (mtime nanoseconds) + 32 (content hash).
const MIN_FILE_ENTRY_BYTES: usize = 57;
// Size of one serialized lookup entry: 4 (trigram) + 8 (offset) + 4 (count).
const LOOKUP_ENTRY_BYTES: usize = 16;
// Size of one serialized posting; the postings blob length must be a multiple
// of it. Presumably 4 (file id) + 1 (next mask) + 1 (loc mask) — TODO confirm.
const POSTING_BYTES: usize = 6;
// Held by `CacheLock::acquire` while it tries to take the file lock, so
// acquisition attempts within this process run one at a time.
static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
43
/// Handle for the search cache's `cache.lock` file lock, obtained through
/// [`CacheLock::acquire`].
///
/// It only stores the underlying `fs_lock::LockGuard`; presumably the lock is
/// released when that guard is dropped — confirm against `fs_lock`.
pub struct CacheLock {
    _guard: fs_lock::LockGuard,
}
47
48impl CacheLock {
49 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
50 fs::create_dir_all(cache_dir)?;
51 let path = cache_dir.join("cache.lock");
52 let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
53 .lock()
54 .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
55 fs_lock::try_acquire(&path, Duration::from_secs(2))
56 .map(|guard| Self { _guard: guard })
57 .map_err(|error| match error {
58 fs_lock::AcquireError::Timeout => {
59 std::io::Error::other("timed out acquiring search cache lock")
60 }
61 fs_lock::AcquireError::Io(error) => error,
62 })
63 }
64}
65
/// Trigram search index: an optional on-disk base plus an in-memory delta of
/// files indexed since that base was written.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// Postings backed by the cache file; `None` until the index has been
    /// written to or read from disk.
    base: Option<Arc<BasePostings>>,
    /// Trigram -> postings added in memory; cleared by `write_to_disk`.
    delta_postings: HashMap<u32, Vec<Posting>>,
    /// File id -> trigrams that file contributed to `delta_postings`; lets
    /// `remove_file` find the posting lists to prune.
    delta_file_trigrams: HashMap<u32, Vec<u32>>,
    /// File table indexed by file id. Removed files keep their slot with an
    /// empty path and zeroed metadata.
    pub files: Arc<Vec<FileEntry>>,
    /// Path -> file id for every live entry.
    pub path_to_id: Arc<HashMap<PathBuf, u32>>,
    /// Set once a build or verification pass has finished.
    pub ready: bool,
    /// Project root; canonicalized when possible.
    project_root: PathBuf,
    /// Git HEAD recorded for this index, if any.
    git_head: Option<String>,
    /// Files larger than this are tracked but not indexed.
    max_file_size: u64,
    /// Fingerprint of the ignore rules; a mismatch invalidates a cached index.
    ignore_rules_fingerprint: String,
    /// Per file id: number of distinct trigrams recorded for the file
    /// (0 for unindexed or removed files).
    pub file_trigram_count: Arc<Vec<u32>>,
    /// Ids of files that are tracked but not indexed (binary or over the size limit).
    unindexed_files: Arc<HashSet<u32>>,
    /// Ids below `base_file_count` that were removed after the base was written.
    superseded: HashSet<u32>,
    /// Number of file ids covered by the on-disk base.
    base_file_count: u32,
    /// Running estimate of the delta's packed size: `POSTING_BYTES` per
    /// (file, trigram) pair currently held in the delta.
    delta_packed_bytes: usize,
    /// Compaction bookkeeping, updated by `update_compaction_flags`.
    compaction_state: Arc<Mutex<CompactionState>>,
}
85
/// Postings stored in the cache file, addressed through an in-memory lookup table.
#[derive(Clone, Debug)]
struct BasePostings {
    /// Open handle to the cache file that holds the postings blob.
    file: Arc<File>,
    /// Byte offset of the postings blob within `file`.
    postings_blob_start: u64,
    /// Length of the postings blob in bytes.
    postings_blob_len: u64,
    /// One entry per trigram; `read_from_disk` only accepts tables sorted by
    /// strictly increasing trigram.
    lookup: Arc<Vec<LookupEntry>>,
}
93
/// Location of one trigram's posting list inside the postings blob.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct LookupEntry {
    /// Trigram key.
    trigram: u32,
    /// Byte offset of the list; `read_from_disk` validates `offset + count *
    /// POSTING_BYTES` against the blob length, so it is relative to the blob start.
    offset: u64,
    /// Number of postings in the list.
    count: u32,
}
100
/// Bookkeeping for delta compaction, shared behind a mutex.
#[derive(Clone, Debug, Default)]
struct CompactionState {
    /// Whether a compaction is in progress. It is only read in the code visible
    /// here; presumably the compaction driver sets it — TODO confirm.
    running: bool,
    /// Set by `update_compaction_flags` when the delta crosses a threshold.
    requested_again: bool,
    /// Paths changed while `running` was set.
    buffered_paths: Vec<PathBuf>,
}
107
/// Read-only view of a [`SearchIndex`] produced by `SearchIndex::snapshot`.
///
/// The `Arc`-backed tables are shared with the index; `delta_postings` and
/// `superseded` are copies taken when the snapshot was made.
#[derive(Clone, Debug)]
pub struct SearchIndexSnapshot {
    /// On-disk base postings, if any.
    base: Option<Arc<BasePostings>>,
    /// Copy of the index's in-memory delta postings (trigram -> postings).
    delta_postings: Arc<HashMap<u32, Vec<Posting>>>,
    /// File table indexed by file id.
    files: Arc<Vec<FileEntry>>,
    /// Path -> file id.
    path_to_id: Arc<HashMap<PathBuf, u32>>,
    /// Value of the index's `ready` flag when the snapshot was taken.
    ready: bool,
    /// Project root of the index.
    project_root: PathBuf,
    /// Per file id trigram counts.
    file_trigram_count: Arc<Vec<u32>>,
    /// Ids of tracked-but-unindexed files.
    unindexed_files: Arc<HashSet<u32>>,
    /// Copy of the index's superseded base file ids.
    superseded: Arc<HashSet<u32>>,
}
120
/// Result of `lexical_rank_with_stats`.
#[derive(Clone, Debug, Default)]
pub struct LexicalRankResult {
    /// `(path, score)` pairs ordered by descending score and truncated to the
    /// requested maximum.
    pub files: Vec<(PathBuf, f32)>,
    /// `true` when the number of candidate files, counted before the caller's
    /// filter was applied, exceeded the engine's candidate cap.
    pub engine_capped: bool,
}
126
127impl SearchIndex {
128 pub fn file_count(&self) -> usize {
130 self.files.len()
131 }
132
133 pub fn trigram_count(&self) -> usize {
135 self.snapshot().trigram_count()
136 }
137
138 pub fn snapshot(&self) -> SearchIndexSnapshot {
143 SearchIndexSnapshot {
144 base: self.base.clone(),
145 delta_postings: Arc::new(self.delta_postings.clone()),
146 files: Arc::clone(&self.files),
147 path_to_id: Arc::clone(&self.path_to_id),
148 ready: self.ready,
149 project_root: self.project_root.clone(),
150 file_trigram_count: Arc::clone(&self.file_trigram_count),
151 unindexed_files: Arc::clone(&self.unindexed_files),
152 superseded: Arc::new(self.superseded.clone()),
153 }
154 }
155
156 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
158 query_trigrams_from_tokens(tokens)
159 }
160
161 pub fn lexical_rank(
163 &self,
164 query_trigrams: &[u32],
165 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
166 max_files: usize,
167 ) -> Vec<(PathBuf, f32)> {
168 self.snapshot()
169 .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
170 .files
171 }
172
173 pub fn lexical_rank_with_stats(
176 &self,
177 query_trigrams: &[u32],
178 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
179 max_files: usize,
180 ) -> LexicalRankResult {
181 self.snapshot()
182 .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
183 }
184}
185
186impl SearchIndexSnapshot {
187 pub fn trigram_count(&self) -> usize {
189 let base_count = self.base.as_ref().map_or(0, |base| base.lookup.len());
190 let Some(base) = &self.base else {
191 return self.delta_postings.len();
192 };
193 base_count
194 + self
195 .delta_postings
196 .keys()
197 .filter(|trigram| base.lookup_entry(**trigram).is_none())
198 .count()
199 }
200
201 pub fn lexical_rank_with_stats(
204 &self,
205 query_trigrams: &[u32],
206 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
207 max_files: usize,
208 ) -> LexicalRankResult {
209 if query_trigrams.is_empty() || max_files == 0 {
210 return LexicalRankResult::default();
211 }
212
213 let mut non_zero: Vec<(u32, usize)> = query_trigrams
214 .iter()
215 .filter_map(|trigram| {
216 let posting_count = self.posting_count(*trigram);
217 (posting_count > 0).then_some((*trigram, posting_count))
218 })
219 .collect();
220 if non_zero.is_empty() {
221 return LexicalRankResult::default();
222 }
223
224 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
225 let selected_count = non_zero.len().min(3);
226 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
227
228 let mut candidate_ids = BTreeSet::new();
229 for (trigram, _) in non_zero.iter().take(selected_count) {
230 for file_id in self.postings_for_trigram(*trigram, None) {
231 candidate_ids.insert(file_id);
232 }
233 }
234 let pre_filter_candidate_count = candidate_ids.len();
235 let engine_capped = pre_filter_candidate_count > candidate_cap;
236 let filtered_candidates = candidate_ids
237 .into_iter()
238 .filter_map(|file_id| {
239 self.files
240 .get(file_id as usize)
241 .map(|entry| (file_id, entry))
242 })
243 .filter(|(_, entry)| {
244 if let Some(filter) = candidate_filter {
245 filter(&entry.path)
246 } else {
247 true
248 }
249 })
250 .collect::<Vec<_>>();
251
252 let mut ranked = Vec::new();
253 for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
254 let score = lexical_score_snapshot(self, query_trigrams, file_id);
255 if score > 0.0 {
256 ranked.push((entry.path.clone(), score));
257 }
258 }
259
260 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
261 ranked.truncate(max_files);
262 LexicalRankResult {
263 files: ranked,
264 engine_capped,
265 }
266 }
267}
268
/// One entry of a trigram's posting list: a file containing the trigram plus
/// two filter masks.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Id of the file (index into the file table).
    pub file_id: u32,
    /// Mask copied from `PostingFilter::next_mask` when the file was indexed.
    /// NOTE(review): presumably encodes the bytes that follow the trigram — confirm.
    pub next_mask: u8,
    /// Mask copied from `PostingFilter::loc_mask` when the file was indexed.
    /// NOTE(review): presumably encodes where the trigram occurs — confirm.
    pub loc_mask: u8,
}
275
/// Per-file record in the index's file table.
///
/// `remove_file` blanks an entry in place (empty path, size 0, `UNIX_EPOCH`,
/// zero hash) instead of deleting it, so file ids stay stable.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Path of the file; empty for a removed entry.
    pub path: PathBuf,
    /// File size in bytes as recorded when the entry was created.
    pub size: u64,
    /// Modification time as recorded when the entry was created.
    pub modified: SystemTime,
    /// BLAKE3 hash of the indexed content; the zero hash for files that were
    /// tracked without being indexed.
    pub content_hash: blake3::Hash,
}
283
/// A single grep hit.
///
/// NOTE(review): whether `line` and `column` are 0- or 1-based is decided by the
/// code that builds these values, which is outside this view.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// File containing the match.
    pub file: PathBuf,
    /// Line number of the match.
    pub line: u32,
    /// Column of the match within the line.
    pub column: u32,
    /// Text of the line containing the match.
    pub line_text: String,
    /// Text of the matched span.
    pub match_text: String,
}
292
/// Result of a grep query, as returned by `SearchIndex::grep` and
/// `SearchIndex::search_grep`.
///
/// NOTE(review): the fields are populated by code outside this view; the notes
/// below follow the field names and should be confirmed there.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Matches returned to the caller.
    pub matches: Vec<GrepMatch>,
    /// Presumably the total number of matches found, which may exceed
    /// `matches.len()` — TODO confirm.
    pub total_matches: usize,
    /// Presumably the number of files examined — TODO confirm.
    pub files_searched: usize,
    /// Presumably the number of files with at least one match — TODO confirm.
    pub files_with_matches: usize,
    /// State of the index that served the query.
    pub index_status: IndexStatus,
    /// Presumably set when the result list was cut short — TODO confirm.
    pub truncated: bool,
    /// NOTE(review): semantics defined by the producer — confirm.
    pub fully_degraded: bool,
    /// NOTE(review): semantics defined by the producer — confirm.
    pub engine_capped: bool,
    /// NOTE(review): semantics defined by the producer — confirm.
    pub walk_truncated: bool,
}
306
/// State of the search index as reported alongside query results.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    Ready,
    Building,
    Fallback,
    Disabled,
}

impl IndexStatus {
    /// Returns the variant's name as a static string (for example `"Ready"`).
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Ready => "Ready",
            Self::Building => "Building",
            Self::Fallback => "Fallback",
            Self::Disabled => "Disabled",
        }
    }
}
325
/// Trigram-level description of a pattern, consumed by `SearchIndex::candidates`.
///
/// NOTE(review): the construction and evaluation of this query live outside this
/// view; the field notes follow the names and should be confirmed there.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Presumably trigrams that must all be present in a candidate file.
    pub and_trigrams: Vec<u32>,
    /// Presumably groups of alternative trigrams, at least one per group required.
    pub or_groups: Vec<Vec<u32>>,
    /// Per-trigram posting filters for `and_trigrams`.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Per-trigram posting filters, presumably one map per entry of `or_groups`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
333
/// Pair of masks attached to a trigram; when a file is indexed these are copied
/// into the `next_mask` / `loc_mask` fields of its [`Posting`].
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    next_mask: u8,
    loc_mask: u8,
}
339
/// The subset of file metadata the index stores in a [`FileEntry`].
#[derive(Clone, Copy)]
struct SearchFileMetadata {
    /// File size in bytes.
    size: u64,
    /// Modification time.
    modified: SystemTime,
}
345
/// Everything needed to insert one file into the index, computed ahead of the
/// insertion and consumed by `index_prepared_new_file`.
struct PreparedIndexedFile {
    /// Size and modification time stored in the file table.
    metadata: SearchFileMetadata,
    /// Hash of the file content.
    content_hash: blake3::Hash,
    /// Trigram -> filter masks for the file; each key yields one posting.
    trigram_map: BTreeMap<u32, PostingFilter>,
}
351
/// Outcome of preparing one path for ingestion (see `ingest_paths_parallel`).
enum PreparedSearchPath {
    /// The file is inserted into the index with its postings.
    Indexed(PreparedIndexedFile),
    /// The file is tracked in the file table but gets no postings.
    Unindexed(SearchFileMetadata),
    /// The path is ignored entirely.
    Skipped,
}
357
/// Intermediate byte-level form of a query.
///
/// NOTE(review): produced and consumed outside this view; presumably literal
/// byte runs from which the trigrams of a `RegexQuery` are derived — confirm.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    and_runs: Vec<Vec<u8>>,
    or_groups: Vec<Vec<Vec<u8>>>,
}
363
/// Optional include/exclude glob sets applied to paths.
///
/// The default value has neither set; it is what the index builds pass to
/// `walk_project_files`.
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    /// Include globs, if any were given.
    includes: Option<GlobSet>,
    /// Exclude globs, if any were given.
    excludes: Option<GlobSet>,
}
369
/// Root directory of a search together with a flag for index use.
///
/// NOTE(review): constructed and consumed outside this view.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    /// Directory the search is rooted at.
    pub root: PathBuf,
    /// Presumably whether the index may serve this search — TODO confirm.
    pub use_index: bool,
}
375
/// Internal counterpart of [`GrepMatch`] with the same fields, except that the
/// file path is held in an `Arc`.
#[derive(Clone, Debug)]
struct SharedGrepMatch {
    file: Arc<PathBuf>,
    line: u32,
    column: u32,
    line_text: String,
    match_text: String,
}
384
/// Matcher used to scan file contents: either a literal search or a byte regex.
#[derive(Clone, Debug)]
enum SearchMatcher {
    /// Literal search from `pattern_compile`.
    Literal(LiteralSearch),
    /// Regular expression over bytes (`regex::bytes::Regex`).
    Regex(Regex),
}
390
391impl SearchIndex {
392 pub fn new() -> Self {
393 SearchIndex {
394 base: None,
395 delta_postings: HashMap::new(),
396 delta_file_trigrams: HashMap::new(),
397 files: Arc::new(Vec::new()),
398 path_to_id: Arc::new(HashMap::new()),
399 ready: false,
400 project_root: PathBuf::new(),
401 git_head: None,
402 max_file_size: DEFAULT_MAX_FILE_SIZE,
403 ignore_rules_fingerprint: String::new(),
404 file_trigram_count: Arc::new(Vec::new()),
405 unindexed_files: Arc::new(HashSet::new()),
406 superseded: HashSet::new(),
407 base_file_count: 0,
408 delta_packed_bytes: 0,
409 compaction_state: Arc::new(Mutex::new(CompactionState::default())),
410 }
411 }
412
413 pub fn build(root: &Path) -> Self {
414 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
415 }
416
417 pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
418 let cache_dir = transient_search_cache_dir(root);
419 Self::build_with_limit_to_cache_dir(root, max_file_size, &cache_dir)
420 }
421
422 pub fn build_with_limit_to_cache_dir(
423 root: &Path,
424 max_file_size: u64,
425 cache_dir: &Path,
426 ) -> Self {
427 let started = std::time::Instant::now();
428 match build_streaming_index(root, max_file_size, cache_dir) {
429 Ok((mut index, indexed)) => {
430 index.ready = true;
431 crate::slog_info!(
432 "search index cold streaming build: {} files, {} trigrams, {} ms (pool={})",
433 indexed,
434 index.trigram_count(),
435 started.elapsed().as_millis(),
436 search_index_build_pool_size()
437 );
438 index
439 }
440 Err(error) => {
441 log::warn!(
442 "search index: streaming build failed ({}); falling back to bounded in-memory delta",
443 error
444 );
445 let mut index = SearchIndex {
446 project_root: fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
447 max_file_size,
448 ignore_rules_fingerprint: ignore_rules_fingerprint(
449 &fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
450 ),
451 ..SearchIndex::new()
452 };
453 let filters = PathFilters::default();
454 let paths: Vec<PathBuf> = walk_project_files(&index.project_root, &filters);
455 let indexed = index.ingest_paths_parallel(&paths);
456 index.git_head = current_git_head(&index.project_root);
457 index.ready = true;
458 crate::slog_info!(
459 "search index fallback build: {} files, {} trigrams, {} ms (pool={})",
460 indexed,
461 index.trigram_count(),
462 started.elapsed().as_millis(),
463 search_index_build_pool_size()
464 );
465 index
466 }
467 }
468 }
469
470 #[cfg(test)]
472 pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
473 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
474 let mut index = SearchIndex {
475 project_root: project_root.clone(),
476 max_file_size,
477 ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
478 ..SearchIndex::new()
479 };
480 let filters = PathFilters::default();
481 for path in walk_project_files(&project_root, &filters) {
482 index.update_file(&path);
483 }
484 index.git_head = current_git_head(&project_root);
485 index.ready = true;
486 index
487 }
488
489 fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
490 let max_file_size = self.max_file_size;
491 let pool_size = search_index_build_pool_size();
492 let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
493 let pool = match rayon::ThreadPoolBuilder::new()
494 .num_threads(pool_size)
495 .thread_name(|index| format!("aft-search-build-{index}"))
496 .stack_size(8 * 1024 * 1024)
497 .build()
498 {
499 Ok(pool) => Some(pool),
500 Err(error) => {
501 log::warn!(
502 "search index: bounded build pool unavailable ({error}); using global pool"
503 );
504 None
505 }
506 };
507
508 let mut indexed = 0usize;
509 for chunk in paths.chunks(chunk_size) {
510 let prepare_chunk = || -> Vec<PreparedSearchPath> {
511 chunk
512 .par_iter()
513 .map(|path| prepare_search_path(path, max_file_size))
514 .collect()
515 };
516 let prepared = match &pool {
517 Some(pool) => pool.install(prepare_chunk),
518 None => prepare_chunk(),
519 };
520
521 for (path, prepared) in chunk.iter().zip(prepared) {
522 let inserted = match prepared {
523 PreparedSearchPath::Indexed(file) => self.index_prepared_new_file(path, file),
524 PreparedSearchPath::Unindexed(metadata) => {
525 self.track_unindexed_file_with_metadata(path, metadata)
526 }
527 PreparedSearchPath::Skipped => false,
528 };
529 if inserted {
530 indexed += 1;
531 }
532 }
533 }
534
535 indexed
536 }
537
538 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
539 self.remove_file(path);
540 let metadata = metadata_for_indexed_content(path, content.len() as u64);
541 self.index_file_with_metadata(path, content, metadata);
542 }
543
544 fn index_file_with_metadata(
545 &mut self,
546 path: &Path,
547 content: &[u8],
548 metadata: SearchFileMetadata,
549 ) -> bool {
550 self.index_prepared_new_file(
551 path,
552 PreparedIndexedFile {
553 metadata,
554 content_hash: cache_freshness::hash_bytes(content),
555 trigram_map: trigram_filter_map(content, true),
556 },
557 )
558 }
559
560 fn index_prepared_new_file(&mut self, path: &Path, file: PreparedIndexedFile) -> bool {
561 let file_id = match self.allocate_file_id_with_metadata(path, file.metadata) {
562 Some(file_id) => file_id,
563 None => return false,
564 };
565 if let Some(entry) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
566 entry.content_hash = file.content_hash;
567 }
568
569 let mut file_trigrams = Vec::with_capacity(file.trigram_map.len());
570 for (trigram, filter) in file.trigram_map {
571 let postings = self.delta_postings.entry(trigram).or_default();
572 postings.push(Posting {
573 file_id,
574 next_mask: filter.next_mask,
575 loc_mask: filter.loc_mask,
576 });
577 if postings.len() > 1
578 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
579 {
580 postings.sort_unstable_by_key(|p| p.file_id);
581 }
582 file_trigrams.push(trigram);
583 }
584
585 let trigram_count = file_trigrams.len() as u32;
586 self.delta_packed_bytes = self
587 .delta_packed_bytes
588 .saturating_add(file_trigrams.len().saturating_mul(POSTING_BYTES));
589 self.delta_file_trigrams.insert(file_id, file_trigrams);
590 ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
591 if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
592 *count = trigram_count;
593 }
594 Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
595 self.update_compaction_flags(Some(path));
596 true
597 }
598
599 pub fn remove_file(&mut self, path: &Path) {
600 let canonical_path = canonicalize_existing_or_deleted_path(path);
601 let file_id = {
602 let path_to_id = Arc::make_mut(&mut self.path_to_id);
603 if let Some(file_id) = path_to_id.remove(path) {
604 file_id
605 } else if canonical_path.as_path() != path {
606 let Some(file_id) = path_to_id.remove(&canonical_path) else {
607 return;
608 };
609 file_id
610 } else {
611 return;
612 }
613 };
614
615 if file_id < self.base_file_count {
616 self.superseded.insert(file_id);
617 }
618
619 if let Some(trigrams) = self.delta_file_trigrams.remove(&file_id) {
620 self.delta_packed_bytes = self
621 .delta_packed_bytes
622 .saturating_sub(trigrams.len().saturating_mul(POSTING_BYTES));
623 for trigram in trigrams {
624 let should_remove = if let Some(postings) = self.delta_postings.get_mut(&trigram) {
625 postings.retain(|posting| posting.file_id != file_id);
626 postings.is_empty()
627 } else {
628 false
629 };
630
631 if should_remove {
632 self.delta_postings.remove(&trigram);
633 }
634 }
635 }
636
637 Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
638 if let Some(file) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
639 file.path = PathBuf::new();
640 file.size = 0;
641 file.modified = UNIX_EPOCH;
642 file.content_hash = cache_freshness::zero_hash();
643 }
644 if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
645 *count = 0;
646 }
647 self.update_compaction_flags(Some(path));
648 }
649
650 pub fn update_file(&mut self, path: &Path) {
651 self.remove_file(path);
652
653 let metadata = match fs::metadata(path) {
654 Ok(metadata) if metadata.is_file() => metadata,
655 _ => return,
656 };
657
658 let metadata = search_file_metadata(&metadata);
659
660 if is_binary_path(path, metadata.size) {
661 self.track_unindexed_file_with_metadata(path, metadata);
662 return;
663 }
664
665 if metadata.size > self.max_file_size {
666 self.track_unindexed_file_with_metadata(path, metadata);
667 return;
668 }
669
670 let content = match fs::read(path) {
671 Ok(content) => content,
672 Err(_) => return,
673 };
674
675 if is_binary_bytes(&content) {
676 self.track_unindexed_file_with_metadata(path, metadata);
677 return;
678 }
679
680 self.index_file_with_metadata(path, &content, metadata);
681 }
682
683 pub fn grep(
684 &self,
685 pattern: &str,
686 case_sensitive: bool,
687 include: &[String],
688 exclude: &[String],
689 search_root: &Path,
690 max_results: usize,
691 ) -> GrepResult {
692 self.snapshot().grep(
693 pattern,
694 case_sensitive,
695 include,
696 exclude,
697 search_root,
698 max_results,
699 )
700 }
701
702 pub fn search_grep(
703 &self,
704 pattern: &CompiledPattern,
705 include: &[String],
706 exclude: &[String],
707 search_root: &Path,
708 max_results: usize,
709 ) -> GrepResult {
710 self.snapshot()
711 .search_grep(pattern, include, exclude, search_root, max_results)
712 }
713
714 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
715 self.snapshot().glob(pattern, search_root)
716 }
717
718 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
719 self.snapshot().candidates(query)
720 }
721
722 pub fn write_to_disk(&mut self, cache_dir: &Path, git_head: Option<&str>) {
723 let Some(plan) = CacheWritePlan::from_index(self, git_head) else {
724 return;
725 };
726
727 let write_result = {
728 let mut sources = self.compaction_record_sources(Arc::clone(&plan.id_map));
729 write_cache_file_from_sources(cache_dir, &plan, &mut sources)
730 };
731
732 match write_result {
733 Ok(base) => {
734 self.base = Some(Arc::new(base));
735 self.delta_postings.clear();
736 self.delta_file_trigrams.clear();
737 self.superseded.clear();
738 self.delta_packed_bytes = 0;
739 self.base_file_count = u32::try_from(plan.files.len()).unwrap_or(u32::MAX);
740 self.files = Arc::new(plan.files);
741 self.path_to_id = Arc::new(plan.path_to_id);
742 self.unindexed_files = Arc::new(plan.unindexed_files);
743 self.file_trigram_count = Arc::new(plan.file_trigram_count);
744 self.git_head = plan.git_head.filter(|head| !head.is_empty());
745 self.ignore_rules_fingerprint = plan.ignore_fingerprint;
746 }
747 Err(error) => {
748 log::warn!("search index: failed to write disk cache: {}", error);
749 }
750 }
751 }
752
753 pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
754 debug_assert!(current_canonical_root.is_absolute());
755 let cache_path = cache_dir.join("cache.bin");
756 let cache_file = open_cache_file_read(&cache_path).ok()?;
757 let file_len = cache_file.metadata().ok()?.len();
758 if file_len < 16 {
759 return None;
760 }
761
762 let mut reader = BufReader::new(cache_file.try_clone().ok()?);
763 if read_u32(&mut reader).ok()? != CACHE_MAGIC {
764 return None;
765 }
766 if read_u32(&mut reader).ok()? != INDEX_VERSION {
767 return None;
768 }
769 let postings_len_total = read_u64(&mut reader).ok()?;
770 let postings_section_start = reader.stream_position().ok()?;
771 let postings_section_end = postings_section_start.checked_add(postings_len_total)?;
772 if postings_len_total < 4 || postings_section_end > file_len {
773 return None;
774 }
775 let postings_body_end = postings_section_end.checked_sub(4)?;
776
777 let mut magic = [0u8; 8];
778 reader.read_exact(&mut magic).ok()?;
779 if &magic != INDEX_MAGIC {
780 return None;
781 }
782 if read_u32(&mut reader).ok()? != INDEX_VERSION {
783 return None;
784 }
785
786 let head_len = read_u32(&mut reader).ok()? as usize;
787 let root_len = read_u32(&mut reader).ok()? as usize;
788 let ignore_fingerprint_len = read_u32(&mut reader).ok()? as usize;
789 let max_file_size = read_u64(&mut reader).ok()?;
790 let file_count = read_u32(&mut reader).ok()? as usize;
791 if file_count > MAX_ENTRIES {
792 return None;
793 }
794
795 if !reader_has_remaining(&mut reader, postings_body_end, head_len).ok()? {
796 return None;
797 }
798 let mut head_bytes = vec![0u8; head_len];
799 reader.read_exact(&mut head_bytes).ok()?;
800 let git_head = String::from_utf8(head_bytes)
801 .ok()
802 .filter(|head| !head.is_empty());
803
804 if !reader_has_remaining(&mut reader, postings_body_end, root_len).ok()? {
805 return None;
806 }
807 let mut root_bytes = vec![0u8; root_len];
808 reader.read_exact(&mut root_bytes).ok()?;
809 let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
810 let project_root = current_canonical_root.to_path_buf();
811
812 if !reader_has_remaining(&mut reader, postings_body_end, ignore_fingerprint_len).ok()? {
813 return None;
814 }
815 let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
816 reader.read_exact(&mut ignore_fingerprint_bytes).ok()?;
817 let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
818 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
819 if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
820 return None;
821 }
822
823 let mut files = Vec::with_capacity(file_count);
824 let mut path_to_id = HashMap::new();
825 let mut unindexed_files = HashSet::new();
826
827 for file_id in 0..file_count {
828 if !reader_has_remaining(&mut reader, postings_body_end, MIN_FILE_ENTRY_BYTES).ok()? {
829 return None;
830 }
831 let mut unindexed = [0u8; 1];
832 reader.read_exact(&mut unindexed).ok()?;
833 let path_len = read_u32(&mut reader).ok()? as usize;
834 let size = read_u64(&mut reader).ok()?;
835 let secs = read_u64(&mut reader).ok()?;
836 let nanos = read_u32(&mut reader).ok()?;
837 let mut hash_bytes = [0u8; 32];
838 reader.read_exact(&mut hash_bytes).ok()?;
839 let content_hash = blake3::Hash::from_bytes(hash_bytes);
840 if nanos >= 1_000_000_000 {
841 return None;
842 }
843 if !reader_has_remaining(&mut reader, postings_body_end, path_len).ok()? {
844 return None;
845 }
846 let mut path_bytes = vec![0u8; path_len];
847 reader.read_exact(&mut path_bytes).ok()?;
848 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
849 let full_path = cached_path_under_root(&project_root, &relative_path)?;
850 let file_id_u32 = u32::try_from(file_id).ok()?;
851
852 files.push(FileEntry {
853 path: full_path.clone(),
854 size,
855 modified: UNIX_EPOCH + Duration::new(secs, nanos),
856 content_hash,
857 });
858 path_to_id.insert(full_path, file_id_u32);
859 if unindexed[0] == 1 {
860 unindexed_files.insert(file_id_u32);
861 }
862 }
863
864 if !reader_has_remaining(&mut reader, postings_body_end, 8).ok()? {
865 return None;
866 }
867 let postings_blob_len = read_u64(&mut reader).ok()?;
868 let postings_blob_start = reader.stream_position().ok()?;
869 let postings_blob_end = postings_blob_start.checked_add(postings_blob_len)?;
870 if postings_blob_end > postings_body_end || postings_blob_len % POSTING_BYTES as u64 != 0 {
871 return None;
872 }
873
874 let lookup_section_start = postings_section_end;
875 if lookup_section_start >= file_len {
876 return None;
877 }
878 let mut lookup_file = cache_file.try_clone().ok()?;
879 lookup_file
880 .seek(SeekFrom::Start(lookup_section_start))
881 .ok()?;
882 let mut lookup_bytes = Vec::new();
883 lookup_file.read_to_end(&mut lookup_bytes).ok()?;
884 if lookup_bytes.len() < 4 {
885 return None;
886 }
887 verify_crc32_bytes_slice(&lookup_bytes).ok()?;
888 let lookup_body_len = lookup_bytes.len().checked_sub(4)?;
889 let mut lookup_reader = BufReader::new(Cursor::new(&lookup_bytes));
890 let mut lookup_magic = [0u8; 8];
891 lookup_reader.read_exact(&mut lookup_magic).ok()?;
892 if &lookup_magic != LOOKUP_MAGIC {
893 return None;
894 }
895 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
896 return None;
897 }
898 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
899 if entry_count > MAX_ENTRIES {
900 return None;
901 }
902 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
903 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
904 if minimum_lookup_bytes > remaining_lookup {
905 return None;
906 }
907
908 let mut lookup = Vec::with_capacity(entry_count);
909 let mut previous_trigram = None;
910 for _ in 0..entry_count {
911 let trigram = read_u32(&mut lookup_reader).ok()?;
912 let offset = read_u64(&mut lookup_reader).ok()?;
913 let count = read_u32(&mut lookup_reader).ok()?;
914 if count as usize > MAX_ENTRIES {
915 return None;
916 }
917 if previous_trigram.is_some_and(|previous| previous >= trigram) {
918 return None;
919 }
920 previous_trigram = Some(trigram);
921 let bytes_len = (count as u64).checked_mul(POSTING_BYTES as u64)?;
922 let end = offset.checked_add(bytes_len)?;
923 if end > postings_blob_len {
924 return None;
925 }
926 lookup.push(LookupEntry {
927 trigram,
928 offset,
929 count,
930 });
931 }
932
933 let base = BasePostings {
934 file: Arc::new(cache_file),
935 postings_blob_start,
936 postings_blob_len,
937 lookup: Arc::new(lookup),
938 };
939
940 let (file_trigram_count, migrated_counts) = match read_file_trigram_count_extension(
941 &base,
942 postings_blob_end,
943 postings_body_end,
944 file_count,
945 ) {
946 Ok(Some(counts)) => (counts, false),
947 Ok(None) => (
948 compute_file_trigram_counts_from_base(&base, file_count).ok()?,
949 true,
950 ),
951 Err(_) => return None,
952 };
953
954 let mut index = SearchIndex {
955 base: Some(Arc::new(base)),
956 delta_postings: HashMap::new(),
957 delta_file_trigrams: HashMap::new(),
958 files: Arc::new(files),
959 path_to_id: Arc::new(path_to_id),
960 ready: false,
961 project_root,
962 git_head,
963 max_file_size,
964 ignore_rules_fingerprint: current_ignore_rules_fingerprint,
965 file_trigram_count: Arc::new(file_trigram_count),
966 unindexed_files: Arc::new(unindexed_files),
967 superseded: HashSet::new(),
968 base_file_count: u32::try_from(file_count).ok()?,
969 delta_packed_bytes: 0,
970 compaction_state: Arc::new(Mutex::new(CompactionState::default())),
971 };
972
973 if migrated_counts {
974 if let Ok(_lock) = CacheLock::acquire(cache_dir) {
975 let head = index.git_head.clone();
976 index.write_to_disk(cache_dir, head.as_deref());
977 }
978 }
979
980 Some(index)
981 }
982
983 pub fn stored_git_head(&self) -> Option<&str> {
984 self.git_head.as_deref()
985 }
986
    /// Overwrites the index's `ready` flag.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
990
991 pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
992 self.git_head = current_head;
993 verify_file_mtimes(self);
994 self.ready = true;
995 }
996
997 #[cfg(debug_assertions)]
998 #[doc(hidden)]
999 pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1000 self.verify_against_disk(current_head);
1001 }
1002
1003 pub(crate) fn rebuild_or_refresh(
1004 root: &Path,
1005 max_file_size: u64,
1006 current_head: Option<String>,
1007 baseline: Option<SearchIndex>,
1008 cache_dir: Option<&Path>,
1009 ) -> Self {
1010 if let Some(mut baseline) = baseline {
1011 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1012 baseline.max_file_size = max_file_size;
1013 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1014 if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1015 return match cache_dir {
1016 Some(cache_dir) => {
1017 SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1018 }
1019 None => SearchIndex::build_with_limit(root, max_file_size),
1020 };
1021 }
1022 baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1023
1024 if baseline.git_head == current_head || current_head.is_none() {
1025 baseline.git_head = current_head;
1032 verify_file_mtimes(&mut baseline);
1033 baseline.ready = true;
1034 return baseline;
1035 }
1036
1037 if let (Some(previous), Some(current)) =
1038 (baseline.git_head.clone(), current_head.clone())
1039 {
1040 let project_root = baseline.project_root.clone();
1041 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1042 baseline.git_head = Some(current);
1043 verify_file_mtimes(&mut baseline);
1044 baseline.ready = true;
1045 return baseline;
1046 }
1047 }
1048 }
1049
1050 match cache_dir {
1051 Some(cache_dir) => {
1052 SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1053 }
1054 None => SearchIndex::build_with_limit(root, max_file_size),
1055 }
1056 }
1057
1058 fn allocate_file_id_with_metadata(
1059 &mut self,
1060 path: &Path,
1061 metadata: SearchFileMetadata,
1062 ) -> Option<u32> {
1063 let file_id = u32::try_from(self.files.len()).ok()?;
1064 Arc::make_mut(&mut self.files).push(FileEntry {
1065 path: path.to_path_buf(),
1066 size: metadata.size,
1067 modified: metadata.modified,
1068 content_hash: cache_freshness::zero_hash(),
1069 });
1070 Arc::make_mut(&mut self.path_to_id).insert(path.to_path_buf(), file_id);
1071 ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
1072 Some(file_id)
1073 }
1074
1075 fn track_unindexed_file_with_metadata(
1076 &mut self,
1077 path: &Path,
1078 metadata: SearchFileMetadata,
1079 ) -> bool {
1080 let Some(file_id) = self.allocate_file_id_with_metadata(path, metadata) else {
1081 return false;
1082 };
1083 Arc::make_mut(&mut self.unindexed_files).insert(file_id);
1084 if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
1085 *count = 0;
1086 }
1087 true
1088 }
1089
1090 fn active_file_ids(&self) -> Vec<u32> {
1091 self.snapshot().active_file_ids()
1092 }
1093
/// Test-only shortcut: resolves `trigram` (with an optional posting filter)
/// against a snapshot of this index.
#[cfg(test)]
fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
    let snapshot = self.snapshot();
    snapshot.postings_for_trigram(trigram, filter)
}
1098
1099 fn update_compaction_flags(&mut self, changed_path: Option<&Path>) {
1100 let delta_files = self.delta_file_trigrams.len();
1101 let hard = delta_files >= DELTA_COMPACT_HARD_FILES
1102 || self.delta_packed_bytes >= DELTA_COMPACT_HARD_BYTES;
1103 let soft = delta_files >= DELTA_COMPACT_SOFT_FILES
1104 || self.delta_packed_bytes >= DELTA_COMPACT_SOFT_BYTES;
1105 if let Ok(mut state) = self.compaction_state.lock() {
1106 if state.running {
1107 if let Some(path) = changed_path {
1108 state.buffered_paths.push(path.to_path_buf());
1109 }
1110 if soft || hard {
1111 state.requested_again = true;
1112 }
1113 } else if hard || (soft && !state.requested_again) {
1114 state.requested_again = true;
1115 }
1116 }
1117 }
1118
1119 fn compaction_record_sources(
1120 &self,
1121 id_map: Arc<HashMap<u32, u32>>,
1122 ) -> Vec<Box<dyn PostingRecordSource>> {
1123 let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
1124 if let Some(base) = self.base.clone() {
1125 sources.push(Box::new(BaseRecordSource::new(
1126 base,
1127 Arc::clone(&id_map),
1128 Arc::new(self.superseded.clone()),
1129 )));
1130 }
1131
1132 let mut delta_records = Vec::new();
1133 for (&trigram, postings) in &self.delta_postings {
1134 for posting in postings {
1135 let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
1136 continue;
1137 };
1138 delta_records.push(SpillRecord {
1139 trigram,
1140 file_id: mapped_file_id,
1141 next_mask: posting.next_mask,
1142 loc_mask: posting.loc_mask,
1143 });
1144 }
1145 }
1146 if !delta_records.is_empty() {
1147 delta_records.sort_unstable_by_key(|record| (record.trigram, record.file_id));
1148 sources.push(Box::new(VecRecordSource::new(delta_records)));
1149 }
1150 sources
1151 }
1152}
1153
1154impl BasePostings {
1155 fn lookup_entry(&self, trigram: u32) -> Option<LookupEntry> {
1156 self.lookup
1157 .binary_search_by_key(&trigram, |entry| entry.trigram)
1158 .ok()
1159 .and_then(|index| self.lookup.get(index).copied())
1160 }
1161
1162 fn read_postings(&self, entry: LookupEntry) -> std::io::Result<Vec<Posting>> {
1163 let bytes_len = (entry.count as usize)
1164 .checked_mul(POSTING_BYTES)
1165 .ok_or_else(|| std::io::Error::other("posting list too large"))?;
1166 let offset = self
1167 .postings_blob_start
1168 .checked_add(entry.offset)
1169 .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1170 let end = entry
1171 .offset
1172 .checked_add(bytes_len as u64)
1173 .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1174 if end > self.postings_blob_len {
1175 return Err(std::io::Error::other("posting list exceeds blob"));
1176 }
1177 let mut bytes = vec![0u8; bytes_len];
1178 pread_exact(&self.file, offset, &mut bytes)?;
1179 let mut postings = Vec::with_capacity(entry.count as usize);
1180 for chunk in bytes.chunks_exact(POSTING_BYTES) {
1181 postings.push(Posting {
1182 file_id: u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]),
1183 next_mask: chunk[4],
1184 loc_mask: chunk[5],
1185 });
1186 }
1187 Ok(postings)
1188 }
1189}
1190
1191impl SearchIndexSnapshot {
1192 pub fn grep(
1193 &self,
1194 pattern: &str,
1195 case_sensitive: bool,
1196 include: &[String],
1197 exclude: &[String],
1198 search_root: &Path,
1199 max_results: usize,
1200 ) -> GrepResult {
1201 match pattern_compile::compile(
1202 pattern,
1203 CompileOpts {
1204 case_insensitive: !case_sensitive,
1205 ..CompileOpts::default()
1206 },
1207 ) {
1208 CompileResult::Ok(compiled) => {
1209 self.search_grep(&compiled, include, exclude, search_root, max_results)
1210 }
1211 CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
1212 self.empty_grep_result()
1213 }
1214 }
1215 }
1216
1217 pub fn search_grep(
1218 &self,
1219 pattern: &CompiledPattern,
1220 include: &[String],
1221 exclude: &[String],
1222 search_root: &Path,
1223 max_results: usize,
1224 ) -> GrepResult {
1225 let matcher = match pattern {
1226 CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
1227 CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
1228 };
1229
1230 let filters = match build_path_filters(include, exclude) {
1231 Ok(filters) => filters,
1232 Err(_) => PathFilters::default(),
1233 };
1234 let search_root = canonicalize_or_normalize(search_root);
1235
1236 let raw_pattern = pattern.raw_pattern_for_trigrams();
1237 let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
1238 RegexQuery::default()
1239 } else {
1240 decompose_regex(&raw_pattern)
1241 };
1242 let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
1243 let candidate_ids = self.candidates(&query);
1244
1245 let candidate_files: Vec<&FileEntry> = candidate_ids
1246 .into_iter()
1247 .filter_map(|file_id| self.files.get(file_id as usize))
1248 .filter(|file| !file.path.as_os_str().is_empty())
1249 .filter(|file| is_within_search_root(&search_root, &file.path))
1250 .filter(|file| filters.matches(&self.project_root, &file.path))
1251 .collect();
1252
1253 let total_matches = AtomicUsize::new(0);
1254 let files_searched = AtomicUsize::new(0);
1255 let files_with_matches = AtomicUsize::new(0);
1256 let truncated = AtomicBool::new(false);
1257 let engine_capped = AtomicBool::new(false);
1258 let stop_after = max_results.saturating_mul(2);
1259 let stop_scan = Arc::new(AtomicBool::new(false));
1260
1261 let mut matches = if candidate_files.len() > 10 {
1262 candidate_files
1263 .par_iter()
1264 .map(|file| {
1265 if grep_scan_should_stop(
1266 Some(&stop_scan),
1267 &truncated,
1268 &total_matches,
1269 stop_after,
1270 ) {
1271 engine_capped.store(true, Ordering::Relaxed);
1272 return Vec::new();
1273 }
1274 search_candidate_file(
1275 file,
1276 &matcher,
1277 max_results,
1278 stop_after,
1279 &total_matches,
1280 &files_searched,
1281 &files_with_matches,
1282 &truncated,
1283 &engine_capped,
1284 Some(&stop_scan),
1285 )
1286 })
1287 .reduce(Vec::new, |mut left, mut right| {
1288 left.append(&mut right);
1292 left
1293 })
1294 } else {
1295 let mut matches = Vec::new();
1296 for file in candidate_files {
1297 matches.extend(search_candidate_file(
1298 file,
1299 &matcher,
1300 max_results,
1301 stop_after,
1302 &total_matches,
1303 &files_searched,
1304 &files_with_matches,
1305 &truncated,
1306 &engine_capped,
1307 None,
1308 ));
1309
1310 if should_stop_search(&truncated, &total_matches, stop_after) {
1311 engine_capped.store(true, Ordering::Relaxed);
1312 break;
1313 }
1314 }
1315 matches
1316 };
1317
1318 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, &self.project_root, |path| {
1319 self.path_to_id
1320 .get(path)
1321 .and_then(|file_id| self.files.get(*file_id as usize))
1322 .map(|file| file.modified)
1323 });
1324
1325 let matches = matches
1326 .into_iter()
1327 .map(|matched| GrepMatch {
1328 file: matched.file.as_ref().clone(),
1329 line: matched.line,
1330 column: matched.column,
1331 line_text: matched.line_text,
1332 match_text: matched.match_text,
1333 })
1334 .collect();
1335
1336 GrepResult {
1337 total_matches: total_matches.load(Ordering::Relaxed),
1338 matches,
1339 files_searched: files_searched.load(Ordering::Relaxed),
1340 files_with_matches: files_with_matches.load(Ordering::Relaxed),
1341 index_status: if self.ready {
1342 IndexStatus::Ready
1343 } else {
1344 IndexStatus::Building
1345 },
1346 truncated: truncated.load(Ordering::Relaxed),
1347 fully_degraded,
1348 engine_capped: engine_capped.load(Ordering::Relaxed),
1349 walk_truncated: false,
1350 }
1351 }
1352
1353 fn empty_grep_result(&self) -> GrepResult {
1354 GrepResult {
1355 matches: Vec::new(),
1356 total_matches: 0,
1357 files_searched: 0,
1358 files_with_matches: 0,
1359 index_status: if self.ready {
1360 IndexStatus::Ready
1361 } else {
1362 IndexStatus::Building
1363 },
1364 truncated: false,
1365 fully_degraded: false,
1366 engine_capped: false,
1367 walk_truncated: false,
1368 }
1369 }
1370
1371 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
1372 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
1373 Ok(filters) => filters,
1374 Err(_) => return Vec::new(),
1375 };
1376 let search_root = canonicalize_or_normalize(search_root);
1377 let mut entries = self
1378 .files
1379 .iter()
1380 .filter(|file| !file.path.as_os_str().is_empty())
1381 .filter(|file| is_within_search_root(&search_root, &file.path))
1382 .filter(|file| filters.matches(&self.project_root, &file.path))
1383 .map(|file| (file.path.clone(), file.modified))
1384 .collect::<Vec<_>>();
1385
1386 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
1387 right_mtime
1388 .cmp(left_mtime)
1389 .then_with(|| left_path.cmp(right_path))
1390 });
1391
1392 entries.into_iter().map(|(path, _)| path).collect()
1393 }
1394
1395 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
1396 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
1397 return self.active_file_ids();
1398 }
1399
1400 let mut and_trigrams = query.and_trigrams.clone();
1401 and_trigrams.sort_unstable_by_key(|trigram| self.posting_count(*trigram));
1402
1403 let mut current: Option<Vec<u32>> = None;
1404
1405 for trigram in and_trigrams {
1406 let filter = query.and_filters.get(&trigram).copied();
1407 let matches = self.postings_for_trigram(trigram, filter);
1408 current = Some(match current.take() {
1409 Some(existing) => intersect_sorted_ids(&existing, &matches),
1410 None => matches,
1411 });
1412
1413 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
1414 break;
1415 }
1416 }
1417
1418 let mut current = current.unwrap_or_else(|| self.active_file_ids());
1419
1420 for (index, group) in query.or_groups.iter().enumerate() {
1421 let mut group_matches = Vec::new();
1422 let filters = query.or_filters.get(index);
1423
1424 for trigram in group {
1425 let filter = filters.and_then(|filters| filters.get(trigram).copied());
1426 let matches = self.postings_for_trigram(*trigram, filter);
1427 if group_matches.is_empty() {
1428 group_matches = matches;
1429 } else {
1430 group_matches = union_sorted_ids(&group_matches, &matches);
1431 }
1432 }
1433
1434 current = intersect_sorted_ids(¤t, &group_matches);
1435 if current.is_empty() {
1436 break;
1437 }
1438 }
1439
1440 let mut unindexed = self
1441 .unindexed_files
1442 .iter()
1443 .copied()
1444 .filter(|file_id| self.is_active_file(*file_id))
1445 .collect::<Vec<_>>();
1446 if !unindexed.is_empty() {
1447 unindexed.sort_unstable();
1448 current = union_sorted_ids(¤t, &unindexed);
1449 }
1450
1451 current
1452 }
1453
1454 fn posting_count(&self, trigram: u32) -> usize {
1455 let base_count = self
1456 .base
1457 .as_ref()
1458 .and_then(|base| base.lookup_entry(trigram))
1459 .map_or(0usize, |entry| entry.count as usize);
1460 base_count.saturating_add(self.delta_postings.get(&trigram).map_or(0usize, Vec::len))
1461 }
1462
1463 fn active_file_ids(&self) -> Vec<u32> {
1464 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1465 ids.retain(|file_id| self.is_active_file(*file_id));
1466 ids.sort_unstable();
1467 ids
1468 }
1469
1470 fn is_active_file(&self, file_id: u32) -> bool {
1471 if self.superseded.contains(&file_id) {
1472 return false;
1473 }
1474 self.files
1475 .get(file_id as usize)
1476 .map(|file| !file.path.as_os_str().is_empty())
1477 .unwrap_or(false)
1478 }
1479
1480 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1481 let mut matches = Vec::new();
1482
1483 if let Some(base_entry) = self
1484 .base
1485 .as_ref()
1486 .and_then(|base| base.lookup_entry(trigram))
1487 {
1488 if let Some(base) = &self.base {
1489 if let Ok(postings) = base.read_postings(base_entry) {
1490 matches.reserve(postings.len());
1491 for posting in postings {
1492 if self.superseded.contains(&posting.file_id) {
1493 continue;
1494 }
1495 if !posting_matches_filter(&posting, filter) {
1496 continue;
1497 }
1498 if self.is_active_file(posting.file_id) {
1499 matches.push(posting.file_id);
1500 }
1501 }
1502 }
1503 }
1504 }
1505
1506 if let Some(postings) = self.delta_postings.get(&trigram) {
1507 matches.reserve(postings.len());
1508 for posting in postings {
1509 if !posting_matches_filter(posting, filter) {
1510 continue;
1511 }
1512 if self.is_active_file(posting.file_id) {
1513 matches.push(posting.file_id);
1514 }
1515 }
1516 }
1517
1518 if matches.len() > 1 {
1519 matches.sort_unstable();
1520 matches.dedup();
1521 }
1522 matches
1523 }
1524}
1525
1526fn posting_matches_filter(posting: &Posting, filter: Option<PostingFilter>) -> bool {
1527 if let Some(filter) = filter {
1528 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1531 return false;
1532 }
1533 }
1537 true
1538}
1539
1540fn search_candidate_file(
1541 file: &FileEntry,
1542 matcher: &SearchMatcher,
1543 max_results: usize,
1544 stop_after: usize,
1545 total_matches: &AtomicUsize,
1546 files_searched: &AtomicUsize,
1547 files_with_matches: &AtomicUsize,
1548 truncated: &AtomicBool,
1549 engine_capped: &AtomicBool,
1550 stop_scan: Option<&Arc<AtomicBool>>,
1551) -> Vec<SharedGrepMatch> {
1552 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1553 engine_capped.store(true, Ordering::Relaxed);
1554 return Vec::new();
1555 }
1556
1557 let content = match read_indexed_file_bytes(&file.path) {
1558 Some(content) => content,
1559 None => return Vec::new(),
1560 };
1561 if is_binary_bytes(&content) {
1568 return Vec::new();
1569 }
1570 files_searched.fetch_add(1, Ordering::Relaxed);
1571
1572 let shared_path = Arc::new(file.path.clone());
1573 let mut matches = Vec::new();
1574 let mut line_starts = None;
1575 let mut seen_lines = HashSet::new();
1576 let mut matched_this_file = false;
1577
1578 match matcher {
1579 SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1580 let needle = &literal.needle;
1581 let finder = memchr::memmem::Finder::new(needle);
1582 let mut start = 0;
1583
1584 while let Some(position) = finder.find(&content[start..]) {
1585 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1586 engine_capped.store(true, Ordering::Relaxed);
1587 break;
1588 }
1589
1590 let offset = start + position;
1591 start = offset + 1;
1592
1593 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1594 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1595 if !seen_lines.insert(line) {
1596 continue;
1597 }
1598
1599 matched_this_file = true;
1600 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1601 if match_number > max_results {
1602 truncated.store(true, Ordering::Relaxed);
1603 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1604 break;
1605 }
1606
1607 let end = offset + needle.len();
1608 matches.push(SharedGrepMatch {
1609 file: shared_path.clone(),
1610 line,
1611 column,
1612 line_text,
1613 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1614 });
1615 }
1616 }
1617 SearchMatcher::Literal(literal) => {
1618 let needle = &literal.needle;
1619 let search_content = content.to_ascii_lowercase();
1620 let finder = memchr::memmem::Finder::new(needle);
1621 let mut start = 0;
1622
1623 while let Some(position) = finder.find(&search_content[start..]) {
1624 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1625 engine_capped.store(true, Ordering::Relaxed);
1626 break;
1627 }
1628
1629 let offset = start + position;
1630 start = offset + 1;
1631
1632 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1633 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1634 if !seen_lines.insert(line) {
1635 continue;
1636 }
1637
1638 matched_this_file = true;
1639 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1640 if match_number > max_results {
1641 truncated.store(true, Ordering::Relaxed);
1642 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1643 break;
1644 }
1645
1646 let end = offset + needle.len();
1647 matches.push(SharedGrepMatch {
1648 file: shared_path.clone(),
1649 line,
1650 column,
1651 line_text,
1652 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1653 });
1654 }
1655 }
1656 SearchMatcher::Regex(regex) => {
1657 for matched in regex.find_iter(&content) {
1658 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1659 engine_capped.store(true, Ordering::Relaxed);
1660 break;
1661 }
1662
1663 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1664 let (line, column, line_text) =
1665 line_details_bytes(&content, line_starts, matched.start());
1666 if !seen_lines.insert(line) {
1667 continue;
1668 }
1669
1670 matched_this_file = true;
1671 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1672 if match_number > max_results {
1673 truncated.store(true, Ordering::Relaxed);
1674 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1675 break;
1676 }
1677
1678 matches.push(SharedGrepMatch {
1679 file: shared_path.clone(),
1680 line,
1681 column,
1682 line_text,
1683 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1684 });
1685 }
1686 }
1687 }
1688
1689 if matched_this_file {
1690 files_with_matches.fetch_add(1, Ordering::Relaxed);
1691 }
1692
1693 matches
1694}
1695
/// Returns `true` once the result set has been truncated and the number of
/// matches counted so far has reached `stop_after`.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1703
1704fn grep_scan_should_stop(
1705 stop_scan: Option<&Arc<AtomicBool>>,
1706 truncated: &AtomicBool,
1707 total_matches: &AtomicUsize,
1708 stop_after: usize,
1709) -> bool {
1710 stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1711 || should_stop_search(truncated, total_matches, stop_after)
1712}
1713
/// Raises the shared stop flag when the number of matches counted so far has
/// reached `stop_after`. Does nothing when no flag is supplied.
fn signal_grep_scan_cap(
    stop_scan: Option<&Arc<AtomicBool>>,
    total_matches: &AtomicUsize,
    stop_after: usize,
) {
    let Some(flag) = stop_scan else {
        return;
    };
    if total_matches.load(Ordering::Relaxed) < stop_after {
        return;
    }
    flag.store(true, Ordering::Relaxed);
}
1725
1726fn search_file_metadata(metadata: &fs::Metadata) -> SearchFileMetadata {
1727 SearchFileMetadata {
1728 size: metadata.len(),
1729 modified: metadata.modified().unwrap_or(UNIX_EPOCH),
1730 }
1731}
1732
1733fn metadata_for_indexed_content(path: &Path, size_hint: u64) -> SearchFileMetadata {
1734 fs::metadata(path)
1735 .ok()
1736 .map(|metadata| search_file_metadata(&metadata))
1737 .unwrap_or(SearchFileMetadata {
1738 size: size_hint,
1739 modified: UNIX_EPOCH,
1740 })
1741}
1742
1743fn prepare_search_path(path: &Path, max_file_size: u64) -> PreparedSearchPath {
1744 let metadata = match fs::metadata(path) {
1745 Ok(metadata) if metadata.is_file() => search_file_metadata(&metadata),
1746 _ => return PreparedSearchPath::Skipped,
1747 };
1748
1749 if is_binary_path(path, metadata.size) || metadata.size > max_file_size {
1750 return PreparedSearchPath::Unindexed(metadata);
1751 }
1752
1753 let content = match fs::read(path) {
1754 Ok(content) => content,
1755 Err(_) => return PreparedSearchPath::Skipped,
1756 };
1757
1758 if is_binary_bytes(&content) {
1759 return PreparedSearchPath::Unindexed(metadata);
1760 }
1761
1762 PreparedSearchPath::Indexed(PreparedIndexedFile {
1763 metadata,
1764 content_hash: cache_freshness::hash_bytes(&content),
1765 trigram_map: trigram_filter_map(&content, true),
1766 })
1767}
1768
/// Number of worker threads for index builds: half of the available
/// parallelism, rounded up, clamped to the range 1..=8. When the parallelism
/// cannot be queried, one core is assumed.
fn search_index_build_pool_size() -> usize {
    let cores = match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    };
    cores.div_ceil(2).clamp(1, 8)
}
1778
/// One `(trigram, file)` posting in the flat form used while building and
/// merging the index.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SpillRecord {
    /// Trigram key this posting belongs to.
    trigram: u32,
    /// Id of the file containing the trigram.
    file_id: u32,
    /// Mask byte carried over unchanged from the posting or trigram filter.
    next_mask: u8,
    /// Second mask byte, likewise carried over unchanged.
    loc_mask: u8,
}
1786
1787struct CacheWritePlan {
1788 project_root: PathBuf,
1789 git_head: Option<String>,
1790 ignore_fingerprint: String,
1791 max_file_size: u64,
1792 files: Vec<FileEntry>,
1793 path_to_id: HashMap<PathBuf, u32>,
1794 unindexed_files: HashSet<u32>,
1795 file_trigram_count: Vec<u32>,
1796 id_map: Arc<HashMap<u32, u32>>,
1797}
1798
1799impl CacheWritePlan {
1800 fn from_index(index: &SearchIndex, git_head: Option<&str>) -> Option<Self> {
1801 let active_ids = index.active_file_ids();
1802 let mut id_map = HashMap::with_capacity(active_ids.len());
1803 for (new_id, old_id) in active_ids.iter().enumerate() {
1804 let new_id = u32::try_from(new_id).ok()?;
1805 id_map.insert(*old_id, new_id);
1806 }
1807
1808 let mut files = Vec::with_capacity(active_ids.len());
1809 let mut path_to_id = HashMap::with_capacity(active_ids.len());
1810 let mut unindexed_files = HashSet::new();
1811 let mut file_trigram_count = Vec::with_capacity(active_ids.len());
1812 for old_id in active_ids {
1813 let new_id = *id_map.get(&old_id)?;
1814 let file = index.files.get(old_id as usize)?.clone();
1815 if file.path.as_os_str().is_empty() {
1816 continue;
1817 }
1818 path_to_id.insert(file.path.clone(), new_id);
1819 if index.unindexed_files.contains(&old_id) {
1820 unindexed_files.insert(new_id);
1821 }
1822 file_trigram_count.push(
1823 index
1824 .file_trigram_count
1825 .get(old_id as usize)
1826 .copied()
1827 .unwrap_or(0),
1828 );
1829 files.push(file);
1830 }
1831
1832 Some(Self {
1833 project_root: index.project_root.clone(),
1834 git_head: git_head.map(ToOwned::to_owned),
1835 ignore_fingerprint: if index.ignore_rules_fingerprint.is_empty() {
1836 ignore_rules_fingerprint(&index.project_root)
1837 } else {
1838 index.ignore_rules_fingerprint.clone()
1839 },
1840 max_file_size: index.max_file_size,
1841 files,
1842 path_to_id,
1843 unindexed_files,
1844 file_trigram_count,
1845 id_map: Arc::new(id_map),
1846 })
1847 }
1848}
1849
1850trait PostingRecordSource {
1851 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>>;
1852}
1853
1854struct VecRecordSource {
1855 records: Vec<SpillRecord>,
1856 index: usize,
1857}
1858
1859impl VecRecordSource {
1860 fn new(records: Vec<SpillRecord>) -> Self {
1861 Self { records, index: 0 }
1862 }
1863}
1864
1865impl PostingRecordSource for VecRecordSource {
1866 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1867 let record = self.records.get(self.index).copied();
1868 if record.is_some() {
1869 self.index += 1;
1870 }
1871 Ok(record)
1872 }
1873}
1874
1875struct SpillSegmentSource {
1876 reader: BufReader<File>,
1877 remaining_records: u64,
1878 current_trigram: u32,
1879 remaining_in_group: u32,
1880}
1881
1882impl SpillSegmentSource {
1883 fn open(path: &Path) -> std::io::Result<Self> {
1884 let mut reader = BufReader::new(File::open(path)?);
1885 let mut magic = [0u8; 8];
1886 reader.read_exact(&mut magic)?;
1887 if &magic != SPILL_MAGIC {
1888 return Err(std::io::Error::other("invalid search spill magic"));
1889 }
1890 if read_u32(&mut reader)? != INDEX_VERSION {
1891 return Err(std::io::Error::other("invalid search spill version"));
1892 }
1893 let remaining_records = read_u64(&mut reader)?;
1894 Ok(Self {
1895 reader,
1896 remaining_records,
1897 current_trigram: 0,
1898 remaining_in_group: 0,
1899 })
1900 }
1901}
1902
1903impl PostingRecordSource for SpillSegmentSource {
1904 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1905 if self.remaining_records == 0 {
1906 return Ok(None);
1907 }
1908 if self.remaining_in_group == 0 {
1909 self.current_trigram = read_u32(&mut self.reader)?;
1910 self.remaining_in_group = read_u32(&mut self.reader)?;
1911 if self.remaining_in_group == 0 {
1912 return Err(std::io::Error::other("empty search spill group"));
1913 }
1914 }
1915 let mut file_id = [0u8; 4];
1916 self.reader.read_exact(&mut file_id)?;
1917 let mut masks = [0u8; 2];
1918 self.reader.read_exact(&mut masks)?;
1919 self.remaining_in_group -= 1;
1920 self.remaining_records -= 1;
1921 Ok(Some(SpillRecord {
1922 trigram: self.current_trigram,
1923 file_id: u32::from_le_bytes(file_id),
1924 next_mask: masks[0],
1925 loc_mask: masks[1],
1926 }))
1927 }
1928}
1929
1930struct BaseRecordSource {
1931 base: Arc<BasePostings>,
1932 id_map: Arc<HashMap<u32, u32>>,
1933 superseded: Arc<HashSet<u32>>,
1934 lookup_index: usize,
1935 current: Vec<SpillRecord>,
1936 current_index: usize,
1937}
1938
1939impl BaseRecordSource {
1940 fn new(
1941 base: Arc<BasePostings>,
1942 id_map: Arc<HashMap<u32, u32>>,
1943 superseded: Arc<HashSet<u32>>,
1944 ) -> Self {
1945 Self {
1946 base,
1947 id_map,
1948 superseded,
1949 lookup_index: 0,
1950 current: Vec::new(),
1951 current_index: 0,
1952 }
1953 }
1954
1955 fn load_next_group(&mut self) -> std::io::Result<bool> {
1956 while let Some(entry) = self.base.lookup.get(self.lookup_index).copied() {
1957 self.lookup_index += 1;
1958 let postings = self.base.read_postings(entry)?;
1959 self.current.clear();
1960 self.current_index = 0;
1961 for posting in postings {
1962 if self.superseded.contains(&posting.file_id) {
1963 continue;
1964 }
1965 let Some(mapped_file_id) = self.id_map.get(&posting.file_id).copied() else {
1966 continue;
1967 };
1968 self.current.push(SpillRecord {
1969 trigram: entry.trigram,
1970 file_id: mapped_file_id,
1971 next_mask: posting.next_mask,
1972 loc_mask: posting.loc_mask,
1973 });
1974 }
1975 if !self.current.is_empty() {
1976 return Ok(true);
1977 }
1978 }
1979 Ok(false)
1980 }
1981}
1982
1983impl PostingRecordSource for BaseRecordSource {
1984 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1985 if self.current_index >= self.current.len() && !self.load_next_group()? {
1986 return Ok(None);
1987 }
1988 let record = self.current[self.current_index];
1989 self.current_index += 1;
1990 Ok(Some(record))
1991 }
1992}
1993
1994#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1995struct HeapItem {
1996 record: SpillRecord,
1997 source_index: usize,
1998}
1999
2000impl Ord for HeapItem {
2001 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
2002 other
2003 .record
2004 .trigram
2005 .cmp(&self.record.trigram)
2006 .then_with(|| other.record.file_id.cmp(&self.record.file_id))
2007 .then_with(|| other.source_index.cmp(&self.source_index))
2008 }
2009}
2010
2011impl PartialOrd for HeapItem {
2012 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
2013 Some(self.cmp(other))
2014 }
2015}
2016
2017fn build_streaming_index(
2018 root: &Path,
2019 max_file_size: u64,
2020 cache_dir: &Path,
2021) -> std::io::Result<(SearchIndex, usize)> {
2022 fs::create_dir_all(cache_dir)?;
2023 sweep_stale_search_build_dirs(cache_dir);
2024 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
2025 let ignore_fingerprint = ignore_rules_fingerprint(&project_root);
2026 let filters = PathFilters::default();
2027 let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
2028 let pool_size = search_index_build_pool_size();
2029 let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
2030 let pool = rayon::ThreadPoolBuilder::new()
2031 .num_threads(pool_size)
2032 .thread_name(|index| format!("aft-search-build-{index}"))
2033 .stack_size(8 * 1024 * 1024)
2034 .build()
2035 .ok();
2036
2037 let spill_dir = create_spill_dir(cache_dir)?;
2038 let mut spill_paths = Vec::new();
2039 let mut spill_seq = 0usize;
2040 let mut block: Vec<SpillRecord> = Vec::new();
2041 let mut files = Vec::new();
2042 let mut path_to_id = HashMap::new();
2043 let mut unindexed_files = HashSet::new();
2044 let mut file_trigram_count = Vec::new();
2045 let mut indexed = 0usize;
2046
2047 let build_result = (|| -> std::io::Result<BasePostings> {
2048 for chunk in paths.chunks(chunk_size) {
2049 let prepare_chunk = || -> Vec<PreparedSearchPath> {
2050 chunk
2051 .par_iter()
2052 .map(|path| prepare_search_path(path, max_file_size))
2053 .collect()
2054 };
2055 let prepared = match &pool {
2056 Some(pool) => pool.install(prepare_chunk),
2057 None => prepare_chunk(),
2058 };
2059
2060 for (path, prepared) in chunk.iter().zip(prepared) {
2061 match prepared {
2062 PreparedSearchPath::Indexed(file) => {
2063 let file_id = u32::try_from(files.len())
2064 .map_err(|_| std::io::Error::other("too many files to index"))?;
2065 files.push(FileEntry {
2066 path: path.clone(),
2067 size: file.metadata.size,
2068 modified: file.metadata.modified,
2069 content_hash: file.content_hash,
2070 });
2071 path_to_id.insert(path.clone(), file_id);
2072 file_trigram_count.push(file.trigram_map.len() as u32);
2073 for (trigram, filter) in file.trigram_map {
2074 block.push(SpillRecord {
2075 trigram,
2076 file_id,
2077 next_mask: filter.next_mask,
2078 loc_mask: filter.loc_mask,
2079 });
2080 }
2081 indexed += 1;
2082 }
2083 PreparedSearchPath::Unindexed(metadata) => {
2084 let file_id = u32::try_from(files.len())
2085 .map_err(|_| std::io::Error::other("too many files to index"))?;
2086 files.push(FileEntry {
2087 path: path.clone(),
2088 size: metadata.size,
2089 modified: metadata.modified,
2090 content_hash: cache_freshness::zero_hash(),
2091 });
2092 path_to_id.insert(path.clone(), file_id);
2093 unindexed_files.insert(file_id);
2094 file_trigram_count.push(0);
2095 indexed += 1;
2096 }
2097 PreparedSearchPath::Skipped => {}
2098 }
2099
2100 let block_bytes = block.len().saturating_mul(SPILL_RECORD_ESTIMATED_BYTES);
2101 if block_bytes >= SPIMI_SOFT_LIMIT_BYTES || block_bytes >= SPIMI_HARD_LIMIT_BYTES {
2102 let path = flush_spill_segment(&spill_dir, spill_seq, &mut block)?;
2103 spill_paths.push(path);
2104 spill_seq += 1;
2105 }
2106 }
2107 }
2108
2109 block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2110 let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
2111 for path in &spill_paths {
2112 sources.push(Box::new(SpillSegmentSource::open(path)?));
2113 }
2114 if !block.is_empty() {
2115 sources.push(Box::new(VecRecordSource::new(std::mem::take(&mut block))));
2116 }
2117
2118 let plan = CacheWritePlan {
2119 project_root: project_root.clone(),
2120 git_head: current_git_head(&project_root),
2121 ignore_fingerprint: ignore_fingerprint.clone(),
2122 max_file_size,
2123 files: files.clone(),
2124 path_to_id: path_to_id.clone(),
2125 unindexed_files: unindexed_files.clone(),
2126 file_trigram_count: file_trigram_count.clone(),
2127 id_map: Arc::new(
2128 (0..files.len())
2129 .filter_map(|id| {
2130 let id = u32::try_from(id).ok()?;
2131 Some((id, id))
2132 })
2133 .collect(),
2134 ),
2135 };
2136 write_cache_file_from_sources(cache_dir, &plan, &mut sources)
2137 })();
2138
2139 let _ = fs::remove_dir_all(&spill_dir);
2140 let base = build_result?;
2141 let base_file_count =
2142 u32::try_from(files.len()).map_err(|_| std::io::Error::other("too many files to index"))?;
2143 let git_head = current_git_head(&project_root);
2144 let index = SearchIndex {
2145 base: Some(Arc::new(base)),
2146 delta_postings: HashMap::new(),
2147 delta_file_trigrams: HashMap::new(),
2148 files: Arc::new(files),
2149 path_to_id: Arc::new(path_to_id),
2150 ready: false,
2151 project_root,
2152 git_head,
2153 max_file_size,
2154 ignore_rules_fingerprint: ignore_fingerprint,
2155 file_trigram_count: Arc::new(file_trigram_count),
2156 unindexed_files: Arc::new(unindexed_files),
2157 superseded: HashSet::new(),
2158 base_file_count,
2159 delta_packed_bytes: 0,
2160 compaction_state: Arc::new(Mutex::new(CompactionState::default())),
2161 };
2162 Ok((index, indexed))
2163}
2164
/// Writes a complete `cache.bin` into `cache_dir` from already-sorted posting
/// sources and returns a [`BasePostings`] handle onto the freshly written file.
///
/// The file is first written to a uniquely named `cache.bin.tmp.<pid>.<nanos>`
/// sibling and then renamed over `cache.bin`. The layout written here is:
///
/// 1. `CACHE_MAGIC`, `INDEX_VERSION`, and a `u64` length of the postings section
///    (written as `0` first and patched once the section is complete);
/// 2. the postings section: header bytes from `build_postings_header_bytes`, a
///    `u64` postings-blob length (also patched), the merged postings blob, the
///    file-trigram-count extension, and a little-endian CRC32 over everything in
///    the section that precedes the checksum;
/// 3. the lookup section produced by `build_lookup_section_bytes`.
///
/// # Errors
///
/// Any I/O or size-conversion error is returned as-is; on error the temporary
/// file is removed on a best-effort basis.
fn write_cache_file_from_sources(
    cache_dir: &Path,
    plan: &CacheWritePlan,
    sources: &mut [Box<dyn PostingRecordSource>],
) -> std::io::Result<BasePostings> {
    fs::create_dir_all(cache_dir)?;
    // NOTE(review): this removes every `search-build.tmp.*` directory under
    // `cache_dir`. If the caller's spill directory was made by
    // `create_spill_dir(cache_dir)`, that would include the directory backing
    // `sources` — confirm the spill segments are already open (or live
    // elsewhere) before this runs.
    sweep_stale_search_build_dirs(cache_dir);
    let cache_path = cache_dir.join("cache.bin");
    let tmp_cache = cache_dir.join(format!(
        "cache.bin.tmp.{}.{}",
        std::process::id(),
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or(Duration::ZERO)
            .as_nanos()
    ));

    // The whole write runs inside a closure so that a single cleanup step
    // below can delete the temp file on any early `?` return.
    let write_result = (|| -> std::io::Result<BasePostings> {
        // `create_new` fails if the temp name already exists rather than
        // truncating a file that belongs to someone else.
        let raw = OpenOptions::new()
            .write(true)
            .create_new(true)
            .open(&tmp_cache)?;
        let mut writer = BufWriter::new(raw);
        write_u32(&mut writer, CACHE_MAGIC)?;
        write_u32(&mut writer, INDEX_VERSION)?;
        // Placeholder for the total postings-section length; patched later.
        let postings_len_patch = writer.stream_position()?;
        write_u64(&mut writer, 0)?;

        let postings_section_start = writer.stream_position()?;
        let postings_header = build_postings_header_bytes(plan)?;
        writer.write_all(&postings_header)?;
        // Placeholder for the postings-blob length; patched after the merge.
        let postings_blob_len_patch = writer.stream_position()?;
        write_u64(&mut writer, 0)?;
        let postings_blob_start = writer.stream_position()?;

        let (lookup_entries, postings_blob_len) = merge_sources_to_writer(sources, &mut writer)?;
        let extension = build_file_trigram_count_extension(&plan.file_trigram_count)?;
        writer.write_all(&extension)?;
        // Everything from `postings_section_start` up to here is checksummed.
        let postings_crc_end = writer.stream_position()?;

        // The blob length lies inside the checksummed range, so it has to be
        // patched and flushed to disk before the CRC is computed.
        writer.flush()?;
        writer.seek(SeekFrom::Start(postings_blob_len_patch))?;
        write_u64(&mut writer, postings_blob_len)?;
        writer.flush()?;

        // The checksum is computed by re-reading the temp file from disk.
        let checksum = crc32_file_range(
            &tmp_cache,
            postings_section_start,
            postings_crc_end.saturating_sub(postings_section_start),
        )?;
        writer.seek(SeekFrom::Start(postings_crc_end))?;
        writer.write_all(&checksum.to_le_bytes())?;
        let postings_section_end = writer.stream_position()?;
        // The section length field precedes the checksummed range, so it can
        // be patched after the CRC without invalidating it.
        let postings_len_total = postings_section_end.saturating_sub(postings_section_start);
        writer.seek(SeekFrom::Start(postings_len_patch))?;
        write_u64(&mut writer, postings_len_total)?;
        writer.seek(SeekFrom::Start(postings_section_end))?;

        let lookup_blob = build_lookup_section_bytes(&lookup_entries)?;
        writer.write_all(&lookup_blob)?;
        writer.flush()?;
        // Sync file contents before the rename publishes them.
        writer.get_ref().sync_all()?;
        drop(writer);

        fs::rename(&tmp_cache, &cache_path)?;
        sync_parent_dir(&cache_path);
        let file = open_cache_file_read(&cache_path)?;
        Ok(BasePostings {
            file: Arc::new(file),
            postings_blob_start,
            postings_blob_len,
            lookup: Arc::new(lookup_entries),
        })
    })();

    // Best-effort cleanup; after a successful rename the temp path no longer
    // exists, so this only matters for failures before the rename.
    if write_result.is_err() {
        let _ = fs::remove_file(&tmp_cache);
    }
    write_result
}
2245
2246fn merge_sources_to_writer(
2247 sources: &mut [Box<dyn PostingRecordSource>],
2248 writer: &mut BufWriter<File>,
2249) -> std::io::Result<(Vec<LookupEntry>, u64)> {
2250 let mut heap = BinaryHeap::new();
2251 for (source_index, source) in sources.iter_mut().enumerate() {
2252 if let Some(record) = source.next_record()? {
2253 heap.push(HeapItem {
2254 record,
2255 source_index,
2256 });
2257 }
2258 }
2259
2260 let mut lookup_entries = Vec::new();
2261 let mut postings_blob_len = 0u64;
2262 let mut current_trigram: Option<u32> = None;
2263 let mut current_offset = 0u64;
2264 let mut current_count = 0u32;
2265
2266 while let Some(item) = heap.pop() {
2267 let record = item.record;
2268 if current_trigram != Some(record.trigram) {
2269 if let Some(trigram) = current_trigram {
2270 lookup_entries.push(LookupEntry {
2271 trigram,
2272 offset: current_offset,
2273 count: current_count,
2274 });
2275 }
2276 current_trigram = Some(record.trigram);
2277 current_offset = postings_blob_len;
2278 current_count = 0;
2279 }
2280
2281 writer.write_all(&record.file_id.to_le_bytes())?;
2282 writer.write_all(&[record.next_mask, record.loc_mask])?;
2283 postings_blob_len = postings_blob_len
2284 .checked_add(POSTING_BYTES as u64)
2285 .ok_or_else(|| std::io::Error::other("postings blob too large"))?;
2286 current_count = current_count
2287 .checked_add(1)
2288 .ok_or_else(|| std::io::Error::other("posting list too large"))?;
2289
2290 if let Some(next) = sources[item.source_index].next_record()? {
2291 heap.push(HeapItem {
2292 record: next,
2293 source_index: item.source_index,
2294 });
2295 }
2296 }
2297
2298 if let Some(trigram) = current_trigram {
2299 lookup_entries.push(LookupEntry {
2300 trigram,
2301 offset: current_offset,
2302 count: current_count,
2303 });
2304 }
2305
2306 Ok((lookup_entries, postings_blob_len))
2307}
2308
2309fn build_postings_header_bytes(plan: &CacheWritePlan) -> std::io::Result<Vec<u8>> {
2310 let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2311 writer.write_all(INDEX_MAGIC)?;
2312 write_u32(&mut writer, INDEX_VERSION)?;
2313
2314 let head = plan.git_head.as_deref().unwrap_or_default();
2315 let root = plan.project_root.to_string_lossy();
2316 let head_len = u32::try_from(head.len())
2317 .map_err(|_| std::io::Error::other("git head too large to cache"))?;
2318 let root_len = u32::try_from(root.len())
2319 .map_err(|_| std::io::Error::other("project root too large to cache"))?;
2320 let ignore_fingerprint_len = u32::try_from(plan.ignore_fingerprint.len())
2321 .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
2322 let file_count = u32::try_from(plan.files.len())
2323 .map_err(|_| std::io::Error::other("too many files to cache"))?;
2324
2325 write_u32(&mut writer, head_len)?;
2326 write_u32(&mut writer, root_len)?;
2327 write_u32(&mut writer, ignore_fingerprint_len)?;
2328 write_u64(&mut writer, plan.max_file_size)?;
2329 write_u32(&mut writer, file_count)?;
2330 writer.write_all(head.as_bytes())?;
2331 writer.write_all(root.as_bytes())?;
2332 writer.write_all(plan.ignore_fingerprint.as_bytes())?;
2333
2334 for (file_id, file) in plan.files.iter().enumerate() {
2335 let file_id =
2336 u32::try_from(file_id).map_err(|_| std::io::Error::other("too many files to cache"))?;
2337 let path = cache_relative_path(&plan.project_root, &file.path)
2338 .or_else(|| {
2339 fs::canonicalize(&file.path)
2340 .ok()
2341 .and_then(|canonical| cache_relative_path(&plan.project_root, &canonical))
2342 })
2343 .ok_or_else(|| {
2344 std::io::Error::other(format!(
2345 "refusing to cache path outside project root: {}",
2346 file.path.display()
2347 ))
2348 })?;
2349 let path = path.to_string_lossy();
2350 let path_len = u32::try_from(path.len())
2351 .map_err(|_| std::io::Error::other("cached path too large"))?;
2352 let modified = file
2353 .modified
2354 .duration_since(UNIX_EPOCH)
2355 .unwrap_or(Duration::ZERO);
2356 let unindexed = if plan.unindexed_files.contains(&file_id) {
2357 1u8
2358 } else {
2359 0u8
2360 };
2361
2362 writer.write_all(&[unindexed])?;
2363 write_u32(&mut writer, path_len)?;
2364 write_u64(&mut writer, file.size)?;
2365 write_u64(&mut writer, modified.as_secs())?;
2366 write_u32(&mut writer, modified.subsec_nanos())?;
2367 writer.write_all(file.content_hash.as_bytes())?;
2368 writer.write_all(path.as_bytes())?;
2369 }
2370
2371 writer.flush()?;
2372 Ok(writer
2373 .into_inner()
2374 .map_err(|error| std::io::Error::other(error.to_string()))?
2375 .into_inner())
2376}
2377
2378fn build_lookup_section_bytes(lookup_entries: &[LookupEntry]) -> std::io::Result<Vec<u8>> {
2379 let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2380 let entry_count = u32::try_from(lookup_entries.len())
2381 .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
2382 writer.write_all(LOOKUP_MAGIC)?;
2383 write_u32(&mut writer, INDEX_VERSION)?;
2384 write_u32(&mut writer, entry_count)?;
2385 for entry in lookup_entries {
2386 write_u32(&mut writer, entry.trigram)?;
2387 write_u64(&mut writer, entry.offset)?;
2388 write_u32(&mut writer, entry.count)?;
2389 }
2390 writer.flush()?;
2391 let mut lookup_blob = writer
2392 .into_inner()
2393 .map_err(|error| std::io::Error::other(error.to_string()))?
2394 .into_inner();
2395 let checksum = crc32fast::hash(&lookup_blob);
2396 lookup_blob.extend_from_slice(&checksum.to_le_bytes());
2397 Ok(lookup_blob)
2398}
2399
2400fn build_file_trigram_count_extension(counts: &[u32]) -> std::io::Result<Vec<u8>> {
2401 let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2402 writer.write_all(FILE_TRIGRAM_COUNT_MAGIC)?;
2403 write_u32(&mut writer, INDEX_VERSION)?;
2404 write_u32(
2405 &mut writer,
2406 u32::try_from(counts.len())
2407 .map_err(|_| std::io::Error::other("too many file trigram counts"))?,
2408 )?;
2409 for count in counts {
2410 write_u32(&mut writer, *count)?;
2411 }
2412 writer.flush()?;
2413 Ok(writer
2414 .into_inner()
2415 .map_err(|error| std::io::Error::other(error.to_string()))?
2416 .into_inner())
2417}
2418
2419fn flush_spill_segment(
2420 spill_dir: &Path,
2421 seq: usize,
2422 block: &mut Vec<SpillRecord>,
2423) -> std::io::Result<PathBuf> {
2424 if block.is_empty() {
2425 return Err(std::io::Error::other(
2426 "refusing to write empty search spill",
2427 ));
2428 }
2429 block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2430 let path = spill_dir.join(format!("segment.{seq:06}.bin"));
2431 let mut writer = BufWriter::new(File::create(&path)?);
2432 writer.write_all(SPILL_MAGIC)?;
2433 write_u32(&mut writer, INDEX_VERSION)?;
2434 write_u64(
2435 &mut writer,
2436 u64::try_from(block.len()).map_err(|_| std::io::Error::other("search spill too large"))?,
2437 )?;
2438
2439 let mut index = 0usize;
2440 while index < block.len() {
2441 let trigram = block[index].trigram;
2442 let group_start = index;
2443 while index < block.len() && block[index].trigram == trigram {
2444 index += 1;
2445 }
2446 write_u32(&mut writer, trigram)?;
2447 write_u32(
2448 &mut writer,
2449 u32::try_from(index - group_start)
2450 .map_err(|_| std::io::Error::other("search spill group too large"))?,
2451 )?;
2452 for record in &block[group_start..index] {
2453 writer.write_all(&record.file_id.to_le_bytes())?;
2454 writer.write_all(&[record.next_mask, record.loc_mask])?;
2455 }
2456 }
2457 writer.flush()?;
2458 writer.get_ref().sync_all()?;
2459 block.clear();
2460 Ok(path)
2461}
2462
/// Creates (including missing parents) and returns a spill directory named
/// `search-build.tmp.<pid>.<nanos>` under `cache_dir`, where `<nanos>` is the
/// current time in nanoseconds since the Unix epoch (zero if the clock is
/// before the epoch).
fn create_spill_dir(cache_dir: &Path) -> std::io::Result<PathBuf> {
    let pid = std::process::id();
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or(Duration::ZERO)
        .as_nanos();
    let spill_dir = cache_dir.join(format!("search-build.tmp.{pid}.{nanos}"));
    fs::create_dir_all(&spill_dir).map(|()| spill_dir)
}
2475
/// Best-effort removal of every entry directly under `cache_dir` whose name
/// starts with `search-build.tmp.`.
///
/// An unreadable `cache_dir`, unreadable entries, and removal failures are all
/// silently ignored.
fn sweep_stale_search_build_dirs(cache_dir: &Path) {
    if let Ok(entries) = fs::read_dir(cache_dir) {
        entries
            .flatten()
            .filter(|entry| {
                entry
                    .file_name()
                    .to_string_lossy()
                    .starts_with("search-build.tmp.")
            })
            .for_each(|stale| {
                let _ = fs::remove_dir_all(stale.path());
            });
    }
}
2487
2488fn transient_search_cache_dir(root: &Path) -> PathBuf {
2489 std::env::temp_dir().join(format!(
2490 "aft-search-cache.{}.{}.{}",
2491 artifact_cache_key(root),
2492 std::process::id(),
2493 SystemTime::now()
2494 .duration_since(UNIX_EPOCH)
2495 .unwrap_or(Duration::ZERO)
2496 .as_nanos()
2497 ))
2498}
2499
2500fn read_file_trigram_count_extension(
2501 base: &BasePostings,
2502 extension_start: u64,
2503 postings_body_end: u64,
2504 file_count: usize,
2505) -> std::io::Result<Option<Vec<u32>>> {
2506 if extension_start >= postings_body_end {
2507 return Ok(None);
2508 }
2509 let extension_len = postings_body_end - extension_start;
2510 if extension_len < 16 {
2511 return Ok(None);
2512 }
2513 let mut header = [0u8; 16];
2514 pread_exact(&base.file, extension_start, &mut header)?;
2515 if &header[..8] != FILE_TRIGRAM_COUNT_MAGIC {
2516 return Ok(None);
2517 }
2518 let version = u32::from_le_bytes([header[8], header[9], header[10], header[11]]);
2519 if version != INDEX_VERSION {
2520 return Err(std::io::Error::other("invalid file trigram count version"));
2521 }
2522 let count = u32::from_le_bytes([header[12], header[13], header[14], header[15]]) as usize;
2523 if count != file_count {
2524 return Err(std::io::Error::other("file trigram count length mismatch"));
2525 }
2526 let counts_len = count
2527 .checked_mul(4)
2528 .ok_or_else(|| std::io::Error::other("file trigram count extension too large"))?;
2529 if 16u64 + counts_len as u64 > extension_len {
2530 return Err(std::io::Error::other(
2531 "truncated file trigram count extension",
2532 ));
2533 }
2534 let mut bytes = vec![0u8; counts_len];
2535 pread_exact(&base.file, extension_start + 16, &mut bytes)?;
2536 let mut counts = Vec::with_capacity(count);
2537 for chunk in bytes.chunks_exact(4) {
2538 counts.push(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
2539 }
2540 Ok(Some(counts))
2541}
2542
2543fn compute_file_trigram_counts_from_base(
2544 base: &BasePostings,
2545 file_count: usize,
2546) -> std::io::Result<Vec<u32>> {
2547 let mut counts = vec![0u32; file_count];
2548 for entry in base.lookup.iter().copied() {
2549 for posting in base.read_postings(entry)? {
2550 let Some(count) = counts.get_mut(posting.file_id as usize) else {
2551 return Err(std::io::Error::other("posting references missing file"));
2552 };
2553 *count = count.saturating_add(1);
2554 }
2555 }
2556 Ok(counts)
2557}
2558
/// Grows `counts` with zeros, if needed, so that index `file_id` is valid.
/// Existing entries are never changed and the vector is never shrunk.
fn ensure_count_slot(counts: &mut Vec<u32>, file_id: u32) {
    let required = file_id as usize + 1;
    let missing = required.saturating_sub(counts.len());
    counts.extend(std::iter::repeat(0).take(missing));
}
2565
/// Reports whether `len` more bytes fit between the reader's current position
/// and `absolute_end`. Returns `false` when the reader is already past
/// `absolute_end`.
///
/// # Errors
///
/// Propagates a failure to query the stream position.
fn reader_has_remaining<R: Seek>(
    reader: &mut R,
    absolute_end: u64,
    len: usize,
) -> std::io::Result<bool> {
    let position = reader.stream_position()?;
    let fits = match absolute_end.checked_sub(position) {
        Some(available) => len as u64 <= available,
        None => false,
    };
    Ok(fits)
}
2574
2575fn crc32_file_range(path: &Path, start: u64, len: u64) -> std::io::Result<u32> {
2576 let mut file = File::open(path)?;
2577 file.seek(SeekFrom::Start(start))?;
2578 let mut hasher = crc32fast::Hasher::new();
2579 let mut remaining = len;
2580 let mut buffer = vec![0u8; 1024 * 1024];
2581 while remaining > 0 {
2582 let read_len = buffer.len().min(remaining as usize);
2583 let bytes_read = file.read(&mut buffer[..read_len])?;
2584 if bytes_read == 0 {
2585 return Err(std::io::Error::new(
2586 std::io::ErrorKind::UnexpectedEof,
2587 "truncated cache while checksumming",
2588 ));
2589 }
2590 hasher.update(&buffer[..bytes_read]);
2591 remaining -= bytes_read as u64;
2592 }
2593 Ok(hasher.finalize())
2594}
2595
/// Best-effort `sync_all` on the directory containing `path`.
///
/// Does nothing when `path` has no parent or the parent cannot be opened; a
/// failed sync is ignored as well.
fn sync_parent_dir(path: &Path) {
    let Some(parent) = path.parent() else {
        return;
    };
    let Ok(dir) = File::open(parent) else {
        return;
    };
    let _ = dir.sync_all();
}
2603
/// Opens the cache file at `path` read-only.
///
/// On Windows the handle is opened with a share mode that allows other
/// handles to read, write and delete the file while this one is open.
fn open_cache_file_read(path: &Path) -> std::io::Result<File> {
    let mut opener = File::options();
    opener.read(true);
    #[cfg(windows)]
    {
        use std::os::windows::fs::OpenOptionsExt;
        // FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
        let share_all: u32 = 0x0000_0001 | 0x0000_0002 | 0x0000_0004;
        opener.share_mode(share_all);
    }
    opener.open(path)
}
2617
/// Fills `buffer` completely with bytes read from `file` starting at
/// `offset`, using positioned reads (`read_at`).
///
/// # Errors
///
/// Returns `UnexpectedEof` ("short pread from search cache") when the file
/// ends before `buffer` is full, and propagates other I/O errors. Reads that
/// fail with `ErrorKind::Interrupted` are retried rather than reported.
#[cfg(unix)]
fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
    use std::os::unix::fs::FileExt;
    while !buffer.is_empty() {
        match file.read_at(buffer, offset) {
            Ok(0) => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "short pread from search cache",
                ));
            }
            Ok(bytes_read) => {
                offset += bytes_read as u64;
                // Move the slice out before re-slicing so the remainder keeps
                // the original lifetime.
                let filled = std::mem::take(&mut buffer);
                buffer = &mut filled[bytes_read..];
            }
            // A signal interrupted the call before any data was transferred.
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => {}
            Err(error) => return Err(error),
        }
    }
    Ok(())
}
2635
/// Fills `buffer` completely with bytes read from `file` starting at
/// `offset`, using `seek_read`.
///
/// Note that on Windows `seek_read` moves the file cursor of the handle as a
/// side effect, unlike the Unix positioned read.
///
/// # Errors
///
/// Returns `UnexpectedEof` ("short pread from search cache") when the file
/// ends before `buffer` is full, and propagates other I/O errors. Reads that
/// fail with `ErrorKind::Interrupted` are retried, matching the Unix variant.
#[cfg(windows)]
fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
    use std::os::windows::fs::FileExt;
    while !buffer.is_empty() {
        match file.seek_read(buffer, offset) {
            Ok(0) => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "short pread from search cache",
                ));
            }
            Ok(bytes_read) => {
                offset += bytes_read as u64;
                // Move the slice out before re-slicing so the remainder keeps
                // the original lifetime.
                let filled = std::mem::take(&mut buffer);
                buffer = &mut filled[bytes_read..];
            }
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => {}
            Err(error) => return Err(error),
        }
    }
    Ok(())
}
2653
/// Returns the ids present in both ascending-sorted slices, in ascending
/// order, using a single linear merge pass.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let (mut a, mut b) = (left, right);

    while let (Some((&x, a_tail)), Some((&y, b_tail))) = (a.split_first(), b.split_first()) {
        if x < y {
            a = a_tail;
        } else if x > y {
            b = b_tail;
        } else {
            common.push(x);
            a = a_tail;
            b = b_tail;
        }
    }

    common
}
2673
/// Merges two ascending-sorted id slices into one ascending list, emitting an
/// id only once when it is at the head of both inputs at the same time.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut combined = Vec::with_capacity(left.len() + right.len());
    let (mut a, mut b) = (left, right);

    while let (Some((&x, a_tail)), Some((&y, b_tail))) = (a.split_first(), b.split_first()) {
        if x < y {
            combined.push(x);
            a = a_tail;
        } else if x > y {
            combined.push(y);
            b = b_tail;
        } else {
            combined.push(x);
            a = a_tail;
            b = b_tail;
        }
    }

    // At most one of the two tails is non-empty here.
    combined.extend_from_slice(a);
    combined.extend_from_slice(b);
    combined
}
2701
2702pub fn decompose_regex(pattern: &str) -> RegexQuery {
2703 let hir = match regex_syntax::parse(pattern) {
2704 Ok(hir) => hir,
2705 Err(_) => return RegexQuery::default(),
2706 };
2707
2708 let build = build_query(&hir);
2709 build.into_query()
2710}
2711
/// Packs three bytes into the low 24 bits of a `u32`, with `a` as the most
/// significant of the three and `c` as the least significant.
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
2715
/// Lowercases an ASCII uppercase byte; every other byte is returned as-is.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        // ASCII upper and lower case differ only in bit 0x20.
        c | 0x20
    } else {
        c
    }
}
2719
2720fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
2721 if content.len() < 3 {
2722 return;
2723 }
2724
2725 for start in 0..=content.len() - 3 {
2726 let trigram = pack_trigram(
2727 normalize_char(content[start]),
2728 normalize_char(content[start + 1]),
2729 normalize_char(content[start + 2]),
2730 );
2731 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
2732 visit(trigram, next_char, start);
2733 }
2734}
2735
2736pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
2737 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
2738 scan_trigrams(content, |trigram, next_char, position| {
2739 trigrams.push((trigram, next_char, position));
2740 });
2741 trigrams
2742}
2743
2744fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
2745 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2746 scan_trigrams(content, |trigram, next_char, position| {
2747 let entry = filters.entry(trigram).or_default();
2748 if include_eof_next_char || next_char != EOF_SENTINEL {
2749 entry.next_mask |= mask_for_next_char(next_char);
2750 }
2751 entry.loc_mask |= mask_for_position(position);
2752 });
2753 filters
2754}
2755
2756pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
2757 let mut seen = HashSet::new();
2758 let mut out = Vec::new();
2759 for token in tokens {
2760 scan_trigrams(token.as_bytes(), |trigram, _, _| {
2761 if seen.insert(trigram) {
2762 out.push(trigram);
2763 }
2764 });
2765 }
2766 out
2767}
2768
2769pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
2770 lexical_score_snapshot(&index.snapshot(), query_trigrams, file_id)
2771}
2772
2773fn lexical_score_snapshot(
2774 index: &SearchIndexSnapshot,
2775 query_trigrams: &[u32],
2776 file_id: u32,
2777) -> f32 {
2778 if query_trigrams.is_empty() {
2779 return 0.0;
2780 }
2781
2782 let mut hits = 0u32;
2783 for &trigram in query_trigrams {
2784 let postings = index.postings_for_trigram(trigram, None);
2785 if postings.binary_search(&file_id).is_ok() {
2786 hits += 1;
2787 }
2788 }
2789
2790 if hits == 0 {
2791 return 0.0;
2792 }
2793
2794 let file_trigram_count = index
2795 .file_trigram_count
2796 .get(file_id as usize)
2797 .copied()
2798 .unwrap_or(1)
2799 .max(1) as f32;
2800 (hits as f32) / (1.0 + file_trigram_count.ln())
2801}
2802
2803pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
2804 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
2806 return PathBuf::from(override_dir)
2807 .join("index")
2808 .join(artifact_cache_key(project_root));
2809 }
2810 if let Some(dir) = storage_dir {
2812 return dir.join("index").join(artifact_cache_key(project_root));
2813 }
2814 let home = std::env::var_os("HOME")
2819 .or_else(|| std::env::var_os("USERPROFILE"))
2820 .map(PathBuf::from)
2821 .unwrap_or_else(std::env::temp_dir);
2822 home.join(".cache")
2823 .join("aft")
2824 .join("index")
2825 .join(artifact_cache_key(project_root))
2826}
2827
2828pub(crate) fn build_path_filters(
2829 include: &[String],
2830 exclude: &[String],
2831) -> Result<PathFilters, String> {
2832 Ok(PathFilters {
2833 includes: build_globset(include)?,
2834 excludes: build_globset(exclude)?,
2835 })
2836}
2837
2838pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
2839 walk_project_files_from(root, root, filters)
2840}
2841
2842pub fn walk_project_files_bounded_default(
2843 root: &Path,
2844 max_files: usize,
2845) -> Result<Vec<PathBuf>, usize> {
2846 walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
2847}
2848
2849pub(crate) fn walk_project_files_bounded_matching<F>(
2850 root: &Path,
2851 filters: &PathFilters,
2852 max_files: usize,
2853 matches_file: F,
2854) -> Result<Vec<PathBuf>, usize>
2855where
2856 F: Fn(&Path) -> bool,
2857{
2858 walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
2859}
2860
2861pub fn walk_project_files_bounded_default_matching<F>(
2862 root: &Path,
2863 max_files: usize,
2864 matches_file: F,
2865) -> Result<Vec<PathBuf>, usize>
2866where
2867 F: Fn(&Path) -> bool,
2868{
2869 walk_project_files_from_inner_matching(
2870 root,
2871 root,
2872 &PathFilters::default(),
2873 Some(max_files),
2874 matches_file,
2875 true,
2876 )
2877}
2878
2879pub(crate) fn walk_project_files_from(
2880 filter_root: &Path,
2881 search_root: &Path,
2882 filters: &PathFilters,
2883) -> Vec<PathBuf> {
2884 walk_project_files_from_inner(filter_root, search_root, filters, None, true)
2885 .expect("unbounded project walk cannot exceed a file limit")
2886}
2887
2888pub(crate) fn has_any_project_file_from(
2889 filter_root: &Path,
2890 search_root: &Path,
2891 filters: &PathFilters,
2892) -> bool {
2893 walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
2894}
2895
2896fn walk_project_files_from_inner(
2897 filter_root: &Path,
2898 search_root: &Path,
2899 filters: &PathFilters,
2900 max_files: Option<usize>,
2901 sort_by_mtime: bool,
2902) -> Result<Vec<PathBuf>, usize> {
2903 walk_project_files_from_inner_matching(
2904 filter_root,
2905 search_root,
2906 filters,
2907 max_files,
2908 |_| true,
2909 sort_by_mtime,
2910 )
2911}
2912
2913fn project_walk_builder(search_root: &Path) -> WalkBuilder {
2914 let mut builder = WalkBuilder::new(search_root);
2915 builder
2916 .hidden(false)
2917 .git_ignore(true)
2918 .git_global(true)
2919 .git_exclude(true)
2920 .add_custom_ignore_filename(".aftignore")
2921 .filter_entry(|entry| {
2922 let name = entry.file_name().to_string_lossy();
2923 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2924 return !matches!(
2925 name.as_ref(),
2926 "node_modules"
2927 | "target"
2928 | "venv"
2929 | ".venv"
2930 | ".git"
2931 | "__pycache__"
2932 | ".tox"
2933 | "dist"
2934 | "build"
2935 );
2936 }
2937 true
2938 });
2939 builder
2940}
2941
2942fn walk_project_files_from_inner_matching<F>(
2943 filter_root: &Path,
2944 search_root: &Path,
2945 filters: &PathFilters,
2946 max_files: Option<usize>,
2947 matches_file: F,
2948 sort_by_mtime: bool,
2949) -> Result<Vec<PathBuf>, usize>
2950where
2951 F: Fn(&Path) -> bool,
2952{
2953 let builder = project_walk_builder(search_root);
2954
2955 let mut files = Vec::new();
2956 for entry in builder.build().filter_map(|entry| entry.ok()) {
2957 if !entry
2958 .file_type()
2959 .map_or(false, |file_type| file_type.is_file())
2960 {
2961 continue;
2962 }
2963 let path = entry.into_path();
2964 if filters.matches(filter_root, &path) && matches_file(&path) {
2965 files.push(path);
2966 if max_files.is_some_and(|limit| files.len() > limit) {
2967 return Err(files.len());
2968 }
2969 }
2970 }
2971
2972 if sort_by_mtime {
2973 sort_paths_by_mtime_desc(&mut files);
2974 }
2975 Ok(files)
2976}
2977
2978pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
2979 let bytes = fs::read(path).ok()?;
2980 if is_binary_bytes(&bytes) {
2981 return None;
2982 }
2983 String::from_utf8(bytes).ok()
2984}
2985
/// Reads the whole file at `path`, returning `None` on any read error.
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    match fs::read(path) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
2989
/// Returns `path` with the `root` prefix stripped; when `path` does not start
/// with `root`, an owned copy of `path` is returned unchanged.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
2995
2996pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
2997 let normalized_root = normalize_path(root);
2998 let normalized_path = normalize_path(path);
2999 let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
3000 validate_cached_relative_path(relative)
3001}
3002
/// Resolves a cache-stored relative path against `root`, refusing anything
/// that would end up outside of it.
///
/// Returns the normalized (not canonicalized) joined path on success, and
/// `None` when `relative_path` fails `validate_cached_relative_path` or the
/// containment checks below.
pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
    let relative = validate_cached_relative_path(relative_path)?;
    let normalized_root = normalize_path(root);
    let full_path = normalize_path(&normalized_root.join(relative));

    match fs::canonicalize(&full_path) {
        Ok(canonical_path) => {
            // First compare against the root as given.
            if canonical_path.starts_with(&normalized_root) {
                return Some(full_path);
            }

            // The canonical path may differ from the lexical one; retry the
            // containment check against the canonicalized root before
            // rejecting. If the root itself cannot be canonicalized the path
            // is rejected.
            let canonical_root = fs::canonicalize(&normalized_root).ok()?;
            canonical_path
                .starts_with(&canonical_root)
                .then_some(full_path)
        }
        // Canonicalization failed (presumably because the file no longer
        // exists — other I/O errors land here too); fall back to a purely
        // lexical containment check.
        Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
    }
}
3022
/// Validates and cleans a relative path destined for (or read from) the cache.
///
/// `.` components are dropped. The path is rejected (`None`) when it is
/// absolute, contains a `..`, root or prefix component, or is empty after
/// cleaning.
pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
    if path.is_absolute() {
        return None;
    }

    let cleaned = path
        .components()
        .try_fold(PathBuf::new(), |mut acc, component| match component {
            Component::Normal(part) => {
                acc.push(part);
                Some(acc)
            }
            Component::CurDir => Some(acc),
            Component::ParentDir | Component::RootDir | Component::Prefix(_) => None,
        })?;

    if cleaned.as_os_str().is_empty() {
        None
    } else {
        Some(cleaned)
    }
}
3038
3039pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
3052 use std::collections::HashMap;
3053 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
3054 let mut display_paths: HashMap<PathBuf, String> = HashMap::with_capacity(paths.len());
3055 for path in paths.iter() {
3056 mtimes
3057 .entry(path.clone())
3058 .or_insert_with(|| path_modified_time(path));
3059 display_paths
3060 .entry(path.clone())
3061 .or_insert_with(|| normalized_display_sort_key(None, path));
3062 }
3063 paths.sort_by(|left, right| {
3064 let left_mtime = mtimes.get(left).and_then(|v| *v);
3065 let right_mtime = mtimes.get(right).and_then(|v| *v);
3066 let left_display = display_paths
3067 .get(left)
3068 .map(String::as_bytes)
3069 .unwrap_or_default();
3070 let right_display = display_paths
3071 .get(right)
3072 .map(String::as_bytes)
3073 .unwrap_or_default();
3074 right_mtime
3075 .cmp(&left_mtime)
3076 .then_with(|| left_display.cmp(right_display))
3077 });
3078}
3079
3080pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
3083 use std::collections::HashMap;
3084 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
3085 let mut display_paths: HashMap<PathBuf, String> = HashMap::with_capacity(matches.len());
3086 for m in matches.iter() {
3087 mtimes.entry(m.file.clone()).or_insert_with(|| {
3088 let resolved = resolve_match_path(project_root, &m.file);
3089 path_modified_time(&resolved)
3090 });
3091 display_paths
3092 .entry(m.file.clone())
3093 .or_insert_with(|| normalized_display_sort_key(Some(project_root), &m.file));
3094 }
3095 matches.sort_by(|left, right| {
3096 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
3097 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
3098 let left_display = display_paths
3099 .get(&left.file)
3100 .map(String::as_bytes)
3101 .unwrap_or_default();
3102 let right_display = display_paths
3103 .get(&right.file)
3104 .map(String::as_bytes)
3105 .unwrap_or_default();
3106 right_mtime
3110 .cmp(&left_mtime)
3111 .then_with(|| left_display.cmp(right_display))
3112 .then_with(|| left.line.cmp(&right.line))
3113 .then_with(|| left.column.cmp(&right.column))
3114 });
3115}
3116
3117fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
3122 matches: &mut [SharedGrepMatch],
3123 project_root: &Path,
3124 modified_for_path: F,
3125) where
3126 F: Fn(&Path) -> Option<SystemTime>,
3127{
3128 use std::collections::HashMap;
3129 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
3130 let mut display_paths: HashMap<PathBuf, String> = HashMap::with_capacity(matches.len());
3131 for m in matches.iter() {
3132 let path = m.file.as_path().to_path_buf();
3133 mtimes
3134 .entry(path.clone())
3135 .or_insert_with(|| modified_for_path(&path));
3136 display_paths
3137 .entry(path.clone())
3138 .or_insert_with(|| normalized_display_sort_key(Some(project_root), &path));
3139 }
3140 matches.sort_by(|left, right| {
3141 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
3142 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
3143 let left_display = display_paths
3144 .get(left.file.as_path())
3145 .map(String::as_bytes)
3146 .unwrap_or_default();
3147 let right_display = display_paths
3148 .get(right.file.as_path())
3149 .map(String::as_bytes)
3150 .unwrap_or_default();
3151 right_mtime
3155 .cmp(&left_mtime)
3156 .then_with(|| left_display.cmp(right_display))
3157 .then_with(|| left.line.cmp(&right.line))
3158 .then_with(|| left.column.cmp(&right.column))
3159 });
3160}
3161
3162pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
3163 let resolved_project_root = canonicalize_or_normalize(project_root);
3164 let root = match path {
3165 Some(path) => {
3166 let path = PathBuf::from(path);
3167 if path.is_absolute() {
3168 canonicalize_or_normalize(&path)
3169 } else {
3170 normalize_path(&resolved_project_root.join(path))
3171 }
3172 }
3173 None => resolved_project_root.clone(),
3174 };
3175
3176 let use_index = is_within_search_root(&resolved_project_root, &root);
3177 SearchScope { root, use_index }
3178}
3179
3180pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
3181 content_inspector::inspect(content).is_binary()
3182}
3183
3184pub(crate) fn current_git_head(root: &Path) -> Option<String> {
3185 run_git(root, &["rev-parse", "HEAD"])
3186}
3187
3188pub fn artifact_cache_key(project_root: &Path) -> String {
3200 use sha2::{Digest, Sha256};
3201
3202 let mut hasher = Sha256::new();
3203
3204 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
3205 hasher.update(root_commit.as_bytes());
3208 } else {
3209 let canonical_root = canonicalize_or_normalize(project_root);
3211 hasher.update(canonical_root.to_string_lossy().as_bytes());
3212 }
3213
3214 let digest = format!("{:x}", hasher.finalize());
3215 digest[..16].to_string()
3216}
3217
3218pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
3226 use sha2::{Digest, Sha256};
3227
3228 let root = canonicalize_or_normalize(project_root);
3229 let mut files = Vec::new();
3230 collect_ignore_rule_files(&root, &mut files);
3231 if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
3232 if global_ignore.is_file() {
3233 files.push(global_ignore);
3234 }
3235 }
3236 let info_exclude = git_info_exclude_path(&root);
3237 if info_exclude.is_file() {
3238 files.push(info_exclude);
3239 }
3240 files.sort();
3241 files.dedup();
3242
3243 let mut hasher = Sha256::new();
3244 hasher.update(b"aft-ignore-rules-v1\0");
3245 for path in files {
3246 if let Some(relative) = cache_relative_path(&root, &path) {
3247 hasher.update(relative.to_string_lossy().as_bytes());
3248 } else {
3249 hasher.update(path.to_string_lossy().as_bytes());
3250 }
3251 hasher.update(b"\0");
3252 match fs::read(&path) {
3253 Ok(bytes) => hasher.update(&bytes),
3254 Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
3255 }
3256 hasher.update(b"\0");
3257 }
3258
3259 format!("{:x}", hasher.finalize())
3260}
3261
3262fn git_info_exclude_path(root: &Path) -> PathBuf {
3263 run_git(
3264 root,
3265 &["rev-parse", "--path-format=absolute", "--git-common-dir"],
3266 )
3267 .map(PathBuf::from)
3268 .unwrap_or_else(|| root.join(".git"))
3269 .join("info")
3270 .join("exclude")
3271}
3272
3273fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
3274 let mut builder = WalkBuilder::new(root);
3275 builder
3276 .hidden(false)
3277 .git_ignore(true)
3278 .git_global(true)
3279 .git_exclude(true)
3280 .add_custom_ignore_filename(".aftignore")
3281 .filter_entry(|entry| {
3282 let name = entry.file_name().to_string_lossy();
3283 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
3284 return !matches!(
3285 name.as_ref(),
3286 ".git"
3287 | "node_modules"
3288 | "target"
3289 | "venv"
3290 | ".venv"
3291 | "__pycache__"
3292 | ".tox"
3293 | "dist"
3294 | "build"
3295 );
3296 }
3297 true
3298 });
3299
3300 for entry in builder.build().filter_map(|entry| entry.ok()) {
3301 if !entry
3302 .file_type()
3303 .map_or(false, |file_type| file_type.is_file())
3304 {
3305 continue;
3306 }
3307 let file_name = entry.file_name();
3308 if file_name == ".gitignore" || file_name == ".aftignore" {
3309 files.push(entry.into_path());
3310 }
3311 }
3312}
3313
/// Test helper: counts the directory entries yielded by an `ignore` walk of
/// `root` configured with hidden-file filtering off, git ignore rules on and
/// `.aftignore` registered as a custom ignore file.
#[cfg(test)]
pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
    let mut walker = WalkBuilder::new(root);
    walker
        .hidden(false)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .add_custom_ignore_filename(".aftignore");

    walker
        .build()
        .filter_map(Result::ok)
        .filter(|entry| entry.file_type().map_or(false, |kind| kind.is_dir()))
        .count()
}
3332
/// Test helper: counts the directories visited by a hand-rolled depth-first
/// traversal of `root`.
///
/// The root itself is counted. Symlinks are not followed, directories named
/// in `PRUNED_DIRS` are not entered, and directories that cannot be listed
/// are counted but contribute no children.
#[cfg(test)]
pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
    // Directory names the traversal refuses to enter.
    const PRUNED_DIRS: [&str; 9] = [
        ".git",
        "node_modules",
        "target",
        "venv",
        ".venv",
        "__pycache__",
        ".tox",
        "dist",
        "build",
    ];

    let mut pending = vec![root.to_path_buf()];
    let mut visited = 0usize;
    while let Some(dir) = pending.pop() {
        visited += 1;
        let Ok(children) = fs::read_dir(&dir) else {
            continue;
        };
        for child in children.flatten() {
            let name = child.file_name();
            if name == ".gitignore" || name == ".aftignore" {
                continue;
            }
            let is_real_dir = child
                .file_type()
                .map_or(false, |kind| kind.is_dir() && !kind.is_symlink());
            if is_real_dir && !PRUNED_DIRS.contains(&name.to_str().unwrap_or("")) {
                pending.push(child.path());
            }
        }
    }
    visited
}
3374
3375impl PathFilters {
3376 pub(crate) fn matches(&self, root: &Path, path: &Path) -> bool {
3377 let relative = to_glob_path(&relative_to_root(root, path));
3378 if self
3379 .includes
3380 .as_ref()
3381 .is_some_and(|includes| !includes.is_match(&relative))
3382 {
3383 return false;
3384 }
3385 if self
3386 .excludes
3387 .as_ref()
3388 .is_some_and(|excludes| excludes.is_match(&relative))
3389 {
3390 return false;
3391 }
3392 true
3393 }
3394}
3395
3396fn canonicalize_or_normalize(path: &Path) -> PathBuf {
3397 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
3398}
3399
/// Turns a match path into an absolute one: relative paths are joined onto
/// `project_root`, absolute paths are returned unchanged.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_relative() {
        project_root.join(path)
    } else {
        path.to_path_buf()
    }
}
3407
/// Returns the modification time of `path`, or `None` when the metadata or
/// the timestamp cannot be read.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
3413
3414fn normalized_display_sort_key(project_root: Option<&Path>, path: &Path) -> String {
3415 let display_path = project_root
3416 .and_then(|root| path.strip_prefix(root).ok())
3417 .unwrap_or(path);
3418 to_glob_path(display_path)
3419}
3420
/// Lexically normalizes `path` without touching the filesystem.
///
/// `.` components are dropped and each `..` cancels the preceding normal
/// component. A `..` that has nothing to cancel is handled as follows:
///
/// * in a relative path it is kept, so `../../a` stays `../../a` and
///   `a/../../b` becomes `../b`;
/// * directly under a root it is dropped, because the parent of the root is
///   the root itself (`/..` becomes `/`).
///
/// Symlinks are not resolved; the result is only as meaningful as a purely
/// textual interpretation of the path.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::ParentDir => {
                // `PathBuf::pop` also succeeds when the last component is
                // itself `..`, which would silently swallow leading parent
                // references, so only pop genuine directory names.
                let cancels_normal = matches!(
                    result.components().next_back(),
                    Some(Component::Normal(_))
                );
                if cancels_normal {
                    result.pop();
                } else if !result.has_root() {
                    result.push(component);
                }
            }
            Component::CurDir => {}
            _ => result.push(component),
        }
    }
    result
}
3436
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// When the path itself cannot be canonicalized, its parent is canonicalized
/// instead and the file name re-attached. If that fails too — or the path has
/// no parent or no file name — the input is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => fs::canonicalize(parent)
            .map(|canonical_parent| canonical_parent.join(file_name))
            .unwrap_or_else(|_| path.to_path_buf()),
        _ => path.to_path_buf(),
    })
}
3453
3454fn verify_file_mtimes(index: &mut SearchIndex) {
3457 let filters = PathFilters::default();
3458 let current_files = walk_project_files(&index.project_root, &filters);
3459 let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
3460 let mut stale_paths = Vec::new();
3461 let mut removed_paths = Vec::new();
3462
3463 for entry in Arc::make_mut(&mut index.files).iter_mut() {
3464 if entry.path.as_os_str().is_empty() {
3465 continue; }
3467 if !current_file_set.contains(&entry.path) {
3468 removed_paths.push(entry.path.clone());
3469 continue;
3470 }
3471 let cached = FileFreshness {
3472 mtime: entry.modified,
3473 size: entry.size,
3474 content_hash: entry.content_hash,
3475 };
3476 match cache_freshness::verify_file_strict(&entry.path, &cached) {
3477 FreshnessVerdict::HotFresh => {}
3478 FreshnessVerdict::ContentFresh {
3479 new_mtime,
3480 new_size,
3481 } => {
3482 entry.modified = new_mtime;
3483 entry.size = new_size;
3484 }
3485 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
3486 stale_paths.push(entry.path.clone())
3487 }
3488 }
3489 }
3490
3491 for path in &removed_paths {
3492 index.remove_file(path);
3493 }
3494
3495 for path in &stale_paths {
3499 if current_file_set.contains(path) {
3500 index.update_file(path);
3501 } else {
3502 index.remove_file(path);
3503 }
3504 }
3505
3506 for path in current_files {
3508 if !index.path_to_id.contains_key(&path) {
3509 index.update_file(&path);
3510 }
3511 }
3512
3513 if !stale_paths.is_empty() {
3514 crate::slog_info!(
3515 "search index: refreshed {} stale file(s) from disk cache",
3516 stale_paths.len()
3517 );
3518 }
3519}
3520
3521fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
3522 normalize_path(path).starts_with(normalize_path(search_root))
3523}
3524
3525impl QueryBuild {
3526 fn into_query(self) -> RegexQuery {
3527 let mut query = RegexQuery::default();
3528
3529 for run in self.and_runs {
3530 add_run_to_and_query(&mut query, &run);
3531 }
3532
3533 for group in self.or_groups {
3534 let mut trigrams = BTreeSet::new();
3535 let mut filters = HashMap::new();
3536 for run in group {
3537 for (trigram, filter) in trigram_filters(&run) {
3538 trigrams.insert(trigram);
3539 merge_filter(filters.entry(trigram).or_default(), filter);
3540 }
3541 }
3542 if !trigrams.is_empty() {
3543 query.or_groups.push(trigrams.into_iter().collect());
3544 query.or_filters.push(filters);
3545 }
3546 }
3547
3548 query
3549 }
3550}
3551
3552fn build_query(hir: &Hir) -> QueryBuild {
3553 match hir.kind() {
3554 HirKind::Literal(literal) => {
3555 if literal.0.len() >= 3 {
3556 QueryBuild {
3557 and_runs: vec![literal.0.to_vec()],
3558 or_groups: Vec::new(),
3559 }
3560 } else {
3561 QueryBuild::default()
3562 }
3563 }
3564 HirKind::Capture(capture) => build_query(&capture.sub),
3565 HirKind::Concat(parts) => {
3566 let mut build = QueryBuild::default();
3567 for part in parts {
3568 let part_build = build_query(part);
3569 build.and_runs.extend(part_build.and_runs);
3570 build.or_groups.extend(part_build.or_groups);
3571 }
3572 build
3573 }
3574 HirKind::Alternation(parts) => {
3575 let mut group = Vec::new();
3576 for part in parts {
3577 let Some(mut choices) = guaranteed_run_choices(part) else {
3578 return QueryBuild::default();
3579 };
3580 group.append(&mut choices);
3581 }
3582 if group.is_empty() {
3583 QueryBuild::default()
3584 } else {
3585 QueryBuild {
3586 and_runs: Vec::new(),
3587 or_groups: vec![group],
3588 }
3589 }
3590 }
3591 HirKind::Repetition(repetition) => {
3592 if repetition.min == 0 {
3593 QueryBuild::default()
3594 } else {
3595 build_query(&repetition.sub)
3596 }
3597 }
3598 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
3599 }
3600}
3601
3602fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
3603 match hir.kind() {
3604 HirKind::Literal(literal) => {
3605 if literal.0.len() >= 3 {
3606 Some(vec![literal.0.to_vec()])
3607 } else {
3608 None
3609 }
3610 }
3611 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
3612 HirKind::Concat(parts) => {
3613 let mut runs = Vec::new();
3614 for part in parts {
3615 if let Some(mut part_runs) = guaranteed_run_choices(part) {
3616 runs.append(&mut part_runs);
3617 }
3618 }
3619 if runs.is_empty() {
3620 None
3621 } else {
3622 Some(runs)
3623 }
3624 }
3625 HirKind::Alternation(parts) => {
3626 let mut runs = Vec::new();
3627 for part in parts {
3628 let Some(mut part_runs) = guaranteed_run_choices(part) else {
3629 return None;
3630 };
3631 runs.append(&mut part_runs);
3632 }
3633 if runs.is_empty() {
3634 None
3635 } else {
3636 Some(runs)
3637 }
3638 }
3639 HirKind::Repetition(repetition) => {
3640 if repetition.min == 0 {
3641 None
3642 } else {
3643 guaranteed_run_choices(&repetition.sub)
3644 }
3645 }
3646 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
3647 }
3648}
3649
3650fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
3651 for (trigram, filter) in trigram_filters(run) {
3652 if !query.and_trigrams.contains(&trigram) {
3653 query.and_trigrams.push(trigram);
3654 }
3655 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
3656 }
3657}
3658
3659fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
3660 trigram_filter_map(run, false).into_iter().collect()
3661}
3662
3663fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
3664 target.next_mask |= filter.next_mask;
3665 target.loc_mask |= filter.loc_mask;
3666}
3667
3668fn mask_for_next_char(next_char: u8) -> u8 {
3669 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
3670 1u8 << bit
3671}
3672
/// Maps a position to a single-bit mask: bit `position mod 8` is set.
fn mask_for_position(position: usize) -> u8 {
    let bit = position & 0b111;
    1u8 << bit
}
3676
3677fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
3678 if patterns.is_empty() {
3679 return Ok(None);
3680 }
3681
3682 let mut builder = GlobSetBuilder::new();
3683 for pattern in patterns {
3684 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
3685 builder.add(glob);
3686 }
3687 builder.build().map(Some).map_err(|error| error.to_string())
3688}
3689
/// Reads exactly four bytes from `reader` and decodes them as a
/// little-endian `u32`.
///
/// # Errors
///
/// Propagates the error from `read_exact`, e.g. when fewer than four bytes
/// remain.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut raw = [0u8; std::mem::size_of::<u32>()];
    reader
        .read_exact(&mut raw)
        .map(|()| u32::from_le_bytes(raw))
}
3695
/// Reads exactly eight bytes from `reader` and decodes them as a
/// little-endian `u64`.
///
/// # Errors
///
/// Propagates the error from `read_exact`, e.g. when fewer than eight bytes
/// remain.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut raw = [0u8; std::mem::size_of::<u64>()];
    reader
        .read_exact(&mut raw)
        .map(|()| u64::from_le_bytes(raw))
}
3701
/// Writes `value` to `writer` as four little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let encoded = value.to_le_bytes();
    writer.write_all(&encoded)
}
3705
/// Writes `value` to `writer` as eight little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let encoded = value.to_le_bytes();
    writer.write_all(&encoded)
}
3709
3710fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
3711 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
3712 return Err(std::io::Error::other("search index checksum missing"));
3713 };
3714 let expected = u32::from_le_bytes(*stored);
3715 let actual = crc32fast::hash(body);
3716 if actual != expected {
3717 return Err(std::io::Error::other("search index checksum mismatch"));
3718 }
3719 Ok(())
3720}
3721
/// Returns how many bytes lie between the reader's current position and
/// `total_len`.
///
/// Yields `None` when the position cannot be queried, does not fit in
/// `usize`, or is already past `total_len`.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    let consumed = usize::try_from(position).ok()?;
    total_len.checked_sub(consumed)
}
3726
/// Runs `git -C <root> <args…>` and returns its trimmed standard output.
///
/// Returns `None` when the process cannot be spawned, exits unsuccessfully,
/// prints non-UTF-8 output, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let mut command = Command::new("git");
    command.arg("-C").arg(root).args(args);

    let output = command.output().ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
3745
3746fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
3747 let diff_range = format!("{}..{}", from, to);
3748 let output = match Command::new("git")
3749 .arg("-C")
3750 .arg(root)
3751 .args(["diff", "--name-status", "-M", &diff_range])
3752 .output()
3753 {
3754 Ok(output) => output,
3755 Err(_) => return false,
3756 };
3757
3758 if !output.status.success() {
3759 return false;
3760 }
3761
3762 let Ok(diff) = String::from_utf8(output.stdout) else {
3763 return false;
3764 };
3765
3766 for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
3767 let mut fields = line.split('\t');
3768 let Some(status) = fields.next() else {
3769 continue;
3770 };
3771
3772 if status.starts_with('R') {
3773 let Some(old_path) = fields
3774 .next()
3775 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3776 else {
3777 continue;
3778 };
3779 let Some(new_path) = fields
3780 .next()
3781 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3782 else {
3783 continue;
3784 };
3785 index.remove_file(&old_path);
3786 index.update_file(&new_path);
3787 continue;
3788 }
3789
3790 let Some(path) = fields
3791 .next()
3792 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3793 else {
3794 continue;
3795 };
3796 if status.starts_with('D') || !path.exists() {
3797 index.remove_file(&path);
3798 } else {
3799 index.update_file(&path);
3800 }
3801 }
3802
3803 true
3804}
3805
3806fn is_binary_path(path: &Path, size: u64) -> bool {
3807 if size == 0 {
3808 return false;
3809 }
3810
3811 let mut file = match File::open(path) {
3812 Ok(file) => file,
3813 Err(_) => return true,
3814 };
3815
3816 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
3817 match file.read(&mut preview) {
3818 Ok(read) => is_binary_bytes(&preview[..read]),
3819 Err(_) => true,
3820 }
3821}
3822
/// Returns the byte offset at which every line of `content` starts.
///
/// The first entry is always `0`; each `\n` adds the offset of the byte that
/// follows it, so content ending in a newline gets a final entry equal to
/// `content.len()`.
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    let after_newlines = content
        .iter()
        .enumerate()
        .filter(|(_, byte)| **byte == b'\n')
        .map(|(offset, _)| offset + 1);
    std::iter::once(0usize).chain(after_newlines).collect()
}
3832
/// Describes the line containing byte `offset` of `content`.
///
/// `line_starts` must be the sorted list of line start offsets for `content`.
/// Returns `(line, column, text)` where `line` and `column` are 1-based,
/// `column` counts characters (after lossy UTF-8 decoding) rather than bytes,
/// and `text` is the line without its terminating `\n` or a single trailing
/// `\r`.
///
/// # Panics
///
/// Panics if `offset` lies outside `content` or before the start of its line.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // An exact hit is the line itself; otherwise the line is the one just
    // before the insertion point.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    let rest = &content[line_start..];
    let line = rest.split(|byte| *byte == b'\n').next().unwrap_or(rest);
    let line = line.strip_suffix(b"\r").unwrap_or(line);
    let line_text = String::from_utf8_lossy(line).into_owned();

    let chars_before = String::from_utf8_lossy(&content[line_start..offset])
        .chars()
        .count() as u32;

    (line_index as u32 + 1, chars_before + 1, line_text)
}
3855
/// Renders `path` as a string (lossily for non-UTF-8 names) with every
/// backslash replaced by a forward slash.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
3859
3860#[cfg(test)]
3861mod tests {
3862 use std::process::Command;
3863
3864 use super::*;
3865
3866 #[test]
3867 fn cached_path_under_root_allows_missing_lexical_child() {
3868 let dir = tempfile::tempdir().expect("create temp dir");
3869 let project = dir.path().join("project");
3870 fs::create_dir_all(&project).expect("create project dir");
3871 let root = fs::canonicalize(&project).expect("canonicalize project");
3872
3873 let path = cached_path_under_root(&root, Path::new("future/file.rs"))
3874 .expect("missing child should fall back to lexical validation");
3875
3876 assert_eq!(path, root.join("future/file.rs"));
3877 }
3878
3879 #[cfg(unix)]
3880 #[test]
3881 fn cached_path_under_root_rejects_symlink_escape() {
3882 let dir = tempfile::tempdir().expect("create temp dir");
3883 let project = dir.path().join("project");
3884 let outside = dir.path().join("outside");
3885 fs::create_dir_all(&project).expect("create project dir");
3886 fs::create_dir_all(&outside).expect("create outside dir");
3887 fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
3888 std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
3889 let root = fs::canonicalize(&project).expect("canonicalize project");
3890
3891 assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
3892 }
3893
3894 #[test]
3895 fn extract_trigrams_tracks_next_char_and_position() {
3896 let trigrams = extract_trigrams(b"Rust");
3897 assert_eq!(trigrams.len(), 2);
3898 assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
3899 assert_eq!(
3900 trigrams[1],
3901 (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
3902 );
3903 }
3904
3905 #[test]
3906 fn index_file_trigram_filters_match_legacy_extraction() {
3907 let dir = tempfile::tempdir().expect("create temp dir");
3908 let path = dir.path().join("sample.txt");
3909 let content = b"Rust rust RUST\nxy";
3910 fs::write(&path, content).expect("write sample");
3911
3912 let mut expected = BTreeMap::new();
3913 for (trigram, next_char, position) in extract_trigrams(content) {
3914 let entry: &mut PostingFilter = expected.entry(trigram).or_default();
3915 entry.next_mask |= mask_for_next_char(next_char);
3916 entry.loc_mask |= mask_for_position(position);
3917 }
3918
3919 let mut index = SearchIndex::new();
3920 index.project_root = dir.path().to_path_buf();
3921 index.index_file(&path, content);
3922
3923 let file_id = *index.path_to_id.get(&path).expect("file indexed");
3924 let file_trigrams = index
3925 .delta_file_trigrams
3926 .get(&file_id)
3927 .expect("delta file trigrams");
3928 assert_eq!(file_trigrams, &expected.keys().copied().collect::<Vec<_>>());
3929 for (trigram, filter) in expected {
3930 let postings = index
3931 .delta_postings
3932 .get(&trigram)
3933 .expect("delta posting list");
3934 assert_eq!(postings.len(), 1);
3935 assert_eq!(postings[0].file_id, file_id);
3936 assert_eq!(postings[0].next_mask, filter.next_mask);
3937 assert_eq!(postings[0].loc_mask, filter.loc_mask);
3938 }
3939 }
3940
3941 #[test]
3942 fn decompose_regex_extracts_literals_and_alternations() {
3943 let query = decompose_regex("abc(def|ghi)xyz");
3944 assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
3945 assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
3946 assert_eq!(query.or_groups.len(), 1);
3947 assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
3948 assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
3949 }
3950
3951 #[test]
3952 fn candidates_intersect_posting_lists() {
3953 let mut index = SearchIndex::new();
3954 let dir = tempfile::tempdir().expect("create temp dir");
3955 let alpha = dir.path().join("alpha.txt");
3956 let beta = dir.path().join("beta.txt");
3957 fs::write(&alpha, "abcdef").expect("write alpha");
3958 fs::write(&beta, "abcxyz").expect("write beta");
3959 index.project_root = dir.path().to_path_buf();
3960 index.index_file(&alpha, b"abcdef");
3961 index.index_file(&beta, b"abcxyz");
3962
3963 let query = RegexQuery {
3964 and_trigrams: vec![
3965 pack_trigram(b'a', b'b', b'c'),
3966 pack_trigram(b'd', b'e', b'f'),
3967 ],
3968 ..RegexQuery::default()
3969 };
3970
3971 let candidates = index.candidates(&query);
3972 assert_eq!(candidates.len(), 1);
3973 assert_eq!(index.files[candidates[0] as usize].path, alpha);
3974 }
3975
3976 #[test]
3977 fn candidates_apply_bloom_filters() {
3978 let mut index = SearchIndex::new();
3979 let dir = tempfile::tempdir().expect("create temp dir");
3980 let file = dir.path().join("sample.txt");
3981 fs::write(&file, "abcd efgh").expect("write sample");
3982 index.project_root = dir.path().to_path_buf();
3983 index.index_file(&file, b"abcd efgh");
3984
3985 let trigram = pack_trigram(b'a', b'b', b'c');
3986 let matching_filter = PostingFilter {
3987 next_mask: mask_for_next_char(b'd'),
3988 loc_mask: mask_for_position(0),
3989 };
3990 let non_matching_filter = PostingFilter {
3991 next_mask: mask_for_next_char(b'z'),
3992 loc_mask: mask_for_position(0),
3993 };
3994
3995 assert_eq!(
3996 index
3997 .postings_for_trigram(trigram, Some(matching_filter))
3998 .len(),
3999 1
4000 );
4001 assert!(index
4002 .postings_for_trigram(trigram, Some(non_matching_filter))
4003 .is_empty());
4004 }
4005
4006 #[test]
4007 fn base_delta_readd_masks_base_and_keeps_postings_sorted() {
4008 let dir = tempfile::tempdir().expect("create temp dir");
4009 let project = dir.path().join("project");
4010 fs::create_dir_all(&project).expect("create project dir");
4011 let a = project.join("a.txt");
4012 let b = project.join("b.txt");
4013 fs::write(&a, "abc old").expect("write a");
4014 fs::write(&b, "abc base").expect("write b");
4015
4016 let mut built = SearchIndex::build(&project);
4017 let cache_dir = dir.path().join("cache");
4018 built.write_to_disk(&cache_dir, None);
4019 let mut index = SearchIndex::read_from_disk(&cache_dir, &project).expect("load base");
4020 assert_eq!(index.base_file_count, 2);
4021
4022 let old_a_id = *index.path_to_id.get(&a).expect("original a id");
4023 let b_id = *index.path_to_id.get(&b).expect("original b id");
4024 index.remove_file(&a);
4025 index.index_file(&a, b"abc new");
4026 let new_id = *index.path_to_id.get(&a).expect("re-added file id");
4027 assert!(new_id >= index.base_file_count);
4028 let abc = pack_trigram(b'a', b'b', b'c');
4029 let ids = index.postings_for_trigram(abc, None);
4030 assert_eq!(ids, {
4031 let mut expected = vec![b_id, new_id];
4032 expected.sort_unstable();
4033 expected
4034 });
4035 assert!(!ids.contains(&old_a_id));
4036 }
4037
4038 #[test]
4039 fn write_to_disk_compacts_base_and_delta() {
4040 let dir = tempfile::tempdir().expect("create temp dir");
4041 let project = dir.path().join("project");
4042 fs::create_dir_all(&project).expect("create project dir");
4043 let file = project.join("src.txt");
4044 fs::write(&file, "abcdef").expect("write source");
4045 let mut index = SearchIndex::build(&project);
4046 let cache_dir = dir.path().join("cache");
4047 index.write_to_disk(&cache_dir, None);
4048 fs::write(&file, "abcxyz").expect("edit source");
4049 index.update_file(&file);
4050 assert!(!index.delta_postings.is_empty());
4051 index.write_to_disk(&cache_dir, None);
4052 assert!(index.delta_postings.is_empty());
4053 assert!(index.superseded.is_empty());
4054 assert_eq!(
4055 index.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4056 vec![0]
4057 );
4058 assert!(index
4059 .postings_for_trigram(pack_trigram(b'd', b'e', b'f'), None)
4060 .is_empty());
4061 }
4062
4063 #[test]
4064 fn legacy_cache_without_file_trigram_count_migrates_streaming_counts() {
4065 let dir = tempfile::tempdir().expect("create temp dir");
4066 let project = dir.path().join("project");
4067 fs::create_dir_all(&project).expect("create project dir");
4068 fs::write(project.join("src.txt"), "abcdef").expect("write source");
4069 let cache_dir = dir.path().join("cache");
4070 let mut index = SearchIndex::build(&project);
4071 index.write_to_disk(&cache_dir, None);
4072 let cache_path = cache_dir.join("cache.bin");
4073 strip_file_trigram_count_extension(&cache_path);
4074 assert!(!cache_has_file_trigram_count_extension(&cache_path));
4075
4076 let loaded = SearchIndex::read_from_disk(&cache_dir, &project).expect("load legacy cache");
4077 assert_eq!(loaded.file_trigram_count.as_ref(), &[4]);
4078 assert!(loaded.delta_postings.is_empty());
4079 assert!(cache_has_file_trigram_count_extension(&cache_path));
4080 }
4081
4082 #[test]
4083 fn compaction_flags_buffer_paths_while_running() {
4084 let dir = tempfile::tempdir().expect("create temp dir");
4085 let project = dir.path().join("project");
4086 fs::create_dir_all(&project).expect("create project dir");
4087 let file = project.join("src.txt");
4088 fs::write(&file, "abcdef").expect("write source");
4089 let mut index = SearchIndex::new();
4090 index.project_root = project.clone();
4091 {
4092 let mut state = index.compaction_state.lock().expect("compaction state");
4093 state.running = true;
4094 }
4095 index.update_file(&file);
4096 let state = index.compaction_state.lock().expect("compaction state");
4097 assert!(state.requested_again || !index.delta_postings.is_empty());
4098 assert!(state.buffered_paths.contains(&file));
4099 }
4100
4101 fn cache_has_file_trigram_count_extension(cache_path: &Path) -> bool {
4102 file_trigram_count_extension_range(cache_path).is_some()
4103 }
4104
4105 fn strip_file_trigram_count_extension(cache_path: &Path) {
4106 let mut bytes = fs::read(cache_path).expect("read cache");
4107 let (start, end) = file_trigram_count_extension_range_from_bytes(&bytes)
4108 .expect("file trigram count extension");
4109 bytes.drain(start..end);
4110 let postings_len_total = u64::from_le_bytes(bytes[8..16].try_into().unwrap())
4111 - u64::try_from(end - start).unwrap();
4112 bytes[8..16].copy_from_slice(&postings_len_total.to_le_bytes());
4113 let checksum_pos = 16 + usize::try_from(postings_len_total).unwrap() - 4;
4114 let checksum = crc32fast::hash(&bytes[16..checksum_pos]);
4115 bytes[checksum_pos..checksum_pos + 4].copy_from_slice(&checksum.to_le_bytes());
4116 fs::write(cache_path, bytes).expect("write legacy cache");
4117 }
4118
4119 fn file_trigram_count_extension_range(cache_path: &Path) -> Option<(usize, usize)> {
4120 let bytes = fs::read(cache_path).ok()?;
4121 file_trigram_count_extension_range_from_bytes(&bytes)
4122 }
4123
4124 fn file_trigram_count_extension_range_from_bytes(bytes: &[u8]) -> Option<(usize, usize)> {
4125 let postings_len_total = u64::from_le_bytes(bytes.get(8..16)?.try_into().ok()?) as usize;
4126 let postings_start = 16usize;
4127 let postings_end = postings_start.checked_add(postings_len_total)?;
4128 let postings_body_end = postings_end.checked_sub(4)?;
4129 let mut reader = Cursor::new(&bytes[postings_start..postings_body_end]);
4130 let mut magic = [0u8; 8];
4131 reader.read_exact(&mut magic).ok()?;
4132 if &magic != INDEX_MAGIC {
4133 return None;
4134 }
4135 read_u32(&mut reader).ok()?;
4136 let head_len = read_u32(&mut reader).ok()? as u64;
4137 let root_len = read_u32(&mut reader).ok()? as u64;
4138 let ignore_len = read_u32(&mut reader).ok()? as u64;
4139 read_u64(&mut reader).ok()?;
4140 let file_count = read_u32(&mut reader).ok()? as usize;
4141 let skip = head_len.checked_add(root_len)?.checked_add(ignore_len)?;
4142 reader.seek(SeekFrom::Current(skip as i64)).ok()?;
4143 for _ in 0..file_count {
4144 let mut unindexed = [0u8; 1];
4145 reader.read_exact(&mut unindexed).ok()?;
4146 let path_len = read_u32(&mut reader).ok()? as u64;
4147 read_u64(&mut reader).ok()?;
4148 read_u64(&mut reader).ok()?;
4149 read_u32(&mut reader).ok()?;
4150 let mut hash = [0u8; 32];
4151 reader.read_exact(&mut hash).ok()?;
4152 reader.seek(SeekFrom::Current(path_len as i64)).ok()?;
4153 }
4154 let postings_blob_len = read_u64(&mut reader).ok()? as usize;
4155 let extension_start = postings_start
4156 .checked_add(reader.position() as usize)?
4157 .checked_add(postings_blob_len)?;
4158 if extension_start + 16 > postings_body_end {
4159 return None;
4160 }
4161 if bytes.get(extension_start..extension_start + 8)? != FILE_TRIGRAM_COUNT_MAGIC {
4162 return None;
4163 }
4164 let count = u32::from_le_bytes(
4165 bytes[extension_start + 12..extension_start + 16]
4166 .try_into()
4167 .ok()?,
4168 ) as usize;
4169 let extension_end = extension_start
4170 .checked_add(16)?
4171 .checked_add(count.checked_mul(4)?)?;
4172 (extension_end <= postings_body_end).then_some((extension_start, extension_end))
4173 }
4174
4175 #[test]
4176 fn disk_round_trip_preserves_postings_and_files() {
4177 let dir = tempfile::tempdir().expect("create temp dir");
4178 let project = dir.path().join("project");
4179 fs::create_dir_all(&project).expect("create project dir");
4180 let file = project.join("src.txt");
4181 fs::write(&file, "abcdef").expect("write source");
4182
4183 let mut index = SearchIndex::build(&project);
4184 index.git_head = Some("deadbeef".to_string());
4185 let cache_dir = dir.path().join("cache");
4186 let head = index.git_head.clone();
4187 index.write_to_disk(&cache_dir, head.as_deref());
4188
4189 let loaded =
4190 SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
4191 assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
4192 assert_eq!(loaded.files.len(), 1);
4193 assert_eq!(
4194 relative_to_root(&loaded.project_root, &loaded.files[0].path),
4195 PathBuf::from("src.txt")
4196 );
4197 assert_eq!(loaded.trigram_count(), index.trigram_count());
4198 assert_eq!(
4199 loaded.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4200 vec![0]
4201 );
4202 assert_eq!(
4203 loaded.file_trigram_count.as_ref(),
4204 index.file_trigram_count.as_ref()
4205 );
4206 }
4207
4208 #[test]
4209 fn cache_path_helpers_reject_absolute_and_parent_paths() {
4210 let root = PathBuf::from("/tmp/aft-project");
4211
4212 assert_eq!(
4213 cache_relative_path(&root, &root.join("src/lib.rs")),
4214 Some(PathBuf::from("src/lib.rs"))
4215 );
4216 assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
4217 assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
4218 assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
4219 assert_eq!(
4220 cached_path_under_root(&root, Path::new("src/./lib.rs")),
4221 Some(root.join("src/lib.rs"))
4222 );
4223 }
4224
/// Refreshing from a baseline built at an older HEAD must follow a committed rename
/// (`old.txt` -> `new.txt`), pick up an untracked file, and make an uncommitted edit to
/// a tracked file searchable.
///
/// Each git setup step is required to exit successfully, so a broken fixture fails at
/// the offending command instead of surfacing later as a confusing index assertion.
#[test]
fn refresh_after_head_change_removes_renames_and_detects_local_files() {
    /// Runs `git -C <repo> <args>` and panics unless git exits successfully.
    fn git(repo: &Path, args: &[&str], what: &str) {
        let status = Command::new("git")
            .arg("-C")
            .arg(repo)
            .args(args)
            .status()
            .expect(what);
        assert!(status.success(), "{what} exited with {status}");
    }

    let dir = tempfile::tempdir().expect("create temp dir");
    let project = dir.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    let canonical_project = fs::canonicalize(&project).expect("canonical project");
    fs::write(project.join("old.txt"), "old token\n").expect("write old");
    fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");

    let init = Command::new("git")
        .arg("init")
        .arg(&project)
        .status()
        .expect("git init");
    assert!(init.success(), "git init exited with {init}");
    for args in [
        ["config", "user.email", "aft@example.invalid"],
        ["config", "user.name", "AFT Test"],
    ] {
        git(&project, &args, "git config");
    }
    git(&project, &["add", "."], "git add initial");
    git(&project, &["commit", "-m", "initial"], "git commit initial");
    let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");

    // The baseline index is pinned to the first commit.
    let mut baseline = SearchIndex::build(&project);
    baseline.git_head = Some(previous);

    // Second commit: a pure rename.
    fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
    git(&project, &["add", "-A"], "git add rename");
    git(&project, &["commit", "-m", "rename"], "git commit rename");
    let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");

    // Working-tree changes that are not part of any commit.
    fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
    fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");

    let refreshed = SearchIndex::rebuild_or_refresh(
        &project,
        DEFAULT_MAX_FILE_SIZE,
        Some(current),
        Some(baseline),
        None,
    );

    assert!(!refreshed
        .path_to_id
        .contains_key(&canonical_project.join("old.txt")));
    assert!(refreshed
        .path_to_id
        .contains_key(&canonical_project.join("new.txt")));
    assert!(refreshed
        .path_to_id
        .contains_key(&canonical_project.join("untracked.txt")));
    let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
    assert_eq!(matches.matches.len(), 1);
}
4304
/// Inverts every bit of the final byte of a freshly written `cache.bin` and expects
/// `SearchIndex::read_from_disk` to refuse the file (per the test name, that byte is
/// expected to fall inside the lookup checksum).
#[test]
fn read_from_disk_rejects_corrupt_lookup_checksum() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let cache_dir = tmp.path().join("cache");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("src.txt"), "abcdef").expect("write source");

    let mut index = SearchIndex::build(&project);
    index.write_to_disk(&cache_dir, None);

    // Corrupt the very last byte in place and write the file back.
    let cache_file = cache_dir.join("cache.bin");
    let mut corrupted = fs::read(&cache_file).expect("read cache");
    let tail = corrupted.len() - 1;
    corrupted[tail] ^= 0xff;
    fs::write(&cache_file, corrupted).expect("write corrupted cache");

    let reloaded = SearchIndex::read_from_disk(&cache_dir, &project);
    assert!(reloaded.is_none());
}
4324
/// After `write_to_disk` the cache directory must contain `cache.bin` and no entry
/// whose file name contains `.tmp.`.
#[test]
fn write_to_disk_uses_temp_files_and_cleans_them_up() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let cache_dir = tmp.path().join("cache");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("src.txt"), "abcdef").expect("write source");

    let mut index = SearchIndex::build(&project);
    index.write_to_disk(&cache_dir, None);

    assert!(cache_dir.join("cache.bin").is_file());

    // Stops at the first leftover temp file, just like an `all(!..)` scan would.
    let leftover_tmp = fs::read_dir(&cache_dir)
        .expect("read cache dir")
        .any(|entry| {
            let name = entry.expect("cache entry").file_name();
            name.to_string_lossy().contains(".tmp.")
        });
    assert!(!leftover_tmp);
}
4345
/// Two threads each take the `CacheLock`, build an index of the same project and write
/// it to the same cache directory; afterwards the cache must still load.
#[test]
fn concurrent_search_index_writes_do_not_corrupt() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
    let cache_dir = tmp.path().join("cache");

    // Each writer owns its own copies of the paths so the thread can be `'static`.
    let spawn_writer = |lock_msg: &'static str| {
        let writer_project = project.clone();
        let writer_cache = cache_dir.clone();
        std::thread::spawn(move || {
            let _lock = CacheLock::acquire(&writer_cache).expect(lock_msg);
            let mut index = SearchIndex::build(&writer_project);
            index.write_to_disk(&writer_cache, None);
        })
    };

    let writer_a = spawn_writer("acquire cache lock a");
    let writer_b = spawn_writer("acquire cache lock b");
    writer_a.join().expect("writer a");
    writer_b.join().expect("writer b");

    let reloaded = SearchIndex::read_from_disk(&cache_dir, &project);
    assert!(reloaded.is_some());
}
4373
/// A cache directory holding only a stray `cache.bin.tmp.*` file (and no `cache.bin`)
/// must not be loaded as an index.
#[test]
fn search_index_atomic_rename_survives_partial_write() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let cache_dir = tmp.path().join("cache");
    fs::create_dir_all(&cache_dir).expect("create cache dir");

    let stray_tmp = cache_dir.join("cache.bin.tmp.1.1");
    fs::write(stray_tmp, b"partial").expect("write partial tmp");

    let loaded = SearchIndex::read_from_disk(&cache_dir, tmp.path());
    assert!(loaded.is_none());
}
4383
/// Creates a one-commit repository, clones it locally, and checks that
/// `artifact_cache_key` yields the same 16-character key for both checkouts.
#[test]
fn artifact_cache_key_shared_across_clones_of_same_repo() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let source = dir.path().join("source");
    fs::create_dir_all(&source).expect("create source repo dir");
    fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

    // Runs git inside `source` and requires a zero exit status.
    let git_in_source = |args: &[&str], what: &str| {
        let status = Command::new("git")
            .current_dir(&source)
            .args(args)
            .status()
            .expect(what);
        assert!(status.success());
    };
    git_in_source(&["init"], "init git repo");
    git_in_source(&["add", "."], "git add");
    git_in_source(
        &[
            "-c",
            "user.name=AFT Tests",
            "-c",
            "user.email=aft-tests@example.com",
            "commit",
            "-m",
            "initial",
        ],
        "git commit",
    );

    let clone = dir.path().join("clone");
    let clone_status = Command::new("git")
        .args(["clone", "--quiet"])
        .arg(&source)
        .arg(&clone)
        .status()
        .expect("git clone");
    assert!(clone_status.success());

    let source_key = artifact_cache_key(&source);
    let clone_key = artifact_cache_key(&clone);

    for key in [&source_key, &clone_key] {
        assert_eq!(key.len(), 16);
    }
    assert_eq!(source_key, clone_key);
}
4435
/// With HEAD unchanged between baseline and refresh, an uncommitted edit to a tracked
/// file must still be reflected in the refreshed index.
#[test]
fn git_head_unchanged_picks_up_local_edits() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let project = dir.path().join("repo");
    fs::create_dir_all(&project).expect("create repo dir");
    let file = project.join("tracked.txt");
    fs::write(&file, "oldtoken\n").expect("write file");

    // Runs git inside the repo and reports whether it exited successfully.
    let git_ok = |args: &[&str]| {
        Command::new("git")
            .current_dir(&project)
            .args(args)
            .status()
            .unwrap()
            .success()
    };
    assert!(git_ok(&["init"]));
    assert!(git_ok(&["add", "."]));
    assert!(git_ok(&[
        "-c",
        "user.name=AFT Tests",
        "-c",
        "user.email=aft-tests@example.com",
        "commit",
        "-m",
        "initial",
    ]));

    let head = current_git_head(&project);
    let mut baseline = SearchIndex::build(&project);
    baseline.git_head = head.clone();

    // Edit after the baseline was captured; HEAD itself does not move.
    fs::write(&file, "newtoken\n").expect("edit tracked file");

    let refreshed = SearchIndex::rebuild_or_refresh(
        &project,
        DEFAULT_MAX_FILE_SIZE,
        head,
        Some(baseline),
        None,
    );
    let hits = refreshed.grep("newtoken", true, &[], &[], &project, 10);

    assert_eq!(hits.total_matches, 1);
}
4485
/// For a project without git metadata, refreshing from an unchanged baseline must keep
/// the same file count and still find the file's content.
#[test]
fn non_git_project_reuses_cache_when_files_unchanged() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");

    let baseline = SearchIndex::build(&project);
    // Capture the count first: the baseline is moved into the refresh call.
    let expected_files = baseline.file_count();

    let refreshed = SearchIndex::rebuild_or_refresh(
        &project,
        DEFAULT_MAX_FILE_SIZE,
        None,
        Some(baseline),
        None,
    );

    assert_eq!(refreshed.file_count(), expected_files);
    let hits = refreshed.grep("unchangedtoken", true, &[], &[], &project, 10);
    assert_eq!(hits.total_matches, 1);
}
4511
/// A search path that is a sibling of the project directory must resolve to its own
/// canonical root with index use switched off.
#[test]
fn resolve_search_scope_disables_index_for_external_path() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let outside = tmp.path().join("outside");
    fs::create_dir_all(&project).expect("create project dir");
    fs::create_dir_all(&outside).expect("create outside dir");

    let scope = resolve_search_scope(&project, outside.to_str());

    let canonical_outside = fs::canonicalize(&outside).expect("canonicalize outside");
    assert_eq!(scope.root, canonical_outside);
    assert!(!scope.use_index);
}
4528
/// Grepping with `src/` as the search root must only search and report the file under
/// `src/`, even though a file under `docs/` contains the same text.
#[test]
fn grep_filters_matches_to_search_root() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let src = project.join("src");
    let docs = project.join("docs");
    fs::create_dir_all(&src).expect("create src dir");
    fs::create_dir_all(&docs).expect("create docs dir");
    fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
    fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

    let index = SearchIndex::build(&project);
    let result = index.grep("SearchIndex", true, &[], &[], &src, 10);

    assert_eq!(result.files_searched, 1);
    assert_eq!(result.files_with_matches, 1);
    assert_eq!(result.matches.len(), 1);

    // Reported paths are compared against the canonical form of the source file.
    let canonical_main = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
    assert_eq!(result.matches[0].file, canonical_main);
}
4550
/// Two occurrences of the pattern on one line must count as a single match.
#[test]
fn grep_deduplicates_multiple_matches_on_same_line() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let src = tmp.path().join("project").join("src");
    fs::create_dir_all(&src).expect("create src dir");
    fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

    let project = tmp.path().join("project");
    let index = SearchIndex::build(&project);
    let result = index.grep("SearchIndex", true, &[], &[], &src, 10);

    assert_eq!(result.total_matches, 1);
    assert_eq!(result.matches.len(), 1);
}
4565
/// A case-insensitive search (second `grep` argument `false`) for `Äbc` must match a
/// file containing the lowercase `äbc`.
#[test]
fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    let unicode_file = project.join("unicode.txt");
    fs::write(&unicode_file, "äbc\n").expect("write unicode file");

    let index = SearchIndex::build(&project);
    let result = index.grep("Äbc", false, &[], &[], &project, 10);

    assert_eq!(result.total_matches, 1);
    assert_eq!(result.matches.len(), 1);
    let canonical = fs::canonicalize(unicode_file).expect("canonicalize unicode file");
    assert_eq!(result.matches[0].file, canonical);
}
4584
/// An edit that keeps both the file size and the mtime must still be noticed by a
/// refresh: the new content is searchable, its trigram lists the file, and a trigram
/// from the old content no longer does.
#[test]
fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    fs::create_dir_all(&project).expect("create project dir");
    let file = project.join("tokens.txt");
    let pinned_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);

    fs::write(&file, "alpha").expect("write original file");
    filetime::set_file_mtime(&file, pinned_mtime).expect("set original mtime");
    let baseline = SearchIndex::build(&project);

    // "bravo" has the same length as "alpha"; the mtime is put back afterwards.
    fs::write(&file, "bravo").expect("write same-size edit");
    filetime::set_file_mtime(&file, pinned_mtime).expect("restore original mtime");

    let refreshed = SearchIndex::rebuild_or_refresh(
        &project,
        DEFAULT_MAX_FILE_SIZE,
        None,
        Some(baseline),
        None,
    );
    let hits = refreshed.grep("bravo", true, &[], &[], &project, 10);
    let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
    let file_id = *refreshed
        .path_to_id
        .get(&canonical_file)
        .expect("file remains indexed");

    assert_eq!(hits.total_matches, 1);

    let new_postings = refreshed.postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None);
    assert!(new_postings.contains(&file_id));

    let old_postings = refreshed.postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None);
    assert!(!old_postings.contains(&file_id));
}
4621
/// With a result limit of 1 and two matching lines, `total_matches` must still report 2
/// while only one match is returned and `truncated` is set.
#[test]
fn grep_reports_total_matches_before_truncation() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let src = project.join("src");
    fs::create_dir_all(&src).expect("create src dir");
    fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

    let limit = 1;
    let result = SearchIndex::build(&project).grep("SearchIndex", true, &[], &[], &src, limit);

    assert_eq!(result.total_matches, 2);
    assert_eq!(result.matches.len(), 1);
    assert!(result.truncated);
}
4637
/// Globbing `**/*.rs` with `src/` as the search root must return only the file under
/// `src/`, not the matching file under `scripts/`.
#[test]
fn glob_filters_results_to_search_root() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let src = project.join("src");
    let scripts = project.join("scripts");
    fs::create_dir_all(&src).expect("create src dir");
    fs::create_dir_all(&scripts).expect("create scripts dir");
    fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
    fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

    let index = SearchIndex::build(&project);
    let files = index.glob("**/*.rs", &src);

    let only_src = fs::canonicalize(src.join("main.rs")).expect("canonicalize src file");
    assert_eq!(files, vec![only_src]);
}
4657
/// A file with non-text bytes inside a dot-directory must still be returned by `glob`.
#[test]
fn glob_includes_hidden_and_binary_files() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let hidden_dir = project.join(".hidden");
    fs::create_dir_all(&hidden_dir).expect("create hidden dir");

    // Leading NUL byte followed by bytes that are not valid UTF-8 on their own.
    let binary_file = hidden_dir.join("data.bin");
    fs::write(&binary_file, [0u8, 159, 146, 150]).expect("write binary file");

    let index = SearchIndex::build(&project);
    let files = index.glob("**/*.bin", &project);

    let canonical = fs::canonicalize(binary_file).expect("canonicalize binary file");
    assert_eq!(files, vec![canonical]);
}
4675
/// Hand-assembles a `cache.bin` whose checksums are valid but whose single file entry
/// carries the value 1_000_000_000 in a `u32` field (presumably the sub-second
/// nanoseconds of the mtime, going by the test name — one past the largest valid value)
/// and expects `SearchIndex::read_from_disk` to reject it.
#[test]
fn read_from_disk_rejects_invalid_nanos() {
    /// Appends `value` in little-endian byte order.
    fn push_u32(buf: &mut Vec<u8>, value: u32) {
        buf.extend_from_slice(&value.to_le_bytes());
    }
    /// Appends `value` in little-endian byte order.
    fn push_u64(buf: &mut Vec<u8>, value: u64) {
        buf.extend_from_slice(&value.to_le_bytes());
    }

    let tmp = tempfile::tempdir().expect("create temp dir");
    let cache_dir = tmp.path().join("cache");
    fs::create_dir_all(&cache_dir).expect("create cache dir");

    // Postings section. The byte sequence is what matters; field meanings are defined
    // by the reader, which is not visible from this test.
    let mut postings: Vec<u8> = Vec::new();
    postings.extend_from_slice(INDEX_MAGIC);
    push_u32(&mut postings, INDEX_VERSION);
    push_u32(&mut postings, 0);
    push_u32(&mut postings, 1);
    push_u64(&mut postings, DEFAULT_MAX_FILE_SIZE);
    push_u32(&mut postings, 1);
    postings.extend_from_slice(b"/");
    postings.push(0u8);
    push_u32(&mut postings, 1);
    push_u64(&mut postings, 0);
    push_u64(&mut postings, 0);
    // The out-of-range value under test.
    push_u32(&mut postings, 1_000_000_000);
    postings.extend_from_slice(b"a");
    push_u64(&mut postings, 0);

    // Lookup section: magic, version and one zero `u32`.
    let mut lookup: Vec<u8> = Vec::new();
    lookup.extend_from_slice(LOOKUP_MAGIC);
    push_u32(&mut lookup, INDEX_VERSION);
    push_u32(&mut lookup, 0);

    // Each section is followed by a CRC32 of its own bytes, so only the field above can
    // be the reason for rejection at the checksum level.
    let postings_checksum = crc32fast::hash(&postings);
    push_u32(&mut postings, postings_checksum);
    let lookup_checksum = crc32fast::hash(&lookup);
    push_u32(&mut lookup, lookup_checksum);

    // Container: magic, version, length of the postings section (checksum included),
    // then both sections back to back.
    let mut cache: Vec<u8> = Vec::new();
    push_u32(&mut cache, CACHE_MAGIC);
    push_u32(&mut cache, INDEX_VERSION);
    push_u64(&mut cache, postings.len() as u64);
    cache.extend_from_slice(&postings);
    cache.extend_from_slice(&lookup);
    fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");

    assert!(SearchIndex::read_from_disk(&cache_dir, tmp.path()).is_none());
}
4717
/// Builds the same 80-file project with `build_with_limit_serial` and
/// `build_with_limit` and requires identical counts, path ids, per-file metadata and
/// grep results.
#[test]
fn parallel_cold_build_matches_serial_index() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    for index in 0..80 {
        let sub = project.join(format!("pkg_{index:03}"));
        fs::create_dir_all(&sub).expect("create subdir");
        let body = format!(
            "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
        );
        fs::write(sub.join("lib.rs"), body).expect("write lib");
    }

    let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
    let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);

    // Aggregate shape.
    assert_eq!(serial.file_count(), parallel.file_count());
    assert_eq!(serial.trigram_count(), parallel.trigram_count());
    assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
    assert_eq!(
        serial.file_trigram_count.as_ref(),
        parallel.file_trigram_count.as_ref()
    );

    // Every path must map to the same id in both indexes.
    for (path, id) in serial.path_to_id.iter() {
        assert_eq!(parallel.path_to_id.get(path), Some(id));
    }

    // Per-file metadata, compared position by position.
    for (from_serial, from_parallel) in serial.files.iter().zip(parallel.files.iter()) {
        assert_eq!(from_serial.path, from_parallel.path);
        assert_eq!(from_serial.size, from_parallel.size);
        assert_eq!(from_serial.modified, from_parallel.modified);
        assert_eq!(from_serial.content_hash, from_parallel.content_hash);
    }

    // Query behavior.
    let needle = "aft_perf_marker_17";
    let serial_grep = serial.grep(needle, true, &[], &[], &project, 10);
    let parallel_grep = parallel.grep(needle, true, &[], &[], &project, 10);
    assert_eq!(serial_grep.matches, parallel_grep.matches);
    assert_eq!(serial_grep.total_matches, parallel_grep.total_matches);
    assert_eq!(serial_grep.files_searched, parallel_grep.files_searched);
    assert_eq!(
        serial_grep.files_with_matches,
        parallel_grep.files_with_matches
    );
}
4764
/// `src/.gitignore` ignores `data/`, which holds 200 nested directories. The
/// walker-based `count_ignore_rule_discovery_dirs` must visit fewer directories than
/// `count_ignore_rule_discovery_dirs_legacy_stack`, and fewer than 50 overall.
///
/// Each git setup step is required to exit successfully, so a broken fixture fails at
/// the offending command rather than skewing the directory counts.
#[test]
fn ignore_rule_discovery_respects_gitignore() {
    /// Runs `git -C <repo> <args>` and panics unless git exits successfully.
    fn git(repo: &Path, args: &[&str], what: &str) {
        let status = Command::new("git")
            .arg("-C")
            .arg(repo)
            .args(args)
            .status()
            .expect(what);
        assert!(status.success(), "{what} exited with {status}");
    }

    let dir = tempfile::tempdir().expect("create temp dir");
    let project = dir.path().join("project");
    fs::create_dir_all(project.join("src")).expect("mkdir src");
    fs::write(project.join("src/.gitignore"), "data/\n").expect("write gitignore");
    let data = project.join("src/data");
    fs::create_dir_all(&data).expect("mkdir data");
    for index in 0..200 {
        fs::create_dir_all(data.join(format!("d{index}"))).expect("mkdir nested");
        fs::write(data.join(format!("d{index}/f.rs")), "fn ignored() {}\n")
            .expect("write ignored file");
    }

    let init = Command::new("git")
        .arg("init")
        .arg(&project)
        .status()
        .expect("git init");
    assert!(init.success(), "git init exited with {init}");
    for args in [
        ["config", "user.email", "aft@example.invalid"],
        ["config", "user.name", "AFT Test"],
    ] {
        git(&project, &args, "git config");
    }
    git(&project, &["add", "."], "git add");
    git(&project, &["commit", "-m", "initial"], "git commit");

    let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project);
    let walker_dirs = count_ignore_rule_discovery_dirs(&project);
    assert!(
        legacy_dirs > walker_dirs,
        "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
    );
    assert!(
        walker_dirs < 50,
        "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
    );
}
4819
/// Sorts a 30-entry list in which every odd entry points at a file that does not exist;
/// repeated 50 times, the sort must neither panic nor change the list length.
#[test]
fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
    let dir = tempfile::tempdir().expect("create tempdir");

    // Even indices are real files on disk, odd indices are never created.
    let paths: Vec<PathBuf> = (0..30)
        .map(|i| {
            if i % 2 == 0 {
                let real = dir.path().join(format!("real-{i}.rs"));
                fs::write(&real, format!("// {i}\n")).expect("write");
                real
            } else {
                dir.path().join(format!("missing-{i}.rs"))
            }
        })
        .collect();

    for _ in 0..50 {
        let mut copy = paths.clone();
        sort_paths_by_mtime_desc(&mut copy);
        assert_eq!(copy.len(), paths.len());
    }
}
4861
/// Forty files each contain the needle once; with a limit of 1000 the grep must return
/// at least 40 matches and report neither `engine_capped` nor `truncated`.
#[test]
fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
    let dir = tempfile::tempdir().expect("create tempdir");
    let root = dir.path();
    for i in 0..40 {
        let source = format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n");
        fs::write(root.join(format!("file-{i}.rs")), source).expect("write");
    }

    let index = SearchIndex::build_with_limit(root, DEFAULT_MAX_FILE_SIZE);
    let result = index.grep("needle_token", false, &[], &[], root, 1000);

    assert!(
        result.matches.len() >= 40,
        "expected a match per file, got {}",
        result.matches.len()
    );
    assert!(
        !result.engine_capped,
        "an uncapped grep over >10 files must not report engine_capped"
    );
    assert!(!result.truncated, "uncapped grep must not be truncated");
}
4892
/// Sorts 30 grep matches in which every odd entry refers to a file that does not exist;
/// repeated 50 times, the sort must neither panic nor change the number of matches.
#[test]
fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
    let dir = tempfile::tempdir().expect("create tempdir");

    // Even indices are real files on disk, odd indices are never created.
    let matches: Vec<GrepMatch> = (0..30)
        .map(|i| {
            let file = if i % 2 == 0 {
                let real = dir.path().join(format!("real-{i}.rs"));
                fs::write(&real, format!("// {i}\n")).expect("write");
                real
            } else {
                dir.path().join(format!("missing-{i}.rs"))
            };
            GrepMatch {
                file,
                line: u32::try_from(i).unwrap_or(0),
                column: 0,
                line_text: format!("match {i}"),
                match_text: format!("match {i}"),
            }
        })
        .collect();

    for _ in 0..50 {
        let mut copy = matches.clone();
        sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
        assert_eq!(copy.len(), matches.len());
    }
}
4923}