1use std::collections::{BTreeMap, BTreeSet, BinaryHeap, HashMap, HashSet};
2use std::fs::{self, File, OpenOptions};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, SeekFrom, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const SPILL_MAGIC: &[u8; 8] = b"AFTSPI01";
27const FILE_TRIGRAM_COUNT_MAGIC: &[u8; 8] = b"AFTFTC01";
28const INDEX_VERSION: u32 = 4;
29const PREVIEW_BYTES: usize = 8 * 1024;
30const SPIMI_SOFT_LIMIT_BYTES: usize = 128 * 1024 * 1024;
31const SPIMI_HARD_LIMIT_BYTES: usize = 256 * 1024 * 1024;
32const SPILL_RECORD_ESTIMATED_BYTES: usize = 16;
33const DELTA_COMPACT_SOFT_FILES: usize = 1_000;
34const DELTA_COMPACT_HARD_FILES: usize = 5_000;
35const DELTA_COMPACT_SOFT_BYTES: usize = 32 * 1024 * 1024;
36const DELTA_COMPACT_HARD_BYTES: usize = 128 * 1024 * 1024;
37const EOF_SENTINEL: u8 = 0;
38const MAX_ENTRIES: usize = 10_000_000;
39const MIN_FILE_ENTRY_BYTES: usize = 57;
40const LOOKUP_ENTRY_BYTES: usize = 16;
41const POSTING_BYTES: usize = 6;
42static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
43
44pub struct CacheLock {
45 _guard: fs_lock::LockGuard,
46}
47
48impl CacheLock {
49 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
50 fs::create_dir_all(cache_dir)?;
51 let path = cache_dir.join("cache.lock");
52 let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
53 .lock()
54 .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
55 fs_lock::try_acquire(&path, Duration::from_secs(2))
56 .map(|guard| Self { _guard: guard })
57 .map_err(|error| match error {
58 fs_lock::AcquireError::Timeout => {
59 std::io::Error::other("timed out acquiring search cache lock")
60 }
61 fs_lock::AcquireError::Io(error) => error,
62 })
63 }
64}
65
66#[derive(Clone, Debug)]
67pub struct SearchIndex {
68 base: Option<Arc<BasePostings>>,
69 delta_postings: HashMap<u32, Vec<Posting>>,
70 delta_file_trigrams: HashMap<u32, Vec<u32>>,
71 pub files: Arc<Vec<FileEntry>>,
72 pub path_to_id: Arc<HashMap<PathBuf, u32>>,
73 pub ready: bool,
74 project_root: PathBuf,
75 git_head: Option<String>,
76 max_file_size: u64,
77 ignore_rules_fingerprint: String,
78 pub file_trigram_count: Arc<Vec<u32>>,
79 unindexed_files: Arc<HashSet<u32>>,
80 superseded: HashSet<u32>,
81 base_file_count: u32,
82 delta_packed_bytes: usize,
83 compaction_state: Arc<Mutex<CompactionState>>,
84}
85
86#[derive(Clone, Debug)]
87struct BasePostings {
88 file: Arc<File>,
89 postings_blob_start: u64,
90 postings_blob_len: u64,
91 lookup: Arc<Vec<LookupEntry>>,
92}
93
94#[derive(Clone, Copy, Debug, PartialEq, Eq)]
95struct LookupEntry {
96 trigram: u32,
97 offset: u64,
98 count: u32,
99}
100
101#[derive(Clone, Debug, Default)]
102struct CompactionState {
103 running: bool,
104 requested_again: bool,
105 buffered_paths: Vec<PathBuf>,
106}
107
108#[derive(Clone, Debug)]
109pub struct SearchIndexSnapshot {
110 base: Option<Arc<BasePostings>>,
111 delta_postings: Arc<HashMap<u32, Vec<Posting>>>,
112 files: Arc<Vec<FileEntry>>,
113 path_to_id: Arc<HashMap<PathBuf, u32>>,
114 ready: bool,
115 project_root: PathBuf,
116 file_trigram_count: Arc<Vec<u32>>,
117 unindexed_files: Arc<HashSet<u32>>,
118 superseded: Arc<HashSet<u32>>,
119}
120
121#[derive(Clone, Debug, Default)]
122pub struct LexicalRankResult {
123 pub files: Vec<(PathBuf, f32)>,
124 pub engine_capped: bool,
125}
126
127impl SearchIndex {
128 pub fn file_count(&self) -> usize {
130 self.files.len()
131 }
132
133 pub fn trigram_count(&self) -> usize {
135 self.snapshot().trigram_count()
136 }
137
138 pub fn snapshot(&self) -> SearchIndexSnapshot {
143 SearchIndexSnapshot {
144 base: self.base.clone(),
145 delta_postings: Arc::new(self.delta_postings.clone()),
146 files: Arc::clone(&self.files),
147 path_to_id: Arc::clone(&self.path_to_id),
148 ready: self.ready,
149 project_root: self.project_root.clone(),
150 file_trigram_count: Arc::clone(&self.file_trigram_count),
151 unindexed_files: Arc::clone(&self.unindexed_files),
152 superseded: Arc::new(self.superseded.clone()),
153 }
154 }
155
156 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
158 query_trigrams_from_tokens(tokens)
159 }
160
161 pub fn lexical_rank(
163 &self,
164 query_trigrams: &[u32],
165 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
166 max_files: usize,
167 ) -> Vec<(PathBuf, f32)> {
168 self.snapshot()
169 .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
170 .files
171 }
172
173 pub fn lexical_rank_with_stats(
176 &self,
177 query_trigrams: &[u32],
178 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
179 max_files: usize,
180 ) -> LexicalRankResult {
181 self.snapshot()
182 .lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
183 }
184}
185
186impl SearchIndexSnapshot {
187 pub fn trigram_count(&self) -> usize {
189 let base_count = self.base.as_ref().map_or(0, |base| base.lookup.len());
190 let Some(base) = &self.base else {
191 return self.delta_postings.len();
192 };
193 base_count
194 + self
195 .delta_postings
196 .keys()
197 .filter(|trigram| base.lookup_entry(**trigram).is_none())
198 .count()
199 }
200
201 pub fn lexical_rank_with_stats(
204 &self,
205 query_trigrams: &[u32],
206 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
207 max_files: usize,
208 ) -> LexicalRankResult {
209 if query_trigrams.is_empty() || max_files == 0 {
210 return LexicalRankResult::default();
211 }
212
213 let mut non_zero: Vec<(u32, usize)> = query_trigrams
214 .iter()
215 .filter_map(|trigram| {
216 let posting_count = self.posting_count(*trigram);
217 (posting_count > 0).then_some((*trigram, posting_count))
218 })
219 .collect();
220 if non_zero.is_empty() {
221 return LexicalRankResult::default();
222 }
223
224 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
225 let selected_count = non_zero.len().min(3);
226 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
227
228 let mut candidate_ids = BTreeSet::new();
229 for (trigram, _) in non_zero.iter().take(selected_count) {
230 for file_id in self.postings_for_trigram(*trigram, None) {
231 candidate_ids.insert(file_id);
232 }
233 }
234 let pre_filter_candidate_count = candidate_ids.len();
235 let engine_capped = pre_filter_candidate_count > candidate_cap;
236 let filtered_candidates = candidate_ids
237 .into_iter()
238 .filter_map(|file_id| {
239 self.files
240 .get(file_id as usize)
241 .map(|entry| (file_id, entry))
242 })
243 .filter(|(_, entry)| {
244 if let Some(filter) = candidate_filter {
245 filter(&entry.path)
246 } else {
247 true
248 }
249 })
250 .collect::<Vec<_>>();
251
252 let mut ranked = Vec::new();
253 for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
254 let score = lexical_score_snapshot(self, query_trigrams, file_id);
255 if score > 0.0 {
256 ranked.push((entry.path.clone(), score));
257 }
258 }
259
260 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
261 ranked.truncate(max_files);
262 LexicalRankResult {
263 files: ranked,
264 engine_capped,
265 }
266 }
267}
268
269#[derive(Clone, Debug, PartialEq, Eq)]
270pub struct Posting {
271 pub file_id: u32,
272 pub next_mask: u8,
273 pub loc_mask: u8,
274}
275
276#[derive(Clone, Debug)]
277pub struct FileEntry {
278 pub path: PathBuf,
279 pub size: u64,
280 pub modified: SystemTime,
281 pub content_hash: blake3::Hash,
282}
283
284#[derive(Clone, Debug, PartialEq, Eq)]
285pub struct GrepMatch {
286 pub file: PathBuf,
287 pub line: u32,
288 pub column: u32,
289 pub line_text: String,
290 pub match_text: String,
291}
292
293#[derive(Clone, Debug)]
294pub struct GrepResult {
295 pub matches: Vec<GrepMatch>,
296 pub total_matches: usize,
297 pub files_searched: usize,
298 pub files_with_matches: usize,
299 pub index_status: IndexStatus,
300 pub truncated: bool,
301 pub fully_degraded: bool,
302 pub engine_capped: bool,
303 pub walk_truncated: bool,
305}
306
307#[derive(Clone, Copy, Debug, PartialEq, Eq)]
308pub enum IndexStatus {
309 Ready,
310 Building,
311 Fallback,
312 Disabled,
313}
314
315impl IndexStatus {
316 pub fn as_str(&self) -> &'static str {
317 match self {
318 IndexStatus::Ready => "Ready",
319 IndexStatus::Building => "Building",
320 IndexStatus::Fallback => "Fallback",
321 IndexStatus::Disabled => "Disabled",
322 }
323 }
324}
325
326#[derive(Clone, Debug, Default)]
327pub struct RegexQuery {
328 pub and_trigrams: Vec<u32>,
329 pub or_groups: Vec<Vec<u32>>,
330 pub(crate) and_filters: HashMap<u32, PostingFilter>,
331 pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
332}
333
334#[derive(Clone, Copy, Debug, Default)]
335pub(crate) struct PostingFilter {
336 next_mask: u8,
337 loc_mask: u8,
338}
339
340#[derive(Clone, Copy)]
341struct SearchFileMetadata {
342 size: u64,
343 modified: SystemTime,
344}
345
346struct PreparedIndexedFile {
347 metadata: SearchFileMetadata,
348 content_hash: blake3::Hash,
349 trigram_map: BTreeMap<u32, PostingFilter>,
350}
351
352enum PreparedSearchPath {
353 Indexed(PreparedIndexedFile),
354 Unindexed(SearchFileMetadata),
355 Skipped,
356}
357
358#[derive(Clone, Debug, Default)]
359struct QueryBuild {
360 and_runs: Vec<Vec<u8>>,
361 or_groups: Vec<Vec<Vec<u8>>>,
362}
363
364#[derive(Clone, Debug, Default)]
365pub(crate) struct PathFilters {
366 includes: Option<GlobSet>,
367 excludes: Option<GlobSet>,
368}
369
370#[derive(Clone, Debug)]
371pub(crate) struct SearchScope {
372 pub root: PathBuf,
373 pub use_index: bool,
374}
375
376#[derive(Clone, Debug)]
377struct SharedGrepMatch {
378 file: Arc<PathBuf>,
379 line: u32,
380 column: u32,
381 line_text: String,
382 match_text: String,
383}
384
385#[derive(Clone, Debug)]
386enum SearchMatcher {
387 Literal(LiteralSearch),
388 Regex(Regex),
389}
390
391impl SearchIndex {
392 pub fn new() -> Self {
393 SearchIndex {
394 base: None,
395 delta_postings: HashMap::new(),
396 delta_file_trigrams: HashMap::new(),
397 files: Arc::new(Vec::new()),
398 path_to_id: Arc::new(HashMap::new()),
399 ready: false,
400 project_root: PathBuf::new(),
401 git_head: None,
402 max_file_size: DEFAULT_MAX_FILE_SIZE,
403 ignore_rules_fingerprint: String::new(),
404 file_trigram_count: Arc::new(Vec::new()),
405 unindexed_files: Arc::new(HashSet::new()),
406 superseded: HashSet::new(),
407 base_file_count: 0,
408 delta_packed_bytes: 0,
409 compaction_state: Arc::new(Mutex::new(CompactionState::default())),
410 }
411 }
412
413 pub fn build(root: &Path) -> Self {
414 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
415 }
416
417 pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
418 let cache_dir = transient_search_cache_dir(root);
419 Self::build_with_limit_to_cache_dir(root, max_file_size, &cache_dir)
420 }
421
422 pub fn build_with_limit_to_cache_dir(
423 root: &Path,
424 max_file_size: u64,
425 cache_dir: &Path,
426 ) -> Self {
427 let started = std::time::Instant::now();
428 match build_streaming_index(root, max_file_size, cache_dir) {
429 Ok((mut index, indexed)) => {
430 index.ready = true;
431 crate::slog_info!(
432 "search index cold streaming build: {} files, {} trigrams, {} ms (pool={})",
433 indexed,
434 index.trigram_count(),
435 started.elapsed().as_millis(),
436 search_index_build_pool_size()
437 );
438 index
439 }
440 Err(error) => {
441 log::warn!(
442 "search index: streaming build failed ({}); falling back to bounded in-memory delta",
443 error
444 );
445 let mut index = SearchIndex {
446 project_root: fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
447 max_file_size,
448 ignore_rules_fingerprint: ignore_rules_fingerprint(
449 &fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf()),
450 ),
451 ..SearchIndex::new()
452 };
453 let filters = PathFilters::default();
454 let paths: Vec<PathBuf> = walk_project_files(&index.project_root, &filters);
455 let indexed = index.ingest_paths_parallel(&paths);
456 index.git_head = current_git_head(&index.project_root);
457 index.ready = true;
458 crate::slog_info!(
459 "search index fallback build: {} files, {} trigrams, {} ms (pool={})",
460 indexed,
461 index.trigram_count(),
462 started.elapsed().as_millis(),
463 search_index_build_pool_size()
464 );
465 index
466 }
467 }
468 }
469
470 #[cfg(test)]
472 pub fn build_with_limit_serial(root: &Path, max_file_size: u64) -> Self {
473 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
474 let mut index = SearchIndex {
475 project_root: project_root.clone(),
476 max_file_size,
477 ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
478 ..SearchIndex::new()
479 };
480 let filters = PathFilters::default();
481 for path in walk_project_files(&project_root, &filters) {
482 index.update_file(&path);
483 }
484 index.git_head = current_git_head(&project_root);
485 index.ready = true;
486 index
487 }
488
489 fn ingest_paths_parallel(&mut self, paths: &[PathBuf]) -> usize {
490 let max_file_size = self.max_file_size;
491 let pool_size = search_index_build_pool_size();
492 let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
493 let pool = match rayon::ThreadPoolBuilder::new()
494 .num_threads(pool_size)
495 .thread_name(|index| format!("aft-search-build-{index}"))
496 .stack_size(8 * 1024 * 1024)
497 .build()
498 {
499 Ok(pool) => Some(pool),
500 Err(error) => {
501 log::warn!(
502 "search index: bounded build pool unavailable ({error}); using global pool"
503 );
504 None
505 }
506 };
507
508 let mut indexed = 0usize;
509 for chunk in paths.chunks(chunk_size) {
510 let prepare_chunk = || -> Vec<PreparedSearchPath> {
511 chunk
512 .par_iter()
513 .map(|path| prepare_search_path(path, max_file_size))
514 .collect()
515 };
516 let prepared = match &pool {
517 Some(pool) => pool.install(prepare_chunk),
518 None => prepare_chunk(),
519 };
520
521 for (path, prepared) in chunk.iter().zip(prepared) {
522 let inserted = match prepared {
523 PreparedSearchPath::Indexed(file) => self.index_prepared_new_file(path, file),
524 PreparedSearchPath::Unindexed(metadata) => {
525 self.track_unindexed_file_with_metadata(path, metadata)
526 }
527 PreparedSearchPath::Skipped => false,
528 };
529 if inserted {
530 indexed += 1;
531 }
532 }
533 }
534
535 indexed
536 }
537
538 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
539 self.remove_file(path);
540 let metadata = metadata_for_indexed_content(path, content.len() as u64);
541 self.index_file_with_metadata(path, content, metadata);
542 }
543
544 fn index_file_with_metadata(
545 &mut self,
546 path: &Path,
547 content: &[u8],
548 metadata: SearchFileMetadata,
549 ) -> bool {
550 self.index_prepared_new_file(
551 path,
552 PreparedIndexedFile {
553 metadata,
554 content_hash: cache_freshness::hash_bytes(content),
555 trigram_map: trigram_filter_map(content, true),
556 },
557 )
558 }
559
560 fn index_prepared_new_file(&mut self, path: &Path, file: PreparedIndexedFile) -> bool {
561 let file_id = match self.allocate_file_id_with_metadata(path, file.metadata) {
562 Some(file_id) => file_id,
563 None => return false,
564 };
565 if let Some(entry) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
566 entry.content_hash = file.content_hash;
567 }
568
569 let mut file_trigrams = Vec::with_capacity(file.trigram_map.len());
570 for (trigram, filter) in file.trigram_map {
571 let postings = self.delta_postings.entry(trigram).or_default();
572 postings.push(Posting {
573 file_id,
574 next_mask: filter.next_mask,
575 loc_mask: filter.loc_mask,
576 });
577 if postings.len() > 1
578 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
579 {
580 postings.sort_unstable_by_key(|p| p.file_id);
581 }
582 file_trigrams.push(trigram);
583 }
584
585 let trigram_count = file_trigrams.len() as u32;
586 self.delta_packed_bytes = self
587 .delta_packed_bytes
588 .saturating_add(file_trigrams.len().saturating_mul(POSTING_BYTES));
589 self.delta_file_trigrams.insert(file_id, file_trigrams);
590 ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
591 if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
592 *count = trigram_count;
593 }
594 Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
595 self.update_compaction_flags(Some(path));
596 true
597 }
598
599 pub fn remove_file(&mut self, path: &Path) {
600 let canonical_path = canonicalize_existing_or_deleted_path(path);
601 let file_id = {
602 let path_to_id = Arc::make_mut(&mut self.path_to_id);
603 if let Some(file_id) = path_to_id.remove(path) {
604 file_id
605 } else if canonical_path.as_path() != path {
606 let Some(file_id) = path_to_id.remove(&canonical_path) else {
607 return;
608 };
609 file_id
610 } else {
611 return;
612 }
613 };
614
615 if file_id < self.base_file_count {
616 self.superseded.insert(file_id);
617 }
618
619 if let Some(trigrams) = self.delta_file_trigrams.remove(&file_id) {
620 self.delta_packed_bytes = self
621 .delta_packed_bytes
622 .saturating_sub(trigrams.len().saturating_mul(POSTING_BYTES));
623 for trigram in trigrams {
624 let should_remove = if let Some(postings) = self.delta_postings.get_mut(&trigram) {
625 postings.retain(|posting| posting.file_id != file_id);
626 postings.is_empty()
627 } else {
628 false
629 };
630
631 if should_remove {
632 self.delta_postings.remove(&trigram);
633 }
634 }
635 }
636
637 Arc::make_mut(&mut self.unindexed_files).remove(&file_id);
638 if let Some(file) = Arc::make_mut(&mut self.files).get_mut(file_id as usize) {
639 file.path = PathBuf::new();
640 file.size = 0;
641 file.modified = UNIX_EPOCH;
642 file.content_hash = cache_freshness::zero_hash();
643 }
644 if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
645 *count = 0;
646 }
647 self.update_compaction_flags(Some(path));
648 }
649
650 pub fn update_file(&mut self, path: &Path) {
651 self.remove_file(path);
652
653 let metadata = match fs::metadata(path) {
654 Ok(metadata) if metadata.is_file() => metadata,
655 _ => return,
656 };
657
658 let metadata = search_file_metadata(&metadata);
659
660 if is_binary_path(path, metadata.size) {
661 self.track_unindexed_file_with_metadata(path, metadata);
662 return;
663 }
664
665 if metadata.size > self.max_file_size {
666 self.track_unindexed_file_with_metadata(path, metadata);
667 return;
668 }
669
670 let content = match fs::read(path) {
671 Ok(content) => content,
672 Err(_) => return,
673 };
674
675 if is_binary_bytes(&content) {
676 self.track_unindexed_file_with_metadata(path, metadata);
677 return;
678 }
679
680 self.index_file_with_metadata(path, &content, metadata);
681 }
682
683 pub fn grep(
684 &self,
685 pattern: &str,
686 case_sensitive: bool,
687 include: &[String],
688 exclude: &[String],
689 search_root: &Path,
690 max_results: usize,
691 ) -> GrepResult {
692 self.snapshot().grep(
693 pattern,
694 case_sensitive,
695 include,
696 exclude,
697 search_root,
698 max_results,
699 )
700 }
701
702 pub fn search_grep(
703 &self,
704 pattern: &CompiledPattern,
705 include: &[String],
706 exclude: &[String],
707 search_root: &Path,
708 max_results: usize,
709 ) -> GrepResult {
710 self.snapshot()
711 .search_grep(pattern, include, exclude, search_root, max_results)
712 }
713
714 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
715 self.snapshot().glob(pattern, search_root)
716 }
717
718 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
719 self.snapshot().candidates(query)
720 }
721
722 pub fn write_to_disk(&mut self, cache_dir: &Path, git_head: Option<&str>) {
723 let Some(plan) = CacheWritePlan::from_index(self, git_head) else {
724 return;
725 };
726
727 let write_result = {
728 let mut sources = self.compaction_record_sources(Arc::clone(&plan.id_map));
729 write_cache_file_from_sources(cache_dir, &plan, &mut sources)
730 };
731
732 match write_result {
733 Ok(base) => {
734 self.base = Some(Arc::new(base));
735 self.delta_postings.clear();
736 self.delta_file_trigrams.clear();
737 self.superseded.clear();
738 self.delta_packed_bytes = 0;
739 self.base_file_count = u32::try_from(plan.files.len()).unwrap_or(u32::MAX);
740 self.files = Arc::new(plan.files);
741 self.path_to_id = Arc::new(plan.path_to_id);
742 self.unindexed_files = Arc::new(plan.unindexed_files);
743 self.file_trigram_count = Arc::new(plan.file_trigram_count);
744 self.git_head = plan.git_head.filter(|head| !head.is_empty());
745 self.ignore_rules_fingerprint = plan.ignore_fingerprint;
746 }
747 Err(error) => {
748 log::warn!("search index: failed to write disk cache: {}", error);
749 }
750 }
751 }
752
753 pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
754 debug_assert!(current_canonical_root.is_absolute());
755 let cache_path = cache_dir.join("cache.bin");
756 let cache_file = open_cache_file_read(&cache_path).ok()?;
757 let file_len = cache_file.metadata().ok()?.len();
758 if file_len < 16 {
759 return None;
760 }
761
762 let mut reader = BufReader::new(cache_file.try_clone().ok()?);
763 if read_u32(&mut reader).ok()? != CACHE_MAGIC {
764 return None;
765 }
766 if read_u32(&mut reader).ok()? != INDEX_VERSION {
767 return None;
768 }
769 let postings_len_total = read_u64(&mut reader).ok()?;
770 let postings_section_start = reader.stream_position().ok()?;
771 let postings_section_end = postings_section_start.checked_add(postings_len_total)?;
772 if postings_len_total < 4 || postings_section_end > file_len {
773 return None;
774 }
775 let postings_body_end = postings_section_end.checked_sub(4)?;
776
777 let mut magic = [0u8; 8];
778 reader.read_exact(&mut magic).ok()?;
779 if &magic != INDEX_MAGIC {
780 return None;
781 }
782 if read_u32(&mut reader).ok()? != INDEX_VERSION {
783 return None;
784 }
785
786 let head_len = read_u32(&mut reader).ok()? as usize;
787 let root_len = read_u32(&mut reader).ok()? as usize;
788 let ignore_fingerprint_len = read_u32(&mut reader).ok()? as usize;
789 let max_file_size = read_u64(&mut reader).ok()?;
790 let file_count = read_u32(&mut reader).ok()? as usize;
791 if file_count > MAX_ENTRIES {
792 return None;
793 }
794
795 if !reader_has_remaining(&mut reader, postings_body_end, head_len).ok()? {
796 return None;
797 }
798 let mut head_bytes = vec![0u8; head_len];
799 reader.read_exact(&mut head_bytes).ok()?;
800 let git_head = String::from_utf8(head_bytes)
801 .ok()
802 .filter(|head| !head.is_empty());
803
804 if !reader_has_remaining(&mut reader, postings_body_end, root_len).ok()? {
805 return None;
806 }
807 let mut root_bytes = vec![0u8; root_len];
808 reader.read_exact(&mut root_bytes).ok()?;
809 let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
810 let project_root = current_canonical_root.to_path_buf();
811
812 if !reader_has_remaining(&mut reader, postings_body_end, ignore_fingerprint_len).ok()? {
813 return None;
814 }
815 let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
816 reader.read_exact(&mut ignore_fingerprint_bytes).ok()?;
817 let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
818 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
819 if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
820 return None;
821 }
822
823 let mut files = Vec::with_capacity(file_count);
824 let mut path_to_id = HashMap::new();
825 let mut unindexed_files = HashSet::new();
826
827 for file_id in 0..file_count {
828 if !reader_has_remaining(&mut reader, postings_body_end, MIN_FILE_ENTRY_BYTES).ok()? {
829 return None;
830 }
831 let mut unindexed = [0u8; 1];
832 reader.read_exact(&mut unindexed).ok()?;
833 let path_len = read_u32(&mut reader).ok()? as usize;
834 let size = read_u64(&mut reader).ok()?;
835 let secs = read_u64(&mut reader).ok()?;
836 let nanos = read_u32(&mut reader).ok()?;
837 let mut hash_bytes = [0u8; 32];
838 reader.read_exact(&mut hash_bytes).ok()?;
839 let content_hash = blake3::Hash::from_bytes(hash_bytes);
840 if nanos >= 1_000_000_000 {
841 return None;
842 }
843 if !reader_has_remaining(&mut reader, postings_body_end, path_len).ok()? {
844 return None;
845 }
846 let mut path_bytes = vec![0u8; path_len];
847 reader.read_exact(&mut path_bytes).ok()?;
848 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
849 let full_path = cached_path_under_root(&project_root, &relative_path)?;
850 let file_id_u32 = u32::try_from(file_id).ok()?;
851
852 files.push(FileEntry {
853 path: full_path.clone(),
854 size,
855 modified: UNIX_EPOCH + Duration::new(secs, nanos),
856 content_hash,
857 });
858 path_to_id.insert(full_path, file_id_u32);
859 if unindexed[0] == 1 {
860 unindexed_files.insert(file_id_u32);
861 }
862 }
863
864 if !reader_has_remaining(&mut reader, postings_body_end, 8).ok()? {
865 return None;
866 }
867 let postings_blob_len = read_u64(&mut reader).ok()?;
868 let postings_blob_start = reader.stream_position().ok()?;
869 let postings_blob_end = postings_blob_start.checked_add(postings_blob_len)?;
870 if postings_blob_end > postings_body_end || postings_blob_len % POSTING_BYTES as u64 != 0 {
871 return None;
872 }
873
874 let lookup_section_start = postings_section_end;
875 if lookup_section_start >= file_len {
876 return None;
877 }
878 let mut lookup_file = cache_file.try_clone().ok()?;
879 lookup_file
880 .seek(SeekFrom::Start(lookup_section_start))
881 .ok()?;
882 let mut lookup_bytes = Vec::new();
883 lookup_file.read_to_end(&mut lookup_bytes).ok()?;
884 if lookup_bytes.len() < 4 {
885 return None;
886 }
887 verify_crc32_bytes_slice(&lookup_bytes).ok()?;
888 let lookup_body_len = lookup_bytes.len().checked_sub(4)?;
889 let mut lookup_reader = BufReader::new(Cursor::new(&lookup_bytes));
890 let mut lookup_magic = [0u8; 8];
891 lookup_reader.read_exact(&mut lookup_magic).ok()?;
892 if &lookup_magic != LOOKUP_MAGIC {
893 return None;
894 }
895 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
896 return None;
897 }
898 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
899 if entry_count > MAX_ENTRIES {
900 return None;
901 }
902 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
903 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
904 if minimum_lookup_bytes > remaining_lookup {
905 return None;
906 }
907
908 let mut lookup = Vec::with_capacity(entry_count);
909 let mut previous_trigram = None;
910 for _ in 0..entry_count {
911 let trigram = read_u32(&mut lookup_reader).ok()?;
912 let offset = read_u64(&mut lookup_reader).ok()?;
913 let count = read_u32(&mut lookup_reader).ok()?;
914 if count as usize > MAX_ENTRIES {
915 return None;
916 }
917 if previous_trigram.is_some_and(|previous| previous >= trigram) {
918 return None;
919 }
920 previous_trigram = Some(trigram);
921 let bytes_len = (count as u64).checked_mul(POSTING_BYTES as u64)?;
922 let end = offset.checked_add(bytes_len)?;
923 if end > postings_blob_len {
924 return None;
925 }
926 lookup.push(LookupEntry {
927 trigram,
928 offset,
929 count,
930 });
931 }
932
933 let base = BasePostings {
934 file: Arc::new(cache_file),
935 postings_blob_start,
936 postings_blob_len,
937 lookup: Arc::new(lookup),
938 };
939
940 let (file_trigram_count, migrated_counts) = match read_file_trigram_count_extension(
941 &base,
942 postings_blob_end,
943 postings_body_end,
944 file_count,
945 ) {
946 Ok(Some(counts)) => (counts, false),
947 Ok(None) => (
948 compute_file_trigram_counts_from_base(&base, file_count).ok()?,
949 true,
950 ),
951 Err(_) => return None,
952 };
953
954 let mut index = SearchIndex {
955 base: Some(Arc::new(base)),
956 delta_postings: HashMap::new(),
957 delta_file_trigrams: HashMap::new(),
958 files: Arc::new(files),
959 path_to_id: Arc::new(path_to_id),
960 ready: false,
961 project_root,
962 git_head,
963 max_file_size,
964 ignore_rules_fingerprint: current_ignore_rules_fingerprint,
965 file_trigram_count: Arc::new(file_trigram_count),
966 unindexed_files: Arc::new(unindexed_files),
967 superseded: HashSet::new(),
968 base_file_count: u32::try_from(file_count).ok()?,
969 delta_packed_bytes: 0,
970 compaction_state: Arc::new(Mutex::new(CompactionState::default())),
971 };
972
973 if migrated_counts {
974 if let Ok(_lock) = CacheLock::acquire(cache_dir) {
975 let head = index.git_head.clone();
976 index.write_to_disk(cache_dir, head.as_deref());
977 }
978 }
979
980 Some(index)
981 }
982
983 pub fn stored_git_head(&self) -> Option<&str> {
984 self.git_head.as_deref()
985 }
986
987 pub(crate) fn set_ready(&mut self, ready: bool) {
988 self.ready = ready;
989 }
990
991 pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
992 self.git_head = current_head;
993 verify_file_mtimes(self);
994 self.ready = true;
995 }
996
997 #[cfg(debug_assertions)]
998 #[doc(hidden)]
999 pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1000 self.verify_against_disk(current_head);
1001 }
1002
1003 pub(crate) fn rebuild_or_refresh(
1004 root: &Path,
1005 max_file_size: u64,
1006 current_head: Option<String>,
1007 baseline: Option<SearchIndex>,
1008 cache_dir: Option<&Path>,
1009 ) -> Self {
1010 if let Some(mut baseline) = baseline {
1011 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1012 baseline.max_file_size = max_file_size;
1013 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1014 if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1015 return match cache_dir {
1016 Some(cache_dir) => {
1017 SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1018 }
1019 None => SearchIndex::build_with_limit(root, max_file_size),
1020 };
1021 }
1022 baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1023
1024 if baseline.git_head == current_head || current_head.is_none() {
1025 baseline.git_head = current_head;
1032 verify_file_mtimes(&mut baseline);
1033 baseline.ready = true;
1034 return baseline;
1035 }
1036
1037 if let (Some(previous), Some(current)) =
1038 (baseline.git_head.clone(), current_head.clone())
1039 {
1040 let project_root = baseline.project_root.clone();
1041 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1042 baseline.git_head = Some(current);
1043 verify_file_mtimes(&mut baseline);
1044 baseline.ready = true;
1045 return baseline;
1046 }
1047 }
1048 }
1049
1050 match cache_dir {
1051 Some(cache_dir) => {
1052 SearchIndex::build_with_limit_to_cache_dir(root, max_file_size, cache_dir)
1053 }
1054 None => SearchIndex::build_with_limit(root, max_file_size),
1055 }
1056 }
1057
1058 fn allocate_file_id_with_metadata(
1059 &mut self,
1060 path: &Path,
1061 metadata: SearchFileMetadata,
1062 ) -> Option<u32> {
1063 let file_id = u32::try_from(self.files.len()).ok()?;
1064 Arc::make_mut(&mut self.files).push(FileEntry {
1065 path: path.to_path_buf(),
1066 size: metadata.size,
1067 modified: metadata.modified,
1068 content_hash: cache_freshness::zero_hash(),
1069 });
1070 Arc::make_mut(&mut self.path_to_id).insert(path.to_path_buf(), file_id);
1071 ensure_count_slot(Arc::make_mut(&mut self.file_trigram_count), file_id);
1072 Some(file_id)
1073 }
1074
1075 fn track_unindexed_file_with_metadata(
1076 &mut self,
1077 path: &Path,
1078 metadata: SearchFileMetadata,
1079 ) -> bool {
1080 let Some(file_id) = self.allocate_file_id_with_metadata(path, metadata) else {
1081 return false;
1082 };
1083 Arc::make_mut(&mut self.unindexed_files).insert(file_id);
1084 if let Some(count) = Arc::make_mut(&mut self.file_trigram_count).get_mut(file_id as usize) {
1085 *count = 0;
1086 }
1087 true
1088 }
1089
1090 fn active_file_ids(&self) -> Vec<u32> {
1091 self.snapshot().active_file_ids()
1092 }
1093
1094 #[cfg(test)]
1095 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1096 self.snapshot().postings_for_trigram(trigram, filter)
1097 }
1098
1099 fn update_compaction_flags(&mut self, changed_path: Option<&Path>) {
1100 let delta_files = self.delta_file_trigrams.len();
1101 let hard = delta_files >= DELTA_COMPACT_HARD_FILES
1102 || self.delta_packed_bytes >= DELTA_COMPACT_HARD_BYTES;
1103 let soft = delta_files >= DELTA_COMPACT_SOFT_FILES
1104 || self.delta_packed_bytes >= DELTA_COMPACT_SOFT_BYTES;
1105 if let Ok(mut state) = self.compaction_state.lock() {
1106 if state.running {
1107 if let Some(path) = changed_path {
1108 state.buffered_paths.push(path.to_path_buf());
1109 }
1110 if soft || hard {
1111 state.requested_again = true;
1112 }
1113 } else if hard || (soft && !state.requested_again) {
1114 state.requested_again = true;
1115 }
1116 }
1117 }
1118
1119 fn compaction_record_sources(
1120 &self,
1121 id_map: Arc<HashMap<u32, u32>>,
1122 ) -> Vec<Box<dyn PostingRecordSource>> {
1123 let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
1124 if let Some(base) = self.base.clone() {
1125 sources.push(Box::new(BaseRecordSource::new(
1126 base,
1127 Arc::clone(&id_map),
1128 Arc::new(self.superseded.clone()),
1129 )));
1130 }
1131
1132 let mut delta_records = Vec::new();
1133 for (&trigram, postings) in &self.delta_postings {
1134 for posting in postings {
1135 let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
1136 continue;
1137 };
1138 delta_records.push(SpillRecord {
1139 trigram,
1140 file_id: mapped_file_id,
1141 next_mask: posting.next_mask,
1142 loc_mask: posting.loc_mask,
1143 });
1144 }
1145 }
1146 if !delta_records.is_empty() {
1147 delta_records.sort_unstable_by_key(|record| (record.trigram, record.file_id));
1148 sources.push(Box::new(VecRecordSource::new(delta_records)));
1149 }
1150 sources
1151 }
1152}
1153
1154impl BasePostings {
1155 fn lookup_entry(&self, trigram: u32) -> Option<LookupEntry> {
1156 self.lookup
1157 .binary_search_by_key(&trigram, |entry| entry.trigram)
1158 .ok()
1159 .and_then(|index| self.lookup.get(index).copied())
1160 }
1161
1162 fn read_postings(&self, entry: LookupEntry) -> std::io::Result<Vec<Posting>> {
1163 let bytes_len = (entry.count as usize)
1164 .checked_mul(POSTING_BYTES)
1165 .ok_or_else(|| std::io::Error::other("posting list too large"))?;
1166 let offset = self
1167 .postings_blob_start
1168 .checked_add(entry.offset)
1169 .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1170 let end = entry
1171 .offset
1172 .checked_add(bytes_len as u64)
1173 .ok_or_else(|| std::io::Error::other("posting offset overflow"))?;
1174 if end > self.postings_blob_len {
1175 return Err(std::io::Error::other("posting list exceeds blob"));
1176 }
1177 let mut bytes = vec![0u8; bytes_len];
1178 pread_exact(&self.file, offset, &mut bytes)?;
1179 let mut postings = Vec::with_capacity(entry.count as usize);
1180 for chunk in bytes.chunks_exact(POSTING_BYTES) {
1181 postings.push(Posting {
1182 file_id: u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]),
1183 next_mask: chunk[4],
1184 loc_mask: chunk[5],
1185 });
1186 }
1187 Ok(postings)
1188 }
1189}
1190
1191impl SearchIndexSnapshot {
1192 pub fn grep(
1193 &self,
1194 pattern: &str,
1195 case_sensitive: bool,
1196 include: &[String],
1197 exclude: &[String],
1198 search_root: &Path,
1199 max_results: usize,
1200 ) -> GrepResult {
1201 match pattern_compile::compile(
1202 pattern,
1203 CompileOpts {
1204 case_insensitive: !case_sensitive,
1205 ..CompileOpts::default()
1206 },
1207 ) {
1208 CompileResult::Ok(compiled) => {
1209 self.search_grep(&compiled, include, exclude, search_root, max_results)
1210 }
1211 CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
1212 self.empty_grep_result()
1213 }
1214 }
1215 }
1216
1217 pub fn search_grep(
1218 &self,
1219 pattern: &CompiledPattern,
1220 include: &[String],
1221 exclude: &[String],
1222 search_root: &Path,
1223 max_results: usize,
1224 ) -> GrepResult {
1225 let matcher = match pattern {
1226 CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
1227 CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
1228 };
1229
1230 let filters = match build_path_filters(include, exclude) {
1231 Ok(filters) => filters,
1232 Err(_) => PathFilters::default(),
1233 };
1234 let search_root = canonicalize_or_normalize(search_root);
1235
1236 let raw_pattern = pattern.raw_pattern_for_trigrams();
1237 let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
1238 RegexQuery::default()
1239 } else {
1240 decompose_regex(&raw_pattern)
1241 };
1242 let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
1243 let candidate_ids = self.candidates(&query);
1244
1245 let candidate_files: Vec<&FileEntry> = candidate_ids
1246 .into_iter()
1247 .filter_map(|file_id| self.files.get(file_id as usize))
1248 .filter(|file| !file.path.as_os_str().is_empty())
1249 .filter(|file| is_within_search_root(&search_root, &file.path))
1250 .filter(|file| filters.matches(&self.project_root, &file.path))
1251 .collect();
1252
1253 let total_matches = AtomicUsize::new(0);
1254 let files_searched = AtomicUsize::new(0);
1255 let files_with_matches = AtomicUsize::new(0);
1256 let truncated = AtomicBool::new(false);
1257 let engine_capped = AtomicBool::new(false);
1258 let stop_after = max_results.saturating_mul(2);
1259 let stop_scan = Arc::new(AtomicBool::new(false));
1260
1261 let mut matches = if candidate_files.len() > 10 {
1262 candidate_files
1263 .par_iter()
1264 .map(|file| {
1265 if grep_scan_should_stop(
1266 Some(&stop_scan),
1267 &truncated,
1268 &total_matches,
1269 stop_after,
1270 ) {
1271 engine_capped.store(true, Ordering::Relaxed);
1272 return Vec::new();
1273 }
1274 search_candidate_file(
1275 file,
1276 &matcher,
1277 max_results,
1278 stop_after,
1279 &total_matches,
1280 &files_searched,
1281 &files_with_matches,
1282 &truncated,
1283 &engine_capped,
1284 Some(&stop_scan),
1285 )
1286 })
1287 .reduce(Vec::new, |mut left, mut right| {
1288 left.append(&mut right);
1292 left
1293 })
1294 } else {
1295 let mut matches = Vec::new();
1296 for file in candidate_files {
1297 matches.extend(search_candidate_file(
1298 file,
1299 &matcher,
1300 max_results,
1301 stop_after,
1302 &total_matches,
1303 &files_searched,
1304 &files_with_matches,
1305 &truncated,
1306 &engine_capped,
1307 None,
1308 ));
1309
1310 if should_stop_search(&truncated, &total_matches, stop_after) {
1311 engine_capped.store(true, Ordering::Relaxed);
1312 break;
1313 }
1314 }
1315 matches
1316 };
1317
1318 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
1319 self.path_to_id
1320 .get(path)
1321 .and_then(|file_id| self.files.get(*file_id as usize))
1322 .map(|file| file.modified)
1323 });
1324
1325 let matches = matches
1326 .into_iter()
1327 .map(|matched| GrepMatch {
1328 file: matched.file.as_ref().clone(),
1329 line: matched.line,
1330 column: matched.column,
1331 line_text: matched.line_text,
1332 match_text: matched.match_text,
1333 })
1334 .collect();
1335
1336 GrepResult {
1337 total_matches: total_matches.load(Ordering::Relaxed),
1338 matches,
1339 files_searched: files_searched.load(Ordering::Relaxed),
1340 files_with_matches: files_with_matches.load(Ordering::Relaxed),
1341 index_status: if self.ready {
1342 IndexStatus::Ready
1343 } else {
1344 IndexStatus::Building
1345 },
1346 truncated: truncated.load(Ordering::Relaxed),
1347 fully_degraded,
1348 engine_capped: engine_capped.load(Ordering::Relaxed),
1349 walk_truncated: false,
1350 }
1351 }
1352
1353 fn empty_grep_result(&self) -> GrepResult {
1354 GrepResult {
1355 matches: Vec::new(),
1356 total_matches: 0,
1357 files_searched: 0,
1358 files_with_matches: 0,
1359 index_status: if self.ready {
1360 IndexStatus::Ready
1361 } else {
1362 IndexStatus::Building
1363 },
1364 truncated: false,
1365 fully_degraded: false,
1366 engine_capped: false,
1367 walk_truncated: false,
1368 }
1369 }
1370
1371 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
1372 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
1373 Ok(filters) => filters,
1374 Err(_) => return Vec::new(),
1375 };
1376 let search_root = canonicalize_or_normalize(search_root);
1377 let mut entries = self
1378 .files
1379 .iter()
1380 .filter(|file| !file.path.as_os_str().is_empty())
1381 .filter(|file| is_within_search_root(&search_root, &file.path))
1382 .filter(|file| filters.matches(&self.project_root, &file.path))
1383 .map(|file| (file.path.clone(), file.modified))
1384 .collect::<Vec<_>>();
1385
1386 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
1387 right_mtime
1388 .cmp(left_mtime)
1389 .then_with(|| left_path.cmp(right_path))
1390 });
1391
1392 entries.into_iter().map(|(path, _)| path).collect()
1393 }
1394
1395 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
1396 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
1397 return self.active_file_ids();
1398 }
1399
1400 let mut and_trigrams = query.and_trigrams.clone();
1401 and_trigrams.sort_unstable_by_key(|trigram| self.posting_count(*trigram));
1402
1403 let mut current: Option<Vec<u32>> = None;
1404
1405 for trigram in and_trigrams {
1406 let filter = query.and_filters.get(&trigram).copied();
1407 let matches = self.postings_for_trigram(trigram, filter);
1408 current = Some(match current.take() {
1409 Some(existing) => intersect_sorted_ids(&existing, &matches),
1410 None => matches,
1411 });
1412
1413 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
1414 break;
1415 }
1416 }
1417
1418 let mut current = current.unwrap_or_else(|| self.active_file_ids());
1419
1420 for (index, group) in query.or_groups.iter().enumerate() {
1421 let mut group_matches = Vec::new();
1422 let filters = query.or_filters.get(index);
1423
1424 for trigram in group {
1425 let filter = filters.and_then(|filters| filters.get(trigram).copied());
1426 let matches = self.postings_for_trigram(*trigram, filter);
1427 if group_matches.is_empty() {
1428 group_matches = matches;
1429 } else {
1430 group_matches = union_sorted_ids(&group_matches, &matches);
1431 }
1432 }
1433
1434 current = intersect_sorted_ids(¤t, &group_matches);
1435 if current.is_empty() {
1436 break;
1437 }
1438 }
1439
1440 let mut unindexed = self
1441 .unindexed_files
1442 .iter()
1443 .copied()
1444 .filter(|file_id| self.is_active_file(*file_id))
1445 .collect::<Vec<_>>();
1446 if !unindexed.is_empty() {
1447 unindexed.sort_unstable();
1448 current = union_sorted_ids(¤t, &unindexed);
1449 }
1450
1451 current
1452 }
1453
1454 fn posting_count(&self, trigram: u32) -> usize {
1455 let base_count = self
1456 .base
1457 .as_ref()
1458 .and_then(|base| base.lookup_entry(trigram))
1459 .map_or(0usize, |entry| entry.count as usize);
1460 base_count.saturating_add(self.delta_postings.get(&trigram).map_or(0usize, Vec::len))
1461 }
1462
1463 fn active_file_ids(&self) -> Vec<u32> {
1464 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1465 ids.retain(|file_id| self.is_active_file(*file_id));
1466 ids.sort_unstable();
1467 ids
1468 }
1469
1470 fn is_active_file(&self, file_id: u32) -> bool {
1471 if self.superseded.contains(&file_id) {
1472 return false;
1473 }
1474 self.files
1475 .get(file_id as usize)
1476 .map(|file| !file.path.as_os_str().is_empty())
1477 .unwrap_or(false)
1478 }
1479
1480 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1481 let mut matches = Vec::new();
1482
1483 if let Some(base_entry) = self
1484 .base
1485 .as_ref()
1486 .and_then(|base| base.lookup_entry(trigram))
1487 {
1488 if let Some(base) = &self.base {
1489 if let Ok(postings) = base.read_postings(base_entry) {
1490 matches.reserve(postings.len());
1491 for posting in postings {
1492 if self.superseded.contains(&posting.file_id) {
1493 continue;
1494 }
1495 if !posting_matches_filter(&posting, filter) {
1496 continue;
1497 }
1498 if self.is_active_file(posting.file_id) {
1499 matches.push(posting.file_id);
1500 }
1501 }
1502 }
1503 }
1504 }
1505
1506 if let Some(postings) = self.delta_postings.get(&trigram) {
1507 matches.reserve(postings.len());
1508 for posting in postings {
1509 if !posting_matches_filter(posting, filter) {
1510 continue;
1511 }
1512 if self.is_active_file(posting.file_id) {
1513 matches.push(posting.file_id);
1514 }
1515 }
1516 }
1517
1518 if matches.len() > 1 {
1519 matches.sort_unstable();
1520 matches.dedup();
1521 }
1522 matches
1523 }
1524}
1525
1526fn posting_matches_filter(posting: &Posting, filter: Option<PostingFilter>) -> bool {
1527 if let Some(filter) = filter {
1528 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1531 return false;
1532 }
1533 }
1537 true
1538}
1539
1540fn search_candidate_file(
1541 file: &FileEntry,
1542 matcher: &SearchMatcher,
1543 max_results: usize,
1544 stop_after: usize,
1545 total_matches: &AtomicUsize,
1546 files_searched: &AtomicUsize,
1547 files_with_matches: &AtomicUsize,
1548 truncated: &AtomicBool,
1549 engine_capped: &AtomicBool,
1550 stop_scan: Option<&Arc<AtomicBool>>,
1551) -> Vec<SharedGrepMatch> {
1552 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1553 engine_capped.store(true, Ordering::Relaxed);
1554 return Vec::new();
1555 }
1556
1557 let content = match read_indexed_file_bytes(&file.path) {
1558 Some(content) => content,
1559 None => return Vec::new(),
1560 };
1561 if is_binary_bytes(&content) {
1568 return Vec::new();
1569 }
1570 files_searched.fetch_add(1, Ordering::Relaxed);
1571
1572 let shared_path = Arc::new(file.path.clone());
1573 let mut matches = Vec::new();
1574 let mut line_starts = None;
1575 let mut seen_lines = HashSet::new();
1576 let mut matched_this_file = false;
1577
1578 match matcher {
1579 SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1580 let needle = &literal.needle;
1581 let finder = memchr::memmem::Finder::new(needle);
1582 let mut start = 0;
1583
1584 while let Some(position) = finder.find(&content[start..]) {
1585 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1586 engine_capped.store(true, Ordering::Relaxed);
1587 break;
1588 }
1589
1590 let offset = start + position;
1591 start = offset + 1;
1592
1593 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1594 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1595 if !seen_lines.insert(line) {
1596 continue;
1597 }
1598
1599 matched_this_file = true;
1600 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1601 if match_number > max_results {
1602 truncated.store(true, Ordering::Relaxed);
1603 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1604 break;
1605 }
1606
1607 let end = offset + needle.len();
1608 matches.push(SharedGrepMatch {
1609 file: shared_path.clone(),
1610 line,
1611 column,
1612 line_text,
1613 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1614 });
1615 }
1616 }
1617 SearchMatcher::Literal(literal) => {
1618 let needle = &literal.needle;
1619 let search_content = content.to_ascii_lowercase();
1620 let finder = memchr::memmem::Finder::new(needle);
1621 let mut start = 0;
1622
1623 while let Some(position) = finder.find(&search_content[start..]) {
1624 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1625 engine_capped.store(true, Ordering::Relaxed);
1626 break;
1627 }
1628
1629 let offset = start + position;
1630 start = offset + 1;
1631
1632 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1633 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1634 if !seen_lines.insert(line) {
1635 continue;
1636 }
1637
1638 matched_this_file = true;
1639 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1640 if match_number > max_results {
1641 truncated.store(true, Ordering::Relaxed);
1642 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1643 break;
1644 }
1645
1646 let end = offset + needle.len();
1647 matches.push(SharedGrepMatch {
1648 file: shared_path.clone(),
1649 line,
1650 column,
1651 line_text,
1652 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1653 });
1654 }
1655 }
1656 SearchMatcher::Regex(regex) => {
1657 for matched in regex.find_iter(&content) {
1658 if grep_scan_should_stop(stop_scan, truncated, total_matches, stop_after) {
1659 engine_capped.store(true, Ordering::Relaxed);
1660 break;
1661 }
1662
1663 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1664 let (line, column, line_text) =
1665 line_details_bytes(&content, line_starts, matched.start());
1666 if !seen_lines.insert(line) {
1667 continue;
1668 }
1669
1670 matched_this_file = true;
1671 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1672 if match_number > max_results {
1673 truncated.store(true, Ordering::Relaxed);
1674 signal_grep_scan_cap(stop_scan, total_matches, stop_after);
1675 break;
1676 }
1677
1678 matches.push(SharedGrepMatch {
1679 file: shared_path.clone(),
1680 line,
1681 column,
1682 line_text,
1683 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1684 });
1685 }
1686 }
1687 }
1688
1689 if matched_this_file {
1690 files_with_matches.fetch_add(1, Ordering::Relaxed);
1691 }
1692
1693 matches
1694}
1695
1696fn should_stop_search(
1697 truncated: &AtomicBool,
1698 total_matches: &AtomicUsize,
1699 stop_after: usize,
1700) -> bool {
1701 truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1702}
1703
1704fn grep_scan_should_stop(
1705 stop_scan: Option<&Arc<AtomicBool>>,
1706 truncated: &AtomicBool,
1707 total_matches: &AtomicUsize,
1708 stop_after: usize,
1709) -> bool {
1710 stop_scan.is_some_and(|flag| flag.load(Ordering::Relaxed))
1711 || should_stop_search(truncated, total_matches, stop_after)
1712}
1713
1714fn signal_grep_scan_cap(
1715 stop_scan: Option<&Arc<AtomicBool>>,
1716 total_matches: &AtomicUsize,
1717 stop_after: usize,
1718) {
1719 if let Some(flag) = stop_scan {
1720 if total_matches.load(Ordering::Relaxed) >= stop_after {
1721 flag.store(true, Ordering::Relaxed);
1722 }
1723 }
1724}
1725
1726fn search_file_metadata(metadata: &fs::Metadata) -> SearchFileMetadata {
1727 SearchFileMetadata {
1728 size: metadata.len(),
1729 modified: metadata.modified().unwrap_or(UNIX_EPOCH),
1730 }
1731}
1732
1733fn metadata_for_indexed_content(path: &Path, size_hint: u64) -> SearchFileMetadata {
1734 fs::metadata(path)
1735 .ok()
1736 .map(|metadata| search_file_metadata(&metadata))
1737 .unwrap_or(SearchFileMetadata {
1738 size: size_hint,
1739 modified: UNIX_EPOCH,
1740 })
1741}
1742
1743fn prepare_search_path(path: &Path, max_file_size: u64) -> PreparedSearchPath {
1744 let metadata = match fs::metadata(path) {
1745 Ok(metadata) if metadata.is_file() => search_file_metadata(&metadata),
1746 _ => return PreparedSearchPath::Skipped,
1747 };
1748
1749 if is_binary_path(path, metadata.size) || metadata.size > max_file_size {
1750 return PreparedSearchPath::Unindexed(metadata);
1751 }
1752
1753 let content = match fs::read(path) {
1754 Ok(content) => content,
1755 Err(_) => return PreparedSearchPath::Skipped,
1756 };
1757
1758 if is_binary_bytes(&content) {
1759 return PreparedSearchPath::Unindexed(metadata);
1760 }
1761
1762 PreparedSearchPath::Indexed(PreparedIndexedFile {
1763 metadata,
1764 content_hash: cache_freshness::hash_bytes(&content),
1765 trigram_map: trigram_filter_map(&content, true),
1766 })
1767}
1768
1769fn search_index_build_pool_size() -> usize {
1772 std::thread::available_parallelism()
1773 .map(|parallelism| parallelism.get())
1774 .unwrap_or(1)
1775 .div_ceil(2)
1776 .clamp(1, 8)
1777}
1778
1779#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1780struct SpillRecord {
1781 trigram: u32,
1782 file_id: u32,
1783 next_mask: u8,
1784 loc_mask: u8,
1785}
1786
1787struct CacheWritePlan {
1788 project_root: PathBuf,
1789 git_head: Option<String>,
1790 ignore_fingerprint: String,
1791 max_file_size: u64,
1792 files: Vec<FileEntry>,
1793 path_to_id: HashMap<PathBuf, u32>,
1794 unindexed_files: HashSet<u32>,
1795 file_trigram_count: Vec<u32>,
1796 id_map: Arc<HashMap<u32, u32>>,
1797}
1798
1799impl CacheWritePlan {
1800 fn from_index(index: &SearchIndex, git_head: Option<&str>) -> Option<Self> {
1801 let active_ids = index.active_file_ids();
1802 let mut id_map = HashMap::with_capacity(active_ids.len());
1803 for (new_id, old_id) in active_ids.iter().enumerate() {
1804 let new_id = u32::try_from(new_id).ok()?;
1805 id_map.insert(*old_id, new_id);
1806 }
1807
1808 let mut files = Vec::with_capacity(active_ids.len());
1809 let mut path_to_id = HashMap::with_capacity(active_ids.len());
1810 let mut unindexed_files = HashSet::new();
1811 let mut file_trigram_count = Vec::with_capacity(active_ids.len());
1812 for old_id in active_ids {
1813 let new_id = *id_map.get(&old_id)?;
1814 let file = index.files.get(old_id as usize)?.clone();
1815 if file.path.as_os_str().is_empty() {
1816 continue;
1817 }
1818 path_to_id.insert(file.path.clone(), new_id);
1819 if index.unindexed_files.contains(&old_id) {
1820 unindexed_files.insert(new_id);
1821 }
1822 file_trigram_count.push(
1823 index
1824 .file_trigram_count
1825 .get(old_id as usize)
1826 .copied()
1827 .unwrap_or(0),
1828 );
1829 files.push(file);
1830 }
1831
1832 Some(Self {
1833 project_root: index.project_root.clone(),
1834 git_head: git_head.map(ToOwned::to_owned),
1835 ignore_fingerprint: if index.ignore_rules_fingerprint.is_empty() {
1836 ignore_rules_fingerprint(&index.project_root)
1837 } else {
1838 index.ignore_rules_fingerprint.clone()
1839 },
1840 max_file_size: index.max_file_size,
1841 files,
1842 path_to_id,
1843 unindexed_files,
1844 file_trigram_count,
1845 id_map: Arc::new(id_map),
1846 })
1847 }
1848}
1849
1850trait PostingRecordSource {
1851 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>>;
1852}
1853
1854struct VecRecordSource {
1855 records: Vec<SpillRecord>,
1856 index: usize,
1857}
1858
1859impl VecRecordSource {
1860 fn new(records: Vec<SpillRecord>) -> Self {
1861 Self { records, index: 0 }
1862 }
1863}
1864
1865impl PostingRecordSource for VecRecordSource {
1866 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1867 let record = self.records.get(self.index).copied();
1868 if record.is_some() {
1869 self.index += 1;
1870 }
1871 Ok(record)
1872 }
1873}
1874
1875struct SpillSegmentSource {
1876 reader: BufReader<File>,
1877 remaining_records: u64,
1878 current_trigram: u32,
1879 remaining_in_group: u32,
1880}
1881
1882impl SpillSegmentSource {
1883 fn open(path: &Path) -> std::io::Result<Self> {
1884 let mut reader = BufReader::new(File::open(path)?);
1885 let mut magic = [0u8; 8];
1886 reader.read_exact(&mut magic)?;
1887 if &magic != SPILL_MAGIC {
1888 return Err(std::io::Error::other("invalid search spill magic"));
1889 }
1890 if read_u32(&mut reader)? != INDEX_VERSION {
1891 return Err(std::io::Error::other("invalid search spill version"));
1892 }
1893 let remaining_records = read_u64(&mut reader)?;
1894 Ok(Self {
1895 reader,
1896 remaining_records,
1897 current_trigram: 0,
1898 remaining_in_group: 0,
1899 })
1900 }
1901}
1902
1903impl PostingRecordSource for SpillSegmentSource {
1904 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1905 if self.remaining_records == 0 {
1906 return Ok(None);
1907 }
1908 if self.remaining_in_group == 0 {
1909 self.current_trigram = read_u32(&mut self.reader)?;
1910 self.remaining_in_group = read_u32(&mut self.reader)?;
1911 if self.remaining_in_group == 0 {
1912 return Err(std::io::Error::other("empty search spill group"));
1913 }
1914 }
1915 let mut file_id = [0u8; 4];
1916 self.reader.read_exact(&mut file_id)?;
1917 let mut masks = [0u8; 2];
1918 self.reader.read_exact(&mut masks)?;
1919 self.remaining_in_group -= 1;
1920 self.remaining_records -= 1;
1921 Ok(Some(SpillRecord {
1922 trigram: self.current_trigram,
1923 file_id: u32::from_le_bytes(file_id),
1924 next_mask: masks[0],
1925 loc_mask: masks[1],
1926 }))
1927 }
1928}
1929
1930struct BaseRecordSource {
1931 base: Arc<BasePostings>,
1932 id_map: Arc<HashMap<u32, u32>>,
1933 superseded: Arc<HashSet<u32>>,
1934 lookup_index: usize,
1935 current: Vec<SpillRecord>,
1936 current_index: usize,
1937}
1938
1939impl BaseRecordSource {
1940 fn new(
1941 base: Arc<BasePostings>,
1942 id_map: Arc<HashMap<u32, u32>>,
1943 superseded: Arc<HashSet<u32>>,
1944 ) -> Self {
1945 Self {
1946 base,
1947 id_map,
1948 superseded,
1949 lookup_index: 0,
1950 current: Vec::new(),
1951 current_index: 0,
1952 }
1953 }
1954
1955 fn load_next_group(&mut self) -> std::io::Result<bool> {
1956 while let Some(entry) = self.base.lookup.get(self.lookup_index).copied() {
1957 self.lookup_index += 1;
1958 let postings = self.base.read_postings(entry)?;
1959 self.current.clear();
1960 self.current_index = 0;
1961 for posting in postings {
1962 if self.superseded.contains(&posting.file_id) {
1963 continue;
1964 }
1965 let Some(mapped_file_id) = self.id_map.get(&posting.file_id).copied() else {
1966 continue;
1967 };
1968 self.current.push(SpillRecord {
1969 trigram: entry.trigram,
1970 file_id: mapped_file_id,
1971 next_mask: posting.next_mask,
1972 loc_mask: posting.loc_mask,
1973 });
1974 }
1975 if !self.current.is_empty() {
1976 return Ok(true);
1977 }
1978 }
1979 Ok(false)
1980 }
1981}
1982
1983impl PostingRecordSource for BaseRecordSource {
1984 fn next_record(&mut self) -> std::io::Result<Option<SpillRecord>> {
1985 if self.current_index >= self.current.len() && !self.load_next_group()? {
1986 return Ok(None);
1987 }
1988 let record = self.current[self.current_index];
1989 self.current_index += 1;
1990 Ok(Some(record))
1991 }
1992}
1993
1994#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1995struct HeapItem {
1996 record: SpillRecord,
1997 source_index: usize,
1998}
1999
2000impl Ord for HeapItem {
2001 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
2002 other
2003 .record
2004 .trigram
2005 .cmp(&self.record.trigram)
2006 .then_with(|| other.record.file_id.cmp(&self.record.file_id))
2007 .then_with(|| other.source_index.cmp(&self.source_index))
2008 }
2009}
2010
2011impl PartialOrd for HeapItem {
2012 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
2013 Some(self.cmp(other))
2014 }
2015}
2016
2017fn build_streaming_index(
2018 root: &Path,
2019 max_file_size: u64,
2020 cache_dir: &Path,
2021) -> std::io::Result<(SearchIndex, usize)> {
2022 fs::create_dir_all(cache_dir)?;
2023 sweep_stale_search_build_dirs(cache_dir);
2024 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
2025 let ignore_fingerprint = ignore_rules_fingerprint(&project_root);
2026 let filters = PathFilters::default();
2027 let paths: Vec<PathBuf> = walk_project_files(&project_root, &filters);
2028 let pool_size = search_index_build_pool_size();
2029 let chunk_size = pool_size.saturating_mul(4).clamp(1, 32);
2030 let pool = rayon::ThreadPoolBuilder::new()
2031 .num_threads(pool_size)
2032 .thread_name(|index| format!("aft-search-build-{index}"))
2033 .stack_size(8 * 1024 * 1024)
2034 .build()
2035 .ok();
2036
2037 let spill_dir = create_spill_dir(cache_dir)?;
2038 let mut spill_paths = Vec::new();
2039 let mut spill_seq = 0usize;
2040 let mut block: Vec<SpillRecord> = Vec::new();
2041 let mut files = Vec::new();
2042 let mut path_to_id = HashMap::new();
2043 let mut unindexed_files = HashSet::new();
2044 let mut file_trigram_count = Vec::new();
2045 let mut indexed = 0usize;
2046
2047 let build_result = (|| -> std::io::Result<BasePostings> {
2048 for chunk in paths.chunks(chunk_size) {
2049 let prepare_chunk = || -> Vec<PreparedSearchPath> {
2050 chunk
2051 .par_iter()
2052 .map(|path| prepare_search_path(path, max_file_size))
2053 .collect()
2054 };
2055 let prepared = match &pool {
2056 Some(pool) => pool.install(prepare_chunk),
2057 None => prepare_chunk(),
2058 };
2059
2060 for (path, prepared) in chunk.iter().zip(prepared) {
2061 match prepared {
2062 PreparedSearchPath::Indexed(file) => {
2063 let file_id = u32::try_from(files.len())
2064 .map_err(|_| std::io::Error::other("too many files to index"))?;
2065 files.push(FileEntry {
2066 path: path.clone(),
2067 size: file.metadata.size,
2068 modified: file.metadata.modified,
2069 content_hash: file.content_hash,
2070 });
2071 path_to_id.insert(path.clone(), file_id);
2072 file_trigram_count.push(file.trigram_map.len() as u32);
2073 for (trigram, filter) in file.trigram_map {
2074 block.push(SpillRecord {
2075 trigram,
2076 file_id,
2077 next_mask: filter.next_mask,
2078 loc_mask: filter.loc_mask,
2079 });
2080 }
2081 indexed += 1;
2082 }
2083 PreparedSearchPath::Unindexed(metadata) => {
2084 let file_id = u32::try_from(files.len())
2085 .map_err(|_| std::io::Error::other("too many files to index"))?;
2086 files.push(FileEntry {
2087 path: path.clone(),
2088 size: metadata.size,
2089 modified: metadata.modified,
2090 content_hash: cache_freshness::zero_hash(),
2091 });
2092 path_to_id.insert(path.clone(), file_id);
2093 unindexed_files.insert(file_id);
2094 file_trigram_count.push(0);
2095 indexed += 1;
2096 }
2097 PreparedSearchPath::Skipped => {}
2098 }
2099
2100 let block_bytes = block.len().saturating_mul(SPILL_RECORD_ESTIMATED_BYTES);
2101 if block_bytes >= SPIMI_SOFT_LIMIT_BYTES || block_bytes >= SPIMI_HARD_LIMIT_BYTES {
2102 let path = flush_spill_segment(&spill_dir, spill_seq, &mut block)?;
2103 spill_paths.push(path);
2104 spill_seq += 1;
2105 }
2106 }
2107 }
2108
2109 block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2110 let mut sources: Vec<Box<dyn PostingRecordSource>> = Vec::new();
2111 for path in &spill_paths {
2112 sources.push(Box::new(SpillSegmentSource::open(path)?));
2113 }
2114 if !block.is_empty() {
2115 sources.push(Box::new(VecRecordSource::new(std::mem::take(&mut block))));
2116 }
2117
2118 let plan = CacheWritePlan {
2119 project_root: project_root.clone(),
2120 git_head: current_git_head(&project_root),
2121 ignore_fingerprint: ignore_fingerprint.clone(),
2122 max_file_size,
2123 files: files.clone(),
2124 path_to_id: path_to_id.clone(),
2125 unindexed_files: unindexed_files.clone(),
2126 file_trigram_count: file_trigram_count.clone(),
2127 id_map: Arc::new(
2128 (0..files.len())
2129 .filter_map(|id| {
2130 let id = u32::try_from(id).ok()?;
2131 Some((id, id))
2132 })
2133 .collect(),
2134 ),
2135 };
2136 write_cache_file_from_sources(cache_dir, &plan, &mut sources)
2137 })();
2138
2139 let _ = fs::remove_dir_all(&spill_dir);
2140 let base = build_result?;
2141 let base_file_count =
2142 u32::try_from(files.len()).map_err(|_| std::io::Error::other("too many files to index"))?;
2143 let git_head = current_git_head(&project_root);
2144 let index = SearchIndex {
2145 base: Some(Arc::new(base)),
2146 delta_postings: HashMap::new(),
2147 delta_file_trigrams: HashMap::new(),
2148 files: Arc::new(files),
2149 path_to_id: Arc::new(path_to_id),
2150 ready: false,
2151 project_root,
2152 git_head,
2153 max_file_size,
2154 ignore_rules_fingerprint: ignore_fingerprint,
2155 file_trigram_count: Arc::new(file_trigram_count),
2156 unindexed_files: Arc::new(unindexed_files),
2157 superseded: HashSet::new(),
2158 base_file_count,
2159 delta_packed_bytes: 0,
2160 compaction_state: Arc::new(Mutex::new(CompactionState::default())),
2161 };
2162 Ok((index, indexed))
2163}
2164
2165fn write_cache_file_from_sources(
2166 cache_dir: &Path,
2167 plan: &CacheWritePlan,
2168 sources: &mut [Box<dyn PostingRecordSource>],
2169) -> std::io::Result<BasePostings> {
2170 fs::create_dir_all(cache_dir)?;
2171 sweep_stale_search_build_dirs(cache_dir);
2172 let cache_path = cache_dir.join("cache.bin");
2173 let tmp_cache = cache_dir.join(format!(
2174 "cache.bin.tmp.{}.{}",
2175 std::process::id(),
2176 SystemTime::now()
2177 .duration_since(UNIX_EPOCH)
2178 .unwrap_or(Duration::ZERO)
2179 .as_nanos()
2180 ));
2181
2182 let write_result = (|| -> std::io::Result<BasePostings> {
2183 let raw = OpenOptions::new()
2184 .write(true)
2185 .create_new(true)
2186 .open(&tmp_cache)?;
2187 let mut writer = BufWriter::new(raw);
2188 write_u32(&mut writer, CACHE_MAGIC)?;
2189 write_u32(&mut writer, INDEX_VERSION)?;
2190 let postings_len_patch = writer.stream_position()?;
2191 write_u64(&mut writer, 0)?;
2192
2193 let postings_section_start = writer.stream_position()?;
2194 let postings_header = build_postings_header_bytes(plan)?;
2195 writer.write_all(&postings_header)?;
2196 let postings_blob_len_patch = writer.stream_position()?;
2197 write_u64(&mut writer, 0)?;
2198 let postings_blob_start = writer.stream_position()?;
2199
2200 let (lookup_entries, postings_blob_len) = merge_sources_to_writer(sources, &mut writer)?;
2201 let extension = build_file_trigram_count_extension(&plan.file_trigram_count)?;
2202 writer.write_all(&extension)?;
2203 let postings_crc_end = writer.stream_position()?;
2204
2205 writer.flush()?;
2206 writer.seek(SeekFrom::Start(postings_blob_len_patch))?;
2207 write_u64(&mut writer, postings_blob_len)?;
2208 writer.flush()?;
2209
2210 let checksum = crc32_file_range(
2211 &tmp_cache,
2212 postings_section_start,
2213 postings_crc_end.saturating_sub(postings_section_start),
2214 )?;
2215 writer.seek(SeekFrom::Start(postings_crc_end))?;
2216 writer.write_all(&checksum.to_le_bytes())?;
2217 let postings_section_end = writer.stream_position()?;
2218 let postings_len_total = postings_section_end.saturating_sub(postings_section_start);
2219 writer.seek(SeekFrom::Start(postings_len_patch))?;
2220 write_u64(&mut writer, postings_len_total)?;
2221 writer.seek(SeekFrom::Start(postings_section_end))?;
2222
2223 let lookup_blob = build_lookup_section_bytes(&lookup_entries)?;
2224 writer.write_all(&lookup_blob)?;
2225 writer.flush()?;
2226 writer.get_ref().sync_all()?;
2227 drop(writer);
2228
2229 fs::rename(&tmp_cache, &cache_path)?;
2230 sync_parent_dir(&cache_path);
2231 let file = open_cache_file_read(&cache_path)?;
2232 Ok(BasePostings {
2233 file: Arc::new(file),
2234 postings_blob_start,
2235 postings_blob_len,
2236 lookup: Arc::new(lookup_entries),
2237 })
2238 })();
2239
2240 if write_result.is_err() {
2241 let _ = fs::remove_file(&tmp_cache);
2242 }
2243 write_result
2244}
2245
2246fn merge_sources_to_writer(
2247 sources: &mut [Box<dyn PostingRecordSource>],
2248 writer: &mut BufWriter<File>,
2249) -> std::io::Result<(Vec<LookupEntry>, u64)> {
2250 let mut heap = BinaryHeap::new();
2251 for (source_index, source) in sources.iter_mut().enumerate() {
2252 if let Some(record) = source.next_record()? {
2253 heap.push(HeapItem {
2254 record,
2255 source_index,
2256 });
2257 }
2258 }
2259
2260 let mut lookup_entries = Vec::new();
2261 let mut postings_blob_len = 0u64;
2262 let mut current_trigram: Option<u32> = None;
2263 let mut current_offset = 0u64;
2264 let mut current_count = 0u32;
2265
2266 while let Some(item) = heap.pop() {
2267 let record = item.record;
2268 if current_trigram != Some(record.trigram) {
2269 if let Some(trigram) = current_trigram {
2270 lookup_entries.push(LookupEntry {
2271 trigram,
2272 offset: current_offset,
2273 count: current_count,
2274 });
2275 }
2276 current_trigram = Some(record.trigram);
2277 current_offset = postings_blob_len;
2278 current_count = 0;
2279 }
2280
2281 writer.write_all(&record.file_id.to_le_bytes())?;
2282 writer.write_all(&[record.next_mask, record.loc_mask])?;
2283 postings_blob_len = postings_blob_len
2284 .checked_add(POSTING_BYTES as u64)
2285 .ok_or_else(|| std::io::Error::other("postings blob too large"))?;
2286 current_count = current_count
2287 .checked_add(1)
2288 .ok_or_else(|| std::io::Error::other("posting list too large"))?;
2289
2290 if let Some(next) = sources[item.source_index].next_record()? {
2291 heap.push(HeapItem {
2292 record: next,
2293 source_index: item.source_index,
2294 });
2295 }
2296 }
2297
2298 if let Some(trigram) = current_trigram {
2299 lookup_entries.push(LookupEntry {
2300 trigram,
2301 offset: current_offset,
2302 count: current_count,
2303 });
2304 }
2305
2306 Ok((lookup_entries, postings_blob_len))
2307}
2308
2309fn build_postings_header_bytes(plan: &CacheWritePlan) -> std::io::Result<Vec<u8>> {
2310 let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2311 writer.write_all(INDEX_MAGIC)?;
2312 write_u32(&mut writer, INDEX_VERSION)?;
2313
2314 let head = plan.git_head.as_deref().unwrap_or_default();
2315 let root = plan.project_root.to_string_lossy();
2316 let head_len = u32::try_from(head.len())
2317 .map_err(|_| std::io::Error::other("git head too large to cache"))?;
2318 let root_len = u32::try_from(root.len())
2319 .map_err(|_| std::io::Error::other("project root too large to cache"))?;
2320 let ignore_fingerprint_len = u32::try_from(plan.ignore_fingerprint.len())
2321 .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
2322 let file_count = u32::try_from(plan.files.len())
2323 .map_err(|_| std::io::Error::other("too many files to cache"))?;
2324
2325 write_u32(&mut writer, head_len)?;
2326 write_u32(&mut writer, root_len)?;
2327 write_u32(&mut writer, ignore_fingerprint_len)?;
2328 write_u64(&mut writer, plan.max_file_size)?;
2329 write_u32(&mut writer, file_count)?;
2330 writer.write_all(head.as_bytes())?;
2331 writer.write_all(root.as_bytes())?;
2332 writer.write_all(plan.ignore_fingerprint.as_bytes())?;
2333
2334 for (file_id, file) in plan.files.iter().enumerate() {
2335 let file_id =
2336 u32::try_from(file_id).map_err(|_| std::io::Error::other("too many files to cache"))?;
2337 let path = cache_relative_path(&plan.project_root, &file.path)
2338 .or_else(|| {
2339 fs::canonicalize(&file.path)
2340 .ok()
2341 .and_then(|canonical| cache_relative_path(&plan.project_root, &canonical))
2342 })
2343 .ok_or_else(|| {
2344 std::io::Error::other(format!(
2345 "refusing to cache path outside project root: {}",
2346 file.path.display()
2347 ))
2348 })?;
2349 let path = path.to_string_lossy();
2350 let path_len = u32::try_from(path.len())
2351 .map_err(|_| std::io::Error::other("cached path too large"))?;
2352 let modified = file
2353 .modified
2354 .duration_since(UNIX_EPOCH)
2355 .unwrap_or(Duration::ZERO);
2356 let unindexed = if plan.unindexed_files.contains(&file_id) {
2357 1u8
2358 } else {
2359 0u8
2360 };
2361
2362 writer.write_all(&[unindexed])?;
2363 write_u32(&mut writer, path_len)?;
2364 write_u64(&mut writer, file.size)?;
2365 write_u64(&mut writer, modified.as_secs())?;
2366 write_u32(&mut writer, modified.subsec_nanos())?;
2367 writer.write_all(file.content_hash.as_bytes())?;
2368 writer.write_all(path.as_bytes())?;
2369 }
2370
2371 writer.flush()?;
2372 Ok(writer
2373 .into_inner()
2374 .map_err(|error| std::io::Error::other(error.to_string()))?
2375 .into_inner())
2376}
2377
2378fn build_lookup_section_bytes(lookup_entries: &[LookupEntry]) -> std::io::Result<Vec<u8>> {
2379 let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2380 let entry_count = u32::try_from(lookup_entries.len())
2381 .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
2382 writer.write_all(LOOKUP_MAGIC)?;
2383 write_u32(&mut writer, INDEX_VERSION)?;
2384 write_u32(&mut writer, entry_count)?;
2385 for entry in lookup_entries {
2386 write_u32(&mut writer, entry.trigram)?;
2387 write_u64(&mut writer, entry.offset)?;
2388 write_u32(&mut writer, entry.count)?;
2389 }
2390 writer.flush()?;
2391 let mut lookup_blob = writer
2392 .into_inner()
2393 .map_err(|error| std::io::Error::other(error.to_string()))?
2394 .into_inner();
2395 let checksum = crc32fast::hash(&lookup_blob);
2396 lookup_blob.extend_from_slice(&checksum.to_le_bytes());
2397 Ok(lookup_blob)
2398}
2399
2400fn build_file_trigram_count_extension(counts: &[u32]) -> std::io::Result<Vec<u8>> {
2401 let mut writer = BufWriter::new(Cursor::new(Vec::new()));
2402 writer.write_all(FILE_TRIGRAM_COUNT_MAGIC)?;
2403 write_u32(&mut writer, INDEX_VERSION)?;
2404 write_u32(
2405 &mut writer,
2406 u32::try_from(counts.len())
2407 .map_err(|_| std::io::Error::other("too many file trigram counts"))?,
2408 )?;
2409 for count in counts {
2410 write_u32(&mut writer, *count)?;
2411 }
2412 writer.flush()?;
2413 Ok(writer
2414 .into_inner()
2415 .map_err(|error| std::io::Error::other(error.to_string()))?
2416 .into_inner())
2417}
2418
2419fn flush_spill_segment(
2420 spill_dir: &Path,
2421 seq: usize,
2422 block: &mut Vec<SpillRecord>,
2423) -> std::io::Result<PathBuf> {
2424 if block.is_empty() {
2425 return Err(std::io::Error::other(
2426 "refusing to write empty search spill",
2427 ));
2428 }
2429 block.sort_unstable_by_key(|record| (record.trigram, record.file_id));
2430 let path = spill_dir.join(format!("segment.{seq:06}.bin"));
2431 let mut writer = BufWriter::new(File::create(&path)?);
2432 writer.write_all(SPILL_MAGIC)?;
2433 write_u32(&mut writer, INDEX_VERSION)?;
2434 write_u64(
2435 &mut writer,
2436 u64::try_from(block.len()).map_err(|_| std::io::Error::other("search spill too large"))?,
2437 )?;
2438
2439 let mut index = 0usize;
2440 while index < block.len() {
2441 let trigram = block[index].trigram;
2442 let group_start = index;
2443 while index < block.len() && block[index].trigram == trigram {
2444 index += 1;
2445 }
2446 write_u32(&mut writer, trigram)?;
2447 write_u32(
2448 &mut writer,
2449 u32::try_from(index - group_start)
2450 .map_err(|_| std::io::Error::other("search spill group too large"))?,
2451 )?;
2452 for record in &block[group_start..index] {
2453 writer.write_all(&record.file_id.to_le_bytes())?;
2454 writer.write_all(&[record.next_mask, record.loc_mask])?;
2455 }
2456 }
2457 writer.flush()?;
2458 writer.get_ref().sync_all()?;
2459 block.clear();
2460 Ok(path)
2461}
2462
2463fn create_spill_dir(cache_dir: &Path) -> std::io::Result<PathBuf> {
2464 let dir = cache_dir.join(format!(
2465 "search-build.tmp.{}.{}",
2466 std::process::id(),
2467 SystemTime::now()
2468 .duration_since(UNIX_EPOCH)
2469 .unwrap_or(Duration::ZERO)
2470 .as_nanos()
2471 ));
2472 fs::create_dir_all(&dir)?;
2473 Ok(dir)
2474}
2475
2476fn sweep_stale_search_build_dirs(cache_dir: &Path) {
2477 let Ok(entries) = fs::read_dir(cache_dir) else {
2478 return;
2479 };
2480 for entry in entries.flatten() {
2481 let file_name = entry.file_name();
2482 if file_name.to_string_lossy().starts_with("search-build.tmp.") {
2483 let _ = fs::remove_dir_all(entry.path());
2484 }
2485 }
2486}
2487
2488fn transient_search_cache_dir(root: &Path) -> PathBuf {
2489 std::env::temp_dir().join(format!(
2490 "aft-search-cache.{}.{}.{}",
2491 artifact_cache_key(root),
2492 std::process::id(),
2493 SystemTime::now()
2494 .duration_since(UNIX_EPOCH)
2495 .unwrap_or(Duration::ZERO)
2496 .as_nanos()
2497 ))
2498}
2499
2500fn read_file_trigram_count_extension(
2501 base: &BasePostings,
2502 extension_start: u64,
2503 postings_body_end: u64,
2504 file_count: usize,
2505) -> std::io::Result<Option<Vec<u32>>> {
2506 if extension_start >= postings_body_end {
2507 return Ok(None);
2508 }
2509 let extension_len = postings_body_end - extension_start;
2510 if extension_len < 16 {
2511 return Ok(None);
2512 }
2513 let mut header = [0u8; 16];
2514 pread_exact(&base.file, extension_start, &mut header)?;
2515 if &header[..8] != FILE_TRIGRAM_COUNT_MAGIC {
2516 return Ok(None);
2517 }
2518 let version = u32::from_le_bytes([header[8], header[9], header[10], header[11]]);
2519 if version != INDEX_VERSION {
2520 return Err(std::io::Error::other("invalid file trigram count version"));
2521 }
2522 let count = u32::from_le_bytes([header[12], header[13], header[14], header[15]]) as usize;
2523 if count != file_count {
2524 return Err(std::io::Error::other("file trigram count length mismatch"));
2525 }
2526 let counts_len = count
2527 .checked_mul(4)
2528 .ok_or_else(|| std::io::Error::other("file trigram count extension too large"))?;
2529 if 16u64 + counts_len as u64 > extension_len {
2530 return Err(std::io::Error::other(
2531 "truncated file trigram count extension",
2532 ));
2533 }
2534 let mut bytes = vec![0u8; counts_len];
2535 pread_exact(&base.file, extension_start + 16, &mut bytes)?;
2536 let mut counts = Vec::with_capacity(count);
2537 for chunk in bytes.chunks_exact(4) {
2538 counts.push(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
2539 }
2540 Ok(Some(counts))
2541}
2542
2543fn compute_file_trigram_counts_from_base(
2544 base: &BasePostings,
2545 file_count: usize,
2546) -> std::io::Result<Vec<u32>> {
2547 let mut counts = vec![0u32; file_count];
2548 for entry in base.lookup.iter().copied() {
2549 for posting in base.read_postings(entry)? {
2550 let Some(count) = counts.get_mut(posting.file_id as usize) else {
2551 return Err(std::io::Error::other("posting references missing file"));
2552 };
2553 *count = count.saturating_add(1);
2554 }
2555 }
2556 Ok(counts)
2557}
2558
2559fn ensure_count_slot(counts: &mut Vec<u32>, file_id: u32) {
2560 let len = file_id as usize + 1;
2561 if counts.len() < len {
2562 counts.resize(len, 0);
2563 }
2564}
2565
2566fn reader_has_remaining<R: Seek>(
2567 reader: &mut R,
2568 absolute_end: u64,
2569 len: usize,
2570) -> std::io::Result<bool> {
2571 let position = reader.stream_position()?;
2572 Ok(position <= absolute_end && (len as u64) <= absolute_end - position)
2573}
2574
2575fn crc32_file_range(path: &Path, start: u64, len: u64) -> std::io::Result<u32> {
2576 let mut file = File::open(path)?;
2577 file.seek(SeekFrom::Start(start))?;
2578 let mut hasher = crc32fast::Hasher::new();
2579 let mut remaining = len;
2580 let mut buffer = vec![0u8; 1024 * 1024];
2581 while remaining > 0 {
2582 let read_len = buffer.len().min(remaining as usize);
2583 let bytes_read = file.read(&mut buffer[..read_len])?;
2584 if bytes_read == 0 {
2585 return Err(std::io::Error::new(
2586 std::io::ErrorKind::UnexpectedEof,
2587 "truncated cache while checksumming",
2588 ));
2589 }
2590 hasher.update(&buffer[..bytes_read]);
2591 remaining -= bytes_read as u64;
2592 }
2593 Ok(hasher.finalize())
2594}
2595
2596fn sync_parent_dir(path: &Path) {
2597 if let Some(parent) = path.parent() {
2598 if let Ok(dir) = File::open(parent) {
2599 let _ = dir.sync_all();
2600 }
2601 }
2602}
2603
2604fn open_cache_file_read(path: &Path) -> std::io::Result<File> {
2605 let mut options = OpenOptions::new();
2606 options.read(true);
2607 #[cfg(windows)]
2608 {
2609 use std::os::windows::fs::OpenOptionsExt;
2610 const FILE_SHARE_READ: u32 = 0x0000_0001;
2611 const FILE_SHARE_WRITE: u32 = 0x0000_0002;
2612 const FILE_SHARE_DELETE: u32 = 0x0000_0004;
2613 options.share_mode(FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE);
2614 }
2615 options.open(path)
2616}
2617
2618#[cfg(unix)]
2619fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
2620 use std::os::unix::fs::FileExt;
2621 while !buffer.is_empty() {
2622 let bytes_read = file.read_at(buffer, offset)?;
2623 if bytes_read == 0 {
2624 return Err(std::io::Error::new(
2625 std::io::ErrorKind::UnexpectedEof,
2626 "short pread from search cache",
2627 ));
2628 }
2629 offset += bytes_read as u64;
2630 let (_, rest) = buffer.split_at_mut(bytes_read);
2631 buffer = rest;
2632 }
2633 Ok(())
2634}
2635
2636#[cfg(windows)]
2637fn pread_exact(file: &File, mut offset: u64, mut buffer: &mut [u8]) -> std::io::Result<()> {
2638 use std::os::windows::fs::FileExt;
2639 while !buffer.is_empty() {
2640 let bytes_read = file.seek_read(buffer, offset)?;
2641 if bytes_read == 0 {
2642 return Err(std::io::Error::new(
2643 std::io::ErrorKind::UnexpectedEof,
2644 "short pread from search cache",
2645 ));
2646 }
2647 offset += bytes_read as u64;
2648 let (_, rest) = buffer.split_at_mut(bytes_read);
2649 buffer = rest;
2650 }
2651 Ok(())
2652}
2653
2654fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
2655 let mut merged = Vec::with_capacity(left.len().min(right.len()));
2656 let mut left_index = 0;
2657 let mut right_index = 0;
2658
2659 while left_index < left.len() && right_index < right.len() {
2660 match left[left_index].cmp(&right[right_index]) {
2661 std::cmp::Ordering::Less => left_index += 1,
2662 std::cmp::Ordering::Greater => right_index += 1,
2663 std::cmp::Ordering::Equal => {
2664 merged.push(left[left_index]);
2665 left_index += 1;
2666 right_index += 1;
2667 }
2668 }
2669 }
2670
2671 merged
2672}
2673
2674fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
2675 let mut merged = Vec::with_capacity(left.len() + right.len());
2676 let mut left_index = 0;
2677 let mut right_index = 0;
2678
2679 while left_index < left.len() && right_index < right.len() {
2680 match left[left_index].cmp(&right[right_index]) {
2681 std::cmp::Ordering::Less => {
2682 merged.push(left[left_index]);
2683 left_index += 1;
2684 }
2685 std::cmp::Ordering::Greater => {
2686 merged.push(right[right_index]);
2687 right_index += 1;
2688 }
2689 std::cmp::Ordering::Equal => {
2690 merged.push(left[left_index]);
2691 left_index += 1;
2692 right_index += 1;
2693 }
2694 }
2695 }
2696
2697 merged.extend_from_slice(&left[left_index..]);
2698 merged.extend_from_slice(&right[right_index..]);
2699 merged
2700}
2701
2702pub fn decompose_regex(pattern: &str) -> RegexQuery {
2703 let hir = match regex_syntax::parse(pattern) {
2704 Ok(hir) => hir,
2705 Err(_) => return RegexQuery::default(),
2706 };
2707
2708 let build = build_query(&hir);
2709 build.into_query()
2710}
2711
2712pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
2713 ((a as u32) << 16) | ((b as u32) << 8) | c as u32
2714}
2715
2716pub fn normalize_char(c: u8) -> u8 {
2717 c.to_ascii_lowercase()
2718}
2719
2720fn scan_trigrams(content: &[u8], mut visit: impl FnMut(u32, u8, usize)) {
2721 if content.len() < 3 {
2722 return;
2723 }
2724
2725 for start in 0..=content.len() - 3 {
2726 let trigram = pack_trigram(
2727 normalize_char(content[start]),
2728 normalize_char(content[start + 1]),
2729 normalize_char(content[start + 2]),
2730 );
2731 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
2732 visit(trigram, next_char, start);
2733 }
2734}
2735
2736pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
2737 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
2738 scan_trigrams(content, |trigram, next_char, position| {
2739 trigrams.push((trigram, next_char, position));
2740 });
2741 trigrams
2742}
2743
2744fn trigram_filter_map(content: &[u8], include_eof_next_char: bool) -> BTreeMap<u32, PostingFilter> {
2745 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2746 scan_trigrams(content, |trigram, next_char, position| {
2747 let entry = filters.entry(trigram).or_default();
2748 if include_eof_next_char || next_char != EOF_SENTINEL {
2749 entry.next_mask |= mask_for_next_char(next_char);
2750 }
2751 entry.loc_mask |= mask_for_position(position);
2752 });
2753 filters
2754}
2755
2756pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
2757 let mut seen = HashSet::new();
2758 let mut out = Vec::new();
2759 for token in tokens {
2760 scan_trigrams(token.as_bytes(), |trigram, _, _| {
2761 if seen.insert(trigram) {
2762 out.push(trigram);
2763 }
2764 });
2765 }
2766 out
2767}
2768
2769pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
2770 lexical_score_snapshot(&index.snapshot(), query_trigrams, file_id)
2771}
2772
2773fn lexical_score_snapshot(
2774 index: &SearchIndexSnapshot,
2775 query_trigrams: &[u32],
2776 file_id: u32,
2777) -> f32 {
2778 if query_trigrams.is_empty() {
2779 return 0.0;
2780 }
2781
2782 let mut hits = 0u32;
2783 for &trigram in query_trigrams {
2784 let postings = index.postings_for_trigram(trigram, None);
2785 if postings.binary_search(&file_id).is_ok() {
2786 hits += 1;
2787 }
2788 }
2789
2790 if hits == 0 {
2791 return 0.0;
2792 }
2793
2794 let file_trigram_count = index
2795 .file_trigram_count
2796 .get(file_id as usize)
2797 .copied()
2798 .unwrap_or(1)
2799 .max(1) as f32;
2800 (hits as f32) / (1.0 + file_trigram_count.ln())
2801}
2802
2803pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
2804 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
2806 return PathBuf::from(override_dir)
2807 .join("index")
2808 .join(artifact_cache_key(project_root));
2809 }
2810 if let Some(dir) = storage_dir {
2812 return dir.join("index").join(artifact_cache_key(project_root));
2813 }
2814 let home = std::env::var_os("HOME")
2819 .or_else(|| std::env::var_os("USERPROFILE"))
2820 .map(PathBuf::from)
2821 .unwrap_or_else(std::env::temp_dir);
2822 home.join(".cache")
2823 .join("aft")
2824 .join("index")
2825 .join(artifact_cache_key(project_root))
2826}
2827
2828pub(crate) fn build_path_filters(
2829 include: &[String],
2830 exclude: &[String],
2831) -> Result<PathFilters, String> {
2832 Ok(PathFilters {
2833 includes: build_globset(include)?,
2834 excludes: build_globset(exclude)?,
2835 })
2836}
2837
2838pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
2839 walk_project_files_from(root, root, filters)
2840}
2841
2842pub fn walk_project_files_bounded_default(
2843 root: &Path,
2844 max_files: usize,
2845) -> Result<Vec<PathBuf>, usize> {
2846 walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files), true)
2847}
2848
2849pub(crate) fn walk_project_files_bounded_matching<F>(
2850 root: &Path,
2851 filters: &PathFilters,
2852 max_files: usize,
2853 matches_file: F,
2854) -> Result<Vec<PathBuf>, usize>
2855where
2856 F: Fn(&Path) -> bool,
2857{
2858 walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file, true)
2859}
2860
2861pub fn walk_project_files_bounded_default_matching<F>(
2862 root: &Path,
2863 max_files: usize,
2864 matches_file: F,
2865) -> Result<Vec<PathBuf>, usize>
2866where
2867 F: Fn(&Path) -> bool,
2868{
2869 walk_project_files_from_inner_matching(
2870 root,
2871 root,
2872 &PathFilters::default(),
2873 Some(max_files),
2874 matches_file,
2875 true,
2876 )
2877}
2878
2879pub(crate) fn walk_project_files_from(
2880 filter_root: &Path,
2881 search_root: &Path,
2882 filters: &PathFilters,
2883) -> Vec<PathBuf> {
2884 walk_project_files_from_inner(filter_root, search_root, filters, None, true)
2885 .expect("unbounded project walk cannot exceed a file limit")
2886}
2887
2888pub(crate) fn has_any_project_file_from(
2889 filter_root: &Path,
2890 search_root: &Path,
2891 filters: &PathFilters,
2892) -> bool {
2893 walk_project_files_from_inner(filter_root, search_root, filters, Some(0), true).is_err()
2894}
2895
2896fn walk_project_files_from_inner(
2897 filter_root: &Path,
2898 search_root: &Path,
2899 filters: &PathFilters,
2900 max_files: Option<usize>,
2901 sort_by_mtime: bool,
2902) -> Result<Vec<PathBuf>, usize> {
2903 walk_project_files_from_inner_matching(
2904 filter_root,
2905 search_root,
2906 filters,
2907 max_files,
2908 |_| true,
2909 sort_by_mtime,
2910 )
2911}
2912
2913fn project_walk_builder(search_root: &Path) -> WalkBuilder {
2914 let mut builder = WalkBuilder::new(search_root);
2915 builder
2916 .hidden(false)
2917 .git_ignore(true)
2918 .git_global(true)
2919 .git_exclude(true)
2920 .add_custom_ignore_filename(".aftignore")
2921 .filter_entry(|entry| {
2922 let name = entry.file_name().to_string_lossy();
2923 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
2924 return !matches!(
2925 name.as_ref(),
2926 "node_modules"
2927 | "target"
2928 | "venv"
2929 | ".venv"
2930 | ".git"
2931 | "__pycache__"
2932 | ".tox"
2933 | "dist"
2934 | "build"
2935 );
2936 }
2937 true
2938 });
2939 builder
2940}
2941
2942fn walk_project_files_from_inner_matching<F>(
2943 filter_root: &Path,
2944 search_root: &Path,
2945 filters: &PathFilters,
2946 max_files: Option<usize>,
2947 matches_file: F,
2948 sort_by_mtime: bool,
2949) -> Result<Vec<PathBuf>, usize>
2950where
2951 F: Fn(&Path) -> bool,
2952{
2953 let builder = project_walk_builder(search_root);
2954
2955 let mut files = Vec::new();
2956 for entry in builder.build().filter_map(|entry| entry.ok()) {
2957 if !entry
2958 .file_type()
2959 .map_or(false, |file_type| file_type.is_file())
2960 {
2961 continue;
2962 }
2963 let path = entry.into_path();
2964 if filters.matches(filter_root, &path) && matches_file(&path) {
2965 files.push(path);
2966 if max_files.is_some_and(|limit| files.len() > limit) {
2967 return Err(files.len());
2968 }
2969 }
2970 }
2971
2972 if sort_by_mtime {
2973 sort_paths_by_mtime_desc(&mut files);
2974 }
2975 Ok(files)
2976}
2977
2978pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
2979 let bytes = fs::read(path).ok()?;
2980 if is_binary_bytes(&bytes) {
2981 return None;
2982 }
2983 String::from_utf8(bytes).ok()
2984}
2985
2986fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
2987 fs::read(path).ok()
2988}
2989
2990pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
2991 path.strip_prefix(root)
2992 .map(PathBuf::from)
2993 .unwrap_or_else(|_| path.to_path_buf())
2994}
2995
2996pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
2997 let normalized_root = normalize_path(root);
2998 let normalized_path = normalize_path(path);
2999 let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
3000 validate_cached_relative_path(relative)
3001}
3002
3003pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
3004 let relative = validate_cached_relative_path(relative_path)?;
3005 let normalized_root = normalize_path(root);
3006 let full_path = normalize_path(&normalized_root.join(relative));
3007
3008 match fs::canonicalize(&full_path) {
3009 Ok(canonical_path) => {
3010 if canonical_path.starts_with(&normalized_root) {
3011 return Some(full_path);
3012 }
3013
3014 let canonical_root = fs::canonicalize(&normalized_root).ok()?;
3015 canonical_path
3016 .starts_with(&canonical_root)
3017 .then_some(full_path)
3018 }
3019 Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
3020 }
3021}
3022
3023pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
3024 if path.is_absolute() {
3025 return None;
3026 }
3027
3028 let mut normalized = PathBuf::new();
3029 for component in path.components() {
3030 match component {
3031 Component::Normal(part) => normalized.push(part),
3032 Component::CurDir => {}
3033 Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
3034 }
3035 }
3036 (!normalized.as_os_str().is_empty()).then_some(normalized)
3037}
3038
3039pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
3052 use std::collections::HashMap;
3053 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
3054 for path in paths.iter() {
3055 mtimes
3056 .entry(path.clone())
3057 .or_insert_with(|| path_modified_time(path));
3058 }
3059 paths.sort_by(|left, right| {
3060 let left_mtime = mtimes.get(left).and_then(|v| *v);
3061 let right_mtime = mtimes.get(right).and_then(|v| *v);
3062 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
3063 });
3064}
3065
3066pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
3069 use std::collections::HashMap;
3070 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
3071 for m in matches.iter() {
3072 mtimes.entry(m.file.clone()).or_insert_with(|| {
3073 let resolved = resolve_match_path(project_root, &m.file);
3074 path_modified_time(&resolved)
3075 });
3076 }
3077 matches.sort_by(|left, right| {
3078 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
3079 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
3080 right_mtime
3081 .cmp(&left_mtime)
3082 .then_with(|| left.file.cmp(&right.file))
3083 .then_with(|| left.line.cmp(&right.line))
3084 .then_with(|| left.column.cmp(&right.column))
3085 });
3086}
3087
3088fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
3093 matches: &mut [SharedGrepMatch],
3094 modified_for_path: F,
3095) where
3096 F: Fn(&Path) -> Option<SystemTime>,
3097{
3098 use std::collections::HashMap;
3099 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
3100 for m in matches.iter() {
3101 let path = m.file.as_path().to_path_buf();
3102 mtimes
3103 .entry(path.clone())
3104 .or_insert_with(|| modified_for_path(&path));
3105 }
3106 matches.sort_by(|left, right| {
3107 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
3108 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
3109 right_mtime
3110 .cmp(&left_mtime)
3111 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
3112 .then_with(|| left.line.cmp(&right.line))
3113 .then_with(|| left.column.cmp(&right.column))
3114 });
3115}
3116
3117pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
3118 let resolved_project_root = canonicalize_or_normalize(project_root);
3119 let root = match path {
3120 Some(path) => {
3121 let path = PathBuf::from(path);
3122 if path.is_absolute() {
3123 canonicalize_or_normalize(&path)
3124 } else {
3125 normalize_path(&resolved_project_root.join(path))
3126 }
3127 }
3128 None => resolved_project_root.clone(),
3129 };
3130
3131 let use_index = is_within_search_root(&resolved_project_root, &root);
3132 SearchScope { root, use_index }
3133}
3134
3135pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
3136 content_inspector::inspect(content).is_binary()
3137}
3138
3139pub(crate) fn current_git_head(root: &Path) -> Option<String> {
3140 run_git(root, &["rev-parse", "HEAD"])
3141}
3142
3143pub fn artifact_cache_key(project_root: &Path) -> String {
3155 use sha2::{Digest, Sha256};
3156
3157 let mut hasher = Sha256::new();
3158
3159 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
3160 hasher.update(root_commit.as_bytes());
3163 } else {
3164 let canonical_root = canonicalize_or_normalize(project_root);
3166 hasher.update(canonical_root.to_string_lossy().as_bytes());
3167 }
3168
3169 let digest = format!("{:x}", hasher.finalize());
3170 digest[..16].to_string()
3171}
3172
3173pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
3181 use sha2::{Digest, Sha256};
3182
3183 let root = canonicalize_or_normalize(project_root);
3184 let mut files = Vec::new();
3185 collect_ignore_rule_files(&root, &mut files);
3186 if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
3187 if global_ignore.is_file() {
3188 files.push(global_ignore);
3189 }
3190 }
3191 let info_exclude = git_info_exclude_path(&root);
3192 if info_exclude.is_file() {
3193 files.push(info_exclude);
3194 }
3195 files.sort();
3196 files.dedup();
3197
3198 let mut hasher = Sha256::new();
3199 hasher.update(b"aft-ignore-rules-v1\0");
3200 for path in files {
3201 if let Some(relative) = cache_relative_path(&root, &path) {
3202 hasher.update(relative.to_string_lossy().as_bytes());
3203 } else {
3204 hasher.update(path.to_string_lossy().as_bytes());
3205 }
3206 hasher.update(b"\0");
3207 match fs::read(&path) {
3208 Ok(bytes) => hasher.update(&bytes),
3209 Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
3210 }
3211 hasher.update(b"\0");
3212 }
3213
3214 format!("{:x}", hasher.finalize())
3215}
3216
3217fn git_info_exclude_path(root: &Path) -> PathBuf {
3218 run_git(
3219 root,
3220 &["rev-parse", "--path-format=absolute", "--git-common-dir"],
3221 )
3222 .map(PathBuf::from)
3223 .unwrap_or_else(|| root.join(".git"))
3224 .join("info")
3225 .join("exclude")
3226}
3227
3228fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
3229 let mut builder = WalkBuilder::new(root);
3230 builder
3231 .hidden(false)
3232 .git_ignore(true)
3233 .git_global(true)
3234 .git_exclude(true)
3235 .add_custom_ignore_filename(".aftignore")
3236 .filter_entry(|entry| {
3237 let name = entry.file_name().to_string_lossy();
3238 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
3239 return !matches!(
3240 name.as_ref(),
3241 ".git"
3242 | "node_modules"
3243 | "target"
3244 | "venv"
3245 | ".venv"
3246 | "__pycache__"
3247 | ".tox"
3248 | "dist"
3249 | "build"
3250 );
3251 }
3252 true
3253 });
3254
3255 for entry in builder.build().filter_map(|entry| entry.ok()) {
3256 if !entry
3257 .file_type()
3258 .map_or(false, |file_type| file_type.is_file())
3259 {
3260 continue;
3261 }
3262 let file_name = entry.file_name();
3263 if file_name == ".gitignore" || file_name == ".aftignore" {
3264 files.push(entry.into_path());
3265 }
3266 }
3267}
3268
3269#[cfg(test)]
3271pub(crate) fn count_ignore_rule_discovery_dirs(root: &Path) -> usize {
3272 let mut dirs = 0usize;
3273 let mut builder = WalkBuilder::new(root);
3274 builder
3275 .hidden(false)
3276 .git_ignore(true)
3277 .git_global(true)
3278 .git_exclude(true)
3279 .add_custom_ignore_filename(".aftignore");
3280 for entry in builder.build().filter_map(|entry| entry.ok()) {
3281 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
3282 dirs += 1;
3283 }
3284 }
3285 dirs
3286}
3287
3288#[cfg(test)]
3290pub(crate) fn count_ignore_rule_discovery_dirs_legacy_stack(root: &Path) -> usize {
3291 let mut stack = vec![root.to_path_buf()];
3292 let mut dirs = 0usize;
3293 while let Some(dir) = stack.pop() {
3294 dirs += 1;
3295 let Ok(entries) = fs::read_dir(&dir) else {
3296 continue;
3297 };
3298 for entry in entries.flatten() {
3299 let path = entry.path();
3300 let file_name = entry.file_name();
3301 if file_name == ".gitignore" || file_name == ".aftignore" {
3302 continue;
3303 }
3304 let Ok(file_type) = entry.file_type() else {
3305 continue;
3306 };
3307 if !file_type.is_dir() || file_type.is_symlink() {
3308 continue;
3309 }
3310 if matches!(
3311 file_name.to_str().unwrap_or(""),
3312 ".git"
3313 | "node_modules"
3314 | "target"
3315 | "venv"
3316 | ".venv"
3317 | "__pycache__"
3318 | ".tox"
3319 | "dist"
3320 | "build"
3321 ) {
3322 continue;
3323 }
3324 stack.push(path);
3325 }
3326 }
3327 dirs
3328}
3329
3330impl PathFilters {
3331 pub(crate) fn matches(&self, root: &Path, path: &Path) -> bool {
3332 let relative = to_glob_path(&relative_to_root(root, path));
3333 if self
3334 .includes
3335 .as_ref()
3336 .is_some_and(|includes| !includes.is_match(&relative))
3337 {
3338 return false;
3339 }
3340 if self
3341 .excludes
3342 .as_ref()
3343 .is_some_and(|excludes| excludes.is_match(&relative))
3344 {
3345 return false;
3346 }
3347 true
3348 }
3349}
3350
3351fn canonicalize_or_normalize(path: &Path) -> PathBuf {
3352 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
3353}
3354
3355fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
3356 if path.is_absolute() {
3357 path.to_path_buf()
3358 } else {
3359 project_root.join(path)
3360 }
3361}
3362
3363fn path_modified_time(path: &Path) -> Option<SystemTime> {
3364 fs::metadata(path)
3365 .and_then(|metadata| metadata.modified())
3366 .ok()
3367}
3368
3369fn normalize_path(path: &Path) -> PathBuf {
3370 let mut result = PathBuf::new();
3371 for component in path.components() {
3372 match component {
3373 Component::ParentDir => {
3374 if !result.pop() {
3375 result.push(component);
3376 }
3377 }
3378 Component::CurDir => {}
3379 _ => result.push(component),
3380 }
3381 }
3382 result
3383}
3384
3385fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
3386 if let Ok(canonical) = fs::canonicalize(path) {
3387 return canonical;
3388 }
3389
3390 let Some(parent) = path.parent() else {
3391 return path.to_path_buf();
3392 };
3393 let Some(file_name) = path.file_name() else {
3394 return path.to_path_buf();
3395 };
3396
3397 fs::canonicalize(parent)
3398 .map(|canonical_parent| canonical_parent.join(file_name))
3399 .unwrap_or_else(|_| path.to_path_buf())
3400}
3401
3402fn verify_file_mtimes(index: &mut SearchIndex) {
3405 let filters = PathFilters::default();
3406 let current_files = walk_project_files(&index.project_root, &filters);
3407 let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
3408 let mut stale_paths = Vec::new();
3409 let mut removed_paths = Vec::new();
3410
3411 for entry in Arc::make_mut(&mut index.files).iter_mut() {
3412 if entry.path.as_os_str().is_empty() {
3413 continue; }
3415 if !current_file_set.contains(&entry.path) {
3416 removed_paths.push(entry.path.clone());
3417 continue;
3418 }
3419 let cached = FileFreshness {
3420 mtime: entry.modified,
3421 size: entry.size,
3422 content_hash: entry.content_hash,
3423 };
3424 match cache_freshness::verify_file_strict(&entry.path, &cached) {
3425 FreshnessVerdict::HotFresh => {}
3426 FreshnessVerdict::ContentFresh {
3427 new_mtime,
3428 new_size,
3429 } => {
3430 entry.modified = new_mtime;
3431 entry.size = new_size;
3432 }
3433 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
3434 stale_paths.push(entry.path.clone())
3435 }
3436 }
3437 }
3438
3439 for path in &removed_paths {
3440 index.remove_file(path);
3441 }
3442
3443 for path in &stale_paths {
3447 if current_file_set.contains(path) {
3448 index.update_file(path);
3449 } else {
3450 index.remove_file(path);
3451 }
3452 }
3453
3454 for path in current_files {
3456 if !index.path_to_id.contains_key(&path) {
3457 index.update_file(&path);
3458 }
3459 }
3460
3461 if !stale_paths.is_empty() {
3462 crate::slog_info!(
3463 "search index: refreshed {} stale file(s) from disk cache",
3464 stale_paths.len()
3465 );
3466 }
3467}
3468
3469fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
3470 normalize_path(path).starts_with(normalize_path(search_root))
3471}
3472
3473impl QueryBuild {
3474 fn into_query(self) -> RegexQuery {
3475 let mut query = RegexQuery::default();
3476
3477 for run in self.and_runs {
3478 add_run_to_and_query(&mut query, &run);
3479 }
3480
3481 for group in self.or_groups {
3482 let mut trigrams = BTreeSet::new();
3483 let mut filters = HashMap::new();
3484 for run in group {
3485 for (trigram, filter) in trigram_filters(&run) {
3486 trigrams.insert(trigram);
3487 merge_filter(filters.entry(trigram).or_default(), filter);
3488 }
3489 }
3490 if !trigrams.is_empty() {
3491 query.or_groups.push(trigrams.into_iter().collect());
3492 query.or_filters.push(filters);
3493 }
3494 }
3495
3496 query
3497 }
3498}
3499
3500fn build_query(hir: &Hir) -> QueryBuild {
3501 match hir.kind() {
3502 HirKind::Literal(literal) => {
3503 if literal.0.len() >= 3 {
3504 QueryBuild {
3505 and_runs: vec![literal.0.to_vec()],
3506 or_groups: Vec::new(),
3507 }
3508 } else {
3509 QueryBuild::default()
3510 }
3511 }
3512 HirKind::Capture(capture) => build_query(&capture.sub),
3513 HirKind::Concat(parts) => {
3514 let mut build = QueryBuild::default();
3515 for part in parts {
3516 let part_build = build_query(part);
3517 build.and_runs.extend(part_build.and_runs);
3518 build.or_groups.extend(part_build.or_groups);
3519 }
3520 build
3521 }
3522 HirKind::Alternation(parts) => {
3523 let mut group = Vec::new();
3524 for part in parts {
3525 let Some(mut choices) = guaranteed_run_choices(part) else {
3526 return QueryBuild::default();
3527 };
3528 group.append(&mut choices);
3529 }
3530 if group.is_empty() {
3531 QueryBuild::default()
3532 } else {
3533 QueryBuild {
3534 and_runs: Vec::new(),
3535 or_groups: vec![group],
3536 }
3537 }
3538 }
3539 HirKind::Repetition(repetition) => {
3540 if repetition.min == 0 {
3541 QueryBuild::default()
3542 } else {
3543 build_query(&repetition.sub)
3544 }
3545 }
3546 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
3547 }
3548}
3549
3550fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
3551 match hir.kind() {
3552 HirKind::Literal(literal) => {
3553 if literal.0.len() >= 3 {
3554 Some(vec![literal.0.to_vec()])
3555 } else {
3556 None
3557 }
3558 }
3559 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
3560 HirKind::Concat(parts) => {
3561 let mut runs = Vec::new();
3562 for part in parts {
3563 if let Some(mut part_runs) = guaranteed_run_choices(part) {
3564 runs.append(&mut part_runs);
3565 }
3566 }
3567 if runs.is_empty() {
3568 None
3569 } else {
3570 Some(runs)
3571 }
3572 }
3573 HirKind::Alternation(parts) => {
3574 let mut runs = Vec::new();
3575 for part in parts {
3576 let Some(mut part_runs) = guaranteed_run_choices(part) else {
3577 return None;
3578 };
3579 runs.append(&mut part_runs);
3580 }
3581 if runs.is_empty() {
3582 None
3583 } else {
3584 Some(runs)
3585 }
3586 }
3587 HirKind::Repetition(repetition) => {
3588 if repetition.min == 0 {
3589 None
3590 } else {
3591 guaranteed_run_choices(&repetition.sub)
3592 }
3593 }
3594 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
3595 }
3596}
3597
3598fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
3599 for (trigram, filter) in trigram_filters(run) {
3600 if !query.and_trigrams.contains(&trigram) {
3601 query.and_trigrams.push(trigram);
3602 }
3603 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
3604 }
3605}
3606
3607fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
3608 trigram_filter_map(run, false).into_iter().collect()
3609}
3610
3611fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
3612 target.next_mask |= filter.next_mask;
3613 target.loc_mask |= filter.loc_mask;
3614}
3615
3616fn mask_for_next_char(next_char: u8) -> u8 {
3617 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
3618 1u8 << bit
3619}
3620
3621fn mask_for_position(position: usize) -> u8 {
3622 1u8 << (position % 8)
3623}
3624
3625fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
3626 if patterns.is_empty() {
3627 return Ok(None);
3628 }
3629
3630 let mut builder = GlobSetBuilder::new();
3631 for pattern in patterns {
3632 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
3633 builder.add(glob);
3634 }
3635 builder.build().map(Some).map_err(|error| error.to_string())
3636}
3637
3638fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
3639 let mut buffer = [0u8; 4];
3640 reader.read_exact(&mut buffer)?;
3641 Ok(u32::from_le_bytes(buffer))
3642}
3643
3644fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
3645 let mut buffer = [0u8; 8];
3646 reader.read_exact(&mut buffer)?;
3647 Ok(u64::from_le_bytes(buffer))
3648}
3649
3650fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
3651 writer.write_all(&value.to_le_bytes())
3652}
3653
3654fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
3655 writer.write_all(&value.to_le_bytes())
3656}
3657
3658fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
3659 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
3660 return Err(std::io::Error::other("search index checksum missing"));
3661 };
3662 let expected = u32::from_le_bytes(*stored);
3663 let actual = crc32fast::hash(body);
3664 if actual != expected {
3665 return Err(std::io::Error::other("search index checksum mismatch"));
3666 }
3667 Ok(())
3668}
3669
3670fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
3671 let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
3672 total_len.checked_sub(pos)
3673}
3674
3675fn run_git(root: &Path, args: &[&str]) -> Option<String> {
3676 let output = Command::new("git")
3677 .arg("-C")
3678 .arg(root)
3679 .args(args)
3680 .output()
3681 .ok()?;
3682 if !output.status.success() {
3683 return None;
3684 }
3685 let value = String::from_utf8(output.stdout).ok()?;
3686 let value = value.trim().to_string();
3687 if value.is_empty() {
3688 None
3689 } else {
3690 Some(value)
3691 }
3692}
3693
3694fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
3695 let diff_range = format!("{}..{}", from, to);
3696 let output = match Command::new("git")
3697 .arg("-C")
3698 .arg(root)
3699 .args(["diff", "--name-status", "-M", &diff_range])
3700 .output()
3701 {
3702 Ok(output) => output,
3703 Err(_) => return false,
3704 };
3705
3706 if !output.status.success() {
3707 return false;
3708 }
3709
3710 let Ok(diff) = String::from_utf8(output.stdout) else {
3711 return false;
3712 };
3713
3714 for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
3715 let mut fields = line.split('\t');
3716 let Some(status) = fields.next() else {
3717 continue;
3718 };
3719
3720 if status.starts_with('R') {
3721 let Some(old_path) = fields
3722 .next()
3723 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3724 else {
3725 continue;
3726 };
3727 let Some(new_path) = fields
3728 .next()
3729 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3730 else {
3731 continue;
3732 };
3733 index.remove_file(&old_path);
3734 index.update_file(&new_path);
3735 continue;
3736 }
3737
3738 let Some(path) = fields
3739 .next()
3740 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
3741 else {
3742 continue;
3743 };
3744 if status.starts_with('D') || !path.exists() {
3745 index.remove_file(&path);
3746 } else {
3747 index.update_file(&path);
3748 }
3749 }
3750
3751 true
3752}
3753
3754fn is_binary_path(path: &Path, size: u64) -> bool {
3755 if size == 0 {
3756 return false;
3757 }
3758
3759 let mut file = match File::open(path) {
3760 Ok(file) => file,
3761 Err(_) => return true,
3762 };
3763
3764 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
3765 match file.read(&mut preview) {
3766 Ok(read) => is_binary_bytes(&preview[..read]),
3767 Err(_) => true,
3768 }
3769}
3770
3771fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
3772 let mut starts = vec![0usize];
3773 for (index, byte) in content.iter().copied().enumerate() {
3774 if byte == b'\n' {
3775 starts.push(index + 1);
3776 }
3777 }
3778 starts
3779}
3780
3781fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
3782 let line_index = match line_starts.binary_search(&offset) {
3783 Ok(index) => index,
3784 Err(index) => index.saturating_sub(1),
3785 };
3786 let line_start = line_starts.get(line_index).copied().unwrap_or(0);
3787 let line_end = content[line_start..]
3788 .iter()
3789 .position(|byte| *byte == b'\n')
3790 .map(|length| line_start + length)
3791 .unwrap_or(content.len());
3792 let mut line_slice = &content[line_start..line_end];
3793 if line_slice.ends_with(b"\r") {
3794 line_slice = &line_slice[..line_slice.len() - 1];
3795 }
3796 let line_text = String::from_utf8_lossy(line_slice).into_owned();
3797 let column = String::from_utf8_lossy(&content[line_start..offset])
3798 .chars()
3799 .count() as u32
3800 + 1;
3801 (line_index as u32 + 1, column, line_text)
3802}
3803
3804fn to_glob_path(path: &Path) -> String {
3805 path.to_string_lossy().replace('\\', "/")
3806}
3807
3808#[cfg(test)]
3809mod tests {
3810 use std::process::Command;
3811
3812 use super::*;
3813
3814 #[test]
3815 fn cached_path_under_root_allows_missing_lexical_child() {
3816 let dir = tempfile::tempdir().expect("create temp dir");
3817 let project = dir.path().join("project");
3818 fs::create_dir_all(&project).expect("create project dir");
3819 let root = fs::canonicalize(&project).expect("canonicalize project");
3820
3821 let path = cached_path_under_root(&root, Path::new("future/file.rs"))
3822 .expect("missing child should fall back to lexical validation");
3823
3824 assert_eq!(path, root.join("future/file.rs"));
3825 }
3826
3827 #[cfg(unix)]
3828 #[test]
3829 fn cached_path_under_root_rejects_symlink_escape() {
3830 let dir = tempfile::tempdir().expect("create temp dir");
3831 let project = dir.path().join("project");
3832 let outside = dir.path().join("outside");
3833 fs::create_dir_all(&project).expect("create project dir");
3834 fs::create_dir_all(&outside).expect("create outside dir");
3835 fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
3836 std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
3837 let root = fs::canonicalize(&project).expect("canonicalize project");
3838
3839 assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
3840 }
3841
3842 #[test]
3843 fn extract_trigrams_tracks_next_char_and_position() {
3844 let trigrams = extract_trigrams(b"Rust");
3845 assert_eq!(trigrams.len(), 2);
3846 assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
3847 assert_eq!(
3848 trigrams[1],
3849 (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
3850 );
3851 }
3852
3853 #[test]
3854 fn index_file_trigram_filters_match_legacy_extraction() {
3855 let dir = tempfile::tempdir().expect("create temp dir");
3856 let path = dir.path().join("sample.txt");
3857 let content = b"Rust rust RUST\nxy";
3858 fs::write(&path, content).expect("write sample");
3859
3860 let mut expected = BTreeMap::new();
3861 for (trigram, next_char, position) in extract_trigrams(content) {
3862 let entry: &mut PostingFilter = expected.entry(trigram).or_default();
3863 entry.next_mask |= mask_for_next_char(next_char);
3864 entry.loc_mask |= mask_for_position(position);
3865 }
3866
3867 let mut index = SearchIndex::new();
3868 index.project_root = dir.path().to_path_buf();
3869 index.index_file(&path, content);
3870
3871 let file_id = *index.path_to_id.get(&path).expect("file indexed");
3872 let file_trigrams = index
3873 .delta_file_trigrams
3874 .get(&file_id)
3875 .expect("delta file trigrams");
3876 assert_eq!(file_trigrams, &expected.keys().copied().collect::<Vec<_>>());
3877 for (trigram, filter) in expected {
3878 let postings = index
3879 .delta_postings
3880 .get(&trigram)
3881 .expect("delta posting list");
3882 assert_eq!(postings.len(), 1);
3883 assert_eq!(postings[0].file_id, file_id);
3884 assert_eq!(postings[0].next_mask, filter.next_mask);
3885 assert_eq!(postings[0].loc_mask, filter.loc_mask);
3886 }
3887 }
3888
3889 #[test]
3890 fn decompose_regex_extracts_literals_and_alternations() {
3891 let query = decompose_regex("abc(def|ghi)xyz");
3892 assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
3893 assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
3894 assert_eq!(query.or_groups.len(), 1);
3895 assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
3896 assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
3897 }
3898
3899 #[test]
3900 fn candidates_intersect_posting_lists() {
3901 let mut index = SearchIndex::new();
3902 let dir = tempfile::tempdir().expect("create temp dir");
3903 let alpha = dir.path().join("alpha.txt");
3904 let beta = dir.path().join("beta.txt");
3905 fs::write(&alpha, "abcdef").expect("write alpha");
3906 fs::write(&beta, "abcxyz").expect("write beta");
3907 index.project_root = dir.path().to_path_buf();
3908 index.index_file(&alpha, b"abcdef");
3909 index.index_file(&beta, b"abcxyz");
3910
3911 let query = RegexQuery {
3912 and_trigrams: vec![
3913 pack_trigram(b'a', b'b', b'c'),
3914 pack_trigram(b'd', b'e', b'f'),
3915 ],
3916 ..RegexQuery::default()
3917 };
3918
3919 let candidates = index.candidates(&query);
3920 assert_eq!(candidates.len(), 1);
3921 assert_eq!(index.files[candidates[0] as usize].path, alpha);
3922 }
3923
3924 #[test]
3925 fn candidates_apply_bloom_filters() {
3926 let mut index = SearchIndex::new();
3927 let dir = tempfile::tempdir().expect("create temp dir");
3928 let file = dir.path().join("sample.txt");
3929 fs::write(&file, "abcd efgh").expect("write sample");
3930 index.project_root = dir.path().to_path_buf();
3931 index.index_file(&file, b"abcd efgh");
3932
3933 let trigram = pack_trigram(b'a', b'b', b'c');
3934 let matching_filter = PostingFilter {
3935 next_mask: mask_for_next_char(b'd'),
3936 loc_mask: mask_for_position(0),
3937 };
3938 let non_matching_filter = PostingFilter {
3939 next_mask: mask_for_next_char(b'z'),
3940 loc_mask: mask_for_position(0),
3941 };
3942
3943 assert_eq!(
3944 index
3945 .postings_for_trigram(trigram, Some(matching_filter))
3946 .len(),
3947 1
3948 );
3949 assert!(index
3950 .postings_for_trigram(trigram, Some(non_matching_filter))
3951 .is_empty());
3952 }
3953
3954 #[test]
3955 fn base_delta_readd_masks_base_and_keeps_postings_sorted() {
3956 let dir = tempfile::tempdir().expect("create temp dir");
3957 let project = dir.path().join("project");
3958 fs::create_dir_all(&project).expect("create project dir");
3959 let a = project.join("a.txt");
3960 let b = project.join("b.txt");
3961 fs::write(&a, "abc old").expect("write a");
3962 fs::write(&b, "abc base").expect("write b");
3963
3964 let mut built = SearchIndex::build(&project);
3965 let cache_dir = dir.path().join("cache");
3966 built.write_to_disk(&cache_dir, None);
3967 let mut index = SearchIndex::read_from_disk(&cache_dir, &project).expect("load base");
3968 assert_eq!(index.base_file_count, 2);
3969
3970 let old_a_id = *index.path_to_id.get(&a).expect("original a id");
3971 let b_id = *index.path_to_id.get(&b).expect("original b id");
3972 index.remove_file(&a);
3973 index.index_file(&a, b"abc new");
3974 let new_id = *index.path_to_id.get(&a).expect("re-added file id");
3975 assert!(new_id >= index.base_file_count);
3976 let abc = pack_trigram(b'a', b'b', b'c');
3977 let ids = index.postings_for_trigram(abc, None);
3978 assert_eq!(ids, {
3979 let mut expected = vec![b_id, new_id];
3980 expected.sort_unstable();
3981 expected
3982 });
3983 assert!(!ids.contains(&old_a_id));
3984 }
3985
3986 #[test]
3987 fn write_to_disk_compacts_base_and_delta() {
3988 let dir = tempfile::tempdir().expect("create temp dir");
3989 let project = dir.path().join("project");
3990 fs::create_dir_all(&project).expect("create project dir");
3991 let file = project.join("src.txt");
3992 fs::write(&file, "abcdef").expect("write source");
3993 let mut index = SearchIndex::build(&project);
3994 let cache_dir = dir.path().join("cache");
3995 index.write_to_disk(&cache_dir, None);
3996 fs::write(&file, "abcxyz").expect("edit source");
3997 index.update_file(&file);
3998 assert!(!index.delta_postings.is_empty());
3999 index.write_to_disk(&cache_dir, None);
4000 assert!(index.delta_postings.is_empty());
4001 assert!(index.superseded.is_empty());
4002 assert_eq!(
4003 index.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4004 vec![0]
4005 );
4006 assert!(index
4007 .postings_for_trigram(pack_trigram(b'd', b'e', b'f'), None)
4008 .is_empty());
4009 }
4010
4011 #[test]
4012 fn legacy_cache_without_file_trigram_count_migrates_streaming_counts() {
4013 let dir = tempfile::tempdir().expect("create temp dir");
4014 let project = dir.path().join("project");
4015 fs::create_dir_all(&project).expect("create project dir");
4016 fs::write(project.join("src.txt"), "abcdef").expect("write source");
4017 let cache_dir = dir.path().join("cache");
4018 let mut index = SearchIndex::build(&project);
4019 index.write_to_disk(&cache_dir, None);
4020 let cache_path = cache_dir.join("cache.bin");
4021 strip_file_trigram_count_extension(&cache_path);
4022 assert!(!cache_has_file_trigram_count_extension(&cache_path));
4023
4024 let loaded = SearchIndex::read_from_disk(&cache_dir, &project).expect("load legacy cache");
4025 assert_eq!(loaded.file_trigram_count.as_ref(), &[4]);
4026 assert!(loaded.delta_postings.is_empty());
4027 assert!(cache_has_file_trigram_count_extension(&cache_path));
4028 }
4029
4030 #[test]
4031 fn compaction_flags_buffer_paths_while_running() {
4032 let dir = tempfile::tempdir().expect("create temp dir");
4033 let project = dir.path().join("project");
4034 fs::create_dir_all(&project).expect("create project dir");
4035 let file = project.join("src.txt");
4036 fs::write(&file, "abcdef").expect("write source");
4037 let mut index = SearchIndex::new();
4038 index.project_root = project.clone();
4039 {
4040 let mut state = index.compaction_state.lock().expect("compaction state");
4041 state.running = true;
4042 }
4043 index.update_file(&file);
4044 let state = index.compaction_state.lock().expect("compaction state");
4045 assert!(state.requested_again || !index.delta_postings.is_empty());
4046 assert!(state.buffered_paths.contains(&file));
4047 }
4048
4049 fn cache_has_file_trigram_count_extension(cache_path: &Path) -> bool {
4050 file_trigram_count_extension_range(cache_path).is_some()
4051 }
4052
4053 fn strip_file_trigram_count_extension(cache_path: &Path) {
4054 let mut bytes = fs::read(cache_path).expect("read cache");
4055 let (start, end) = file_trigram_count_extension_range_from_bytes(&bytes)
4056 .expect("file trigram count extension");
4057 bytes.drain(start..end);
4058 let postings_len_total = u64::from_le_bytes(bytes[8..16].try_into().unwrap())
4059 - u64::try_from(end - start).unwrap();
4060 bytes[8..16].copy_from_slice(&postings_len_total.to_le_bytes());
4061 let checksum_pos = 16 + usize::try_from(postings_len_total).unwrap() - 4;
4062 let checksum = crc32fast::hash(&bytes[16..checksum_pos]);
4063 bytes[checksum_pos..checksum_pos + 4].copy_from_slice(&checksum.to_le_bytes());
4064 fs::write(cache_path, bytes).expect("write legacy cache");
4065 }
4066
4067 fn file_trigram_count_extension_range(cache_path: &Path) -> Option<(usize, usize)> {
4068 let bytes = fs::read(cache_path).ok()?;
4069 file_trigram_count_extension_range_from_bytes(&bytes)
4070 }
4071
4072 fn file_trigram_count_extension_range_from_bytes(bytes: &[u8]) -> Option<(usize, usize)> {
4073 let postings_len_total = u64::from_le_bytes(bytes.get(8..16)?.try_into().ok()?) as usize;
4074 let postings_start = 16usize;
4075 let postings_end = postings_start.checked_add(postings_len_total)?;
4076 let postings_body_end = postings_end.checked_sub(4)?;
4077 let mut reader = Cursor::new(&bytes[postings_start..postings_body_end]);
4078 let mut magic = [0u8; 8];
4079 reader.read_exact(&mut magic).ok()?;
4080 if &magic != INDEX_MAGIC {
4081 return None;
4082 }
4083 read_u32(&mut reader).ok()?;
4084 let head_len = read_u32(&mut reader).ok()? as u64;
4085 let root_len = read_u32(&mut reader).ok()? as u64;
4086 let ignore_len = read_u32(&mut reader).ok()? as u64;
4087 read_u64(&mut reader).ok()?;
4088 let file_count = read_u32(&mut reader).ok()? as usize;
4089 let skip = head_len.checked_add(root_len)?.checked_add(ignore_len)?;
4090 reader.seek(SeekFrom::Current(skip as i64)).ok()?;
4091 for _ in 0..file_count {
4092 let mut unindexed = [0u8; 1];
4093 reader.read_exact(&mut unindexed).ok()?;
4094 let path_len = read_u32(&mut reader).ok()? as u64;
4095 read_u64(&mut reader).ok()?;
4096 read_u64(&mut reader).ok()?;
4097 read_u32(&mut reader).ok()?;
4098 let mut hash = [0u8; 32];
4099 reader.read_exact(&mut hash).ok()?;
4100 reader.seek(SeekFrom::Current(path_len as i64)).ok()?;
4101 }
4102 let postings_blob_len = read_u64(&mut reader).ok()? as usize;
4103 let extension_start = postings_start
4104 .checked_add(reader.position() as usize)?
4105 .checked_add(postings_blob_len)?;
4106 if extension_start + 16 > postings_body_end {
4107 return None;
4108 }
4109 if bytes.get(extension_start..extension_start + 8)? != FILE_TRIGRAM_COUNT_MAGIC {
4110 return None;
4111 }
4112 let count = u32::from_le_bytes(
4113 bytes[extension_start + 12..extension_start + 16]
4114 .try_into()
4115 .ok()?,
4116 ) as usize;
4117 let extension_end = extension_start
4118 .checked_add(16)?
4119 .checked_add(count.checked_mul(4)?)?;
4120 (extension_end <= postings_body_end).then_some((extension_start, extension_end))
4121 }
4122
4123 #[test]
4124 fn disk_round_trip_preserves_postings_and_files() {
4125 let dir = tempfile::tempdir().expect("create temp dir");
4126 let project = dir.path().join("project");
4127 fs::create_dir_all(&project).expect("create project dir");
4128 let file = project.join("src.txt");
4129 fs::write(&file, "abcdef").expect("write source");
4130
4131 let mut index = SearchIndex::build(&project);
4132 index.git_head = Some("deadbeef".to_string());
4133 let cache_dir = dir.path().join("cache");
4134 let head = index.git_head.clone();
4135 index.write_to_disk(&cache_dir, head.as_deref());
4136
4137 let loaded =
4138 SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
4139 assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
4140 assert_eq!(loaded.files.len(), 1);
4141 assert_eq!(
4142 relative_to_root(&loaded.project_root, &loaded.files[0].path),
4143 PathBuf::from("src.txt")
4144 );
4145 assert_eq!(loaded.trigram_count(), index.trigram_count());
4146 assert_eq!(
4147 loaded.postings_for_trigram(pack_trigram(b'a', b'b', b'c'), None),
4148 vec![0]
4149 );
4150 assert_eq!(
4151 loaded.file_trigram_count.as_ref(),
4152 index.file_trigram_count.as_ref()
4153 );
4154 }
4155
4156 #[test]
4157 fn cache_path_helpers_reject_absolute_and_parent_paths() {
4158 let root = PathBuf::from("/tmp/aft-project");
4159
4160 assert_eq!(
4161 cache_relative_path(&root, &root.join("src/lib.rs")),
4162 Some(PathBuf::from("src/lib.rs"))
4163 );
4164 assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
4165 assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
4166 assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
4167 assert_eq!(
4168 cached_path_under_root(&root, Path::new("src/./lib.rs")),
4169 Some(root.join("src/lib.rs"))
4170 );
4171 }
4172
4173 #[test]
4174 fn refresh_after_head_change_removes_renames_and_detects_local_files() {
4175 let dir = tempfile::tempdir().expect("create temp dir");
4176 let project = dir.path().join("project");
4177 fs::create_dir_all(&project).expect("create project dir");
4178 let canonical_project = fs::canonicalize(&project).expect("canonical project");
4179 fs::write(project.join("old.txt"), "old token\n").expect("write old");
4180 fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
4181
4182 Command::new("git")
4183 .arg("init")
4184 .arg(&project)
4185 .status()
4186 .expect("git init");
4187 for args in [
4188 ["config", "user.email", "aft@example.invalid"],
4189 ["config", "user.name", "AFT Test"],
4190 ] {
4191 Command::new("git")
4192 .arg("-C")
4193 .arg(&project)
4194 .args(args)
4195 .status()
4196 .expect("git config");
4197 }
4198 Command::new("git")
4199 .arg("-C")
4200 .arg(&project)
4201 .args(["add", "."])
4202 .status()
4203 .expect("git add initial");
4204 Command::new("git")
4205 .arg("-C")
4206 .arg(&project)
4207 .args(["commit", "-m", "initial"])
4208 .status()
4209 .expect("git commit initial");
4210 let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
4211 let mut baseline = SearchIndex::build(&project);
4212 baseline.git_head = Some(previous.clone());
4213
4214 fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
4215 Command::new("git")
4216 .arg("-C")
4217 .arg(&project)
4218 .args(["add", "-A"])
4219 .status()
4220 .expect("git add rename");
4221 Command::new("git")
4222 .arg("-C")
4223 .arg(&project)
4224 .args(["commit", "-m", "rename"])
4225 .status()
4226 .expect("git commit rename");
4227 let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
4228
4229 fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
4230 fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
4231
4232 let refreshed = SearchIndex::rebuild_or_refresh(
4233 &project,
4234 DEFAULT_MAX_FILE_SIZE,
4235 Some(current),
4236 Some(baseline),
4237 None,
4238 );
4239
4240 assert!(!refreshed
4241 .path_to_id
4242 .contains_key(&canonical_project.join("old.txt")));
4243 assert!(refreshed
4244 .path_to_id
4245 .contains_key(&canonical_project.join("new.txt")));
4246 assert!(refreshed
4247 .path_to_id
4248 .contains_key(&canonical_project.join("untracked.txt")));
4249 let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
4250 assert_eq!(matches.matches.len(), 1);
4251 }
4252
4253 #[test]
4254 fn read_from_disk_rejects_corrupt_lookup_checksum() {
4255 let dir = tempfile::tempdir().expect("create temp dir");
4256 let project = dir.path().join("project");
4257 fs::create_dir_all(&project).expect("create project dir");
4258 fs::write(project.join("src.txt"), "abcdef").expect("write source");
4259
4260 let mut index = SearchIndex::build(&project);
4261 let cache_dir = dir.path().join("cache");
4262 index.write_to_disk(&cache_dir, None);
4263
4264 let cache_path = cache_dir.join("cache.bin");
4265 let mut bytes = fs::read(&cache_path).expect("read cache");
4266 let last = bytes.len() - 1;
4267 bytes[last] ^= 0xff;
4268 fs::write(&cache_path, bytes).expect("write corrupted cache");
4269
4270 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
4271 }
4272
4273 #[test]
4274 fn write_to_disk_uses_temp_files_and_cleans_them_up() {
4275 let dir = tempfile::tempdir().expect("create temp dir");
4276 let project = dir.path().join("project");
4277 fs::create_dir_all(&project).expect("create project dir");
4278 fs::write(project.join("src.txt"), "abcdef").expect("write source");
4279
4280 let mut index = SearchIndex::build(&project);
4281 let cache_dir = dir.path().join("cache");
4282 index.write_to_disk(&cache_dir, None);
4283
4284 assert!(cache_dir.join("cache.bin").is_file());
4285 assert!(fs::read_dir(&cache_dir)
4286 .expect("read cache dir")
4287 .all(|entry| !entry
4288 .expect("cache entry")
4289 .file_name()
4290 .to_string_lossy()
4291 .contains(".tmp.")));
4292 }
4293
4294 #[test]
4295 fn concurrent_search_index_writes_do_not_corrupt() {
4296 let dir = tempfile::tempdir().expect("create temp dir");
4297 let project = dir.path().join("project");
4298 fs::create_dir_all(&project).expect("create project dir");
4299 fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
4300 let cache_dir = dir.path().join("cache");
4301
4302 let a_project = project.clone();
4303 let a_cache = cache_dir.clone();
4304 let a = std::thread::spawn(move || {
4305 let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
4306 let mut index = SearchIndex::build(&a_project);
4307 index.write_to_disk(&a_cache, None);
4308 });
4309 let b_project = project.clone();
4310 let b_cache = cache_dir.clone();
4311 let b = std::thread::spawn(move || {
4312 let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
4313 let mut index = SearchIndex::build(&b_project);
4314 index.write_to_disk(&b_cache, None);
4315 });
4316 a.join().expect("writer a");
4317 b.join().expect("writer b");
4318
4319 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
4320 }
4321
4322 #[test]
4323 fn search_index_atomic_rename_survives_partial_write() {
4324 let dir = tempfile::tempdir().expect("create temp dir");
4325 let cache_dir = dir.path().join("cache");
4326 fs::create_dir_all(&cache_dir).expect("create cache dir");
4327 fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
4328
4329 assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
4330 }
4331
4332 #[test]
4333 fn artifact_cache_key_shared_across_clones_of_same_repo() {
4334 let dir = tempfile::tempdir().expect("create temp dir");
4335 let source = dir.path().join("source");
4336 fs::create_dir_all(&source).expect("create source repo dir");
4337 fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
4338
4339 assert!(Command::new("git")
4340 .current_dir(&source)
4341 .args(["init"])
4342 .status()
4343 .expect("init git repo")
4344 .success());
4345 assert!(Command::new("git")
4346 .current_dir(&source)
4347 .args(["add", "."])
4348 .status()
4349 .expect("git add")
4350 .success());
4351 assert!(Command::new("git")
4352 .current_dir(&source)
4353 .args([
4354 "-c",
4355 "user.name=AFT Tests",
4356 "-c",
4357 "user.email=aft-tests@example.com",
4358 "commit",
4359 "-m",
4360 "initial",
4361 ])
4362 .status()
4363 .expect("git commit")
4364 .success());
4365
4366 let clone = dir.path().join("clone");
4367 assert!(Command::new("git")
4368 .args(["clone", "--quiet"])
4369 .arg(&source)
4370 .arg(&clone)
4371 .status()
4372 .expect("git clone")
4373 .success());
4374
4375 let source_key = artifact_cache_key(&source);
4376 let clone_key = artifact_cache_key(&clone);
4377
4378 assert_eq!(source_key.len(), 16);
4379 assert_eq!(clone_key.len(), 16);
4380 assert_eq!(source_key, clone_key);
4382 }
4383
4384 #[test]
4385 fn git_head_unchanged_picks_up_local_edits() {
4386 let dir = tempfile::tempdir().expect("create temp dir");
4387 let project = dir.path().join("repo");
4388 fs::create_dir_all(&project).expect("create repo dir");
4389 let file = project.join("tracked.txt");
4390 fs::write(&file, "oldtoken\n").expect("write file");
4391 assert!(Command::new("git")
4392 .current_dir(&project)
4393 .arg("init")
4394 .status()
4395 .unwrap()
4396 .success());
4397 assert!(Command::new("git")
4398 .current_dir(&project)
4399 .args(["add", "."])
4400 .status()
4401 .unwrap()
4402 .success());
4403 assert!(Command::new("git")
4404 .current_dir(&project)
4405 .args([
4406 "-c",
4407 "user.name=AFT Tests",
4408 "-c",
4409 "user.email=aft-tests@example.com",
4410 "commit",
4411 "-m",
4412 "initial"
4413 ])
4414 .status()
4415 .unwrap()
4416 .success());
4417 let head = current_git_head(&project);
4418 let mut baseline = SearchIndex::build(&project);
4419 baseline.git_head = head.clone();
4420 fs::write(&file, "newtoken\n").expect("edit tracked file");
4421
4422 let refreshed = SearchIndex::rebuild_or_refresh(
4423 &project,
4424 DEFAULT_MAX_FILE_SIZE,
4425 head,
4426 Some(baseline),
4427 None,
4428 );
4429 let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
4430
4431 assert_eq!(result.total_matches, 1);
4432 }
4433
4434 #[test]
4435 fn non_git_project_reuses_cache_when_files_unchanged() {
4436 let dir = tempfile::tempdir().expect("create temp dir");
4437 let project = dir.path().join("project");
4438 fs::create_dir_all(&project).expect("create project dir");
4439 fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
4440 let baseline = SearchIndex::build(&project);
4441 let baseline_file_count = baseline.file_count();
4442
4443 let refreshed = SearchIndex::rebuild_or_refresh(
4444 &project,
4445 DEFAULT_MAX_FILE_SIZE,
4446 None,
4447 Some(baseline),
4448 None,
4449 );
4450
4451 assert_eq!(refreshed.file_count(), baseline_file_count);
4452 assert_eq!(
4453 refreshed
4454 .grep("unchangedtoken", true, &[], &[], &project, 10)
4455 .total_matches,
4456 1
4457 );
4458 }
4459
4460 #[test]
4461 fn resolve_search_scope_disables_index_for_external_path() {
4462 let dir = tempfile::tempdir().expect("create temp dir");
4463 let project = dir.path().join("project");
4464 let outside = dir.path().join("outside");
4465 fs::create_dir_all(&project).expect("create project dir");
4466 fs::create_dir_all(&outside).expect("create outside dir");
4467
4468 let scope = resolve_search_scope(&project, outside.to_str());
4469
4470 assert_eq!(
4471 scope.root,
4472 fs::canonicalize(&outside).expect("canonicalize outside")
4473 );
4474 assert!(!scope.use_index);
4475 }
4476
4477 #[test]
4478 fn grep_filters_matches_to_search_root() {
4479 let dir = tempfile::tempdir().expect("create temp dir");
4480 let project = dir.path().join("project");
4481 let src = project.join("src");
4482 let docs = project.join("docs");
4483 fs::create_dir_all(&src).expect("create src dir");
4484 fs::create_dir_all(&docs).expect("create docs dir");
4485 fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
4486 fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
4487
4488 let index = SearchIndex::build(&project);
4489 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
4490
4491 assert_eq!(result.files_searched, 1);
4492 assert_eq!(result.files_with_matches, 1);
4493 assert_eq!(result.matches.len(), 1);
4494 let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
4496 assert_eq!(result.matches[0].file, expected);
4497 }
4498
4499 #[test]
4500 fn grep_deduplicates_multiple_matches_on_same_line() {
4501 let dir = tempfile::tempdir().expect("create temp dir");
4502 let project = dir.path().join("project");
4503 let src = project.join("src");
4504 fs::create_dir_all(&src).expect("create src dir");
4505 fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
4506
4507 let index = SearchIndex::build(&project);
4508 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
4509
4510 assert_eq!(result.total_matches, 1);
4511 assert_eq!(result.matches.len(), 1);
4512 }
4513
4514 #[test]
4515 fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
4516 let dir = tempfile::tempdir().expect("create temp dir");
4517 let project = dir.path().join("project");
4518 fs::create_dir_all(&project).expect("create project dir");
4519 let file = project.join("unicode.txt");
4520 fs::write(&file, "äbc\n").expect("write unicode file");
4521
4522 let index = SearchIndex::build(&project);
4523 let result = index.grep("Äbc", false, &[], &[], &project, 10);
4524
4525 assert_eq!(result.total_matches, 1);
4526 assert_eq!(result.matches.len(), 1);
4527 assert_eq!(
4528 result.matches[0].file,
4529 fs::canonicalize(file).expect("canonicalize unicode file")
4530 );
4531 }
4532
4533 #[test]
4534 fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
4535 let dir = tempfile::tempdir().expect("create temp dir");
4536 let project = dir.path().join("project");
4537 fs::create_dir_all(&project).expect("create project dir");
4538 let file = project.join("tokens.txt");
4539 let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
4540 fs::write(&file, "alpha").expect("write original file");
4541 filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
4542
4543 let baseline = SearchIndex::build(&project);
4544 fs::write(&file, "bravo").expect("write same-size edit");
4545 filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
4546
4547 let refreshed = SearchIndex::rebuild_or_refresh(
4548 &project,
4549 DEFAULT_MAX_FILE_SIZE,
4550 None,
4551 Some(baseline),
4552 None,
4553 );
4554 let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
4555 let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
4556 let refreshed_id = *refreshed
4557 .path_to_id
4558 .get(&canonical_file)
4559 .expect("file remains indexed");
4560
4561 assert_eq!(result.total_matches, 1);
4562 assert!(refreshed
4563 .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
4564 .contains(&refreshed_id));
4565 assert!(!refreshed
4566 .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
4567 .contains(&refreshed_id));
4568 }
4569
4570 #[test]
4571 fn grep_reports_total_matches_before_truncation() {
4572 let dir = tempfile::tempdir().expect("create temp dir");
4573 let project = dir.path().join("project");
4574 let src = project.join("src");
4575 fs::create_dir_all(&src).expect("create src dir");
4576 fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
4577
4578 let index = SearchIndex::build(&project);
4579 let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
4580
4581 assert_eq!(result.total_matches, 2);
4582 assert_eq!(result.matches.len(), 1);
4583 assert!(result.truncated);
4584 }
4585
4586 #[test]
4587 fn glob_filters_results_to_search_root() {
4588 let dir = tempfile::tempdir().expect("create temp dir");
4589 let project = dir.path().join("project");
4590 let src = project.join("src");
4591 let scripts = project.join("scripts");
4592 fs::create_dir_all(&src).expect("create src dir");
4593 fs::create_dir_all(&scripts).expect("create scripts dir");
4594 fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
4595 fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
4596
4597 let index = SearchIndex::build(&project);
4598 let files = index.glob("**/*.rs", &src);
4599
4600 assert_eq!(
4601 files,
4602 vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
4603 );
4604 }
4605
4606 #[test]
4607 fn glob_includes_hidden_and_binary_files() {
4608 let dir = tempfile::tempdir().expect("create temp dir");
4609 let project = dir.path().join("project");
4610 let hidden_dir = project.join(".hidden");
4611 fs::create_dir_all(&hidden_dir).expect("create hidden dir");
4612 let hidden_file = hidden_dir.join("data.bin");
4613 fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
4614
4615 let index = SearchIndex::build(&project);
4616 let files = index.glob("**/*.bin", &project);
4617
4618 assert_eq!(
4619 files,
4620 vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
4621 );
4622 }
4623
4624 #[test]
4625 fn read_from_disk_rejects_invalid_nanos() {
4626 let dir = tempfile::tempdir().expect("create temp dir");
4627 let cache_dir = dir.path().join("cache");
4628 fs::create_dir_all(&cache_dir).expect("create cache dir");
4629
4630 let mut postings = Vec::new();
4631 postings.extend_from_slice(INDEX_MAGIC);
4632 postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4633 postings.extend_from_slice(&0u32.to_le_bytes());
4634 postings.extend_from_slice(&1u32.to_le_bytes());
4635 postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
4636 postings.extend_from_slice(&1u32.to_le_bytes());
4637 postings.extend_from_slice(b"/");
4638 postings.push(0u8);
4639 postings.extend_from_slice(&1u32.to_le_bytes());
4640 postings.extend_from_slice(&0u64.to_le_bytes());
4641 postings.extend_from_slice(&0u64.to_le_bytes());
4642 postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
4643 postings.extend_from_slice(b"a");
4644 postings.extend_from_slice(&0u64.to_le_bytes());
4645
4646 let mut lookup = Vec::new();
4647 lookup.extend_from_slice(LOOKUP_MAGIC);
4648 lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4649 lookup.extend_from_slice(&0u32.to_le_bytes());
4650
4651 let postings_checksum = crc32fast::hash(&postings);
4652 postings.extend_from_slice(&postings_checksum.to_le_bytes());
4653 let lookup_checksum = crc32fast::hash(&lookup);
4654 lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
4655 let mut cache = Vec::new();
4656 cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
4657 cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
4658 cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
4659 cache.extend_from_slice(&postings);
4660 cache.extend_from_slice(&lookup);
4661 fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
4662
4663 assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
4664 }
4665
4666 #[test]
4667 fn parallel_cold_build_matches_serial_index() {
4668 let dir = tempfile::tempdir().expect("create temp dir");
4669 let project = dir.path().join("project");
4670 for index in 0..80 {
4671 let sub = project.join(format!("pkg_{index:03}"));
4672 fs::create_dir_all(&sub).expect("create subdir");
4673 fs::write(
4674 sub.join("lib.rs"),
4675 format!(
4676 "pub fn unique_marker_{index}() {{ println!(\"aft_perf_marker_{index}\"); }}\n"
4677 ),
4678 )
4679 .expect("write lib");
4680 }
4681
4682 let serial = SearchIndex::build_with_limit_serial(&project, DEFAULT_MAX_FILE_SIZE);
4683 let parallel = SearchIndex::build_with_limit(&project, DEFAULT_MAX_FILE_SIZE);
4684
4685 assert_eq!(serial.file_count(), parallel.file_count());
4686 assert_eq!(serial.trigram_count(), parallel.trigram_count());
4687 assert_eq!(serial.path_to_id.len(), parallel.path_to_id.len());
4688 assert_eq!(
4689 serial.file_trigram_count.as_ref(),
4690 parallel.file_trigram_count.as_ref()
4691 );
4692 for (path, id) in serial.path_to_id.iter() {
4693 assert_eq!(parallel.path_to_id.get(path), Some(id));
4694 }
4695 for (serial_file, parallel_file) in serial.files.iter().zip(parallel.files.iter()) {
4696 assert_eq!(serial_file.path, parallel_file.path);
4697 assert_eq!(serial_file.size, parallel_file.size);
4698 assert_eq!(serial_file.modified, parallel_file.modified);
4699 assert_eq!(serial_file.content_hash, parallel_file.content_hash);
4700 }
4701
4702 let serial_grep = serial.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
4703 let parallel_grep = parallel.grep("aft_perf_marker_17", true, &[], &[], &project, 10);
4704 assert_eq!(serial_grep.matches, parallel_grep.matches);
4705 assert_eq!(serial_grep.total_matches, parallel_grep.total_matches);
4706 assert_eq!(serial_grep.files_searched, parallel_grep.files_searched);
4707 assert_eq!(
4708 serial_grep.files_with_matches,
4709 parallel_grep.files_with_matches
4710 );
4711 }
4712
4713 #[test]
4714 fn ignore_rule_discovery_respects_gitignore() {
4715 let dir = tempfile::tempdir().expect("create temp dir");
4716 let project = dir.path().join("project");
4717 fs::create_dir_all(project.join("src")).expect("mkdir src");
4718 fs::write(project.join("src/.gitignore"), "data/\n").expect("write gitignore");
4719 let data = project.join("src/data");
4720 fs::create_dir_all(&data).expect("mkdir data");
4721 for index in 0..200 {
4722 fs::create_dir_all(data.join(format!("d{index}"))).expect("mkdir nested");
4723 fs::write(data.join(format!("d{index}/f.rs")), "fn ignored() {}\n")
4724 .expect("write ignored file");
4725 }
4726
4727 Command::new("git")
4728 .arg("init")
4729 .arg(&project)
4730 .status()
4731 .expect("git init");
4732 for args in [
4733 ["config", "user.email", "aft@example.invalid"],
4734 ["config", "user.name", "AFT Test"],
4735 ] {
4736 Command::new("git")
4737 .arg("-C")
4738 .arg(&project)
4739 .args(args)
4740 .status()
4741 .expect("git config");
4742 }
4743 Command::new("git")
4744 .arg("-C")
4745 .arg(&project)
4746 .args(["add", "."])
4747 .status()
4748 .expect("git add");
4749 Command::new("git")
4750 .arg("-C")
4751 .arg(&project)
4752 .args(["commit", "-m", "initial"])
4753 .status()
4754 .expect("git commit");
4755
4756 let legacy_dirs = count_ignore_rule_discovery_dirs_legacy_stack(&project);
4757 let walker_dirs = count_ignore_rule_discovery_dirs(&project);
4758 assert!(
4759 legacy_dirs > walker_dirs,
4760 "legacy stack should descend into gitignored data/ (legacy={legacy_dirs}, walker={walker_dirs})"
4761 );
4762 assert!(
4763 walker_dirs < 50,
4764 "ignore walker should not descend deeply into ignored tree (dirs={walker_dirs})"
4765 );
4766 }
4767
4768 #[test]
4783 fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
4784 let dir = tempfile::tempdir().expect("create tempdir");
4788 let mut paths: Vec<PathBuf> = Vec::new();
4789 for i in 0..30 {
4790 let path = if i % 2 == 0 {
4792 let p = dir.path().join(format!("real-{i}.rs"));
4793 fs::write(&p, format!("// {i}\n")).expect("write");
4794 p
4795 } else {
4796 dir.path().join(format!("missing-{i}.rs"))
4797 };
4798 paths.push(path);
4799 }
4800
4801 for _ in 0..50 {
4804 let mut copy = paths.clone();
4805 sort_paths_by_mtime_desc(&mut copy);
4806 assert_eq!(copy.len(), paths.len());
4807 }
4808 }
4809
4810 #[test]
4816 fn uncapped_indexed_grep_over_many_files_is_not_engine_capped() {
4817 let dir = tempfile::tempdir().expect("create tempdir");
4818 for i in 0..40 {
4821 fs::write(
4822 dir.path().join(format!("file-{i}.rs")),
4823 format!("fn unique_marker_{i}() {{ let _ = \"needle_token\"; }}\n"),
4824 )
4825 .expect("write");
4826 }
4827 let index = SearchIndex::build_with_limit(dir.path(), DEFAULT_MAX_FILE_SIZE);
4828 let result = index.grep("needle_token", false, &[], &[], dir.path(), 1000);
4829 assert!(
4830 result.matches.len() >= 40,
4831 "expected a match per file, got {}",
4832 result.matches.len()
4833 );
4834 assert!(
4835 !result.engine_capped,
4836 "an uncapped grep over >10 files must not report engine_capped"
4837 );
4838 assert!(!result.truncated, "uncapped grep must not be truncated");
4839 }
4840
4841 #[test]
4845 fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
4846 let dir = tempfile::tempdir().expect("create tempdir");
4847 let mut matches: Vec<GrepMatch> = Vec::new();
4848 for i in 0..30 {
4849 let file = if i % 2 == 0 {
4850 let p = dir.path().join(format!("real-{i}.rs"));
4851 fs::write(&p, format!("// {i}\n")).expect("write");
4852 p
4853 } else {
4854 dir.path().join(format!("missing-{i}.rs"))
4855 };
4856 matches.push(GrepMatch {
4857 file,
4858 line: u32::try_from(i).unwrap_or(0),
4859 column: 0,
4860 line_text: format!("match {i}"),
4861 match_text: format!("match {i}"),
4862 });
4863 }
4864
4865 for _ in 0..50 {
4866 let mut copy = matches.clone();
4867 sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
4868 assert_eq!(copy.len(), matches.len());
4869 }
4870 }
4871}