1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
22const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
23const CACHE_MAGIC: u32 = 0x3144_4958; const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
25const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
26const INDEX_VERSION: u32 = 4;
27const PREVIEW_BYTES: usize = 8 * 1024;
28const EOF_SENTINEL: u8 = 0;
29const MAX_ENTRIES: usize = 10_000_000;
30const MIN_FILE_ENTRY_BYTES: usize = 57;
31const LOOKUP_ENTRY_BYTES: usize = 16;
32const POSTING_BYTES: usize = 6;
33static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
35pub struct CacheLock {
36 _guard: fs_lock::LockGuard,
37}
38
39impl CacheLock {
40 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41 fs::create_dir_all(cache_dir)?;
42 let path = cache_dir.join("cache.lock");
43 let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44 .lock()
45 .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46 fs_lock::try_acquire(&path, Duration::from_secs(2))
47 .map(|guard| Self { _guard: guard })
48 .map_err(|error| match error {
49 fs_lock::AcquireError::Timeout => {
50 std::io::Error::other("timed out acquiring search cache lock")
51 }
52 fs_lock::AcquireError::Io(error) => error,
53 })
54 }
55}
56
57#[derive(Clone, Debug)]
58pub struct SearchIndex {
59 pub postings: HashMap<u32, Vec<Posting>>,
60 pub files: Vec<FileEntry>,
61 pub path_to_id: HashMap<PathBuf, u32>,
62 pub ready: bool,
63 project_root: PathBuf,
64 git_head: Option<String>,
65 max_file_size: u64,
66 ignore_rules_fingerprint: String,
67 pub file_trigrams: HashMap<u32, Vec<u32>>,
68 unindexed_files: HashSet<u32>,
69}
70
71#[derive(Clone, Debug, Default)]
72pub struct LexicalRankResult {
73 pub files: Vec<(PathBuf, f32)>,
74 pub engine_capped: bool,
75}
76
77impl SearchIndex {
78 pub fn file_count(&self) -> usize {
80 self.files.len()
81 }
82
83 pub fn trigram_count(&self) -> usize {
85 self.postings.len()
86 }
87
88 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
90 query_trigrams_from_tokens(tokens)
91 }
92
93 pub fn lexical_rank(
95 &self,
96 query_trigrams: &[u32],
97 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
98 max_files: usize,
99 ) -> Vec<(PathBuf, f32)> {
100 self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
101 .files
102 }
103
104 pub fn lexical_rank_with_stats(
107 &self,
108 query_trigrams: &[u32],
109 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
110 max_files: usize,
111 ) -> LexicalRankResult {
112 if query_trigrams.is_empty() || max_files == 0 {
113 return LexicalRankResult::default();
114 }
115
116 let mut non_zero: Vec<(u32, usize)> = query_trigrams
117 .iter()
118 .filter_map(|trigram| {
119 let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
120 (posting_count > 0).then_some((*trigram, posting_count))
121 })
122 .collect();
123 if non_zero.is_empty() {
124 return LexicalRankResult::default();
125 }
126
127 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
128 let selected_count = non_zero.len().min(3);
129 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
130
131 let mut candidate_ids = BTreeSet::new();
132 for (trigram, _) in non_zero.iter().take(selected_count) {
133 if let Some(postings) = self.postings.get(trigram) {
134 for posting in postings {
135 if self.is_active_file(posting.file_id) {
136 candidate_ids.insert(posting.file_id);
137 }
138 }
139 }
140 }
141 let pre_filter_candidate_count = candidate_ids.len();
142 let engine_capped = pre_filter_candidate_count > candidate_cap;
143 let filtered_candidates = candidate_ids
144 .into_iter()
145 .filter_map(|file_id| {
146 self.files
147 .get(file_id as usize)
148 .map(|entry| (file_id, entry))
149 })
150 .filter(|(_, entry)| {
151 if let Some(filter) = candidate_filter {
152 filter(&entry.path)
153 } else {
154 true
155 }
156 })
157 .collect::<Vec<_>>();
158
159 let mut ranked = Vec::new();
160 for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
161 let score = lexical_score(self, query_trigrams, file_id);
162 if score > 0.0 {
163 ranked.push((entry.path.clone(), score));
164 }
165 }
166
167 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
168 ranked.truncate(max_files);
169 LexicalRankResult {
170 files: ranked,
171 engine_capped,
172 }
173 }
174}
175
176#[derive(Clone, Debug, PartialEq, Eq)]
177pub struct Posting {
178 pub file_id: u32,
179 pub next_mask: u8,
180 pub loc_mask: u8,
181}
182
183#[derive(Clone, Debug)]
184pub struct FileEntry {
185 pub path: PathBuf,
186 pub size: u64,
187 pub modified: SystemTime,
188 pub content_hash: blake3::Hash,
189}
190
191#[derive(Clone, Debug, PartialEq, Eq)]
192pub struct GrepMatch {
193 pub file: PathBuf,
194 pub line: u32,
195 pub column: u32,
196 pub line_text: String,
197 pub match_text: String,
198}
199
200#[derive(Clone, Debug)]
201pub struct GrepResult {
202 pub matches: Vec<GrepMatch>,
203 pub total_matches: usize,
204 pub files_searched: usize,
205 pub files_with_matches: usize,
206 pub index_status: IndexStatus,
207 pub truncated: bool,
208 pub fully_degraded: bool,
209 pub engine_capped: bool,
210}
211
212#[derive(Clone, Copy, Debug, PartialEq, Eq)]
213pub enum IndexStatus {
214 Ready,
215 Building,
216 Fallback,
217 Disabled,
218}
219
220impl IndexStatus {
221 pub fn as_str(&self) -> &'static str {
222 match self {
223 IndexStatus::Ready => "Ready",
224 IndexStatus::Building => "Building",
225 IndexStatus::Fallback => "Fallback",
226 IndexStatus::Disabled => "Disabled",
227 }
228 }
229}
230
231#[derive(Clone, Debug, Default)]
232pub struct RegexQuery {
233 pub and_trigrams: Vec<u32>,
234 pub or_groups: Vec<Vec<u32>>,
235 pub(crate) and_filters: HashMap<u32, PostingFilter>,
236 pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
237}
238
239#[derive(Clone, Copy, Debug, Default)]
240pub(crate) struct PostingFilter {
241 next_mask: u8,
242 loc_mask: u8,
243}
244
245#[derive(Clone, Debug, Default)]
246struct QueryBuild {
247 and_runs: Vec<Vec<u8>>,
248 or_groups: Vec<Vec<Vec<u8>>>,
249}
250
251#[derive(Clone, Debug, Default)]
252pub(crate) struct PathFilters {
253 includes: Option<GlobSet>,
254 excludes: Option<GlobSet>,
255}
256
257#[derive(Clone, Debug)]
258pub(crate) struct SearchScope {
259 pub root: PathBuf,
260 pub use_index: bool,
261}
262
263#[derive(Clone, Debug)]
264struct SharedGrepMatch {
265 file: Arc<PathBuf>,
266 line: u32,
267 column: u32,
268 line_text: String,
269 match_text: String,
270}
271
272#[derive(Clone, Debug)]
273enum SearchMatcher {
274 Literal(LiteralSearch),
275 Regex(Regex),
276}
277
278impl SearchIndex {
279 pub fn new() -> Self {
280 SearchIndex {
281 postings: HashMap::new(),
282 files: Vec::new(),
283 path_to_id: HashMap::new(),
284 ready: false,
285 project_root: PathBuf::new(),
286 git_head: None,
287 max_file_size: DEFAULT_MAX_FILE_SIZE,
288 ignore_rules_fingerprint: String::new(),
289 file_trigrams: HashMap::new(),
290 unindexed_files: HashSet::new(),
291 }
292 }
293
294 pub fn build(root: &Path) -> Self {
295 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
296 }
297
298 pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
299 let started = std::time::Instant::now();
300 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
301 let mut index = SearchIndex {
302 project_root: project_root.clone(),
303 max_file_size,
304 ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
305 ..SearchIndex::new()
306 };
307
308 let filters = PathFilters::default();
309 let mut indexed = 0usize;
310 for path in walk_project_files(&project_root, &filters) {
311 index.update_file(&path);
312 indexed += 1;
313 }
314
315 index.git_head = current_git_head(&project_root);
316 index.ready = true;
317 crate::slog_info!(
318 "search index cold build: {} files, {} trigrams, {} ms",
319 indexed,
320 index.postings.len(),
321 started.elapsed().as_millis()
322 );
323 index
324 }
325
326 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
327 self.remove_file(path);
328
329 let file_id = match self.allocate_file_id(path, content.len() as u64) {
330 Some(file_id) => file_id,
331 None => return,
332 };
333 if let Some(file) = self.files.get_mut(file_id as usize) {
334 file.content_hash = cache_freshness::hash_bytes(content);
335 }
336
337 let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
338 for (trigram, next_char, position) in extract_trigrams(content) {
339 let entry = trigram_map.entry(trigram).or_default();
340 entry.next_mask |= mask_for_next_char(next_char);
341 entry.loc_mask |= mask_for_position(position);
342 }
343
344 let mut file_trigrams = Vec::with_capacity(trigram_map.len());
345 for (trigram, filter) in trigram_map {
346 let postings = self.postings.entry(trigram).or_default();
347 postings.push(Posting {
348 file_id,
349 next_mask: filter.next_mask,
350 loc_mask: filter.loc_mask,
351 });
352 if postings.len() > 1
356 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
357 {
358 postings.sort_unstable_by_key(|p| p.file_id);
359 }
360 file_trigrams.push(trigram);
361 }
362
363 self.file_trigrams.insert(file_id, file_trigrams);
364 self.unindexed_files.remove(&file_id);
365 }
366
367 pub fn remove_file(&mut self, path: &Path) {
368 let canonical_path = canonicalize_existing_or_deleted_path(path);
369 let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
370 file_id
371 } else if canonical_path.as_path() != path {
372 let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
373 return;
374 };
375 file_id
376 } else {
377 return;
378 };
379
380 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
381 for trigram in trigrams {
382 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
383 postings.retain(|posting| posting.file_id != file_id);
384 postings.is_empty()
385 } else {
386 false
387 };
388
389 if should_remove {
390 self.postings.remove(&trigram);
391 }
392 }
393 }
394
395 self.unindexed_files.remove(&file_id);
396 if let Some(file) = self.files.get_mut(file_id as usize) {
397 file.path = PathBuf::new();
398 file.size = 0;
399 file.modified = UNIX_EPOCH;
400 file.content_hash = cache_freshness::zero_hash();
401 }
402 }
403
404 pub fn update_file(&mut self, path: &Path) {
405 self.remove_file(path);
406
407 let metadata = match fs::metadata(path) {
408 Ok(metadata) if metadata.is_file() => metadata,
409 _ => return,
410 };
411
412 if is_binary_path(path, metadata.len()) {
413 self.track_unindexed_file(path, &metadata);
414 return;
415 }
416
417 if metadata.len() > self.max_file_size {
418 self.track_unindexed_file(path, &metadata);
419 return;
420 }
421
422 let content = match fs::read(path) {
423 Ok(content) => content,
424 Err(_) => return,
425 };
426
427 if is_binary_bytes(&content) {
428 self.track_unindexed_file(path, &metadata);
429 return;
430 }
431
432 self.index_file(path, &content);
433 }
434
435 pub fn grep(
436 &self,
437 pattern: &str,
438 case_sensitive: bool,
439 include: &[String],
440 exclude: &[String],
441 search_root: &Path,
442 max_results: usize,
443 ) -> GrepResult {
444 match pattern_compile::compile(
445 pattern,
446 CompileOpts {
447 case_insensitive: !case_sensitive,
448 ..CompileOpts::default()
449 },
450 ) {
451 CompileResult::Ok(compiled) => {
452 self.search_grep(&compiled, include, exclude, search_root, max_results)
453 }
454 CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
455 self.empty_grep_result()
456 }
457 }
458 }
459
460 pub fn search_grep(
461 &self,
462 pattern: &CompiledPattern,
463 include: &[String],
464 exclude: &[String],
465 search_root: &Path,
466 max_results: usize,
467 ) -> GrepResult {
468 let matcher = match pattern {
469 CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
470 CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
471 };
472
473 let filters = match build_path_filters(include, exclude) {
474 Ok(filters) => filters,
475 Err(_) => PathFilters::default(),
476 };
477 let search_root = canonicalize_or_normalize(search_root);
478
479 let raw_pattern = pattern.raw_pattern_for_trigrams();
480 let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
481 RegexQuery::default()
482 } else {
483 decompose_regex(&raw_pattern)
484 };
485 let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
486 let candidate_ids = self.candidates(&query);
487
488 let candidate_files: Vec<&FileEntry> = candidate_ids
489 .into_iter()
490 .filter_map(|file_id| self.files.get(file_id as usize))
491 .filter(|file| !file.path.as_os_str().is_empty())
492 .filter(|file| is_within_search_root(&search_root, &file.path))
493 .filter(|file| filters.matches(&self.project_root, &file.path))
494 .collect();
495
496 let total_matches = AtomicUsize::new(0);
497 let files_searched = AtomicUsize::new(0);
498 let files_with_matches = AtomicUsize::new(0);
499 let truncated = AtomicBool::new(false);
500 let engine_capped = AtomicBool::new(false);
501 let stop_after = max_results.saturating_mul(2);
502
503 let mut matches = if candidate_files.len() > 10 {
504 candidate_files
505 .par_iter()
506 .map(|file| {
507 search_candidate_file(
508 file,
509 &matcher,
510 max_results,
511 stop_after,
512 &total_matches,
513 &files_searched,
514 &files_with_matches,
515 &truncated,
516 &engine_capped,
517 )
518 })
519 .reduce(Vec::new, |mut left, mut right| {
520 left.append(&mut right);
521 left
522 })
523 } else {
524 let mut matches = Vec::new();
525 for file in candidate_files {
526 matches.extend(search_candidate_file(
527 file,
528 &matcher,
529 max_results,
530 stop_after,
531 &total_matches,
532 &files_searched,
533 &files_with_matches,
534 &truncated,
535 &engine_capped,
536 ));
537
538 if should_stop_search(&truncated, &total_matches, stop_after) {
539 engine_capped.store(true, Ordering::Relaxed);
540 break;
541 }
542 }
543 matches
544 };
545
546 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
547 self.path_to_id
548 .get(path)
549 .and_then(|file_id| self.files.get(*file_id as usize))
550 .map(|file| file.modified)
551 });
552
553 let matches = matches
554 .into_iter()
555 .map(|matched| GrepMatch {
556 file: matched.file.as_ref().clone(),
557 line: matched.line,
558 column: matched.column,
559 line_text: matched.line_text,
560 match_text: matched.match_text,
561 })
562 .collect();
563
564 GrepResult {
565 total_matches: total_matches.load(Ordering::Relaxed),
566 matches,
567 files_searched: files_searched.load(Ordering::Relaxed),
568 files_with_matches: files_with_matches.load(Ordering::Relaxed),
569 index_status: if self.ready {
570 IndexStatus::Ready
571 } else {
572 IndexStatus::Building
573 },
574 truncated: truncated.load(Ordering::Relaxed),
575 fully_degraded,
576 engine_capped: engine_capped.load(Ordering::Relaxed),
577 }
578 }
579
580 fn empty_grep_result(&self) -> GrepResult {
581 GrepResult {
582 matches: Vec::new(),
583 total_matches: 0,
584 files_searched: 0,
585 files_with_matches: 0,
586 index_status: if self.ready {
587 IndexStatus::Ready
588 } else {
589 IndexStatus::Building
590 },
591 truncated: false,
592 fully_degraded: false,
593 engine_capped: false,
594 }
595 }
596
597 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
598 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
599 Ok(filters) => filters,
600 Err(_) => return Vec::new(),
601 };
602 let search_root = canonicalize_or_normalize(search_root);
603 let mut entries = self
604 .files
605 .iter()
606 .filter(|file| !file.path.as_os_str().is_empty())
607 .filter(|file| is_within_search_root(&search_root, &file.path))
608 .filter(|file| filters.matches(&self.project_root, &file.path))
609 .map(|file| (file.path.clone(), file.modified))
610 .collect::<Vec<_>>();
611
612 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
613 right_mtime
614 .cmp(left_mtime)
615 .then_with(|| left_path.cmp(right_path))
616 });
617
618 entries.into_iter().map(|(path, _)| path).collect()
619 }
620
621 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
622 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
623 return self.active_file_ids();
624 }
625
626 let mut and_trigrams = query.and_trigrams.clone();
627 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
628
629 let mut current: Option<Vec<u32>> = None;
630
631 for trigram in and_trigrams {
632 let filter = query.and_filters.get(&trigram).copied();
633 let matches = self.postings_for_trigram(trigram, filter);
634 current = Some(match current.take() {
635 Some(existing) => intersect_sorted_ids(&existing, &matches),
636 None => matches,
637 });
638
639 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
640 break;
641 }
642 }
643
644 let mut current = current.unwrap_or_else(|| self.active_file_ids());
645
646 for (index, group) in query.or_groups.iter().enumerate() {
647 let mut group_matches = Vec::new();
648 let filters = query.or_filters.get(index);
649
650 for trigram in group {
651 let filter = filters.and_then(|filters| filters.get(trigram).copied());
652 let matches = self.postings_for_trigram(*trigram, filter);
653 if group_matches.is_empty() {
654 group_matches = matches;
655 } else {
656 group_matches = union_sorted_ids(&group_matches, &matches);
657 }
658 }
659
660 current = intersect_sorted_ids(¤t, &group_matches);
661 if current.is_empty() {
662 break;
663 }
664 }
665
666 let mut unindexed = self
667 .unindexed_files
668 .iter()
669 .copied()
670 .filter(|file_id| self.is_active_file(*file_id))
671 .collect::<Vec<_>>();
672 if !unindexed.is_empty() {
673 unindexed.sort_unstable();
674 current = union_sorted_ids(¤t, &unindexed);
675 }
676
677 current
678 }
679
680 pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
681 if fs::create_dir_all(cache_dir).is_err() {
682 return;
683 }
684
685 let cache_path = cache_dir.join("cache.bin");
686 let tmp_cache = cache_dir.join(format!(
687 "cache.bin.tmp.{}.{}",
688 std::process::id(),
689 SystemTime::now()
690 .duration_since(UNIX_EPOCH)
691 .unwrap_or(Duration::ZERO)
692 .as_nanos()
693 ));
694
695 let active_ids = self.active_file_ids();
696 let mut id_map = HashMap::new();
697 for (new_id, old_id) in active_ids.iter().enumerate() {
698 let Ok(new_id_u32) = u32::try_from(new_id) else {
699 return;
700 };
701 id_map.insert(*old_id, new_id_u32);
702 }
703
704 let write_result = (|| -> std::io::Result<()> {
705 let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
706
707 postings_writer.write_all(INDEX_MAGIC)?;
708 write_u32(&mut postings_writer, INDEX_VERSION)?;
709
710 let head = git_head.unwrap_or_default();
711 let root = self.project_root.to_string_lossy();
712 let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
713 ignore_rules_fingerprint(&self.project_root)
714 } else {
715 self.ignore_rules_fingerprint.clone()
716 };
717 let head_len = u32::try_from(head.len())
718 .map_err(|_| std::io::Error::other("git head too large to cache"))?;
719 let root_len = u32::try_from(root.len())
720 .map_err(|_| std::io::Error::other("project root too large to cache"))?;
721 let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
722 .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
723 let file_count = u32::try_from(active_ids.len())
724 .map_err(|_| std::io::Error::other("too many files to cache"))?;
725
726 write_u32(&mut postings_writer, head_len)?;
727 write_u32(&mut postings_writer, root_len)?;
728 write_u32(&mut postings_writer, ignore_fingerprint_len)?;
729 write_u64(&mut postings_writer, self.max_file_size)?;
730 write_u32(&mut postings_writer, file_count)?;
731 postings_writer.write_all(head.as_bytes())?;
732 postings_writer.write_all(root.as_bytes())?;
733 postings_writer.write_all(ignore_fingerprint.as_bytes())?;
734
735 for old_id in &active_ids {
736 let Some(file) = self.files.get(*old_id as usize) else {
737 return Err(std::io::Error::other("missing file entry for cache write"));
738 };
739 let path =
740 cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
741 std::io::Error::other(format!(
742 "refusing to cache path outside project root: {}",
743 file.path.display()
744 ))
745 })?;
746 let path = path.to_string_lossy();
747 let path_len = u32::try_from(path.len())
748 .map_err(|_| std::io::Error::other("cached path too large"))?;
749 let modified = file
750 .modified
751 .duration_since(UNIX_EPOCH)
752 .unwrap_or(Duration::ZERO);
753 let unindexed = if self.unindexed_files.contains(old_id) {
754 1u8
755 } else {
756 0u8
757 };
758
759 postings_writer.write_all(&[unindexed])?;
760 write_u32(&mut postings_writer, path_len)?;
761 write_u64(&mut postings_writer, file.size)?;
762 write_u64(&mut postings_writer, modified.as_secs())?;
763 write_u32(&mut postings_writer, modified.subsec_nanos())?;
764 postings_writer.write_all(file.content_hash.as_bytes())?;
765 postings_writer.write_all(path.as_bytes())?;
766 }
767
768 let mut lookup_entries = Vec::new();
769 let mut postings_blob = Vec::new();
770 let mut sorted_postings: Vec<_> = self.postings.iter().collect();
771 sorted_postings.sort_by_key(|(trigram, _)| **trigram);
772
773 for (trigram, postings) in sorted_postings {
774 let offset = u64::try_from(postings_blob.len())
775 .map_err(|_| std::io::Error::other("postings blob too large"))?;
776 let mut count = 0u32;
777
778 for posting in postings {
779 let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
780 continue;
781 };
782
783 postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
784 postings_blob.push(posting.next_mask);
785 postings_blob.push(posting.loc_mask);
786 count = count.saturating_add(1);
787 }
788
789 if count > 0 {
790 lookup_entries.push((*trigram, offset, count));
791 }
792 }
793
794 write_u64(
795 &mut postings_writer,
796 u64::try_from(postings_blob.len())
797 .map_err(|_| std::io::Error::other("postings blob too large"))?,
798 )?;
799 postings_writer.write_all(&postings_blob)?;
800 postings_writer.flush()?;
801 let mut postings_blob_file = postings_writer
802 .into_inner()
803 .map_err(|error| std::io::Error::other(error.to_string()))?
804 .into_inner();
805 let checksum = crc32fast::hash(&postings_blob_file);
806 postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
807
808 let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
809 let entry_count = u32::try_from(lookup_entries.len())
810 .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
811
812 lookup_writer.write_all(LOOKUP_MAGIC)?;
813 write_u32(&mut lookup_writer, INDEX_VERSION)?;
814 write_u32(&mut lookup_writer, entry_count)?;
815
816 for (trigram, offset, count) in lookup_entries {
817 write_u32(&mut lookup_writer, trigram)?;
818 write_u64(&mut lookup_writer, offset)?;
819 write_u32(&mut lookup_writer, count)?;
820 }
821
822 lookup_writer.flush()?;
823 let mut lookup_blob_file = lookup_writer
824 .into_inner()
825 .map_err(|error| std::io::Error::other(error.to_string()))?
826 .into_inner();
827 let checksum = crc32fast::hash(&lookup_blob_file);
828 lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
829
830 let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
831 write_u32(&mut cache_writer, CACHE_MAGIC)?;
832 write_u32(&mut cache_writer, INDEX_VERSION)?;
833 write_u64(
834 &mut cache_writer,
835 u64::try_from(postings_blob_file.len())
836 .map_err(|_| std::io::Error::other("postings section too large"))?,
837 )?;
838 cache_writer.write_all(&postings_blob_file)?;
839 cache_writer.write_all(&lookup_blob_file)?;
840 cache_writer.flush()?;
841 cache_writer.get_ref().sync_all()?;
842 drop(cache_writer);
843 fs::rename(&tmp_cache, &cache_path)?;
844
845 Ok(())
846 })();
847
848 if write_result.is_err() {
849 let _ = fs::remove_file(&tmp_cache);
850 }
851 }
852
853 pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
854 debug_assert!(current_canonical_root.is_absolute());
855 let cache_path = cache_dir.join("cache.bin");
856 let cache_bytes = fs::read(&cache_path).ok()?;
857 if cache_bytes.len() < 16 {
858 return None;
859 }
860 let mut header = Cursor::new(&cache_bytes);
861 if read_u32(&mut header).ok()? != CACHE_MAGIC {
862 return None;
863 }
864 if read_u32(&mut header).ok()? != INDEX_VERSION {
865 return None;
866 }
867 let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
868 let start = usize::try_from(header.position()).ok()?;
869 let postings_end = start.checked_add(postings_len_total)?;
870 if postings_end > cache_bytes.len() {
871 return None;
872 }
873 let postings_bytes = &cache_bytes[start..postings_end];
874 let lookup_bytes = &cache_bytes[postings_end..];
875 let lookup_len_total = lookup_bytes.len();
876 let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
877 let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
878 if postings_len_total < 4 || lookup_len_total < 4 {
879 return None;
880 }
881 verify_crc32_bytes_slice(postings_bytes).ok()?;
882 verify_crc32_bytes_slice(lookup_bytes).ok()?;
883
884 let mut magic = [0u8; 8];
885 postings_reader.read_exact(&mut magic).ok()?;
886 if &magic != INDEX_MAGIC {
887 return None;
888 }
889 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
890 return None;
891 }
892
893 let head_len = read_u32(&mut postings_reader).ok()? as usize;
894 let root_len = read_u32(&mut postings_reader).ok()? as usize;
895 let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
896 let max_file_size = read_u64(&mut postings_reader).ok()?;
897 let file_count = read_u32(&mut postings_reader).ok()? as usize;
898 if file_count > MAX_ENTRIES {
899 return None;
900 }
901 let postings_body_len = postings_len_total.checked_sub(4)?;
902 let lookup_body_len = lookup_len_total.checked_sub(4)?;
903
904 let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
905 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
906 if minimum_file_bytes > remaining_postings {
907 return None;
908 }
909
910 if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
911 return None;
912 }
913 let mut head_bytes = vec![0u8; head_len];
914 postings_reader.read_exact(&mut head_bytes).ok()?;
915 let git_head = String::from_utf8(head_bytes)
916 .ok()
917 .filter(|head| !head.is_empty());
918
919 if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
920 return None;
921 }
922 let mut root_bytes = vec![0u8; root_len];
923 postings_reader.read_exact(&mut root_bytes).ok()?;
924 let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
925 let project_root = current_canonical_root.to_path_buf();
926
927 if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
928 return None;
929 }
930 let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
931 postings_reader
932 .read_exact(&mut ignore_fingerprint_bytes)
933 .ok()?;
934 let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
935 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
936 if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
937 return None;
938 }
939
940 let mut files = Vec::with_capacity(file_count);
941 let mut path_to_id = HashMap::new();
942 let mut unindexed_files = HashSet::new();
943
944 for file_id in 0..file_count {
945 let mut unindexed = [0u8; 1];
946 postings_reader.read_exact(&mut unindexed).ok()?;
947 let path_len = read_u32(&mut postings_reader).ok()? as usize;
948 let size = read_u64(&mut postings_reader).ok()?;
949 let secs = read_u64(&mut postings_reader).ok()?;
950 let nanos = read_u32(&mut postings_reader).ok()?;
951 let mut hash_bytes = [0u8; 32];
952 postings_reader.read_exact(&mut hash_bytes).ok()?;
953 let content_hash = blake3::Hash::from_bytes(hash_bytes);
954 if nanos >= 1_000_000_000 {
955 return None;
956 }
957 if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
958 return None;
959 }
960 let mut path_bytes = vec![0u8; path_len];
961 postings_reader.read_exact(&mut path_bytes).ok()?;
962 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
963 let full_path = cached_path_under_root(&project_root, &relative_path)?;
964 let file_id_u32 = u32::try_from(file_id).ok()?;
965
966 files.push(FileEntry {
967 path: full_path.clone(),
968 size,
969 modified: UNIX_EPOCH + Duration::new(secs, nanos),
970 content_hash,
971 });
972 path_to_id.insert(full_path, file_id_u32);
973 if unindexed[0] == 1 {
974 unindexed_files.insert(file_id_u32);
975 }
976 }
977
978 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
979 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
980 if postings_len > max_postings_bytes {
981 return None;
982 }
983 if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
984 return None;
985 }
986 let mut postings_blob = vec![0u8; postings_len];
987 postings_reader.read_exact(&mut postings_blob).ok()?;
988
989 let mut lookup_magic = [0u8; 8];
990 lookup_reader.read_exact(&mut lookup_magic).ok()?;
991 if &lookup_magic != LOOKUP_MAGIC {
992 return None;
993 }
994 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
995 return None;
996 }
997 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
998 if entry_count > MAX_ENTRIES {
999 return None;
1000 }
1001 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
1002 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
1003 if minimum_lookup_bytes > remaining_lookup {
1004 return None;
1005 }
1006
1007 let mut postings = HashMap::new();
1008 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1009
1010 for _ in 0..entry_count {
1011 let trigram = read_u32(&mut lookup_reader).ok()?;
1012 let offset = read_u64(&mut lookup_reader).ok()? as usize;
1013 let count = read_u32(&mut lookup_reader).ok()? as usize;
1014 if count > MAX_ENTRIES {
1015 return None;
1016 }
1017 let bytes_len = count.checked_mul(POSTING_BYTES)?;
1018 let end = offset.checked_add(bytes_len)?;
1019 if end > postings_blob.len() {
1020 return None;
1021 }
1022
1023 let mut trigram_postings = Vec::with_capacity(count);
1024 for chunk in postings_blob[offset..end].chunks_exact(6) {
1025 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1026 let posting = Posting {
1027 file_id,
1028 next_mask: chunk[4],
1029 loc_mask: chunk[5],
1030 };
1031 trigram_postings.push(posting.clone());
1032 file_trigrams.entry(file_id).or_default().push(trigram);
1033 }
1034 postings.insert(trigram, trigram_postings);
1035 }
1036
1037 Some(SearchIndex {
1038 postings,
1039 files,
1040 path_to_id,
1041 ready: false,
1042 project_root,
1043 git_head,
1044 max_file_size,
1045 ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1046 file_trigrams,
1047 unindexed_files,
1048 })
1049 }
1050
1051 pub fn stored_git_head(&self) -> Option<&str> {
1052 self.git_head.as_deref()
1053 }
1054
1055 pub(crate) fn set_ready(&mut self, ready: bool) {
1056 self.ready = ready;
1057 }
1058
1059 pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
1060 self.git_head = current_head;
1061 verify_file_mtimes(self);
1062 self.ready = true;
1063 }
1064
1065 #[cfg(debug_assertions)]
1066 #[doc(hidden)]
1067 pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
1068 self.verify_against_disk(current_head);
1069 }
1070
1071 pub(crate) fn rebuild_or_refresh(
1072 root: &Path,
1073 max_file_size: u64,
1074 current_head: Option<String>,
1075 baseline: Option<SearchIndex>,
1076 ) -> Self {
1077 if let Some(mut baseline) = baseline {
1078 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1079 baseline.max_file_size = max_file_size;
1080 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1081 if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1082 return SearchIndex::build_with_limit(root, max_file_size);
1083 }
1084 baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1085
1086 if baseline.git_head == current_head || current_head.is_none() {
1087 baseline.git_head = current_head;
1094 verify_file_mtimes(&mut baseline);
1095 baseline.ready = true;
1096 return baseline;
1097 }
1098
1099 if let (Some(previous), Some(current)) =
1100 (baseline.git_head.clone(), current_head.clone())
1101 {
1102 let project_root = baseline.project_root.clone();
1103 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1104 baseline.git_head = Some(current);
1105 verify_file_mtimes(&mut baseline);
1106 baseline.ready = true;
1107 return baseline;
1108 }
1109 }
1110 }
1111
1112 SearchIndex::build_with_limit(root, max_file_size)
1113 }
1114
1115 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1116 let file_id = u32::try_from(self.files.len()).ok()?;
1117 let metadata = fs::metadata(path).ok();
1118 let size = metadata
1119 .as_ref()
1120 .map_or(size_hint, |metadata| metadata.len());
1121 let modified = metadata
1122 .and_then(|metadata| metadata.modified().ok())
1123 .unwrap_or(UNIX_EPOCH);
1124
1125 self.files.push(FileEntry {
1126 path: path.to_path_buf(),
1127 size,
1128 modified,
1129 content_hash: cache_freshness::zero_hash(),
1130 });
1131 self.path_to_id.insert(path.to_path_buf(), file_id);
1132 Some(file_id)
1133 }
1134
1135 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1136 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1137 return;
1138 };
1139 self.unindexed_files.insert(file_id);
1140 self.file_trigrams.insert(file_id, Vec::new());
1141 }
1142
1143 fn active_file_ids(&self) -> Vec<u32> {
1144 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1145 ids.sort_unstable();
1146 ids
1147 }
1148
1149 fn is_active_file(&self, file_id: u32) -> bool {
1150 self.files
1151 .get(file_id as usize)
1152 .map(|file| !file.path.as_os_str().is_empty())
1153 .unwrap_or(false)
1154 }
1155
1156 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1157 let Some(postings) = self.postings.get(&trigram) else {
1158 return Vec::new();
1159 };
1160
1161 let mut matches = Vec::with_capacity(postings.len());
1162
1163 for posting in postings {
1164 if let Some(filter) = filter {
1165 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1168 continue;
1169 }
1170 }
1175 if self.is_active_file(posting.file_id) {
1176 matches.push(posting.file_id);
1177 }
1178 }
1179
1180 matches
1181 }
1182}
1183
1184fn search_candidate_file(
1185 file: &FileEntry,
1186 matcher: &SearchMatcher,
1187 max_results: usize,
1188 stop_after: usize,
1189 total_matches: &AtomicUsize,
1190 files_searched: &AtomicUsize,
1191 files_with_matches: &AtomicUsize,
1192 truncated: &AtomicBool,
1193 engine_capped: &AtomicBool,
1194) -> Vec<SharedGrepMatch> {
1195 if should_stop_search(truncated, total_matches, stop_after) {
1196 engine_capped.store(true, Ordering::Relaxed);
1197 return Vec::new();
1198 }
1199
1200 let content = match read_indexed_file_bytes(&file.path) {
1201 Some(content) => content,
1202 None => return Vec::new(),
1203 };
1204 if is_binary_bytes(&content) {
1211 return Vec::new();
1212 }
1213 files_searched.fetch_add(1, Ordering::Relaxed);
1214
1215 let shared_path = Arc::new(file.path.clone());
1216 let mut matches = Vec::new();
1217 let mut line_starts = None;
1218 let mut seen_lines = HashSet::new();
1219 let mut matched_this_file = false;
1220
1221 match matcher {
1222 SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1223 let needle = &literal.needle;
1224 let finder = memchr::memmem::Finder::new(needle);
1225 let mut start = 0;
1226
1227 while let Some(position) = finder.find(&content[start..]) {
1228 if should_stop_search(truncated, total_matches, stop_after) {
1229 engine_capped.store(true, Ordering::Relaxed);
1230 break;
1231 }
1232
1233 let offset = start + position;
1234 start = offset + 1;
1235
1236 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1237 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1238 if !seen_lines.insert(line) {
1239 continue;
1240 }
1241
1242 matched_this_file = true;
1243 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1244 if match_number > max_results {
1245 truncated.store(true, Ordering::Relaxed);
1246 break;
1247 }
1248
1249 let end = offset + needle.len();
1250 matches.push(SharedGrepMatch {
1251 file: shared_path.clone(),
1252 line,
1253 column,
1254 line_text,
1255 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1256 });
1257 }
1258 }
1259 SearchMatcher::Literal(literal) => {
1260 let needle = &literal.needle;
1261 let search_content = content.to_ascii_lowercase();
1262 let finder = memchr::memmem::Finder::new(needle);
1263 let mut start = 0;
1264
1265 while let Some(position) = finder.find(&search_content[start..]) {
1266 if should_stop_search(truncated, total_matches, stop_after) {
1267 engine_capped.store(true, Ordering::Relaxed);
1268 break;
1269 }
1270
1271 let offset = start + position;
1272 start = offset + 1;
1273
1274 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1275 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1276 if !seen_lines.insert(line) {
1277 continue;
1278 }
1279
1280 matched_this_file = true;
1281 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1282 if match_number > max_results {
1283 truncated.store(true, Ordering::Relaxed);
1284 break;
1285 }
1286
1287 let end = offset + needle.len();
1288 matches.push(SharedGrepMatch {
1289 file: shared_path.clone(),
1290 line,
1291 column,
1292 line_text,
1293 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1294 });
1295 }
1296 }
1297 SearchMatcher::Regex(regex) => {
1298 for matched in regex.find_iter(&content) {
1299 if should_stop_search(truncated, total_matches, stop_after) {
1300 engine_capped.store(true, Ordering::Relaxed);
1301 break;
1302 }
1303
1304 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1305 let (line, column, line_text) =
1306 line_details_bytes(&content, line_starts, matched.start());
1307 if !seen_lines.insert(line) {
1308 continue;
1309 }
1310
1311 matched_this_file = true;
1312 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1313 if match_number > max_results {
1314 truncated.store(true, Ordering::Relaxed);
1315 break;
1316 }
1317
1318 matches.push(SharedGrepMatch {
1319 file: shared_path.clone(),
1320 line,
1321 column,
1322 line_text,
1323 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1324 });
1325 }
1326 }
1327 }
1328
1329 if matched_this_file {
1330 files_with_matches.fetch_add(1, Ordering::Relaxed);
1331 }
1332
1333 matches
1334}
1335
1336fn should_stop_search(
1337 truncated: &AtomicBool,
1338 total_matches: &AtomicUsize,
1339 stop_after: usize,
1340) -> bool {
1341 truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1342}
1343
1344fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1345 let mut merged = Vec::with_capacity(left.len().min(right.len()));
1346 let mut left_index = 0;
1347 let mut right_index = 0;
1348
1349 while left_index < left.len() && right_index < right.len() {
1350 match left[left_index].cmp(&right[right_index]) {
1351 std::cmp::Ordering::Less => left_index += 1,
1352 std::cmp::Ordering::Greater => right_index += 1,
1353 std::cmp::Ordering::Equal => {
1354 merged.push(left[left_index]);
1355 left_index += 1;
1356 right_index += 1;
1357 }
1358 }
1359 }
1360
1361 merged
1362}
1363
1364fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1365 let mut merged = Vec::with_capacity(left.len() + right.len());
1366 let mut left_index = 0;
1367 let mut right_index = 0;
1368
1369 while left_index < left.len() && right_index < right.len() {
1370 match left[left_index].cmp(&right[right_index]) {
1371 std::cmp::Ordering::Less => {
1372 merged.push(left[left_index]);
1373 left_index += 1;
1374 }
1375 std::cmp::Ordering::Greater => {
1376 merged.push(right[right_index]);
1377 right_index += 1;
1378 }
1379 std::cmp::Ordering::Equal => {
1380 merged.push(left[left_index]);
1381 left_index += 1;
1382 right_index += 1;
1383 }
1384 }
1385 }
1386
1387 merged.extend_from_slice(&left[left_index..]);
1388 merged.extend_from_slice(&right[right_index..]);
1389 merged
1390}
1391
1392pub fn decompose_regex(pattern: &str) -> RegexQuery {
1393 let hir = match regex_syntax::parse(pattern) {
1394 Ok(hir) => hir,
1395 Err(_) => return RegexQuery::default(),
1396 };
1397
1398 let build = build_query(&hir);
1399 build.into_query()
1400}
1401
1402pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1403 ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1404}
1405
1406pub fn normalize_char(c: u8) -> u8 {
1407 c.to_ascii_lowercase()
1408}
1409
1410pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1411 if content.len() < 3 {
1412 return Vec::new();
1413 }
1414
1415 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1416 for start in 0..=content.len() - 3 {
1417 let trigram = pack_trigram(
1418 normalize_char(content[start]),
1419 normalize_char(content[start + 1]),
1420 normalize_char(content[start + 2]),
1421 );
1422 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1423 trigrams.push((trigram, next_char, start));
1424 }
1425 trigrams
1426}
1427
1428pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1429 let mut seen = HashSet::new();
1430 let mut out = Vec::new();
1431 for token in tokens {
1432 for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1433 if seen.insert(trigram) {
1434 out.push(trigram);
1435 }
1436 }
1437 }
1438 out
1439}
1440
1441pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1442 if query_trigrams.is_empty() {
1443 return 0.0;
1444 }
1445
1446 let mut hits = 0u32;
1447 for &trigram in query_trigrams {
1448 if let Some(postings) = index.postings.get(&trigram) {
1449 if postings
1450 .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1451 .is_ok()
1452 {
1453 hits += 1;
1454 }
1455 }
1456 }
1457
1458 if hits == 0 {
1459 return 0.0;
1460 }
1461
1462 let file_trigram_count = index
1463 .file_trigrams
1464 .get(&file_id)
1465 .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1466 (hits as f32) / (1.0 + file_trigram_count.ln())
1467}
1468
1469pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1470 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1472 return PathBuf::from(override_dir)
1473 .join("index")
1474 .join(project_cache_key(project_root));
1475 }
1476 if let Some(dir) = storage_dir {
1478 return dir.join("index").join(project_cache_key(project_root));
1479 }
1480 let home = std::env::var_os("HOME")
1485 .or_else(|| std::env::var_os("USERPROFILE"))
1486 .map(PathBuf::from)
1487 .unwrap_or_else(std::env::temp_dir);
1488 home.join(".cache")
1489 .join("aft")
1490 .join("index")
1491 .join(project_cache_key(project_root))
1492}
1493
1494pub(crate) fn build_path_filters(
1495 include: &[String],
1496 exclude: &[String],
1497) -> Result<PathFilters, String> {
1498 Ok(PathFilters {
1499 includes: build_globset(include)?,
1500 excludes: build_globset(exclude)?,
1501 })
1502}
1503
1504pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1505 walk_project_files_from(root, root, filters)
1506}
1507
1508pub fn walk_project_files_bounded_default(
1509 root: &Path,
1510 max_files: usize,
1511) -> Result<Vec<PathBuf>, usize> {
1512 walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files))
1513}
1514
1515pub(crate) fn walk_project_files_bounded_matching<F>(
1516 root: &Path,
1517 filters: &PathFilters,
1518 max_files: usize,
1519 matches_file: F,
1520) -> Result<Vec<PathBuf>, usize>
1521where
1522 F: Fn(&Path) -> bool,
1523{
1524 walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file)
1525}
1526
1527pub fn walk_project_files_bounded_default_matching<F>(
1528 root: &Path,
1529 max_files: usize,
1530 matches_file: F,
1531) -> Result<Vec<PathBuf>, usize>
1532where
1533 F: Fn(&Path) -> bool,
1534{
1535 walk_project_files_from_inner_matching(
1536 root,
1537 root,
1538 &PathFilters::default(),
1539 Some(max_files),
1540 matches_file,
1541 )
1542}
1543
1544pub(crate) fn walk_project_files_from(
1545 filter_root: &Path,
1546 search_root: &Path,
1547 filters: &PathFilters,
1548) -> Vec<PathBuf> {
1549 walk_project_files_from_inner(filter_root, search_root, filters, None)
1550 .expect("unbounded project walk cannot exceed a file limit")
1551}
1552
1553pub(crate) fn has_any_project_file_from(
1554 filter_root: &Path,
1555 search_root: &Path,
1556 filters: &PathFilters,
1557) -> bool {
1558 walk_project_files_from_inner(filter_root, search_root, filters, Some(0)).is_err()
1559}
1560
1561fn walk_project_files_from_inner(
1562 filter_root: &Path,
1563 search_root: &Path,
1564 filters: &PathFilters,
1565 max_files: Option<usize>,
1566) -> Result<Vec<PathBuf>, usize> {
1567 walk_project_files_from_inner_matching(filter_root, search_root, filters, max_files, |_| true)
1568}
1569
1570fn walk_project_files_from_inner_matching<F>(
1571 filter_root: &Path,
1572 search_root: &Path,
1573 filters: &PathFilters,
1574 max_files: Option<usize>,
1575 matches_file: F,
1576) -> Result<Vec<PathBuf>, usize>
1577where
1578 F: Fn(&Path) -> bool,
1579{
1580 let mut builder = WalkBuilder::new(search_root);
1581 builder
1582 .hidden(false)
1583 .git_ignore(true)
1584 .git_global(true)
1585 .git_exclude(true)
1586 .add_custom_ignore_filename(".aftignore")
1590 .filter_entry(|entry| {
1591 let name = entry.file_name().to_string_lossy();
1592 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1593 return !matches!(
1594 name.as_ref(),
1595 "node_modules"
1596 | "target"
1597 | "venv"
1598 | ".venv"
1599 | ".git"
1600 | "__pycache__"
1601 | ".tox"
1602 | "dist"
1603 | "build"
1604 );
1605 }
1606 true
1607 });
1608
1609 let mut files = Vec::new();
1610 for entry in builder.build().filter_map(|entry| entry.ok()) {
1611 if !entry
1612 .file_type()
1613 .map_or(false, |file_type| file_type.is_file())
1614 {
1615 continue;
1616 }
1617 let path = entry.into_path();
1618 if filters.matches(filter_root, &path) && matches_file(&path) {
1619 files.push(path);
1620 if max_files.is_some_and(|limit| files.len() > limit) {
1621 return Err(files.len());
1622 }
1623 }
1624 }
1625
1626 sort_paths_by_mtime_desc(&mut files);
1627 Ok(files)
1628}
1629
1630pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1631 let bytes = fs::read(path).ok()?;
1632 if is_binary_bytes(&bytes) {
1633 return None;
1634 }
1635 String::from_utf8(bytes).ok()
1636}
1637
1638fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1639 fs::read(path).ok()
1640}
1641
1642pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1643 path.strip_prefix(root)
1644 .map(PathBuf::from)
1645 .unwrap_or_else(|_| path.to_path_buf())
1646}
1647
1648pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1649 let normalized_root = normalize_path(root);
1650 let normalized_path = normalize_path(path);
1651 let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1652 validate_cached_relative_path(relative)
1653}
1654
1655pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1656 let relative = validate_cached_relative_path(relative_path)?;
1657 let normalized_root = normalize_path(root);
1658 let full_path = normalize_path(&normalized_root.join(relative));
1659
1660 match fs::canonicalize(&full_path) {
1661 Ok(canonical_path) => {
1662 if canonical_path.starts_with(&normalized_root) {
1663 return Some(full_path);
1664 }
1665
1666 let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1667 canonical_path
1668 .starts_with(&canonical_root)
1669 .then_some(full_path)
1670 }
1671 Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1672 }
1673}
1674
1675pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
1676 if path.is_absolute() {
1677 return None;
1678 }
1679
1680 let mut normalized = PathBuf::new();
1681 for component in path.components() {
1682 match component {
1683 Component::Normal(part) => normalized.push(part),
1684 Component::CurDir => {}
1685 Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
1686 }
1687 }
1688 (!normalized.as_os_str().is_empty()).then_some(normalized)
1689}
1690
1691pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1704 use std::collections::HashMap;
1705 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1706 for path in paths.iter() {
1707 mtimes
1708 .entry(path.clone())
1709 .or_insert_with(|| path_modified_time(path));
1710 }
1711 paths.sort_by(|left, right| {
1712 let left_mtime = mtimes.get(left).and_then(|v| *v);
1713 let right_mtime = mtimes.get(right).and_then(|v| *v);
1714 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1715 });
1716}
1717
1718pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1721 use std::collections::HashMap;
1722 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1723 for m in matches.iter() {
1724 mtimes.entry(m.file.clone()).or_insert_with(|| {
1725 let resolved = resolve_match_path(project_root, &m.file);
1726 path_modified_time(&resolved)
1727 });
1728 }
1729 matches.sort_by(|left, right| {
1730 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1731 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1732 right_mtime
1733 .cmp(&left_mtime)
1734 .then_with(|| left.file.cmp(&right.file))
1735 .then_with(|| left.line.cmp(&right.line))
1736 .then_with(|| left.column.cmp(&right.column))
1737 });
1738}
1739
1740fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1745 matches: &mut [SharedGrepMatch],
1746 modified_for_path: F,
1747) where
1748 F: Fn(&Path) -> Option<SystemTime>,
1749{
1750 use std::collections::HashMap;
1751 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1752 for m in matches.iter() {
1753 let path = m.file.as_path().to_path_buf();
1754 mtimes
1755 .entry(path.clone())
1756 .or_insert_with(|| modified_for_path(&path));
1757 }
1758 matches.sort_by(|left, right| {
1759 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1760 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1761 right_mtime
1762 .cmp(&left_mtime)
1763 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1764 .then_with(|| left.line.cmp(&right.line))
1765 .then_with(|| left.column.cmp(&right.column))
1766 });
1767}
1768
1769pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1770 let resolved_project_root = canonicalize_or_normalize(project_root);
1771 let root = match path {
1772 Some(path) => {
1773 let path = PathBuf::from(path);
1774 if path.is_absolute() {
1775 canonicalize_or_normalize(&path)
1776 } else {
1777 normalize_path(&resolved_project_root.join(path))
1778 }
1779 }
1780 None => resolved_project_root.clone(),
1781 };
1782
1783 let use_index = is_within_search_root(&resolved_project_root, &root);
1784 SearchScope { root, use_index }
1785}
1786
1787pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1788 content_inspector::inspect(content).is_binary()
1789}
1790
1791pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1792 run_git(root, &["rev-parse", "HEAD"])
1793}
1794
1795pub fn project_cache_key(project_root: &Path) -> String {
1796 use sha2::{Digest, Sha256};
1797
1798 let mut hasher = Sha256::new();
1799
1800 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1801 hasher.update(root_commit.as_bytes());
1804 } else {
1805 let canonical_root = canonicalize_or_normalize(project_root);
1807 hasher.update(canonical_root.to_string_lossy().as_bytes());
1808 }
1809
1810 let digest = format!("{:x}", hasher.finalize());
1811 digest[..16].to_string()
1812}
1813
1814pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
1822 use sha2::{Digest, Sha256};
1823
1824 let root = canonicalize_or_normalize(project_root);
1825 let mut files = Vec::new();
1826 collect_ignore_rule_files(&root, &mut files);
1827 if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
1828 if global_ignore.is_file() {
1829 files.push(global_ignore);
1830 }
1831 }
1832 let info_exclude = git_info_exclude_path(&root);
1833 if info_exclude.is_file() {
1834 files.push(info_exclude);
1835 }
1836 files.sort();
1837 files.dedup();
1838
1839 let mut hasher = Sha256::new();
1840 hasher.update(b"aft-ignore-rules-v1\0");
1841 for path in files {
1842 if let Some(relative) = cache_relative_path(&root, &path) {
1843 hasher.update(relative.to_string_lossy().as_bytes());
1844 } else {
1845 hasher.update(path.to_string_lossy().as_bytes());
1846 }
1847 hasher.update(b"\0");
1848 match fs::read(&path) {
1849 Ok(bytes) => hasher.update(&bytes),
1850 Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
1851 }
1852 hasher.update(b"\0");
1853 }
1854
1855 format!("{:x}", hasher.finalize())
1856}
1857
1858fn git_info_exclude_path(root: &Path) -> PathBuf {
1859 run_git(
1860 root,
1861 &["rev-parse", "--path-format=absolute", "--git-common-dir"],
1862 )
1863 .map(PathBuf::from)
1864 .unwrap_or_else(|| root.join(".git"))
1865 .join("info")
1866 .join("exclude")
1867}
1868
1869fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
1870 let mut stack = vec![root.to_path_buf()];
1871 while let Some(dir) = stack.pop() {
1872 let Ok(entries) = fs::read_dir(&dir) else {
1873 continue;
1874 };
1875 for entry in entries.flatten() {
1876 let path = entry.path();
1877 let file_name = entry.file_name();
1878 if file_name == ".gitignore" || file_name == ".aftignore" {
1879 if path.is_file() {
1880 files.push(path);
1881 }
1882 continue;
1883 }
1884
1885 let Ok(file_type) = entry.file_type() else {
1886 continue;
1887 };
1888 if !file_type.is_dir() || file_type.is_symlink() {
1889 continue;
1890 }
1891 if ignore_rule_fingerprint_skips_dir(&file_name) {
1892 continue;
1893 }
1894 stack.push(path);
1895 }
1896 }
1897}
1898
1899fn ignore_rule_fingerprint_skips_dir(name: &std::ffi::OsStr) -> bool {
1900 matches!(
1901 name.to_str().unwrap_or(""),
1902 ".git"
1903 | "node_modules"
1904 | "target"
1905 | "venv"
1906 | ".venv"
1907 | "__pycache__"
1908 | ".tox"
1909 | "dist"
1910 | "build"
1911 )
1912}
1913
1914impl PathFilters {
1915 fn matches(&self, root: &Path, path: &Path) -> bool {
1916 let relative = to_glob_path(&relative_to_root(root, path));
1917 if self
1918 .includes
1919 .as_ref()
1920 .is_some_and(|includes| !includes.is_match(&relative))
1921 {
1922 return false;
1923 }
1924 if self
1925 .excludes
1926 .as_ref()
1927 .is_some_and(|excludes| excludes.is_match(&relative))
1928 {
1929 return false;
1930 }
1931 true
1932 }
1933}
1934
1935fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1936 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1937}
1938
1939fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1940 if path.is_absolute() {
1941 path.to_path_buf()
1942 } else {
1943 project_root.join(path)
1944 }
1945}
1946
1947fn path_modified_time(path: &Path) -> Option<SystemTime> {
1948 fs::metadata(path)
1949 .and_then(|metadata| metadata.modified())
1950 .ok()
1951}
1952
1953fn normalize_path(path: &Path) -> PathBuf {
1954 let mut result = PathBuf::new();
1955 for component in path.components() {
1956 match component {
1957 Component::ParentDir => {
1958 if !result.pop() {
1959 result.push(component);
1960 }
1961 }
1962 Component::CurDir => {}
1963 _ => result.push(component),
1964 }
1965 }
1966 result
1967}
1968
1969fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
1970 if let Ok(canonical) = fs::canonicalize(path) {
1971 return canonical;
1972 }
1973
1974 let Some(parent) = path.parent() else {
1975 return path.to_path_buf();
1976 };
1977 let Some(file_name) = path.file_name() else {
1978 return path.to_path_buf();
1979 };
1980
1981 fs::canonicalize(parent)
1982 .map(|canonical_parent| canonical_parent.join(file_name))
1983 .unwrap_or_else(|_| path.to_path_buf())
1984}
1985
1986fn verify_file_mtimes(index: &mut SearchIndex) {
1989 let filters = PathFilters::default();
1990 let current_files = walk_project_files(&index.project_root, &filters);
1991 let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
1992 let mut stale_paths = Vec::new();
1993 let mut removed_paths = Vec::new();
1994
1995 for entry in &mut index.files {
1996 if entry.path.as_os_str().is_empty() {
1997 continue; }
1999 if !current_file_set.contains(&entry.path) {
2000 removed_paths.push(entry.path.clone());
2001 continue;
2002 }
2003 let cached = FileFreshness {
2004 mtime: entry.modified,
2005 size: entry.size,
2006 content_hash: entry.content_hash,
2007 };
2008 match cache_freshness::verify_file_strict(&entry.path, &cached) {
2009 FreshnessVerdict::HotFresh => {}
2010 FreshnessVerdict::ContentFresh {
2011 new_mtime,
2012 new_size,
2013 } => {
2014 entry.modified = new_mtime;
2015 entry.size = new_size;
2016 }
2017 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
2018 stale_paths.push(entry.path.clone())
2019 }
2020 }
2021 }
2022
2023 for path in &removed_paths {
2024 index.remove_file(path);
2025 }
2026
2027 for path in &stale_paths {
2031 if current_file_set.contains(path) {
2032 index.update_file(path);
2033 } else {
2034 index.remove_file(path);
2035 }
2036 }
2037
2038 for path in current_files {
2040 if !index.path_to_id.contains_key(&path) {
2041 index.update_file(&path);
2042 }
2043 }
2044
2045 if !stale_paths.is_empty() {
2046 crate::slog_info!(
2047 "search index: refreshed {} stale file(s) from disk cache",
2048 stale_paths.len()
2049 );
2050 }
2051}
2052
2053fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2054 normalize_path(path).starts_with(normalize_path(search_root))
2055}
2056
2057impl QueryBuild {
2058 fn into_query(self) -> RegexQuery {
2059 let mut query = RegexQuery::default();
2060
2061 for run in self.and_runs {
2062 add_run_to_and_query(&mut query, &run);
2063 }
2064
2065 for group in self.or_groups {
2066 let mut trigrams = BTreeSet::new();
2067 let mut filters = HashMap::new();
2068 for run in group {
2069 for (trigram, filter) in trigram_filters(&run) {
2070 trigrams.insert(trigram);
2071 merge_filter(filters.entry(trigram).or_default(), filter);
2072 }
2073 }
2074 if !trigrams.is_empty() {
2075 query.or_groups.push(trigrams.into_iter().collect());
2076 query.or_filters.push(filters);
2077 }
2078 }
2079
2080 query
2081 }
2082}
2083
2084fn build_query(hir: &Hir) -> QueryBuild {
2085 match hir.kind() {
2086 HirKind::Literal(literal) => {
2087 if literal.0.len() >= 3 {
2088 QueryBuild {
2089 and_runs: vec![literal.0.to_vec()],
2090 or_groups: Vec::new(),
2091 }
2092 } else {
2093 QueryBuild::default()
2094 }
2095 }
2096 HirKind::Capture(capture) => build_query(&capture.sub),
2097 HirKind::Concat(parts) => {
2098 let mut build = QueryBuild::default();
2099 for part in parts {
2100 let part_build = build_query(part);
2101 build.and_runs.extend(part_build.and_runs);
2102 build.or_groups.extend(part_build.or_groups);
2103 }
2104 build
2105 }
2106 HirKind::Alternation(parts) => {
2107 let mut group = Vec::new();
2108 for part in parts {
2109 let Some(mut choices) = guaranteed_run_choices(part) else {
2110 return QueryBuild::default();
2111 };
2112 group.append(&mut choices);
2113 }
2114 if group.is_empty() {
2115 QueryBuild::default()
2116 } else {
2117 QueryBuild {
2118 and_runs: Vec::new(),
2119 or_groups: vec![group],
2120 }
2121 }
2122 }
2123 HirKind::Repetition(repetition) => {
2124 if repetition.min == 0 {
2125 QueryBuild::default()
2126 } else {
2127 build_query(&repetition.sub)
2128 }
2129 }
2130 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2131 }
2132}
2133
2134fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2135 match hir.kind() {
2136 HirKind::Literal(literal) => {
2137 if literal.0.len() >= 3 {
2138 Some(vec![literal.0.to_vec()])
2139 } else {
2140 None
2141 }
2142 }
2143 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2144 HirKind::Concat(parts) => {
2145 let mut runs = Vec::new();
2146 for part in parts {
2147 if let Some(mut part_runs) = guaranteed_run_choices(part) {
2148 runs.append(&mut part_runs);
2149 }
2150 }
2151 if runs.is_empty() {
2152 None
2153 } else {
2154 Some(runs)
2155 }
2156 }
2157 HirKind::Alternation(parts) => {
2158 let mut runs = Vec::new();
2159 for part in parts {
2160 let Some(mut part_runs) = guaranteed_run_choices(part) else {
2161 return None;
2162 };
2163 runs.append(&mut part_runs);
2164 }
2165 if runs.is_empty() {
2166 None
2167 } else {
2168 Some(runs)
2169 }
2170 }
2171 HirKind::Repetition(repetition) => {
2172 if repetition.min == 0 {
2173 None
2174 } else {
2175 guaranteed_run_choices(&repetition.sub)
2176 }
2177 }
2178 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2179 }
2180}
2181
2182fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2183 for (trigram, filter) in trigram_filters(run) {
2184 if !query.and_trigrams.contains(&trigram) {
2185 query.and_trigrams.push(trigram);
2186 }
2187 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2188 }
2189}
2190
2191fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2192 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2193 for (trigram, next_char, position) in extract_trigrams(run) {
2194 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
2195 if next_char != EOF_SENTINEL {
2196 entry.next_mask |= mask_for_next_char(next_char);
2197 }
2198 entry.loc_mask |= mask_for_position(position);
2199 }
2200 filters.into_iter().collect()
2201}
2202
2203fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2204 target.next_mask |= filter.next_mask;
2205 target.loc_mask |= filter.loc_mask;
2206}
2207
2208fn mask_for_next_char(next_char: u8) -> u8 {
2209 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2210 1u8 << bit
2211}
2212
2213fn mask_for_position(position: usize) -> u8 {
2214 1u8 << (position % 8)
2215}
2216
2217fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2218 if patterns.is_empty() {
2219 return Ok(None);
2220 }
2221
2222 let mut builder = GlobSetBuilder::new();
2223 for pattern in patterns {
2224 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2225 builder.add(glob);
2226 }
2227 builder.build().map(Some).map_err(|error| error.to_string())
2228}
2229
2230fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
2231 let mut buffer = [0u8; 4];
2232 reader.read_exact(&mut buffer)?;
2233 Ok(u32::from_le_bytes(buffer))
2234}
2235
2236fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
2237 let mut buffer = [0u8; 8];
2238 reader.read_exact(&mut buffer)?;
2239 Ok(u64::from_le_bytes(buffer))
2240}
2241
2242fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
2243 writer.write_all(&value.to_le_bytes())
2244}
2245
2246fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
2247 writer.write_all(&value.to_le_bytes())
2248}
2249
2250fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2251 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2252 return Err(std::io::Error::other("search index checksum missing"));
2253 };
2254 let expected = u32::from_le_bytes(*stored);
2255 let actual = crc32fast::hash(body);
2256 if actual != expected {
2257 return Err(std::io::Error::other("search index checksum mismatch"));
2258 }
2259 Ok(())
2260}
2261
2262fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
2263 let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
2264 total_len.checked_sub(pos)
2265}
2266
2267fn run_git(root: &Path, args: &[&str]) -> Option<String> {
2268 let output = Command::new("git")
2269 .arg("-C")
2270 .arg(root)
2271 .args(args)
2272 .output()
2273 .ok()?;
2274 if !output.status.success() {
2275 return None;
2276 }
2277 let value = String::from_utf8(output.stdout).ok()?;
2278 let value = value.trim().to_string();
2279 if value.is_empty() {
2280 None
2281 } else {
2282 Some(value)
2283 }
2284}
2285
2286fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2287 let diff_range = format!("{}..{}", from, to);
2288 let output = match Command::new("git")
2289 .arg("-C")
2290 .arg(root)
2291 .args(["diff", "--name-status", "-M", &diff_range])
2292 .output()
2293 {
2294 Ok(output) => output,
2295 Err(_) => return false,
2296 };
2297
2298 if !output.status.success() {
2299 return false;
2300 }
2301
2302 let Ok(diff) = String::from_utf8(output.stdout) else {
2303 return false;
2304 };
2305
2306 for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2307 let mut fields = line.split('\t');
2308 let Some(status) = fields.next() else {
2309 continue;
2310 };
2311
2312 if status.starts_with('R') {
2313 let Some(old_path) = fields
2314 .next()
2315 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2316 else {
2317 continue;
2318 };
2319 let Some(new_path) = fields
2320 .next()
2321 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2322 else {
2323 continue;
2324 };
2325 index.remove_file(&old_path);
2326 index.update_file(&new_path);
2327 continue;
2328 }
2329
2330 let Some(path) = fields
2331 .next()
2332 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2333 else {
2334 continue;
2335 };
2336 if status.starts_with('D') || !path.exists() {
2337 index.remove_file(&path);
2338 } else {
2339 index.update_file(&path);
2340 }
2341 }
2342
2343 true
2344}
2345
2346fn is_binary_path(path: &Path, size: u64) -> bool {
2347 if size == 0 {
2348 return false;
2349 }
2350
2351 let mut file = match File::open(path) {
2352 Ok(file) => file,
2353 Err(_) => return true,
2354 };
2355
2356 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2357 match file.read(&mut preview) {
2358 Ok(read) => is_binary_bytes(&preview[..read]),
2359 Err(_) => true,
2360 }
2361}
2362
2363fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2364 let mut starts = vec![0usize];
2365 for (index, byte) in content.iter().copied().enumerate() {
2366 if byte == b'\n' {
2367 starts.push(index + 1);
2368 }
2369 }
2370 starts
2371}
2372
2373fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2374 let line_index = match line_starts.binary_search(&offset) {
2375 Ok(index) => index,
2376 Err(index) => index.saturating_sub(1),
2377 };
2378 let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2379 let line_end = content[line_start..]
2380 .iter()
2381 .position(|byte| *byte == b'\n')
2382 .map(|length| line_start + length)
2383 .unwrap_or(content.len());
2384 let mut line_slice = &content[line_start..line_end];
2385 if line_slice.ends_with(b"\r") {
2386 line_slice = &line_slice[..line_slice.len() - 1];
2387 }
2388 let line_text = String::from_utf8_lossy(line_slice).into_owned();
2389 let column = String::from_utf8_lossy(&content[line_start..offset])
2390 .chars()
2391 .count() as u32
2392 + 1;
2393 (line_index as u32 + 1, column, line_text)
2394}
2395
2396fn to_glob_path(path: &Path) -> String {
2397 path.to_string_lossy().replace('\\', "/")
2398}
2399
2400#[cfg(test)]
2401mod tests {
2402 use std::process::Command;
2403
2404 use super::*;
2405
2406 #[test]
2407 fn cached_path_under_root_allows_missing_lexical_child() {
2408 let dir = tempfile::tempdir().expect("create temp dir");
2409 let project = dir.path().join("project");
2410 fs::create_dir_all(&project).expect("create project dir");
2411 let root = fs::canonicalize(&project).expect("canonicalize project");
2412
2413 let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2414 .expect("missing child should fall back to lexical validation");
2415
2416 assert_eq!(path, root.join("future/file.rs"));
2417 }
2418
2419 #[cfg(unix)]
2420 #[test]
2421 fn cached_path_under_root_rejects_symlink_escape() {
2422 let dir = tempfile::tempdir().expect("create temp dir");
2423 let project = dir.path().join("project");
2424 let outside = dir.path().join("outside");
2425 fs::create_dir_all(&project).expect("create project dir");
2426 fs::create_dir_all(&outside).expect("create outside dir");
2427 fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2428 std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2429 let root = fs::canonicalize(&project).expect("canonicalize project");
2430
2431 assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2432 }
2433
2434 #[test]
2435 fn extract_trigrams_tracks_next_char_and_position() {
2436 let trigrams = extract_trigrams(b"Rust");
2437 assert_eq!(trigrams.len(), 2);
2438 assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2439 assert_eq!(
2440 trigrams[1],
2441 (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2442 );
2443 }
2444
2445 #[test]
2446 fn decompose_regex_extracts_literals_and_alternations() {
2447 let query = decompose_regex("abc(def|ghi)xyz");
2448 assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2449 assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2450 assert_eq!(query.or_groups.len(), 1);
2451 assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2452 assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2453 }
2454
2455 #[test]
2456 fn candidates_intersect_posting_lists() {
2457 let mut index = SearchIndex::new();
2458 let dir = tempfile::tempdir().expect("create temp dir");
2459 let alpha = dir.path().join("alpha.txt");
2460 let beta = dir.path().join("beta.txt");
2461 fs::write(&alpha, "abcdef").expect("write alpha");
2462 fs::write(&beta, "abcxyz").expect("write beta");
2463 index.project_root = dir.path().to_path_buf();
2464 index.index_file(&alpha, b"abcdef");
2465 index.index_file(&beta, b"abcxyz");
2466
2467 let query = RegexQuery {
2468 and_trigrams: vec![
2469 pack_trigram(b'a', b'b', b'c'),
2470 pack_trigram(b'd', b'e', b'f'),
2471 ],
2472 ..RegexQuery::default()
2473 };
2474
2475 let candidates = index.candidates(&query);
2476 assert_eq!(candidates.len(), 1);
2477 assert_eq!(index.files[candidates[0] as usize].path, alpha);
2478 }
2479
2480 #[test]
2481 fn candidates_apply_bloom_filters() {
2482 let mut index = SearchIndex::new();
2483 let dir = tempfile::tempdir().expect("create temp dir");
2484 let file = dir.path().join("sample.txt");
2485 fs::write(&file, "abcd efgh").expect("write sample");
2486 index.project_root = dir.path().to_path_buf();
2487 index.index_file(&file, b"abcd efgh");
2488
2489 let trigram = pack_trigram(b'a', b'b', b'c');
2490 let matching_filter = PostingFilter {
2491 next_mask: mask_for_next_char(b'd'),
2492 loc_mask: mask_for_position(0),
2493 };
2494 let non_matching_filter = PostingFilter {
2495 next_mask: mask_for_next_char(b'z'),
2496 loc_mask: mask_for_position(0),
2497 };
2498
2499 assert_eq!(
2500 index
2501 .postings_for_trigram(trigram, Some(matching_filter))
2502 .len(),
2503 1
2504 );
2505 assert!(index
2506 .postings_for_trigram(trigram, Some(non_matching_filter))
2507 .is_empty());
2508 }
2509
2510 #[test]
2511 fn disk_round_trip_preserves_postings_and_files() {
2512 let dir = tempfile::tempdir().expect("create temp dir");
2513 let project = dir.path().join("project");
2514 fs::create_dir_all(&project).expect("create project dir");
2515 let file = project.join("src.txt");
2516 fs::write(&file, "abcdef").expect("write source");
2517
2518 let mut index = SearchIndex::build(&project);
2519 index.git_head = Some("deadbeef".to_string());
2520 let cache_dir = dir.path().join("cache");
2521 index.write_to_disk(&cache_dir, index.git_head.as_deref());
2522
2523 let loaded =
2524 SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2525 assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2526 assert_eq!(loaded.files.len(), 1);
2527 assert_eq!(
2528 relative_to_root(&loaded.project_root, &loaded.files[0].path),
2529 PathBuf::from("src.txt")
2530 );
2531 assert_eq!(loaded.postings.len(), index.postings.len());
2532 assert!(loaded
2533 .postings
2534 .contains_key(&pack_trigram(b'a', b'b', b'c')));
2535 }
2536
2537 #[test]
2538 fn cache_path_helpers_reject_absolute_and_parent_paths() {
2539 let root = PathBuf::from("/tmp/aft-project");
2540
2541 assert_eq!(
2542 cache_relative_path(&root, &root.join("src/lib.rs")),
2543 Some(PathBuf::from("src/lib.rs"))
2544 );
2545 assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2546 assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2547 assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2548 assert_eq!(
2549 cached_path_under_root(&root, Path::new("src/./lib.rs")),
2550 Some(root.join("src/lib.rs"))
2551 );
2552 }
2553
2554 #[test]
2555 fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2556 let dir = tempfile::tempdir().expect("create temp dir");
2557 let project = dir.path().join("project");
2558 fs::create_dir_all(&project).expect("create project dir");
2559 let canonical_project = fs::canonicalize(&project).expect("canonical project");
2560 fs::write(project.join("old.txt"), "old token\n").expect("write old");
2561 fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2562
2563 Command::new("git")
2564 .arg("init")
2565 .arg(&project)
2566 .status()
2567 .expect("git init");
2568 for args in [
2569 ["config", "user.email", "aft@example.invalid"],
2570 ["config", "user.name", "AFT Test"],
2571 ] {
2572 Command::new("git")
2573 .arg("-C")
2574 .arg(&project)
2575 .args(args)
2576 .status()
2577 .expect("git config");
2578 }
2579 Command::new("git")
2580 .arg("-C")
2581 .arg(&project)
2582 .args(["add", "."])
2583 .status()
2584 .expect("git add initial");
2585 Command::new("git")
2586 .arg("-C")
2587 .arg(&project)
2588 .args(["commit", "-m", "initial"])
2589 .status()
2590 .expect("git commit initial");
2591 let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2592 let mut baseline = SearchIndex::build(&project);
2593 baseline.git_head = Some(previous.clone());
2594
2595 fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2596 Command::new("git")
2597 .arg("-C")
2598 .arg(&project)
2599 .args(["add", "-A"])
2600 .status()
2601 .expect("git add rename");
2602 Command::new("git")
2603 .arg("-C")
2604 .arg(&project)
2605 .args(["commit", "-m", "rename"])
2606 .status()
2607 .expect("git commit rename");
2608 let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2609
2610 fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2611 fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2612
2613 let refreshed = SearchIndex::rebuild_or_refresh(
2614 &project,
2615 DEFAULT_MAX_FILE_SIZE,
2616 Some(current),
2617 Some(baseline),
2618 );
2619
2620 assert!(!refreshed
2621 .path_to_id
2622 .contains_key(&canonical_project.join("old.txt")));
2623 assert!(refreshed
2624 .path_to_id
2625 .contains_key(&canonical_project.join("new.txt")));
2626 assert!(refreshed
2627 .path_to_id
2628 .contains_key(&canonical_project.join("untracked.txt")));
2629 let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2630 assert_eq!(matches.matches.len(), 1);
2631 }
2632
2633 #[test]
2634 fn read_from_disk_rejects_corrupt_postings_checksum() {
2635 let dir = tempfile::tempdir().expect("create temp dir");
2636 let project = dir.path().join("project");
2637 fs::create_dir_all(&project).expect("create project dir");
2638 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2639
2640 let index = SearchIndex::build(&project);
2641 let cache_dir = dir.path().join("cache");
2642 index.write_to_disk(&cache_dir, None);
2643
2644 let cache_path = cache_dir.join("cache.bin");
2645 let mut bytes = fs::read(&cache_path).expect("read cache");
2646 let middle = bytes.len() / 2;
2647 bytes[middle] ^= 0xff;
2648 fs::write(&cache_path, bytes).expect("write corrupted cache");
2649
2650 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2651 }
2652
2653 #[test]
2654 fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2655 let dir = tempfile::tempdir().expect("create temp dir");
2656 let project = dir.path().join("project");
2657 fs::create_dir_all(&project).expect("create project dir");
2658 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2659
2660 let index = SearchIndex::build(&project);
2661 let cache_dir = dir.path().join("cache");
2662 index.write_to_disk(&cache_dir, None);
2663
2664 assert!(cache_dir.join("cache.bin").is_file());
2665 assert!(fs::read_dir(&cache_dir)
2666 .expect("read cache dir")
2667 .all(|entry| !entry
2668 .expect("cache entry")
2669 .file_name()
2670 .to_string_lossy()
2671 .contains(".tmp.")));
2672 }
2673
2674 #[test]
2675 fn concurrent_search_index_writes_do_not_corrupt() {
2676 let dir = tempfile::tempdir().expect("create temp dir");
2677 let project = dir.path().join("project");
2678 fs::create_dir_all(&project).expect("create project dir");
2679 fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2680 let cache_dir = dir.path().join("cache");
2681
2682 let a_project = project.clone();
2683 let a_cache = cache_dir.clone();
2684 let a = std::thread::spawn(move || {
2685 let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2686 let index = SearchIndex::build(&a_project);
2687 index.write_to_disk(&a_cache, None);
2688 });
2689 let b_project = project.clone();
2690 let b_cache = cache_dir.clone();
2691 let b = std::thread::spawn(move || {
2692 let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2693 let index = SearchIndex::build(&b_project);
2694 index.write_to_disk(&b_cache, None);
2695 });
2696 a.join().expect("writer a");
2697 b.join().expect("writer b");
2698
2699 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2700 }
2701
2702 #[test]
2703 fn search_index_atomic_rename_survives_partial_write() {
2704 let dir = tempfile::tempdir().expect("create temp dir");
2705 let cache_dir = dir.path().join("cache");
2706 fs::create_dir_all(&cache_dir).expect("create cache dir");
2707 fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2708
2709 assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2710 }
2711
2712 #[test]
2713 fn project_cache_key_includes_checkout_path() {
2714 let dir = tempfile::tempdir().expect("create temp dir");
2715 let source = dir.path().join("source");
2716 fs::create_dir_all(&source).expect("create source repo dir");
2717 fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2718
2719 assert!(Command::new("git")
2720 .current_dir(&source)
2721 .args(["init"])
2722 .status()
2723 .expect("init git repo")
2724 .success());
2725 assert!(Command::new("git")
2726 .current_dir(&source)
2727 .args(["add", "."])
2728 .status()
2729 .expect("git add")
2730 .success());
2731 assert!(Command::new("git")
2732 .current_dir(&source)
2733 .args([
2734 "-c",
2735 "user.name=AFT Tests",
2736 "-c",
2737 "user.email=aft-tests@example.com",
2738 "commit",
2739 "-m",
2740 "initial",
2741 ])
2742 .status()
2743 .expect("git commit")
2744 .success());
2745
2746 let clone = dir.path().join("clone");
2747 assert!(Command::new("git")
2748 .args(["clone", "--quiet"])
2749 .arg(&source)
2750 .arg(&clone)
2751 .status()
2752 .expect("git clone")
2753 .success());
2754
2755 let source_key = project_cache_key(&source);
2756 let clone_key = project_cache_key(&clone);
2757
2758 assert_eq!(source_key.len(), 16);
2759 assert_eq!(clone_key.len(), 16);
2760 assert_eq!(source_key, clone_key);
2762 }
2763
2764 #[test]
2765 fn git_head_unchanged_picks_up_local_edits() {
2766 let dir = tempfile::tempdir().expect("create temp dir");
2767 let project = dir.path().join("repo");
2768 fs::create_dir_all(&project).expect("create repo dir");
2769 let file = project.join("tracked.txt");
2770 fs::write(&file, "oldtoken\n").expect("write file");
2771 assert!(Command::new("git")
2772 .current_dir(&project)
2773 .arg("init")
2774 .status()
2775 .unwrap()
2776 .success());
2777 assert!(Command::new("git")
2778 .current_dir(&project)
2779 .args(["add", "."])
2780 .status()
2781 .unwrap()
2782 .success());
2783 assert!(Command::new("git")
2784 .current_dir(&project)
2785 .args([
2786 "-c",
2787 "user.name=AFT Tests",
2788 "-c",
2789 "user.email=aft-tests@example.com",
2790 "commit",
2791 "-m",
2792 "initial"
2793 ])
2794 .status()
2795 .unwrap()
2796 .success());
2797 let head = current_git_head(&project);
2798 let mut baseline = SearchIndex::build(&project);
2799 baseline.git_head = head.clone();
2800 fs::write(&file, "newtoken\n").expect("edit tracked file");
2801
2802 let refreshed =
2803 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2804 let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
2805
2806 assert_eq!(result.total_matches, 1);
2807 }
2808
2809 #[test]
2810 fn non_git_project_reuses_cache_when_files_unchanged() {
2811 let dir = tempfile::tempdir().expect("create temp dir");
2812 let project = dir.path().join("project");
2813 fs::create_dir_all(&project).expect("create project dir");
2814 fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2815 let baseline = SearchIndex::build(&project);
2816 let baseline_file_count = baseline.file_count();
2817
2818 let refreshed =
2819 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2820
2821 assert_eq!(refreshed.file_count(), baseline_file_count);
2822 assert_eq!(
2823 refreshed
2824 .grep("unchangedtoken", true, &[], &[], &project, 10)
2825 .total_matches,
2826 1
2827 );
2828 }
2829
2830 #[test]
2831 fn resolve_search_scope_disables_index_for_external_path() {
2832 let dir = tempfile::tempdir().expect("create temp dir");
2833 let project = dir.path().join("project");
2834 let outside = dir.path().join("outside");
2835 fs::create_dir_all(&project).expect("create project dir");
2836 fs::create_dir_all(&outside).expect("create outside dir");
2837
2838 let scope = resolve_search_scope(&project, outside.to_str());
2839
2840 assert_eq!(
2841 scope.root,
2842 fs::canonicalize(&outside).expect("canonicalize outside")
2843 );
2844 assert!(!scope.use_index);
2845 }
2846
2847 #[test]
2848 fn grep_filters_matches_to_search_root() {
2849 let dir = tempfile::tempdir().expect("create temp dir");
2850 let project = dir.path().join("project");
2851 let src = project.join("src");
2852 let docs = project.join("docs");
2853 fs::create_dir_all(&src).expect("create src dir");
2854 fs::create_dir_all(&docs).expect("create docs dir");
2855 fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2856 fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2857
2858 let index = SearchIndex::build(&project);
2859 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2860
2861 assert_eq!(result.files_searched, 1);
2862 assert_eq!(result.files_with_matches, 1);
2863 assert_eq!(result.matches.len(), 1);
2864 let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2866 assert_eq!(result.matches[0].file, expected);
2867 }
2868
2869 #[test]
2870 fn grep_deduplicates_multiple_matches_on_same_line() {
2871 let dir = tempfile::tempdir().expect("create temp dir");
2872 let project = dir.path().join("project");
2873 let src = project.join("src");
2874 fs::create_dir_all(&src).expect("create src dir");
2875 fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2876
2877 let index = SearchIndex::build(&project);
2878 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2879
2880 assert_eq!(result.total_matches, 1);
2881 assert_eq!(result.matches.len(), 1);
2882 }
2883
2884 #[test]
2885 fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
2886 let dir = tempfile::tempdir().expect("create temp dir");
2887 let project = dir.path().join("project");
2888 fs::create_dir_all(&project).expect("create project dir");
2889 let file = project.join("unicode.txt");
2890 fs::write(&file, "äbc\n").expect("write unicode file");
2891
2892 let index = SearchIndex::build(&project);
2893 let result = index.grep("Äbc", false, &[], &[], &project, 10);
2894
2895 assert_eq!(result.total_matches, 1);
2896 assert_eq!(result.matches.len(), 1);
2897 assert_eq!(
2898 result.matches[0].file,
2899 fs::canonicalize(file).expect("canonicalize unicode file")
2900 );
2901 }
2902
2903 #[test]
2904 fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
2905 let dir = tempfile::tempdir().expect("create temp dir");
2906 let project = dir.path().join("project");
2907 fs::create_dir_all(&project).expect("create project dir");
2908 let file = project.join("tokens.txt");
2909 let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
2910 fs::write(&file, "alpha").expect("write original file");
2911 filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
2912
2913 let baseline = SearchIndex::build(&project);
2914 fs::write(&file, "bravo").expect("write same-size edit");
2915 filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
2916
2917 let refreshed =
2918 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2919 let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
2920 let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
2921 let refreshed_id = *refreshed
2922 .path_to_id
2923 .get(&canonical_file)
2924 .expect("file remains indexed");
2925
2926 assert_eq!(result.total_matches, 1);
2927 assert!(refreshed
2928 .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
2929 .contains(&refreshed_id));
2930 assert!(!refreshed
2931 .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
2932 .contains(&refreshed_id));
2933 }
2934
2935 #[test]
2936 fn grep_reports_total_matches_before_truncation() {
2937 let dir = tempfile::tempdir().expect("create temp dir");
2938 let project = dir.path().join("project");
2939 let src = project.join("src");
2940 fs::create_dir_all(&src).expect("create src dir");
2941 fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2942
2943 let index = SearchIndex::build(&project);
2944 let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
2945
2946 assert_eq!(result.total_matches, 2);
2947 assert_eq!(result.matches.len(), 1);
2948 assert!(result.truncated);
2949 }
2950
2951 #[test]
2952 fn glob_filters_results_to_search_root() {
2953 let dir = tempfile::tempdir().expect("create temp dir");
2954 let project = dir.path().join("project");
2955 let src = project.join("src");
2956 let scripts = project.join("scripts");
2957 fs::create_dir_all(&src).expect("create src dir");
2958 fs::create_dir_all(&scripts).expect("create scripts dir");
2959 fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2960 fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2961
2962 let index = SearchIndex::build(&project);
2963 let files = index.glob("**/*.rs", &src);
2964
2965 assert_eq!(
2966 files,
2967 vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2968 );
2969 }
2970
2971 #[test]
2972 fn glob_includes_hidden_and_binary_files() {
2973 let dir = tempfile::tempdir().expect("create temp dir");
2974 let project = dir.path().join("project");
2975 let hidden_dir = project.join(".hidden");
2976 fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2977 let hidden_file = hidden_dir.join("data.bin");
2978 fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2979
2980 let index = SearchIndex::build(&project);
2981 let files = index.glob("**/*.bin", &project);
2982
2983 assert_eq!(
2984 files,
2985 vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2986 );
2987 }
2988
2989 #[test]
2990 fn read_from_disk_rejects_invalid_nanos() {
2991 let dir = tempfile::tempdir().expect("create temp dir");
2992 let cache_dir = dir.path().join("cache");
2993 fs::create_dir_all(&cache_dir).expect("create cache dir");
2994
2995 let mut postings = Vec::new();
2996 postings.extend_from_slice(INDEX_MAGIC);
2997 postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2998 postings.extend_from_slice(&0u32.to_le_bytes());
2999 postings.extend_from_slice(&1u32.to_le_bytes());
3000 postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
3001 postings.extend_from_slice(&1u32.to_le_bytes());
3002 postings.extend_from_slice(b"/");
3003 postings.push(0u8);
3004 postings.extend_from_slice(&1u32.to_le_bytes());
3005 postings.extend_from_slice(&0u64.to_le_bytes());
3006 postings.extend_from_slice(&0u64.to_le_bytes());
3007 postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
3008 postings.extend_from_slice(b"a");
3009 postings.extend_from_slice(&0u64.to_le_bytes());
3010
3011 let mut lookup = Vec::new();
3012 lookup.extend_from_slice(LOOKUP_MAGIC);
3013 lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3014 lookup.extend_from_slice(&0u32.to_le_bytes());
3015
3016 let postings_checksum = crc32fast::hash(&postings);
3017 postings.extend_from_slice(&postings_checksum.to_le_bytes());
3018 let lookup_checksum = crc32fast::hash(&lookup);
3019 lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
3020 let mut cache = Vec::new();
3021 cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
3022 cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
3023 cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
3024 cache.extend_from_slice(&postings);
3025 cache.extend_from_slice(&lookup);
3026 fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
3027
3028 assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
3029 }
3030
3031 #[test]
3046 fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
3047 let dir = tempfile::tempdir().expect("create tempdir");
3051 let mut paths: Vec<PathBuf> = Vec::new();
3052 for i in 0..30 {
3053 let path = if i % 2 == 0 {
3055 let p = dir.path().join(format!("real-{i}.rs"));
3056 fs::write(&p, format!("// {i}\n")).expect("write");
3057 p
3058 } else {
3059 dir.path().join(format!("missing-{i}.rs"))
3060 };
3061 paths.push(path);
3062 }
3063
3064 for _ in 0..50 {
3067 let mut copy = paths.clone();
3068 sort_paths_by_mtime_desc(&mut copy);
3069 assert_eq!(copy.len(), paths.len());
3070 }
3071 }
3072
3073 #[test]
3077 fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
3078 let dir = tempfile::tempdir().expect("create tempdir");
3079 let mut matches: Vec<GrepMatch> = Vec::new();
3080 for i in 0..30 {
3081 let file = if i % 2 == 0 {
3082 let p = dir.path().join(format!("real-{i}.rs"));
3083 fs::write(&p, format!("// {i}\n")).expect("write");
3084 p
3085 } else {
3086 dir.path().join(format!("missing-{i}.rs"))
3087 };
3088 matches.push(GrepMatch {
3089 file,
3090 line: u32::try_from(i).unwrap_or(0),
3091 column: 0,
3092 line_text: format!("match {i}"),
3093 match_text: format!("match {i}"),
3094 });
3095 }
3096
3097 for _ in 0..50 {
3098 let mut copy = matches.clone();
3099 sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
3100 assert_eq!(copy.len(), matches.len());
3101 }
3102 }
3103}