1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc, Mutex,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::Regex;
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19use crate::fs_lock;
20use crate::pattern_compile::{self, CompileOpts, CompileResult, CompiledPattern, LiteralSearch};
21
// Default per-file size limit in bytes (1 MiB) used by `SearchIndex::new` and `SearchIndex::build`.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
// Magic word written at the very start of the combined `cache.bin` file.
const CACHE_MAGIC: u32 = 0x3144_4958;
// Magic bytes that open the postings section inside `cache.bin`.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
// Magic bytes that open the lookup section inside `cache.bin`.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
// On-disk format version; the reader rejects a cache whose version differs.
const INDEX_VERSION: u32 = 4;
// NOTE(review): not referenced in the visible part of the file; presumably the size of a
// content preview window — confirm against its users.
const PREVIEW_BYTES: usize = 8 * 1024;
// NOTE(review): not referenced in the visible part of the file; presumably the byte used
// to mark end-of-file during trigram extraction — confirm against its users.
const EOF_SENTINEL: u8 = 0;
// Upper bound accepted for any count read back from an on-disk cache.
const MAX_ENTRIES: usize = 10_000_000;
// Smallest serialized file record: 1 (flag) + 4 (path length) + 8 (size) + 8 (mtime secs)
// + 4 (mtime nanos) + 32 (content hash) bytes, i.e. a record with an empty path.
const MIN_FILE_ENTRY_BYTES: usize = 57;
// Serialized lookup record: 4 (trigram) + 8 (offset) + 4 (count) bytes.
const LOOKUP_ENTRY_BYTES: usize = 16;
// Serialized posting: 4 (file id) + 1 (next mask) + 1 (loc mask) bytes.
const POSTING_BYTES: usize = 6;
// Held while `CacheLock::acquire` attempts to take the on-disk lock file.
static CACHE_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
34
/// Guard for the search cache lock file; created by [`CacheLock::acquire`].
///
/// The wrapped `fs_lock::LockGuard` is never read; it is stored only so that it lives as
/// long as this value does.
pub struct CacheLock {
    _guard: fs_lock::LockGuard,
}
38
39impl CacheLock {
40 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
41 fs::create_dir_all(cache_dir)?;
42 let path = cache_dir.join("cache.lock");
43 let _acquire_guard = CACHE_LOCK_ACQUIRE_MUTEX
44 .lock()
45 .map_err(|_| std::io::Error::other("search cache lock acquisition mutex poisoned"))?;
46 fs_lock::try_acquire(&path, Duration::from_secs(2))
47 .map(|guard| Self { _guard: guard })
48 .map_err(|error| match error {
49 fs_lock::AcquireError::Timeout => {
50 std::io::Error::other("timed out acquiring search cache lock")
51 }
52 fs_lock::AcquireError::Io(error) => error,
53 })
54 }
55}
56
/// In-memory trigram index over the files of one project.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// Trigram -> postings of the files that contain it.
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table; a file id is an index into this vector. Removed files keep their slot
    /// with an empty path.
    pub files: Vec<FileEntry>,
    /// Reverse mapping from a file path to its id in `files`.
    pub path_to_id: HashMap<PathBuf, u32>,
    /// Set once a build or verification pass has finished; reported as `IndexStatus::Ready`.
    pub ready: bool,
    /// Root directory the index was built for.
    project_root: PathBuf,
    /// Git HEAD recorded for the index contents, if any.
    git_head: Option<String>,
    /// Files larger than this many bytes are tracked but not trigram-indexed.
    max_file_size: u64,
    /// Fingerprint of the ignore rules in effect; a cache with a different value is discarded.
    ignore_rules_fingerprint: String,
    /// File id -> trigrams posted for that file, used to undo its postings on removal.
    pub file_trigrams: HashMap<u32, Vec<u32>>,
    /// Ids of files that are tracked without postings; they are always added to the
    /// candidate set of a trigram query.
    unindexed_files: HashSet<u32>,
}
70
/// Outcome of [`SearchIndex::lexical_rank_with_stats`].
#[derive(Clone, Debug, Default)]
pub struct LexicalRankResult {
    /// Ranked `(path, score)` pairs, highest score first.
    pub files: Vec<(PathBuf, f32)>,
    /// True when the number of candidates gathered before filtering exceeded the
    /// internal candidate cap.
    pub engine_capped: bool,
}
76
77impl SearchIndex {
78 pub fn file_count(&self) -> usize {
80 self.files.len()
81 }
82
83 pub fn trigram_count(&self) -> usize {
85 self.postings.len()
86 }
87
88 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
90 query_trigrams_from_tokens(tokens)
91 }
92
93 pub fn lexical_rank(
95 &self,
96 query_trigrams: &[u32],
97 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
98 max_files: usize,
99 ) -> Vec<(PathBuf, f32)> {
100 self.lexical_rank_with_stats(query_trigrams, candidate_filter, max_files)
101 .files
102 }
103
104 pub fn lexical_rank_with_stats(
107 &self,
108 query_trigrams: &[u32],
109 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
110 max_files: usize,
111 ) -> LexicalRankResult {
112 if query_trigrams.is_empty() || max_files == 0 {
113 return LexicalRankResult::default();
114 }
115
116 let mut non_zero: Vec<(u32, usize)> = query_trigrams
117 .iter()
118 .filter_map(|trigram| {
119 let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
120 (posting_count > 0).then_some((*trigram, posting_count))
121 })
122 .collect();
123 if non_zero.is_empty() {
124 return LexicalRankResult::default();
125 }
126
127 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
128 let selected_count = non_zero.len().min(3);
129 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
130
131 let mut candidate_ids = BTreeSet::new();
132 for (trigram, _) in non_zero.iter().take(selected_count) {
133 if let Some(postings) = self.postings.get(trigram) {
134 for posting in postings {
135 if self.is_active_file(posting.file_id) {
136 candidate_ids.insert(posting.file_id);
137 }
138 }
139 }
140 }
141 let pre_filter_candidate_count = candidate_ids.len();
142 let engine_capped = pre_filter_candidate_count > candidate_cap;
143 let filtered_candidates = candidate_ids
144 .into_iter()
145 .filter_map(|file_id| {
146 self.files
147 .get(file_id as usize)
148 .map(|entry| (file_id, entry))
149 })
150 .filter(|(_, entry)| {
151 if let Some(filter) = candidate_filter {
152 filter(&entry.path)
153 } else {
154 true
155 }
156 })
157 .collect::<Vec<_>>();
158
159 let mut ranked = Vec::new();
160 for (file_id, entry) in filtered_candidates.into_iter().take(candidate_cap) {
161 let score = lexical_score(self, query_trigrams, file_id);
162 if score > 0.0 {
163 ranked.push((entry.path.clone(), score));
164 }
165 }
166
167 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
168 ranked.truncate(max_files);
169 LexicalRankResult {
170 files: ranked,
171 engine_capped,
172 }
173 }
174}
175
/// One entry in a trigram's posting list: a file containing the trigram plus two
/// one-byte summary masks.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Id of the file (index into `SearchIndex::files`).
    pub file_id: u32,
    /// OR of `mask_for_next_char` over every occurrence of the trigram in the file.
    pub next_mask: u8,
    /// OR of `mask_for_position` over every occurrence of the trigram in the file.
    pub loc_mask: u8,
}
182
/// Metadata recorded for one tracked file.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Path of the file; empty for a slot whose file has been removed.
    pub path: PathBuf,
    /// File size in bytes.
    pub size: u64,
    /// Last-modified time captured when the entry was created.
    pub modified: SystemTime,
    /// BLAKE3 hash of the indexed content (the zero hash until content is hashed).
    pub content_hash: blake3::Hash,
}
190
/// A single match reported by a grep search.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// File the match was found in.
    pub file: PathBuf,
    /// Line of the match. NOTE(review): numbering base is set by the matcher, not visible here.
    pub line: u32,
    /// Column of the match. NOTE(review): numbering base is set by the matcher, not visible here.
    pub column: u32,
    /// Text of the line containing the match.
    pub line_text: String,
    /// Text of the matched span itself.
    pub match_text: String,
}
199
/// Matches and bookkeeping counters produced by a grep search.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Matches returned to the caller.
    pub matches: Vec<GrepMatch>,
    /// Value of the match counter when the search finished.
    pub total_matches: usize,
    /// Number of candidate files that were scanned.
    pub files_searched: usize,
    /// Number of scanned files with at least one match.
    pub files_with_matches: usize,
    /// `Ready` or `Building`, taken from the index's `ready` flag at search time.
    pub index_status: IndexStatus,
    /// Set by the per-file search when results were cut short.
    pub truncated: bool,
    /// True when the pattern yielded no trigram constraints, so every active file was a candidate.
    pub fully_degraded: bool,
    /// True when the search stopped early after hitting its internal stop threshold.
    pub engine_capped: bool,
}
211
/// State of the search index as reported alongside search results.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    Ready,
    Building,
    Fallback,
    Disabled,
}

impl IndexStatus {
    /// Returns the variant name as a static string.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Ready => "Ready",
            Self::Building => "Building",
            Self::Fallback => "Fallback",
            Self::Disabled => "Disabled",
        }
    }
}
230
/// Trigram constraints derived from a search pattern, consumed by `SearchIndex::candidates`.
///
/// An empty query (no AND trigrams and no OR groups) selects every active file.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Trigrams whose posting lists are all intersected.
    pub and_trigrams: Vec<u32>,
    /// Groups of alternatives: the posting lists within a group are unioned, and each
    /// group's union is intersected with the running candidate set.
    pub or_groups: Vec<Vec<u32>>,
    /// Optional per-trigram posting filter for entries of `and_trigrams`.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Optional per-trigram posting filters, indexed in parallel with `or_groups`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
238
/// Pair of one-byte masks attached to a trigram.
///
/// While indexing, the masks accumulate (by bitwise OR) the next-character and position
/// bits of every occurrence; in a query they are handed to the posting lookup as a filter.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    next_mask: u8,
    loc_mask: u8,
}
244
/// Byte-level intermediate form of a query.
///
/// NOTE(review): not used in the visible part of the file; presumably filled while
/// decomposing a regex and later converted into a `RegexQuery` — confirm against its users.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    /// Byte runs on the AND side of the query.
    and_runs: Vec<Vec<u8>>,
    /// Groups of alternative byte runs on the OR side of the query.
    or_groups: Vec<Vec<Vec<u8>>>,
}
250
/// Optional include/exclude glob sets applied to file paths.
///
/// The `Default` value has neither set and is what the index build walks the project with.
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    /// Globs a path is matched against for inclusion, when present.
    includes: Option<GlobSet>,
    /// Globs a path is matched against for exclusion, when present.
    excludes: Option<GlobSet>,
}
256
/// Root directory for a search together with a flag for index use.
///
/// NOTE(review): not used in the visible part of the file; field meanings below are read
/// from the names only — confirm against the code that constructs it.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    /// Directory the search is rooted at.
    pub root: PathBuf,
    /// Whether the index should be used for this scope.
    pub use_index: bool,
}
262
/// Internal counterpart of `GrepMatch` whose path is held in an `Arc`.
///
/// `search_grep` converts each of these into a `GrepMatch` (cloning the path) once the
/// matches have been sorted.
#[derive(Clone, Debug)]
struct SharedGrepMatch {
    file: Arc<PathBuf>,
    line: u32,
    column: u32,
    line_text: String,
    match_text: String,
}
271
/// Matcher used to scan candidate files, built from a `CompiledPattern` in `search_grep`.
#[derive(Clone, Debug)]
enum SearchMatcher {
    /// Literal search taken from `CompiledPattern::Literal`.
    Literal(LiteralSearch),
    /// Byte-oriented regex taken from `CompiledPattern::Regex`.
    Regex(Regex),
}
277
278impl SearchIndex {
279 pub fn new() -> Self {
280 SearchIndex {
281 postings: HashMap::new(),
282 files: Vec::new(),
283 path_to_id: HashMap::new(),
284 ready: false,
285 project_root: PathBuf::new(),
286 git_head: None,
287 max_file_size: DEFAULT_MAX_FILE_SIZE,
288 ignore_rules_fingerprint: String::new(),
289 file_trigrams: HashMap::new(),
290 unindexed_files: HashSet::new(),
291 }
292 }
293
/// Builds an index for `root` with the default per-file size limit
/// (`DEFAULT_MAX_FILE_SIZE`); see `build_with_limit`.
pub fn build(root: &Path) -> Self {
    Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
}
297
298 pub fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
299 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
300 let mut index = SearchIndex {
301 project_root: project_root.clone(),
302 max_file_size,
303 ignore_rules_fingerprint: ignore_rules_fingerprint(&project_root),
304 ..SearchIndex::new()
305 };
306
307 let filters = PathFilters::default();
308 for path in walk_project_files(&project_root, &filters) {
309 index.update_file(&path);
310 }
311
312 index.git_head = current_git_head(&project_root);
313 index.ready = true;
314 index
315 }
316
317 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
318 self.remove_file(path);
319
320 let file_id = match self.allocate_file_id(path, content.len() as u64) {
321 Some(file_id) => file_id,
322 None => return,
323 };
324 if let Some(file) = self.files.get_mut(file_id as usize) {
325 file.content_hash = cache_freshness::hash_bytes(content);
326 }
327
328 let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
329 for (trigram, next_char, position) in extract_trigrams(content) {
330 let entry = trigram_map.entry(trigram).or_default();
331 entry.next_mask |= mask_for_next_char(next_char);
332 entry.loc_mask |= mask_for_position(position);
333 }
334
335 let mut file_trigrams = Vec::with_capacity(trigram_map.len());
336 for (trigram, filter) in trigram_map {
337 let postings = self.postings.entry(trigram).or_default();
338 postings.push(Posting {
339 file_id,
340 next_mask: filter.next_mask,
341 loc_mask: filter.loc_mask,
342 });
343 if postings.len() > 1
347 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
348 {
349 postings.sort_unstable_by_key(|p| p.file_id);
350 }
351 file_trigrams.push(trigram);
352 }
353
354 self.file_trigrams.insert(file_id, file_trigrams);
355 self.unindexed_files.remove(&file_id);
356 }
357
358 pub fn remove_file(&mut self, path: &Path) {
359 let canonical_path = canonicalize_existing_or_deleted_path(path);
360 let file_id = if let Some(file_id) = self.path_to_id.remove(path) {
361 file_id
362 } else if canonical_path.as_path() != path {
363 let Some(file_id) = self.path_to_id.remove(&canonical_path) else {
364 return;
365 };
366 file_id
367 } else {
368 return;
369 };
370
371 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
372 for trigram in trigrams {
373 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
374 postings.retain(|posting| posting.file_id != file_id);
375 postings.is_empty()
376 } else {
377 false
378 };
379
380 if should_remove {
381 self.postings.remove(&trigram);
382 }
383 }
384 }
385
386 self.unindexed_files.remove(&file_id);
387 if let Some(file) = self.files.get_mut(file_id as usize) {
388 file.path = PathBuf::new();
389 file.size = 0;
390 file.modified = UNIX_EPOCH;
391 file.content_hash = cache_freshness::zero_hash();
392 }
393 }
394
395 pub fn update_file(&mut self, path: &Path) {
396 self.remove_file(path);
397
398 let metadata = match fs::metadata(path) {
399 Ok(metadata) if metadata.is_file() => metadata,
400 _ => return,
401 };
402
403 if is_binary_path(path, metadata.len()) {
404 self.track_unindexed_file(path, &metadata);
405 return;
406 }
407
408 if metadata.len() > self.max_file_size {
409 self.track_unindexed_file(path, &metadata);
410 return;
411 }
412
413 let content = match fs::read(path) {
414 Ok(content) => content,
415 Err(_) => return,
416 };
417
418 if is_binary_bytes(&content) {
419 self.track_unindexed_file(path, &metadata);
420 return;
421 }
422
423 self.index_file(path, &content);
424 }
425
426 pub fn grep(
427 &self,
428 pattern: &str,
429 case_sensitive: bool,
430 include: &[String],
431 exclude: &[String],
432 search_root: &Path,
433 max_results: usize,
434 ) -> GrepResult {
435 match pattern_compile::compile(
436 pattern,
437 CompileOpts {
438 case_insensitive: !case_sensitive,
439 ..CompileOpts::default()
440 },
441 ) {
442 CompileResult::Ok(compiled) => {
443 self.search_grep(&compiled, include, exclude, search_root, max_results)
444 }
445 CompileResult::InvalidPattern { .. } | CompileResult::UnsupportedSyntax { .. } => {
446 self.empty_grep_result()
447 }
448 }
449 }
450
451 pub fn search_grep(
452 &self,
453 pattern: &CompiledPattern,
454 include: &[String],
455 exclude: &[String],
456 search_root: &Path,
457 max_results: usize,
458 ) -> GrepResult {
459 let matcher = match pattern {
460 CompiledPattern::Literal(literal) => SearchMatcher::Literal(literal.clone()),
461 CompiledPattern::Regex { compiled, .. } => SearchMatcher::Regex(compiled.clone()),
462 };
463
464 let filters = match build_path_filters(include, exclude) {
465 Ok(filters) => filters,
466 Err(_) => PathFilters::default(),
467 };
468 let search_root = canonicalize_or_normalize(search_root);
469
470 let raw_pattern = pattern.raw_pattern_for_trigrams();
471 let query = if pattern.case_insensitive() && !raw_pattern.is_ascii() {
472 RegexQuery::default()
473 } else {
474 decompose_regex(&raw_pattern)
475 };
476 let fully_degraded = query.and_trigrams.is_empty() && query.or_groups.is_empty();
477 let candidate_ids = self.candidates(&query);
478
479 let candidate_files: Vec<&FileEntry> = candidate_ids
480 .into_iter()
481 .filter_map(|file_id| self.files.get(file_id as usize))
482 .filter(|file| !file.path.as_os_str().is_empty())
483 .filter(|file| is_within_search_root(&search_root, &file.path))
484 .filter(|file| filters.matches(&self.project_root, &file.path))
485 .collect();
486
487 let total_matches = AtomicUsize::new(0);
488 let files_searched = AtomicUsize::new(0);
489 let files_with_matches = AtomicUsize::new(0);
490 let truncated = AtomicBool::new(false);
491 let engine_capped = AtomicBool::new(false);
492 let stop_after = max_results.saturating_mul(2);
493
494 let mut matches = if candidate_files.len() > 10 {
495 candidate_files
496 .par_iter()
497 .map(|file| {
498 search_candidate_file(
499 file,
500 &matcher,
501 max_results,
502 stop_after,
503 &total_matches,
504 &files_searched,
505 &files_with_matches,
506 &truncated,
507 &engine_capped,
508 )
509 })
510 .reduce(Vec::new, |mut left, mut right| {
511 left.append(&mut right);
512 left
513 })
514 } else {
515 let mut matches = Vec::new();
516 for file in candidate_files {
517 matches.extend(search_candidate_file(
518 file,
519 &matcher,
520 max_results,
521 stop_after,
522 &total_matches,
523 &files_searched,
524 &files_with_matches,
525 &truncated,
526 &engine_capped,
527 ));
528
529 if should_stop_search(&truncated, &total_matches, stop_after) {
530 engine_capped.store(true, Ordering::Relaxed);
531 break;
532 }
533 }
534 matches
535 };
536
537 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
538 self.path_to_id
539 .get(path)
540 .and_then(|file_id| self.files.get(*file_id as usize))
541 .map(|file| file.modified)
542 });
543
544 let matches = matches
545 .into_iter()
546 .map(|matched| GrepMatch {
547 file: matched.file.as_ref().clone(),
548 line: matched.line,
549 column: matched.column,
550 line_text: matched.line_text,
551 match_text: matched.match_text,
552 })
553 .collect();
554
555 GrepResult {
556 total_matches: total_matches.load(Ordering::Relaxed),
557 matches,
558 files_searched: files_searched.load(Ordering::Relaxed),
559 files_with_matches: files_with_matches.load(Ordering::Relaxed),
560 index_status: if self.ready {
561 IndexStatus::Ready
562 } else {
563 IndexStatus::Building
564 },
565 truncated: truncated.load(Ordering::Relaxed),
566 fully_degraded,
567 engine_capped: engine_capped.load(Ordering::Relaxed),
568 }
569 }
570
571 fn empty_grep_result(&self) -> GrepResult {
572 GrepResult {
573 matches: Vec::new(),
574 total_matches: 0,
575 files_searched: 0,
576 files_with_matches: 0,
577 index_status: if self.ready {
578 IndexStatus::Ready
579 } else {
580 IndexStatus::Building
581 },
582 truncated: false,
583 fully_degraded: false,
584 engine_capped: false,
585 }
586 }
587
588 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
589 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
590 Ok(filters) => filters,
591 Err(_) => return Vec::new(),
592 };
593 let search_root = canonicalize_or_normalize(search_root);
594 let mut entries = self
595 .files
596 .iter()
597 .filter(|file| !file.path.as_os_str().is_empty())
598 .filter(|file| is_within_search_root(&search_root, &file.path))
599 .filter(|file| filters.matches(&self.project_root, &file.path))
600 .map(|file| (file.path.clone(), file.modified))
601 .collect::<Vec<_>>();
602
603 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
604 right_mtime
605 .cmp(left_mtime)
606 .then_with(|| left_path.cmp(right_path))
607 });
608
609 entries.into_iter().map(|(path, _)| path).collect()
610 }
611
612 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
613 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
614 return self.active_file_ids();
615 }
616
617 let mut and_trigrams = query.and_trigrams.clone();
618 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
619
620 let mut current: Option<Vec<u32>> = None;
621
622 for trigram in and_trigrams {
623 let filter = query.and_filters.get(&trigram).copied();
624 let matches = self.postings_for_trigram(trigram, filter);
625 current = Some(match current.take() {
626 Some(existing) => intersect_sorted_ids(&existing, &matches),
627 None => matches,
628 });
629
630 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
631 break;
632 }
633 }
634
635 let mut current = current.unwrap_or_else(|| self.active_file_ids());
636
637 for (index, group) in query.or_groups.iter().enumerate() {
638 let mut group_matches = Vec::new();
639 let filters = query.or_filters.get(index);
640
641 for trigram in group {
642 let filter = filters.and_then(|filters| filters.get(trigram).copied());
643 let matches = self.postings_for_trigram(*trigram, filter);
644 if group_matches.is_empty() {
645 group_matches = matches;
646 } else {
647 group_matches = union_sorted_ids(&group_matches, &matches);
648 }
649 }
650
651 current = intersect_sorted_ids(¤t, &group_matches);
652 if current.is_empty() {
653 break;
654 }
655 }
656
657 let mut unindexed = self
658 .unindexed_files
659 .iter()
660 .copied()
661 .filter(|file_id| self.is_active_file(*file_id))
662 .collect::<Vec<_>>();
663 if !unindexed.is_empty() {
664 unindexed.sort_unstable();
665 current = union_sorted_ids(¤t, &unindexed);
666 }
667
668 current
669 }
670
/// Serializes the index to `cache.bin` inside `cache_dir`.
///
/// The write is best effort: every failure is swallowed and leaves any existing
/// `cache.bin` in place. The data is first written to a uniquely named temporary file,
/// synced, and then renamed over `cache.bin`.
///
/// `git_head` is the value stored in the cache header (an absent head is stored as an
/// empty string); the index's own `git_head` field is not consulted.
///
/// Layout of the file, in write order:
/// 1. `CACHE_MAGIC`, `INDEX_VERSION`, byte length of the postings section.
/// 2. Postings section: `INDEX_MAGIC`, version, header lengths, `max_file_size`, file
///    count, git head, project root, ignore fingerprint, one record per active file,
///    the postings blob (length-prefixed), and a trailing CRC32 of the section.
/// 3. Lookup section: `LOOKUP_MAGIC`, version, entry count, one
///    `(trigram, offset, count)` record per trigram, and a trailing CRC32.
pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
    if fs::create_dir_all(cache_dir).is_err() {
        return;
    }

    let cache_path = cache_dir.join("cache.bin");
    // Process id plus a nanosecond timestamp keeps the temporary name unique.
    let tmp_cache = cache_dir.join(format!(
        "cache.bin.tmp.{}.{}",
        std::process::id(),
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or(Duration::ZERO)
            .as_nanos()
    ));

    // Active files are renumbered densely (0..n) in the cache; `id_map` translates the
    // in-memory ids to the ids written to disk.
    let active_ids = self.active_file_ids();
    let mut id_map = HashMap::new();
    for (new_id, old_id) in active_ids.iter().enumerate() {
        let Ok(new_id_u32) = u32::try_from(new_id) else {
            return;
        };
        id_map.insert(*old_id, new_id_u32);
    }

    let write_result = (|| -> std::io::Result<()> {
        // The postings section is assembled in memory so its CRC32 can be appended.
        let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));

        postings_writer.write_all(INDEX_MAGIC)?;
        write_u32(&mut postings_writer, INDEX_VERSION)?;

        let head = git_head.unwrap_or_default();
        let root = self.project_root.to_string_lossy();
        // Compute the fingerprint on demand when the index does not carry one.
        let ignore_fingerprint = if self.ignore_rules_fingerprint.is_empty() {
            ignore_rules_fingerprint(&self.project_root)
        } else {
            self.ignore_rules_fingerprint.clone()
        };
        let head_len = u32::try_from(head.len())
            .map_err(|_| std::io::Error::other("git head too large to cache"))?;
        let root_len = u32::try_from(root.len())
            .map_err(|_| std::io::Error::other("project root too large to cache"))?;
        let ignore_fingerprint_len = u32::try_from(ignore_fingerprint.len())
            .map_err(|_| std::io::Error::other("ignore fingerprint too large to cache"))?;
        let file_count = u32::try_from(active_ids.len())
            .map_err(|_| std::io::Error::other("too many files to cache"))?;

        // Fixed-size header fields first, then the variable-length strings.
        write_u32(&mut postings_writer, head_len)?;
        write_u32(&mut postings_writer, root_len)?;
        write_u32(&mut postings_writer, ignore_fingerprint_len)?;
        write_u64(&mut postings_writer, self.max_file_size)?;
        write_u32(&mut postings_writer, file_count)?;
        postings_writer.write_all(head.as_bytes())?;
        postings_writer.write_all(root.as_bytes())?;
        postings_writer.write_all(ignore_fingerprint.as_bytes())?;

        // One record per active file, in the same order as the dense ids in `id_map`.
        for old_id in &active_ids {
            let Some(file) = self.files.get(*old_id as usize) else {
                return Err(std::io::Error::other("missing file entry for cache write"));
            };
            // Paths are stored relative to the project root; anything outside it
            // aborts the write.
            let path =
                cache_relative_path(&self.project_root, &file.path).ok_or_else(|| {
                    std::io::Error::other(format!(
                        "refusing to cache path outside project root: {}",
                        file.path.display()
                    ))
                })?;
            let path = path.to_string_lossy();
            let path_len = u32::try_from(path.len())
                .map_err(|_| std::io::Error::other("cached path too large"))?;
            // Modification times before the epoch are stored as the epoch itself.
            let modified = file
                .modified
                .duration_since(UNIX_EPOCH)
                .unwrap_or(Duration::ZERO);
            let unindexed = if self.unindexed_files.contains(old_id) {
                1u8
            } else {
                0u8
            };

            postings_writer.write_all(&[unindexed])?;
            write_u32(&mut postings_writer, path_len)?;
            write_u64(&mut postings_writer, file.size)?;
            write_u64(&mut postings_writer, modified.as_secs())?;
            write_u32(&mut postings_writer, modified.subsec_nanos())?;
            postings_writer.write_all(file.content_hash.as_bytes())?;
            postings_writer.write_all(path.as_bytes())?;
        }

        let mut lookup_entries = Vec::new();
        let mut postings_blob = Vec::new();
        // Trigrams are written in ascending order so the output does not depend on
        // hash-map iteration order.
        let mut sorted_postings: Vec<_> = self.postings.iter().collect();
        sorted_postings.sort_by_key(|(trigram, _)| **trigram);

        for (trigram, postings) in sorted_postings {
            let offset = u64::try_from(postings_blob.len())
                .map_err(|_| std::io::Error::other("postings blob too large"))?;
            let mut count = 0u32;

            for posting in postings {
                // Postings of files that are not active are not written.
                let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                    continue;
                };

                postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                postings_blob.push(posting.next_mask);
                postings_blob.push(posting.loc_mask);
                count = count.saturating_add(1);
            }

            // Trigrams left without any posting get no lookup entry.
            if count > 0 {
                lookup_entries.push((*trigram, offset, count));
            }
        }

        write_u64(
            &mut postings_writer,
            u64::try_from(postings_blob.len())
                .map_err(|_| std::io::Error::other("postings blob too large"))?,
        )?;
        postings_writer.write_all(&postings_blob)?;
        postings_writer.flush()?;
        let mut postings_blob_file = postings_writer
            .into_inner()
            .map_err(|error| std::io::Error::other(error.to_string()))?
            .into_inner();
        // CRC32 over the whole postings section, appended little-endian.
        let checksum = crc32fast::hash(&postings_blob_file);
        postings_blob_file.extend_from_slice(&checksum.to_le_bytes());

        let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
        let entry_count = u32::try_from(lookup_entries.len())
            .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

        lookup_writer.write_all(LOOKUP_MAGIC)?;
        write_u32(&mut lookup_writer, INDEX_VERSION)?;
        write_u32(&mut lookup_writer, entry_count)?;

        for (trigram, offset, count) in lookup_entries {
            write_u32(&mut lookup_writer, trigram)?;
            write_u64(&mut lookup_writer, offset)?;
            write_u32(&mut lookup_writer, count)?;
        }

        lookup_writer.flush()?;
        let mut lookup_blob_file = lookup_writer
            .into_inner()
            .map_err(|error| std::io::Error::other(error.to_string()))?
            .into_inner();
        // CRC32 over the whole lookup section, appended little-endian.
        let checksum = crc32fast::hash(&lookup_blob_file);
        lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());

        // Only the postings section is length-prefixed; the lookup section is
        // everything that follows it in the file.
        let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
        write_u32(&mut cache_writer, CACHE_MAGIC)?;
        write_u32(&mut cache_writer, INDEX_VERSION)?;
        write_u64(
            &mut cache_writer,
            u64::try_from(postings_blob_file.len())
                .map_err(|_| std::io::Error::other("postings section too large"))?,
        )?;
        cache_writer.write_all(&postings_blob_file)?;
        cache_writer.write_all(&lookup_blob_file)?;
        cache_writer.flush()?;
        // Sync the file contents before the rename publishes them as `cache.bin`.
        cache_writer.get_ref().sync_all()?;
        drop(cache_writer);
        fs::rename(&tmp_cache, &cache_path)?;

        Ok(())
    })();

    // Best-effort cleanup of the temporary file when anything above failed.
    if write_result.is_err() {
        let _ = fs::remove_file(&tmp_cache);
    }
}
843
844 pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
845 debug_assert!(current_canonical_root.is_absolute());
846 let cache_path = cache_dir.join("cache.bin");
847 let cache_bytes = fs::read(&cache_path).ok()?;
848 if cache_bytes.len() < 16 {
849 return None;
850 }
851 let mut header = Cursor::new(&cache_bytes);
852 if read_u32(&mut header).ok()? != CACHE_MAGIC {
853 return None;
854 }
855 if read_u32(&mut header).ok()? != INDEX_VERSION {
856 return None;
857 }
858 let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
859 let start = usize::try_from(header.position()).ok()?;
860 let postings_end = start.checked_add(postings_len_total)?;
861 if postings_end > cache_bytes.len() {
862 return None;
863 }
864 let postings_bytes = &cache_bytes[start..postings_end];
865 let lookup_bytes = &cache_bytes[postings_end..];
866 let lookup_len_total = lookup_bytes.len();
867 let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
868 let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
869 if postings_len_total < 4 || lookup_len_total < 4 {
870 return None;
871 }
872 verify_crc32_bytes_slice(postings_bytes).ok()?;
873 verify_crc32_bytes_slice(lookup_bytes).ok()?;
874
875 let mut magic = [0u8; 8];
876 postings_reader.read_exact(&mut magic).ok()?;
877 if &magic != INDEX_MAGIC {
878 return None;
879 }
880 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
881 return None;
882 }
883
884 let head_len = read_u32(&mut postings_reader).ok()? as usize;
885 let root_len = read_u32(&mut postings_reader).ok()? as usize;
886 let ignore_fingerprint_len = read_u32(&mut postings_reader).ok()? as usize;
887 let max_file_size = read_u64(&mut postings_reader).ok()?;
888 let file_count = read_u32(&mut postings_reader).ok()? as usize;
889 if file_count > MAX_ENTRIES {
890 return None;
891 }
892 let postings_body_len = postings_len_total.checked_sub(4)?;
893 let lookup_body_len = lookup_len_total.checked_sub(4)?;
894
895 let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
896 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
897 if minimum_file_bytes > remaining_postings {
898 return None;
899 }
900
901 if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
902 return None;
903 }
904 let mut head_bytes = vec![0u8; head_len];
905 postings_reader.read_exact(&mut head_bytes).ok()?;
906 let git_head = String::from_utf8(head_bytes)
907 .ok()
908 .filter(|head| !head.is_empty());
909
910 if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
911 return None;
912 }
913 let mut root_bytes = vec![0u8; root_len];
914 postings_reader.read_exact(&mut root_bytes).ok()?;
915 let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
916 let project_root = current_canonical_root.to_path_buf();
917
918 if ignore_fingerprint_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
919 return None;
920 }
921 let mut ignore_fingerprint_bytes = vec![0u8; ignore_fingerprint_len];
922 postings_reader
923 .read_exact(&mut ignore_fingerprint_bytes)
924 .ok()?;
925 let stored_ignore_rules_fingerprint = String::from_utf8(ignore_fingerprint_bytes).ok()?;
926 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&project_root);
927 if stored_ignore_rules_fingerprint != current_ignore_rules_fingerprint {
928 return None;
929 }
930
931 let mut files = Vec::with_capacity(file_count);
932 let mut path_to_id = HashMap::new();
933 let mut unindexed_files = HashSet::new();
934
935 for file_id in 0..file_count {
936 let mut unindexed = [0u8; 1];
937 postings_reader.read_exact(&mut unindexed).ok()?;
938 let path_len = read_u32(&mut postings_reader).ok()? as usize;
939 let size = read_u64(&mut postings_reader).ok()?;
940 let secs = read_u64(&mut postings_reader).ok()?;
941 let nanos = read_u32(&mut postings_reader).ok()?;
942 let mut hash_bytes = [0u8; 32];
943 postings_reader.read_exact(&mut hash_bytes).ok()?;
944 let content_hash = blake3::Hash::from_bytes(hash_bytes);
945 if nanos >= 1_000_000_000 {
946 return None;
947 }
948 if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
949 return None;
950 }
951 let mut path_bytes = vec![0u8; path_len];
952 postings_reader.read_exact(&mut path_bytes).ok()?;
953 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
954 let full_path = cached_path_under_root(&project_root, &relative_path)?;
955 let file_id_u32 = u32::try_from(file_id).ok()?;
956
957 files.push(FileEntry {
958 path: full_path.clone(),
959 size,
960 modified: UNIX_EPOCH + Duration::new(secs, nanos),
961 content_hash,
962 });
963 path_to_id.insert(full_path, file_id_u32);
964 if unindexed[0] == 1 {
965 unindexed_files.insert(file_id_u32);
966 }
967 }
968
969 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
970 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
971 if postings_len > max_postings_bytes {
972 return None;
973 }
974 if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
975 return None;
976 }
977 let mut postings_blob = vec![0u8; postings_len];
978 postings_reader.read_exact(&mut postings_blob).ok()?;
979
980 let mut lookup_magic = [0u8; 8];
981 lookup_reader.read_exact(&mut lookup_magic).ok()?;
982 if &lookup_magic != LOOKUP_MAGIC {
983 return None;
984 }
985 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
986 return None;
987 }
988 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
989 if entry_count > MAX_ENTRIES {
990 return None;
991 }
992 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
993 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
994 if minimum_lookup_bytes > remaining_lookup {
995 return None;
996 }
997
998 let mut postings = HashMap::new();
999 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
1000
1001 for _ in 0..entry_count {
1002 let trigram = read_u32(&mut lookup_reader).ok()?;
1003 let offset = read_u64(&mut lookup_reader).ok()? as usize;
1004 let count = read_u32(&mut lookup_reader).ok()? as usize;
1005 if count > MAX_ENTRIES {
1006 return None;
1007 }
1008 let bytes_len = count.checked_mul(POSTING_BYTES)?;
1009 let end = offset.checked_add(bytes_len)?;
1010 if end > postings_blob.len() {
1011 return None;
1012 }
1013
1014 let mut trigram_postings = Vec::with_capacity(count);
1015 for chunk in postings_blob[offset..end].chunks_exact(6) {
1016 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
1017 let posting = Posting {
1018 file_id,
1019 next_mask: chunk[4],
1020 loc_mask: chunk[5],
1021 };
1022 trigram_postings.push(posting.clone());
1023 file_trigrams.entry(file_id).or_default().push(trigram);
1024 }
1025 postings.insert(trigram, trigram_postings);
1026 }
1027
1028 Some(SearchIndex {
1029 postings,
1030 files,
1031 path_to_id,
1032 ready: false,
1033 project_root,
1034 git_head,
1035 max_file_size,
1036 ignore_rules_fingerprint: current_ignore_rules_fingerprint,
1037 file_trigrams,
1038 unindexed_files,
1039 })
1040 }
1041
/// Returns the git head recorded in this index, if any.
pub fn stored_git_head(&self) -> Option<&str> {
    self.git_head.as_deref()
}
1045
/// Overwrites the `ready` flag, which searches report as `IndexStatus::Ready` or
/// `IndexStatus::Building`.
pub(crate) fn set_ready(&mut self, ready: bool) {
    self.ready = ready;
}
1049
/// Records `current_head` as the index's git head, runs `verify_file_mtimes` on the
/// index, and then marks it ready.
pub(crate) fn verify_against_disk(&mut self, current_head: Option<String>) {
    self.git_head = current_head;
    verify_file_mtimes(self);
    self.ready = true;
}
1055
/// Public wrapper around the crate-private `verify_against_disk`, compiled only when
/// debug assertions are enabled and hidden from the generated documentation.
#[cfg(debug_assertions)]
#[doc(hidden)]
pub fn verify_against_disk_for_debug(&mut self, current_head: Option<String>) {
    self.verify_against_disk(current_head);
}
1061
1062 pub(crate) fn rebuild_or_refresh(
1063 root: &Path,
1064 max_file_size: u64,
1065 current_head: Option<String>,
1066 baseline: Option<SearchIndex>,
1067 ) -> Self {
1068 if let Some(mut baseline) = baseline {
1069 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1070 baseline.max_file_size = max_file_size;
1071 let current_ignore_rules_fingerprint = ignore_rules_fingerprint(&baseline.project_root);
1072 if baseline.ignore_rules_fingerprint != current_ignore_rules_fingerprint {
1073 return SearchIndex::build_with_limit(root, max_file_size);
1074 }
1075 baseline.ignore_rules_fingerprint = current_ignore_rules_fingerprint;
1076
1077 if baseline.git_head == current_head || current_head.is_none() {
1078 baseline.git_head = current_head;
1085 verify_file_mtimes(&mut baseline);
1086 baseline.ready = true;
1087 return baseline;
1088 }
1089
1090 if let (Some(previous), Some(current)) =
1091 (baseline.git_head.clone(), current_head.clone())
1092 {
1093 let project_root = baseline.project_root.clone();
1094 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1095 baseline.git_head = Some(current);
1096 verify_file_mtimes(&mut baseline);
1097 baseline.ready = true;
1098 return baseline;
1099 }
1100 }
1101 }
1102
1103 SearchIndex::build_with_limit(root, max_file_size)
1104 }
1105
1106 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1107 let file_id = u32::try_from(self.files.len()).ok()?;
1108 let metadata = fs::metadata(path).ok();
1109 let size = metadata
1110 .as_ref()
1111 .map_or(size_hint, |metadata| metadata.len());
1112 let modified = metadata
1113 .and_then(|metadata| metadata.modified().ok())
1114 .unwrap_or(UNIX_EPOCH);
1115
1116 self.files.push(FileEntry {
1117 path: path.to_path_buf(),
1118 size,
1119 modified,
1120 content_hash: cache_freshness::zero_hash(),
1121 });
1122 self.path_to_id.insert(path.to_path_buf(), file_id);
1123 Some(file_id)
1124 }
1125
1126 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1127 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1128 return;
1129 };
1130 self.unindexed_files.insert(file_id);
1131 self.file_trigrams.insert(file_id, Vec::new());
1132 }
1133
1134 fn active_file_ids(&self) -> Vec<u32> {
1135 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1136 ids.sort_unstable();
1137 ids
1138 }
1139
1140 fn is_active_file(&self, file_id: u32) -> bool {
1141 self.files
1142 .get(file_id as usize)
1143 .map(|file| !file.path.as_os_str().is_empty())
1144 .unwrap_or(false)
1145 }
1146
1147 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1148 let Some(postings) = self.postings.get(&trigram) else {
1149 return Vec::new();
1150 };
1151
1152 let mut matches = Vec::with_capacity(postings.len());
1153
1154 for posting in postings {
1155 if let Some(filter) = filter {
1156 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1159 continue;
1160 }
1161 }
1166 if self.is_active_file(posting.file_id) {
1167 matches.push(posting.file_id);
1168 }
1169 }
1170
1171 matches
1172 }
1173}
1174
1175fn search_candidate_file(
1176 file: &FileEntry,
1177 matcher: &SearchMatcher,
1178 max_results: usize,
1179 stop_after: usize,
1180 total_matches: &AtomicUsize,
1181 files_searched: &AtomicUsize,
1182 files_with_matches: &AtomicUsize,
1183 truncated: &AtomicBool,
1184 engine_capped: &AtomicBool,
1185) -> Vec<SharedGrepMatch> {
1186 if should_stop_search(truncated, total_matches, stop_after) {
1187 engine_capped.store(true, Ordering::Relaxed);
1188 return Vec::new();
1189 }
1190
1191 let content = match read_indexed_file_bytes(&file.path) {
1192 Some(content) => content,
1193 None => return Vec::new(),
1194 };
1195 if is_binary_bytes(&content) {
1202 return Vec::new();
1203 }
1204 files_searched.fetch_add(1, Ordering::Relaxed);
1205
1206 let shared_path = Arc::new(file.path.clone());
1207 let mut matches = Vec::new();
1208 let mut line_starts = None;
1209 let mut seen_lines = HashSet::new();
1210 let mut matched_this_file = false;
1211
1212 match matcher {
1213 SearchMatcher::Literal(literal) if !literal.case_insensitive_ascii => {
1214 let needle = &literal.needle;
1215 let finder = memchr::memmem::Finder::new(needle);
1216 let mut start = 0;
1217
1218 while let Some(position) = finder.find(&content[start..]) {
1219 if should_stop_search(truncated, total_matches, stop_after) {
1220 engine_capped.store(true, Ordering::Relaxed);
1221 break;
1222 }
1223
1224 let offset = start + position;
1225 start = offset + 1;
1226
1227 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1228 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1229 if !seen_lines.insert(line) {
1230 continue;
1231 }
1232
1233 matched_this_file = true;
1234 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1235 if match_number > max_results {
1236 truncated.store(true, Ordering::Relaxed);
1237 break;
1238 }
1239
1240 let end = offset + needle.len();
1241 matches.push(SharedGrepMatch {
1242 file: shared_path.clone(),
1243 line,
1244 column,
1245 line_text,
1246 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1247 });
1248 }
1249 }
1250 SearchMatcher::Literal(literal) => {
1251 let needle = &literal.needle;
1252 let search_content = content.to_ascii_lowercase();
1253 let finder = memchr::memmem::Finder::new(needle);
1254 let mut start = 0;
1255
1256 while let Some(position) = finder.find(&search_content[start..]) {
1257 if should_stop_search(truncated, total_matches, stop_after) {
1258 engine_capped.store(true, Ordering::Relaxed);
1259 break;
1260 }
1261
1262 let offset = start + position;
1263 start = offset + 1;
1264
1265 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1266 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1267 if !seen_lines.insert(line) {
1268 continue;
1269 }
1270
1271 matched_this_file = true;
1272 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1273 if match_number > max_results {
1274 truncated.store(true, Ordering::Relaxed);
1275 break;
1276 }
1277
1278 let end = offset + needle.len();
1279 matches.push(SharedGrepMatch {
1280 file: shared_path.clone(),
1281 line,
1282 column,
1283 line_text,
1284 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1285 });
1286 }
1287 }
1288 SearchMatcher::Regex(regex) => {
1289 for matched in regex.find_iter(&content) {
1290 if should_stop_search(truncated, total_matches, stop_after) {
1291 engine_capped.store(true, Ordering::Relaxed);
1292 break;
1293 }
1294
1295 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1296 let (line, column, line_text) =
1297 line_details_bytes(&content, line_starts, matched.start());
1298 if !seen_lines.insert(line) {
1299 continue;
1300 }
1301
1302 matched_this_file = true;
1303 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1304 if match_number > max_results {
1305 truncated.store(true, Ordering::Relaxed);
1306 break;
1307 }
1308
1309 matches.push(SharedGrepMatch {
1310 file: shared_path.clone(),
1311 line,
1312 column,
1313 line_text,
1314 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1315 });
1316 }
1317 }
1318 }
1319
1320 if matched_this_file {
1321 files_with_matches.fetch_add(1, Ordering::Relaxed);
1322 }
1323
1324 matches
1325}
1326
/// Returns `true` only when the `truncated` flag is set and the running match
/// total has reached `stop_after`. Both atomics are read with relaxed
/// ordering; the total is not read when the flag is clear.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1334
/// Merge-intersects two ascending id slices, returning the ids present in
/// both. Each equal pair contributes one output element.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering as Cmp;

    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let (mut lhs, mut rhs) = (left, right);

    while let (Some((&a, lhs_rest)), Some((&b, rhs_rest))) = (lhs.split_first(), rhs.split_first())
    {
        match a.cmp(&b) {
            Cmp::Less => lhs = lhs_rest,
            Cmp::Greater => rhs = rhs_rest,
            Cmp::Equal => {
                common.push(a);
                lhs = lhs_rest;
                rhs = rhs_rest;
            }
        }
    }

    common
}
1354
/// Merges two ascending id slices into one ascending vector. An id that is
/// at the front of both inputs at the same time is emitted once.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering as Cmp;

    let mut combined = Vec::with_capacity(left.len() + right.len());
    let (mut lhs, mut rhs) = (left, right);

    while let (Some((&a, lhs_rest)), Some((&b, rhs_rest))) = (lhs.split_first(), rhs.split_first())
    {
        match a.cmp(&b) {
            Cmp::Less => {
                combined.push(a);
                lhs = lhs_rest;
            }
            Cmp::Greater => {
                combined.push(b);
                rhs = rhs_rest;
            }
            Cmp::Equal => {
                combined.push(a);
                lhs = lhs_rest;
                rhs = rhs_rest;
            }
        }
    }

    // At most one of the two tails is non-empty here.
    combined.extend_from_slice(lhs);
    combined.extend_from_slice(rhs);
    combined
}
1382
1383pub fn decompose_regex(pattern: &str) -> RegexQuery {
1384 let hir = match regex_syntax::parse(pattern) {
1385 Ok(hir) => hir,
1386 Err(_) => return RegexQuery::default(),
1387 };
1388
1389 let build = build_query(&hir);
1390 build.into_query()
1391}
1392
/// Packs three bytes into the low 24 bits of a `u32`, with `a` as the most
/// significant of the three and `c` as the least.
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1396
/// Maps ASCII uppercase letters to lowercase and leaves every other byte
/// unchanged.
pub fn normalize_char(c: u8) -> u8 {
    match c {
        b'A'..=b'Z' => c + (b'a' - b'A'),
        other => other,
    }
}
1400
1401pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1402 if content.len() < 3 {
1403 return Vec::new();
1404 }
1405
1406 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1407 for start in 0..=content.len() - 3 {
1408 let trigram = pack_trigram(
1409 normalize_char(content[start]),
1410 normalize_char(content[start + 1]),
1411 normalize_char(content[start + 2]),
1412 );
1413 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1414 trigrams.push((trigram, next_char, start));
1415 }
1416 trigrams
1417}
1418
1419pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1420 let mut seen = HashSet::new();
1421 let mut out = Vec::new();
1422 for token in tokens {
1423 for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1424 if seen.insert(trigram) {
1425 out.push(trigram);
1426 }
1427 }
1428 }
1429 out
1430}
1431
1432pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1433 if query_trigrams.is_empty() {
1434 return 0.0;
1435 }
1436
1437 let mut hits = 0u32;
1438 for &trigram in query_trigrams {
1439 if let Some(postings) = index.postings.get(&trigram) {
1440 if postings
1441 .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1442 .is_ok()
1443 {
1444 hits += 1;
1445 }
1446 }
1447 }
1448
1449 if hits == 0 {
1450 return 0.0;
1451 }
1452
1453 let file_trigram_count = index
1454 .file_trigrams
1455 .get(&file_id)
1456 .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1457 (hits as f32) / (1.0 + file_trigram_count.ln())
1458}
1459
1460pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1461 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1463 return PathBuf::from(override_dir)
1464 .join("index")
1465 .join(project_cache_key(project_root));
1466 }
1467 if let Some(dir) = storage_dir {
1469 return dir.join("index").join(project_cache_key(project_root));
1470 }
1471 let home = std::env::var_os("HOME")
1476 .or_else(|| std::env::var_os("USERPROFILE"))
1477 .map(PathBuf::from)
1478 .unwrap_or_else(std::env::temp_dir);
1479 home.join(".cache")
1480 .join("aft")
1481 .join("index")
1482 .join(project_cache_key(project_root))
1483}
1484
1485pub(crate) fn build_path_filters(
1486 include: &[String],
1487 exclude: &[String],
1488) -> Result<PathFilters, String> {
1489 Ok(PathFilters {
1490 includes: build_globset(include)?,
1491 excludes: build_globset(exclude)?,
1492 })
1493}
1494
1495pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1496 walk_project_files_from(root, root, filters)
1497}
1498
1499pub fn walk_project_files_bounded_default(
1500 root: &Path,
1501 max_files: usize,
1502) -> Result<Vec<PathBuf>, usize> {
1503 walk_project_files_from_inner(root, root, &PathFilters::default(), Some(max_files))
1504}
1505
1506pub(crate) fn walk_project_files_bounded_matching<F>(
1507 root: &Path,
1508 filters: &PathFilters,
1509 max_files: usize,
1510 matches_file: F,
1511) -> Result<Vec<PathBuf>, usize>
1512where
1513 F: Fn(&Path) -> bool,
1514{
1515 walk_project_files_from_inner_matching(root, root, filters, Some(max_files), matches_file)
1516}
1517
1518pub fn walk_project_files_bounded_default_matching<F>(
1519 root: &Path,
1520 max_files: usize,
1521 matches_file: F,
1522) -> Result<Vec<PathBuf>, usize>
1523where
1524 F: Fn(&Path) -> bool,
1525{
1526 walk_project_files_from_inner_matching(
1527 root,
1528 root,
1529 &PathFilters::default(),
1530 Some(max_files),
1531 matches_file,
1532 )
1533}
1534
1535pub(crate) fn walk_project_files_from(
1536 filter_root: &Path,
1537 search_root: &Path,
1538 filters: &PathFilters,
1539) -> Vec<PathBuf> {
1540 walk_project_files_from_inner(filter_root, search_root, filters, None)
1541 .expect("unbounded project walk cannot exceed a file limit")
1542}
1543
1544pub(crate) fn has_any_project_file_from(
1545 filter_root: &Path,
1546 search_root: &Path,
1547 filters: &PathFilters,
1548) -> bool {
1549 walk_project_files_from_inner(filter_root, search_root, filters, Some(0)).is_err()
1550}
1551
1552fn walk_project_files_from_inner(
1553 filter_root: &Path,
1554 search_root: &Path,
1555 filters: &PathFilters,
1556 max_files: Option<usize>,
1557) -> Result<Vec<PathBuf>, usize> {
1558 walk_project_files_from_inner_matching(filter_root, search_root, filters, max_files, |_| true)
1559}
1560
1561fn walk_project_files_from_inner_matching<F>(
1562 filter_root: &Path,
1563 search_root: &Path,
1564 filters: &PathFilters,
1565 max_files: Option<usize>,
1566 matches_file: F,
1567) -> Result<Vec<PathBuf>, usize>
1568where
1569 F: Fn(&Path) -> bool,
1570{
1571 let mut builder = WalkBuilder::new(search_root);
1572 builder
1573 .hidden(false)
1574 .git_ignore(true)
1575 .git_global(true)
1576 .git_exclude(true)
1577 .add_custom_ignore_filename(".aftignore")
1581 .filter_entry(|entry| {
1582 let name = entry.file_name().to_string_lossy();
1583 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1584 return !matches!(
1585 name.as_ref(),
1586 "node_modules"
1587 | "target"
1588 | "venv"
1589 | ".venv"
1590 | ".git"
1591 | "__pycache__"
1592 | ".tox"
1593 | "dist"
1594 | "build"
1595 );
1596 }
1597 true
1598 });
1599
1600 let mut files = Vec::new();
1601 for entry in builder.build().filter_map(|entry| entry.ok()) {
1602 if !entry
1603 .file_type()
1604 .map_or(false, |file_type| file_type.is_file())
1605 {
1606 continue;
1607 }
1608 let path = entry.into_path();
1609 if filters.matches(filter_root, &path) && matches_file(&path) {
1610 files.push(path);
1611 if max_files.is_some_and(|limit| files.len() > limit) {
1612 return Err(files.len());
1613 }
1614 }
1615 }
1616
1617 sort_paths_by_mtime_desc(&mut files);
1618 Ok(files)
1619}
1620
1621pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1622 let bytes = fs::read(path).ok()?;
1623 if is_binary_bytes(&bytes) {
1624 return None;
1625 }
1626 String::from_utf8(bytes).ok()
1627}
1628
/// Reads the whole file at `path` into memory, returning `None` on any I/O
/// error.
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    match fs::read(path) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
1632
/// Returns `path` with the `root` prefix removed, or `path` unchanged when it
/// does not start with `root`.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1638
1639pub(crate) fn cache_relative_path(root: &Path, path: &Path) -> Option<PathBuf> {
1640 let normalized_root = normalize_path(root);
1641 let normalized_path = normalize_path(path);
1642 let relative = normalized_path.strip_prefix(&normalized_root).ok()?;
1643 validate_cached_relative_path(relative)
1644}
1645
1646pub(crate) fn cached_path_under_root(root: &Path, relative_path: &Path) -> Option<PathBuf> {
1647 let relative = validate_cached_relative_path(relative_path)?;
1648 let normalized_root = normalize_path(root);
1649 let full_path = normalize_path(&normalized_root.join(relative));
1650
1651 match fs::canonicalize(&full_path) {
1652 Ok(canonical_path) => {
1653 if canonical_path.starts_with(&normalized_root) {
1654 return Some(full_path);
1655 }
1656
1657 let canonical_root = fs::canonicalize(&normalized_root).ok()?;
1658 canonical_path
1659 .starts_with(&canonical_root)
1660 .then_some(full_path)
1661 }
1662 Err(_) => full_path.starts_with(&normalized_root).then_some(full_path),
1663 }
1664}
1665
/// Validates a relative path read from (or destined for) the cache.
///
/// Returns the path rebuilt from its normal components, with `.` components
/// dropped. Returns `None` for absolute paths, for paths containing `..`, a
/// root or a prefix component, and for paths that end up empty.
pub(crate) fn validate_cached_relative_path(path: &Path) -> Option<PathBuf> {
    if path.is_absolute() {
        return None;
    }

    let mut normalized = PathBuf::new();
    for component in path.components() {
        if let Component::Normal(part) = component {
            normalized.push(part);
        } else if !matches!(component, Component::CurDir) {
            return None;
        }
    }

    if normalized.as_os_str().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1681
1682pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1695 use std::collections::HashMap;
1696 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1697 for path in paths.iter() {
1698 mtimes
1699 .entry(path.clone())
1700 .or_insert_with(|| path_modified_time(path));
1701 }
1702 paths.sort_by(|left, right| {
1703 let left_mtime = mtimes.get(left).and_then(|v| *v);
1704 let right_mtime = mtimes.get(right).and_then(|v| *v);
1705 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1706 });
1707}
1708
1709pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1712 use std::collections::HashMap;
1713 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1714 for m in matches.iter() {
1715 mtimes.entry(m.file.clone()).or_insert_with(|| {
1716 let resolved = resolve_match_path(project_root, &m.file);
1717 path_modified_time(&resolved)
1718 });
1719 }
1720 matches.sort_by(|left, right| {
1721 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1722 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1723 right_mtime
1724 .cmp(&left_mtime)
1725 .then_with(|| left.file.cmp(&right.file))
1726 .then_with(|| left.line.cmp(&right.line))
1727 .then_with(|| left.column.cmp(&right.column))
1728 });
1729}
1730
1731fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1736 matches: &mut [SharedGrepMatch],
1737 modified_for_path: F,
1738) where
1739 F: Fn(&Path) -> Option<SystemTime>,
1740{
1741 use std::collections::HashMap;
1742 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1743 for m in matches.iter() {
1744 let path = m.file.as_path().to_path_buf();
1745 mtimes
1746 .entry(path.clone())
1747 .or_insert_with(|| modified_for_path(&path));
1748 }
1749 matches.sort_by(|left, right| {
1750 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1751 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1752 right_mtime
1753 .cmp(&left_mtime)
1754 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1755 .then_with(|| left.line.cmp(&right.line))
1756 .then_with(|| left.column.cmp(&right.column))
1757 });
1758}
1759
1760pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1761 let resolved_project_root = canonicalize_or_normalize(project_root);
1762 let root = match path {
1763 Some(path) => {
1764 let path = PathBuf::from(path);
1765 if path.is_absolute() {
1766 canonicalize_or_normalize(&path)
1767 } else {
1768 normalize_path(&resolved_project_root.join(path))
1769 }
1770 }
1771 None => resolved_project_root.clone(),
1772 };
1773
1774 let use_index = is_within_search_root(&resolved_project_root, &root);
1775 SearchScope { root, use_index }
1776}
1777
1778pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1779 content_inspector::inspect(content).is_binary()
1780}
1781
1782pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1783 run_git(root, &["rev-parse", "HEAD"])
1784}
1785
1786pub fn project_cache_key(project_root: &Path) -> String {
1787 use sha2::{Digest, Sha256};
1788
1789 let mut hasher = Sha256::new();
1790
1791 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1792 hasher.update(root_commit.as_bytes());
1795 } else {
1796 let canonical_root = canonicalize_or_normalize(project_root);
1798 hasher.update(canonical_root.to_string_lossy().as_bytes());
1799 }
1800
1801 let digest = format!("{:x}", hasher.finalize());
1802 digest[..16].to_string()
1803}
1804
1805pub fn ignore_rules_fingerprint(project_root: &Path) -> String {
1813 use sha2::{Digest, Sha256};
1814
1815 let root = canonicalize_or_normalize(project_root);
1816 let mut files = Vec::new();
1817 collect_ignore_rule_files(&root, &mut files);
1818 if let Some(global_ignore) = ignore::gitignore::gitconfig_excludes_path() {
1819 if global_ignore.is_file() {
1820 files.push(global_ignore);
1821 }
1822 }
1823 let info_exclude = git_info_exclude_path(&root);
1824 if info_exclude.is_file() {
1825 files.push(info_exclude);
1826 }
1827 files.sort();
1828 files.dedup();
1829
1830 let mut hasher = Sha256::new();
1831 hasher.update(b"aft-ignore-rules-v1\0");
1832 for path in files {
1833 if let Some(relative) = cache_relative_path(&root, &path) {
1834 hasher.update(relative.to_string_lossy().as_bytes());
1835 } else {
1836 hasher.update(path.to_string_lossy().as_bytes());
1837 }
1838 hasher.update(b"\0");
1839 match fs::read(&path) {
1840 Ok(bytes) => hasher.update(&bytes),
1841 Err(error) => hasher.update(format!("read-error:{error}").as_bytes()),
1842 }
1843 hasher.update(b"\0");
1844 }
1845
1846 format!("{:x}", hasher.finalize())
1847}
1848
1849fn git_info_exclude_path(root: &Path) -> PathBuf {
1850 run_git(
1851 root,
1852 &["rev-parse", "--path-format=absolute", "--git-common-dir"],
1853 )
1854 .map(PathBuf::from)
1855 .unwrap_or_else(|| root.join(".git"))
1856 .join("info")
1857 .join("exclude")
1858}
1859
1860fn collect_ignore_rule_files(root: &Path, files: &mut Vec<PathBuf>) {
1861 let mut stack = vec![root.to_path_buf()];
1862 while let Some(dir) = stack.pop() {
1863 let Ok(entries) = fs::read_dir(&dir) else {
1864 continue;
1865 };
1866 for entry in entries.flatten() {
1867 let path = entry.path();
1868 let file_name = entry.file_name();
1869 if file_name == ".gitignore" || file_name == ".aftignore" {
1870 if path.is_file() {
1871 files.push(path);
1872 }
1873 continue;
1874 }
1875
1876 let Ok(file_type) = entry.file_type() else {
1877 continue;
1878 };
1879 if !file_type.is_dir() || file_type.is_symlink() {
1880 continue;
1881 }
1882 if ignore_rule_fingerprint_skips_dir(&file_name) {
1883 continue;
1884 }
1885 stack.push(path);
1886 }
1887 }
1888}
1889
/// Reports whether a directory with this name is skipped when collecting
/// ignore-rule files. Names that are not valid UTF-8 are never skipped.
fn ignore_rule_fingerprint_skips_dir(name: &std::ffi::OsStr) -> bool {
    const SKIPPED: [&str; 9] = [
        ".git",
        "node_modules",
        "target",
        "venv",
        ".venv",
        "__pycache__",
        ".tox",
        "dist",
        "build",
    ];
    name.to_str().is_some_and(|name| SKIPPED.contains(&name))
}
1904
1905impl PathFilters {
1906 fn matches(&self, root: &Path, path: &Path) -> bool {
1907 let relative = to_glob_path(&relative_to_root(root, path));
1908 if self
1909 .includes
1910 .as_ref()
1911 .is_some_and(|includes| !includes.is_match(&relative))
1912 {
1913 return false;
1914 }
1915 if self
1916 .excludes
1917 .as_ref()
1918 .is_some_and(|excludes| excludes.is_match(&relative))
1919 {
1920 return false;
1921 }
1922 true
1923 }
1924}
1925
1926fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1927 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1928}
1929
/// Returns `path` unchanged when it is absolute, and `project_root` joined
/// with `path` otherwise.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if !path.is_absolute() {
        return project_root.join(path);
    }
    path.to_path_buf()
}
1937
/// Returns the modification time of `path`, or `None` when its metadata or
/// modification time cannot be read.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1943
/// Lexically normalizes `path` without touching the filesystem.
///
/// `.` components are dropped. A `..` component removes the preceding normal
/// component when there is one; otherwise it is kept. A `..` therefore never
/// cancels an earlier `..`, so `../../a` stays `../../a`. A `..` that
/// directly follows a root or prefix is also kept as-is.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::ParentDir => {
                // Only a real directory name can be cancelled by `..`;
                // popping blindly would also remove a previously kept `..`.
                let cancels_previous =
                    matches!(result.components().next_back(), Some(Component::Normal(_)));
                if cancels_previous {
                    result.pop();
                } else {
                    result.push(component);
                }
            }
            Component::CurDir => {}
            _ => result.push(component),
        }
    }
    result
}
1959
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// When `path` itself cannot be canonicalized, its parent is canonicalized
/// and the file name re-appended. If the path has no parent or file name, or
/// the parent cannot be canonicalized either, `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
            Ok(canonical_parent) => canonical_parent.join(file_name),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
1976
1977fn verify_file_mtimes(index: &mut SearchIndex) {
1980 let filters = PathFilters::default();
1981 let current_files = walk_project_files(&index.project_root, &filters);
1982 let current_file_set: HashSet<PathBuf> = current_files.iter().cloned().collect();
1983 let mut stale_paths = Vec::new();
1984 let mut removed_paths = Vec::new();
1985
1986 for entry in &mut index.files {
1987 if entry.path.as_os_str().is_empty() {
1988 continue; }
1990 if !current_file_set.contains(&entry.path) {
1991 removed_paths.push(entry.path.clone());
1992 continue;
1993 }
1994 let cached = FileFreshness {
1995 mtime: entry.modified,
1996 size: entry.size,
1997 content_hash: entry.content_hash,
1998 };
1999 match cache_freshness::verify_file_strict(&entry.path, &cached) {
2000 FreshnessVerdict::HotFresh => {}
2001 FreshnessVerdict::ContentFresh {
2002 new_mtime,
2003 new_size,
2004 } => {
2005 entry.modified = new_mtime;
2006 entry.size = new_size;
2007 }
2008 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
2009 stale_paths.push(entry.path.clone())
2010 }
2011 }
2012 }
2013
2014 for path in &removed_paths {
2015 index.remove_file(path);
2016 }
2017
2018 for path in &stale_paths {
2022 if current_file_set.contains(path) {
2023 index.update_file(path);
2024 } else {
2025 index.remove_file(path);
2026 }
2027 }
2028
2029 for path in current_files {
2031 if !index.path_to_id.contains_key(&path) {
2032 index.update_file(&path);
2033 }
2034 }
2035
2036 if !stale_paths.is_empty() {
2037 crate::slog_info!(
2038 "search index: refreshed {} stale file(s) from disk cache",
2039 stale_paths.len()
2040 );
2041 }
2042}
2043
2044fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
2045 normalize_path(path).starts_with(normalize_path(search_root))
2046}
2047
2048impl QueryBuild {
2049 fn into_query(self) -> RegexQuery {
2050 let mut query = RegexQuery::default();
2051
2052 for run in self.and_runs {
2053 add_run_to_and_query(&mut query, &run);
2054 }
2055
2056 for group in self.or_groups {
2057 let mut trigrams = BTreeSet::new();
2058 let mut filters = HashMap::new();
2059 for run in group {
2060 for (trigram, filter) in trigram_filters(&run) {
2061 trigrams.insert(trigram);
2062 merge_filter(filters.entry(trigram).or_default(), filter);
2063 }
2064 }
2065 if !trigrams.is_empty() {
2066 query.or_groups.push(trigrams.into_iter().collect());
2067 query.or_filters.push(filters);
2068 }
2069 }
2070
2071 query
2072 }
2073}
2074
2075fn build_query(hir: &Hir) -> QueryBuild {
2076 match hir.kind() {
2077 HirKind::Literal(literal) => {
2078 if literal.0.len() >= 3 {
2079 QueryBuild {
2080 and_runs: vec![literal.0.to_vec()],
2081 or_groups: Vec::new(),
2082 }
2083 } else {
2084 QueryBuild::default()
2085 }
2086 }
2087 HirKind::Capture(capture) => build_query(&capture.sub),
2088 HirKind::Concat(parts) => {
2089 let mut build = QueryBuild::default();
2090 for part in parts {
2091 let part_build = build_query(part);
2092 build.and_runs.extend(part_build.and_runs);
2093 build.or_groups.extend(part_build.or_groups);
2094 }
2095 build
2096 }
2097 HirKind::Alternation(parts) => {
2098 let mut group = Vec::new();
2099 for part in parts {
2100 let Some(mut choices) = guaranteed_run_choices(part) else {
2101 return QueryBuild::default();
2102 };
2103 group.append(&mut choices);
2104 }
2105 if group.is_empty() {
2106 QueryBuild::default()
2107 } else {
2108 QueryBuild {
2109 and_runs: Vec::new(),
2110 or_groups: vec![group],
2111 }
2112 }
2113 }
2114 HirKind::Repetition(repetition) => {
2115 if repetition.min == 0 {
2116 QueryBuild::default()
2117 } else {
2118 build_query(&repetition.sub)
2119 }
2120 }
2121 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
2122 }
2123}
2124
2125fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
2126 match hir.kind() {
2127 HirKind::Literal(literal) => {
2128 if literal.0.len() >= 3 {
2129 Some(vec![literal.0.to_vec()])
2130 } else {
2131 None
2132 }
2133 }
2134 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
2135 HirKind::Concat(parts) => {
2136 let mut runs = Vec::new();
2137 for part in parts {
2138 if let Some(mut part_runs) = guaranteed_run_choices(part) {
2139 runs.append(&mut part_runs);
2140 }
2141 }
2142 if runs.is_empty() {
2143 None
2144 } else {
2145 Some(runs)
2146 }
2147 }
2148 HirKind::Alternation(parts) => {
2149 let mut runs = Vec::new();
2150 for part in parts {
2151 let Some(mut part_runs) = guaranteed_run_choices(part) else {
2152 return None;
2153 };
2154 runs.append(&mut part_runs);
2155 }
2156 if runs.is_empty() {
2157 None
2158 } else {
2159 Some(runs)
2160 }
2161 }
2162 HirKind::Repetition(repetition) => {
2163 if repetition.min == 0 {
2164 None
2165 } else {
2166 guaranteed_run_choices(&repetition.sub)
2167 }
2168 }
2169 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
2170 }
2171}
2172
2173fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
2174 for (trigram, filter) in trigram_filters(run) {
2175 if !query.and_trigrams.contains(&trigram) {
2176 query.and_trigrams.push(trigram);
2177 }
2178 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
2179 }
2180}
2181
2182fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
2183 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
2184 for (trigram, next_char, position) in extract_trigrams(run) {
2185 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
2186 if next_char != EOF_SENTINEL {
2187 entry.next_mask |= mask_for_next_char(next_char);
2188 }
2189 entry.loc_mask |= mask_for_position(position);
2190 }
2191 filters.into_iter().collect()
2192}
2193
2194fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
2195 target.next_mask |= filter.next_mask;
2196 target.loc_mask |= filter.loc_mask;
2197}
2198
2199fn mask_for_next_char(next_char: u8) -> u8 {
2200 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
2201 1u8 << bit
2202}
2203
/// Maps a byte position to a single-bit mask; the bit index is the position
/// modulo 8.
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
2207
2208fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
2209 if patterns.is_empty() {
2210 return Ok(None);
2211 }
2212
2213 let mut builder = GlobSetBuilder::new();
2214 for pattern in patterns {
2215 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
2216 builder.add(glob);
2217 }
2218 builder.build().map(Some).map_err(|error| error.to_string())
2219}
2220
/// Reads exactly four bytes from `reader` and decodes them as a
/// little-endian `u32`. Fails with the reader's error when fewer than four
/// bytes are available.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut raw = [0u8; std::mem::size_of::<u32>()];
    reader
        .read_exact(&mut raw)
        .map(|()| u32::from_le_bytes(raw))
}
2226
/// Reads exactly eight bytes from `reader` and decodes them as a
/// little-endian `u64`.
///
/// # Errors
/// Propagates the `read_exact` error, e.g. when fewer than eight bytes remain.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut raw = [0u8; std::mem::size_of::<u64>()];
    match reader.read_exact(&mut raw) {
        Ok(()) => Ok(u64::from_le_bytes(raw)),
        Err(error) => Err(error),
    }
}
2232
/// Writes `value` to `writer` as four little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let encoded: [u8; 4] = value.to_le_bytes();
    writer.write_all(&encoded)
}
2236
/// Writes `value` to `writer` as eight little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let encoded: [u8; 8] = value.to_le_bytes();
    writer.write_all(&encoded)
}
2240
2241fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
2242 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
2243 return Err(std::io::Error::other("search index checksum missing"));
2244 };
2245 let expected = u32::from_le_bytes(*stored);
2246 let actual = crc32fast::hash(body);
2247 if actual != expected {
2248 return Err(std::io::Error::other("search index checksum mismatch"));
2249 }
2250 Ok(())
2251}
2252
/// Returns how many bytes lie between the reader's current position and
/// `total_len`.
///
/// Yields `None` when the position cannot be queried, does not fit in
/// `usize`, or is already past `total_len`.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    let consumed = usize::try_from(position).ok()?;
    total_len.checked_sub(consumed)
}
2257
/// Runs `git -C <root> <args...>` and returns its trimmed standard output.
///
/// Returns `None` when the process cannot be spawned, exits unsuccessfully,
/// prints non-UTF-8 output, or prints only whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let mut command = Command::new("git");
    command.arg("-C").arg(root).args(args);

    let output = command
        .output()
        .ok()
        .filter(|output| output.status.success())?;
    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
2276
2277fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
2278 let diff_range = format!("{}..{}", from, to);
2279 let output = match Command::new("git")
2280 .arg("-C")
2281 .arg(root)
2282 .args(["diff", "--name-status", "-M", &diff_range])
2283 .output()
2284 {
2285 Ok(output) => output,
2286 Err(_) => return false,
2287 };
2288
2289 if !output.status.success() {
2290 return false;
2291 }
2292
2293 let Ok(diff) = String::from_utf8(output.stdout) else {
2294 return false;
2295 };
2296
2297 for line in diff.lines().map(str::trim).filter(|line| !line.is_empty()) {
2298 let mut fields = line.split('\t');
2299 let Some(status) = fields.next() else {
2300 continue;
2301 };
2302
2303 if status.starts_with('R') {
2304 let Some(old_path) = fields
2305 .next()
2306 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2307 else {
2308 continue;
2309 };
2310 let Some(new_path) = fields
2311 .next()
2312 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2313 else {
2314 continue;
2315 };
2316 index.remove_file(&old_path);
2317 index.update_file(&new_path);
2318 continue;
2319 }
2320
2321 let Some(path) = fields
2322 .next()
2323 .and_then(|path| cached_path_under_root(root, &PathBuf::from(path)))
2324 else {
2325 continue;
2326 };
2327 if status.starts_with('D') || !path.exists() {
2328 index.remove_file(&path);
2329 } else {
2330 index.update_file(&path);
2331 }
2332 }
2333
2334 true
2335}
2336
2337fn is_binary_path(path: &Path, size: u64) -> bool {
2338 if size == 0 {
2339 return false;
2340 }
2341
2342 let mut file = match File::open(path) {
2343 Ok(file) => file,
2344 Err(_) => return true,
2345 };
2346
2347 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2348 match file.read(&mut preview) {
2349 Ok(read) => is_binary_bytes(&preview[..read]),
2350 Err(_) => true,
2351 }
2352}
2353
/// Returns the byte offset at which each line of `content` begins.
///
/// Offset 0 is always present; every `\n` adds the offset just past it, so a
/// trailing newline yields a final entry equal to `content.len()`.
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    let after_newlines = content
        .iter()
        .enumerate()
        .filter(|(_, byte)| **byte == b'\n')
        .map(|(index, _)| index + 1);
    std::iter::once(0usize).chain(after_newlines).collect()
}
2363
/// Resolves a byte `offset` into `(line number, column, line text)`.
///
/// `line_starts` is binary-searched for the line containing `offset`. Line
/// and column are 1-based; the column counts characters of the lossily
/// decoded bytes between the line start and `offset`. The returned text
/// excludes the terminating `\n` and a single trailing `\r`.
///
/// # Panics
/// Panics if `offset` is outside `content` or lies before the resolved line
/// start, because the slices are indexed directly.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    let tail = &content[line_start..];
    let line_end = tail
        .iter()
        .position(|&byte| byte == b'\n')
        .map_or(content.len(), |length| line_start + length);

    let raw_line = &content[line_start..line_end];
    let line_slice = raw_line.strip_suffix(b"\r").unwrap_or(raw_line);
    let line_text = String::from_utf8_lossy(line_slice).into_owned();

    let prefix = String::from_utf8_lossy(&content[line_start..offset]);
    let column = prefix.chars().count() as u32 + 1;

    (line_index as u32 + 1, column, line_text)
}
2386
/// Renders `path` as a lossily decoded string with every backslash replaced
/// by a forward slash.
fn to_glob_path(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    lossy.replace('\\', "/")
}
2390
2391#[cfg(test)]
2392mod tests {
2393 use std::process::Command;
2394
2395 use super::*;
2396
2397 #[test]
2398 fn cached_path_under_root_allows_missing_lexical_child() {
2399 let dir = tempfile::tempdir().expect("create temp dir");
2400 let project = dir.path().join("project");
2401 fs::create_dir_all(&project).expect("create project dir");
2402 let root = fs::canonicalize(&project).expect("canonicalize project");
2403
2404 let path = cached_path_under_root(&root, Path::new("future/file.rs"))
2405 .expect("missing child should fall back to lexical validation");
2406
2407 assert_eq!(path, root.join("future/file.rs"));
2408 }
2409
2410 #[cfg(unix)]
2411 #[test]
2412 fn cached_path_under_root_rejects_symlink_escape() {
2413 let dir = tempfile::tempdir().expect("create temp dir");
2414 let project = dir.path().join("project");
2415 let outside = dir.path().join("outside");
2416 fs::create_dir_all(&project).expect("create project dir");
2417 fs::create_dir_all(&outside).expect("create outside dir");
2418 fs::write(outside.join("secret.txt"), "secret").expect("write outside file");
2419 std::os::unix::fs::symlink(&outside, project.join("link")).expect("create symlink");
2420 let root = fs::canonicalize(&project).expect("canonicalize project");
2421
2422 assert!(cached_path_under_root(&root, Path::new("link/secret.txt")).is_none());
2423 }
2424
2425 #[test]
2426 fn extract_trigrams_tracks_next_char_and_position() {
2427 let trigrams = extract_trigrams(b"Rust");
2428 assert_eq!(trigrams.len(), 2);
2429 assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2430 assert_eq!(
2431 trigrams[1],
2432 (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2433 );
2434 }
2435
2436 #[test]
2437 fn decompose_regex_extracts_literals_and_alternations() {
2438 let query = decompose_regex("abc(def|ghi)xyz");
2439 assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2440 assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2441 assert_eq!(query.or_groups.len(), 1);
2442 assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2443 assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2444 }
2445
2446 #[test]
2447 fn candidates_intersect_posting_lists() {
2448 let mut index = SearchIndex::new();
2449 let dir = tempfile::tempdir().expect("create temp dir");
2450 let alpha = dir.path().join("alpha.txt");
2451 let beta = dir.path().join("beta.txt");
2452 fs::write(&alpha, "abcdef").expect("write alpha");
2453 fs::write(&beta, "abcxyz").expect("write beta");
2454 index.project_root = dir.path().to_path_buf();
2455 index.index_file(&alpha, b"abcdef");
2456 index.index_file(&beta, b"abcxyz");
2457
2458 let query = RegexQuery {
2459 and_trigrams: vec![
2460 pack_trigram(b'a', b'b', b'c'),
2461 pack_trigram(b'd', b'e', b'f'),
2462 ],
2463 ..RegexQuery::default()
2464 };
2465
2466 let candidates = index.candidates(&query);
2467 assert_eq!(candidates.len(), 1);
2468 assert_eq!(index.files[candidates[0] as usize].path, alpha);
2469 }
2470
2471 #[test]
2472 fn candidates_apply_bloom_filters() {
2473 let mut index = SearchIndex::new();
2474 let dir = tempfile::tempdir().expect("create temp dir");
2475 let file = dir.path().join("sample.txt");
2476 fs::write(&file, "abcd efgh").expect("write sample");
2477 index.project_root = dir.path().to_path_buf();
2478 index.index_file(&file, b"abcd efgh");
2479
2480 let trigram = pack_trigram(b'a', b'b', b'c');
2481 let matching_filter = PostingFilter {
2482 next_mask: mask_for_next_char(b'd'),
2483 loc_mask: mask_for_position(0),
2484 };
2485 let non_matching_filter = PostingFilter {
2486 next_mask: mask_for_next_char(b'z'),
2487 loc_mask: mask_for_position(0),
2488 };
2489
2490 assert_eq!(
2491 index
2492 .postings_for_trigram(trigram, Some(matching_filter))
2493 .len(),
2494 1
2495 );
2496 assert!(index
2497 .postings_for_trigram(trigram, Some(non_matching_filter))
2498 .is_empty());
2499 }
2500
2501 #[test]
2502 fn disk_round_trip_preserves_postings_and_files() {
2503 let dir = tempfile::tempdir().expect("create temp dir");
2504 let project = dir.path().join("project");
2505 fs::create_dir_all(&project).expect("create project dir");
2506 let file = project.join("src.txt");
2507 fs::write(&file, "abcdef").expect("write source");
2508
2509 let mut index = SearchIndex::build(&project);
2510 index.git_head = Some("deadbeef".to_string());
2511 let cache_dir = dir.path().join("cache");
2512 index.write_to_disk(&cache_dir, index.git_head.as_deref());
2513
2514 let loaded =
2515 SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
2516 assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2517 assert_eq!(loaded.files.len(), 1);
2518 assert_eq!(
2519 relative_to_root(&loaded.project_root, &loaded.files[0].path),
2520 PathBuf::from("src.txt")
2521 );
2522 assert_eq!(loaded.postings.len(), index.postings.len());
2523 assert!(loaded
2524 .postings
2525 .contains_key(&pack_trigram(b'a', b'b', b'c')));
2526 }
2527
2528 #[test]
2529 fn cache_path_helpers_reject_absolute_and_parent_paths() {
2530 let root = PathBuf::from("/tmp/aft-project");
2531
2532 assert_eq!(
2533 cache_relative_path(&root, &root.join("src/lib.rs")),
2534 Some(PathBuf::from("src/lib.rs"))
2535 );
2536 assert!(cache_relative_path(&root, Path::new("/tmp/outside.rs")).is_none());
2537 assert!(cached_path_under_root(&root, Path::new("../outside.rs")).is_none());
2538 assert!(cached_path_under_root(&root, Path::new("/tmp/outside.rs")).is_none());
2539 assert_eq!(
2540 cached_path_under_root(&root, Path::new("src/./lib.rs")),
2541 Some(root.join("src/lib.rs"))
2542 );
2543 }
2544
2545 #[test]
2546 fn refresh_after_head_change_removes_renames_and_detects_local_files() {
2547 let dir = tempfile::tempdir().expect("create temp dir");
2548 let project = dir.path().join("project");
2549 fs::create_dir_all(&project).expect("create project dir");
2550 let canonical_project = fs::canonicalize(&project).expect("canonical project");
2551 fs::write(project.join("old.txt"), "old token\n").expect("write old");
2552 fs::write(project.join("unchanged.txt"), "before\n").expect("write unchanged");
2553
2554 Command::new("git")
2555 .arg("init")
2556 .arg(&project)
2557 .status()
2558 .expect("git init");
2559 for args in [
2560 ["config", "user.email", "aft@example.invalid"],
2561 ["config", "user.name", "AFT Test"],
2562 ] {
2563 Command::new("git")
2564 .arg("-C")
2565 .arg(&project)
2566 .args(args)
2567 .status()
2568 .expect("git config");
2569 }
2570 Command::new("git")
2571 .arg("-C")
2572 .arg(&project)
2573 .args(["add", "."])
2574 .status()
2575 .expect("git add initial");
2576 Command::new("git")
2577 .arg("-C")
2578 .arg(&project)
2579 .args(["commit", "-m", "initial"])
2580 .status()
2581 .expect("git commit initial");
2582 let previous = run_git(&project, &["rev-parse", "HEAD"]).expect("previous head");
2583 let mut baseline = SearchIndex::build(&project);
2584 baseline.git_head = Some(previous.clone());
2585
2586 fs::rename(project.join("old.txt"), project.join("new.txt")).expect("rename file");
2587 Command::new("git")
2588 .arg("-C")
2589 .arg(&project)
2590 .args(["add", "-A"])
2591 .status()
2592 .expect("git add rename");
2593 Command::new("git")
2594 .arg("-C")
2595 .arg(&project)
2596 .args(["commit", "-m", "rename"])
2597 .status()
2598 .expect("git commit rename");
2599 let current = run_git(&project, &["rev-parse", "HEAD"]).expect("current head");
2600
2601 fs::write(project.join("unchanged.txt"), "after local edit\n").expect("local edit");
2602 fs::write(project.join("untracked.txt"), "untracked token\n").expect("untracked");
2603
2604 let refreshed = SearchIndex::rebuild_or_refresh(
2605 &project,
2606 DEFAULT_MAX_FILE_SIZE,
2607 Some(current),
2608 Some(baseline),
2609 );
2610
2611 assert!(!refreshed
2612 .path_to_id
2613 .contains_key(&canonical_project.join("old.txt")));
2614 assert!(refreshed
2615 .path_to_id
2616 .contains_key(&canonical_project.join("new.txt")));
2617 assert!(refreshed
2618 .path_to_id
2619 .contains_key(&canonical_project.join("untracked.txt")));
2620 let matches = refreshed.grep("after local edit", true, &[], &[], &canonical_project, 10);
2621 assert_eq!(matches.matches.len(), 1);
2622 }
2623
2624 #[test]
2625 fn read_from_disk_rejects_corrupt_postings_checksum() {
2626 let dir = tempfile::tempdir().expect("create temp dir");
2627 let project = dir.path().join("project");
2628 fs::create_dir_all(&project).expect("create project dir");
2629 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2630
2631 let index = SearchIndex::build(&project);
2632 let cache_dir = dir.path().join("cache");
2633 index.write_to_disk(&cache_dir, None);
2634
2635 let cache_path = cache_dir.join("cache.bin");
2636 let mut bytes = fs::read(&cache_path).expect("read cache");
2637 let middle = bytes.len() / 2;
2638 bytes[middle] ^= 0xff;
2639 fs::write(&cache_path, bytes).expect("write corrupted cache");
2640
2641 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
2642 }
2643
2644 #[test]
2645 fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2646 let dir = tempfile::tempdir().expect("create temp dir");
2647 let project = dir.path().join("project");
2648 fs::create_dir_all(&project).expect("create project dir");
2649 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2650
2651 let index = SearchIndex::build(&project);
2652 let cache_dir = dir.path().join("cache");
2653 index.write_to_disk(&cache_dir, None);
2654
2655 assert!(cache_dir.join("cache.bin").is_file());
2656 assert!(fs::read_dir(&cache_dir)
2657 .expect("read cache dir")
2658 .all(|entry| !entry
2659 .expect("cache entry")
2660 .file_name()
2661 .to_string_lossy()
2662 .contains(".tmp.")));
2663 }
2664
2665 #[test]
2666 fn concurrent_search_index_writes_do_not_corrupt() {
2667 let dir = tempfile::tempdir().expect("create temp dir");
2668 let project = dir.path().join("project");
2669 fs::create_dir_all(&project).expect("create project dir");
2670 fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2671 let cache_dir = dir.path().join("cache");
2672
2673 let a_project = project.clone();
2674 let a_cache = cache_dir.clone();
2675 let a = std::thread::spawn(move || {
2676 let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2677 let index = SearchIndex::build(&a_project);
2678 index.write_to_disk(&a_cache, None);
2679 });
2680 let b_project = project.clone();
2681 let b_cache = cache_dir.clone();
2682 let b = std::thread::spawn(move || {
2683 let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2684 let index = SearchIndex::build(&b_project);
2685 index.write_to_disk(&b_cache, None);
2686 });
2687 a.join().expect("writer a");
2688 b.join().expect("writer b");
2689
2690 assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
2691 }
2692
2693 #[test]
2694 fn search_index_atomic_rename_survives_partial_write() {
2695 let dir = tempfile::tempdir().expect("create temp dir");
2696 let cache_dir = dir.path().join("cache");
2697 fs::create_dir_all(&cache_dir).expect("create cache dir");
2698 fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2699
2700 assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2701 }
2702
2703 #[test]
2704 fn project_cache_key_includes_checkout_path() {
2705 let dir = tempfile::tempdir().expect("create temp dir");
2706 let source = dir.path().join("source");
2707 fs::create_dir_all(&source).expect("create source repo dir");
2708 fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2709
2710 assert!(Command::new("git")
2711 .current_dir(&source)
2712 .args(["init"])
2713 .status()
2714 .expect("init git repo")
2715 .success());
2716 assert!(Command::new("git")
2717 .current_dir(&source)
2718 .args(["add", "."])
2719 .status()
2720 .expect("git add")
2721 .success());
2722 assert!(Command::new("git")
2723 .current_dir(&source)
2724 .args([
2725 "-c",
2726 "user.name=AFT Tests",
2727 "-c",
2728 "user.email=aft-tests@example.com",
2729 "commit",
2730 "-m",
2731 "initial",
2732 ])
2733 .status()
2734 .expect("git commit")
2735 .success());
2736
2737 let clone = dir.path().join("clone");
2738 assert!(Command::new("git")
2739 .args(["clone", "--quiet"])
2740 .arg(&source)
2741 .arg(&clone)
2742 .status()
2743 .expect("git clone")
2744 .success());
2745
2746 let source_key = project_cache_key(&source);
2747 let clone_key = project_cache_key(&clone);
2748
2749 assert_eq!(source_key.len(), 16);
2750 assert_eq!(clone_key.len(), 16);
2751 assert_eq!(source_key, clone_key);
2753 }
2754
2755 #[test]
2756 fn git_head_unchanged_picks_up_local_edits() {
2757 let dir = tempfile::tempdir().expect("create temp dir");
2758 let project = dir.path().join("repo");
2759 fs::create_dir_all(&project).expect("create repo dir");
2760 let file = project.join("tracked.txt");
2761 fs::write(&file, "oldtoken\n").expect("write file");
2762 assert!(Command::new("git")
2763 .current_dir(&project)
2764 .arg("init")
2765 .status()
2766 .unwrap()
2767 .success());
2768 assert!(Command::new("git")
2769 .current_dir(&project)
2770 .args(["add", "."])
2771 .status()
2772 .unwrap()
2773 .success());
2774 assert!(Command::new("git")
2775 .current_dir(&project)
2776 .args([
2777 "-c",
2778 "user.name=AFT Tests",
2779 "-c",
2780 "user.email=aft-tests@example.com",
2781 "commit",
2782 "-m",
2783 "initial"
2784 ])
2785 .status()
2786 .unwrap()
2787 .success());
2788 let head = current_git_head(&project);
2789 let mut baseline = SearchIndex::build(&project);
2790 baseline.git_head = head.clone();
2791 fs::write(&file, "newtoken\n").expect("edit tracked file");
2792
2793 let refreshed =
2794 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2795 let result = refreshed.grep("newtoken", true, &[], &[], &project, 10);
2796
2797 assert_eq!(result.total_matches, 1);
2798 }
2799
2800 #[test]
2801 fn non_git_project_reuses_cache_when_files_unchanged() {
2802 let dir = tempfile::tempdir().expect("create temp dir");
2803 let project = dir.path().join("project");
2804 fs::create_dir_all(&project).expect("create project dir");
2805 fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2806 let baseline = SearchIndex::build(&project);
2807 let baseline_file_count = baseline.file_count();
2808
2809 let refreshed =
2810 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2811
2812 assert_eq!(refreshed.file_count(), baseline_file_count);
2813 assert_eq!(
2814 refreshed
2815 .grep("unchangedtoken", true, &[], &[], &project, 10)
2816 .total_matches,
2817 1
2818 );
2819 }
2820
2821 #[test]
2822 fn resolve_search_scope_disables_index_for_external_path() {
2823 let dir = tempfile::tempdir().expect("create temp dir");
2824 let project = dir.path().join("project");
2825 let outside = dir.path().join("outside");
2826 fs::create_dir_all(&project).expect("create project dir");
2827 fs::create_dir_all(&outside).expect("create outside dir");
2828
2829 let scope = resolve_search_scope(&project, outside.to_str());
2830
2831 assert_eq!(
2832 scope.root,
2833 fs::canonicalize(&outside).expect("canonicalize outside")
2834 );
2835 assert!(!scope.use_index);
2836 }
2837
2838 #[test]
2839 fn grep_filters_matches_to_search_root() {
2840 let dir = tempfile::tempdir().expect("create temp dir");
2841 let project = dir.path().join("project");
2842 let src = project.join("src");
2843 let docs = project.join("docs");
2844 fs::create_dir_all(&src).expect("create src dir");
2845 fs::create_dir_all(&docs).expect("create docs dir");
2846 fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2847 fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2848
2849 let index = SearchIndex::build(&project);
2850 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2851
2852 assert_eq!(result.files_searched, 1);
2853 assert_eq!(result.files_with_matches, 1);
2854 assert_eq!(result.matches.len(), 1);
2855 let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2857 assert_eq!(result.matches[0].file, expected);
2858 }
2859
2860 #[test]
2861 fn grep_deduplicates_multiple_matches_on_same_line() {
2862 let dir = tempfile::tempdir().expect("create temp dir");
2863 let project = dir.path().join("project");
2864 let src = project.join("src");
2865 fs::create_dir_all(&src).expect("create src dir");
2866 fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2867
2868 let index = SearchIndex::build(&project);
2869 let result = index.grep("SearchIndex", true, &[], &[], &src, 10);
2870
2871 assert_eq!(result.total_matches, 1);
2872 assert_eq!(result.matches.len(), 1);
2873 }
2874
2875 #[test]
2876 fn grep_case_insensitive_unicode_literal_matches_indexed_file() {
2877 let dir = tempfile::tempdir().expect("create temp dir");
2878 let project = dir.path().join("project");
2879 fs::create_dir_all(&project).expect("create project dir");
2880 let file = project.join("unicode.txt");
2881 fs::write(&file, "äbc\n").expect("write unicode file");
2882
2883 let index = SearchIndex::build(&project);
2884 let result = index.grep("Äbc", false, &[], &[], &project, 10);
2885
2886 assert_eq!(result.total_matches, 1);
2887 assert_eq!(result.matches.len(), 1);
2888 assert_eq!(
2889 result.matches[0].file,
2890 fs::canonicalize(file).expect("canonicalize unicode file")
2891 );
2892 }
2893
2894 #[test]
2895 fn refresh_reindexes_same_size_edit_with_preserved_mtime() {
2896 let dir = tempfile::tempdir().expect("create temp dir");
2897 let project = dir.path().join("project");
2898 fs::create_dir_all(&project).expect("create project dir");
2899 let file = project.join("tokens.txt");
2900 let original_mtime = filetime::FileTime::from_unix_time(1_700_000_000, 0);
2901 fs::write(&file, "alpha").expect("write original file");
2902 filetime::set_file_mtime(&file, original_mtime).expect("set original mtime");
2903
2904 let baseline = SearchIndex::build(&project);
2905 fs::write(&file, "bravo").expect("write same-size edit");
2906 filetime::set_file_mtime(&file, original_mtime).expect("restore original mtime");
2907
2908 let refreshed =
2909 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2910 let result = refreshed.grep("bravo", true, &[], &[], &project, 10);
2911 let canonical_file = fs::canonicalize(&file).expect("canonicalize edited file");
2912 let refreshed_id = *refreshed
2913 .path_to_id
2914 .get(&canonical_file)
2915 .expect("file remains indexed");
2916
2917 assert_eq!(result.total_matches, 1);
2918 assert!(refreshed
2919 .postings_for_trigram(pack_trigram(b'b', b'r', b'a'), None)
2920 .contains(&refreshed_id));
2921 assert!(!refreshed
2922 .postings_for_trigram(pack_trigram(b'a', b'l', b'p'), None)
2923 .contains(&refreshed_id));
2924 }
2925
2926 #[test]
2927 fn grep_reports_total_matches_before_truncation() {
2928 let dir = tempfile::tempdir().expect("create temp dir");
2929 let project = dir.path().join("project");
2930 let src = project.join("src");
2931 fs::create_dir_all(&src).expect("create src dir");
2932 fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2933
2934 let index = SearchIndex::build(&project);
2935 let result = index.grep("SearchIndex", true, &[], &[], &src, 1);
2936
2937 assert_eq!(result.total_matches, 2);
2938 assert_eq!(result.matches.len(), 1);
2939 assert!(result.truncated);
2940 }
2941
2942 #[test]
2943 fn glob_filters_results_to_search_root() {
2944 let dir = tempfile::tempdir().expect("create temp dir");
2945 let project = dir.path().join("project");
2946 let src = project.join("src");
2947 let scripts = project.join("scripts");
2948 fs::create_dir_all(&src).expect("create src dir");
2949 fs::create_dir_all(&scripts).expect("create scripts dir");
2950 fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2951 fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2952
2953 let index = SearchIndex::build(&project);
2954 let files = index.glob("**/*.rs", &src);
2955
2956 assert_eq!(
2957 files,
2958 vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2959 );
2960 }
2961
2962 #[test]
2963 fn glob_includes_hidden_and_binary_files() {
2964 let dir = tempfile::tempdir().expect("create temp dir");
2965 let project = dir.path().join("project");
2966 let hidden_dir = project.join(".hidden");
2967 fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2968 let hidden_file = hidden_dir.join("data.bin");
2969 fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2970
2971 let index = SearchIndex::build(&project);
2972 let files = index.glob("**/*.bin", &project);
2973
2974 assert_eq!(
2975 files,
2976 vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2977 );
2978 }
2979
    /// Hand-assembles a `cache.bin` whose two sections carry valid CRC32
    /// trailers but whose postings payload contains the value 1_000_000_000,
    /// and expects `read_from_disk` to refuse it.
    #[test]
    fn read_from_disk_rejects_invalid_nanos() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let cache_dir = dir.path().join("cache");
        fs::create_dir_all(&cache_dir).expect("create cache dir");

        // Postings section. NOTE(review): the meaning of the individual
        // fields is inferred from the constants and the test name, not from a
        // visible format definition — confirm against `write_to_disk`.
        let mut postings = Vec::new();
        postings.extend_from_slice(INDEX_MAGIC);
        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        postings.extend_from_slice(&0u32.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(b"/");
        postings.push(0u8);
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        // Presumably the sub-second nanoseconds field: 1_000_000_000 is one
        // past the largest valid nanosecond count — TODO confirm.
        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
        postings.extend_from_slice(b"a");
        postings.extend_from_slice(&0u64.to_le_bytes());

        // Lookup section: magic, version, and a zero u32.
        let mut lookup = Vec::new();
        lookup.extend_from_slice(LOOKUP_MAGIC);
        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        lookup.extend_from_slice(&0u32.to_le_bytes());

        // Append a correct CRC32 to each section so the file cannot be
        // rejected for a checksum mismatch.
        let postings_checksum = crc32fast::hash(&postings);
        postings.extend_from_slice(&postings_checksum.to_le_bytes());
        let lookup_checksum = crc32fast::hash(&lookup);
        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
        // Outer container: magic, version, postings length (checksum
        // included), then both sections back to back.
        let mut cache = Vec::new();
        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
        cache.extend_from_slice(&postings);
        cache.extend_from_slice(&lookup);
        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");

        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
    }
3021
3022 #[test]
3037 fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
3038 let dir = tempfile::tempdir().expect("create tempdir");
3042 let mut paths: Vec<PathBuf> = Vec::new();
3043 for i in 0..30 {
3044 let path = if i % 2 == 0 {
3046 let p = dir.path().join(format!("real-{i}.rs"));
3047 fs::write(&p, format!("// {i}\n")).expect("write");
3048 p
3049 } else {
3050 dir.path().join(format!("missing-{i}.rs"))
3051 };
3052 paths.push(path);
3053 }
3054
3055 for _ in 0..50 {
3058 let mut copy = paths.clone();
3059 sort_paths_by_mtime_desc(&mut copy);
3060 assert_eq!(copy.len(), paths.len());
3061 }
3062 }
3063
3064 #[test]
3068 fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
3069 let dir = tempfile::tempdir().expect("create tempdir");
3070 let mut matches: Vec<GrepMatch> = Vec::new();
3071 for i in 0..30 {
3072 let file = if i % 2 == 0 {
3073 let p = dir.path().join(format!("real-{i}.rs"));
3074 fs::write(&p, format!("// {i}\n")).expect("write");
3075 p
3076 } else {
3077 dir.path().join(format!("missing-{i}.rs"))
3078 };
3079 matches.push(GrepMatch {
3080 file,
3081 line: u32::try_from(i).unwrap_or(0),
3082 column: 0,
3083 line_text: format!("match {i}"),
3084 match_text: format!("match {i}"),
3085 });
3086 }
3087
3088 for _ in 0..50 {
3089 let mut copy = matches.clone();
3090 sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
3091 assert_eq!(copy.len(), matches.len());
3092 }
3093 }
3094}