1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
19
20const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
21const CACHE_MAGIC: u32 = 0x3144_4958; const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
23const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
24const INDEX_VERSION: u32 = 3;
25const PREVIEW_BYTES: usize = 8 * 1024;
26const EOF_SENTINEL: u8 = 0;
27const MAX_ENTRIES: usize = 10_000_000;
28const MIN_FILE_ENTRY_BYTES: usize = 57;
29const LOOKUP_ENTRY_BYTES: usize = 16;
30const POSTING_BYTES: usize = 6;
31
32pub struct CacheLock {
33 path: PathBuf,
34}
35
36impl CacheLock {
37 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
38 fs::create_dir_all(cache_dir)?;
39 let path = cache_dir.join("cache.lock");
40 for _ in 0..200 {
41 match fs::OpenOptions::new()
42 .write(true)
43 .create_new(true)
44 .open(&path)
45 {
46 Ok(mut file) => {
47 let _ = writeln!(file, "{}", std::process::id());
48 let _ = file.sync_all();
49 return Ok(Self { path });
50 }
51 Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => {
52 std::thread::sleep(Duration::from_millis(10));
53 }
54 Err(error) => return Err(error),
55 }
56 }
57 Err(std::io::Error::other(
58 "timed out acquiring search cache lock",
59 ))
60 }
61}
62
63impl Drop for CacheLock {
64 fn drop(&mut self) {
65 let _ = fs::remove_file(&self.path);
66 }
67}
68
69#[derive(Clone, Debug)]
70pub struct SearchIndex {
71 pub postings: HashMap<u32, Vec<Posting>>,
72 pub files: Vec<FileEntry>,
73 pub path_to_id: HashMap<PathBuf, u32>,
74 pub ready: bool,
75 project_root: PathBuf,
76 git_head: Option<String>,
77 max_file_size: u64,
78 pub file_trigrams: HashMap<u32, Vec<u32>>,
79 unindexed_files: HashSet<u32>,
80}
81
82impl SearchIndex {
83 pub fn file_count(&self) -> usize {
85 self.files.len()
86 }
87
88 pub fn trigram_count(&self) -> usize {
90 self.postings.len()
91 }
92
93 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
95 query_trigrams_from_tokens(tokens)
96 }
97
98 pub fn lexical_rank(
100 &self,
101 query_trigrams: &[u32],
102 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
103 max_files: usize,
104 ) -> Vec<(PathBuf, f32)> {
105 if query_trigrams.is_empty() || max_files == 0 {
106 return Vec::new();
107 }
108
109 let mut non_zero: Vec<(u32, usize)> = query_trigrams
110 .iter()
111 .filter_map(|trigram| {
112 let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
113 (posting_count > 0).then_some((*trigram, posting_count))
114 })
115 .collect();
116 if non_zero.is_empty() {
117 return Vec::new();
118 }
119
120 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
121 let selected_count = non_zero.len().min(3);
122 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
123
124 let mut candidate_ids = BTreeSet::new();
125 for (trigram, _) in non_zero.iter().take(selected_count) {
126 if let Some(postings) = self.postings.get(trigram) {
127 for posting in postings {
128 if self.is_active_file(posting.file_id) {
129 candidate_ids.insert(posting.file_id);
130 }
131 }
132 }
133 }
134
135 let mut ranked = Vec::new();
136 for file_id in candidate_ids.into_iter().take(candidate_cap) {
137 let Some(entry) = self.files.get(file_id as usize) else {
138 continue;
139 };
140 if let Some(filter) = candidate_filter {
141 if !filter(&entry.path) {
142 continue;
143 }
144 }
145 let score = lexical_score(self, query_trigrams, file_id);
146 if score > 0.0 {
147 ranked.push((entry.path.clone(), score));
148 }
149 }
150
151 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
152 ranked.truncate(max_files);
153 ranked
154 }
155}
156
157#[derive(Clone, Debug, PartialEq, Eq)]
158pub struct Posting {
159 pub file_id: u32,
160 pub next_mask: u8,
161 pub loc_mask: u8,
162}
163
164#[derive(Clone, Debug)]
165pub struct FileEntry {
166 pub path: PathBuf,
167 pub size: u64,
168 pub modified: SystemTime,
169 pub content_hash: blake3::Hash,
170}
171
172#[derive(Clone, Debug, PartialEq, Eq)]
173pub struct GrepMatch {
174 pub file: PathBuf,
175 pub line: u32,
176 pub column: u32,
177 pub line_text: String,
178 pub match_text: String,
179}
180
181#[derive(Clone, Debug)]
182pub struct GrepResult {
183 pub matches: Vec<GrepMatch>,
184 pub total_matches: usize,
185 pub files_searched: usize,
186 pub files_with_matches: usize,
187 pub index_status: IndexStatus,
188 pub truncated: bool,
189}
190
191#[derive(Clone, Copy, Debug, PartialEq, Eq)]
192pub enum IndexStatus {
193 Ready,
194 Building,
195 Fallback,
196}
197
198impl IndexStatus {
199 pub fn as_str(&self) -> &'static str {
200 match self {
201 IndexStatus::Ready => "Ready",
202 IndexStatus::Building => "Building",
203 IndexStatus::Fallback => "Fallback",
204 }
205 }
206}
207
208#[derive(Clone, Debug, Default)]
209pub struct RegexQuery {
210 pub and_trigrams: Vec<u32>,
211 pub or_groups: Vec<Vec<u32>>,
212 pub(crate) and_filters: HashMap<u32, PostingFilter>,
213 pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
214}
215
216#[derive(Clone, Copy, Debug, Default)]
217pub(crate) struct PostingFilter {
218 next_mask: u8,
219 loc_mask: u8,
220}
221
222#[derive(Clone, Debug, Default)]
223struct QueryBuild {
224 and_runs: Vec<Vec<u8>>,
225 or_groups: Vec<Vec<Vec<u8>>>,
226}
227
228#[derive(Clone, Debug, Default)]
229pub(crate) struct PathFilters {
230 includes: Option<GlobSet>,
231 excludes: Option<GlobSet>,
232}
233
234#[derive(Clone, Debug)]
235pub(crate) struct SearchScope {
236 pub root: PathBuf,
237 pub use_index: bool,
238}
239
240#[derive(Clone, Debug)]
241struct SharedGrepMatch {
242 file: Arc<PathBuf>,
243 line: u32,
244 column: u32,
245 line_text: String,
246 match_text: String,
247}
248
249#[derive(Clone, Debug)]
250enum SearchMatcher {
251 Literal(LiteralSearch),
252 Regex(Regex),
253}
254
255#[derive(Clone, Debug)]
256enum LiteralSearch {
257 CaseSensitive(Vec<u8>),
258 AsciiCaseInsensitive(Vec<u8>),
259}
260
261impl SearchIndex {
262 pub fn new() -> Self {
263 SearchIndex {
264 postings: HashMap::new(),
265 files: Vec::new(),
266 path_to_id: HashMap::new(),
267 ready: false,
268 project_root: PathBuf::new(),
269 git_head: None,
270 max_file_size: DEFAULT_MAX_FILE_SIZE,
271 file_trigrams: HashMap::new(),
272 unindexed_files: HashSet::new(),
273 }
274 }
275
276 pub fn build(root: &Path) -> Self {
277 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
278 }
279
280 pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
281 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
282 let mut index = SearchIndex {
283 project_root: project_root.clone(),
284 max_file_size,
285 ..SearchIndex::new()
286 };
287
288 let filters = PathFilters::default();
289 for path in walk_project_files(&project_root, &filters) {
290 index.update_file(&path);
291 }
292
293 index.git_head = current_git_head(&project_root);
294 index.ready = true;
295 index
296 }
297
298 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
299 self.remove_file(path);
300
301 let file_id = match self.allocate_file_id(path, content.len() as u64) {
302 Some(file_id) => file_id,
303 None => return,
304 };
305 if let Some(file) = self.files.get_mut(file_id as usize) {
306 file.content_hash = cache_freshness::hash_bytes(content);
307 }
308
309 let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
310 for (trigram, next_char, position) in extract_trigrams(content) {
311 let entry = trigram_map.entry(trigram).or_default();
312 entry.next_mask |= mask_for_next_char(next_char);
313 entry.loc_mask |= mask_for_position(position);
314 }
315
316 let mut file_trigrams = Vec::with_capacity(trigram_map.len());
317 for (trigram, filter) in trigram_map {
318 let postings = self.postings.entry(trigram).or_default();
319 postings.push(Posting {
320 file_id,
321 next_mask: filter.next_mask,
322 loc_mask: filter.loc_mask,
323 });
324 if postings.len() > 1
328 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
329 {
330 postings.sort_unstable_by_key(|p| p.file_id);
331 }
332 file_trigrams.push(trigram);
333 }
334
335 self.file_trigrams.insert(file_id, file_trigrams);
336 self.unindexed_files.remove(&file_id);
337 }
338
339 pub fn remove_file(&mut self, path: &Path) {
340 let Some(file_id) = self.path_to_id.remove(path) else {
341 return;
342 };
343
344 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
345 for trigram in trigrams {
346 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
347 postings.retain(|posting| posting.file_id != file_id);
348 postings.is_empty()
349 } else {
350 false
351 };
352
353 if should_remove {
354 self.postings.remove(&trigram);
355 }
356 }
357 }
358
359 self.unindexed_files.remove(&file_id);
360 if let Some(file) = self.files.get_mut(file_id as usize) {
361 file.path = PathBuf::new();
362 file.size = 0;
363 file.modified = UNIX_EPOCH;
364 file.content_hash = cache_freshness::zero_hash();
365 }
366 }
367
368 pub fn update_file(&mut self, path: &Path) {
369 self.remove_file(path);
370
371 let metadata = match fs::metadata(path) {
372 Ok(metadata) if metadata.is_file() => metadata,
373 _ => return,
374 };
375
376 if is_binary_path(path, metadata.len()) {
377 self.track_unindexed_file(path, &metadata);
378 return;
379 }
380
381 if metadata.len() > self.max_file_size {
382 self.track_unindexed_file(path, &metadata);
383 return;
384 }
385
386 let content = match fs::read(path) {
387 Ok(content) => content,
388 Err(_) => return,
389 };
390
391 if is_binary_bytes(&content) {
392 self.track_unindexed_file(path, &metadata);
393 return;
394 }
395
396 self.index_file(path, &content);
397 }
398
399 pub fn grep(
400 &self,
401 pattern: &str,
402 case_sensitive: bool,
403 include: &[String],
404 exclude: &[String],
405 search_root: &Path,
406 max_results: usize,
407 ) -> GrepResult {
408 self.search_grep(
409 pattern,
410 case_sensitive,
411 include,
412 exclude,
413 search_root,
414 max_results,
415 )
416 }
417
418 pub fn search_grep(
419 &self,
420 pattern: &str,
421 case_sensitive: bool,
422 include: &[String],
423 exclude: &[String],
424 search_root: &Path,
425 max_results: usize,
426 ) -> GrepResult {
427 let is_literal = !pattern.chars().any(|c| {
430 matches!(
431 c,
432 '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
433 )
434 });
435
436 let literal_search = if is_literal {
437 if case_sensitive {
438 Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
439 } else if pattern.is_ascii() {
440 Some(LiteralSearch::AsciiCaseInsensitive(
441 pattern
442 .as_bytes()
443 .iter()
444 .map(|byte| byte.to_ascii_lowercase())
445 .collect(),
446 ))
447 } else {
448 None
449 }
450 } else {
451 None
452 };
453
454 let regex = if literal_search.is_some() {
456 None
457 } else {
458 let regex_pattern = if is_literal {
459 regex::escape(pattern)
460 } else {
461 pattern.to_string()
462 };
463 let mut builder = RegexBuilder::new(®ex_pattern);
464 builder.case_insensitive(!case_sensitive);
465 builder.multi_line(true);
467 match builder.build() {
468 Ok(r) => Some(r),
469 Err(_) => {
470 return GrepResult {
471 matches: Vec::new(),
472 total_matches: 0,
473 files_searched: 0,
474 files_with_matches: 0,
475 index_status: if self.ready {
476 IndexStatus::Ready
477 } else {
478 IndexStatus::Building
479 },
480 truncated: false,
481 };
482 }
483 }
484 };
485
486 let matcher = if let Some(literal_search) = literal_search {
487 SearchMatcher::Literal(literal_search)
488 } else {
489 SearchMatcher::Regex(
490 regex.expect("regex should exist when literal matcher is unavailable"),
491 )
492 };
493
494 let filters = match build_path_filters(include, exclude) {
495 Ok(filters) => filters,
496 Err(_) => PathFilters::default(),
497 };
498 let search_root = canonicalize_or_normalize(search_root);
499
500 let query = decompose_regex(pattern);
501 let candidate_ids = self.candidates(&query);
502
503 let candidate_files: Vec<&FileEntry> = candidate_ids
504 .into_iter()
505 .filter_map(|file_id| self.files.get(file_id as usize))
506 .filter(|file| !file.path.as_os_str().is_empty())
507 .filter(|file| is_within_search_root(&search_root, &file.path))
508 .filter(|file| filters.matches(&self.project_root, &file.path))
509 .collect();
510
511 let total_matches = AtomicUsize::new(0);
512 let files_searched = AtomicUsize::new(0);
513 let files_with_matches = AtomicUsize::new(0);
514 let truncated = AtomicBool::new(false);
515 let stop_after = max_results.saturating_mul(2);
516
517 let mut matches = if candidate_files.len() > 10 {
518 candidate_files
519 .par_iter()
520 .map(|file| {
521 search_candidate_file(
522 file,
523 &matcher,
524 max_results,
525 stop_after,
526 &total_matches,
527 &files_searched,
528 &files_with_matches,
529 &truncated,
530 )
531 })
532 .reduce(Vec::new, |mut left, mut right| {
533 left.append(&mut right);
534 left
535 })
536 } else {
537 let mut matches = Vec::new();
538 for file in candidate_files {
539 matches.extend(search_candidate_file(
540 file,
541 &matcher,
542 max_results,
543 stop_after,
544 &total_matches,
545 &files_searched,
546 &files_with_matches,
547 &truncated,
548 ));
549
550 if should_stop_search(&truncated, &total_matches, stop_after) {
551 break;
552 }
553 }
554 matches
555 };
556
557 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
558 self.path_to_id
559 .get(path)
560 .and_then(|file_id| self.files.get(*file_id as usize))
561 .map(|file| file.modified)
562 });
563
564 let matches = matches
565 .into_iter()
566 .map(|matched| GrepMatch {
567 file: matched.file.as_ref().clone(),
568 line: matched.line,
569 column: matched.column,
570 line_text: matched.line_text,
571 match_text: matched.match_text,
572 })
573 .collect();
574
575 GrepResult {
576 total_matches: total_matches.load(Ordering::Relaxed),
577 matches,
578 files_searched: files_searched.load(Ordering::Relaxed),
579 files_with_matches: files_with_matches.load(Ordering::Relaxed),
580 index_status: if self.ready {
581 IndexStatus::Ready
582 } else {
583 IndexStatus::Building
584 },
585 truncated: truncated.load(Ordering::Relaxed),
586 }
587 }
588
589 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
590 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
591 Ok(filters) => filters,
592 Err(_) => return Vec::new(),
593 };
594 let search_root = canonicalize_or_normalize(search_root);
595 let mut entries = self
596 .files
597 .iter()
598 .filter(|file| !file.path.as_os_str().is_empty())
599 .filter(|file| is_within_search_root(&search_root, &file.path))
600 .filter(|file| filters.matches(&self.project_root, &file.path))
601 .map(|file| (file.path.clone(), file.modified))
602 .collect::<Vec<_>>();
603
604 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
605 right_mtime
606 .cmp(left_mtime)
607 .then_with(|| left_path.cmp(right_path))
608 });
609
610 entries.into_iter().map(|(path, _)| path).collect()
611 }
612
613 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
614 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
615 return self.active_file_ids();
616 }
617
618 let mut and_trigrams = query.and_trigrams.clone();
619 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
620
621 let mut current: Option<Vec<u32>> = None;
622
623 for trigram in and_trigrams {
624 let filter = query.and_filters.get(&trigram).copied();
625 let matches = self.postings_for_trigram(trigram, filter);
626 current = Some(match current.take() {
627 Some(existing) => intersect_sorted_ids(&existing, &matches),
628 None => matches,
629 });
630
631 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
632 break;
633 }
634 }
635
636 let mut current = current.unwrap_or_else(|| self.active_file_ids());
637
638 for (index, group) in query.or_groups.iter().enumerate() {
639 let mut group_matches = Vec::new();
640 let filters = query.or_filters.get(index);
641
642 for trigram in group {
643 let filter = filters.and_then(|filters| filters.get(trigram).copied());
644 let matches = self.postings_for_trigram(*trigram, filter);
645 if group_matches.is_empty() {
646 group_matches = matches;
647 } else {
648 group_matches = union_sorted_ids(&group_matches, &matches);
649 }
650 }
651
652 current = intersect_sorted_ids(¤t, &group_matches);
653 if current.is_empty() {
654 break;
655 }
656 }
657
658 let mut unindexed = self
659 .unindexed_files
660 .iter()
661 .copied()
662 .filter(|file_id| self.is_active_file(*file_id))
663 .collect::<Vec<_>>();
664 if !unindexed.is_empty() {
665 unindexed.sort_unstable();
666 current = union_sorted_ids(¤t, &unindexed);
667 }
668
669 current
670 }
671
672 pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
673 if fs::create_dir_all(cache_dir).is_err() {
674 return;
675 }
676
677 let cache_path = cache_dir.join("cache.bin");
678 let tmp_cache = cache_dir.join(format!(
679 "cache.bin.tmp.{}.{}",
680 std::process::id(),
681 SystemTime::now()
682 .duration_since(UNIX_EPOCH)
683 .unwrap_or(Duration::ZERO)
684 .as_nanos()
685 ));
686
687 let active_ids = self.active_file_ids();
688 let mut id_map = HashMap::new();
689 for (new_id, old_id) in active_ids.iter().enumerate() {
690 let Ok(new_id_u32) = u32::try_from(new_id) else {
691 return;
692 };
693 id_map.insert(*old_id, new_id_u32);
694 }
695
696 let write_result = (|| -> std::io::Result<()> {
697 let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
698
699 postings_writer.write_all(INDEX_MAGIC)?;
700 write_u32(&mut postings_writer, INDEX_VERSION)?;
701
702 let head = git_head.unwrap_or_default();
703 let root = self.project_root.to_string_lossy();
704 let head_len = u32::try_from(head.len())
705 .map_err(|_| std::io::Error::other("git head too large to cache"))?;
706 let root_len = u32::try_from(root.len())
707 .map_err(|_| std::io::Error::other("project root too large to cache"))?;
708 let file_count = u32::try_from(active_ids.len())
709 .map_err(|_| std::io::Error::other("too many files to cache"))?;
710
711 write_u32(&mut postings_writer, head_len)?;
712 write_u32(&mut postings_writer, root_len)?;
713 write_u64(&mut postings_writer, self.max_file_size)?;
714 write_u32(&mut postings_writer, file_count)?;
715 postings_writer.write_all(head.as_bytes())?;
716 postings_writer.write_all(root.as_bytes())?;
717
718 for old_id in &active_ids {
719 let Some(file) = self.files.get(*old_id as usize) else {
720 return Err(std::io::Error::other("missing file entry for cache write"));
721 };
722 let path = relative_to_root(&self.project_root, &file.path);
723 let path = path.to_string_lossy();
724 let path_len = u32::try_from(path.len())
725 .map_err(|_| std::io::Error::other("cached path too large"))?;
726 let modified = file
727 .modified
728 .duration_since(UNIX_EPOCH)
729 .unwrap_or(Duration::ZERO);
730 let unindexed = if self.unindexed_files.contains(old_id) {
731 1u8
732 } else {
733 0u8
734 };
735
736 postings_writer.write_all(&[unindexed])?;
737 write_u32(&mut postings_writer, path_len)?;
738 write_u64(&mut postings_writer, file.size)?;
739 write_u64(&mut postings_writer, modified.as_secs())?;
740 write_u32(&mut postings_writer, modified.subsec_nanos())?;
741 postings_writer.write_all(file.content_hash.as_bytes())?;
742 postings_writer.write_all(path.as_bytes())?;
743 }
744
745 let mut lookup_entries = Vec::new();
746 let mut postings_blob = Vec::new();
747 let mut sorted_postings: Vec<_> = self.postings.iter().collect();
748 sorted_postings.sort_by_key(|(trigram, _)| **trigram);
749
750 for (trigram, postings) in sorted_postings {
751 let offset = u64::try_from(postings_blob.len())
752 .map_err(|_| std::io::Error::other("postings blob too large"))?;
753 let mut count = 0u32;
754
755 for posting in postings {
756 let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
757 continue;
758 };
759
760 postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
761 postings_blob.push(posting.next_mask);
762 postings_blob.push(posting.loc_mask);
763 count = count.saturating_add(1);
764 }
765
766 if count > 0 {
767 lookup_entries.push((*trigram, offset, count));
768 }
769 }
770
771 write_u64(
772 &mut postings_writer,
773 u64::try_from(postings_blob.len())
774 .map_err(|_| std::io::Error::other("postings blob too large"))?,
775 )?;
776 postings_writer.write_all(&postings_blob)?;
777 postings_writer.flush()?;
778 let mut postings_blob_file = postings_writer
779 .into_inner()
780 .map_err(|error| std::io::Error::other(error.to_string()))?
781 .into_inner();
782 let checksum = crc32fast::hash(&postings_blob_file);
783 postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
784
785 let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
786 let entry_count = u32::try_from(lookup_entries.len())
787 .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
788
789 lookup_writer.write_all(LOOKUP_MAGIC)?;
790 write_u32(&mut lookup_writer, INDEX_VERSION)?;
791 write_u32(&mut lookup_writer, entry_count)?;
792
793 for (trigram, offset, count) in lookup_entries {
794 write_u32(&mut lookup_writer, trigram)?;
795 write_u64(&mut lookup_writer, offset)?;
796 write_u32(&mut lookup_writer, count)?;
797 }
798
799 lookup_writer.flush()?;
800 let mut lookup_blob_file = lookup_writer
801 .into_inner()
802 .map_err(|error| std::io::Error::other(error.to_string()))?
803 .into_inner();
804 let checksum = crc32fast::hash(&lookup_blob_file);
805 lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
806
807 let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
808 write_u32(&mut cache_writer, CACHE_MAGIC)?;
809 write_u32(&mut cache_writer, INDEX_VERSION)?;
810 write_u64(
811 &mut cache_writer,
812 u64::try_from(postings_blob_file.len())
813 .map_err(|_| std::io::Error::other("postings section too large"))?,
814 )?;
815 cache_writer.write_all(&postings_blob_file)?;
816 cache_writer.write_all(&lookup_blob_file)?;
817 cache_writer.flush()?;
818 cache_writer.get_ref().sync_all()?;
819 drop(cache_writer);
820 fs::rename(&tmp_cache, &cache_path)?;
821
822 Ok(())
823 })();
824
825 if write_result.is_err() {
826 let _ = fs::remove_file(&tmp_cache);
827 }
828 }
829
830 pub fn read_from_disk(cache_dir: &Path, current_canonical_root: &Path) -> Option<Self> {
831 debug_assert!(current_canonical_root.is_absolute());
832 let cache_path = cache_dir.join("cache.bin");
833 let cache_bytes = fs::read(&cache_path).ok()?;
834 if cache_bytes.len() < 16 {
835 return None;
836 }
837 let mut header = Cursor::new(&cache_bytes);
838 if read_u32(&mut header).ok()? != CACHE_MAGIC {
839 return None;
840 }
841 if read_u32(&mut header).ok()? != INDEX_VERSION {
842 return None;
843 }
844 let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
845 let start = usize::try_from(header.position()).ok()?;
846 let postings_end = start.checked_add(postings_len_total)?;
847 if postings_end > cache_bytes.len() {
848 return None;
849 }
850 let postings_bytes = &cache_bytes[start..postings_end];
851 let lookup_bytes = &cache_bytes[postings_end..];
852 let lookup_len_total = lookup_bytes.len();
853 let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
854 let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
855 if postings_len_total < 4 || lookup_len_total < 4 {
856 return None;
857 }
858 verify_crc32_bytes_slice(postings_bytes).ok()?;
859 verify_crc32_bytes_slice(lookup_bytes).ok()?;
860
861 let mut magic = [0u8; 8];
862 postings_reader.read_exact(&mut magic).ok()?;
863 if &magic != INDEX_MAGIC {
864 return None;
865 }
866 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
867 return None;
868 }
869
870 let head_len = read_u32(&mut postings_reader).ok()? as usize;
871 let root_len = read_u32(&mut postings_reader).ok()? as usize;
872 let max_file_size = read_u64(&mut postings_reader).ok()?;
873 let file_count = read_u32(&mut postings_reader).ok()? as usize;
874 if file_count > MAX_ENTRIES {
875 return None;
876 }
877 let postings_body_len = postings_len_total.checked_sub(4)?;
878 let lookup_body_len = lookup_len_total.checked_sub(4)?;
879
880 let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
881 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
882 if minimum_file_bytes > remaining_postings {
883 return None;
884 }
885
886 if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
887 return None;
888 }
889 let mut head_bytes = vec![0u8; head_len];
890 postings_reader.read_exact(&mut head_bytes).ok()?;
891 let git_head = String::from_utf8(head_bytes)
892 .ok()
893 .filter(|head| !head.is_empty());
894
895 if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
896 return None;
897 }
898 let mut root_bytes = vec![0u8; root_len];
899 postings_reader.read_exact(&mut root_bytes).ok()?;
900 let _stored_project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
901 let project_root = current_canonical_root.to_path_buf();
902
903 let mut files = Vec::with_capacity(file_count);
904 let mut path_to_id = HashMap::new();
905 let mut unindexed_files = HashSet::new();
906
907 for file_id in 0..file_count {
908 let mut unindexed = [0u8; 1];
909 postings_reader.read_exact(&mut unindexed).ok()?;
910 let path_len = read_u32(&mut postings_reader).ok()? as usize;
911 let size = read_u64(&mut postings_reader).ok()?;
912 let secs = read_u64(&mut postings_reader).ok()?;
913 let nanos = read_u32(&mut postings_reader).ok()?;
914 let mut hash_bytes = [0u8; 32];
915 postings_reader.read_exact(&mut hash_bytes).ok()?;
916 let content_hash = blake3::Hash::from_bytes(hash_bytes);
917 if nanos >= 1_000_000_000 {
918 return None;
919 }
920 if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
921 return None;
922 }
923 let mut path_bytes = vec![0u8; path_len];
924 postings_reader.read_exact(&mut path_bytes).ok()?;
925 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
926 let full_path = project_root.join(relative_path);
927 let file_id_u32 = u32::try_from(file_id).ok()?;
928
929 files.push(FileEntry {
930 path: full_path.clone(),
931 size,
932 modified: UNIX_EPOCH + Duration::new(secs, nanos),
933 content_hash,
934 });
935 path_to_id.insert(full_path, file_id_u32);
936 if unindexed[0] == 1 {
937 unindexed_files.insert(file_id_u32);
938 }
939 }
940
941 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
942 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
943 if postings_len > max_postings_bytes {
944 return None;
945 }
946 if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
947 return None;
948 }
949 let mut postings_blob = vec![0u8; postings_len];
950 postings_reader.read_exact(&mut postings_blob).ok()?;
951
952 let mut lookup_magic = [0u8; 8];
953 lookup_reader.read_exact(&mut lookup_magic).ok()?;
954 if &lookup_magic != LOOKUP_MAGIC {
955 return None;
956 }
957 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
958 return None;
959 }
960 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
961 if entry_count > MAX_ENTRIES {
962 return None;
963 }
964 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
965 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
966 if minimum_lookup_bytes > remaining_lookup {
967 return None;
968 }
969
970 let mut postings = HashMap::new();
971 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
972
973 for _ in 0..entry_count {
974 let trigram = read_u32(&mut lookup_reader).ok()?;
975 let offset = read_u64(&mut lookup_reader).ok()? as usize;
976 let count = read_u32(&mut lookup_reader).ok()? as usize;
977 if count > MAX_ENTRIES {
978 return None;
979 }
980 let bytes_len = count.checked_mul(POSTING_BYTES)?;
981 let end = offset.checked_add(bytes_len)?;
982 if end > postings_blob.len() {
983 return None;
984 }
985
986 let mut trigram_postings = Vec::with_capacity(count);
987 for chunk in postings_blob[offset..end].chunks_exact(6) {
988 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
989 let posting = Posting {
990 file_id,
991 next_mask: chunk[4],
992 loc_mask: chunk[5],
993 };
994 trigram_postings.push(posting.clone());
995 file_trigrams.entry(file_id).or_default().push(trigram);
996 }
997 postings.insert(trigram, trigram_postings);
998 }
999
1000 Some(SearchIndex {
1001 postings,
1002 files,
1003 path_to_id,
1004 ready: true,
1005 project_root,
1006 git_head,
1007 max_file_size,
1008 file_trigrams,
1009 unindexed_files,
1010 })
1011 }
1012
1013 pub(crate) fn stored_git_head(&self) -> Option<&str> {
1014 self.git_head.as_deref()
1015 }
1016
1017 pub(crate) fn set_ready(&mut self, ready: bool) {
1018 self.ready = ready;
1019 }
1020
1021 pub(crate) fn rebuild_or_refresh(
1022 root: &Path,
1023 max_file_size: u64,
1024 current_head: Option<String>,
1025 baseline: Option<SearchIndex>,
1026 ) -> Self {
1027 if let Some(mut baseline) = baseline {
1028 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1029 baseline.max_file_size = max_file_size;
1030
1031 if baseline.git_head == current_head || current_head.is_none() {
1032 baseline.git_head = current_head;
1039 verify_file_mtimes(&mut baseline);
1040 baseline.ready = true;
1041 return baseline;
1042 }
1043
1044 if let (Some(previous), Some(current)) =
1045 (baseline.git_head.clone(), current_head.clone())
1046 {
1047 let project_root = baseline.project_root.clone();
1048 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1049 baseline.git_head = Some(current);
1050 baseline.ready = true;
1051 return baseline;
1052 }
1053 }
1054 }
1055
1056 SearchIndex::build_with_limit(root, max_file_size)
1057 }
1058
1059 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1060 let file_id = u32::try_from(self.files.len()).ok()?;
1061 let metadata = fs::metadata(path).ok();
1062 let size = metadata
1063 .as_ref()
1064 .map_or(size_hint, |metadata| metadata.len());
1065 let modified = metadata
1066 .and_then(|metadata| metadata.modified().ok())
1067 .unwrap_or(UNIX_EPOCH);
1068
1069 self.files.push(FileEntry {
1070 path: path.to_path_buf(),
1071 size,
1072 modified,
1073 content_hash: cache_freshness::zero_hash(),
1074 });
1075 self.path_to_id.insert(path.to_path_buf(), file_id);
1076 Some(file_id)
1077 }
1078
1079 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1080 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1081 return;
1082 };
1083 self.unindexed_files.insert(file_id);
1084 self.file_trigrams.insert(file_id, Vec::new());
1085 }
1086
1087 fn active_file_ids(&self) -> Vec<u32> {
1088 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1089 ids.sort_unstable();
1090 ids
1091 }
1092
1093 fn is_active_file(&self, file_id: u32) -> bool {
1094 self.files
1095 .get(file_id as usize)
1096 .map(|file| !file.path.as_os_str().is_empty())
1097 .unwrap_or(false)
1098 }
1099
1100 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1101 let Some(postings) = self.postings.get(&trigram) else {
1102 return Vec::new();
1103 };
1104
1105 let mut matches = Vec::with_capacity(postings.len());
1106
1107 for posting in postings {
1108 if let Some(filter) = filter {
1109 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1112 continue;
1113 }
1114 }
1119 if self.is_active_file(posting.file_id) {
1120 matches.push(posting.file_id);
1121 }
1122 }
1123
1124 matches
1125 }
1126}
1127
/// Scans one candidate file for `matcher`, returning at most one match per
/// line (the first hit on each line wins).
///
/// Shared atomics coordinate parallel workers: `total_matches` is the global
/// match counter, `truncated` is set once `max_results` is exceeded, and
/// `stop_after` (checked via `should_stop_search`) lets workers keep
/// collecting a bounded overshoot after truncation before bailing out.
fn search_candidate_file(
    file: &FileEntry,
    matcher: &SearchMatcher,
    max_results: usize,
    stop_after: usize,
    total_matches: &AtomicUsize,
    files_searched: &AtomicUsize,
    files_with_matches: &AtomicUsize,
    truncated: &AtomicBool,
) -> Vec<SharedGrepMatch> {
    if should_stop_search(truncated, total_matches, stop_after) {
        return Vec::new();
    }

    // Unreadable files (e.g. deleted since indexing) are silently skipped.
    let content = match read_indexed_file_bytes(&file.path) {
        Some(content) => content,
        None => return Vec::new(),
    };
    if is_binary_bytes(&content) {
        return Vec::new();
    }
    // Counted only after the file proved readable and non-binary.
    files_searched.fetch_add(1, Ordering::Relaxed);

    // One shared PathBuf per file; every match clones the Arc, not the path.
    let shared_path = Arc::new(file.path.clone());
    let mut matches = Vec::new();
    // Line-start table is built lazily, only once the first hit needs it.
    let mut line_starts = None;
    // Enforces the one-match-per-line policy.
    let mut seen_lines = HashSet::new();
    let mut matched_this_file = false;

    match matcher {
        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            while let Some(position) = finder.find(&content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                // Advance by one byte so overlapping occurrences are found.
                start = offset + 1;

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
            // NOTE(review): the needle is searched inside the lowercased copy
            // of the content, so it is presumably pre-lowercased where the
            // matcher is constructed — confirm at the construction site.
            let search_content = content.to_ascii_lowercase();
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            while let Some(position) = finder.find(&search_content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                start = offset + 1;

                // Line details and match text come from the ORIGINAL bytes;
                // ASCII lowercasing preserves offsets one-to-one.
                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        SearchMatcher::Regex(regex) => {
            // Byte-level regex: non-overlapping matches over the raw content.
            for matched in regex.find_iter(&content) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) =
                    line_details_bytes(&content, line_starts, matched.start());
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
                });
            }
        }
    }

    if matched_this_file {
        files_with_matches.fetch_add(1, Ordering::Relaxed);
    }

    matches
}
1272
/// Cooperative stop check for parallel search workers.
///
/// Returns true only once the result set has been flagged truncated AND the
/// global counter has reached `stop_after`, allowing in-flight workers to
/// finish collecting up to the overshoot budget before quitting.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1280
/// Intersection of two ascending, duplicate-free id lists; output stays sorted.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let (mut i, mut j) = (0usize, 0usize);

    while i < left.len() && j < right.len() {
        let (a, b) = (left[i], right[j]);
        if a < b {
            i += 1;
        } else if a > b {
            j += 1;
        } else {
            common.push(a);
            i += 1;
            j += 1;
        }
    }

    common
}
1300
/// Union of two ascending, duplicate-free id lists; output stays sorted and
/// deduplicated.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut combined = Vec::with_capacity(left.len() + right.len());
    let (mut i, mut j) = (0usize, 0usize);

    while i < left.len() && j < right.len() {
        let (a, b) = (left[i], right[j]);
        if a <= b {
            combined.push(a);
            i += 1;
            if a == b {
                // Equal ids are emitted once.
                j += 1;
            }
        } else {
            combined.push(b);
            j += 1;
        }
    }

    // At most one of these tails is non-empty.
    combined.extend_from_slice(&left[i..]);
    combined.extend_from_slice(&right[j..]);
    combined
}
1328
1329pub fn decompose_regex(pattern: &str) -> RegexQuery {
1330 let hir = match regex_syntax::parse(pattern) {
1331 Ok(hir) => hir,
1332 Err(_) => return RegexQuery::default(),
1333 };
1334
1335 let build = build_query(&hir);
1336 build.into_query()
1337}
1338
/// Packs three bytes into one `u32` trigram key (`a` highest, `c` lowest).
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1342
/// Case-folds an ASCII byte so trigram keys are case-insensitive;
/// non-ASCII bytes pass through unchanged.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        c + (b'a' - b'A')
    } else {
        c
    }
}
1346
1347pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1348 if content.len() < 3 {
1349 return Vec::new();
1350 }
1351
1352 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1353 for start in 0..=content.len() - 3 {
1354 let trigram = pack_trigram(
1355 normalize_char(content[start]),
1356 normalize_char(content[start + 1]),
1357 normalize_char(content[start + 2]),
1358 );
1359 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1360 trigrams.push((trigram, next_char, start));
1361 }
1362 trigrams
1363}
1364
1365pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1366 let mut seen = HashSet::new();
1367 let mut out = Vec::new();
1368 for token in tokens {
1369 for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1370 if seen.insert(trigram) {
1371 out.push(trigram);
1372 }
1373 }
1374 }
1375 out
1376}
1377
1378pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1379 if query_trigrams.is_empty() {
1380 return 0.0;
1381 }
1382
1383 let mut hits = 0u32;
1384 for &trigram in query_trigrams {
1385 if let Some(postings) = index.postings.get(&trigram) {
1386 if postings
1387 .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1388 .is_ok()
1389 {
1390 hits += 1;
1391 }
1392 }
1393 }
1394
1395 if hits == 0 {
1396 return 0.0;
1397 }
1398
1399 let file_trigram_count = index
1400 .file_trigrams
1401 .get(&file_id)
1402 .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1403 (hits as f32) / (1.0 + file_trigram_count.ln())
1404}
1405
1406pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1407 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1409 return PathBuf::from(override_dir)
1410 .join("index")
1411 .join(project_cache_key(project_root));
1412 }
1413 if let Some(dir) = storage_dir {
1415 return dir.join("index").join(project_cache_key(project_root));
1416 }
1417 let home = std::env::var_os("HOME")
1422 .or_else(|| std::env::var_os("USERPROFILE"))
1423 .map(PathBuf::from)
1424 .unwrap_or_else(std::env::temp_dir);
1425 home.join(".cache")
1426 .join("aft")
1427 .join("index")
1428 .join(project_cache_key(project_root))
1429}
1430
1431pub(crate) fn build_path_filters(
1432 include: &[String],
1433 exclude: &[String],
1434) -> Result<PathFilters, String> {
1435 Ok(PathFilters {
1436 includes: build_globset(include)?,
1437 excludes: build_globset(exclude)?,
1438 })
1439}
1440
1441pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1442 walk_project_files_from(root, root, filters)
1443}
1444
1445pub(crate) fn walk_project_files_from(
1446 filter_root: &Path,
1447 search_root: &Path,
1448 filters: &PathFilters,
1449) -> Vec<PathBuf> {
1450 let mut builder = WalkBuilder::new(search_root);
1451 builder
1452 .hidden(false)
1453 .git_ignore(true)
1454 .git_global(true)
1455 .git_exclude(true)
1456 .filter_entry(|entry| {
1457 let name = entry.file_name().to_string_lossy();
1458 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1459 return !matches!(
1460 name.as_ref(),
1461 "node_modules"
1462 | "target"
1463 | "venv"
1464 | ".venv"
1465 | ".git"
1466 | "__pycache__"
1467 | ".tox"
1468 | "dist"
1469 | "build"
1470 );
1471 }
1472 true
1473 });
1474
1475 let mut files = Vec::new();
1476 for entry in builder.build().filter_map(|entry| entry.ok()) {
1477 if !entry
1478 .file_type()
1479 .map_or(false, |file_type| file_type.is_file())
1480 {
1481 continue;
1482 }
1483 let path = entry.into_path();
1484 if filters.matches(filter_root, &path) {
1485 files.push(path);
1486 }
1487 }
1488
1489 sort_paths_by_mtime_desc(&mut files);
1490 files
1491}
1492
1493pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1494 let bytes = fs::read(path).ok()?;
1495 if is_binary_bytes(&bytes) {
1496 return None;
1497 }
1498 String::from_utf8(bytes).ok()
1499}
1500
/// Reads a previously indexed file; any I/O failure is treated as "gone".
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    match fs::read(path) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
1504
/// Strips `root` from `path` when possible; otherwise returns `path` as-is.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1510
1511pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1524 use std::collections::HashMap;
1525 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1526 for path in paths.iter() {
1527 mtimes
1528 .entry(path.clone())
1529 .or_insert_with(|| path_modified_time(path));
1530 }
1531 paths.sort_by(|left, right| {
1532 let left_mtime = mtimes.get(left).and_then(|v| *v);
1533 let right_mtime = mtimes.get(right).and_then(|v| *v);
1534 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1535 });
1536}
1537
1538pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1541 use std::collections::HashMap;
1542 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1543 for m in matches.iter() {
1544 mtimes.entry(m.file.clone()).or_insert_with(|| {
1545 let resolved = resolve_match_path(project_root, &m.file);
1546 path_modified_time(&resolved)
1547 });
1548 }
1549 matches.sort_by(|left, right| {
1550 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1551 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1552 right_mtime
1553 .cmp(&left_mtime)
1554 .then_with(|| left.file.cmp(&right.file))
1555 .then_with(|| left.line.cmp(&right.line))
1556 .then_with(|| left.column.cmp(&right.column))
1557 });
1558}
1559
1560fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1565 matches: &mut [SharedGrepMatch],
1566 modified_for_path: F,
1567) where
1568 F: Fn(&Path) -> Option<SystemTime>,
1569{
1570 use std::collections::HashMap;
1571 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1572 for m in matches.iter() {
1573 let path = m.file.as_path().to_path_buf();
1574 mtimes
1575 .entry(path.clone())
1576 .or_insert_with(|| modified_for_path(&path));
1577 }
1578 matches.sort_by(|left, right| {
1579 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1580 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1581 right_mtime
1582 .cmp(&left_mtime)
1583 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1584 .then_with(|| left.line.cmp(&right.line))
1585 .then_with(|| left.column.cmp(&right.column))
1586 });
1587}
1588
1589pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1590 let resolved_project_root = canonicalize_or_normalize(project_root);
1591 let root = match path {
1592 Some(path) => {
1593 let path = PathBuf::from(path);
1594 if path.is_absolute() {
1595 canonicalize_or_normalize(&path)
1596 } else {
1597 normalize_path(&resolved_project_root.join(path))
1598 }
1599 }
1600 None => resolved_project_root.clone(),
1601 };
1602
1603 let use_index = is_within_search_root(&resolved_project_root, &root);
1604 SearchScope { root, use_index }
1605}
1606
/// Heuristic binary sniff delegated to `content_inspector::inspect`.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
1610
/// Current HEAD commit hash of the repository at `root`, or `None` when
/// `git rev-parse` fails (not a repo, git missing, empty output).
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
1614
1615pub fn project_cache_key(project_root: &Path) -> String {
1616 use sha2::{Digest, Sha256};
1617
1618 let mut hasher = Sha256::new();
1619
1620 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1621 hasher.update(root_commit.as_bytes());
1624 } else {
1625 let canonical_root = canonicalize_or_normalize(project_root);
1627 hasher.update(canonical_root.to_string_lossy().as_bytes());
1628 }
1629
1630 let digest = format!("{:x}", hasher.finalize());
1631 digest[..16].to_string()
1632}
1633
1634impl PathFilters {
1635 fn matches(&self, root: &Path, path: &Path) -> bool {
1636 let relative = to_glob_path(&relative_to_root(root, path));
1637 if self
1638 .includes
1639 .as_ref()
1640 .is_some_and(|includes| !includes.is_match(&relative))
1641 {
1642 return false;
1643 }
1644 if self
1645 .excludes
1646 .as_ref()
1647 .is_some_and(|excludes| excludes.is_match(&relative))
1648 {
1649 return false;
1650 }
1651 true
1652 }
1653}
1654
1655fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1656 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1657}
1658
/// Makes a match path absolute by joining relative paths onto `project_root`.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    project_root.join(path)
}
1666
/// Modification time of `path`, or `None` when it cannot be stat'ed or the
/// platform does not report mtimes.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1672
/// Lexically normalizes a path: drops `.` components and cancels `..`
/// against the preceding component. A `..` that cannot be cancelled (e.g.
/// leading `..`) is kept verbatim. Does not touch the filesystem.
fn normalize_path(path: &Path) -> PathBuf {
    path.components().fold(PathBuf::new(), |mut normalized, component| {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                if !normalized.pop() {
                    normalized.push(component);
                }
            }
            other => normalized.push(other),
        }
        normalized
    })
}
1688
1689fn verify_file_mtimes(index: &mut SearchIndex) {
1692 let mut stale_paths = Vec::new();
1694 for entry in &index.files {
1695 if entry.path.as_os_str().is_empty() {
1696 continue; }
1698 let cached = FileFreshness {
1699 mtime: entry.modified,
1700 size: entry.size,
1701 content_hash: entry.content_hash,
1702 };
1703 match cache_freshness::verify_file(&entry.path, &cached) {
1704 FreshnessVerdict::HotFresh | FreshnessVerdict::ContentFresh { .. } => {}
1705 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => {
1706 stale_paths.push(entry.path.clone())
1707 }
1708 }
1709 }
1710
1711 for entry in &mut index.files {
1712 if entry.path.as_os_str().is_empty() {
1713 continue;
1714 }
1715 let cached = FileFreshness {
1716 mtime: entry.modified,
1717 size: entry.size,
1718 content_hash: entry.content_hash,
1719 };
1720 if let FreshnessVerdict::ContentFresh {
1721 new_mtime,
1722 new_size,
1723 } = cache_freshness::verify_file(&entry.path, &cached)
1724 {
1725 entry.modified = new_mtime;
1726 entry.size = new_size;
1727 }
1728 }
1729
1730 for path in &stale_paths {
1732 index.update_file(path);
1733 }
1734
1735 let filters = PathFilters::default();
1737 for path in walk_project_files(&index.project_root, &filters) {
1738 if !index.path_to_id.contains_key(&path) {
1739 index.update_file(&path);
1740 }
1741 }
1742
1743 if !stale_paths.is_empty() {
1744 crate::slog_info!(
1745 "search index: refreshed {} stale file(s) from disk cache",
1746 stale_paths.len()
1747 );
1748 }
1749}
1750
/// True when `path` lives at or below `search_root` (component-wise prefix).
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.strip_prefix(search_root).is_ok()
}
1754
1755impl QueryBuild {
1756 fn into_query(self) -> RegexQuery {
1757 let mut query = RegexQuery::default();
1758
1759 for run in self.and_runs {
1760 add_run_to_and_query(&mut query, &run);
1761 }
1762
1763 for group in self.or_groups {
1764 let mut trigrams = BTreeSet::new();
1765 let mut filters = HashMap::new();
1766 for run in group {
1767 for (trigram, filter) in trigram_filters(&run) {
1768 trigrams.insert(trigram);
1769 merge_filter(filters.entry(trigram).or_default(), filter);
1770 }
1771 }
1772 if !trigrams.is_empty() {
1773 query.or_groups.push(trigrams.into_iter().collect());
1774 query.or_filters.push(filters);
1775 }
1776 }
1777
1778 query
1779 }
1780}
1781
1782fn build_query(hir: &Hir) -> QueryBuild {
1783 match hir.kind() {
1784 HirKind::Literal(literal) => {
1785 if literal.0.len() >= 3 {
1786 QueryBuild {
1787 and_runs: vec![literal.0.to_vec()],
1788 or_groups: Vec::new(),
1789 }
1790 } else {
1791 QueryBuild::default()
1792 }
1793 }
1794 HirKind::Capture(capture) => build_query(&capture.sub),
1795 HirKind::Concat(parts) => {
1796 let mut build = QueryBuild::default();
1797 for part in parts {
1798 let part_build = build_query(part);
1799 build.and_runs.extend(part_build.and_runs);
1800 build.or_groups.extend(part_build.or_groups);
1801 }
1802 build
1803 }
1804 HirKind::Alternation(parts) => {
1805 let mut group = Vec::new();
1806 for part in parts {
1807 let Some(mut choices) = guaranteed_run_choices(part) else {
1808 return QueryBuild::default();
1809 };
1810 group.append(&mut choices);
1811 }
1812 if group.is_empty() {
1813 QueryBuild::default()
1814 } else {
1815 QueryBuild {
1816 and_runs: Vec::new(),
1817 or_groups: vec![group],
1818 }
1819 }
1820 }
1821 HirKind::Repetition(repetition) => {
1822 if repetition.min == 0 {
1823 QueryBuild::default()
1824 } else {
1825 build_query(&repetition.sub)
1826 }
1827 }
1828 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1829 }
1830}
1831
1832fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1833 match hir.kind() {
1834 HirKind::Literal(literal) => {
1835 if literal.0.len() >= 3 {
1836 Some(vec![literal.0.to_vec()])
1837 } else {
1838 None
1839 }
1840 }
1841 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1842 HirKind::Concat(parts) => {
1843 let mut runs = Vec::new();
1844 for part in parts {
1845 if let Some(mut part_runs) = guaranteed_run_choices(part) {
1846 runs.append(&mut part_runs);
1847 }
1848 }
1849 if runs.is_empty() {
1850 None
1851 } else {
1852 Some(runs)
1853 }
1854 }
1855 HirKind::Alternation(parts) => {
1856 let mut runs = Vec::new();
1857 for part in parts {
1858 let Some(mut part_runs) = guaranteed_run_choices(part) else {
1859 return None;
1860 };
1861 runs.append(&mut part_runs);
1862 }
1863 if runs.is_empty() {
1864 None
1865 } else {
1866 Some(runs)
1867 }
1868 }
1869 HirKind::Repetition(repetition) => {
1870 if repetition.min == 0 {
1871 None
1872 } else {
1873 guaranteed_run_choices(&repetition.sub)
1874 }
1875 }
1876 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1877 }
1878}
1879
1880fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1881 for (trigram, filter) in trigram_filters(run) {
1882 if !query.and_trigrams.contains(&trigram) {
1883 query.and_trigrams.push(trigram);
1884 }
1885 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1886 }
1887}
1888
1889fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1890 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1891 for (trigram, next_char, position) in extract_trigrams(run) {
1892 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1893 if next_char != EOF_SENTINEL {
1894 entry.next_mask |= mask_for_next_char(next_char);
1895 }
1896 entry.loc_mask |= mask_for_position(position);
1897 }
1898 filters.into_iter().collect()
1899}
1900
1901fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1902 target.next_mask |= filter.next_mask;
1903 target.loc_mask |= filter.loc_mask;
1904}
1905
1906fn mask_for_next_char(next_char: u8) -> u8 {
1907 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1908 1u8 << bit
1909}
1910
/// Maps a trigram offset onto one of the 8 position bloom bits (mod 8).
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
1914
1915fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1916 if patterns.is_empty() {
1917 return Ok(None);
1918 }
1919
1920 let mut builder = GlobSetBuilder::new();
1921 for pattern in patterns {
1922 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1923 builder.add(glob);
1924 }
1925 builder.build().map(Some).map_err(|error| error.to_string())
1926}
1927
/// Reads a little-endian `u32`, failing on short input.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes).map(|_| u32::from_le_bytes(bytes))
}
1933
/// Reads a little-endian `u64`, failing on short input.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut bytes = [0u8; 8];
    reader.read_exact(&mut bytes).map(|_| u64::from_le_bytes(bytes))
}
1939
/// Writes `value` as four little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1943
/// Writes `value` as eight little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1947
1948fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1949 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1950 return Err(std::io::Error::other("search index checksum missing"));
1951 };
1952 let expected = u32::from_le_bytes(*stored);
1953 let actual = crc32fast::hash(body);
1954 if actual != expected {
1955 return Err(std::io::Error::other("search index checksum mismatch"));
1956 }
1957 Ok(())
1958}
1959
/// Bytes left between the reader's current position and `total_len`, or
/// `None` when the position is unknown, oversized, or past the end.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    total_len.checked_sub(usize::try_from(position).ok()?)
}
1964
/// Runs `git -C <root> <args>` and returns trimmed stdout.
/// Returns `None` when git cannot be spawned, exits non-zero, emits
/// non-UTF-8 output, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
1983
1984fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1985 let diff_range = format!("{}..{}", from, to);
1986 let output = match Command::new("git")
1987 .arg("-C")
1988 .arg(root)
1989 .args(["diff", "--name-only", &diff_range])
1990 .output()
1991 {
1992 Ok(output) => output,
1993 Err(_) => return false,
1994 };
1995
1996 if !output.status.success() {
1997 return false;
1998 }
1999
2000 let Ok(paths) = String::from_utf8(output.stdout) else {
2001 return false;
2002 };
2003
2004 for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
2005 let path = root.join(relative_path);
2006 if path.exists() {
2007 index.update_file(&path);
2008 } else {
2009 index.remove_file(&path);
2010 }
2011 }
2012
2013 true
2014}
2015
2016fn is_binary_path(path: &Path, size: u64) -> bool {
2017 if size == 0 {
2018 return false;
2019 }
2020
2021 let mut file = match File::open(path) {
2022 Ok(file) => file,
2023 Err(_) => return true,
2024 };
2025
2026 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
2027 match file.read(&mut preview) {
2028 Ok(read) => is_binary_bytes(&preview[..read]),
2029 Err(_) => true,
2030 }
2031}
2032
/// Byte offsets at which each line of `content` begins; offset 0 is always
/// included, and every `\n` starts a new line at the following byte.
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    let newline_followers = content
        .iter()
        .enumerate()
        .filter(|(_, &byte)| byte == b'\n')
        .map(|(index, _)| index + 1);
    std::iter::once(0).chain(newline_followers).collect()
}
2042
/// Resolves a byte `offset` into a 1-based (line, column, line text) triple.
/// Columns count characters of the lossy-decoded prefix, not bytes; a
/// trailing `\r` (CRLF endings) is trimmed from the returned line text.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // Find the line whose start is at or just before `offset`.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    let line_end = content[line_start..]
        .iter()
        .position(|&byte| byte == b'\n')
        .map_or(content.len(), |length| line_start + length);

    let full_line = &content[line_start..line_end];
    let line_slice = full_line.strip_suffix(b"\r").unwrap_or(full_line);
    let line_text = String::from_utf8_lossy(line_slice).into_owned();

    let column = String::from_utf8_lossy(&content[line_start..offset])
        .chars()
        .count() as u32
        + 1;

    ((line_index as u32) + 1, column, line_text)
}
2065
/// Lossily renders `path` with forward slashes so glob matching behaves the
/// same on Windows and Unix.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}
2069
2070#[cfg(test)]
2071mod tests {
2072 use std::process::Command;
2073
2074 use super::*;
2075
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        // "Rust" yields two overlapping trigrams; bytes are case-folded, and
        // each entry carries the following byte (EOF_SENTINEL at the end)
        // plus its starting offset.
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }
2086
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        // Guaranteed literals outside the alternation become AND trigrams;
        // the (def|ghi) branches land together in a single OR group.
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }
2096
    #[test]
    fn candidates_intersect_posting_lists() {
        // Two files share the "abc" trigram, but only alpha also has "def":
        // an AND query over both trigrams must intersect down to alpha alone.
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }
2121
    #[test]
    fn candidates_apply_bloom_filters() {
        // In "abcd efgh" the byte after the "abc" trigram is 'd': a filter
        // expecting 'd' keeps the file, while one expecting 'z' prunes it.
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }
2151
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        // write_to_disk followed by read_from_disk must preserve the stored
        // git head, the file table, and the full posting map.
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded =
            SearchIndex::read_from_disk(&cache_dir, &project).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }
2178
    #[test]
    fn read_from_disk_rejects_corrupt_postings_checksum() {
        // Flipping one byte in the middle of the serialized cache must fail
        // the CRC check and make read_from_disk return None.
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        let cache_path = cache_dir.join("cache.bin");
        let mut bytes = fs::read(&cache_path).expect("read cache");
        let middle = bytes.len() / 2;
        bytes[middle] ^= 0xff;
        fs::write(&cache_path, bytes).expect("write corrupted cache");

        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_none());
    }
2198
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        // After a successful write, the final cache.bin exists and no
        // intermediate ".tmp." artifacts are left behind in the cache dir.
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("cache.bin").is_file());
        assert!(fs::read_dir(&cache_dir)
            .expect("read cache dir")
            .all(|entry| !entry
                .expect("cache entry")
                .file_name()
                .to_string_lossy()
                .contains(".tmp.")));
    }
2219
    #[test]
    fn concurrent_search_index_writes_do_not_corrupt() {
        // Two threads build and write the index under the cache lock; the
        // cache must still deserialize cleanly afterwards.
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
        let cache_dir = dir.path().join("cache");

        let a_project = project.clone();
        let a_cache = cache_dir.clone();
        let a = std::thread::spawn(move || {
            let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
            let index = SearchIndex::build(&a_project);
            index.write_to_disk(&a_cache, None);
        });
        let b_project = project.clone();
        let b_cache = cache_dir.clone();
        let b = std::thread::spawn(move || {
            let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
            let index = SearchIndex::build(&b_project);
            index.write_to_disk(&b_cache, None);
        });
        a.join().expect("writer a");
        b.join().expect("writer b");

        assert!(SearchIndex::read_from_disk(&cache_dir, &project).is_some());
    }
2247
2248 #[test]
2249 fn search_index_atomic_rename_survives_partial_write() {
2250 let dir = tempfile::tempdir().expect("create temp dir");
2251 let cache_dir = dir.path().join("cache");
2252 fs::create_dir_all(&cache_dir).expect("create cache dir");
2253 fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2254
2255 assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
2256 }
2257
    #[test]
    fn project_cache_key_includes_checkout_path() {
        // Build a real git repo, clone it to a second path, and compare the
        // cache keys computed for each checkout.
        //
        // NOTE(review): the test name suggests the checkout path feeds into
        // the key, yet the final assertion requires the two keys to be
        // EQUAL — confirm the intended semantics of project_cache_key.
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        // Initialize, stage, and commit so both checkouts share identical
        // git history and content.
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        // Inline -c identity config keeps the commit independent of the
        // machine's global git configuration.
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        // Clone the repo to a different path on disk.
        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        // Keys are fixed-length (16) digests.
        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        assert_eq!(source_key, clone_key);
    }
2309
2310 #[test]
2311 fn git_head_unchanged_picks_up_local_edits() {
2312 let dir = tempfile::tempdir().expect("create temp dir");
2313 let project = dir.path().join("repo");
2314 fs::create_dir_all(&project).expect("create repo dir");
2315 let file = project.join("tracked.txt");
2316 fs::write(&file, "oldtoken\n").expect("write file");
2317 assert!(Command::new("git")
2318 .current_dir(&project)
2319 .arg("init")
2320 .status()
2321 .unwrap()
2322 .success());
2323 assert!(Command::new("git")
2324 .current_dir(&project)
2325 .args(["add", "."])
2326 .status()
2327 .unwrap()
2328 .success());
2329 assert!(Command::new("git")
2330 .current_dir(&project)
2331 .args([
2332 "-c",
2333 "user.name=AFT Tests",
2334 "-c",
2335 "user.email=aft-tests@example.com",
2336 "commit",
2337 "-m",
2338 "initial"
2339 ])
2340 .status()
2341 .unwrap()
2342 .success());
2343 let head = current_git_head(&project);
2344 let mut baseline = SearchIndex::build(&project);
2345 baseline.git_head = head.clone();
2346 fs::write(&file, "newtoken\n").expect("edit tracked file");
2347
2348 let refreshed =
2349 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2350 let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2351
2352 assert_eq!(result.total_matches, 1);
2353 }
2354
2355 #[test]
2356 fn non_git_project_reuses_cache_when_files_unchanged() {
2357 let dir = tempfile::tempdir().expect("create temp dir");
2358 let project = dir.path().join("project");
2359 fs::create_dir_all(&project).expect("create project dir");
2360 fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2361 let baseline = SearchIndex::build(&project);
2362 let baseline_file_count = baseline.file_count();
2363
2364 let refreshed =
2365 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2366
2367 assert_eq!(refreshed.file_count(), baseline_file_count);
2368 assert_eq!(
2369 refreshed
2370 .search_grep("unchangedtoken", true, &[], &[], &project, 10)
2371 .total_matches,
2372 1
2373 );
2374 }
2375
2376 #[test]
2377 fn resolve_search_scope_disables_index_for_external_path() {
2378 let dir = tempfile::tempdir().expect("create temp dir");
2379 let project = dir.path().join("project");
2380 let outside = dir.path().join("outside");
2381 fs::create_dir_all(&project).expect("create project dir");
2382 fs::create_dir_all(&outside).expect("create outside dir");
2383
2384 let scope = resolve_search_scope(&project, outside.to_str());
2385
2386 assert_eq!(
2387 scope.root,
2388 fs::canonicalize(&outside).expect("canonicalize outside")
2389 );
2390 assert!(!scope.use_index);
2391 }
2392
2393 #[test]
2394 fn grep_filters_matches_to_search_root() {
2395 let dir = tempfile::tempdir().expect("create temp dir");
2396 let project = dir.path().join("project");
2397 let src = project.join("src");
2398 let docs = project.join("docs");
2399 fs::create_dir_all(&src).expect("create src dir");
2400 fs::create_dir_all(&docs).expect("create docs dir");
2401 fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2402 fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2403
2404 let index = SearchIndex::build(&project);
2405 let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2406
2407 assert_eq!(result.files_searched, 1);
2408 assert_eq!(result.files_with_matches, 1);
2409 assert_eq!(result.matches.len(), 1);
2410 let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2412 assert_eq!(result.matches[0].file, expected);
2413 }
2414
2415 #[test]
2416 fn grep_deduplicates_multiple_matches_on_same_line() {
2417 let dir = tempfile::tempdir().expect("create temp dir");
2418 let project = dir.path().join("project");
2419 let src = project.join("src");
2420 fs::create_dir_all(&src).expect("create src dir");
2421 fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2422
2423 let index = SearchIndex::build(&project);
2424 let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2425
2426 assert_eq!(result.total_matches, 1);
2427 assert_eq!(result.matches.len(), 1);
2428 }
2429
2430 #[test]
2431 fn grep_reports_total_matches_before_truncation() {
2432 let dir = tempfile::tempdir().expect("create temp dir");
2433 let project = dir.path().join("project");
2434 let src = project.join("src");
2435 fs::create_dir_all(&src).expect("create src dir");
2436 fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2437
2438 let index = SearchIndex::build(&project);
2439 let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2440
2441 assert_eq!(result.total_matches, 2);
2442 assert_eq!(result.matches.len(), 1);
2443 assert!(result.truncated);
2444 }
2445
2446 #[test]
2447 fn glob_filters_results_to_search_root() {
2448 let dir = tempfile::tempdir().expect("create temp dir");
2449 let project = dir.path().join("project");
2450 let src = project.join("src");
2451 let scripts = project.join("scripts");
2452 fs::create_dir_all(&src).expect("create src dir");
2453 fs::create_dir_all(&scripts).expect("create scripts dir");
2454 fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2455 fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2456
2457 let index = SearchIndex::build(&project);
2458 let files = index.glob("**/*.rs", &src);
2459
2460 assert_eq!(
2461 files,
2462 vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2463 );
2464 }
2465
2466 #[test]
2467 fn glob_includes_hidden_and_binary_files() {
2468 let dir = tempfile::tempdir().expect("create temp dir");
2469 let project = dir.path().join("project");
2470 let hidden_dir = project.join(".hidden");
2471 fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2472 let hidden_file = hidden_dir.join("data.bin");
2473 fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2474
2475 let index = SearchIndex::build(&project);
2476 let files = index.glob("**/*.bin", &project);
2477
2478 assert_eq!(
2479 files,
2480 vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2481 );
2482 }
2483
    #[test]
    fn read_from_disk_rejects_invalid_nanos() {
        // Hand-roll a cache whose single file entry carries an mtime
        // nanoseconds value of 1_000_000_000 — one past the valid
        // 0..=999_999_999 range — and check the reader refuses it.
        let dir = tempfile::tempdir().expect("create temp dir");
        let cache_dir = dir.path().join("cache");
        fs::create_dir_all(&cache_dir).expect("create cache dir");

        // Postings section. Field order mirrors the on-disk layout the
        // reader expects; the meaning of the individual scalars is inferred
        // from the writer — TODO confirm against write_to_disk if the
        // format changes.
        let mut postings = Vec::new();
        postings.extend_from_slice(INDEX_MAGIC);
        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        postings.extend_from_slice(&0u32.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes()); // presumably path length
        postings.extend_from_slice(b"/");
        postings.push(0u8);
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        // The invalid value under test: 1e9 ns is out of range.
        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
        postings.extend_from_slice(b"a");
        postings.extend_from_slice(&0u64.to_le_bytes());

        // Minimal empty lookup section.
        let mut lookup = Vec::new();
        lookup.extend_from_slice(LOOKUP_MAGIC);
        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        lookup.extend_from_slice(&0u32.to_le_bytes());

        // Append valid CRC32 trailers so the rejection comes from the nanos
        // field, not from a checksum mismatch.
        let postings_checksum = crc32fast::hash(&postings);
        postings.extend_from_slice(&postings_checksum.to_le_bytes());
        let lookup_checksum = crc32fast::hash(&lookup);
        lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
        // Assemble the cache container: magic, version, postings length,
        // then both sections back to back.
        let mut cache = Vec::new();
        cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
        cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
        cache.extend_from_slice(&postings);
        cache.extend_from_slice(&lookup);
        fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");

        assert!(SearchIndex::read_from_disk(&cache_dir, dir.path()).is_none());
    }
2525
2526 #[test]
2541 fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2542 let dir = tempfile::tempdir().expect("create tempdir");
2546 let mut paths: Vec<PathBuf> = Vec::new();
2547 for i in 0..30 {
2548 let path = if i % 2 == 0 {
2550 let p = dir.path().join(format!("real-{i}.rs"));
2551 fs::write(&p, format!("// {i}\n")).expect("write");
2552 p
2553 } else {
2554 dir.path().join(format!("missing-{i}.rs"))
2555 };
2556 paths.push(path);
2557 }
2558
2559 for _ in 0..50 {
2562 let mut copy = paths.clone();
2563 sort_paths_by_mtime_desc(&mut copy);
2564 assert_eq!(copy.len(), paths.len());
2565 }
2566 }
2567
2568 #[test]
2572 fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2573 let dir = tempfile::tempdir().expect("create tempdir");
2574 let mut matches: Vec<GrepMatch> = Vec::new();
2575 for i in 0..30 {
2576 let file = if i % 2 == 0 {
2577 let p = dir.path().join(format!("real-{i}.rs"));
2578 fs::write(&p, format!("// {i}\n")).expect("write");
2579 p
2580 } else {
2581 dir.path().join(format!("missing-{i}.rs"))
2582 };
2583 matches.push(GrepMatch {
2584 file,
2585 line: u32::try_from(i).unwrap_or(0),
2586 column: 0,
2587 line_text: format!("match {i}"),
2588 match_text: format!("match {i}"),
2589 });
2590 }
2591
2592 for _ in 0..50 {
2593 let mut copy = matches.clone();
2594 sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2595 assert_eq!(copy.len(), matches.len());
2596 }
2597 }
2598}