1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
18const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
19const CACHE_MAGIC: u32 = 0x3144_4958; const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
21const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
22const INDEX_VERSION: u32 = 2;
23const PREVIEW_BYTES: usize = 8 * 1024;
24const EOF_SENTINEL: u8 = 0;
25const MAX_ENTRIES: usize = 10_000_000;
26const MIN_FILE_ENTRY_BYTES: usize = 25;
27const LOOKUP_ENTRY_BYTES: usize = 16;
28const POSTING_BYTES: usize = 6;
29
30pub struct CacheLock {
31 path: PathBuf,
32}
33
34impl CacheLock {
35 pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
36 fs::create_dir_all(cache_dir)?;
37 let path = cache_dir.join("cache.lock");
38 for _ in 0..200 {
39 match fs::OpenOptions::new()
40 .write(true)
41 .create_new(true)
42 .open(&path)
43 {
44 Ok(mut file) => {
45 let _ = writeln!(file, "{}", std::process::id());
46 let _ = file.sync_all();
47 return Ok(Self { path });
48 }
49 Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => {
50 std::thread::sleep(Duration::from_millis(10));
51 }
52 Err(error) => return Err(error),
53 }
54 }
55 Err(std::io::Error::other(
56 "timed out acquiring search cache lock",
57 ))
58 }
59}
60
61impl Drop for CacheLock {
62 fn drop(&mut self) {
63 let _ = fs::remove_file(&self.path);
64 }
65}
66
67#[derive(Clone, Debug)]
68pub struct SearchIndex {
69 pub postings: HashMap<u32, Vec<Posting>>,
70 pub files: Vec<FileEntry>,
71 pub path_to_id: HashMap<PathBuf, u32>,
72 pub ready: bool,
73 project_root: PathBuf,
74 git_head: Option<String>,
75 max_file_size: u64,
76 pub file_trigrams: HashMap<u32, Vec<u32>>,
77 unindexed_files: HashSet<u32>,
78}
79
80impl SearchIndex {
81 pub fn file_count(&self) -> usize {
83 self.files.len()
84 }
85
86 pub fn trigram_count(&self) -> usize {
88 self.postings.len()
89 }
90
91 pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
93 query_trigrams_from_tokens(tokens)
94 }
95
96 pub fn lexical_rank(
98 &self,
99 query_trigrams: &[u32],
100 candidate_filter: Option<&dyn Fn(&Path) -> bool>,
101 max_files: usize,
102 ) -> Vec<(PathBuf, f32)> {
103 if query_trigrams.is_empty() || max_files == 0 {
104 return Vec::new();
105 }
106
107 let mut non_zero: Vec<(u32, usize)> = query_trigrams
108 .iter()
109 .filter_map(|trigram| {
110 let posting_count = self.postings.get(trigram).map_or(0, Vec::len);
111 (posting_count > 0).then_some((*trigram, posting_count))
112 })
113 .collect();
114 if non_zero.is_empty() {
115 return Vec::new();
116 }
117
118 non_zero.sort_unstable_by_key(|(_, posting_count)| *posting_count);
119 let selected_count = non_zero.len().min(3);
120 let candidate_cap = if selected_count == 3 { 200 } else { 500 };
121
122 let mut candidate_ids = BTreeSet::new();
123 for (trigram, _) in non_zero.iter().take(selected_count) {
124 if let Some(postings) = self.postings.get(trigram) {
125 for posting in postings {
126 if self.is_active_file(posting.file_id) {
127 candidate_ids.insert(posting.file_id);
128 }
129 }
130 }
131 }
132
133 let mut ranked = Vec::new();
134 for file_id in candidate_ids.into_iter().take(candidate_cap) {
135 let Some(entry) = self.files.get(file_id as usize) else {
136 continue;
137 };
138 if let Some(filter) = candidate_filter {
139 if !filter(&entry.path) {
140 continue;
141 }
142 }
143 let score = lexical_score(self, query_trigrams, file_id);
144 if score > 0.0 {
145 ranked.push((entry.path.clone(), score));
146 }
147 }
148
149 ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
150 ranked.truncate(max_files);
151 ranked
152 }
153}
154
155#[derive(Clone, Debug, PartialEq, Eq)]
156pub struct Posting {
157 pub file_id: u32,
158 pub next_mask: u8,
159 pub loc_mask: u8,
160}
161
162#[derive(Clone, Debug)]
163pub struct FileEntry {
164 pub path: PathBuf,
165 pub size: u64,
166 pub modified: SystemTime,
167}
168
169#[derive(Clone, Debug, PartialEq, Eq)]
170pub struct GrepMatch {
171 pub file: PathBuf,
172 pub line: u32,
173 pub column: u32,
174 pub line_text: String,
175 pub match_text: String,
176}
177
178#[derive(Clone, Debug)]
179pub struct GrepResult {
180 pub matches: Vec<GrepMatch>,
181 pub total_matches: usize,
182 pub files_searched: usize,
183 pub files_with_matches: usize,
184 pub index_status: IndexStatus,
185 pub truncated: bool,
186}
187
188#[derive(Clone, Copy, Debug, PartialEq, Eq)]
189pub enum IndexStatus {
190 Ready,
191 Building,
192 Fallback,
193}
194
195impl IndexStatus {
196 pub fn as_str(&self) -> &'static str {
197 match self {
198 IndexStatus::Ready => "Ready",
199 IndexStatus::Building => "Building",
200 IndexStatus::Fallback => "Fallback",
201 }
202 }
203}
204
205#[derive(Clone, Debug, Default)]
206pub struct RegexQuery {
207 pub and_trigrams: Vec<u32>,
208 pub or_groups: Vec<Vec<u32>>,
209 pub(crate) and_filters: HashMap<u32, PostingFilter>,
210 pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
211}
212
213#[derive(Clone, Copy, Debug, Default)]
214pub(crate) struct PostingFilter {
215 next_mask: u8,
216 loc_mask: u8,
217}
218
219#[derive(Clone, Debug, Default)]
220struct QueryBuild {
221 and_runs: Vec<Vec<u8>>,
222 or_groups: Vec<Vec<Vec<u8>>>,
223}
224
225#[derive(Clone, Debug, Default)]
226pub(crate) struct PathFilters {
227 includes: Option<GlobSet>,
228 excludes: Option<GlobSet>,
229}
230
231#[derive(Clone, Debug)]
232pub(crate) struct SearchScope {
233 pub root: PathBuf,
234 pub use_index: bool,
235}
236
237#[derive(Clone, Debug)]
238struct SharedGrepMatch {
239 file: Arc<PathBuf>,
240 line: u32,
241 column: u32,
242 line_text: String,
243 match_text: String,
244}
245
246#[derive(Clone, Debug)]
247enum SearchMatcher {
248 Literal(LiteralSearch),
249 Regex(Regex),
250}
251
252#[derive(Clone, Debug)]
253enum LiteralSearch {
254 CaseSensitive(Vec<u8>),
255 AsciiCaseInsensitive(Vec<u8>),
256}
257
258impl SearchIndex {
259 pub fn new() -> Self {
260 SearchIndex {
261 postings: HashMap::new(),
262 files: Vec::new(),
263 path_to_id: HashMap::new(),
264 ready: false,
265 project_root: PathBuf::new(),
266 git_head: None,
267 max_file_size: DEFAULT_MAX_FILE_SIZE,
268 file_trigrams: HashMap::new(),
269 unindexed_files: HashSet::new(),
270 }
271 }
272
273 pub fn build(root: &Path) -> Self {
274 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
275 }
276
277 pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
278 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
279 let mut index = SearchIndex {
280 project_root: project_root.clone(),
281 max_file_size,
282 ..SearchIndex::new()
283 };
284
285 let filters = PathFilters::default();
286 for path in walk_project_files(&project_root, &filters) {
287 index.update_file(&path);
288 }
289
290 index.git_head = current_git_head(&project_root);
291 index.ready = true;
292 index
293 }
294
295 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
296 self.remove_file(path);
297
298 let file_id = match self.allocate_file_id(path, content.len() as u64) {
299 Some(file_id) => file_id,
300 None => return,
301 };
302
303 let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
304 for (trigram, next_char, position) in extract_trigrams(content) {
305 let entry = trigram_map.entry(trigram).or_default();
306 entry.next_mask |= mask_for_next_char(next_char);
307 entry.loc_mask |= mask_for_position(position);
308 }
309
310 let mut file_trigrams = Vec::with_capacity(trigram_map.len());
311 for (trigram, filter) in trigram_map {
312 let postings = self.postings.entry(trigram).or_default();
313 postings.push(Posting {
314 file_id,
315 next_mask: filter.next_mask,
316 loc_mask: filter.loc_mask,
317 });
318 if postings.len() > 1
322 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
323 {
324 postings.sort_unstable_by_key(|p| p.file_id);
325 }
326 file_trigrams.push(trigram);
327 }
328
329 self.file_trigrams.insert(file_id, file_trigrams);
330 self.unindexed_files.remove(&file_id);
331 }
332
333 pub fn remove_file(&mut self, path: &Path) {
334 let Some(file_id) = self.path_to_id.remove(path) else {
335 return;
336 };
337
338 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
339 for trigram in trigrams {
340 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
341 postings.retain(|posting| posting.file_id != file_id);
342 postings.is_empty()
343 } else {
344 false
345 };
346
347 if should_remove {
348 self.postings.remove(&trigram);
349 }
350 }
351 }
352
353 self.unindexed_files.remove(&file_id);
354 if let Some(file) = self.files.get_mut(file_id as usize) {
355 file.path = PathBuf::new();
356 file.size = 0;
357 file.modified = UNIX_EPOCH;
358 }
359 }
360
361 pub fn update_file(&mut self, path: &Path) {
362 self.remove_file(path);
363
364 let metadata = match fs::metadata(path) {
365 Ok(metadata) if metadata.is_file() => metadata,
366 _ => return,
367 };
368
369 if is_binary_path(path, metadata.len()) {
370 self.track_unindexed_file(path, &metadata);
371 return;
372 }
373
374 if metadata.len() > self.max_file_size {
375 self.track_unindexed_file(path, &metadata);
376 return;
377 }
378
379 let content = match fs::read(path) {
380 Ok(content) => content,
381 Err(_) => return,
382 };
383
384 if is_binary_bytes(&content) {
385 self.track_unindexed_file(path, &metadata);
386 return;
387 }
388
389 self.index_file(path, &content);
390 }
391
392 pub fn grep(
393 &self,
394 pattern: &str,
395 case_sensitive: bool,
396 include: &[String],
397 exclude: &[String],
398 search_root: &Path,
399 max_results: usize,
400 ) -> GrepResult {
401 self.search_grep(
402 pattern,
403 case_sensitive,
404 include,
405 exclude,
406 search_root,
407 max_results,
408 )
409 }
410
411 pub fn search_grep(
412 &self,
413 pattern: &str,
414 case_sensitive: bool,
415 include: &[String],
416 exclude: &[String],
417 search_root: &Path,
418 max_results: usize,
419 ) -> GrepResult {
420 let is_literal = !pattern.chars().any(|c| {
423 matches!(
424 c,
425 '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
426 )
427 });
428
429 let literal_search = if is_literal {
430 if case_sensitive {
431 Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
432 } else if pattern.is_ascii() {
433 Some(LiteralSearch::AsciiCaseInsensitive(
434 pattern
435 .as_bytes()
436 .iter()
437 .map(|byte| byte.to_ascii_lowercase())
438 .collect(),
439 ))
440 } else {
441 None
442 }
443 } else {
444 None
445 };
446
447 let regex = if literal_search.is_some() {
449 None
450 } else {
451 let regex_pattern = if is_literal {
452 regex::escape(pattern)
453 } else {
454 pattern.to_string()
455 };
456 let mut builder = RegexBuilder::new(®ex_pattern);
457 builder.case_insensitive(!case_sensitive);
458 builder.multi_line(true);
460 match builder.build() {
461 Ok(r) => Some(r),
462 Err(_) => {
463 return GrepResult {
464 matches: Vec::new(),
465 total_matches: 0,
466 files_searched: 0,
467 files_with_matches: 0,
468 index_status: if self.ready {
469 IndexStatus::Ready
470 } else {
471 IndexStatus::Building
472 },
473 truncated: false,
474 };
475 }
476 }
477 };
478
479 let matcher = if let Some(literal_search) = literal_search {
480 SearchMatcher::Literal(literal_search)
481 } else {
482 SearchMatcher::Regex(
483 regex.expect("regex should exist when literal matcher is unavailable"),
484 )
485 };
486
487 let filters = match build_path_filters(include, exclude) {
488 Ok(filters) => filters,
489 Err(_) => PathFilters::default(),
490 };
491 let search_root = canonicalize_or_normalize(search_root);
492
493 let query = decompose_regex(pattern);
494 let candidate_ids = self.candidates(&query);
495
496 let candidate_files: Vec<&FileEntry> = candidate_ids
497 .into_iter()
498 .filter_map(|file_id| self.files.get(file_id as usize))
499 .filter(|file| !file.path.as_os_str().is_empty())
500 .filter(|file| is_within_search_root(&search_root, &file.path))
501 .filter(|file| filters.matches(&self.project_root, &file.path))
502 .collect();
503
504 let total_matches = AtomicUsize::new(0);
505 let files_searched = AtomicUsize::new(0);
506 let files_with_matches = AtomicUsize::new(0);
507 let truncated = AtomicBool::new(false);
508 let stop_after = max_results.saturating_mul(2);
509
510 let mut matches = if candidate_files.len() > 10 {
511 candidate_files
512 .par_iter()
513 .map(|file| {
514 search_candidate_file(
515 file,
516 &matcher,
517 max_results,
518 stop_after,
519 &total_matches,
520 &files_searched,
521 &files_with_matches,
522 &truncated,
523 )
524 })
525 .reduce(Vec::new, |mut left, mut right| {
526 left.append(&mut right);
527 left
528 })
529 } else {
530 let mut matches = Vec::new();
531 for file in candidate_files {
532 matches.extend(search_candidate_file(
533 file,
534 &matcher,
535 max_results,
536 stop_after,
537 &total_matches,
538 &files_searched,
539 &files_with_matches,
540 &truncated,
541 ));
542
543 if should_stop_search(&truncated, &total_matches, stop_after) {
544 break;
545 }
546 }
547 matches
548 };
549
550 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
551 self.path_to_id
552 .get(path)
553 .and_then(|file_id| self.files.get(*file_id as usize))
554 .map(|file| file.modified)
555 });
556
557 let matches = matches
558 .into_iter()
559 .map(|matched| GrepMatch {
560 file: matched.file.as_ref().clone(),
561 line: matched.line,
562 column: matched.column,
563 line_text: matched.line_text,
564 match_text: matched.match_text,
565 })
566 .collect();
567
568 GrepResult {
569 total_matches: total_matches.load(Ordering::Relaxed),
570 matches,
571 files_searched: files_searched.load(Ordering::Relaxed),
572 files_with_matches: files_with_matches.load(Ordering::Relaxed),
573 index_status: if self.ready {
574 IndexStatus::Ready
575 } else {
576 IndexStatus::Building
577 },
578 truncated: truncated.load(Ordering::Relaxed),
579 }
580 }
581
582 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
583 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
584 Ok(filters) => filters,
585 Err(_) => return Vec::new(),
586 };
587 let search_root = canonicalize_or_normalize(search_root);
588 let mut entries = self
589 .files
590 .iter()
591 .filter(|file| !file.path.as_os_str().is_empty())
592 .filter(|file| is_within_search_root(&search_root, &file.path))
593 .filter(|file| filters.matches(&self.project_root, &file.path))
594 .map(|file| (file.path.clone(), file.modified))
595 .collect::<Vec<_>>();
596
597 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
598 right_mtime
599 .cmp(left_mtime)
600 .then_with(|| left_path.cmp(right_path))
601 });
602
603 entries.into_iter().map(|(path, _)| path).collect()
604 }
605
606 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
607 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
608 return self.active_file_ids();
609 }
610
611 let mut and_trigrams = query.and_trigrams.clone();
612 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
613
614 let mut current: Option<Vec<u32>> = None;
615
616 for trigram in and_trigrams {
617 let filter = query.and_filters.get(&trigram).copied();
618 let matches = self.postings_for_trigram(trigram, filter);
619 current = Some(match current.take() {
620 Some(existing) => intersect_sorted_ids(&existing, &matches),
621 None => matches,
622 });
623
624 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
625 break;
626 }
627 }
628
629 let mut current = current.unwrap_or_else(|| self.active_file_ids());
630
631 for (index, group) in query.or_groups.iter().enumerate() {
632 let mut group_matches = Vec::new();
633 let filters = query.or_filters.get(index);
634
635 for trigram in group {
636 let filter = filters.and_then(|filters| filters.get(trigram).copied());
637 let matches = self.postings_for_trigram(*trigram, filter);
638 if group_matches.is_empty() {
639 group_matches = matches;
640 } else {
641 group_matches = union_sorted_ids(&group_matches, &matches);
642 }
643 }
644
645 current = intersect_sorted_ids(¤t, &group_matches);
646 if current.is_empty() {
647 break;
648 }
649 }
650
651 let mut unindexed = self
652 .unindexed_files
653 .iter()
654 .copied()
655 .filter(|file_id| self.is_active_file(*file_id))
656 .collect::<Vec<_>>();
657 if !unindexed.is_empty() {
658 unindexed.sort_unstable();
659 current = union_sorted_ids(¤t, &unindexed);
660 }
661
662 current
663 }
664
665 pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
666 if fs::create_dir_all(cache_dir).is_err() {
667 return;
668 }
669
670 let cache_path = cache_dir.join("cache.bin");
671 let tmp_cache = cache_dir.join(format!(
672 "cache.bin.tmp.{}.{}",
673 std::process::id(),
674 SystemTime::now()
675 .duration_since(UNIX_EPOCH)
676 .unwrap_or(Duration::ZERO)
677 .as_nanos()
678 ));
679
680 let active_ids = self.active_file_ids();
681 let mut id_map = HashMap::new();
682 for (new_id, old_id) in active_ids.iter().enumerate() {
683 let Ok(new_id_u32) = u32::try_from(new_id) else {
684 return;
685 };
686 id_map.insert(*old_id, new_id_u32);
687 }
688
689 let write_result = (|| -> std::io::Result<()> {
690 let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));
691
692 postings_writer.write_all(INDEX_MAGIC)?;
693 write_u32(&mut postings_writer, INDEX_VERSION)?;
694
695 let head = git_head.unwrap_or_default();
696 let root = self.project_root.to_string_lossy();
697 let head_len = u32::try_from(head.len())
698 .map_err(|_| std::io::Error::other("git head too large to cache"))?;
699 let root_len = u32::try_from(root.len())
700 .map_err(|_| std::io::Error::other("project root too large to cache"))?;
701 let file_count = u32::try_from(active_ids.len())
702 .map_err(|_| std::io::Error::other("too many files to cache"))?;
703
704 write_u32(&mut postings_writer, head_len)?;
705 write_u32(&mut postings_writer, root_len)?;
706 write_u64(&mut postings_writer, self.max_file_size)?;
707 write_u32(&mut postings_writer, file_count)?;
708 postings_writer.write_all(head.as_bytes())?;
709 postings_writer.write_all(root.as_bytes())?;
710
711 for old_id in &active_ids {
712 let Some(file) = self.files.get(*old_id as usize) else {
713 return Err(std::io::Error::other("missing file entry for cache write"));
714 };
715 let path = relative_to_root(&self.project_root, &file.path);
716 let path = path.to_string_lossy();
717 let path_len = u32::try_from(path.len())
718 .map_err(|_| std::io::Error::other("cached path too large"))?;
719 let modified = file
720 .modified
721 .duration_since(UNIX_EPOCH)
722 .unwrap_or(Duration::ZERO);
723 let unindexed = if self.unindexed_files.contains(old_id) {
724 1u8
725 } else {
726 0u8
727 };
728
729 postings_writer.write_all(&[unindexed])?;
730 write_u32(&mut postings_writer, path_len)?;
731 write_u64(&mut postings_writer, file.size)?;
732 write_u64(&mut postings_writer, modified.as_secs())?;
733 write_u32(&mut postings_writer, modified.subsec_nanos())?;
734 postings_writer.write_all(path.as_bytes())?;
735 }
736
737 let mut lookup_entries = Vec::new();
738 let mut postings_blob = Vec::new();
739 let mut sorted_postings: Vec<_> = self.postings.iter().collect();
740 sorted_postings.sort_by_key(|(trigram, _)| **trigram);
741
742 for (trigram, postings) in sorted_postings {
743 let offset = u64::try_from(postings_blob.len())
744 .map_err(|_| std::io::Error::other("postings blob too large"))?;
745 let mut count = 0u32;
746
747 for posting in postings {
748 let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
749 continue;
750 };
751
752 postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
753 postings_blob.push(posting.next_mask);
754 postings_blob.push(posting.loc_mask);
755 count = count.saturating_add(1);
756 }
757
758 if count > 0 {
759 lookup_entries.push((*trigram, offset, count));
760 }
761 }
762
763 write_u64(
764 &mut postings_writer,
765 u64::try_from(postings_blob.len())
766 .map_err(|_| std::io::Error::other("postings blob too large"))?,
767 )?;
768 postings_writer.write_all(&postings_blob)?;
769 postings_writer.flush()?;
770 let mut postings_blob_file = postings_writer
771 .into_inner()
772 .map_err(|error| std::io::Error::other(error.to_string()))?
773 .into_inner();
774 let checksum = crc32fast::hash(&postings_blob_file);
775 postings_blob_file.extend_from_slice(&checksum.to_le_bytes());
776
777 let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
778 let entry_count = u32::try_from(lookup_entries.len())
779 .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;
780
781 lookup_writer.write_all(LOOKUP_MAGIC)?;
782 write_u32(&mut lookup_writer, INDEX_VERSION)?;
783 write_u32(&mut lookup_writer, entry_count)?;
784
785 for (trigram, offset, count) in lookup_entries {
786 write_u32(&mut lookup_writer, trigram)?;
787 write_u64(&mut lookup_writer, offset)?;
788 write_u32(&mut lookup_writer, count)?;
789 }
790
791 lookup_writer.flush()?;
792 let mut lookup_blob_file = lookup_writer
793 .into_inner()
794 .map_err(|error| std::io::Error::other(error.to_string()))?
795 .into_inner();
796 let checksum = crc32fast::hash(&lookup_blob_file);
797 lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());
798
799 let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
800 write_u32(&mut cache_writer, CACHE_MAGIC)?;
801 write_u32(&mut cache_writer, INDEX_VERSION)?;
802 write_u64(
803 &mut cache_writer,
804 u64::try_from(postings_blob_file.len())
805 .map_err(|_| std::io::Error::other("postings section too large"))?,
806 )?;
807 cache_writer.write_all(&postings_blob_file)?;
808 cache_writer.write_all(&lookup_blob_file)?;
809 cache_writer.flush()?;
810 cache_writer.get_ref().sync_all()?;
811 drop(cache_writer);
812 fs::rename(&tmp_cache, &cache_path)?;
813
814 Ok(())
815 })();
816
817 if write_result.is_err() {
818 let _ = fs::remove_file(&tmp_cache);
819 }
820 }
821
822 pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
823 let cache_path = cache_dir.join("cache.bin");
824 let cache_bytes = fs::read(&cache_path).ok()?;
825 if cache_bytes.len() < 16 {
826 return None;
827 }
828 let mut header = Cursor::new(&cache_bytes);
829 if read_u32(&mut header).ok()? != CACHE_MAGIC {
830 return None;
831 }
832 if read_u32(&mut header).ok()? != INDEX_VERSION {
833 return None;
834 }
835 let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
836 let start = usize::try_from(header.position()).ok()?;
837 let postings_end = start.checked_add(postings_len_total)?;
838 if postings_end > cache_bytes.len() {
839 return None;
840 }
841 let postings_bytes = &cache_bytes[start..postings_end];
842 let lookup_bytes = &cache_bytes[postings_end..];
843 let lookup_len_total = lookup_bytes.len();
844 let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
845 let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
846 if postings_len_total < 4 || lookup_len_total < 4 {
847 return None;
848 }
849 verify_crc32_bytes_slice(postings_bytes).ok()?;
850 verify_crc32_bytes_slice(lookup_bytes).ok()?;
851
852 let mut magic = [0u8; 8];
853 postings_reader.read_exact(&mut magic).ok()?;
854 if &magic != INDEX_MAGIC {
855 return None;
856 }
857 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
858 return None;
859 }
860
861 let head_len = read_u32(&mut postings_reader).ok()? as usize;
862 let root_len = read_u32(&mut postings_reader).ok()? as usize;
863 let max_file_size = read_u64(&mut postings_reader).ok()?;
864 let file_count = read_u32(&mut postings_reader).ok()? as usize;
865 if file_count > MAX_ENTRIES {
866 return None;
867 }
868 let postings_body_len = postings_len_total.checked_sub(4)?;
869 let lookup_body_len = lookup_len_total.checked_sub(4)?;
870
871 let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
872 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
873 if minimum_file_bytes > remaining_postings {
874 return None;
875 }
876
877 if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
878 return None;
879 }
880 let mut head_bytes = vec![0u8; head_len];
881 postings_reader.read_exact(&mut head_bytes).ok()?;
882 let git_head = String::from_utf8(head_bytes)
883 .ok()
884 .filter(|head| !head.is_empty());
885
886 if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
887 return None;
888 }
889 let mut root_bytes = vec![0u8; root_len];
890 postings_reader.read_exact(&mut root_bytes).ok()?;
891 let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
892
893 let mut files = Vec::with_capacity(file_count);
894 let mut path_to_id = HashMap::new();
895 let mut unindexed_files = HashSet::new();
896
897 for file_id in 0..file_count {
898 let mut unindexed = [0u8; 1];
899 postings_reader.read_exact(&mut unindexed).ok()?;
900 let path_len = read_u32(&mut postings_reader).ok()? as usize;
901 let size = read_u64(&mut postings_reader).ok()?;
902 let secs = read_u64(&mut postings_reader).ok()?;
903 let nanos = read_u32(&mut postings_reader).ok()?;
904 if nanos >= 1_000_000_000 {
905 return None;
906 }
907 if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
908 return None;
909 }
910 let mut path_bytes = vec![0u8; path_len];
911 postings_reader.read_exact(&mut path_bytes).ok()?;
912 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
913 let full_path = project_root.join(relative_path);
914 let file_id_u32 = u32::try_from(file_id).ok()?;
915
916 files.push(FileEntry {
917 path: full_path.clone(),
918 size,
919 modified: UNIX_EPOCH + Duration::new(secs, nanos),
920 });
921 path_to_id.insert(full_path, file_id_u32);
922 if unindexed[0] == 1 {
923 unindexed_files.insert(file_id_u32);
924 }
925 }
926
927 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
928 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
929 if postings_len > max_postings_bytes {
930 return None;
931 }
932 if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
933 return None;
934 }
935 let mut postings_blob = vec![0u8; postings_len];
936 postings_reader.read_exact(&mut postings_blob).ok()?;
937
938 let mut lookup_magic = [0u8; 8];
939 lookup_reader.read_exact(&mut lookup_magic).ok()?;
940 if &lookup_magic != LOOKUP_MAGIC {
941 return None;
942 }
943 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
944 return None;
945 }
946 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
947 if entry_count > MAX_ENTRIES {
948 return None;
949 }
950 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
951 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
952 if minimum_lookup_bytes > remaining_lookup {
953 return None;
954 }
955
956 let mut postings = HashMap::new();
957 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
958
959 for _ in 0..entry_count {
960 let trigram = read_u32(&mut lookup_reader).ok()?;
961 let offset = read_u64(&mut lookup_reader).ok()? as usize;
962 let count = read_u32(&mut lookup_reader).ok()? as usize;
963 if count > MAX_ENTRIES {
964 return None;
965 }
966 let bytes_len = count.checked_mul(POSTING_BYTES)?;
967 let end = offset.checked_add(bytes_len)?;
968 if end > postings_blob.len() {
969 return None;
970 }
971
972 let mut trigram_postings = Vec::with_capacity(count);
973 for chunk in postings_blob[offset..end].chunks_exact(6) {
974 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
975 let posting = Posting {
976 file_id,
977 next_mask: chunk[4],
978 loc_mask: chunk[5],
979 };
980 trigram_postings.push(posting.clone());
981 file_trigrams.entry(file_id).or_default().push(trigram);
982 }
983 postings.insert(trigram, trigram_postings);
984 }
985
986 Some(SearchIndex {
987 postings,
988 files,
989 path_to_id,
990 ready: true,
991 project_root,
992 git_head,
993 max_file_size,
994 file_trigrams,
995 unindexed_files,
996 })
997 }
998
999 pub(crate) fn stored_git_head(&self) -> Option<&str> {
1000 self.git_head.as_deref()
1001 }
1002
1003 pub(crate) fn set_ready(&mut self, ready: bool) {
1004 self.ready = ready;
1005 }
1006
1007 pub(crate) fn rebuild_or_refresh(
1008 root: &Path,
1009 max_file_size: u64,
1010 current_head: Option<String>,
1011 baseline: Option<SearchIndex>,
1012 ) -> Self {
1013 if let Some(mut baseline) = baseline {
1014 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
1015 baseline.max_file_size = max_file_size;
1016
1017 if baseline.git_head == current_head || current_head.is_none() {
1018 baseline.git_head = current_head;
1025 verify_file_mtimes(&mut baseline);
1026 baseline.ready = true;
1027 return baseline;
1028 }
1029
1030 if let (Some(previous), Some(current)) =
1031 (baseline.git_head.clone(), current_head.clone())
1032 {
1033 let project_root = baseline.project_root.clone();
1034 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
1035 baseline.git_head = Some(current);
1036 baseline.ready = true;
1037 return baseline;
1038 }
1039 }
1040 }
1041
1042 SearchIndex::build_with_limit(root, max_file_size)
1043 }
1044
1045 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
1046 let file_id = u32::try_from(self.files.len()).ok()?;
1047 let metadata = fs::metadata(path).ok();
1048 let size = metadata
1049 .as_ref()
1050 .map_or(size_hint, |metadata| metadata.len());
1051 let modified = metadata
1052 .and_then(|metadata| metadata.modified().ok())
1053 .unwrap_or(UNIX_EPOCH);
1054
1055 self.files.push(FileEntry {
1056 path: path.to_path_buf(),
1057 size,
1058 modified,
1059 });
1060 self.path_to_id.insert(path.to_path_buf(), file_id);
1061 Some(file_id)
1062 }
1063
1064 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1065 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1066 return;
1067 };
1068 self.unindexed_files.insert(file_id);
1069 self.file_trigrams.insert(file_id, Vec::new());
1070 }
1071
1072 fn active_file_ids(&self) -> Vec<u32> {
1073 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1074 ids.sort_unstable();
1075 ids
1076 }
1077
1078 fn is_active_file(&self, file_id: u32) -> bool {
1079 self.files
1080 .get(file_id as usize)
1081 .map(|file| !file.path.as_os_str().is_empty())
1082 .unwrap_or(false)
1083 }
1084
1085 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1086 let Some(postings) = self.postings.get(&trigram) else {
1087 return Vec::new();
1088 };
1089
1090 let mut matches = Vec::with_capacity(postings.len());
1091
1092 for posting in postings {
1093 if let Some(filter) = filter {
1094 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1097 continue;
1098 }
1099 }
1104 if self.is_active_file(posting.file_id) {
1105 matches.push(posting.file_id);
1106 }
1107 }
1108
1109 matches
1110 }
1111}
1112
1113fn search_candidate_file(
1114 file: &FileEntry,
1115 matcher: &SearchMatcher,
1116 max_results: usize,
1117 stop_after: usize,
1118 total_matches: &AtomicUsize,
1119 files_searched: &AtomicUsize,
1120 files_with_matches: &AtomicUsize,
1121 truncated: &AtomicBool,
1122) -> Vec<SharedGrepMatch> {
1123 if should_stop_search(truncated, total_matches, stop_after) {
1124 return Vec::new();
1125 }
1126
1127 let content = match read_indexed_file_bytes(&file.path) {
1128 Some(content) => content,
1129 None => return Vec::new(),
1130 };
1131 if is_binary_bytes(&content) {
1138 return Vec::new();
1139 }
1140 files_searched.fetch_add(1, Ordering::Relaxed);
1141
1142 let shared_path = Arc::new(file.path.clone());
1143 let mut matches = Vec::new();
1144 let mut line_starts = None;
1145 let mut seen_lines = HashSet::new();
1146 let mut matched_this_file = false;
1147
1148 match matcher {
1149 SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1150 let finder = memchr::memmem::Finder::new(needle);
1151 let mut start = 0;
1152
1153 while let Some(position) = finder.find(&content[start..]) {
1154 if should_stop_search(truncated, total_matches, stop_after) {
1155 break;
1156 }
1157
1158 let offset = start + position;
1159 start = offset + 1;
1160
1161 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1162 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1163 if !seen_lines.insert(line) {
1164 continue;
1165 }
1166
1167 matched_this_file = true;
1168 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1169 if match_number > max_results {
1170 truncated.store(true, Ordering::Relaxed);
1171 break;
1172 }
1173
1174 let end = offset + needle.len();
1175 matches.push(SharedGrepMatch {
1176 file: shared_path.clone(),
1177 line,
1178 column,
1179 line_text,
1180 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1181 });
1182 }
1183 }
1184 SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1185 let search_content = content.to_ascii_lowercase();
1186 let finder = memchr::memmem::Finder::new(needle);
1187 let mut start = 0;
1188
1189 while let Some(position) = finder.find(&search_content[start..]) {
1190 if should_stop_search(truncated, total_matches, stop_after) {
1191 break;
1192 }
1193
1194 let offset = start + position;
1195 start = offset + 1;
1196
1197 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1198 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1199 if !seen_lines.insert(line) {
1200 continue;
1201 }
1202
1203 matched_this_file = true;
1204 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1205 if match_number > max_results {
1206 truncated.store(true, Ordering::Relaxed);
1207 break;
1208 }
1209
1210 let end = offset + needle.len();
1211 matches.push(SharedGrepMatch {
1212 file: shared_path.clone(),
1213 line,
1214 column,
1215 line_text,
1216 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1217 });
1218 }
1219 }
1220 SearchMatcher::Regex(regex) => {
1221 for matched in regex.find_iter(&content) {
1222 if should_stop_search(truncated, total_matches, stop_after) {
1223 break;
1224 }
1225
1226 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1227 let (line, column, line_text) =
1228 line_details_bytes(&content, line_starts, matched.start());
1229 if !seen_lines.insert(line) {
1230 continue;
1231 }
1232
1233 matched_this_file = true;
1234 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1235 if match_number > max_results {
1236 truncated.store(true, Ordering::Relaxed);
1237 break;
1238 }
1239
1240 matches.push(SharedGrepMatch {
1241 file: shared_path.clone(),
1242 line,
1243 column,
1244 line_text,
1245 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1246 });
1247 }
1248 }
1249 }
1250
1251 if matched_this_file {
1252 files_with_matches.fetch_add(1, Ordering::Relaxed);
1253 }
1254
1255 matches
1256}
1257
1258fn should_stop_search(
1259 truncated: &AtomicBool,
1260 total_matches: &AtomicUsize,
1261 stop_after: usize,
1262) -> bool {
1263 truncated.load(Ordering::Relaxed) && total_matches.load(Ordering::Relaxed) >= stop_after
1264}
1265
1266fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1267 let mut merged = Vec::with_capacity(left.len().min(right.len()));
1268 let mut left_index = 0;
1269 let mut right_index = 0;
1270
1271 while left_index < left.len() && right_index < right.len() {
1272 match left[left_index].cmp(&right[right_index]) {
1273 std::cmp::Ordering::Less => left_index += 1,
1274 std::cmp::Ordering::Greater => right_index += 1,
1275 std::cmp::Ordering::Equal => {
1276 merged.push(left[left_index]);
1277 left_index += 1;
1278 right_index += 1;
1279 }
1280 }
1281 }
1282
1283 merged
1284}
1285
1286fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
1287 let mut merged = Vec::with_capacity(left.len() + right.len());
1288 let mut left_index = 0;
1289 let mut right_index = 0;
1290
1291 while left_index < left.len() && right_index < right.len() {
1292 match left[left_index].cmp(&right[right_index]) {
1293 std::cmp::Ordering::Less => {
1294 merged.push(left[left_index]);
1295 left_index += 1;
1296 }
1297 std::cmp::Ordering::Greater => {
1298 merged.push(right[right_index]);
1299 right_index += 1;
1300 }
1301 std::cmp::Ordering::Equal => {
1302 merged.push(left[left_index]);
1303 left_index += 1;
1304 right_index += 1;
1305 }
1306 }
1307 }
1308
1309 merged.extend_from_slice(&left[left_index..]);
1310 merged.extend_from_slice(&right[right_index..]);
1311 merged
1312}
1313
1314pub fn decompose_regex(pattern: &str) -> RegexQuery {
1315 let hir = match regex_syntax::parse(pattern) {
1316 Ok(hir) => hir,
1317 Err(_) => return RegexQuery::default(),
1318 };
1319
1320 let build = build_query(&hir);
1321 build.into_query()
1322}
1323
1324pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
1325 ((a as u32) << 16) | ((b as u32) << 8) | c as u32
1326}
1327
1328pub fn normalize_char(c: u8) -> u8 {
1329 c.to_ascii_lowercase()
1330}
1331
1332pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1333 if content.len() < 3 {
1334 return Vec::new();
1335 }
1336
1337 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1338 for start in 0..=content.len() - 3 {
1339 let trigram = pack_trigram(
1340 normalize_char(content[start]),
1341 normalize_char(content[start + 1]),
1342 normalize_char(content[start + 2]),
1343 );
1344 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1345 trigrams.push((trigram, next_char, start));
1346 }
1347 trigrams
1348}
1349
1350pub fn query_trigrams_from_tokens(tokens: &[&str]) -> Vec<u32> {
1351 let mut seen = HashSet::new();
1352 let mut out = Vec::new();
1353 for token in tokens {
1354 for (trigram, _, _) in extract_trigrams(token.as_bytes()) {
1355 if seen.insert(trigram) {
1356 out.push(trigram);
1357 }
1358 }
1359 }
1360 out
1361}
1362
1363pub fn lexical_score(index: &SearchIndex, query_trigrams: &[u32], file_id: u32) -> f32 {
1364 if query_trigrams.is_empty() {
1365 return 0.0;
1366 }
1367
1368 let mut hits = 0u32;
1369 for &trigram in query_trigrams {
1370 if let Some(postings) = index.postings.get(&trigram) {
1371 if postings
1372 .binary_search_by(|posting| posting.file_id.cmp(&file_id))
1373 .is_ok()
1374 {
1375 hits += 1;
1376 }
1377 }
1378 }
1379
1380 if hits == 0 {
1381 return 0.0;
1382 }
1383
1384 let file_trigram_count = index
1385 .file_trigrams
1386 .get(&file_id)
1387 .map_or(1, |trigrams| trigrams.len().max(1)) as f32;
1388 (hits as f32) / (1.0 + file_trigram_count.ln())
1389}
1390
1391pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1392 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1394 return PathBuf::from(override_dir)
1395 .join("index")
1396 .join(project_cache_key(project_root));
1397 }
1398 if let Some(dir) = storage_dir {
1400 return dir.join("index").join(project_cache_key(project_root));
1401 }
1402 let home = std::env::var_os("HOME")
1407 .or_else(|| std::env::var_os("USERPROFILE"))
1408 .map(PathBuf::from)
1409 .unwrap_or_else(std::env::temp_dir);
1410 home.join(".cache")
1411 .join("aft")
1412 .join("index")
1413 .join(project_cache_key(project_root))
1414}
1415
1416pub(crate) fn build_path_filters(
1417 include: &[String],
1418 exclude: &[String],
1419) -> Result<PathFilters, String> {
1420 Ok(PathFilters {
1421 includes: build_globset(include)?,
1422 excludes: build_globset(exclude)?,
1423 })
1424}
1425
1426pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
1427 walk_project_files_from(root, root, filters)
1428}
1429
1430pub(crate) fn walk_project_files_from(
1431 filter_root: &Path,
1432 search_root: &Path,
1433 filters: &PathFilters,
1434) -> Vec<PathBuf> {
1435 let mut builder = WalkBuilder::new(search_root);
1436 builder
1437 .hidden(false)
1438 .git_ignore(true)
1439 .git_global(true)
1440 .git_exclude(true)
1441 .filter_entry(|entry| {
1442 let name = entry.file_name().to_string_lossy();
1443 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1444 return !matches!(
1445 name.as_ref(),
1446 "node_modules"
1447 | "target"
1448 | "venv"
1449 | ".venv"
1450 | ".git"
1451 | "__pycache__"
1452 | ".tox"
1453 | "dist"
1454 | "build"
1455 );
1456 }
1457 true
1458 });
1459
1460 let mut files = Vec::new();
1461 for entry in builder.build().filter_map(|entry| entry.ok()) {
1462 if !entry
1463 .file_type()
1464 .map_or(false, |file_type| file_type.is_file())
1465 {
1466 continue;
1467 }
1468 let path = entry.into_path();
1469 if filters.matches(filter_root, &path) {
1470 files.push(path);
1471 }
1472 }
1473
1474 sort_paths_by_mtime_desc(&mut files);
1475 files
1476}
1477
1478pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1479 let bytes = fs::read(path).ok()?;
1480 if is_binary_bytes(&bytes) {
1481 return None;
1482 }
1483 String::from_utf8(bytes).ok()
1484}
1485
1486fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
1487 fs::read(path).ok()
1488}
1489
1490pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
1491 path.strip_prefix(root)
1492 .map(PathBuf::from)
1493 .unwrap_or_else(|_| path.to_path_buf())
1494}
1495
1496pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1509 use std::collections::HashMap;
1510 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1511 for path in paths.iter() {
1512 mtimes
1513 .entry(path.clone())
1514 .or_insert_with(|| path_modified_time(path));
1515 }
1516 paths.sort_by(|left, right| {
1517 let left_mtime = mtimes.get(left).and_then(|v| *v);
1518 let right_mtime = mtimes.get(right).and_then(|v| *v);
1519 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1520 });
1521}
1522
1523pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1526 use std::collections::HashMap;
1527 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1528 for m in matches.iter() {
1529 mtimes.entry(m.file.clone()).or_insert_with(|| {
1530 let resolved = resolve_match_path(project_root, &m.file);
1531 path_modified_time(&resolved)
1532 });
1533 }
1534 matches.sort_by(|left, right| {
1535 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1536 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1537 right_mtime
1538 .cmp(&left_mtime)
1539 .then_with(|| left.file.cmp(&right.file))
1540 .then_with(|| left.line.cmp(&right.line))
1541 .then_with(|| left.column.cmp(&right.column))
1542 });
1543}
1544
1545fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1550 matches: &mut [SharedGrepMatch],
1551 modified_for_path: F,
1552) where
1553 F: Fn(&Path) -> Option<SystemTime>,
1554{
1555 use std::collections::HashMap;
1556 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1557 for m in matches.iter() {
1558 let path = m.file.as_path().to_path_buf();
1559 mtimes
1560 .entry(path.clone())
1561 .or_insert_with(|| modified_for_path(&path));
1562 }
1563 matches.sort_by(|left, right| {
1564 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1565 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1566 right_mtime
1567 .cmp(&left_mtime)
1568 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1569 .then_with(|| left.line.cmp(&right.line))
1570 .then_with(|| left.column.cmp(&right.column))
1571 });
1572}
1573
1574pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1575 let resolved_project_root = canonicalize_or_normalize(project_root);
1576 let root = match path {
1577 Some(path) => {
1578 let path = PathBuf::from(path);
1579 if path.is_absolute() {
1580 canonicalize_or_normalize(&path)
1581 } else {
1582 normalize_path(&resolved_project_root.join(path))
1583 }
1584 }
1585 None => resolved_project_root.clone(),
1586 };
1587
1588 let use_index = is_within_search_root(&resolved_project_root, &root);
1589 SearchScope { root, use_index }
1590}
1591
1592pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
1593 content_inspector::inspect(content).is_binary()
1594}
1595
1596pub(crate) fn current_git_head(root: &Path) -> Option<String> {
1597 run_git(root, &["rev-parse", "HEAD"])
1598}
1599
1600pub fn project_cache_key(project_root: &Path) -> String {
1601 use sha2::{Digest, Sha256};
1602
1603 let mut hasher = Sha256::new();
1604
1605 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1606 hasher.update(root_commit.as_bytes());
1609 } else {
1610 let canonical_root = canonicalize_or_normalize(project_root);
1612 hasher.update(canonical_root.to_string_lossy().as_bytes());
1613 }
1614
1615 let digest = format!("{:x}", hasher.finalize());
1616 digest[..16].to_string()
1617}
1618
1619impl PathFilters {
1620 fn matches(&self, root: &Path, path: &Path) -> bool {
1621 let relative = to_glob_path(&relative_to_root(root, path));
1622 if self
1623 .includes
1624 .as_ref()
1625 .is_some_and(|includes| !includes.is_match(&relative))
1626 {
1627 return false;
1628 }
1629 if self
1630 .excludes
1631 .as_ref()
1632 .is_some_and(|excludes| excludes.is_match(&relative))
1633 {
1634 return false;
1635 }
1636 true
1637 }
1638}
1639
1640fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1641 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1642}
1643
1644fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
1645 if path.is_absolute() {
1646 path.to_path_buf()
1647 } else {
1648 project_root.join(path)
1649 }
1650}
1651
1652fn path_modified_time(path: &Path) -> Option<SystemTime> {
1653 fs::metadata(path)
1654 .and_then(|metadata| metadata.modified())
1655 .ok()
1656}
1657
1658fn normalize_path(path: &Path) -> PathBuf {
1659 let mut result = PathBuf::new();
1660 for component in path.components() {
1661 match component {
1662 Component::ParentDir => {
1663 if !result.pop() {
1664 result.push(component);
1665 }
1666 }
1667 Component::CurDir => {}
1668 _ => result.push(component),
1669 }
1670 }
1671 result
1672}
1673
1674fn verify_file_mtimes(index: &mut SearchIndex) {
1677 let mut stale_paths = Vec::new();
1679 for entry in &index.files {
1680 if entry.path.as_os_str().is_empty() {
1681 continue; }
1683 match fs::metadata(&entry.path) {
1684 Ok(meta) => {
1685 let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1686 if current_mtime != entry.modified || meta.len() != entry.size {
1687 stale_paths.push(entry.path.clone());
1688 }
1689 }
1690 Err(_) => {
1691 stale_paths.push(entry.path.clone());
1693 }
1694 }
1695 }
1696
1697 for path in &stale_paths {
1699 index.update_file(path);
1700 }
1701
1702 let filters = PathFilters::default();
1704 for path in walk_project_files(&index.project_root, &filters) {
1705 if !index.path_to_id.contains_key(&path) {
1706 index.update_file(&path);
1707 }
1708 }
1709
1710 if !stale_paths.is_empty() {
1711 crate::slog_info!(
1712 "search index: refreshed {} stale file(s) from disk cache",
1713 stale_paths.len()
1714 );
1715 }
1716}
1717
1718fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
1719 path.starts_with(search_root)
1720}
1721
1722impl QueryBuild {
1723 fn into_query(self) -> RegexQuery {
1724 let mut query = RegexQuery::default();
1725
1726 for run in self.and_runs {
1727 add_run_to_and_query(&mut query, &run);
1728 }
1729
1730 for group in self.or_groups {
1731 let mut trigrams = BTreeSet::new();
1732 let mut filters = HashMap::new();
1733 for run in group {
1734 for (trigram, filter) in trigram_filters(&run) {
1735 trigrams.insert(trigram);
1736 merge_filter(filters.entry(trigram).or_default(), filter);
1737 }
1738 }
1739 if !trigrams.is_empty() {
1740 query.or_groups.push(trigrams.into_iter().collect());
1741 query.or_filters.push(filters);
1742 }
1743 }
1744
1745 query
1746 }
1747}
1748
1749fn build_query(hir: &Hir) -> QueryBuild {
1750 match hir.kind() {
1751 HirKind::Literal(literal) => {
1752 if literal.0.len() >= 3 {
1753 QueryBuild {
1754 and_runs: vec![literal.0.to_vec()],
1755 or_groups: Vec::new(),
1756 }
1757 } else {
1758 QueryBuild::default()
1759 }
1760 }
1761 HirKind::Capture(capture) => build_query(&capture.sub),
1762 HirKind::Concat(parts) => {
1763 let mut build = QueryBuild::default();
1764 for part in parts {
1765 let part_build = build_query(part);
1766 build.and_runs.extend(part_build.and_runs);
1767 build.or_groups.extend(part_build.or_groups);
1768 }
1769 build
1770 }
1771 HirKind::Alternation(parts) => {
1772 let mut group = Vec::new();
1773 for part in parts {
1774 let Some(mut choices) = guaranteed_run_choices(part) else {
1775 return QueryBuild::default();
1776 };
1777 group.append(&mut choices);
1778 }
1779 if group.is_empty() {
1780 QueryBuild::default()
1781 } else {
1782 QueryBuild {
1783 and_runs: Vec::new(),
1784 or_groups: vec![group],
1785 }
1786 }
1787 }
1788 HirKind::Repetition(repetition) => {
1789 if repetition.min == 0 {
1790 QueryBuild::default()
1791 } else {
1792 build_query(&repetition.sub)
1793 }
1794 }
1795 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1796 }
1797}
1798
1799fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1800 match hir.kind() {
1801 HirKind::Literal(literal) => {
1802 if literal.0.len() >= 3 {
1803 Some(vec![literal.0.to_vec()])
1804 } else {
1805 None
1806 }
1807 }
1808 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1809 HirKind::Concat(parts) => {
1810 let mut runs = Vec::new();
1811 for part in parts {
1812 if let Some(mut part_runs) = guaranteed_run_choices(part) {
1813 runs.append(&mut part_runs);
1814 }
1815 }
1816 if runs.is_empty() {
1817 None
1818 } else {
1819 Some(runs)
1820 }
1821 }
1822 HirKind::Alternation(parts) => {
1823 let mut runs = Vec::new();
1824 for part in parts {
1825 let Some(mut part_runs) = guaranteed_run_choices(part) else {
1826 return None;
1827 };
1828 runs.append(&mut part_runs);
1829 }
1830 if runs.is_empty() {
1831 None
1832 } else {
1833 Some(runs)
1834 }
1835 }
1836 HirKind::Repetition(repetition) => {
1837 if repetition.min == 0 {
1838 None
1839 } else {
1840 guaranteed_run_choices(&repetition.sub)
1841 }
1842 }
1843 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1844 }
1845}
1846
1847fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1848 for (trigram, filter) in trigram_filters(run) {
1849 if !query.and_trigrams.contains(&trigram) {
1850 query.and_trigrams.push(trigram);
1851 }
1852 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1853 }
1854}
1855
1856fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1857 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1858 for (trigram, next_char, position) in extract_trigrams(run) {
1859 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1860 if next_char != EOF_SENTINEL {
1861 entry.next_mask |= mask_for_next_char(next_char);
1862 }
1863 entry.loc_mask |= mask_for_position(position);
1864 }
1865 filters.into_iter().collect()
1866}
1867
1868fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1869 target.next_mask |= filter.next_mask;
1870 target.loc_mask |= filter.loc_mask;
1871}
1872
1873fn mask_for_next_char(next_char: u8) -> u8 {
1874 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1875 1u8 << bit
1876}
1877
1878fn mask_for_position(position: usize) -> u8 {
1879 1u8 << (position % 8)
1880}
1881
1882fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1883 if patterns.is_empty() {
1884 return Ok(None);
1885 }
1886
1887 let mut builder = GlobSetBuilder::new();
1888 for pattern in patterns {
1889 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1890 builder.add(glob);
1891 }
1892 builder.build().map(Some).map_err(|error| error.to_string())
1893}
1894
1895fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
1896 let mut buffer = [0u8; 4];
1897 reader.read_exact(&mut buffer)?;
1898 Ok(u32::from_le_bytes(buffer))
1899}
1900
1901fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
1902 let mut buffer = [0u8; 8];
1903 reader.read_exact(&mut buffer)?;
1904 Ok(u64::from_le_bytes(buffer))
1905}
1906
1907fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
1908 writer.write_all(&value.to_le_bytes())
1909}
1910
1911fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
1912 writer.write_all(&value.to_le_bytes())
1913}
1914
1915fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1916 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1917 return Err(std::io::Error::other("search index checksum missing"));
1918 };
1919 let expected = u32::from_le_bytes(*stored);
1920 let actual = crc32fast::hash(body);
1921 if actual != expected {
1922 return Err(std::io::Error::other("search index checksum mismatch"));
1923 }
1924 Ok(())
1925}
1926
1927fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
1928 let pos = usize::try_from(reader.stream_position().ok()?).ok()?;
1929 total_len.checked_sub(pos)
1930}
1931
1932fn run_git(root: &Path, args: &[&str]) -> Option<String> {
1933 let output = Command::new("git")
1934 .arg("-C")
1935 .arg(root)
1936 .args(args)
1937 .output()
1938 .ok()?;
1939 if !output.status.success() {
1940 return None;
1941 }
1942 let value = String::from_utf8(output.stdout).ok()?;
1943 let value = value.trim().to_string();
1944 if value.is_empty() {
1945 None
1946 } else {
1947 Some(value)
1948 }
1949}
1950
1951fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1952 let diff_range = format!("{}..{}", from, to);
1953 let output = match Command::new("git")
1954 .arg("-C")
1955 .arg(root)
1956 .args(["diff", "--name-only", &diff_range])
1957 .output()
1958 {
1959 Ok(output) => output,
1960 Err(_) => return false,
1961 };
1962
1963 if !output.status.success() {
1964 return false;
1965 }
1966
1967 let Ok(paths) = String::from_utf8(output.stdout) else {
1968 return false;
1969 };
1970
1971 for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1972 let path = root.join(relative_path);
1973 if path.exists() {
1974 index.update_file(&path);
1975 } else {
1976 index.remove_file(&path);
1977 }
1978 }
1979
1980 true
1981}
1982
1983fn is_binary_path(path: &Path, size: u64) -> bool {
1984 if size == 0 {
1985 return false;
1986 }
1987
1988 let mut file = match File::open(path) {
1989 Ok(file) => file,
1990 Err(_) => return true,
1991 };
1992
1993 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1994 match file.read(&mut preview) {
1995 Ok(read) => is_binary_bytes(&preview[..read]),
1996 Err(_) => true,
1997 }
1998}
1999
2000fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
2001 let mut starts = vec![0usize];
2002 for (index, byte) in content.iter().copied().enumerate() {
2003 if byte == b'\n' {
2004 starts.push(index + 1);
2005 }
2006 }
2007 starts
2008}
2009
2010fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
2011 let line_index = match line_starts.binary_search(&offset) {
2012 Ok(index) => index,
2013 Err(index) => index.saturating_sub(1),
2014 };
2015 let line_start = line_starts.get(line_index).copied().unwrap_or(0);
2016 let line_end = content[line_start..]
2017 .iter()
2018 .position(|byte| *byte == b'\n')
2019 .map(|length| line_start + length)
2020 .unwrap_or(content.len());
2021 let mut line_slice = &content[line_start..line_end];
2022 if line_slice.ends_with(b"\r") {
2023 line_slice = &line_slice[..line_slice.len() - 1];
2024 }
2025 let line_text = String::from_utf8_lossy(line_slice).into_owned();
2026 let column = String::from_utf8_lossy(&content[line_start..offset])
2027 .chars()
2028 .count() as u32
2029 + 1;
2030 (line_index as u32 + 1, column, line_text)
2031}
2032
2033fn to_glob_path(path: &Path) -> String {
2034 path.to_string_lossy().replace('\\', "/")
2035}
2036
2037#[cfg(test)]
2038mod tests {
2039 use std::process::Command;
2040
2041 use super::*;
2042
2043 #[test]
2044 fn extract_trigrams_tracks_next_char_and_position() {
2045 let trigrams = extract_trigrams(b"Rust");
2046 assert_eq!(trigrams.len(), 2);
2047 assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
2048 assert_eq!(
2049 trigrams[1],
2050 (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
2051 );
2052 }
2053
2054 #[test]
2055 fn decompose_regex_extracts_literals_and_alternations() {
2056 let query = decompose_regex("abc(def|ghi)xyz");
2057 assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
2058 assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
2059 assert_eq!(query.or_groups.len(), 1);
2060 assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
2061 assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
2062 }
2063
2064 #[test]
2065 fn candidates_intersect_posting_lists() {
2066 let mut index = SearchIndex::new();
2067 let dir = tempfile::tempdir().expect("create temp dir");
2068 let alpha = dir.path().join("alpha.txt");
2069 let beta = dir.path().join("beta.txt");
2070 fs::write(&alpha, "abcdef").expect("write alpha");
2071 fs::write(&beta, "abcxyz").expect("write beta");
2072 index.project_root = dir.path().to_path_buf();
2073 index.index_file(&alpha, b"abcdef");
2074 index.index_file(&beta, b"abcxyz");
2075
2076 let query = RegexQuery {
2077 and_trigrams: vec![
2078 pack_trigram(b'a', b'b', b'c'),
2079 pack_trigram(b'd', b'e', b'f'),
2080 ],
2081 ..RegexQuery::default()
2082 };
2083
2084 let candidates = index.candidates(&query);
2085 assert_eq!(candidates.len(), 1);
2086 assert_eq!(index.files[candidates[0] as usize].path, alpha);
2087 }
2088
2089 #[test]
2090 fn candidates_apply_bloom_filters() {
2091 let mut index = SearchIndex::new();
2092 let dir = tempfile::tempdir().expect("create temp dir");
2093 let file = dir.path().join("sample.txt");
2094 fs::write(&file, "abcd efgh").expect("write sample");
2095 index.project_root = dir.path().to_path_buf();
2096 index.index_file(&file, b"abcd efgh");
2097
2098 let trigram = pack_trigram(b'a', b'b', b'c');
2099 let matching_filter = PostingFilter {
2100 next_mask: mask_for_next_char(b'd'),
2101 loc_mask: mask_for_position(0),
2102 };
2103 let non_matching_filter = PostingFilter {
2104 next_mask: mask_for_next_char(b'z'),
2105 loc_mask: mask_for_position(0),
2106 };
2107
2108 assert_eq!(
2109 index
2110 .postings_for_trigram(trigram, Some(matching_filter))
2111 .len(),
2112 1
2113 );
2114 assert!(index
2115 .postings_for_trigram(trigram, Some(non_matching_filter))
2116 .is_empty());
2117 }
2118
2119 #[test]
2120 fn disk_round_trip_preserves_postings_and_files() {
2121 let dir = tempfile::tempdir().expect("create temp dir");
2122 let project = dir.path().join("project");
2123 fs::create_dir_all(&project).expect("create project dir");
2124 let file = project.join("src.txt");
2125 fs::write(&file, "abcdef").expect("write source");
2126
2127 let mut index = SearchIndex::build(&project);
2128 index.git_head = Some("deadbeef".to_string());
2129 let cache_dir = dir.path().join("cache");
2130 index.write_to_disk(&cache_dir, index.git_head.as_deref());
2131
2132 let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
2133 assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2134 assert_eq!(loaded.files.len(), 1);
2135 assert_eq!(
2136 relative_to_root(&loaded.project_root, &loaded.files[0].path),
2137 PathBuf::from("src.txt")
2138 );
2139 assert_eq!(loaded.postings.len(), index.postings.len());
2140 assert!(loaded
2141 .postings
2142 .contains_key(&pack_trigram(b'a', b'b', b'c')));
2143 }
2144
2145 #[test]
2146 fn read_from_disk_rejects_corrupt_postings_checksum() {
2147 let dir = tempfile::tempdir().expect("create temp dir");
2148 let project = dir.path().join("project");
2149 fs::create_dir_all(&project).expect("create project dir");
2150 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2151
2152 let index = SearchIndex::build(&project);
2153 let cache_dir = dir.path().join("cache");
2154 index.write_to_disk(&cache_dir, None);
2155
2156 let cache_path = cache_dir.join("cache.bin");
2157 let mut bytes = fs::read(&cache_path).expect("read cache");
2158 let middle = bytes.len() / 2;
2159 bytes[middle] ^= 0xff;
2160 fs::write(&cache_path, bytes).expect("write corrupted cache");
2161
2162 assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2163 }
2164
2165 #[test]
2166 fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2167 let dir = tempfile::tempdir().expect("create temp dir");
2168 let project = dir.path().join("project");
2169 fs::create_dir_all(&project).expect("create project dir");
2170 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2171
2172 let index = SearchIndex::build(&project);
2173 let cache_dir = dir.path().join("cache");
2174 index.write_to_disk(&cache_dir, None);
2175
2176 assert!(cache_dir.join("cache.bin").is_file());
2177 assert!(fs::read_dir(&cache_dir)
2178 .expect("read cache dir")
2179 .all(|entry| !entry
2180 .expect("cache entry")
2181 .file_name()
2182 .to_string_lossy()
2183 .contains(".tmp.")));
2184 }
2185
2186 #[test]
2187 fn concurrent_search_index_writes_do_not_corrupt() {
2188 let dir = tempfile::tempdir().expect("create temp dir");
2189 let project = dir.path().join("project");
2190 fs::create_dir_all(&project).expect("create project dir");
2191 fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2192 let cache_dir = dir.path().join("cache");
2193
2194 let a_project = project.clone();
2195 let a_cache = cache_dir.clone();
2196 let a = std::thread::spawn(move || {
2197 let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2198 let index = SearchIndex::build(&a_project);
2199 index.write_to_disk(&a_cache, None);
2200 });
2201 let b_project = project.clone();
2202 let b_cache = cache_dir.clone();
2203 let b = std::thread::spawn(move || {
2204 let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2205 let index = SearchIndex::build(&b_project);
2206 index.write_to_disk(&b_cache, None);
2207 });
2208 a.join().expect("writer a");
2209 b.join().expect("writer b");
2210
2211 assert!(SearchIndex::read_from_disk(&cache_dir).is_some());
2212 }
2213
2214 #[test]
2215 fn search_index_atomic_rename_survives_partial_write() {
2216 let dir = tempfile::tempdir().expect("create temp dir");
2217 let cache_dir = dir.path().join("cache");
2218 fs::create_dir_all(&cache_dir).expect("create cache dir");
2219 fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2220
2221 assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2222 }
2223
2224 #[test]
2225 fn project_cache_key_includes_checkout_path() {
2226 let dir = tempfile::tempdir().expect("create temp dir");
2227 let source = dir.path().join("source");
2228 fs::create_dir_all(&source).expect("create source repo dir");
2229 fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");
2230
2231 assert!(Command::new("git")
2232 .current_dir(&source)
2233 .args(["init"])
2234 .status()
2235 .expect("init git repo")
2236 .success());
2237 assert!(Command::new("git")
2238 .current_dir(&source)
2239 .args(["add", "."])
2240 .status()
2241 .expect("git add")
2242 .success());
2243 assert!(Command::new("git")
2244 .current_dir(&source)
2245 .args([
2246 "-c",
2247 "user.name=AFT Tests",
2248 "-c",
2249 "user.email=aft-tests@example.com",
2250 "commit",
2251 "-m",
2252 "initial",
2253 ])
2254 .status()
2255 .expect("git commit")
2256 .success());
2257
2258 let clone = dir.path().join("clone");
2259 assert!(Command::new("git")
2260 .args(["clone", "--quiet"])
2261 .arg(&source)
2262 .arg(&clone)
2263 .status()
2264 .expect("git clone")
2265 .success());
2266
2267 let source_key = project_cache_key(&source);
2268 let clone_key = project_cache_key(&clone);
2269
2270 assert_eq!(source_key.len(), 16);
2271 assert_eq!(clone_key.len(), 16);
2272 assert_eq!(source_key, clone_key);
2274 }
2275
2276 #[test]
2277 fn git_head_unchanged_picks_up_local_edits() {
2278 let dir = tempfile::tempdir().expect("create temp dir");
2279 let project = dir.path().join("repo");
2280 fs::create_dir_all(&project).expect("create repo dir");
2281 let file = project.join("tracked.txt");
2282 fs::write(&file, "oldtoken\n").expect("write file");
2283 assert!(Command::new("git")
2284 .current_dir(&project)
2285 .arg("init")
2286 .status()
2287 .unwrap()
2288 .success());
2289 assert!(Command::new("git")
2290 .current_dir(&project)
2291 .args(["add", "."])
2292 .status()
2293 .unwrap()
2294 .success());
2295 assert!(Command::new("git")
2296 .current_dir(&project)
2297 .args([
2298 "-c",
2299 "user.name=AFT Tests",
2300 "-c",
2301 "user.email=aft-tests@example.com",
2302 "commit",
2303 "-m",
2304 "initial"
2305 ])
2306 .status()
2307 .unwrap()
2308 .success());
2309 let head = current_git_head(&project);
2310 let mut baseline = SearchIndex::build(&project);
2311 baseline.git_head = head.clone();
2312 fs::write(&file, "newtoken\n").expect("edit tracked file");
2313
2314 let refreshed =
2315 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2316 let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2317
2318 assert_eq!(result.total_matches, 1);
2319 }
2320
2321 #[test]
2322 fn non_git_project_reuses_cache_when_files_unchanged() {
2323 let dir = tempfile::tempdir().expect("create temp dir");
2324 let project = dir.path().join("project");
2325 fs::create_dir_all(&project).expect("create project dir");
2326 fs::write(project.join("file.txt"), "unchangedtoken\n").expect("write file");
2327 let baseline = SearchIndex::build(&project);
2328 let baseline_file_count = baseline.file_count();
2329
2330 let refreshed =
2331 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, None, Some(baseline));
2332
2333 assert_eq!(refreshed.file_count(), baseline_file_count);
2334 assert_eq!(
2335 refreshed
2336 .search_grep("unchangedtoken", true, &[], &[], &project, 10)
2337 .total_matches,
2338 1
2339 );
2340 }
2341
2342 #[test]
2343 fn resolve_search_scope_disables_index_for_external_path() {
2344 let dir = tempfile::tempdir().expect("create temp dir");
2345 let project = dir.path().join("project");
2346 let outside = dir.path().join("outside");
2347 fs::create_dir_all(&project).expect("create project dir");
2348 fs::create_dir_all(&outside).expect("create outside dir");
2349
2350 let scope = resolve_search_scope(&project, outside.to_str());
2351
2352 assert_eq!(
2353 scope.root,
2354 fs::canonicalize(&outside).expect("canonicalize outside")
2355 );
2356 assert!(!scope.use_index);
2357 }
2358
2359 #[test]
2360 fn grep_filters_matches_to_search_root() {
2361 let dir = tempfile::tempdir().expect("create temp dir");
2362 let project = dir.path().join("project");
2363 let src = project.join("src");
2364 let docs = project.join("docs");
2365 fs::create_dir_all(&src).expect("create src dir");
2366 fs::create_dir_all(&docs).expect("create docs dir");
2367 fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
2368 fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");
2369
2370 let index = SearchIndex::build(&project);
2371 let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2372
2373 assert_eq!(result.files_searched, 1);
2374 assert_eq!(result.files_with_matches, 1);
2375 assert_eq!(result.matches.len(), 1);
2376 let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
2378 assert_eq!(result.matches[0].file, expected);
2379 }
2380
2381 #[test]
2382 fn grep_deduplicates_multiple_matches_on_same_line() {
2383 let dir = tempfile::tempdir().expect("create temp dir");
2384 let project = dir.path().join("project");
2385 let src = project.join("src");
2386 fs::create_dir_all(&src).expect("create src dir");
2387 fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");
2388
2389 let index = SearchIndex::build(&project);
2390 let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);
2391
2392 assert_eq!(result.total_matches, 1);
2393 assert_eq!(result.matches.len(), 1);
2394 }
2395
2396 #[test]
2397 fn grep_reports_total_matches_before_truncation() {
2398 let dir = tempfile::tempdir().expect("create temp dir");
2399 let project = dir.path().join("project");
2400 let src = project.join("src");
2401 fs::create_dir_all(&src).expect("create src dir");
2402 fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");
2403
2404 let index = SearchIndex::build(&project);
2405 let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);
2406
2407 assert_eq!(result.total_matches, 2);
2408 assert_eq!(result.matches.len(), 1);
2409 assert!(result.truncated);
2410 }
2411
2412 #[test]
2413 fn glob_filters_results_to_search_root() {
2414 let dir = tempfile::tempdir().expect("create temp dir");
2415 let project = dir.path().join("project");
2416 let src = project.join("src");
2417 let scripts = project.join("scripts");
2418 fs::create_dir_all(&src).expect("create src dir");
2419 fs::create_dir_all(&scripts).expect("create scripts dir");
2420 fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
2421 fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");
2422
2423 let index = SearchIndex::build(&project);
2424 let files = index.glob("**/*.rs", &src);
2425
2426 assert_eq!(
2427 files,
2428 vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
2429 );
2430 }
2431
2432 #[test]
2433 fn glob_includes_hidden_and_binary_files() {
2434 let dir = tempfile::tempdir().expect("create temp dir");
2435 let project = dir.path().join("project");
2436 let hidden_dir = project.join(".hidden");
2437 fs::create_dir_all(&hidden_dir).expect("create hidden dir");
2438 let hidden_file = hidden_dir.join("data.bin");
2439 fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");
2440
2441 let index = SearchIndex::build(&project);
2442 let files = index.glob("**/*.bin", &project);
2443
2444 assert_eq!(
2445 files,
2446 vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
2447 );
2448 }
2449
2450 #[test]
2451 fn read_from_disk_rejects_invalid_nanos() {
2452 let dir = tempfile::tempdir().expect("create temp dir");
2453 let cache_dir = dir.path().join("cache");
2454 fs::create_dir_all(&cache_dir).expect("create cache dir");
2455
2456 let mut postings = Vec::new();
2457 postings.extend_from_slice(INDEX_MAGIC);
2458 postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2459 postings.extend_from_slice(&0u32.to_le_bytes());
2460 postings.extend_from_slice(&1u32.to_le_bytes());
2461 postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
2462 postings.extend_from_slice(&1u32.to_le_bytes());
2463 postings.extend_from_slice(b"/");
2464 postings.push(0u8);
2465 postings.extend_from_slice(&1u32.to_le_bytes());
2466 postings.extend_from_slice(&0u64.to_le_bytes());
2467 postings.extend_from_slice(&0u64.to_le_bytes());
2468 postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
2469 postings.extend_from_slice(b"a");
2470 postings.extend_from_slice(&0u64.to_le_bytes());
2471
2472 let mut lookup = Vec::new();
2473 lookup.extend_from_slice(LOOKUP_MAGIC);
2474 lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2475 lookup.extend_from_slice(&0u32.to_le_bytes());
2476
2477 let postings_checksum = crc32fast::hash(&postings);
2478 postings.extend_from_slice(&postings_checksum.to_le_bytes());
2479 let lookup_checksum = crc32fast::hash(&lookup);
2480 lookup.extend_from_slice(&lookup_checksum.to_le_bytes());
2481 let mut cache = Vec::new();
2482 cache.extend_from_slice(&CACHE_MAGIC.to_le_bytes());
2483 cache.extend_from_slice(&INDEX_VERSION.to_le_bytes());
2484 cache.extend_from_slice(&(postings.len() as u64).to_le_bytes());
2485 cache.extend_from_slice(&postings);
2486 cache.extend_from_slice(&lookup);
2487 fs::write(cache_dir.join("cache.bin"), cache).expect("write cache");
2488
2489 assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2490 }
2491
2492 #[test]
2507 fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
2508 let dir = tempfile::tempdir().expect("create tempdir");
2512 let mut paths: Vec<PathBuf> = Vec::new();
2513 for i in 0..30 {
2514 let path = if i % 2 == 0 {
2516 let p = dir.path().join(format!("real-{i}.rs"));
2517 fs::write(&p, format!("// {i}\n")).expect("write");
2518 p
2519 } else {
2520 dir.path().join(format!("missing-{i}.rs"))
2521 };
2522 paths.push(path);
2523 }
2524
2525 for _ in 0..50 {
2528 let mut copy = paths.clone();
2529 sort_paths_by_mtime_desc(&mut copy);
2530 assert_eq!(copy.len(), paths.len());
2531 }
2532 }
2533
2534 #[test]
2538 fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
2539 let dir = tempfile::tempdir().expect("create tempdir");
2540 let mut matches: Vec<GrepMatch> = Vec::new();
2541 for i in 0..30 {
2542 let file = if i % 2 == 0 {
2543 let p = dir.path().join(format!("real-{i}.rs"));
2544 fs::write(&p, format!("// {i}\n")).expect("write");
2545 p
2546 } else {
2547 dir.path().join(format!("missing-{i}.rs"))
2548 };
2549 matches.push(GrepMatch {
2550 file,
2551 line: u32::try_from(i).unwrap_or(0),
2552 column: 0,
2553 line_text: format!("match {i}"),
2554 match_text: format!("match {i}"),
2555 });
2556 }
2557
2558 for _ in 0..50 {
2559 let mut copy = matches.clone();
2560 sort_grep_matches_by_mtime_desc(&mut copy, dir.path());
2561 assert_eq!(copy.len(), matches.len());
2562 }
2563 }
2564}