1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
/// Default upper bound, in bytes, on the size of a file whose content gets indexed (1 MiB).
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
/// Magic number written as the first `u32` of `cache.bin`.
const CACHE_MAGIC: u32 = 0x3144_4958;
/// Magic bytes that open the postings section of the cache file.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// Magic bytes that open the lookup section of the cache file.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// Format version stored in the cache header and in both sections; a mismatch rejects the cache.
const INDEX_VERSION: u32 = 2;
// NOTE(review): not referenced in the visible part of this file; presumably the number of
// leading bytes sampled when sniffing for binary content — confirm against the helpers.
const PREVIEW_BYTES: usize = 8 * 1024;
// NOTE(review): not referenced in the visible part of this file; presumably the "next byte"
// value used for a trigram that ends at end-of-file — confirm against `extract_trigrams`.
const EOF_SENTINEL: u8 = 0;
/// Sanity cap applied to counts read back from the cache (files, lookup entries, postings).
const MAX_ENTRIES: usize = 10_000_000;
/// Fixed-size part of one serialized file entry:
/// unindexed flag (1) + path length (4) + size (8) + mtime seconds (8) + mtime nanos (4).
const MIN_FILE_ENTRY_BYTES: usize = 25;
/// Size of one serialized lookup entry: trigram (4) + blob offset (8) + posting count (4).
const LOOKUP_ENTRY_BYTES: usize = 16;
/// Size of one serialized posting: file id (4) + next mask (1) + location mask (1).
const POSTING_BYTES: usize = 6;
29
/// Advisory, file-based lock guarding the search cache directory.
///
/// The lock is the existence of `cache.lock` inside the cache directory; it is
/// removed again when the guard is dropped. There is no stale-lock detection
/// here: if the file is left behind, later acquisitions time out.
pub struct CacheLock {
    path: PathBuf,
}

impl CacheLock {
    /// Creates `cache_dir` if needed and tries to create `cache.lock` exclusively.
    ///
    /// Up to 200 attempts are made, sleeping 10 ms after each attempt that finds
    /// the lock file already present. The holder's process id is written into the
    /// file on a best-effort basis.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the directory cannot be created or the
    /// lock file cannot be opened for a reason other than "already exists", and a
    /// generic error once all attempts are exhausted.
    pub fn acquire(cache_dir: &Path) -> std::io::Result<Self> {
        const ATTEMPTS: usize = 200;
        const RETRY_DELAY: Duration = Duration::from_millis(10);

        fs::create_dir_all(cache_dir)?;
        let lock_path = cache_dir.join("cache.lock");

        let mut options = fs::OpenOptions::new();
        options.write(true).create_new(true);

        let mut remaining = ATTEMPTS;
        while remaining > 0 {
            remaining -= 1;

            let mut handle = match options.open(&lock_path) {
                Ok(handle) => handle,
                Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => {
                    // Someone else holds the lock; back off and retry.
                    std::thread::sleep(RETRY_DELAY);
                    continue;
                }
                Err(err) => return Err(err),
            };

            // The pid is informational only, so write failures are ignored.
            let _ = writeln!(handle, "{}", std::process::id());
            let _ = handle.sync_all();
            return Ok(Self { path: lock_path });
        }

        Err(std::io::Error::other(
            "timed out acquiring search cache lock",
        ))
    }
}

impl Drop for CacheLock {
    /// Releases the lock by deleting the lock file; failures are ignored.
    fn drop(&mut self) {
        let _ = fs::remove_file(&self.path);
    }
}
66
/// In-memory trigram index over the files of one project.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// Trigram -> postings for every file containing it. `index_file` re-sorts a
    /// list by `file_id` whenever a push would leave it out of order.
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table addressed by file id. Removed files keep their slot, with an
    /// empty path marking the slot as dead.
    pub files: Vec<FileEntry>,
    /// Path -> file id for the files that are currently tracked.
    pub path_to_id: HashMap<PathBuf, u32>,
    /// Set once the index has been built, loaded or refreshed.
    pub ready: bool,
    /// Root directory the index was built for.
    project_root: PathBuf,
    /// Git HEAD recorded for this index, if any.
    git_head: Option<String>,
    /// Files larger than this many bytes are tracked but not content-indexed.
    max_file_size: u64,
    /// File id -> trigrams that have a posting for that file (used when removing a file).
    file_trigrams: HashMap<u32, Vec<u32>>,
    /// Ids of tracked files without postings; `candidates` always includes them
    /// whenever the query has trigrams to filter on.
    unindexed_files: HashSet<u32>,
}
79
impl SearchIndex {
    /// Returns the length of the file table.
    ///
    /// Slots left behind by `remove_file` (empty path) are included in this count.
    pub fn file_count(&self) -> usize {
        self.files.len()
    }

    /// Returns the number of distinct trigrams that currently have postings.
    pub fn trigram_count(&self) -> usize {
        self.postings.len()
    }
}
91
/// One occurrence record of a trigram in a file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Index of the file in `SearchIndex::files`.
    pub file_id: u32,
    /// OR of `mask_for_next_char` over every occurrence of the trigram in the file.
    pub next_mask: u8,
    /// OR of `mask_for_position` over every occurrence of the trigram in the file.
    pub loc_mask: u8,
}
98
/// Metadata kept for each slot of the file table.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Path of the file; an empty path marks a removed slot.
    pub path: PathBuf,
    /// File size in bytes as recorded when the entry was created.
    pub size: u64,
    /// Modification time as recorded when the entry was created
    /// (`UNIX_EPOCH` when it could not be read or the slot was removed).
    pub modified: SystemTime,
}
105
/// A single reported match; at most one is produced per line of a file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// File the match was found in.
    pub file: PathBuf,
    /// Line of the match as computed by `line_details_bytes`
    /// (NOTE(review): presumably 1-based — confirm against that helper).
    pub line: u32,
    /// Column of the match as computed by `line_details_bytes`
    /// (NOTE(review): base and unit not visible here — confirm).
    pub column: u32,
    /// Text of the line containing the match.
    pub line_text: String,
    /// Matched bytes, converted lossily to UTF-8.
    pub match_text: String,
}
114
/// Outcome of a grep over the index.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Matches that were kept (bounded by the caller's `max_results`).
    pub matches: Vec<GrepMatch>,
    /// Number of matching lines counted, including ones dropped after the limit was hit.
    pub total_matches: usize,
    /// Number of candidate files whose content was actually scanned.
    pub files_searched: usize,
    /// Number of scanned files with at least one match.
    pub files_with_matches: usize,
    /// State of the index at the time of the search.
    pub index_status: IndexStatus,
    /// `true` when more matches were found than `max_results` allowed.
    pub truncated: bool,
}
124
/// Readiness of the search index as reported alongside search results.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    Ready,
    Building,
    Fallback,
}

impl IndexStatus {
    /// Returns the variant name as a static string.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Ready => "Ready",
            Self::Building => "Building",
            Self::Fallback => "Fallback",
        }
    }
}
141
/// Trigram constraints derived from a search pattern, consumed by `SearchIndex::candidates`.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Trigrams that must all be present in a candidate file (posting lists are intersected).
    pub and_trigrams: Vec<u32>,
    /// Groups of alternatives: a candidate must contain at least one trigram of every group.
    pub or_groups: Vec<Vec<u32>>,
    /// Optional per-trigram posting filters for `and_trigrams`, keyed by trigram.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Optional per-trigram posting filters for `or_groups`; entry `i` belongs to group `i`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
149
/// Pair of bit masks attached to a trigram, either accumulated while indexing a
/// file or required by a query.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    /// Mask over the byte following the trigram; a query value of 0 disables the check.
    next_mask: u8,
    /// Mask over the trigram's position. Accumulated at index time; the visible
    /// lookup code (`postings_for_trigram`) does not consult it.
    loc_mask: u8,
}
155
// NOTE(review): not used in the visible part of this file; presumably the intermediate
// byte-run form produced while decomposing a regex, before conversion to `RegexQuery` —
// confirm against `decompose_regex`.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    /// Byte runs that must all be matched.
    and_runs: Vec<Vec<u8>>,
    /// Groups of alternative byte runs.
    or_groups: Vec<Vec<Vec<u8>>>,
}
161
/// Include/exclude glob sets applied to candidate paths.
///
/// The default value has neither set.
/// NOTE(review): presumably a missing set means "no restriction" — confirm
/// against `PathFilters::matches`, which is not visible here.
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    /// Globs a path has to match, if any were given.
    includes: Option<GlobSet>,
    /// Globs that reject a path, if any were given.
    excludes: Option<GlobSet>,
}
167
// NOTE(review): not used in the visible part of this file; field meanings below are
// inferred from the names only — confirm at the use sites.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    /// Directory the search is rooted at.
    pub root: PathBuf,
    /// Whether the index should be used for this scope.
    pub use_index: bool,
}
173
/// Internal form of a match produced while scanning a file.
///
/// Identical to `GrepMatch` except that all matches of one file share a single
/// `Arc<PathBuf>` instead of each owning a copy of the path.
#[derive(Clone, Debug)]
struct SharedGrepMatch {
    /// Shared path of the file the match was found in.
    file: Arc<PathBuf>,
    /// Line of the match, as computed by `line_details_bytes`.
    line: u32,
    /// Column of the match, as computed by `line_details_bytes`.
    column: u32,
    /// Text of the line containing the match.
    line_text: String,
    /// Matched bytes, converted lossily to UTF-8.
    match_text: String,
}
182
/// Strategy used to scan file contents for a pattern.
#[derive(Clone, Debug)]
enum SearchMatcher {
    /// Plain substring search, used when the pattern has no regex metacharacters.
    Literal(LiteralSearch),
    /// Compiled byte-oriented regular expression.
    Regex(Regex),
}
188
/// Needle for a plain substring search.
#[derive(Clone, Debug)]
enum LiteralSearch {
    /// Needle bytes compared exactly against the file content.
    CaseSensitive(Vec<u8>),
    /// ASCII-lowercased needle, compared against an ASCII-lowercased copy of the content.
    AsciiCaseInsensitive(Vec<u8>),
}
194
195impl SearchIndex {
196 pub fn new() -> Self {
197 SearchIndex {
198 postings: HashMap::new(),
199 files: Vec::new(),
200 path_to_id: HashMap::new(),
201 ready: false,
202 project_root: PathBuf::new(),
203 git_head: None,
204 max_file_size: DEFAULT_MAX_FILE_SIZE,
205 file_trigrams: HashMap::new(),
206 unindexed_files: HashSet::new(),
207 }
208 }
209
    /// Builds an index for `root` using `DEFAULT_MAX_FILE_SIZE` as the per-file size limit.
    pub fn build(root: &Path) -> Self {
        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
    }
213
214 pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
215 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
216 let mut index = SearchIndex {
217 project_root: project_root.clone(),
218 max_file_size,
219 ..SearchIndex::new()
220 };
221
222 let filters = PathFilters::default();
223 for path in walk_project_files(&project_root, &filters) {
224 index.update_file(&path);
225 }
226
227 index.git_head = current_git_head(&project_root);
228 index.ready = true;
229 index
230 }
231
232 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
233 self.remove_file(path);
234
235 let file_id = match self.allocate_file_id(path, content.len() as u64) {
236 Some(file_id) => file_id,
237 None => return,
238 };
239
240 let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
241 for (trigram, next_char, position) in extract_trigrams(content) {
242 let entry = trigram_map.entry(trigram).or_default();
243 entry.next_mask |= mask_for_next_char(next_char);
244 entry.loc_mask |= mask_for_position(position);
245 }
246
247 let mut file_trigrams = Vec::with_capacity(trigram_map.len());
248 for (trigram, filter) in trigram_map {
249 let postings = self.postings.entry(trigram).or_default();
250 postings.push(Posting {
251 file_id,
252 next_mask: filter.next_mask,
253 loc_mask: filter.loc_mask,
254 });
255 if postings.len() > 1
259 && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
260 {
261 postings.sort_unstable_by_key(|p| p.file_id);
262 }
263 file_trigrams.push(trigram);
264 }
265
266 self.file_trigrams.insert(file_id, file_trigrams);
267 self.unindexed_files.remove(&file_id);
268 }
269
270 pub fn remove_file(&mut self, path: &Path) {
271 let Some(file_id) = self.path_to_id.remove(path) else {
272 return;
273 };
274
275 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
276 for trigram in trigrams {
277 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
278 postings.retain(|posting| posting.file_id != file_id);
279 postings.is_empty()
280 } else {
281 false
282 };
283
284 if should_remove {
285 self.postings.remove(&trigram);
286 }
287 }
288 }
289
290 self.unindexed_files.remove(&file_id);
291 if let Some(file) = self.files.get_mut(file_id as usize) {
292 file.path = PathBuf::new();
293 file.size = 0;
294 file.modified = UNIX_EPOCH;
295 }
296 }
297
298 pub fn update_file(&mut self, path: &Path) {
299 self.remove_file(path);
300
301 let metadata = match fs::metadata(path) {
302 Ok(metadata) if metadata.is_file() => metadata,
303 _ => return,
304 };
305
306 if is_binary_path(path, metadata.len()) {
307 self.track_unindexed_file(path, &metadata);
308 return;
309 }
310
311 if metadata.len() > self.max_file_size {
312 self.track_unindexed_file(path, &metadata);
313 return;
314 }
315
316 let content = match fs::read(path) {
317 Ok(content) => content,
318 Err(_) => return,
319 };
320
321 if is_binary_bytes(&content) {
322 self.track_unindexed_file(path, &metadata);
323 return;
324 }
325
326 self.index_file(path, &content);
327 }
328
    /// Searches the indexed files for `pattern`.
    ///
    /// Thin alias for [`SearchIndex::search_grep`]; all arguments are forwarded unchanged.
    pub fn grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        self.search_grep(
            pattern,
            case_sensitive,
            include,
            exclude,
            search_root,
            max_results,
        )
    }
347
348 pub fn search_grep(
349 &self,
350 pattern: &str,
351 case_sensitive: bool,
352 include: &[String],
353 exclude: &[String],
354 search_root: &Path,
355 max_results: usize,
356 ) -> GrepResult {
357 let is_literal = !pattern.chars().any(|c| {
360 matches!(
361 c,
362 '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
363 )
364 });
365
366 let literal_search = if is_literal {
367 if case_sensitive {
368 Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
369 } else if pattern.is_ascii() {
370 Some(LiteralSearch::AsciiCaseInsensitive(
371 pattern
372 .as_bytes()
373 .iter()
374 .map(|byte| byte.to_ascii_lowercase())
375 .collect(),
376 ))
377 } else {
378 None
379 }
380 } else {
381 None
382 };
383
384 let regex = if literal_search.is_some() {
386 None
387 } else {
388 let regex_pattern = if is_literal {
389 regex::escape(pattern)
390 } else {
391 pattern.to_string()
392 };
393 let mut builder = RegexBuilder::new(®ex_pattern);
394 builder.case_insensitive(!case_sensitive);
395 builder.multi_line(true);
397 match builder.build() {
398 Ok(r) => Some(r),
399 Err(_) => {
400 return GrepResult {
401 matches: Vec::new(),
402 total_matches: 0,
403 files_searched: 0,
404 files_with_matches: 0,
405 index_status: if self.ready {
406 IndexStatus::Ready
407 } else {
408 IndexStatus::Building
409 },
410 truncated: false,
411 };
412 }
413 }
414 };
415
416 let matcher = if let Some(literal_search) = literal_search {
417 SearchMatcher::Literal(literal_search)
418 } else {
419 SearchMatcher::Regex(
420 regex.expect("regex should exist when literal matcher is unavailable"),
421 )
422 };
423
424 let filters = match build_path_filters(include, exclude) {
425 Ok(filters) => filters,
426 Err(_) => PathFilters::default(),
427 };
428 let search_root = canonicalize_or_normalize(search_root);
429
430 let query = decompose_regex(pattern);
431 let candidate_ids = self.candidates(&query);
432
433 let candidate_files: Vec<&FileEntry> = candidate_ids
434 .into_iter()
435 .filter_map(|file_id| self.files.get(file_id as usize))
436 .filter(|file| !file.path.as_os_str().is_empty())
437 .filter(|file| is_within_search_root(&search_root, &file.path))
438 .filter(|file| filters.matches(&self.project_root, &file.path))
439 .collect();
440
441 let total_matches = AtomicUsize::new(0);
442 let files_searched = AtomicUsize::new(0);
443 let files_with_matches = AtomicUsize::new(0);
444 let truncated = AtomicBool::new(false);
445 let stop_after = max_results.saturating_mul(2);
446
447 let mut matches = if candidate_files.len() > 10 {
448 candidate_files
449 .par_iter()
450 .map(|file| {
451 search_candidate_file(
452 file,
453 &matcher,
454 max_results,
455 stop_after,
456 &total_matches,
457 &files_searched,
458 &files_with_matches,
459 &truncated,
460 )
461 })
462 .reduce(Vec::new, |mut left, mut right| {
463 left.append(&mut right);
464 left
465 })
466 } else {
467 let mut matches = Vec::new();
468 for file in candidate_files {
469 matches.extend(search_candidate_file(
470 file,
471 &matcher,
472 max_results,
473 stop_after,
474 &total_matches,
475 &files_searched,
476 &files_with_matches,
477 &truncated,
478 ));
479
480 if should_stop_search(&truncated, &total_matches, stop_after) {
481 break;
482 }
483 }
484 matches
485 };
486
487 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
488 self.path_to_id
489 .get(path)
490 .and_then(|file_id| self.files.get(*file_id as usize))
491 .map(|file| file.modified)
492 });
493
494 let matches = matches
495 .into_iter()
496 .map(|matched| GrepMatch {
497 file: matched.file.as_ref().clone(),
498 line: matched.line,
499 column: matched.column,
500 line_text: matched.line_text,
501 match_text: matched.match_text,
502 })
503 .collect();
504
505 GrepResult {
506 total_matches: total_matches.load(Ordering::Relaxed),
507 matches,
508 files_searched: files_searched.load(Ordering::Relaxed),
509 files_with_matches: files_with_matches.load(Ordering::Relaxed),
510 index_status: if self.ready {
511 IndexStatus::Ready
512 } else {
513 IndexStatus::Building
514 },
515 truncated: truncated.load(Ordering::Relaxed),
516 }
517 }
518
519 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
520 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
521 Ok(filters) => filters,
522 Err(_) => return Vec::new(),
523 };
524 let search_root = canonicalize_or_normalize(search_root);
525 let mut entries = self
526 .files
527 .iter()
528 .filter(|file| !file.path.as_os_str().is_empty())
529 .filter(|file| is_within_search_root(&search_root, &file.path))
530 .filter(|file| filters.matches(&self.project_root, &file.path))
531 .map(|file| (file.path.clone(), file.modified))
532 .collect::<Vec<_>>();
533
534 entries.sort_by(|(left_path, left_mtime), (right_path, right_mtime)| {
535 right_mtime
536 .cmp(left_mtime)
537 .then_with(|| left_path.cmp(right_path))
538 });
539
540 entries.into_iter().map(|(path, _)| path).collect()
541 }
542
543 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
544 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
545 return self.active_file_ids();
546 }
547
548 let mut and_trigrams = query.and_trigrams.clone();
549 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
550
551 let mut current: Option<Vec<u32>> = None;
552
553 for trigram in and_trigrams {
554 let filter = query.and_filters.get(&trigram).copied();
555 let matches = self.postings_for_trigram(trigram, filter);
556 current = Some(match current.take() {
557 Some(existing) => intersect_sorted_ids(&existing, &matches),
558 None => matches,
559 });
560
561 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
562 break;
563 }
564 }
565
566 let mut current = current.unwrap_or_else(|| self.active_file_ids());
567
568 for (index, group) in query.or_groups.iter().enumerate() {
569 let mut group_matches = Vec::new();
570 let filters = query.or_filters.get(index);
571
572 for trigram in group {
573 let filter = filters.and_then(|filters| filters.get(trigram).copied());
574 let matches = self.postings_for_trigram(*trigram, filter);
575 if group_matches.is_empty() {
576 group_matches = matches;
577 } else {
578 group_matches = union_sorted_ids(&group_matches, &matches);
579 }
580 }
581
582 current = intersect_sorted_ids(¤t, &group_matches);
583 if current.is_empty() {
584 break;
585 }
586 }
587
588 let mut unindexed = self
589 .unindexed_files
590 .iter()
591 .copied()
592 .filter(|file_id| self.is_active_file(*file_id))
593 .collect::<Vec<_>>();
594 if !unindexed.is_empty() {
595 unindexed.sort_unstable();
596 current = union_sorted_ids(¤t, &unindexed);
597 }
598
599 current
600 }
601
    /// Serializes the index to `cache_dir/cache.bin`, best effort.
    ///
    /// The file is written to a uniquely named temporary file and renamed into
    /// place; on any failure the temporary file is removed and the error is
    /// swallowed. Only active files are written, renumbered densely in ascending
    /// order of their current ids.
    ///
    /// File layout, in write order:
    ///
    /// * header: `CACHE_MAGIC`, `INDEX_VERSION`, length of the postings section (`u64`);
    /// * postings section: `INDEX_MAGIC`, `INDEX_VERSION`, git-head length, root
    ///   length, `max_file_size`, file count, git head, project root, one record
    ///   per file (unindexed flag, path length, size, mtime seconds, mtime nanos,
    ///   root-relative path), postings blob length, postings blob, CRC32 of the
    ///   section so far;
    /// * lookup section: `LOOKUP_MAGIC`, `INDEX_VERSION`, entry count, one entry
    ///   per trigram (trigram, byte offset into the blob, posting count), CRC32
    ///   of the section so far.
    ///
    /// `git_head` is the head to record; `None` is stored as an empty string.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let cache_path = cache_dir.join("cache.bin");
        // Process id plus a nanosecond timestamp keeps concurrent writers apart.
        let tmp_cache = cache_dir.join(format!(
            "cache.bin.tmp.{}.{}",
            std::process::id(),
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap_or(Duration::ZERO)
                .as_nanos()
        ));

        // Map the sparse in-memory ids onto dense ids 0..n for the cache.
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            // The postings section is assembled in memory so it can be checksummed.
            let mut postings_writer = BufWriter::new(Cursor::new(Vec::new()));

            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;

            // File records, in dense-id order.
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                // Paths are stored relative to the project root (lossily, as UTF-8).
                let path = relative_to_root(&self.project_root, &file.path);
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                // Times before the epoch are clamped to the epoch.
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Postings blob plus the lookup table pointing into it, by ascending trigram.
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Postings of files that are no longer active are dropped.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                // Trigrams left without any posting get no lookup entry.
                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            let mut postings_blob_file = postings_writer
                .into_inner()
                .map_err(|error| std::io::Error::other(error.to_string()))?
                .into_inner();
            // Trailing CRC32 over everything written to the section so far.
            let checksum = crc32fast::hash(&postings_blob_file);
            postings_blob_file.extend_from_slice(&checksum.to_le_bytes());

            let mut lookup_writer = BufWriter::new(Cursor::new(Vec::new()));
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            let mut lookup_blob_file = lookup_writer
                .into_inner()
                .map_err(|error| std::io::Error::other(error.to_string()))?
                .into_inner();
            let checksum = crc32fast::hash(&lookup_blob_file);
            lookup_blob_file.extend_from_slice(&checksum.to_le_bytes());

            // Header, then both sections; the lookup section runs to end of file.
            let mut cache_writer = BufWriter::new(File::create(&tmp_cache)?);
            write_u32(&mut cache_writer, CACHE_MAGIC)?;
            write_u32(&mut cache_writer, INDEX_VERSION)?;
            write_u64(
                &mut cache_writer,
                u64::try_from(postings_blob_file.len())
                    .map_err(|_| std::io::Error::other("postings section too large"))?,
            )?;
            cache_writer.write_all(&postings_blob_file)?;
            cache_writer.write_all(&lookup_blob_file)?;
            cache_writer.flush()?;
            cache_writer.get_ref().sync_all()?;
            drop(cache_writer);
            // Publish the new cache only after it has been fully written and synced.
            fs::rename(&tmp_cache, &cache_path)?;

            Ok(())
        })();

        if write_result.is_err() {
            let _ = fs::remove_file(&tmp_cache);
        }
    }
758
759 pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
760 let cache_path = cache_dir.join("cache.bin");
761 let cache_bytes = fs::read(&cache_path).ok()?;
762 if cache_bytes.len() < 16 {
763 return None;
764 }
765 let mut header = Cursor::new(&cache_bytes);
766 if read_u32(&mut header).ok()? != CACHE_MAGIC {
767 return None;
768 }
769 if read_u32(&mut header).ok()? != INDEX_VERSION {
770 return None;
771 }
772 let postings_len_total = usize::try_from(read_u64(&mut header).ok()?).ok()?;
773 let start = usize::try_from(header.position()).ok()?;
774 let postings_end = start.checked_add(postings_len_total)?;
775 if postings_end > cache_bytes.len() {
776 return None;
777 }
778 let postings_bytes = &cache_bytes[start..postings_end];
779 let lookup_bytes = &cache_bytes[postings_end..];
780 let lookup_len_total = lookup_bytes.len();
781 let mut postings_reader = BufReader::new(Cursor::new(postings_bytes));
782 let mut lookup_reader = BufReader::new(Cursor::new(lookup_bytes));
783 if postings_len_total < 4 || lookup_len_total < 4 {
784 return None;
785 }
786 verify_crc32_bytes_slice(postings_bytes).ok()?;
787 verify_crc32_bytes_slice(lookup_bytes).ok()?;
788
789 let mut magic = [0u8; 8];
790 postings_reader.read_exact(&mut magic).ok()?;
791 if &magic != INDEX_MAGIC {
792 return None;
793 }
794 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
795 return None;
796 }
797
798 let head_len = read_u32(&mut postings_reader).ok()? as usize;
799 let root_len = read_u32(&mut postings_reader).ok()? as usize;
800 let max_file_size = read_u64(&mut postings_reader).ok()?;
801 let file_count = read_u32(&mut postings_reader).ok()? as usize;
802 if file_count > MAX_ENTRIES {
803 return None;
804 }
805 let postings_body_len = postings_len_total.checked_sub(4)?;
806 let lookup_body_len = lookup_len_total.checked_sub(4)?;
807
808 let remaining_postings = remaining_bytes(&mut postings_reader, postings_body_len)?;
809 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
810 if minimum_file_bytes > remaining_postings {
811 return None;
812 }
813
814 if head_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
815 return None;
816 }
817 let mut head_bytes = vec![0u8; head_len];
818 postings_reader.read_exact(&mut head_bytes).ok()?;
819 let git_head = String::from_utf8(head_bytes)
820 .ok()
821 .filter(|head| !head.is_empty());
822
823 if root_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
824 return None;
825 }
826 let mut root_bytes = vec![0u8; root_len];
827 postings_reader.read_exact(&mut root_bytes).ok()?;
828 let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
829
830 let mut files = Vec::with_capacity(file_count);
831 let mut path_to_id = HashMap::new();
832 let mut unindexed_files = HashSet::new();
833
834 for file_id in 0..file_count {
835 let mut unindexed = [0u8; 1];
836 postings_reader.read_exact(&mut unindexed).ok()?;
837 let path_len = read_u32(&mut postings_reader).ok()? as usize;
838 let size = read_u64(&mut postings_reader).ok()?;
839 let secs = read_u64(&mut postings_reader).ok()?;
840 let nanos = read_u32(&mut postings_reader).ok()?;
841 if nanos >= 1_000_000_000 {
842 return None;
843 }
844 if path_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
845 return None;
846 }
847 let mut path_bytes = vec![0u8; path_len];
848 postings_reader.read_exact(&mut path_bytes).ok()?;
849 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
850 let full_path = project_root.join(relative_path);
851 let file_id_u32 = u32::try_from(file_id).ok()?;
852
853 files.push(FileEntry {
854 path: full_path.clone(),
855 size,
856 modified: UNIX_EPOCH + Duration::new(secs, nanos),
857 });
858 path_to_id.insert(full_path, file_id_u32);
859 if unindexed[0] == 1 {
860 unindexed_files.insert(file_id_u32);
861 }
862 }
863
864 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
865 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
866 if postings_len > max_postings_bytes {
867 return None;
868 }
869 if postings_len > remaining_bytes(&mut postings_reader, postings_body_len)? {
870 return None;
871 }
872 let mut postings_blob = vec![0u8; postings_len];
873 postings_reader.read_exact(&mut postings_blob).ok()?;
874
875 let mut lookup_magic = [0u8; 8];
876 lookup_reader.read_exact(&mut lookup_magic).ok()?;
877 if &lookup_magic != LOOKUP_MAGIC {
878 return None;
879 }
880 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
881 return None;
882 }
883 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
884 if entry_count > MAX_ENTRIES {
885 return None;
886 }
887 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_body_len)?;
888 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
889 if minimum_lookup_bytes > remaining_lookup {
890 return None;
891 }
892
893 let mut postings = HashMap::new();
894 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
895
896 for _ in 0..entry_count {
897 let trigram = read_u32(&mut lookup_reader).ok()?;
898 let offset = read_u64(&mut lookup_reader).ok()? as usize;
899 let count = read_u32(&mut lookup_reader).ok()? as usize;
900 if count > MAX_ENTRIES {
901 return None;
902 }
903 let bytes_len = count.checked_mul(POSTING_BYTES)?;
904 let end = offset.checked_add(bytes_len)?;
905 if end > postings_blob.len() {
906 return None;
907 }
908
909 let mut trigram_postings = Vec::with_capacity(count);
910 for chunk in postings_blob[offset..end].chunks_exact(6) {
911 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
912 let posting = Posting {
913 file_id,
914 next_mask: chunk[4],
915 loc_mask: chunk[5],
916 };
917 trigram_postings.push(posting.clone());
918 file_trigrams.entry(file_id).or_default().push(trigram);
919 }
920 postings.insert(trigram, trigram_postings);
921 }
922
923 Some(SearchIndex {
924 postings,
925 files,
926 path_to_id,
927 ready: true,
928 project_root,
929 git_head,
930 max_file_size,
931 file_trigrams,
932 unindexed_files,
933 })
934 }
935
    /// Returns the git HEAD recorded in this index, if one is stored.
    pub(crate) fn stored_git_head(&self) -> Option<&str> {
        self.git_head.as_deref()
    }
939
    /// Overrides the readiness flag that search results report as `IndexStatus`.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
943
944 pub(crate) fn rebuild_or_refresh(
945 root: &Path,
946 max_file_size: u64,
947 current_head: Option<String>,
948 baseline: Option<SearchIndex>,
949 ) -> Self {
950 if let Some(mut baseline) = baseline {
951 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
952 baseline.max_file_size = max_file_size;
953
954 if baseline.git_head == current_head || current_head.is_none() {
955 baseline.git_head = current_head;
962 verify_file_mtimes(&mut baseline);
963 baseline.ready = true;
964 return baseline;
965 }
966
967 if let (Some(previous), Some(current)) =
968 (baseline.git_head.clone(), current_head.clone())
969 {
970 let project_root = baseline.project_root.clone();
971 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
972 baseline.git_head = Some(current);
973 baseline.ready = true;
974 return baseline;
975 }
976 }
977 }
978
979 SearchIndex::build_with_limit(root, max_file_size)
980 }
981
982 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
983 let file_id = u32::try_from(self.files.len()).ok()?;
984 let metadata = fs::metadata(path).ok();
985 let size = metadata
986 .as_ref()
987 .map_or(size_hint, |metadata| metadata.len());
988 let modified = metadata
989 .and_then(|metadata| metadata.modified().ok())
990 .unwrap_or(UNIX_EPOCH);
991
992 self.files.push(FileEntry {
993 path: path.to_path_buf(),
994 size,
995 modified,
996 });
997 self.path_to_id.insert(path.to_path_buf(), file_id);
998 Some(file_id)
999 }
1000
1001 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
1002 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
1003 return;
1004 };
1005 self.unindexed_files.insert(file_id);
1006 self.file_trigrams.insert(file_id, Vec::new());
1007 }
1008
1009 fn active_file_ids(&self) -> Vec<u32> {
1010 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
1011 ids.sort_unstable();
1012 ids
1013 }
1014
1015 fn is_active_file(&self, file_id: u32) -> bool {
1016 self.files
1017 .get(file_id as usize)
1018 .map(|file| !file.path.as_os_str().is_empty())
1019 .unwrap_or(false)
1020 }
1021
1022 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
1023 let Some(postings) = self.postings.get(&trigram) else {
1024 return Vec::new();
1025 };
1026
1027 let mut matches = Vec::with_capacity(postings.len());
1028
1029 for posting in postings {
1030 if let Some(filter) = filter {
1031 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
1034 continue;
1035 }
1036 }
1041 if self.is_active_file(posting.file_id) {
1042 matches.push(posting.file_id);
1043 }
1044 }
1045
1046 matches
1047 }
1048}
1049
1050fn search_candidate_file(
1051 file: &FileEntry,
1052 matcher: &SearchMatcher,
1053 max_results: usize,
1054 stop_after: usize,
1055 total_matches: &AtomicUsize,
1056 files_searched: &AtomicUsize,
1057 files_with_matches: &AtomicUsize,
1058 truncated: &AtomicBool,
1059) -> Vec<SharedGrepMatch> {
1060 if should_stop_search(truncated, total_matches, stop_after) {
1061 return Vec::new();
1062 }
1063
1064 let content = match read_indexed_file_bytes(&file.path) {
1065 Some(content) => content,
1066 None => return Vec::new(),
1067 };
1068 if is_binary_bytes(&content) {
1075 return Vec::new();
1076 }
1077 files_searched.fetch_add(1, Ordering::Relaxed);
1078
1079 let shared_path = Arc::new(file.path.clone());
1080 let mut matches = Vec::new();
1081 let mut line_starts = None;
1082 let mut seen_lines = HashSet::new();
1083 let mut matched_this_file = false;
1084
1085 match matcher {
1086 SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
1087 let finder = memchr::memmem::Finder::new(needle);
1088 let mut start = 0;
1089
1090 while let Some(position) = finder.find(&content[start..]) {
1091 if should_stop_search(truncated, total_matches, stop_after) {
1092 break;
1093 }
1094
1095 let offset = start + position;
1096 start = offset + 1;
1097
1098 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1099 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1100 if !seen_lines.insert(line) {
1101 continue;
1102 }
1103
1104 matched_this_file = true;
1105 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1106 if match_number > max_results {
1107 truncated.store(true, Ordering::Relaxed);
1108 break;
1109 }
1110
1111 let end = offset + needle.len();
1112 matches.push(SharedGrepMatch {
1113 file: shared_path.clone(),
1114 line,
1115 column,
1116 line_text,
1117 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1118 });
1119 }
1120 }
1121 SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1122 let search_content = content.to_ascii_lowercase();
1123 let finder = memchr::memmem::Finder::new(needle);
1124 let mut start = 0;
1125
1126 while let Some(position) = finder.find(&search_content[start..]) {
1127 if should_stop_search(truncated, total_matches, stop_after) {
1128 break;
1129 }
1130
1131 let offset = start + position;
1132 start = offset + 1;
1133
1134 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1135 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1136 if !seen_lines.insert(line) {
1137 continue;
1138 }
1139
1140 matched_this_file = true;
1141 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1142 if match_number > max_results {
1143 truncated.store(true, Ordering::Relaxed);
1144 break;
1145 }
1146
1147 let end = offset + needle.len();
1148 matches.push(SharedGrepMatch {
1149 file: shared_path.clone(),
1150 line,
1151 column,
1152 line_text,
1153 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1154 });
1155 }
1156 }
1157 SearchMatcher::Regex(regex) => {
1158 for matched in regex.find_iter(&content) {
1159 if should_stop_search(truncated, total_matches, stop_after) {
1160 break;
1161 }
1162
1163 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1164 let (line, column, line_text) =
1165 line_details_bytes(&content, line_starts, matched.start());
1166 if !seen_lines.insert(line) {
1167 continue;
1168 }
1169
1170 matched_this_file = true;
1171 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1172 if match_number > max_results {
1173 truncated.store(true, Ordering::Relaxed);
1174 break;
1175 }
1176
1177 matches.push(SharedGrepMatch {
1178 file: shared_path.clone(),
1179 line,
1180 column,
1181 line_text,
1182 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1183 });
1184 }
1185 }
1186 }
1187
1188 if matched_this_file {
1189 files_with_matches.fetch_add(1, Ordering::Relaxed);
1190 }
1191
1192 matches
1193}
1194
/// Reports whether scanning should stop early.
///
/// Returns `true` only when the shared `truncated` flag is set *and* the
/// shared `total_matches` counter has reached `stop_after`. Both atomics are
/// read with relaxed ordering.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    // The counter is only consulted once truncation has been flagged.
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1202
/// Returns the ids present in both `left` and `right`.
///
/// The two-cursor merge walk relies on both inputs being in ascending order;
/// the result is then ascending as well.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let (mut l, mut r) = (0, 0);

    while let (Some(&a), Some(&b)) = (left.get(l), right.get(r)) {
        if a < b {
            l += 1;
        } else if a > b {
            r += 1;
        } else {
            common.push(a);
            l += 1;
            r += 1;
        }
    }

    common
}
1222
/// Merges two ascending id lists into one ascending list.
///
/// An id that sits at the head of both inputs at the same time is emitted
/// once. Whatever remains of either input after the other is exhausted is
/// appended unchanged.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut combined = Vec::with_capacity(left.len() + right.len());
    let (mut l, mut r) = (0, 0);

    while let (Some(&a), Some(&b)) = (left.get(l), right.get(r)) {
        // Emit the smaller head; on a tie both cursors advance so the shared
        // id appears only once.
        combined.push(a.min(b));
        if a <= b {
            l += 1;
        }
        if b <= a {
            r += 1;
        }
    }

    combined.extend_from_slice(&left[l..]);
    combined.extend_from_slice(&right[r..]);
    combined
}
1250
1251pub fn decompose_regex(pattern: &str) -> RegexQuery {
1252 let hir = match regex_syntax::parse(pattern) {
1253 Ok(hir) => hir,
1254 Err(_) => return RegexQuery::default(),
1255 };
1256
1257 let build = build_query(&hir);
1258 build.into_query()
1259}
1260
/// Packs three bytes into the low 24 bits of a `u32`.
///
/// `a` lands in bits 16..24, `b` in bits 8..16 and `c` in bits 0..8; the top
/// byte is always zero.
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1264
/// Folds an ASCII uppercase byte to lowercase; every other byte is returned
/// unchanged.
pub fn normalize_char(c: u8) -> u8 {
    u8::to_ascii_lowercase(&c)
}
1268
1269pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1270 if content.len() < 3 {
1271 return Vec::new();
1272 }
1273
1274 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1275 for start in 0..=content.len() - 3 {
1276 let trigram = pack_trigram(
1277 normalize_char(content[start]),
1278 normalize_char(content[start + 1]),
1279 normalize_char(content[start + 2]),
1280 );
1281 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1282 trigrams.push((trigram, next_char, start));
1283 }
1284 trigrams
1285}
1286
1287pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1288 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1290 return PathBuf::from(override_dir)
1291 .join("index")
1292 .join(project_cache_key(project_root));
1293 }
1294 if let Some(dir) = storage_dir {
1296 return dir.join("index").join(project_cache_key(project_root));
1297 }
1298 let home = std::env::var_os("HOME")
1303 .or_else(|| std::env::var_os("USERPROFILE"))
1304 .map(PathBuf::from)
1305 .unwrap_or_else(std::env::temp_dir);
1306 home.join(".cache")
1307 .join("aft")
1308 .join("index")
1309 .join(project_cache_key(project_root))
1310}
1311
1312pub(crate) fn build_path_filters(
1313 include: &[String],
1314 exclude: &[String],
1315) -> Result<PathFilters, String> {
1316 Ok(PathFilters {
1317 includes: build_globset(include)?,
1318 excludes: build_globset(exclude)?,
1319 })
1320}
1321
/// Lists the files under `root` that pass `filters`.
///
/// Delegates to `walk_project_files_from` with `root` serving both as the
/// base the filters are evaluated against and as the directory to walk.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
1325
1326pub(crate) fn walk_project_files_from(
1327 filter_root: &Path,
1328 search_root: &Path,
1329 filters: &PathFilters,
1330) -> Vec<PathBuf> {
1331 let mut builder = WalkBuilder::new(search_root);
1332 builder
1333 .hidden(false)
1334 .git_ignore(true)
1335 .git_global(true)
1336 .git_exclude(true)
1337 .filter_entry(|entry| {
1338 let name = entry.file_name().to_string_lossy();
1339 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1340 return !matches!(
1341 name.as_ref(),
1342 "node_modules"
1343 | "target"
1344 | "venv"
1345 | ".venv"
1346 | ".git"
1347 | "__pycache__"
1348 | ".tox"
1349 | "dist"
1350 | "build"
1351 );
1352 }
1353 true
1354 });
1355
1356 let mut files = Vec::new();
1357 for entry in builder.build().filter_map(|entry| entry.ok()) {
1358 if !entry
1359 .file_type()
1360 .map_or(false, |file_type| file_type.is_file())
1361 {
1362 continue;
1363 }
1364 let path = entry.into_path();
1365 if filters.matches(filter_root, &path) {
1366 files.push(path);
1367 }
1368 }
1369
1370 sort_paths_by_mtime_desc(&mut files);
1371 files
1372}
1373
1374pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1375 let bytes = fs::read(path).ok()?;
1376 if is_binary_bytes(&bytes) {
1377 return None;
1378 }
1379 String::from_utf8(bytes).ok()
1380}
1381
/// Reads the whole file at `path`, mapping any I/O error to `None`.
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    match fs::read(path) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
1385
/// Returns `path` with the `root` prefix removed.
///
/// When `path` does not start with `root`, it is returned unchanged (as an
/// owned copy).
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    path.strip_prefix(root).unwrap_or(path).to_path_buf()
}
1391
1392pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1405 use std::collections::HashMap;
1406 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(paths.len());
1407 for path in paths.iter() {
1408 mtimes
1409 .entry(path.clone())
1410 .or_insert_with(|| path_modified_time(path));
1411 }
1412 paths.sort_by(|left, right| {
1413 let left_mtime = mtimes.get(left).and_then(|v| *v);
1414 let right_mtime = mtimes.get(right).and_then(|v| *v);
1415 right_mtime.cmp(&left_mtime).then_with(|| left.cmp(right))
1416 });
1417}
1418
1419pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1422 use std::collections::HashMap;
1423 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::new();
1424 for m in matches.iter() {
1425 mtimes.entry(m.file.clone()).or_insert_with(|| {
1426 let resolved = resolve_match_path(project_root, &m.file);
1427 path_modified_time(&resolved)
1428 });
1429 }
1430 matches.sort_by(|left, right| {
1431 let left_mtime = mtimes.get(&left.file).and_then(|v| *v);
1432 let right_mtime = mtimes.get(&right.file).and_then(|v| *v);
1433 right_mtime
1434 .cmp(&left_mtime)
1435 .then_with(|| left.file.cmp(&right.file))
1436 .then_with(|| left.line.cmp(&right.line))
1437 .then_with(|| left.column.cmp(&right.column))
1438 });
1439}
1440
1441fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
1446 matches: &mut [SharedGrepMatch],
1447 modified_for_path: F,
1448) where
1449 F: Fn(&Path) -> Option<SystemTime>,
1450{
1451 use std::collections::HashMap;
1452 let mut mtimes: HashMap<PathBuf, Option<SystemTime>> = HashMap::with_capacity(matches.len());
1453 for m in matches.iter() {
1454 let path = m.file.as_path().to_path_buf();
1455 mtimes
1456 .entry(path.clone())
1457 .or_insert_with(|| modified_for_path(&path));
1458 }
1459 matches.sort_by(|left, right| {
1460 let left_mtime = mtimes.get(left.file.as_path()).and_then(|v| *v);
1461 let right_mtime = mtimes.get(right.file.as_path()).and_then(|v| *v);
1462 right_mtime
1463 .cmp(&left_mtime)
1464 .then_with(|| left.file.as_path().cmp(right.file.as_path()))
1465 .then_with(|| left.line.cmp(&right.line))
1466 .then_with(|| left.column.cmp(&right.column))
1467 });
1468}
1469
1470pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1471 let resolved_project_root = canonicalize_or_normalize(project_root);
1472 let root = match path {
1473 Some(path) => {
1474 let path = PathBuf::from(path);
1475 if path.is_absolute() {
1476 canonicalize_or_normalize(&path)
1477 } else {
1478 normalize_path(&resolved_project_root.join(path))
1479 }
1480 }
1481 None => resolved_project_root.clone(),
1482 };
1483
1484 let use_index = is_within_search_root(&resolved_project_root, &root);
1485 SearchScope { root, use_index }
1486}
1487
/// Returns `true` when `content_inspector` classifies `content` as binary.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
1491
/// Returns the trimmed output of `git rev-parse HEAD` run in `root`.
///
/// Yields `None` whenever `run_git` does: git could not be run, exited
/// unsuccessfully, or printed nothing usable.
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
1495
1496pub fn project_cache_key(project_root: &Path) -> String {
1497 use sha2::{Digest, Sha256};
1498
1499 let mut hasher = Sha256::new();
1500
1501 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1502 hasher.update(root_commit.as_bytes());
1505 } else {
1506 let canonical_root = canonicalize_or_normalize(project_root);
1508 hasher.update(canonical_root.to_string_lossy().as_bytes());
1509 }
1510
1511 let digest = format!("{:x}", hasher.finalize());
1512 digest[..16].to_string()
1513}
1514
1515impl PathFilters {
1516 fn matches(&self, root: &Path, path: &Path) -> bool {
1517 let relative = to_glob_path(&relative_to_root(root, path));
1518 if self
1519 .includes
1520 .as_ref()
1521 .is_some_and(|includes| !includes.is_match(&relative))
1522 {
1523 return false;
1524 }
1525 if self
1526 .excludes
1527 .as_ref()
1528 .is_some_and(|excludes| excludes.is_match(&relative))
1529 {
1530 return false;
1531 }
1532 true
1533 }
1534}
1535
1536fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1537 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1538}
1539
/// Resolves a match path against the project root.
///
/// Absolute paths are returned as-is; relative paths are joined onto
/// `project_root`.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    // `Path::join` already replaces the base when the argument is absolute,
    // so a single call covers both cases.
    project_root.join(path)
}
1547
/// Returns the modification time of `path`, or `None` when the metadata or
/// the timestamp cannot be obtained.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    fs::metadata(path).ok()?.modified().ok()
}
1553
/// Lexically normalizes `path` without touching the filesystem.
///
/// * `.` components are dropped.
/// * `..` cancels the preceding normal component.
/// * `..` directly below a root directory is dropped (the parent of the root
///   is the root).
/// * `..` with nothing to cancel — at the start of a relative path, after a
///   bare prefix, or after other retained `..` components — is kept, so
///   `../../a` stays `../../a`.
///
/// Symlinks are not resolved.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {}
            Component::ParentDir => match result.components().next_back() {
                // A regular segment is cancelled by the `..`.
                Some(Component::Normal(_)) => {
                    result.pop();
                }
                // Cannot climb above the root.
                Some(Component::RootDir) => {}
                // Nothing to cancel: keep the `..`. Popping here would
                // wrongly erase an earlier `..`.
                Some(Component::ParentDir | Component::CurDir | Component::Prefix(_)) | None => {
                    result.push(component);
                }
            },
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => {
                result.push(component);
            }
        }
    }
    result
}
1569
1570fn verify_file_mtimes(index: &mut SearchIndex) {
1573 let mut stale_paths = Vec::new();
1575 for entry in &index.files {
1576 if entry.path.as_os_str().is_empty() {
1577 continue; }
1579 match fs::metadata(&entry.path) {
1580 Ok(meta) => {
1581 let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1582 if current_mtime != entry.modified || meta.len() != entry.size {
1583 stale_paths.push(entry.path.clone());
1584 }
1585 }
1586 Err(_) => {
1587 stale_paths.push(entry.path.clone());
1589 }
1590 }
1591 }
1592
1593 for path in &stale_paths {
1595 index.update_file(path);
1596 }
1597
1598 let filters = PathFilters::default();
1600 for path in walk_project_files(&index.project_root, &filters) {
1601 if !index.path_to_id.contains_key(&path) {
1602 index.update_file(&path);
1603 }
1604 }
1605
1606 if !stale_paths.is_empty() {
1607 log::info!(
1608 "search index: refreshed {} stale file(s) from disk cache",
1609 stale_paths.len()
1610 );
1611 }
1612}
1613
/// Returns `true` when `path` equals `search_root` or lies beneath it.
///
/// The comparison is component-wise, so `/a/bc` is not inside `/a/b`.
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.strip_prefix(search_root).is_ok()
}
1617
1618impl QueryBuild {
1619 fn into_query(self) -> RegexQuery {
1620 let mut query = RegexQuery::default();
1621
1622 for run in self.and_runs {
1623 add_run_to_and_query(&mut query, &run);
1624 }
1625
1626 for group in self.or_groups {
1627 let mut trigrams = BTreeSet::new();
1628 let mut filters = HashMap::new();
1629 for run in group {
1630 for (trigram, filter) in trigram_filters(&run) {
1631 trigrams.insert(trigram);
1632 merge_filter(filters.entry(trigram).or_default(), filter);
1633 }
1634 }
1635 if !trigrams.is_empty() {
1636 query.or_groups.push(trigrams.into_iter().collect());
1637 query.or_filters.push(filters);
1638 }
1639 }
1640
1641 query
1642 }
1643}
1644
1645fn build_query(hir: &Hir) -> QueryBuild {
1646 match hir.kind() {
1647 HirKind::Literal(literal) => {
1648 if literal.0.len() >= 3 {
1649 QueryBuild {
1650 and_runs: vec![literal.0.to_vec()],
1651 or_groups: Vec::new(),
1652 }
1653 } else {
1654 QueryBuild::default()
1655 }
1656 }
1657 HirKind::Capture(capture) => build_query(&capture.sub),
1658 HirKind::Concat(parts) => {
1659 let mut build = QueryBuild::default();
1660 for part in parts {
1661 let part_build = build_query(part);
1662 build.and_runs.extend(part_build.and_runs);
1663 build.or_groups.extend(part_build.or_groups);
1664 }
1665 build
1666 }
1667 HirKind::Alternation(parts) => {
1668 let mut group = Vec::new();
1669 for part in parts {
1670 let Some(mut choices) = guaranteed_run_choices(part) else {
1671 return QueryBuild::default();
1672 };
1673 group.append(&mut choices);
1674 }
1675 if group.is_empty() {
1676 QueryBuild::default()
1677 } else {
1678 QueryBuild {
1679 and_runs: Vec::new(),
1680 or_groups: vec![group],
1681 }
1682 }
1683 }
1684 HirKind::Repetition(repetition) => {
1685 if repetition.min == 0 {
1686 QueryBuild::default()
1687 } else {
1688 build_query(&repetition.sub)
1689 }
1690 }
1691 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1692 }
1693}
1694
1695fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1696 match hir.kind() {
1697 HirKind::Literal(literal) => {
1698 if literal.0.len() >= 3 {
1699 Some(vec![literal.0.to_vec()])
1700 } else {
1701 None
1702 }
1703 }
1704 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1705 HirKind::Concat(parts) => {
1706 let mut runs = Vec::new();
1707 for part in parts {
1708 if let Some(mut part_runs) = guaranteed_run_choices(part) {
1709 runs.append(&mut part_runs);
1710 }
1711 }
1712 if runs.is_empty() {
1713 None
1714 } else {
1715 Some(runs)
1716 }
1717 }
1718 HirKind::Alternation(parts) => {
1719 let mut runs = Vec::new();
1720 for part in parts {
1721 let Some(mut part_runs) = guaranteed_run_choices(part) else {
1722 return None;
1723 };
1724 runs.append(&mut part_runs);
1725 }
1726 if runs.is_empty() {
1727 None
1728 } else {
1729 Some(runs)
1730 }
1731 }
1732 HirKind::Repetition(repetition) => {
1733 if repetition.min == 0 {
1734 None
1735 } else {
1736 guaranteed_run_choices(&repetition.sub)
1737 }
1738 }
1739 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1740 }
1741}
1742
1743fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1744 for (trigram, filter) in trigram_filters(run) {
1745 if !query.and_trigrams.contains(&trigram) {
1746 query.and_trigrams.push(trigram);
1747 }
1748 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1749 }
1750}
1751
1752fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1753 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1754 for (trigram, next_char, position) in extract_trigrams(run) {
1755 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1756 if next_char != EOF_SENTINEL {
1757 entry.next_mask |= mask_for_next_char(next_char);
1758 }
1759 entry.loc_mask |= mask_for_position(position);
1760 }
1761 filters.into_iter().collect()
1762}
1763
1764fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1765 target.next_mask |= filter.next_mask;
1766 target.loc_mask |= filter.loc_mask;
1767}
1768
1769fn mask_for_next_char(next_char: u8) -> u8 {
1770 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1771 1u8 << bit
1772}
1773
/// Maps a byte position to a single-bit mask: bit `position mod 8`.
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
1777
1778fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1779 if patterns.is_empty() {
1780 return Ok(None);
1781 }
1782
1783 let mut builder = GlobSetBuilder::new();
1784 for pattern in patterns {
1785 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1786 builder.add(glob);
1787 }
1788 builder.build().map(Some).map_err(|error| error.to_string())
1789}
1790
/// Reads four bytes from `reader` and decodes them as a little-endian `u32`.
///
/// # Errors
/// Propagates the error from `read_exact`, including running out of input.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut raw = [0u8; 4];
    reader
        .read_exact(&mut raw)
        .map(|()| u32::from_le_bytes(raw))
}
1796
/// Reads eight bytes from `reader` and decodes them as a little-endian `u64`.
///
/// # Errors
/// Propagates the error from `read_exact`, including running out of input.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut raw = [0u8; 8];
    reader
        .read_exact(&mut raw)
        .map(|()| u64::from_le_bytes(raw))
}
1802
/// Writes `value` to `writer` as four little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let encoded = u32::to_le_bytes(value);
    writer.write_all(&encoded)
}
1806
/// Writes `value` to `writer` as eight little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let encoded = u64::to_le_bytes(value);
    writer.write_all(&encoded)
}
1810
1811fn verify_crc32_bytes_slice(bytes: &[u8]) -> std::io::Result<()> {
1812 let Some((body, stored)) = bytes.split_last_chunk::<4>() else {
1813 return Err(std::io::Error::other("search index checksum missing"));
1814 };
1815 let expected = u32::from_le_bytes(*stored);
1816 let actual = crc32fast::hash(body);
1817 if actual != expected {
1818 return Err(std::io::Error::other("search index checksum mismatch"));
1819 }
1820 Ok(())
1821}
1822
/// Returns how many bytes lie between the reader's current position and
/// `total_len`.
///
/// Yields `None` when the position cannot be queried, does not fit in
/// `usize`, or is already beyond `total_len`.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    let position = usize::try_from(position).ok()?;
    total_len.checked_sub(position)
}
1827
/// Runs `git -C <root> <args…>` and returns its trimmed standard output.
///
/// Returns `None` when the process cannot be started, exits unsuccessfully,
/// prints non-UTF-8 output, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()
        .filter(|output| output.status.success())?;

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
1846
1847fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1848 let diff_range = format!("{}..{}", from, to);
1849 let output = match Command::new("git")
1850 .arg("-C")
1851 .arg(root)
1852 .args(["diff", "--name-only", &diff_range])
1853 .output()
1854 {
1855 Ok(output) => output,
1856 Err(_) => return false,
1857 };
1858
1859 if !output.status.success() {
1860 return false;
1861 }
1862
1863 let Ok(paths) = String::from_utf8(output.stdout) else {
1864 return false;
1865 };
1866
1867 for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1868 let path = root.join(relative_path);
1869 if path.exists() {
1870 index.update_file(&path);
1871 } else {
1872 index.remove_file(&path);
1873 }
1874 }
1875
1876 true
1877}
1878
1879fn is_binary_path(path: &Path, size: u64) -> bool {
1880 if size == 0 {
1881 return false;
1882 }
1883
1884 let mut file = match File::open(path) {
1885 Ok(file) => file,
1886 Err(_) => return true,
1887 };
1888
1889 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1890 match file.read(&mut preview) {
1891 Ok(read) => is_binary_bytes(&preview[..read]),
1892 Err(_) => true,
1893 }
1894}
1895
/// Returns the byte offset at which every line of `content` begins.
///
/// Offset `0` is always present; each `\n` contributes the offset of the
/// byte right after it, even when that is the end of the buffer.
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    let after_newlines = content
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .map(|(index, _)| index + 1);

    std::iter::once(0).chain(after_newlines).collect()
}
1905
/// Describes the line that contains the byte at `offset`.
///
/// `line_starts` holds the ascending start offset of every line in
/// `content`. Returns `(line, column, line_text)` where:
/// * `line` is 1-based;
/// * `column` is 1-based and counts characters (after lossy UTF-8 decoding)
///   from the line start up to `offset`;
/// * `line_text` is the lossily decoded line without its `\n` and without a
///   trailing `\r`.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // An exact hit means `offset` is a line start; otherwise the line is the
    // one that begins just before the insertion point.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    let tail = &content[line_start..];
    let line_len = tail
        .iter()
        .position(|&byte| byte == b'\n')
        .unwrap_or(tail.len());
    let raw_line = &tail[..line_len];
    let raw_line = raw_line.strip_suffix(b"\r").unwrap_or(raw_line);
    let line_text = String::from_utf8_lossy(raw_line).into_owned();

    let chars_before = String::from_utf8_lossy(&content[line_start..offset])
        .chars()
        .count();

    (line_index as u32 + 1, chars_before as u32 + 1, line_text)
}
1928
/// Renders `path` as a string for glob matching, with every backslash turned
/// into a forward slash. Non-UTF-8 parts are replaced lossily.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1932
1933#[cfg(test)]
1934mod tests {
1935 use std::process::Command;
1936
1937 use super::*;
1938
1939 #[test]
1940 fn extract_trigrams_tracks_next_char_and_position() {
1941 let trigrams = extract_trigrams(b"Rust");
1942 assert_eq!(trigrams.len(), 2);
1943 assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
1944 assert_eq!(
1945 trigrams[1],
1946 (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
1947 );
1948 }
1949
1950 #[test]
1951 fn decompose_regex_extracts_literals_and_alternations() {
1952 let query = decompose_regex("abc(def|ghi)xyz");
1953 assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
1954 assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
1955 assert_eq!(query.or_groups.len(), 1);
1956 assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
1957 assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
1958 }
1959
1960 #[test]
1961 fn candidates_intersect_posting_lists() {
1962 let mut index = SearchIndex::new();
1963 let dir = tempfile::tempdir().expect("create temp dir");
1964 let alpha = dir.path().join("alpha.txt");
1965 let beta = dir.path().join("beta.txt");
1966 fs::write(&alpha, "abcdef").expect("write alpha");
1967 fs::write(&beta, "abcxyz").expect("write beta");
1968 index.project_root = dir.path().to_path_buf();
1969 index.index_file(&alpha, b"abcdef");
1970 index.index_file(&beta, b"abcxyz");
1971
1972 let query = RegexQuery {
1973 and_trigrams: vec![
1974 pack_trigram(b'a', b'b', b'c'),
1975 pack_trigram(b'd', b'e', b'f'),
1976 ],
1977 ..RegexQuery::default()
1978 };
1979
1980 let candidates = index.candidates(&query);
1981 assert_eq!(candidates.len(), 1);
1982 assert_eq!(index.files[candidates[0] as usize].path, alpha);
1983 }
1984
1985 #[test]
1986 fn candidates_apply_bloom_filters() {
1987 let mut index = SearchIndex::new();
1988 let dir = tempfile::tempdir().expect("create temp dir");
1989 let file = dir.path().join("sample.txt");
1990 fs::write(&file, "abcd efgh").expect("write sample");
1991 index.project_root = dir.path().to_path_buf();
1992 index.index_file(&file, b"abcd efgh");
1993
1994 let trigram = pack_trigram(b'a', b'b', b'c');
1995 let matching_filter = PostingFilter {
1996 next_mask: mask_for_next_char(b'd'),
1997 loc_mask: mask_for_position(0),
1998 };
1999 let non_matching_filter = PostingFilter {
2000 next_mask: mask_for_next_char(b'z'),
2001 loc_mask: mask_for_position(0),
2002 };
2003
2004 assert_eq!(
2005 index
2006 .postings_for_trigram(trigram, Some(matching_filter))
2007 .len(),
2008 1
2009 );
2010 assert!(index
2011 .postings_for_trigram(trigram, Some(non_matching_filter))
2012 .is_empty());
2013 }
2014
2015 #[test]
2016 fn disk_round_trip_preserves_postings_and_files() {
2017 let dir = tempfile::tempdir().expect("create temp dir");
2018 let project = dir.path().join("project");
2019 fs::create_dir_all(&project).expect("create project dir");
2020 let file = project.join("src.txt");
2021 fs::write(&file, "abcdef").expect("write source");
2022
2023 let mut index = SearchIndex::build(&project);
2024 index.git_head = Some("deadbeef".to_string());
2025 let cache_dir = dir.path().join("cache");
2026 index.write_to_disk(&cache_dir, index.git_head.as_deref());
2027
2028 let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
2029 assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
2030 assert_eq!(loaded.files.len(), 1);
2031 assert_eq!(
2032 relative_to_root(&loaded.project_root, &loaded.files[0].path),
2033 PathBuf::from("src.txt")
2034 );
2035 assert_eq!(loaded.postings.len(), index.postings.len());
2036 assert!(loaded
2037 .postings
2038 .contains_key(&pack_trigram(b'a', b'b', b'c')));
2039 }
2040
2041 #[test]
2042 fn read_from_disk_rejects_corrupt_postings_checksum() {
2043 let dir = tempfile::tempdir().expect("create temp dir");
2044 let project = dir.path().join("project");
2045 fs::create_dir_all(&project).expect("create project dir");
2046 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2047
2048 let index = SearchIndex::build(&project);
2049 let cache_dir = dir.path().join("cache");
2050 index.write_to_disk(&cache_dir, None);
2051
2052 let cache_path = cache_dir.join("cache.bin");
2053 let mut bytes = fs::read(&cache_path).expect("read cache");
2054 let middle = bytes.len() / 2;
2055 bytes[middle] ^= 0xff;
2056 fs::write(&cache_path, bytes).expect("write corrupted cache");
2057
2058 assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2059 }
2060
2061 #[test]
2062 fn write_to_disk_uses_temp_files_and_cleans_them_up() {
2063 let dir = tempfile::tempdir().expect("create temp dir");
2064 let project = dir.path().join("project");
2065 fs::create_dir_all(&project).expect("create project dir");
2066 fs::write(project.join("src.txt"), "abcdef").expect("write source");
2067
2068 let index = SearchIndex::build(&project);
2069 let cache_dir = dir.path().join("cache");
2070 index.write_to_disk(&cache_dir, None);
2071
2072 assert!(cache_dir.join("cache.bin").is_file());
2073 assert!(fs::read_dir(&cache_dir)
2074 .expect("read cache dir")
2075 .all(|entry| !entry
2076 .expect("cache entry")
2077 .file_name()
2078 .to_string_lossy()
2079 .contains(".tmp.")));
2080 }
2081
2082 #[test]
2083 fn concurrent_search_index_writes_do_not_corrupt() {
2084 let dir = tempfile::tempdir().expect("create temp dir");
2085 let project = dir.path().join("project");
2086 fs::create_dir_all(&project).expect("create project dir");
2087 fs::write(project.join("src.txt"), "abcdef\n").expect("write source");
2088 let cache_dir = dir.path().join("cache");
2089
2090 let a_project = project.clone();
2091 let a_cache = cache_dir.clone();
2092 let a = std::thread::spawn(move || {
2093 let _lock = CacheLock::acquire(&a_cache).expect("acquire cache lock a");
2094 let index = SearchIndex::build(&a_project);
2095 index.write_to_disk(&a_cache, None);
2096 });
2097 let b_project = project.clone();
2098 let b_cache = cache_dir.clone();
2099 let b = std::thread::spawn(move || {
2100 let _lock = CacheLock::acquire(&b_cache).expect("acquire cache lock b");
2101 let index = SearchIndex::build(&b_project);
2102 index.write_to_disk(&b_cache, None);
2103 });
2104 a.join().expect("writer a");
2105 b.join().expect("writer b");
2106
2107 assert!(SearchIndex::read_from_disk(&cache_dir).is_some());
2108 }
2109
2110 #[test]
2111 fn search_index_atomic_rename_survives_partial_write() {
2112 let dir = tempfile::tempdir().expect("create temp dir");
2113 let cache_dir = dir.path().join("cache");
2114 fs::create_dir_all(&cache_dir).expect("create cache dir");
2115 fs::write(cache_dir.join("cache.bin.tmp.1.1"), b"partial").expect("write partial tmp");
2116
2117 assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
2118 }
2119
    // Creates a git repository with one commit, clones it to a second
    // directory, and compares the cache keys of the two checkouts.
    //
    // NOTE(review): despite the test name, the final assertion requires the
    // source repository and its clone to produce the SAME key, i.e. the
    // checkout path does not influence the key for these two repositories.
    // Consider renaming the test to reflect what it pins.
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        // Identity is passed on the command line so the commit does not
        // depend on git configuration outside this invocation.
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        // Keys are 16 characters long and identical for both checkouts.
        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        assert_eq!(source_key, clone_key);
    }
2171
2172 #[test]
2173 fn git_head_unchanged_picks_up_local_edits() {
2174 let dir = tempfile::tempdir().expect("create temp dir");
2175 let project = dir.path().join("repo");
2176 fs::create_dir_all(&project).expect("create repo dir");
2177 let file = project.join("tracked.txt");
2178 fs::write(&file, "oldtoken\n").expect("write file");
2179 assert!(Command::new("git")
2180 .current_dir(&project)
2181 .arg("init")
2182 .status()
2183 .unwrap()
2184 .success());
2185 assert!(Command::new("git")
2186 .current_dir(&project)
2187 .args(["add", "."])
2188 .status()
2189 .unwrap()
2190 .success());
2191 assert!(Command::new("git")
2192 .current_dir(&project)
2193 .args([
2194 "-c",
2195 "user.name=AFT Tests",
2196 "-c",
2197 "user.email=aft-tests@example.com",
2198 "commit",
2199 "-m",
2200 "initial"
2201 ])
2202 .status()
2203 .unwrap()
2204 .success());
2205 let head = current_git_head(&project);
2206 let mut baseline = SearchIndex::build(&project);
2207 baseline.git_head = head.clone();
2208 fs::write(&file, "newtoken\n").expect("edit tracked file");
2209
2210 let refreshed =
2211 SearchIndex::rebuild_or_refresh(&project, DEFAULT_MAX_FILE_SIZE, head, Some(baseline));
2212 let result = refreshed.search_grep("newtoken", true, &[], &[], &project, 10);
2213
2214 assert_eq!(result.total_matches, 1);
2215 }
2216
/// Refreshing from an unchanged baseline, with `None` passed as the head
/// argument, must report the same file count as the baseline and still find
/// the token that was written before the baseline was built.
#[test]
fn non_git_project_reuses_cache_when_files_unchanged() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let root = tmp.path().join("project");
    fs::create_dir_all(&root).expect("create project dir");
    let only_file = root.join("file.txt");
    fs::write(only_file, "unchangedtoken\n").expect("write file");

    // Remember the baseline's size before handing ownership to the refresh.
    let initial = SearchIndex::build(&root);
    let expected_files = initial.file_count();

    let refreshed =
        SearchIndex::rebuild_or_refresh(&root, DEFAULT_MAX_FILE_SIZE, None, Some(initial));
    assert_eq!(refreshed.file_count(), expected_files);

    let hits = refreshed.search_grep("unchangedtoken", true, &[], &[], &root, 10);
    assert_eq!(hits.total_matches, 1);
}
2237
/// When the requested search path is a sibling of the project directory
/// (i.e. not inside it), the resolved scope must point at the canonicalized
/// external directory and must have `use_index` set to `false`.
#[test]
fn resolve_search_scope_disables_index_for_external_path() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let base = tmp.path();
    let project = base.join("project");
    let outside = base.join("outside");
    fs::create_dir_all(&project).expect("create project dir");
    fs::create_dir_all(&outside).expect("create outside dir");

    let scope = resolve_search_scope(&project, outside.to_str());

    // The scope root is compared against the canonical form of the path.
    let canonical_outside = fs::canonicalize(&outside).expect("canonicalize outside");
    assert_eq!(scope.root, canonical_outside);
    assert!(!scope.use_index);
}
2254
/// The index is built over the whole project, but the grep is rooted at
/// `src/`: only the file under `src/` may be searched and reported, even
/// though a file under `docs/` contains the same pattern.
#[test]
fn grep_filters_matches_to_search_root() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let src_dir = project.join("src");
    let docs_dir = project.join("docs");
    fs::create_dir_all(&src_dir).expect("create src dir");
    fs::create_dir_all(&docs_dir).expect("create docs dir");

    let src_file = src_dir.join("main.rs");
    let docs_file = docs_dir.join("guide.md");
    fs::write(&src_file, "pub struct SearchIndex;\n").expect("write src file");
    fs::write(docs_file, "SearchIndex guide\n").expect("write docs file");

    let index = SearchIndex::build(&project);
    let outcome = index.search_grep("SearchIndex", true, &[], &[], &src_dir, 10);

    assert_eq!(outcome.files_searched, 1);
    assert_eq!(outcome.files_with_matches, 1);
    assert_eq!(outcome.matches.len(), 1);
    // Reported paths are compared against the canonical form of the file.
    let canonical_src_file = fs::canonicalize(&src_file).expect("canonicalize");
    assert_eq!(outcome.matches[0].file, canonical_src_file);
}
2276
/// A line that contains the pattern twice must be counted and reported as a
/// single match.
#[test]
fn grep_deduplicates_multiple_matches_on_same_line() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let src_dir = project.join("src");
    fs::create_dir_all(&src_dir).expect("create src dir");
    // One line, two occurrences of the pattern.
    let target = src_dir.join("main.rs");
    fs::write(target, "SearchIndex SearchIndex\n").expect("write src file");

    let index = SearchIndex::build(&project);
    let outcome = index.search_grep("SearchIndex", true, &[], &[], &src_dir, 10);

    assert_eq!(outcome.total_matches, 1);
    assert_eq!(outcome.matches.len(), 1);
}
2291
/// With two matching lines and a result limit of 1, `total_matches` must
/// still report 2 while only one match is returned and `truncated` is set.
#[test]
fn grep_reports_total_matches_before_truncation() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let src_dir = project.join("src");
    fs::create_dir_all(&src_dir).expect("create src dir");
    // Two separate lines, each containing the pattern once.
    let target = src_dir.join("main.rs");
    fs::write(target, "SearchIndex\nSearchIndex\n").expect("write src file");

    let index = SearchIndex::build(&project);
    let outcome = index.search_grep("SearchIndex", true, &[], &[], &src_dir, 1);

    assert_eq!(outcome.total_matches, 2);
    assert_eq!(outcome.matches.len(), 1);
    assert!(outcome.truncated);
}
2307
/// Globbing `**/*.rs` rooted at `src/` must return only the file under
/// `src/`, not the `.rs` file that lives under the sibling `scripts/`
/// directory of the same project.
#[test]
fn glob_filters_results_to_search_root() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let src_dir = project.join("src");
    let scripts_dir = project.join("scripts");
    fs::create_dir_all(&src_dir).expect("create src dir");
    fs::create_dir_all(&scripts_dir).expect("create scripts dir");

    let src_file = src_dir.join("main.rs");
    let script_file = scripts_dir.join("tool.rs");
    fs::write(&src_file, "pub fn main() {}\n").expect("write src file");
    fs::write(script_file, "pub fn tool() {}\n").expect("write scripts file");

    let index = SearchIndex::build(&project);
    let found = index.glob("**/*.rs", &src_dir);

    let expected = vec![fs::canonicalize(&src_file).expect("canonicalize src file")];
    assert_eq!(found, expected);
}
2327
/// A file with non-text bytes inside a dot-directory must still be returned
/// by `glob` when its name matches the pattern.
#[test]
fn glob_includes_hidden_and_binary_files() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let project = tmp.path().join("project");
    let dot_dir = project.join(".hidden");
    fs::create_dir_all(&dot_dir).expect("create hidden dir");

    // Payload starts with a NUL byte followed by bytes outside ASCII.
    let blob_path = dot_dir.join("data.bin");
    fs::write(&blob_path, [0u8, 159, 146, 150]).expect("write binary file");

    let index = SearchIndex::build(&project);
    let found = index.glob("**/*.bin", &project);

    let expected = vec![fs::canonicalize(&blob_path).expect("canonicalize binary file")];
    assert_eq!(found, expected);
}
2345
/// Hand-assembles a `cache.bin` whose two sections carry valid CRC32
/// trailers but whose postings section contains the value `1_000_000_000`
/// in a `u32` field, and checks that `SearchIndex::read_from_disk` returns
/// `None` for it.
///
/// NOTE(review): judging by the test name, that `u32` is presumably the
/// nanosecond part of a timestamp (valid sub-second nanos are below
/// 1_000_000_000) — confirm the field layout against the cache reader.
#[test]
fn read_from_disk_rejects_invalid_nanos() {
    let tmp = tempfile::tempdir().expect("create temp dir");
    let cache_dir = tmp.path().join("cache");
    fs::create_dir_all(&cache_dir).expect("create cache dir");

    // Postings section: every integer is appended in little-endian order.
    let mut postings: Vec<u8> = Vec::new();
    postings.extend(*INDEX_MAGIC);
    postings.extend(INDEX_VERSION.to_le_bytes());
    postings.extend(0u32.to_le_bytes());
    postings.extend(1u32.to_le_bytes());
    postings.extend(DEFAULT_MAX_FILE_SIZE.to_le_bytes());
    postings.extend(1u32.to_le_bytes());
    postings.extend(*b"/");
    postings.push(0u8);
    postings.extend(1u32.to_le_bytes());
    postings.extend(0u64.to_le_bytes());
    postings.extend(0u64.to_le_bytes());
    // The out-of-range value this test is about (see NOTE above).
    postings.extend(1_000_000_000u32.to_le_bytes());
    postings.extend(*b"a");
    postings.extend(0u64.to_le_bytes());
    // The checksum covers everything written so far and is appended last.
    let postings_crc = crc32fast::hash(&postings);
    postings.extend(postings_crc.to_le_bytes());

    // Lookup section: magic, version, one zero `u32`, then its own checksum.
    let mut lookup: Vec<u8> = Vec::new();
    lookup.extend(*LOOKUP_MAGIC);
    lookup.extend(INDEX_VERSION.to_le_bytes());
    lookup.extend(0u32.to_le_bytes());
    let lookup_crc = crc32fast::hash(&lookup);
    lookup.extend(lookup_crc.to_le_bytes());

    // Outer container: magic, version, postings length (checksum included),
    // followed by the two sections back to back.
    let postings_len = postings.len() as u64;
    let mut cache_bytes: Vec<u8> = Vec::new();
    cache_bytes.extend(CACHE_MAGIC.to_le_bytes());
    cache_bytes.extend(INDEX_VERSION.to_le_bytes());
    cache_bytes.extend(postings_len.to_le_bytes());
    cache_bytes.extend(postings);
    cache_bytes.extend(lookup);
    fs::write(cache_dir.join("cache.bin"), cache_bytes).expect("write cache");

    let loaded = SearchIndex::read_from_disk(&cache_dir);
    assert!(loaded.is_none());
}
2387
/// Feeds `sort_paths_by_mtime_desc` a list in which every other path does
/// not exist on disk and checks that sorting completes and keeps the list
/// length unchanged.
///
/// The sort is repeated 50 times on fresh clones; presumably this gives any
/// inconsistent comparison several chances to surface — confirm intent.
#[test]
fn sort_paths_by_mtime_desc_does_not_panic_on_missing_files() {
    let tmp = tempfile::tempdir().expect("create tempdir");

    // Even indices get a real file written; odd indices are never created.
    let paths: Vec<PathBuf> = (0..30)
        .map(|i| {
            if i % 2 == 0 {
                let existing = tmp.path().join(format!("real-{i}.rs"));
                fs::write(&existing, format!("// {i}\n")).expect("write");
                existing
            } else {
                tmp.path().join(format!("missing-{i}.rs"))
            }
        })
        .collect();

    for _ in 0..50 {
        let mut working = paths.clone();
        sort_paths_by_mtime_desc(&mut working);
        assert_eq!(working.len(), paths.len());
    }
}
2429
/// Feeds `sort_grep_matches_by_mtime_desc` matches whose files alternate
/// between existing and missing on disk and checks that sorting completes
/// and keeps the number of matches unchanged.
///
/// The sort is repeated 50 times on fresh clones; presumably this gives any
/// inconsistent comparison several chances to surface — confirm intent.
#[test]
fn sort_grep_matches_by_mtime_desc_does_not_panic_on_missing_files() {
    let tmp = tempfile::tempdir().expect("create tempdir");
    let base = tmp.path();

    let mut matches: Vec<GrepMatch> = Vec::new();
    for i in 0..30 {
        // Even indices get a real file written; odd indices are never created.
        let file = match i % 2 {
            0 => {
                let existing = base.join(format!("real-{i}.rs"));
                fs::write(&existing, format!("// {i}\n")).expect("write");
                existing
            }
            _ => base.join(format!("missing-{i}.rs")),
        };
        let line = u32::try_from(i).unwrap_or(0);
        let line_text = format!("match {i}");
        let match_text = format!("match {i}");
        matches.push(GrepMatch {
            file,
            line,
            column: 0,
            line_text,
            match_text,
        });
    }

    for _ in 0..50 {
        let mut working = matches.clone();
        sort_grep_matches_by_mtime_desc(&mut working, base);
        assert_eq!(working.len(), matches.len());
    }
}
2460}