1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Seek, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
/// Files larger than this (bytes) are tracked but not trigram-indexed.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
/// Magic header identifying the on-disk postings file.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// Magic header identifying the on-disk trigram-lookup file.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// Version stamp written to both cache files; a mismatch invalidates the cache.
const INDEX_VERSION: u32 = 1;
/// NOTE(review): not referenced in this chunk — presumably a preview-read size elsewhere.
const PREVIEW_BYTES: usize = 8 * 1024;
/// "Next byte" recorded for a trigram whose occurrence ends at EOF.
const EOF_SENTINEL: u8 = 0;
/// Sanity cap on counts read back from a cache file.
const MAX_ENTRIES: usize = 10_000_000;
/// Minimum serialized size of one file entry: flag(1) + path_len(4) + size(8) + secs(8) + nanos(4).
const MIN_FILE_ENTRY_BYTES: usize = 25;
/// Serialized size of one lookup entry: trigram(4) + offset(8) + count(4).
const LOOKUP_ENTRY_BYTES: usize = 16;
/// Serialized size of one posting: file id(4) + next mask(1) + loc mask(1).
const POSTING_BYTES: usize = 6;
28
/// Trigram-based full-text search index over a project tree.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// Trigram -> posting list; each list is kept sorted by file id.
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table indexed by file id; removed files leave tombstones (empty path).
    pub files: Vec<FileEntry>,
    /// Absolute path -> file id, for live files only.
    pub path_to_id: HashMap<PathBuf, u32>,
    /// Whether the index is fully built and queryable.
    pub ready: bool,
    /// Canonicalized project root; cached paths are stored relative to it.
    project_root: PathBuf,
    /// Git HEAD at build/load time, used for cache-freshness checks.
    git_head: Option<String>,
    /// Files larger than this are tracked but not trigram-indexed.
    max_file_size: u64,
    /// file id -> trigrams it contributed, enabling cheap removal.
    file_trigrams: HashMap<u32, Vec<u32>>,
    /// Ids of oversized files that are always treated as grep candidates.
    unindexed_files: HashSet<u32>,
}
41
impl SearchIndex {
    /// Number of file slots ever allocated (tombstoned slots included).
    pub fn file_count(&self) -> usize {
        self.files.len()
    }

    /// Number of distinct trigrams with at least one posting.
    pub fn trigram_count(&self) -> usize {
        self.postings.len()
    }
}
53
/// One file's occurrence record for a trigram.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Index into `SearchIndex::files`.
    pub file_id: u32,
    /// OR of next-byte masks over every occurrence in the file.
    pub next_mask: u8,
    /// OR of position masks over every occurrence in the file.
    pub loc_mask: u8,
}
60
/// Metadata snapshot for one indexed (or tracked-but-unindexed) file.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Absolute path; empty when the slot has been tombstoned by `remove_file`.
    pub path: PathBuf,
    /// File size in bytes at index time.
    pub size: u64,
    /// Last-modified time at index time.
    pub modified: SystemTime,
}
67
/// A single reported grep hit (only the first match on a line is reported).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// Absolute path of the matching file.
    pub file: PathBuf,
    /// Line number of the match (numbering base comes from `line_details_bytes` — defined elsewhere).
    pub line: u32,
    /// Column of the match start on that line.
    pub column: u32,
    /// Full text of the matching line.
    pub line_text: String,
    /// The matched text itself (lossy UTF-8).
    pub match_text: String,
}
76
/// Aggregate outcome of a grep query.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Returned matches (capped at `max_results`), newest files first.
    pub matches: Vec<GrepMatch>,
    /// Total hits counted; may exceed `matches.len()` when truncated.
    pub total_matches: usize,
    /// Candidate files actually read and scanned.
    pub files_searched: usize,
    /// Scanned files that produced at least one hit.
    pub files_with_matches: usize,
    /// Index readiness at the time the query ran.
    pub index_status: IndexStatus,
    /// True when the `max_results` cap cut the result list short.
    pub truncated: bool,
}
86
/// Readiness of the index when a query runs.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    /// Index fully built; results are complete.
    Ready,
    /// Index still building; results may be partial.
    Building,
    /// NOTE(review): not constructed in this chunk — presumably used by a
    /// non-indexed fallback search path elsewhere.
    Fallback,
}
93
94impl IndexStatus {
95 pub fn as_str(&self) -> &'static str {
96 match self {
97 IndexStatus::Ready => "Ready",
98 IndexStatus::Building => "Building",
99 IndexStatus::Fallback => "Fallback",
100 }
101 }
102}
103
/// Trigram requirements extracted from a regex, used to prune grep candidates.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Trigrams that must all appear in a candidate file.
    pub and_trigrams: Vec<u32>,
    /// Alternation groups: a candidate needs at least one trigram from each group.
    pub or_groups: Vec<Vec<u32>>,
    /// Optional posting-mask filters keyed by AND trigram.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Per-group posting-mask filters, parallel to `or_groups`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
111
/// Mask constraints a posting must satisfy; a zero mask means "no constraint".
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    /// Required next-byte mask (tested with bitwise AND in `postings_for_trigram`).
    next_mask: u8,
    /// Required location mask — NOTE(review): never consulted in this chunk.
    loc_mask: u8,
}
117
/// Intermediate literal-run sets collected while walking a regex HIR
/// (consumed by `into_query`, defined elsewhere in this file).
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    /// Byte runs that must all appear in a match.
    and_runs: Vec<Vec<u8>>,
    /// Alternation groups: at least one run per inner set must appear.
    or_groups: Vec<Vec<Vec<u8>>>,
}
123
/// Compiled include/exclude glob sets; `None` means "no constraint".
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    /// Paths must match one of these globs, when present.
    includes: Option<GlobSet>,
    /// Paths matching any of these globs are skipped, when present.
    excludes: Option<GlobSet>,
}
129
/// Where a search runs and whether the trigram index may be consulted.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    /// Root directory of the search.
    pub root: PathBuf,
    /// Whether the trigram index may be used for this search.
    pub use_index: bool,
}
135
/// Internal match record; all hits in one file share a single `Arc` path to
/// avoid cloning the `PathBuf` per match.
#[derive(Clone, Debug)]
struct SharedGrepMatch {
    file: Arc<PathBuf>,
    line: u32,
    column: u32,
    line_text: String,
    match_text: String,
}
144
/// Compiled search strategy: fast literal byte scan, or full regex.
#[derive(Clone, Debug)]
enum SearchMatcher {
    Literal(LiteralSearch),
    Regex(Regex),
}
150
/// Literal needle variants. The insensitive form stores a pre-lowercased
/// needle and is scanned against an ASCII-lowercased copy of the content.
#[derive(Clone, Debug)]
enum LiteralSearch {
    CaseSensitive(Vec<u8>),
    AsciiCaseInsensitive(Vec<u8>),
}
156
157impl SearchIndex {
158 pub fn new() -> Self {
159 SearchIndex {
160 postings: HashMap::new(),
161 files: Vec::new(),
162 path_to_id: HashMap::new(),
163 ready: false,
164 project_root: PathBuf::new(),
165 git_head: None,
166 max_file_size: DEFAULT_MAX_FILE_SIZE,
167 file_trigrams: HashMap::new(),
168 unindexed_files: HashSet::new(),
169 }
170 }
171
    /// Builds a fresh index for `root` using the default max file size.
    pub fn build(root: &Path) -> Self {
        Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
    }
175
176 pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
177 let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
178 let mut index = SearchIndex {
179 project_root: project_root.clone(),
180 max_file_size,
181 ..SearchIndex::new()
182 };
183
184 let filters = PathFilters::default();
185 for path in walk_project_files(&project_root, &filters) {
186 index.update_file(&path);
187 }
188
189 index.git_head = current_git_head(&project_root);
190 index.ready = true;
191 index
192 }
193
    /// Indexes `content` for `path`, replacing any previous entry for the path.
    ///
    /// Builds one posting per distinct trigram, OR-ing the next-byte and
    /// position masks across all occurrences, and records the file's trigram
    /// list so the file can be removed cheaply later.
    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
        self.remove_file(path);

        let file_id = match self.allocate_file_id(path, content.len() as u64) {
            Some(file_id) => file_id,
            None => return,
        };

        // Aggregate masks per trigram across every occurrence in this file.
        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
        for (trigram, next_char, position) in extract_trigrams(content) {
            let entry = trigram_map.entry(trigram).or_default();
            entry.next_mask |= mask_for_next_char(next_char);
            entry.loc_mask |= mask_for_position(position);
        }

        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
        for (trigram, filter) in trigram_map {
            let postings = self.postings.entry(trigram).or_default();
            postings.push(Posting {
                file_id,
                next_mask: filter.next_mask,
                loc_mask: filter.loc_mask,
            });
            // Posting lists must stay sorted by file id. Newly allocated ids
            // are normally the largest seen so far, so only re-sort defensively
            // when the append broke the order.
            if postings.len() > 1
                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
            {
                postings.sort_unstable_by_key(|p| p.file_id);
            }
            file_trigrams.push(trigram);
        }

        self.file_trigrams.insert(file_id, file_trigrams);
        self.unindexed_files.remove(&file_id);
    }
231
232 pub fn remove_file(&mut self, path: &Path) {
233 let Some(file_id) = self.path_to_id.remove(path) else {
234 return;
235 };
236
237 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
238 for trigram in trigrams {
239 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
240 postings.retain(|posting| posting.file_id != file_id);
241 postings.is_empty()
242 } else {
243 false
244 };
245
246 if should_remove {
247 self.postings.remove(&trigram);
248 }
249 }
250 }
251
252 self.unindexed_files.remove(&file_id);
253 if let Some(file) = self.files.get_mut(file_id as usize) {
254 file.path = PathBuf::new();
255 file.size = 0;
256 file.modified = UNIX_EPOCH;
257 }
258 }
259
260 pub fn update_file(&mut self, path: &Path) {
261 self.remove_file(path);
262
263 let metadata = match fs::metadata(path) {
264 Ok(metadata) if metadata.is_file() => metadata,
265 _ => return,
266 };
267
268 if is_binary_path(path, metadata.len()) {
269 return;
270 }
271
272 if metadata.len() > self.max_file_size {
273 self.track_unindexed_file(path, &metadata);
274 return;
275 }
276
277 let content = match fs::read(path) {
278 Ok(content) => content,
279 Err(_) => return,
280 };
281
282 if is_binary_bytes(&content) {
283 return;
284 }
285
286 self.index_file(path, &content);
287 }
288
    /// Convenience wrapper that forwards to [`Self::search_grep`] with
    /// identical arguments and behavior.
    pub fn grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        self.search_grep(
            pattern,
            case_sensitive,
            include,
            exclude,
            search_root,
            max_results,
        )
    }
307
308 pub fn search_grep(
309 &self,
310 pattern: &str,
311 case_sensitive: bool,
312 include: &[String],
313 exclude: &[String],
314 search_root: &Path,
315 max_results: usize,
316 ) -> GrepResult {
317 let is_literal = !pattern.chars().any(|c| {
320 matches!(
321 c,
322 '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
323 )
324 });
325
326 let literal_search = if is_literal {
327 if case_sensitive {
328 Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
329 } else if pattern.is_ascii() {
330 Some(LiteralSearch::AsciiCaseInsensitive(
331 pattern
332 .as_bytes()
333 .iter()
334 .map(|byte| byte.to_ascii_lowercase())
335 .collect(),
336 ))
337 } else {
338 None
339 }
340 } else {
341 None
342 };
343
344 let regex = if literal_search.is_some() {
346 None
347 } else {
348 let regex_pattern = if is_literal {
349 regex::escape(pattern)
350 } else {
351 pattern.to_string()
352 };
353 let mut builder = RegexBuilder::new(®ex_pattern);
354 builder.case_insensitive(!case_sensitive);
355 match builder.build() {
356 Ok(r) => Some(r),
357 Err(_) => {
358 return GrepResult {
359 matches: Vec::new(),
360 total_matches: 0,
361 files_searched: 0,
362 files_with_matches: 0,
363 index_status: if self.ready {
364 IndexStatus::Ready
365 } else {
366 IndexStatus::Building
367 },
368 truncated: false,
369 };
370 }
371 }
372 };
373
374 let matcher = if let Some(literal_search) = literal_search {
375 SearchMatcher::Literal(literal_search)
376 } else {
377 SearchMatcher::Regex(
378 regex.expect("regex should exist when literal matcher is unavailable"),
379 )
380 };
381
382 let filters = match build_path_filters(include, exclude) {
383 Ok(filters) => filters,
384 Err(_) => PathFilters::default(),
385 };
386 let search_root = canonicalize_or_normalize(search_root);
387
388 let query = decompose_regex(pattern);
389 let candidate_ids = self.candidates(&query);
390
391 let candidate_files: Vec<&FileEntry> = candidate_ids
392 .into_iter()
393 .filter_map(|file_id| self.files.get(file_id as usize))
394 .filter(|file| !file.path.as_os_str().is_empty())
395 .filter(|file| is_within_search_root(&search_root, &file.path))
396 .filter(|file| filters.matches(&self.project_root, &file.path))
397 .collect();
398
399 let total_matches = AtomicUsize::new(0);
400 let files_searched = AtomicUsize::new(0);
401 let files_with_matches = AtomicUsize::new(0);
402 let truncated = AtomicBool::new(false);
403 let stop_after = max_results.saturating_mul(2);
404
405 let mut matches = if candidate_files.len() > 10 {
406 candidate_files
407 .par_iter()
408 .map(|file| {
409 search_candidate_file(
410 file,
411 &matcher,
412 max_results,
413 stop_after,
414 &total_matches,
415 &files_searched,
416 &files_with_matches,
417 &truncated,
418 )
419 })
420 .reduce(Vec::new, |mut left, mut right| {
421 left.append(&mut right);
422 left
423 })
424 } else {
425 let mut matches = Vec::new();
426 for file in candidate_files {
427 matches.extend(search_candidate_file(
428 file,
429 &matcher,
430 max_results,
431 stop_after,
432 &total_matches,
433 &files_searched,
434 &files_with_matches,
435 &truncated,
436 ));
437
438 if should_stop_search(&truncated, &total_matches, stop_after) {
439 break;
440 }
441 }
442 matches
443 };
444
445 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
446 self.path_to_id
447 .get(path)
448 .and_then(|file_id| self.files.get(*file_id as usize))
449 .map(|file| file.modified)
450 });
451
452 let matches = matches
453 .into_iter()
454 .map(|matched| GrepMatch {
455 file: matched.file.as_ref().clone(),
456 line: matched.line,
457 column: matched.column,
458 line_text: matched.line_text,
459 match_text: matched.match_text,
460 })
461 .collect();
462
463 GrepResult {
464 total_matches: total_matches.load(Ordering::Relaxed),
465 matches,
466 files_searched: files_searched.load(Ordering::Relaxed),
467 files_with_matches: files_with_matches.load(Ordering::Relaxed),
468 index_status: if self.ready {
469 IndexStatus::Ready
470 } else {
471 IndexStatus::Building
472 },
473 truncated: truncated.load(Ordering::Relaxed),
474 }
475 }
476
477 pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
478 let filters = match build_path_filters(&[pattern.to_string()], &[]) {
479 Ok(filters) => filters,
480 Err(_) => return Vec::new(),
481 };
482 let search_root = canonicalize_or_normalize(search_root);
483 let filter_root = if search_root.starts_with(&self.project_root) {
484 &self.project_root
485 } else {
486 &search_root
487 };
488
489 let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
490 sort_paths_by_mtime_desc(&mut paths);
491 paths
492 }
493
494 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
495 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
496 return self.active_file_ids();
497 }
498
499 let mut and_trigrams = query.and_trigrams.clone();
500 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
501
502 let mut current: Option<Vec<u32>> = None;
503
504 for trigram in and_trigrams {
505 let filter = query.and_filters.get(&trigram).copied();
506 let matches = self.postings_for_trigram(trigram, filter);
507 current = Some(match current.take() {
508 Some(existing) => intersect_sorted_ids(&existing, &matches),
509 None => matches,
510 });
511
512 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
513 break;
514 }
515 }
516
517 let mut current = current.unwrap_or_else(|| self.active_file_ids());
518
519 for (index, group) in query.or_groups.iter().enumerate() {
520 let mut group_matches = Vec::new();
521 let filters = query.or_filters.get(index);
522
523 for trigram in group {
524 let filter = filters.and_then(|filters| filters.get(trigram).copied());
525 let matches = self.postings_for_trigram(*trigram, filter);
526 if group_matches.is_empty() {
527 group_matches = matches;
528 } else {
529 group_matches = union_sorted_ids(&group_matches, &matches);
530 }
531 }
532
533 current = intersect_sorted_ids(¤t, &group_matches);
534 if current.is_empty() {
535 break;
536 }
537 }
538
539 let mut unindexed = self
540 .unindexed_files
541 .iter()
542 .copied()
543 .filter(|file_id| self.is_active_file(*file_id))
544 .collect::<Vec<_>>();
545 if !unindexed.is_empty() {
546 unindexed.sort_unstable();
547 current = union_sorted_ids(¤t, &unindexed);
548 }
549
550 current
551 }
552
    /// Persists the index into `cache_dir` as two files: `postings.bin`
    /// (header, file table, concatenated posting blob) and `lookup.bin`
    /// (trigram -> blob offset/count). Writes go to `.tmp` files and are
    /// renamed into place; on any error the temp files are removed and the
    /// previous cache (if any) is left untouched. Errors are swallowed.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");
        let tmp_postings = cache_dir.join("postings.bin.tmp");
        let tmp_lookup = cache_dir.join("lookup.bin.tmp");

        // Tombstoned slots are dropped: remap live ids to a dense 0..n range.
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);

            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            // Header: lengths first so the reader can size its buffers.
            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;

            // File table: one entry per live file, path relative to the root.
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                let path = relative_to_root(&self.project_root, &file.path);
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Posting lists are concatenated into one blob; lookup entries
            // record each trigram's (offset, count) into that blob.
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Postings for tombstoned files are absent from id_map.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            drop(postings_writer);

            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            drop(lookup_writer);

            // Publish atomically: readers only ever see complete files.
            fs::rename(&tmp_postings, &postings_path)?;
            fs::rename(&tmp_lookup, &lookup_path)?;

            Ok(())
        })();

        if write_result.is_err() {
            // Best-effort cleanup; the real cache files were never replaced.
            let _ = fs::remove_file(&tmp_postings);
            let _ = fs::remove_file(&tmp_lookup);
        }
    }
683
684 pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
685 let postings_path = cache_dir.join("postings.bin");
686 let lookup_path = cache_dir.join("lookup.bin");
687
688 let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
689 let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);
690 let postings_len_total =
691 usize::try_from(postings_reader.get_ref().metadata().ok()?.len()).ok()?;
692 let lookup_len_total =
693 usize::try_from(lookup_reader.get_ref().metadata().ok()?.len()).ok()?;
694
695 let mut magic = [0u8; 8];
696 postings_reader.read_exact(&mut magic).ok()?;
697 if &magic != INDEX_MAGIC {
698 return None;
699 }
700 if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
701 return None;
702 }
703
704 let head_len = read_u32(&mut postings_reader).ok()? as usize;
705 let root_len = read_u32(&mut postings_reader).ok()? as usize;
706 let max_file_size = read_u64(&mut postings_reader).ok()?;
707 let file_count = read_u32(&mut postings_reader).ok()? as usize;
708 if file_count > MAX_ENTRIES {
709 return None;
710 }
711 let remaining_postings = remaining_bytes(&mut postings_reader, postings_len_total)?;
712 let minimum_file_bytes = file_count.checked_mul(MIN_FILE_ENTRY_BYTES)?;
713 if minimum_file_bytes > remaining_postings {
714 return None;
715 }
716
717 if head_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
718 return None;
719 }
720 let mut head_bytes = vec![0u8; head_len];
721 postings_reader.read_exact(&mut head_bytes).ok()?;
722 let git_head = String::from_utf8(head_bytes)
723 .ok()
724 .filter(|head| !head.is_empty());
725
726 if root_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
727 return None;
728 }
729 let mut root_bytes = vec![0u8; root_len];
730 postings_reader.read_exact(&mut root_bytes).ok()?;
731 let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);
732
733 let mut files = Vec::with_capacity(file_count);
734 let mut path_to_id = HashMap::new();
735 let mut unindexed_files = HashSet::new();
736
737 for file_id in 0..file_count {
738 let mut unindexed = [0u8; 1];
739 postings_reader.read_exact(&mut unindexed).ok()?;
740 let path_len = read_u32(&mut postings_reader).ok()? as usize;
741 let size = read_u64(&mut postings_reader).ok()?;
742 let secs = read_u64(&mut postings_reader).ok()?;
743 let nanos = read_u32(&mut postings_reader).ok()?;
744 if nanos >= 1_000_000_000 {
745 return None;
746 }
747 if path_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
748 return None;
749 }
750 let mut path_bytes = vec![0u8; path_len];
751 postings_reader.read_exact(&mut path_bytes).ok()?;
752 let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
753 let full_path = project_root.join(relative_path);
754 let file_id_u32 = u32::try_from(file_id).ok()?;
755
756 files.push(FileEntry {
757 path: full_path.clone(),
758 size,
759 modified: UNIX_EPOCH + Duration::new(secs, nanos),
760 });
761 path_to_id.insert(full_path, file_id_u32);
762 if unindexed[0] == 1 {
763 unindexed_files.insert(file_id_u32);
764 }
765 }
766
767 let postings_len = read_u64(&mut postings_reader).ok()? as usize;
768 let max_postings_bytes = MAX_ENTRIES.checked_mul(POSTING_BYTES)?;
769 if postings_len > max_postings_bytes {
770 return None;
771 }
772 if postings_len > remaining_bytes(&mut postings_reader, postings_len_total)? {
773 return None;
774 }
775 let mut postings_blob = vec![0u8; postings_len];
776 postings_reader.read_exact(&mut postings_blob).ok()?;
777
778 let mut lookup_magic = [0u8; 8];
779 lookup_reader.read_exact(&mut lookup_magic).ok()?;
780 if &lookup_magic != LOOKUP_MAGIC {
781 return None;
782 }
783 if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
784 return None;
785 }
786 let entry_count = read_u32(&mut lookup_reader).ok()? as usize;
787 if entry_count > MAX_ENTRIES {
788 return None;
789 }
790 let remaining_lookup = remaining_bytes(&mut lookup_reader, lookup_len_total)?;
791 let minimum_lookup_bytes = entry_count.checked_mul(LOOKUP_ENTRY_BYTES)?;
792 if minimum_lookup_bytes > remaining_lookup {
793 return None;
794 }
795
796 let mut postings = HashMap::new();
797 let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
798
799 for _ in 0..entry_count {
800 let trigram = read_u32(&mut lookup_reader).ok()?;
801 let offset = read_u64(&mut lookup_reader).ok()? as usize;
802 let count = read_u32(&mut lookup_reader).ok()? as usize;
803 if count > MAX_ENTRIES {
804 return None;
805 }
806 let bytes_len = count.checked_mul(POSTING_BYTES)?;
807 let end = offset.checked_add(bytes_len)?;
808 if end > postings_blob.len() {
809 return None;
810 }
811
812 let mut trigram_postings = Vec::with_capacity(count);
813 for chunk in postings_blob[offset..end].chunks_exact(6) {
814 let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
815 let posting = Posting {
816 file_id,
817 next_mask: chunk[4],
818 loc_mask: chunk[5],
819 };
820 trigram_postings.push(posting.clone());
821 file_trigrams.entry(file_id).or_default().push(trigram);
822 }
823 postings.insert(trigram, trigram_postings);
824 }
825
826 Some(SearchIndex {
827 postings,
828 files,
829 path_to_id,
830 ready: true,
831 project_root,
832 git_head,
833 max_file_size,
834 file_trigrams,
835 unindexed_files,
836 })
837 }
838
    /// The git HEAD recorded when this index was built or loaded, if any.
    pub(crate) fn stored_git_head(&self) -> Option<&str> {
        self.git_head.as_deref()
    }
842
    /// Marks the index as ready (or not) to serve queries.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
846
847 pub(crate) fn rebuild_or_refresh(
848 root: &Path,
849 max_file_size: u64,
850 current_head: Option<String>,
851 baseline: Option<SearchIndex>,
852 ) -> Self {
853 if current_head.is_none() {
854 return SearchIndex::build_with_limit(root, max_file_size);
855 }
856
857 if let Some(mut baseline) = baseline {
858 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
859 baseline.max_file_size = max_file_size;
860
861 if baseline.git_head == current_head {
862 verify_file_mtimes(&mut baseline);
866 baseline.ready = true;
867 return baseline;
868 }
869
870 if let (Some(previous), Some(current)) =
871 (baseline.git_head.clone(), current_head.clone())
872 {
873 let project_root = baseline.project_root.clone();
874 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
875 baseline.git_head = Some(current);
876 baseline.ready = true;
877 return baseline;
878 }
879 }
880 }
881
882 SearchIndex::build_with_limit(root, max_file_size)
883 }
884
885 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
886 let file_id = u32::try_from(self.files.len()).ok()?;
887 let metadata = fs::metadata(path).ok();
888 let size = metadata
889 .as_ref()
890 .map_or(size_hint, |metadata| metadata.len());
891 let modified = metadata
892 .and_then(|metadata| metadata.modified().ok())
893 .unwrap_or(UNIX_EPOCH);
894
895 self.files.push(FileEntry {
896 path: path.to_path_buf(),
897 size,
898 modified,
899 });
900 self.path_to_id.insert(path.to_path_buf(), file_id);
901 Some(file_id)
902 }
903
904 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
905 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
906 return;
907 };
908 self.unindexed_files.insert(file_id);
909 self.file_trigrams.insert(file_id, Vec::new());
910 }
911
    /// Sorted ids of all live files (tombstoned files are absent from
    /// `path_to_id`, so they never appear here).
    fn active_file_ids(&self) -> Vec<u32> {
        let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
        ids.sort_unstable();
        ids
    }
917
918 fn is_active_file(&self, file_id: u32) -> bool {
919 self.files
920 .get(file_id as usize)
921 .map(|file| !file.path.as_os_str().is_empty())
922 .unwrap_or(false)
923 }
924
925 fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
926 let Some(postings) = self.postings.get(&trigram) else {
927 return Vec::new();
928 };
929
930 let mut matches = Vec::with_capacity(postings.len());
931
932 for posting in postings {
933 if let Some(filter) = filter {
934 if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
937 continue;
938 }
939 }
944 if self.is_active_file(posting.file_id) {
945 matches.push(posting.file_id);
946 }
947 }
948
949 matches
950 }
951}
952
953fn search_candidate_file(
954 file: &FileEntry,
955 matcher: &SearchMatcher,
956 max_results: usize,
957 stop_after: usize,
958 total_matches: &AtomicUsize,
959 files_searched: &AtomicUsize,
960 files_with_matches: &AtomicUsize,
961 truncated: &AtomicBool,
962) -> Vec<SharedGrepMatch> {
963 if should_stop_search(truncated, total_matches, stop_after) {
964 return Vec::new();
965 }
966
967 let content = match read_indexed_file_bytes(&file.path) {
968 Some(content) => content,
969 None => return Vec::new(),
970 };
971 files_searched.fetch_add(1, Ordering::Relaxed);
972
973 let shared_path = Arc::new(file.path.clone());
974 let mut matches = Vec::new();
975 let mut line_starts = None;
976 let mut seen_lines = HashSet::new();
977 let mut matched_this_file = false;
978
979 match matcher {
980 SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
981 let finder = memchr::memmem::Finder::new(needle);
982 let mut start = 0;
983
984 while let Some(position) = finder.find(&content[start..]) {
985 if should_stop_search(truncated, total_matches, stop_after) {
986 break;
987 }
988
989 let offset = start + position;
990 start = offset + 1;
991
992 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
993 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
994 if !seen_lines.insert(line) {
995 continue;
996 }
997
998 matched_this_file = true;
999 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1000 if match_number > max_results {
1001 truncated.store(true, Ordering::Relaxed);
1002 break;
1003 }
1004
1005 let end = offset + needle.len();
1006 matches.push(SharedGrepMatch {
1007 file: shared_path.clone(),
1008 line,
1009 column,
1010 line_text,
1011 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1012 });
1013 }
1014 }
1015 SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
1016 let search_content = content.to_ascii_lowercase();
1017 let finder = memchr::memmem::Finder::new(needle);
1018 let mut start = 0;
1019
1020 while let Some(position) = finder.find(&search_content[start..]) {
1021 if should_stop_search(truncated, total_matches, stop_after) {
1022 break;
1023 }
1024
1025 let offset = start + position;
1026 start = offset + 1;
1027
1028 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1029 let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
1030 if !seen_lines.insert(line) {
1031 continue;
1032 }
1033
1034 matched_this_file = true;
1035 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1036 if match_number > max_results {
1037 truncated.store(true, Ordering::Relaxed);
1038 break;
1039 }
1040
1041 let end = offset + needle.len();
1042 matches.push(SharedGrepMatch {
1043 file: shared_path.clone(),
1044 line,
1045 column,
1046 line_text,
1047 match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
1048 });
1049 }
1050 }
1051 SearchMatcher::Regex(regex) => {
1052 for matched in regex.find_iter(&content) {
1053 if should_stop_search(truncated, total_matches, stop_after) {
1054 break;
1055 }
1056
1057 let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
1058 let (line, column, line_text) =
1059 line_details_bytes(&content, line_starts, matched.start());
1060 if !seen_lines.insert(line) {
1061 continue;
1062 }
1063
1064 matched_this_file = true;
1065 let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
1066 if match_number > max_results {
1067 truncated.store(true, Ordering::Relaxed);
1068 break;
1069 }
1070
1071 matches.push(SharedGrepMatch {
1072 file: shared_path.clone(),
1073 line,
1074 column,
1075 line_text,
1076 match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
1077 });
1078 }
1079 }
1080 }
1081
1082 if matched_this_file {
1083 files_with_matches.fetch_add(1, Ordering::Relaxed);
1084 }
1085
1086 matches
1087}
1088
/// True once the result budget is fully exhausted: the truncation flag is set
/// AND the shared match counter has reached `stop_after`. Until then, workers
/// keep scanning so `total_matches` stays meaningful past the display cap.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1096
/// Intersection of two ascending id slices, preserving ascending order.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::with_capacity(left.len().min(right.len()));
    let (mut i, mut j) = (0, 0);

    while let (Some(&l), Some(&r)) = (left.get(i), right.get(j)) {
        if l < r {
            i += 1;
        } else if l > r {
            j += 1;
        } else {
            common.push(l);
            i += 1;
            j += 1;
        }
    }

    common
}
1116
/// Deduplicating merge (set union) of two ascending id slices.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut i, mut j) = (0, 0);

    while i < left.len() && j < right.len() {
        let (l, r) = (left[i], right[j]);
        merged.push(l.min(r));
        // Advance whichever side(s) supplied the smaller value; on a tie both
        // advance so the duplicate collapses.
        if l <= r {
            i += 1;
        }
        if r <= l {
            j += 1;
        }
    }

    merged.extend_from_slice(&left[i..]);
    merged.extend_from_slice(&right[j..]);
    merged
}
1144
1145pub fn decompose_regex(pattern: &str) -> RegexQuery {
1146 let hir = match regex_syntax::parse(pattern) {
1147 Ok(hir) => hir,
1148 Err(_) => return RegexQuery::default(),
1149 };
1150
1151 let build = build_query(&hir);
1152 build.into_query()
1153}
1154
/// Pack three bytes into a single u32 key: `a` in bits 16-23, `b` in 8-15,
/// `c` in 0-7 (big-endian byte order with a zero high byte).
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1158
/// Case-fold a byte for indexing: ASCII uppercase maps to lowercase,
/// everything else passes through untouched.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        c + (b'a' - b'A')
    } else {
        c
    }
}
1162
1163pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1164 if content.len() < 3 {
1165 return Vec::new();
1166 }
1167
1168 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1169 for start in 0..=content.len() - 3 {
1170 let trigram = pack_trigram(
1171 normalize_char(content[start]),
1172 normalize_char(content[start + 1]),
1173 normalize_char(content[start + 2]),
1174 );
1175 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1176 trigrams.push((trigram, next_char, start));
1177 }
1178 trigrams
1179}
1180
1181pub fn resolve_cache_dir(project_root: &Path, storage_dir: Option<&Path>) -> PathBuf {
1182 if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
1184 return PathBuf::from(override_dir)
1185 .join("index")
1186 .join(project_cache_key(project_root));
1187 }
1188 if let Some(dir) = storage_dir {
1190 return dir.join("index").join(project_cache_key(project_root));
1191 }
1192 let home = std::env::var_os("HOME")
1194 .map(PathBuf::from)
1195 .unwrap_or_else(|| PathBuf::from("."));
1196 home.join(".cache")
1197 .join("aft")
1198 .join("index")
1199 .join(project_cache_key(project_root))
1200}
1201
1202pub(crate) fn build_path_filters(
1203 include: &[String],
1204 exclude: &[String],
1205) -> Result<PathFilters, String> {
1206 Ok(PathFilters {
1207 includes: build_globset(include)?,
1208 excludes: build_globset(exclude)?,
1209 })
1210}
1211
/// Walk all indexable files under `root`, using `root` both as the anchor for
/// relative glob matching and as the directory to traverse.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
1215
1216pub(crate) fn walk_project_files_from(
1217 filter_root: &Path,
1218 search_root: &Path,
1219 filters: &PathFilters,
1220) -> Vec<PathBuf> {
1221 let mut builder = WalkBuilder::new(search_root);
1222 builder
1223 .hidden(false)
1224 .git_ignore(true)
1225 .git_global(true)
1226 .git_exclude(true)
1227 .filter_entry(|entry| {
1228 let name = entry.file_name().to_string_lossy();
1229 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1230 return !matches!(
1231 name.as_ref(),
1232 "node_modules"
1233 | "target"
1234 | "venv"
1235 | ".venv"
1236 | ".git"
1237 | "__pycache__"
1238 | ".tox"
1239 | "dist"
1240 | "build"
1241 );
1242 }
1243 true
1244 });
1245
1246 let mut files = Vec::new();
1247 for entry in builder.build().filter_map(|entry| entry.ok()) {
1248 if !entry
1249 .file_type()
1250 .map_or(false, |file_type| file_type.is_file())
1251 {
1252 continue;
1253 }
1254 let path = entry.into_path();
1255 if filters.matches(filter_root, &path) {
1256 files.push(path);
1257 }
1258 }
1259
1260 sort_paths_by_mtime_desc(&mut files);
1261 files
1262}
1263
1264pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1265 let bytes = fs::read(path).ok()?;
1266 if is_binary_bytes(&bytes) {
1267 return None;
1268 }
1269 String::from_utf8(bytes).ok()
1270}
1271
/// Best-effort raw read of a file that is (or will be) in the index;
/// any I/O error is mapped to `None`.
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    match fs::read(path) {
        Ok(bytes) => Some(bytes),
        Err(_) => None,
    }
}
1275
/// Express `path` relative to `root`; paths outside `root` are returned
/// unchanged (still absolute).
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1281
1282pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1283 paths.sort_by(|left, right| {
1284 path_modified_time(right)
1285 .cmp(&path_modified_time(left))
1286 .then_with(|| left.cmp(right))
1287 });
1288}
1289
1290pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1291 matches.sort_by(|left, right| {
1292 let left_path = resolve_match_path(project_root, &left.file);
1293 let right_path = resolve_match_path(project_root, &right.file);
1294
1295 path_modified_time(&right_path)
1296 .cmp(&path_modified_time(&left_path))
1297 .then_with(|| left.file.cmp(&right.file))
1298 .then_with(|| left.line.cmp(&right.line))
1299 .then_with(|| left.column.cmp(&right.column))
1300 });
1301}
1302
/// Sort matches newest-file-first, using `modified_for_path` to supply each
/// file's mtime (per the name, callers presumably pass a cached lookup —
/// the closure is invoked twice per comparison, so an uncached lookup would
/// be expensive). Ties break on path, then line, then column, ascending.
fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
    matches: &mut [SharedGrepMatch],
    modified_for_path: F,
) where
    F: Fn(&Path) -> Option<SystemTime>,
{
    matches.sort_by(|left, right| {
        // Comparing right-vs-left gives descending mtime order; `None`
        // (unknown mtime) sorts last because `None < Some` for Option.
        modified_for_path(right.file.as_path())
            .cmp(&modified_for_path(left.file.as_path()))
            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
            .then_with(|| left.line.cmp(&right.line))
            .then_with(|| left.column.cmp(&right.column))
    });
}
1317
1318pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1319 let resolved_project_root = canonicalize_or_normalize(project_root);
1320 let root = match path {
1321 Some(path) => {
1322 let path = PathBuf::from(path);
1323 if path.is_absolute() {
1324 canonicalize_or_normalize(&path)
1325 } else {
1326 normalize_path(&resolved_project_root.join(path))
1327 }
1328 }
1329 None => resolved_project_root.clone(),
1330 };
1331
1332 let use_index = is_within_search_root(&resolved_project_root, &root);
1333 SearchScope { root, use_index }
1334}
1335
/// Heuristic binary-content check, delegated to the `content_inspector` crate.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
1339
/// The current HEAD commit hash of the repository at `root`, or `None`
/// when `root` is not a git checkout (or git is unavailable).
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
1343
1344pub(crate) fn project_cache_key(project_root: &Path) -> String {
1345 use sha2::{Digest, Sha256};
1346
1347 let mut hasher = Sha256::new();
1348
1349 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1350 hasher.update(root_commit.as_bytes());
1353 } else {
1354 let canonical_root = canonicalize_or_normalize(project_root);
1356 hasher.update(canonical_root.to_string_lossy().as_bytes());
1357 }
1358
1359 let digest = format!("{:x}", hasher.finalize());
1360 digest[..16].to_string()
1361}
1362
1363impl PathFilters {
1364 fn matches(&self, root: &Path, path: &Path) -> bool {
1365 let relative = to_glob_path(&relative_to_root(root, path));
1366 if self
1367 .includes
1368 .as_ref()
1369 .is_some_and(|includes| !includes.is_match(&relative))
1370 {
1371 return false;
1372 }
1373 if self
1374 .excludes
1375 .as_ref()
1376 .is_some_and(|excludes| excludes.is_match(&relative))
1377 {
1378 return false;
1379 }
1380 true
1381 }
1382}
1383
1384fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1385 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1386}
1387
/// Make a match path absolute: already-absolute paths pass through,
/// relative ones are anchored at the project root.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    project_root.join(path)
}
1395
/// Modification time of `path`, or `None` when the file cannot be stat'ed
/// or the platform does not report mtimes.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1401
/// Lexically normalize a path: drop `.` components and resolve `..` against
/// preceding normal components.
///
/// Fixes two defects in the previous `PathBuf::pop`-based version:
/// - an unresolvable `..` that had been kept (e.g. leading `..`) was itself
///   popped by the *next* `..`, so `../a/../..` wrongly collapsed to `""`
///   instead of `../..`;
/// - `..` directly under the root produced `/..` instead of `/` (the parent
///   of the root is the root).
/// Only `Normal` components are ever popped; retained `..`s accumulate.
fn normalize_path(path: &Path) -> PathBuf {
    let mut parts: Vec<Component> = Vec::new();
    for component in path.components() {
        match component {
            Component::CurDir => {}
            Component::ParentDir => match parts.last() {
                // `..` cancels a preceding normal component.
                Some(Component::Normal(_)) => {
                    parts.pop();
                }
                // `..` at the root (or after a prefix) is a no-op.
                Some(Component::RootDir) | Some(Component::Prefix(_)) => {}
                // Nothing to cancel: keep the `..` (relative path escape).
                _ => parts.push(component),
            },
            other => parts.push(other),
        }
    }
    parts.into_iter().collect()
}
1417
1418fn verify_file_mtimes(index: &mut SearchIndex) {
1421 let mut stale_paths = Vec::new();
1423 for entry in &index.files {
1424 if entry.path.as_os_str().is_empty() {
1425 continue; }
1427 match fs::metadata(&entry.path) {
1428 Ok(meta) => {
1429 let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1430 if current_mtime != entry.modified || meta.len() != entry.size {
1431 stale_paths.push(entry.path.clone());
1432 }
1433 }
1434 Err(_) => {
1435 stale_paths.push(entry.path.clone());
1437 }
1438 }
1439 }
1440
1441 for path in &stale_paths {
1443 index.update_file(path);
1444 }
1445
1446 let filters = PathFilters::default();
1448 for path in walk_project_files(&index.project_root, &filters) {
1449 if !index.path_to_id.contains_key(&path) {
1450 index.update_file(&path);
1451 }
1452 }
1453
1454 if !stale_paths.is_empty() {
1455 log::info!(
1456 "[aft] search index: refreshed {} stale file(s) from disk cache",
1457 stale_paths.len()
1458 );
1459 }
1460}
1461
/// True when `path` equals `search_root` or lies beneath it. Comparison is
/// by whole path components, so `/ab` is not "within" `/a`.
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.ancestors().any(|ancestor| ancestor == search_root)
}
1465
1466impl QueryBuild {
1467 fn into_query(self) -> RegexQuery {
1468 let mut query = RegexQuery::default();
1469
1470 for run in self.and_runs {
1471 add_run_to_and_query(&mut query, &run);
1472 }
1473
1474 for group in self.or_groups {
1475 let mut trigrams = BTreeSet::new();
1476 let mut filters = HashMap::new();
1477 for run in group {
1478 for (trigram, filter) in trigram_filters(&run) {
1479 trigrams.insert(trigram);
1480 merge_filter(filters.entry(trigram).or_default(), filter);
1481 }
1482 }
1483 if !trigrams.is_empty() {
1484 query.or_groups.push(trigrams.into_iter().collect());
1485 query.or_filters.push(filters);
1486 }
1487 }
1488
1489 query
1490 }
1491}
1492
1493fn build_query(hir: &Hir) -> QueryBuild {
1494 match hir.kind() {
1495 HirKind::Literal(literal) => {
1496 if literal.0.len() >= 3 {
1497 QueryBuild {
1498 and_runs: vec![literal.0.to_vec()],
1499 or_groups: Vec::new(),
1500 }
1501 } else {
1502 QueryBuild::default()
1503 }
1504 }
1505 HirKind::Capture(capture) => build_query(&capture.sub),
1506 HirKind::Concat(parts) => {
1507 let mut build = QueryBuild::default();
1508 for part in parts {
1509 let part_build = build_query(part);
1510 build.and_runs.extend(part_build.and_runs);
1511 build.or_groups.extend(part_build.or_groups);
1512 }
1513 build
1514 }
1515 HirKind::Alternation(parts) => {
1516 let mut group = Vec::new();
1517 for part in parts {
1518 let Some(mut choices) = guaranteed_run_choices(part) else {
1519 return QueryBuild::default();
1520 };
1521 group.append(&mut choices);
1522 }
1523 if group.is_empty() {
1524 QueryBuild::default()
1525 } else {
1526 QueryBuild {
1527 and_runs: Vec::new(),
1528 or_groups: vec![group],
1529 }
1530 }
1531 }
1532 HirKind::Repetition(repetition) => {
1533 if repetition.min == 0 {
1534 QueryBuild::default()
1535 } else {
1536 build_query(&repetition.sub)
1537 }
1538 }
1539 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
1540 }
1541}
1542
1543fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
1544 match hir.kind() {
1545 HirKind::Literal(literal) => {
1546 if literal.0.len() >= 3 {
1547 Some(vec![literal.0.to_vec()])
1548 } else {
1549 None
1550 }
1551 }
1552 HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
1553 HirKind::Concat(parts) => {
1554 let mut runs = Vec::new();
1555 for part in parts {
1556 if let Some(mut part_runs) = guaranteed_run_choices(part) {
1557 runs.append(&mut part_runs);
1558 }
1559 }
1560 if runs.is_empty() {
1561 None
1562 } else {
1563 Some(runs)
1564 }
1565 }
1566 HirKind::Alternation(parts) => {
1567 let mut runs = Vec::new();
1568 for part in parts {
1569 let Some(mut part_runs) = guaranteed_run_choices(part) else {
1570 return None;
1571 };
1572 runs.append(&mut part_runs);
1573 }
1574 if runs.is_empty() {
1575 None
1576 } else {
1577 Some(runs)
1578 }
1579 }
1580 HirKind::Repetition(repetition) => {
1581 if repetition.min == 0 {
1582 None
1583 } else {
1584 guaranteed_run_choices(&repetition.sub)
1585 }
1586 }
1587 HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
1588 }
1589}
1590
1591fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1592 for (trigram, filter) in trigram_filters(run) {
1593 if !query.and_trigrams.contains(&trigram) {
1594 query.and_trigrams.push(trigram);
1595 }
1596 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1597 }
1598}
1599
1600fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1601 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1602 for (trigram, next_char, position) in extract_trigrams(run) {
1603 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1604 if next_char != EOF_SENTINEL {
1605 entry.next_mask |= mask_for_next_char(next_char);
1606 }
1607 entry.loc_mask |= mask_for_position(position);
1608 }
1609 filters.into_iter().collect()
1610}
1611
/// OR `filter`'s bloom bits into `target`, widening the set of postings the
/// target filter will accept.
fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
    target.next_mask |= filter.next_mask;
    target.loc_mask |= filter.loc_mask;
}
1616
1617fn mask_for_next_char(next_char: u8) -> u8 {
1618 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1619 1u8 << bit
1620}
1621
/// One-of-eight bloom bit for a trigram's byte offset, taken modulo 8.
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
1625
1626fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1627 if patterns.is_empty() {
1628 return Ok(None);
1629 }
1630
1631 let mut builder = GlobSetBuilder::new();
1632 for pattern in patterns {
1633 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1634 builder.add(glob);
1635 }
1636 builder.build().map(Some).map_err(|error| error.to_string())
1637}
1638
/// Read a little-endian u32; errors if fewer than 4 bytes remain.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut bytes = [0u8; 4];
    reader
        .read_exact(&mut bytes)
        .map(|()| u32::from_le_bytes(bytes))
}
1644
/// Read a little-endian u64; errors if fewer than 8 bytes remain.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut bytes = [0u8; 8];
    reader
        .read_exact(&mut bytes)
        .map(|()| u64::from_le_bytes(bytes))
}
1650
/// Write a u32 in little-endian byte order.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1654
/// Write a u64 in little-endian byte order.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1658
/// Bytes left between the reader's current position and `total_len`.
/// `None` when the position cannot be read, does not fit a usize, or is
/// already past `total_len`.
fn remaining_bytes<R: Seek>(reader: &mut R, total_len: usize) -> Option<usize> {
    let position = reader.stream_position().ok()?;
    total_len.checked_sub(usize::try_from(position).ok()?)
}
1663
/// Run `git -C <root> <args>` and return trimmed stdout. Any failure —
/// git missing, non-zero exit, non-UTF-8 or empty output — yields `None`.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
1682
1683fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1684 let diff_range = format!("{}..{}", from, to);
1685 let output = match Command::new("git")
1686 .arg("-C")
1687 .arg(root)
1688 .args(["diff", "--name-only", &diff_range])
1689 .output()
1690 {
1691 Ok(output) => output,
1692 Err(_) => return false,
1693 };
1694
1695 if !output.status.success() {
1696 return false;
1697 }
1698
1699 let Ok(paths) = String::from_utf8(output.stdout) else {
1700 return false;
1701 };
1702
1703 for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1704 let path = root.join(relative_path);
1705 if path.exists() {
1706 index.update_file(&path);
1707 } else {
1708 index.remove_file(&path);
1709 }
1710 }
1711
1712 true
1713}
1714
1715fn is_binary_path(path: &Path, size: u64) -> bool {
1716 if size == 0 {
1717 return false;
1718 }
1719
1720 let mut file = match File::open(path) {
1721 Ok(file) => file,
1722 Err(_) => return true,
1723 };
1724
1725 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1726 match file.read(&mut preview) {
1727 Ok(read) => is_binary_bytes(&preview[..read]),
1728 Err(_) => true,
1729 }
1730}
1731
/// Byte offsets at which lines begin: always offset 0, then the offset
/// immediately after every `\n` (so a trailing newline yields a final,
/// possibly empty, line start).
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .iter()
                .enumerate()
                .filter(|&(_, &byte)| byte == b'\n')
                .map(|(index, _)| index + 1),
        )
        .collect()
}
1741
/// Locate the line containing byte `offset` and return its 1-based line
/// number, 1-based character column, and the line's text (CR stripped,
/// lossily decoded as UTF-8).
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // The containing line is the last start <= offset: an exact hit is that
    // line, otherwise binary_search reports the insertion point just past it.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    let line_end = content[line_start..]
        .iter()
        .position(|&byte| byte == b'\n')
        .map_or(content.len(), |length| line_start + length);

    // Drop a trailing CR so CRLF files render cleanly.
    let raw_line = &content[line_start..line_end];
    let line_slice = raw_line.strip_suffix(b"\r").unwrap_or(raw_line);

    // Column counts decoded characters (not bytes) up to the offset.
    let column = String::from_utf8_lossy(&content[line_start..offset])
        .chars()
        .count() as u32
        + 1;

    (
        line_index as u32 + 1,
        column,
        String::from_utf8_lossy(line_slice).into_owned(),
    )
}
1764
/// Render a path for glob matching: lossy UTF-8 with every backslash
/// rewritten to a forward slash, so patterns behave the same on Windows.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}
1768
// Unit tests covering trigram extraction, regex decomposition, candidate
// selection, disk round-tripping, cache keying, and grep/glob scoping.
#[cfg(test)]
mod tests {
    use std::process::Command;

    use super::*;

    // The trigram tuple is (packed folded trigram, following byte, offset);
    // the last window reports EOF_SENTINEL as its next byte.
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }

    // Required literals land in and_trigrams; the alternation's branches
    // form a single OR group.
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }

    // Both files contain "abc", but only alpha also contains "def" — the
    // AND of the two posting lists must keep alpha alone.
    #[test]
    fn candidates_intersect_posting_lists() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }

    // A filter whose next-char bit matches the indexed content keeps the
    // posting; a filter demanding a byte that never follows "abc" drops it.
    #[test]
    fn candidates_apply_bloom_filters() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }

    // Serialize to a cache dir and reload: git head, file table, and
    // posting map must all survive the round trip.
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }

    // After a write, only the final files should remain — the .tmp
    // staging files must have been renamed or removed.
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("postings.bin").is_file());
        assert!(cache_dir.join("lookup.bin").is_file());
        assert!(!cache_dir.join("postings.bin.tmp").exists());
        assert!(!cache_dir.join("lookup.bin.tmp").exists());
    }

    // Two checkouts of the same repository (original and clone) share the
    // same root commit, so they must produce the same cache key.
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        assert_eq!(source_key, clone_key);
    }

    // A search path outside the project root must disable index use.
    #[test]
    fn resolve_search_scope_disables_index_for_external_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let outside = dir.path().join("outside");
        fs::create_dir_all(&project).expect("create project dir");
        fs::create_dir_all(&outside).expect("create outside dir");

        let scope = resolve_search_scope(&project, outside.to_str());

        assert_eq!(
            scope.root,
            fs::canonicalize(&outside).expect("canonicalize outside")
        );
        assert!(!scope.use_index);
    }

    // Both src/ and docs/ contain the needle, but searching scoped to src/
    // must only see the src/ hit.
    #[test]
    fn grep_filters_matches_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let docs = project.join("docs");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&docs).expect("create docs dir");
        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.files_searched, 1);
        assert_eq!(result.files_with_matches, 1);
        assert_eq!(result.matches.len(), 1);
        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
        assert_eq!(result.matches[0].file, expected);
    }

    // Two occurrences on one line still count as a single line match.
    #[test]
    fn grep_deduplicates_multiple_matches_on_same_line() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.total_matches, 1);
        assert_eq!(result.matches.len(), 1);
    }

    // With max_results = 1, the full count (2) is still reported and the
    // result is flagged as truncated.
    #[test]
    fn grep_reports_total_matches_before_truncation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);

        assert_eq!(result.total_matches, 2);
        assert_eq!(result.matches.len(), 1);
        assert!(result.truncated);
    }

    // A glob scoped to src/ must not surface the matching file in scripts/.
    #[test]
    fn glob_filters_results_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let scripts = project.join("scripts");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&scripts).expect("create scripts dir");
        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.rs", &src);

        assert_eq!(
            files,
            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
        );
    }

    // Glob matching is path-based only: hidden directories and binary
    // content must not be filtered out.
    #[test]
    fn glob_includes_hidden_and_binary_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let hidden_dir = project.join(".hidden");
        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
        let hidden_file = hidden_dir.join("data.bin");
        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.bin", &project);

        assert_eq!(
            files,
            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
        );
    }

    // A hand-crafted postings file whose mtime nanos field is out of range
    // (>= 1e9) must be rejected rather than parsed into a bogus SystemTime.
    #[test]
    fn read_from_disk_rejects_invalid_nanos() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let cache_dir = dir.path().join("cache");
        fs::create_dir_all(&cache_dir).expect("create cache dir");

        let mut postings = Vec::new();
        postings.extend_from_slice(INDEX_MAGIC);
        postings.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        postings.extend_from_slice(&0u32.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&DEFAULT_MAX_FILE_SIZE.to_le_bytes());
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(b"/");
        postings.push(0u8);
        postings.extend_from_slice(&1u32.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        postings.extend_from_slice(&0u64.to_le_bytes());
        postings.extend_from_slice(&1_000_000_000u32.to_le_bytes());
        postings.extend_from_slice(b"a");
        postings.extend_from_slice(&0u64.to_le_bytes());

        let mut lookup = Vec::new();
        lookup.extend_from_slice(LOOKUP_MAGIC);
        lookup.extend_from_slice(&INDEX_VERSION.to_le_bytes());
        lookup.extend_from_slice(&0u32.to_le_bytes());

        fs::write(cache_dir.join("postings.bin"), postings).expect("write postings");
        fs::write(cache_dir.join("lookup.bin"), lookup).expect("write lookup");

        assert!(SearchIndex::read_from_disk(&cache_dir).is_none());
    }
}