1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::sync::{
7 atomic::{AtomicBool, AtomicUsize, Ordering},
8 Arc,
9};
10use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
12use globset::{Glob, GlobSet, GlobSetBuilder};
13use ignore::WalkBuilder;
14use rayon::prelude::*;
15use regex::bytes::{Regex, RegexBuilder};
16use regex_syntax::hir::{Hir, HirKind};
17
// Default cap on the size of files that get trigram-indexed (1 MiB);
// larger files are tracked as "unindexed" and always scanned directly.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
// Magic header identifying the on-disk postings cache file.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
// Magic header identifying the on-disk trigram lookup cache file.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
// On-disk cache format version; mismatched caches are discarded on load.
const INDEX_VERSION: u32 = 1;
// Preview window size (8 KiB); consumer not visible in this chunk.
const PREVIEW_BYTES: usize = 8 * 1024;
// "Next char" value recorded for a trigram that ends the file content.
const EOF_SENTINEL: u8 = 0;
24
/// In-memory trigram search index over a project tree.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// trigram -> postings (one per containing file), kept sorted by file id.
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table; the slot index is the file id. Removed files keep their
    /// slot with a cleared path — ids are never reused.
    pub files: Vec<FileEntry>,
    /// Reverse map from absolute path to file id; only active files appear.
    pub path_to_id: HashMap<PathBuf, u32>,
    /// True once the index is fully built and safe to query.
    pub ready: bool,
    // Canonicalized project root; cached paths are stored relative to it.
    project_root: PathBuf,
    // Git HEAD at build time; used to validate/refresh cached indexes.
    git_head: Option<String>,
    // Files larger than this are tracked but not trigram-indexed.
    max_file_size: u64,
    // file id -> trigrams it contributed, enabling cheap removal.
    file_trigrams: HashMap<u32, Vec<u32>>,
    // Ids of tracked-but-oversized files; always treated as candidates.
    unindexed_files: HashSet<u32>,
}
37
/// One file's entry in a trigram posting list.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Index into `SearchIndex::files`.
    pub file_id: u32,
    /// OR of masks for every byte seen immediately after this trigram.
    pub next_mask: u8,
    /// OR of masks derived from every position the trigram occurs at.
    pub loc_mask: u8,
}
44
/// Metadata snapshot of one tracked file, captured at index time.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Absolute path; cleared to empty when the file is removed.
    pub path: PathBuf,
    /// Size in bytes when indexed.
    pub size: u64,
    /// Last-modified time when indexed (`UNIX_EPOCH` when unknown).
    pub modified: SystemTime,
}
51
/// A single reported match (at most one per line of a file).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// File containing the match.
    pub file: PathBuf,
    /// Line of the match (numbering produced by `line_details_bytes`,
    /// which is not visible in this chunk).
    pub line: u32,
    /// Column of the match within that line.
    pub column: u32,
    /// Full text of the matching line.
    pub line_text: String,
    /// The matched substring itself (lossy UTF-8).
    pub match_text: String,
}
60
/// Aggregate outcome of a single grep run.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Reported matches, sorted newest file first.
    pub matches: Vec<GrepMatch>,
    /// Total matches counted (can exceed `matches.len()` when truncated).
    pub total_matches: usize,
    /// Number of files actually read and scanned.
    pub files_searched: usize,
    /// Number of scanned files with at least one match.
    pub files_with_matches: usize,
    /// Index readiness at the time the search ran.
    pub index_status: IndexStatus,
    /// True when the result cap stopped the search early.
    pub truncated: bool,
}
70
/// Readiness of the trigram index when a search was served.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    /// Index fully built; candidate pruning was in effect.
    Ready,
    /// Index still being built; results may be incomplete.
    Building,
    /// Index bypassed (produced elsewhere; not constructed in this chunk).
    Fallback,
}
77
78impl IndexStatus {
79 pub fn as_str(&self) -> &'static str {
80 match self {
81 IndexStatus::Ready => "Ready",
82 IndexStatus::Building => "Building",
83 IndexStatus::Fallback => "Fallback",
84 }
85 }
86}
87
/// Trigram-level approximation of a regex: a matching file must contain
/// every trigram in `and_trigrams`, and for each group in `or_groups` at
/// least one of that group's trigrams.
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Trigrams that must all be present.
    pub and_trigrams: Vec<u32>,
    /// Alternative groups; one trigram per group must be present.
    pub or_groups: Vec<Vec<u32>>,
    /// Optional posting filters keyed by AND trigram.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    /// Posting filters per OR group, parallel to `or_groups`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
95
/// Mask constraints a posting must satisfy to remain a candidate.
/// A zero mask means "unconstrained" for that dimension.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    // Required next-character mask bits (0 = no constraint).
    next_mask: u8,
    // Required location mask bits. NOTE(review): not consulted in the
    // `postings_for_trigram` path visible in this chunk — confirm usage.
    loc_mask: u8,
}
101
/// Intermediate state while decomposing a regex HIR into a `RegexQuery`;
/// holds raw literal byte runs before trigram packing.
/// NOTE(review): populated by `build_query`, which is not in this chunk.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    // Byte runs that must all appear (AND semantics).
    and_runs: Vec<Vec<u8>>,
    // Groups of alternative byte runs (OR semantics within each group).
    or_groups: Vec<Vec<Vec<u8>>>,
}
107
/// Compiled include/exclude glob sets; `None` means "no filtering" on
/// that side.
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    // Paths must match one of these globs (when present).
    includes: Option<GlobSet>,
    // Paths matching any of these globs are rejected (when present).
    excludes: Option<GlobSet>,
}
113
/// Where a search runs and whether it may consult the trigram index.
/// NOTE(review): no consumer is visible in this chunk.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    /// Directory the search is rooted at.
    pub root: PathBuf,
    /// When false, searches should bypass the index.
    pub use_index: bool,
}
119
/// Internal match record sharing one `Arc<PathBuf>` per file so each match
/// line clones a refcount instead of the whole path.
#[derive(Clone, Debug)]
struct SharedGrepMatch {
    // Shared path of the containing file.
    file: Arc<PathBuf>,
    line: u32,
    column: u32,
    line_text: String,
    match_text: String,
}
128
/// Per-query search strategy: fast literal substring scan, or a compiled
/// byte-oriented regex for everything else.
#[derive(Clone, Debug)]
enum SearchMatcher {
    Literal(LiteralSearch),
    Regex(Regex),
}
134
/// Literal needle variants. The case-insensitive form stores the needle
/// pre-lowercased and is matched against a lowercased copy of the haystack
/// (ASCII-only folding, so byte offsets stay valid).
#[derive(Clone, Debug)]
enum LiteralSearch {
    CaseSensitive(Vec<u8>),
    AsciiCaseInsensitive(Vec<u8>),
}
140
141impl SearchIndex {
142 pub fn new() -> Self {
143 SearchIndex {
144 postings: HashMap::new(),
145 files: Vec::new(),
146 path_to_id: HashMap::new(),
147 ready: false,
148 project_root: PathBuf::new(),
149 git_head: None,
150 max_file_size: DEFAULT_MAX_FILE_SIZE,
151 file_trigrams: HashMap::new(),
152 unindexed_files: HashSet::new(),
153 }
154 }
155
156 pub fn build(root: &Path) -> Self {
157 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
158 }
159
    /// Builds a fresh index rooted at `root`, skipping trigram indexing for
    /// files larger than `max_file_size`.
    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
        // Canonicalize so all stored paths share one absolute form; fall back
        // to the caller's path when canonicalization fails.
        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
        let mut index = SearchIndex {
            project_root: project_root.clone(),
            max_file_size,
            ..SearchIndex::new()
        };

        // No include/exclude filtering at build time: index everything the
        // walker yields.
        let filters = PathFilters::default();
        for path in walk_project_files(&project_root, &filters) {
            index.update_file(&path);
        }

        // Record the git HEAD so cached indexes can be validated later.
        index.git_head = current_git_head(&project_root);
        index.ready = true;
        index
    }
177
    /// (Re)indexes `path` from in-memory `content`: registers a file entry
    /// and adds one posting per distinct trigram, carrying OR-ed next-char
    /// and location masks used later to prune candidates.
    pub fn index_file(&mut self, path: &Path, content: &[u8]) {
        // Drop any stale postings for this path before re-adding.
        self.remove_file(path);

        let file_id = match self.allocate_file_id(path, content.len() as u64) {
            Some(file_id) => file_id,
            None => return, // u32 file-id space exhausted
        };

        // Aggregate masks per trigram: OR together the masks of every
        // occurrence in this file. BTreeMap gives deterministic order.
        let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
        for (trigram, next_char, position) in extract_trigrams(content) {
            let entry = trigram_map.entry(trigram).or_default();
            entry.next_mask |= mask_for_next_char(next_char);
            entry.loc_mask |= mask_for_position(position);
        }

        let mut file_trigrams = Vec::with_capacity(trigram_map.len());
        for (trigram, filter) in trigram_map {
            let postings = self.postings.entry(trigram).or_default();
            postings.push(Posting {
                file_id,
                next_mask: filter.next_mask,
                loc_mask: filter.loc_mask,
            });
            // Posting lists must stay sorted by file_id for the sorted-merge
            // set operations; re-sort only when this push broke the order.
            if postings.len() > 1
                && postings[postings.len() - 2].file_id > postings[postings.len() - 1].file_id
            {
                postings.sort_unstable_by_key(|p| p.file_id);
            }
            file_trigrams.push(trigram);
        }

        // Remember which trigrams this file touched, for cheap removal.
        self.file_trigrams.insert(file_id, file_trigrams);
        self.unindexed_files.remove(&file_id);
    }
215
    /// Removes `path` from the index: deletes its postings, forgets its id
    /// mapping, and neutralizes its file slot. No-op when not indexed.
    pub fn remove_file(&mut self, path: &Path) {
        let Some(file_id) = self.path_to_id.remove(path) else {
            return;
        };

        // Use the recorded per-file trigram list so only posting lists this
        // file actually appears in are touched.
        if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
            for trigram in trigrams {
                let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
                    postings.retain(|posting| posting.file_id != file_id);
                    postings.is_empty()
                } else {
                    false
                };

                // Drop posting lists that became empty.
                if should_remove {
                    self.postings.remove(&trigram);
                }
            }
        }

        self.unindexed_files.remove(&file_id);
        // The slot itself is retained (ids are never reused); an empty path
        // marks the entry inactive.
        if let Some(file) = self.files.get_mut(file_id as usize) {
            file.path = PathBuf::new();
            file.size = 0;
            file.modified = UNIX_EPOCH;
        }
    }
243
    /// Re-reads `path` from disk and re-indexes it. Any stale entry is
    /// removed first; missing/non-regular/binary paths are then skipped, and
    /// oversized files are tracked as unindexed instead of trigram-indexed.
    pub fn update_file(&mut self, path: &Path) {
        self.remove_file(path);

        let metadata = match fs::metadata(path) {
            Ok(metadata) if metadata.is_file() => metadata,
            _ => return,
        };

        // Path-level binary screen (takes path + size; implementation is not
        // visible in this chunk) — avoids reading obviously-binary files.
        if is_binary_path(path, metadata.len()) {
            return;
        }

        if metadata.len() > self.max_file_size {
            // Too large to trigram-index, but still tracked so searches can
            // fall back to scanning it directly.
            self.track_unindexed_file(path, &metadata);
            return;
        }

        let content = match fs::read(path) {
            Ok(content) => content,
            Err(_) => return,
        };

        // Content-based sniff catches binaries the path check missed.
        if is_binary_bytes(&content) {
            return;
        }

        self.index_file(path, &content);
    }
272
273 pub fn grep(
274 &self,
275 pattern: &str,
276 case_sensitive: bool,
277 include: &[String],
278 exclude: &[String],
279 search_root: &Path,
280 max_results: usize,
281 ) -> GrepResult {
282 self.search_grep(
283 pattern,
284 case_sensitive,
285 include,
286 exclude,
287 search_root,
288 max_results,
289 )
290 }
291
292 pub fn search_grep(
293 &self,
294 pattern: &str,
295 case_sensitive: bool,
296 include: &[String],
297 exclude: &[String],
298 search_root: &Path,
299 max_results: usize,
300 ) -> GrepResult {
301 let is_literal = !pattern.chars().any(|c| {
304 matches!(
305 c,
306 '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
307 )
308 });
309
310 let literal_search = if is_literal {
311 if case_sensitive {
312 Some(LiteralSearch::CaseSensitive(pattern.as_bytes().to_vec()))
313 } else if pattern.is_ascii() {
314 Some(LiteralSearch::AsciiCaseInsensitive(
315 pattern
316 .as_bytes()
317 .iter()
318 .map(|byte| byte.to_ascii_lowercase())
319 .collect(),
320 ))
321 } else {
322 None
323 }
324 } else {
325 None
326 };
327
328 let regex = if literal_search.is_some() {
330 None
331 } else {
332 let regex_pattern = if is_literal {
333 regex::escape(pattern)
334 } else {
335 pattern.to_string()
336 };
337 let mut builder = RegexBuilder::new(®ex_pattern);
338 builder.case_insensitive(!case_sensitive);
339 match builder.build() {
340 Ok(r) => Some(r),
341 Err(_) => {
342 return GrepResult {
343 matches: Vec::new(),
344 total_matches: 0,
345 files_searched: 0,
346 files_with_matches: 0,
347 index_status: if self.ready {
348 IndexStatus::Ready
349 } else {
350 IndexStatus::Building
351 },
352 truncated: false,
353 };
354 }
355 }
356 };
357
358 let matcher = if let Some(literal_search) = literal_search {
359 SearchMatcher::Literal(literal_search)
360 } else {
361 SearchMatcher::Regex(
362 regex.expect("regex should exist when literal matcher is unavailable"),
363 )
364 };
365
366 let filters = match build_path_filters(include, exclude) {
367 Ok(filters) => filters,
368 Err(_) => PathFilters::default(),
369 };
370 let search_root = canonicalize_or_normalize(search_root);
371
372 let query = decompose_regex(pattern);
373 let candidate_ids = self.candidates(&query);
374
375 let candidate_files: Vec<&FileEntry> = candidate_ids
376 .into_iter()
377 .filter_map(|file_id| self.files.get(file_id as usize))
378 .filter(|file| !file.path.as_os_str().is_empty())
379 .filter(|file| is_within_search_root(&search_root, &file.path))
380 .filter(|file| filters.matches(&self.project_root, &file.path))
381 .collect();
382
383 let total_matches = AtomicUsize::new(0);
384 let files_searched = AtomicUsize::new(0);
385 let files_with_matches = AtomicUsize::new(0);
386 let truncated = AtomicBool::new(false);
387 let stop_after = max_results.saturating_mul(2);
388
389 let mut matches = if candidate_files.len() > 10 {
390 candidate_files
391 .par_iter()
392 .map(|file| {
393 search_candidate_file(
394 file,
395 &matcher,
396 max_results,
397 stop_after,
398 &total_matches,
399 &files_searched,
400 &files_with_matches,
401 &truncated,
402 )
403 })
404 .reduce(Vec::new, |mut left, mut right| {
405 left.append(&mut right);
406 left
407 })
408 } else {
409 let mut matches = Vec::new();
410 for file in candidate_files {
411 matches.extend(search_candidate_file(
412 file,
413 &matcher,
414 max_results,
415 stop_after,
416 &total_matches,
417 &files_searched,
418 &files_with_matches,
419 &truncated,
420 ));
421
422 if should_stop_search(&truncated, &total_matches, stop_after) {
423 break;
424 }
425 }
426 matches
427 };
428
429 sort_shared_grep_matches_by_cached_mtime_desc(&mut matches, |path| {
430 self.path_to_id
431 .get(path)
432 .and_then(|file_id| self.files.get(*file_id as usize))
433 .map(|file| file.modified)
434 });
435
436 let matches = matches
437 .into_iter()
438 .map(|matched| GrepMatch {
439 file: matched.file.as_ref().clone(),
440 line: matched.line,
441 column: matched.column,
442 line_text: matched.line_text,
443 match_text: matched.match_text,
444 })
445 .collect();
446
447 GrepResult {
448 total_matches: total_matches.load(Ordering::Relaxed),
449 matches,
450 files_searched: files_searched.load(Ordering::Relaxed),
451 files_with_matches: files_with_matches.load(Ordering::Relaxed),
452 index_status: if self.ready {
453 IndexStatus::Ready
454 } else {
455 IndexStatus::Building
456 },
457 truncated: truncated.load(Ordering::Relaxed),
458 }
459 }
460
    /// Lists files under `search_root` matching `pattern`, newest first.
    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
        // The single pattern acts as an include filter; no excludes.
        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
            Ok(filters) => filters,
            Err(_) => return Vec::new(),
        };
        let search_root = canonicalize_or_normalize(search_root);
        // Evaluate globs relative to the project root when searching inside
        // it, so project-relative patterns behave as expected.
        let filter_root = if search_root.starts_with(&self.project_root) {
            &self.project_root
        } else {
            &search_root
        };

        let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
        sort_paths_by_mtime_desc(&mut paths);
        paths
    }
477
478 pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
479 if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
480 return self.active_file_ids();
481 }
482
483 let mut and_trigrams = query.and_trigrams.clone();
484 and_trigrams.sort_unstable_by_key(|trigram| self.postings.get(trigram).map_or(0, Vec::len));
485
486 let mut current: Option<Vec<u32>> = None;
487
488 for trigram in and_trigrams {
489 let filter = query.and_filters.get(&trigram).copied();
490 let matches = self.postings_for_trigram(trigram, filter);
491 current = Some(match current.take() {
492 Some(existing) => intersect_sorted_ids(&existing, &matches),
493 None => matches,
494 });
495
496 if current.as_ref().is_some_and(|ids| ids.is_empty()) {
497 break;
498 }
499 }
500
501 let mut current = current.unwrap_or_else(|| self.active_file_ids());
502
503 for (index, group) in query.or_groups.iter().enumerate() {
504 let mut group_matches = Vec::new();
505 let filters = query.or_filters.get(index);
506
507 for trigram in group {
508 let filter = filters.and_then(|filters| filters.get(trigram).copied());
509 let matches = self.postings_for_trigram(*trigram, filter);
510 if group_matches.is_empty() {
511 group_matches = matches;
512 } else {
513 group_matches = union_sorted_ids(&group_matches, &matches);
514 }
515 }
516
517 current = intersect_sorted_ids(¤t, &group_matches);
518 if current.is_empty() {
519 break;
520 }
521 }
522
523 let mut unindexed = self
524 .unindexed_files
525 .iter()
526 .copied()
527 .filter(|file_id| self.is_active_file(*file_id))
528 .collect::<Vec<_>>();
529 if !unindexed.is_empty() {
530 unindexed.sort_unstable();
531 current = union_sorted_ids(¤t, &unindexed);
532 }
533
534 current
535 }
536
    /// Atomically persists the index to `cache_dir` as two files:
    /// `postings.bin` (header, file table, postings blob) and `lookup.bin`
    /// (trigram -> blob offset/count table). File ids are compacted onto the
    /// active files only. All failures are swallowed; temp files are removed
    /// on error.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");
        // Write to temp files first, then rename for atomic replacement.
        let tmp_postings = cache_dir.join("postings.bin.tmp");
        let tmp_lookup = cache_dir.join("lookup.bin.tmp");

        // Remap sparse in-memory ids onto a dense 0..n range of active files.
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);

            // Header: magic, version, length-prefixed head/root strings.
            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;

            // File table: one record per active file, in compacted-id order.
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                // Paths are stored root-relative so the cache survives the
                // project being moved; the root is rejoined on load.
                let path = relative_to_root(&self.project_root, &file.path);
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Serialize all postings into one blob; the lookup table records
            // each trigram's (offset, count) into it. Each posting occupies
            // 6 bytes: 4 (file id, LE) + 1 (next_mask) + 1 (loc_mask).
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Skip postings whose file is no longer active.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            drop(postings_writer);

            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            drop(lookup_writer);

            // Publish both files; rename is atomic on the same filesystem.
            fs::rename(&tmp_postings, &postings_path)?;
            fs::rename(&tmp_lookup, &lookup_path)?;

            Ok(())
        })();

        // Best-effort cleanup of temp files on any failure.
        if write_result.is_err() {
            let _ = fs::remove_file(&tmp_postings);
            let _ = fs::remove_file(&tmp_lookup);
        }
    }
667
    /// Loads a cached index from `cache_dir`, reversing `write_to_disk`.
    /// Returns `None` on any I/O error, bad magic/version, or malformed data.
    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");

        let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
        let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);

        // Magic + version guard against stale or foreign cache files.
        let mut magic = [0u8; 8];
        postings_reader.read_exact(&mut magic).ok()?;
        if &magic != INDEX_MAGIC {
            return None;
        }
        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
            return None;
        }

        let head_len = read_u32(&mut postings_reader).ok()? as usize;
        let root_len = read_u32(&mut postings_reader).ok()? as usize;
        let max_file_size = read_u64(&mut postings_reader).ok()?;
        let file_count = read_u32(&mut postings_reader).ok()? as usize;

        let mut head_bytes = vec![0u8; head_len];
        postings_reader.read_exact(&mut head_bytes).ok()?;
        // An empty head string means "no git head recorded at write time".
        let git_head = String::from_utf8(head_bytes)
            .ok()
            .filter(|head| !head.is_empty());

        let mut root_bytes = vec![0u8; root_len];
        postings_reader.read_exact(&mut root_bytes).ok()?;
        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);

        // File table: ids were compacted at write time, so record order is
        // id order.
        let mut files = Vec::with_capacity(file_count);
        let mut path_to_id = HashMap::new();
        let mut unindexed_files = HashSet::new();

        for file_id in 0..file_count {
            let mut unindexed = [0u8; 1];
            postings_reader.read_exact(&mut unindexed).ok()?;
            let path_len = read_u32(&mut postings_reader).ok()? as usize;
            let size = read_u64(&mut postings_reader).ok()?;
            let secs = read_u64(&mut postings_reader).ok()?;
            let nanos = read_u32(&mut postings_reader).ok()?;
            let mut path_bytes = vec![0u8; path_len];
            postings_reader.read_exact(&mut path_bytes).ok()?;
            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
            // Cached paths are root-relative; rejoin against the stored root.
            let full_path = project_root.join(relative_path);
            let file_id_u32 = u32::try_from(file_id).ok()?;

            files.push(FileEntry {
                path: full_path.clone(),
                size,
                modified: UNIX_EPOCH + Duration::new(secs, nanos),
            });
            path_to_id.insert(full_path, file_id_u32);
            if unindexed[0] == 1 {
                unindexed_files.insert(file_id_u32);
            }
        }

        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
        let mut postings_blob = vec![0u8; postings_len];
        postings_reader.read_exact(&mut postings_blob).ok()?;

        let mut lookup_magic = [0u8; 8];
        lookup_reader.read_exact(&mut lookup_magic).ok()?;
        if &lookup_magic != LOOKUP_MAGIC {
            return None;
        }
        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
            return None;
        }
        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;

        let mut postings = HashMap::new();
        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();

        for _ in 0..entry_count {
            let trigram = read_u32(&mut lookup_reader).ok()?;
            let offset = read_u64(&mut lookup_reader).ok()? as usize;
            let count = read_u32(&mut lookup_reader).ok()? as usize;
            // 6 bytes per posting: 4 (file id) + 1 (next) + 1 (loc); checked
            // arithmetic rejects corrupt offsets/counts.
            let bytes_len = count.checked_mul(6)?;
            let end = offset.checked_add(bytes_len)?;
            if end > postings_blob.len() {
                return None;
            }

            let mut trigram_postings = Vec::with_capacity(count);
            for chunk in postings_blob[offset..end].chunks_exact(6) {
                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
                let posting = Posting {
                    file_id,
                    next_mask: chunk[4],
                    loc_mask: chunk[5],
                };
                // NOTE(review): this clone is redundant — `posting` is not
                // used after the push; a plain move would suffice.
                trigram_postings.push(posting.clone());
                file_trigrams.entry(file_id).or_default().push(trigram);
            }
            postings.insert(trigram, trigram_postings);
        }

        Some(SearchIndex {
            postings,
            files,
            path_to_id,
            ready: true,
            project_root,
            git_head,
            max_file_size,
            file_trigrams,
            unindexed_files,
        })
    }
780
    /// Git HEAD recorded when this index was built or loaded, if any.
    pub(crate) fn stored_git_head(&self) -> Option<&str> {
        self.git_head.as_deref()
    }
784
    /// Flips the readiness flag; queries report `Building` while false.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
788
789 pub(crate) fn rebuild_or_refresh(
790 root: &Path,
791 max_file_size: u64,
792 current_head: Option<String>,
793 baseline: Option<SearchIndex>,
794 ) -> Self {
795 if current_head.is_none() {
796 return SearchIndex::build_with_limit(root, max_file_size);
797 }
798
799 if let Some(mut baseline) = baseline {
800 baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
801 baseline.max_file_size = max_file_size;
802
803 if baseline.git_head == current_head {
804 verify_file_mtimes(&mut baseline);
808 baseline.ready = true;
809 return baseline;
810 }
811
812 if let (Some(previous), Some(current)) =
813 (baseline.git_head.clone(), current_head.clone())
814 {
815 let project_root = baseline.project_root.clone();
816 if apply_git_diff_updates(&mut baseline, &project_root, &previous, ¤t) {
817 baseline.git_head = Some(current);
818 baseline.ready = true;
819 return baseline;
820 }
821 }
822 }
823
824 SearchIndex::build_with_limit(root, max_file_size)
825 }
826
827 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
828 let file_id = u32::try_from(self.files.len()).ok()?;
829 let metadata = fs::metadata(path).ok();
830 let size = metadata
831 .as_ref()
832 .map_or(size_hint, |metadata| metadata.len());
833 let modified = metadata
834 .and_then(|metadata| metadata.modified().ok())
835 .unwrap_or(UNIX_EPOCH);
836
837 self.files.push(FileEntry {
838 path: path.to_path_buf(),
839 size,
840 modified,
841 });
842 self.path_to_id.insert(path.to_path_buf(), file_id);
843 Some(file_id)
844 }
845
846 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
847 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
848 return;
849 };
850 self.unindexed_files.insert(file_id);
851 self.file_trigrams.insert(file_id, Vec::new());
852 }
853
854 fn active_file_ids(&self) -> Vec<u32> {
855 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
856 ids.sort_unstable();
857 ids
858 }
859
860 fn is_active_file(&self, file_id: u32) -> bool {
861 self.files
862 .get(file_id as usize)
863 .map(|file| !file.path.as_os_str().is_empty())
864 .unwrap_or(false)
865 }
866
    /// Returns the sorted file ids whose posting for `trigram` passes
    /// `filter` and whose file is still active.
    ///
    /// NOTE(review): only `next_mask` is consulted here; the filter's
    /// `loc_mask` is never read in this chunk — confirm whether location
    /// filtering happens elsewhere or was dropped.
    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> Vec<u32> {
        let Some(postings) = self.postings.get(&trigram) else {
            return Vec::new();
        };

        let mut matches = Vec::with_capacity(postings.len());

        for posting in postings {
            if let Some(filter) = filter {
                // Require at least one shared next-character mask bit;
                // a zero filter mask is unconstrained.
                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
                    continue;
                }
            }
            // Only report files that are still active.
            if self.is_active_file(posting.file_id) {
                matches.push(posting.file_id);
            }
        }

        matches
    }
893}
894
/// Scans one candidate file with `matcher`, recording at most one match per
/// line. Shared atomic counters track global progress so parallel workers
/// can stop early once enough matches have been found.
fn search_candidate_file(
    file: &FileEntry,
    matcher: &SearchMatcher,
    max_results: usize,
    stop_after: usize,
    total_matches: &AtomicUsize,
    files_searched: &AtomicUsize,
    files_with_matches: &AtomicUsize,
    truncated: &AtomicBool,
) -> Vec<SharedGrepMatch> {
    // Bail before doing any I/O if the search is already saturated.
    if should_stop_search(truncated, total_matches, stop_after) {
        return Vec::new();
    }

    let content = match read_indexed_file_bytes(&file.path) {
        Some(content) => content,
        None => return Vec::new(),
    };
    files_searched.fetch_add(1, Ordering::Relaxed);

    // One Arc per file; each match clones the refcount, not the PathBuf.
    let shared_path = Arc::new(file.path.clone());
    let mut matches = Vec::new();
    // Line-start offsets are computed lazily, only once something matches.
    let mut line_starts = None;
    // Dedup: report each line at most once per file.
    let mut seen_lines = HashSet::new();
    let mut matched_this_file = false;

    match matcher {
        SearchMatcher::Literal(LiteralSearch::CaseSensitive(needle)) => {
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            while let Some(position) = finder.find(&content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                // Advance one byte so overlapping occurrences are found.
                start = offset + 1;

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        SearchMatcher::Literal(LiteralSearch::AsciiCaseInsensitive(needle)) => {
            // Search a lowercased copy; offsets remain valid against the
            // original bytes because ASCII folding is 1:1.
            let search_content = content.to_ascii_lowercase();
            let finder = memchr::memmem::Finder::new(needle);
            let mut start = 0;

            while let Some(position) = finder.find(&search_content[start..]) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let offset = start + position;
                // Advance one byte so overlapping occurrences are found.
                start = offset + 1;

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) = line_details_bytes(&content, line_starts, offset);
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                // Match text is sliced from the ORIGINAL bytes to preserve
                // the source's casing.
                let end = offset + needle.len();
                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(&content[offset..end]).into_owned(),
                });
            }
        }
        SearchMatcher::Regex(regex) => {
            for matched in regex.find_iter(&content) {
                if should_stop_search(truncated, total_matches, stop_after) {
                    break;
                }

                let line_starts = line_starts.get_or_insert_with(|| line_starts_bytes(&content));
                let (line, column, line_text) =
                    line_details_bytes(&content, line_starts, matched.start());
                if !seen_lines.insert(line) {
                    continue;
                }

                matched_this_file = true;
                let match_number = total_matches.fetch_add(1, Ordering::Relaxed) + 1;
                if match_number > max_results {
                    truncated.store(true, Ordering::Relaxed);
                    break;
                }

                matches.push(SharedGrepMatch {
                    file: shared_path.clone(),
                    line,
                    column,
                    line_text,
                    match_text: String::from_utf8_lossy(matched.as_bytes()).into_owned(),
                });
            }
        }
    }

    if matched_this_file {
        files_with_matches.fetch_add(1, Ordering::Relaxed);
    }

    matches
}
1030
/// A search should halt only after truncation has been flagged AND the
/// running match count has reached the hard stop threshold.
fn should_stop_search(
    truncated: &AtomicBool,
    total_matches: &AtomicUsize,
    stop_after: usize,
) -> bool {
    if !truncated.load(Ordering::Relaxed) {
        return false;
    }
    total_matches.load(Ordering::Relaxed) >= stop_after
}
1038
/// Intersects two ascending-sorted id slices, returning the common ids in
/// sorted order.
fn intersect_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut out = Vec::with_capacity(left.len().min(right.len()));
    let (mut i, mut j) = (0, 0);

    // Classic two-pointer merge: advance whichever side is smaller.
    while i < left.len() && j < right.len() {
        let (a, b) = (left[i], right[j]);
        if a < b {
            i += 1;
        } else if b < a {
            j += 1;
        } else {
            out.push(a);
            i += 1;
            j += 1;
        }
    }

    out
}
1058
/// Merges two ascending-sorted id slices into a single sorted list,
/// emitting ids present in both sides only once.
fn union_sorted_ids(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut out = Vec::with_capacity(left.len() + right.len());
    let (mut i, mut j) = (0, 0);

    while i < left.len() && j < right.len() {
        let (a, b) = (left[i], right[j]);
        if a <= b {
            out.push(a);
            i += 1;
            if a == b {
                // Shared id: consume it from both sides.
                j += 1;
            }
        } else {
            out.push(b);
            j += 1;
        }
    }

    // At most one of these tails is non-empty.
    out.extend_from_slice(&left[i..]);
    out.extend_from_slice(&right[j..]);
    out
}
1086
1087pub fn decompose_regex(pattern: &str) -> RegexQuery {
1088 let hir = match regex_syntax::parse(pattern) {
1089 Ok(hir) => hir,
1090 Err(_) => return RegexQuery::default(),
1091 };
1092
1093 let build = build_query(&hir);
1094 build.into_query()
1095}
1096
/// Packs three bytes into one u32 trigram key, big-endian byte order
/// (`a` in bits 16..24, `b` in bits 8..16, `c` in bits 0..8).
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
1100
/// Case-folds a byte for case-insensitive trigram indexing; only ASCII
/// uppercase letters change, all other bytes pass through untouched.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        c + 32
    } else {
        c
    }
}
1104
1105pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
1106 if content.len() < 3 {
1107 return Vec::new();
1108 }
1109
1110 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
1111 for start in 0..=content.len() - 3 {
1112 let trigram = pack_trigram(
1113 normalize_char(content[start]),
1114 normalize_char(content[start + 1]),
1115 normalize_char(content[start + 2]),
1116 );
1117 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
1118 trigrams.push((trigram, next_char, start));
1119 }
1120 trigrams
1121}
1122
/// Chooses the on-disk cache directory for this project's index.
/// `AFT_CACHE_DIR` overrides the default of `$HOME/.cache/aft` (falling
/// back to `.` when `HOME` is unset); a per-project key keeps distinct
/// projects separate.
pub fn resolve_cache_dir(project_root: &Path) -> PathBuf {
    if let Some(override_dir) = std::env::var_os("AFT_CACHE_DIR") {
        return PathBuf::from(override_dir)
            .join("index")
            .join(project_cache_key(project_root));
    }
    let home = std::env::var_os("HOME")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from("."));
    home.join(".cache")
        .join("aft")
        .join("index")
        .join(project_cache_key(project_root))
}
1138
1139pub(crate) fn build_path_filters(
1140 include: &[String],
1141 exclude: &[String],
1142) -> Result<PathFilters, String> {
1143 Ok(PathFilters {
1144 includes: build_globset(include)?,
1145 excludes: build_globset(exclude)?,
1146 })
1147}
1148
/// Walks the whole project: filter root and search root coincide.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
1152
1153pub(crate) fn walk_project_files_from(
1154 filter_root: &Path,
1155 search_root: &Path,
1156 filters: &PathFilters,
1157) -> Vec<PathBuf> {
1158 let mut builder = WalkBuilder::new(search_root);
1159 builder
1160 .hidden(false)
1161 .git_ignore(true)
1162 .git_global(true)
1163 .git_exclude(true)
1164 .filter_entry(|entry| {
1165 let name = entry.file_name().to_string_lossy();
1166 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
1167 return !matches!(
1168 name.as_ref(),
1169 "node_modules"
1170 | "target"
1171 | "venv"
1172 | ".venv"
1173 | ".git"
1174 | "__pycache__"
1175 | ".tox"
1176 | "dist"
1177 | "build"
1178 );
1179 }
1180 true
1181 });
1182
1183 let mut files = Vec::new();
1184 for entry in builder.build().filter_map(|entry| entry.ok()) {
1185 if !entry
1186 .file_type()
1187 .map_or(false, |file_type| file_type.is_file())
1188 {
1189 continue;
1190 }
1191 let path = entry.into_path();
1192 if filters.matches(filter_root, &path) {
1193 files.push(path);
1194 }
1195 }
1196
1197 sort_paths_by_mtime_desc(&mut files);
1198 files
1199}
1200
1201pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
1202 let bytes = fs::read(path).ok()?;
1203 if is_binary_bytes(&bytes) {
1204 return None;
1205 }
1206 String::from_utf8(bytes).ok()
1207}
1208
/// Reads a file's raw bytes for indexing; `None` when the read fails
/// (missing file, permission error, etc.).
fn read_indexed_file_bytes(path: &Path) -> Option<Vec<u8>> {
    fs::read(path).ok()
}
1212
/// Strips `root` from `path`; if `path` does not live under `root`, the
/// path is returned unchanged (owned).
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
1218
1219pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
1220 paths.sort_by(|left, right| {
1221 path_modified_time(right)
1222 .cmp(&path_modified_time(left))
1223 .then_with(|| left.cmp(right))
1224 });
1225}
1226
1227pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
1228 matches.sort_by(|left, right| {
1229 let left_path = resolve_match_path(project_root, &left.file);
1230 let right_path = resolve_match_path(project_root, &right.file);
1231
1232 path_modified_time(&right_path)
1233 .cmp(&path_modified_time(&left_path))
1234 .then_with(|| left.file.cmp(&right.file))
1235 .then_with(|| left.line.cmp(&right.line))
1236 .then_with(|| left.column.cmp(&right.column))
1237 });
1238}
1239
/// Sorts matches newest-file-first using a caller-supplied mtime lookup,
/// breaking ties by path, line, then column.
///
/// NOTE(review): `modified_for_path` is invoked twice per comparison (so
/// O(n log n) times in total); as the function name suggests, the lookup is
/// expected to be cheap/cached by the caller.
fn sort_shared_grep_matches_by_cached_mtime_desc<F>(
    matches: &mut [SharedGrepMatch],
    modified_for_path: F,
) where
    F: Fn(&Path) -> Option<SystemTime>,
{
    matches.sort_by(|left, right| {
        // Comparing right-vs-left yields descending mtime; `None` mtimes
        // (unreadable files) sort after all `Some` values.
        modified_for_path(right.file.as_path())
            .cmp(&modified_for_path(left.file.as_path()))
            .then_with(|| left.file.as_path().cmp(right.file.as_path()))
            .then_with(|| left.line.cmp(&right.line))
            .then_with(|| left.column.cmp(&right.column))
    });
}
1254
1255pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
1256 let resolved_project_root = canonicalize_or_normalize(project_root);
1257 let root = match path {
1258 Some(path) => {
1259 let path = PathBuf::from(path);
1260 if path.is_absolute() {
1261 canonicalize_or_normalize(&path)
1262 } else {
1263 normalize_path(&resolved_project_root.join(path))
1264 }
1265 }
1266 None => resolved_project_root.clone(),
1267 };
1268
1269 let use_index = is_within_search_root(&resolved_project_root, &root);
1270 SearchScope { root, use_index }
1271}
1272
/// Heuristic binary detection over an in-memory buffer, delegated to the
/// `content_inspector` crate.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
1276
/// Resolves the commit hash of HEAD for `root`, or `None` when `root` is not
/// a git repository (or git is unavailable / produces no output).
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
1280
1281pub(crate) fn project_cache_key(project_root: &Path) -> String {
1282 use sha2::{Digest, Sha256};
1283
1284 let mut hasher = Sha256::new();
1285
1286 if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
1287 hasher.update(root_commit.as_bytes());
1290 } else {
1291 let canonical_root = canonicalize_or_normalize(project_root);
1293 hasher.update(canonical_root.to_string_lossy().as_bytes());
1294 }
1295
1296 let digest = format!("{:x}", hasher.finalize());
1297 digest[..16].to_string()
1298}
1299
1300impl PathFilters {
1301 fn matches(&self, root: &Path, path: &Path) -> bool {
1302 let relative = to_glob_path(&relative_to_root(root, path));
1303 if self
1304 .includes
1305 .as_ref()
1306 .is_some_and(|includes| !includes.is_match(&relative))
1307 {
1308 return false;
1309 }
1310 if self
1311 .excludes
1312 .as_ref()
1313 .is_some_and(|excludes| excludes.is_match(&relative))
1314 {
1315 return false;
1316 }
1317 true
1318 }
1319}
1320
1321fn canonicalize_or_normalize(path: &Path) -> PathBuf {
1322 fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
1323}
1324
/// Grep matches may carry project-relative paths; anchor those at the
/// project root, while absolute paths pass through untouched.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    project_root.join(path)
}
1332
/// Returns the file's modification time, or `None` for unreadable paths or
/// platforms where mtime is unavailable.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1338
/// Lexically normalizes a path: drops `.` components and resolves `..`
/// against preceding normal components, without touching the filesystem.
///
/// Fixes two defects of the previous `pop()`-based version:
/// - `Path::parent("..") == Some("")`, so `pop()` succeeded on a result that
///   ended in `..` and silently swallowed it (`"../../a"` became `"a"`).
/// - A `..` directly after the root was pushed, producing `"/.."` instead of
///   collapsing to `"/"`.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::ParentDir => match result.components().next_back() {
                // `..` cancels a preceding normal component.
                Some(Component::Normal(_)) => {
                    result.pop();
                }
                // `/..` is still `/`; never climb above a root or prefix.
                Some(Component::RootDir) | Some(Component::Prefix(_)) => {}
                // Keep leading `..`s of a relative path (result is empty or
                // already ends in `..`).
                _ => result.push(component),
            },
            Component::CurDir => {}
            _ => result.push(component),
        }
    }
    result
}
1354
1355fn verify_file_mtimes(index: &mut SearchIndex) {
1358 let mut stale_paths = Vec::new();
1360 for entry in &index.files {
1361 if entry.path.as_os_str().is_empty() {
1362 continue; }
1364 match fs::metadata(&entry.path) {
1365 Ok(meta) => {
1366 let current_mtime = meta.modified().unwrap_or(UNIX_EPOCH);
1367 if current_mtime != entry.modified || meta.len() != entry.size {
1368 stale_paths.push(entry.path.clone());
1369 }
1370 }
1371 Err(_) => {
1372 stale_paths.push(entry.path.clone());
1374 }
1375 }
1376 }
1377
1378 for path in &stale_paths {
1380 index.update_file(path);
1381 }
1382
1383 let filters = PathFilters::default();
1385 for path in walk_project_files(&index.project_root, &filters) {
1386 if !index.path_to_id.contains_key(&path) {
1387 index.update_file(&path);
1388 }
1389 }
1390
1391 if !stale_paths.is_empty() {
1392 log::info!(
1393 "[aft] search index: refreshed {} stale file(s) from disk cache",
1394 stale_paths.len()
1395 );
1396 }
1397}
1398
/// True when `path` equals `search_root` or lives beneath it.
///
/// `starts_with` compares whole components, so `/foo/bar` is within `/foo`
/// but `/foobar` is not.
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    Path::starts_with(path, search_root)
}
1402
1403impl QueryBuild {
1404 fn into_query(self) -> RegexQuery {
1405 let mut query = RegexQuery::default();
1406
1407 for run in self.and_runs {
1408 add_run_to_and_query(&mut query, &run);
1409 }
1410
1411 for group in self.or_groups {
1412 let mut trigrams = BTreeSet::new();
1413 let mut filters = HashMap::new();
1414 for run in group {
1415 for (trigram, filter) in trigram_filters(&run) {
1416 trigrams.insert(trigram);
1417 merge_filter(filters.entry(trigram).or_default(), filter);
1418 }
1419 }
1420 if !trigrams.is_empty() {
1421 query.or_groups.push(trigrams.into_iter().collect());
1422 query.or_filters.push(filters);
1423 }
1424 }
1425
1426 query
1427 }
1428}
1429
/// Recursively lowers a regex HIR into a `QueryBuild` of literal runs usable
/// for trigram pre-filtering. Sub-patterns that guarantee nothing yield the
/// empty (non-filtering) build.
fn build_query(hir: &Hir) -> QueryBuild {
    match hir.kind() {
        HirKind::Literal(literal) => {
            // A literal of at least 3 bytes produces trigrams that must
            // appear in every match, so it becomes a required AND run.
            if literal.0.len() >= 3 {
                QueryBuild {
                    and_runs: vec![literal.0.to_vec()],
                    or_groups: Vec::new(),
                }
            } else {
                // Too short for any trigram; contributes nothing.
                QueryBuild::default()
            }
        }
        // Capture groups are transparent for indexing purposes.
        HirKind::Capture(capture) => build_query(&capture.sub),
        HirKind::Concat(parts) => {
            // Every part of a concatenation must match, so the AND runs and
            // OR groups of all parts accumulate.
            let mut build = QueryBuild::default();
            for part in parts {
                let part_build = build_query(part);
                build.and_runs.extend(part_build.and_runs);
                build.or_groups.extend(part_build.or_groups);
            }
            build
        }
        HirKind::Alternation(parts) => {
            // Every branch must guarantee at least one literal run; if any
            // branch cannot, a match could avoid all trigrams, so we fall
            // back to the empty (no-filter) query for safety.
            let mut group = Vec::new();
            for part in parts {
                let Some(mut choices) = guaranteed_run_choices(part) else {
                    return QueryBuild::default();
                };
                group.append(&mut choices);
            }
            if group.is_empty() {
                QueryBuild::default()
            } else {
                QueryBuild {
                    and_runs: Vec::new(),
                    or_groups: vec![group],
                }
            }
        }
        HirKind::Repetition(repetition) => {
            // `x*` / `x?` may match zero times, so nothing is guaranteed;
            // `min >= 1` guarantees at least one occurrence of the body.
            if repetition.min == 0 {
                QueryBuild::default()
            } else {
                build_query(&repetition.sub)
            }
        }
        // Empty patterns, character classes, and look-arounds pin down no
        // concrete bytes.
        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
    }
}
1479
/// Collects literal runs (each at least 3 bytes) guaranteed to appear in any
/// match of `hir`. Returns `None` when no such guarantee exists, telling the
/// caller the sub-pattern cannot safely contribute to index pre-filtering.
fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
    match hir.kind() {
        HirKind::Literal(literal) => {
            // Shorter than 3 bytes yields no trigram at all.
            if literal.0.len() >= 3 {
                Some(vec![literal.0.to_vec()])
            } else {
                None
            }
        }
        // Capture groups are transparent.
        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
        HirKind::Concat(parts) => {
            // Each part's guaranteed runs stay guaranteed for the whole
            // concatenation; parts with no runs are simply skipped.
            let mut runs = Vec::new();
            for part in parts {
                if let Some(mut part_runs) = guaranteed_run_choices(part) {
                    runs.append(&mut part_runs);
                }
            }
            if runs.is_empty() {
                None
            } else {
                Some(runs)
            }
        }
        HirKind::Alternation(parts) => {
            // Every branch must contribute runs; otherwise a match could
            // take the run-free branch and the guarantee would not hold.
            let mut runs = Vec::new();
            for part in parts {
                let Some(mut part_runs) = guaranteed_run_choices(part) else {
                    return None;
                };
                runs.append(&mut part_runs);
            }
            if runs.is_empty() {
                None
            } else {
                Some(runs)
            }
        }
        HirKind::Repetition(repetition) => {
            // A zero-minimum repetition may not occur at all.
            if repetition.min == 0 {
                None
            } else {
                guaranteed_run_choices(&repetition.sub)
            }
        }
        // Empty patterns, classes, and look-arounds pin down no bytes.
        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
    }
}
1527
/// Adds every trigram of a required literal `run` to the query's AND set,
/// merging bloom-filter bits into any filter already recorded for a trigram.
fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
    for (trigram, filter) in trigram_filters(run) {
        // Keep `and_trigrams` duplicate-free while preserving insertion order.
        if !query.and_trigrams.contains(&trigram) {
            query.and_trigrams.push(trigram);
        }
        merge_filter(query.and_filters.entry(trigram).or_default(), filter);
    }
}
1536
1537fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1538 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1539 for (trigram, next_char, position) in extract_trigrams(run) {
1540 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1541 if next_char != EOF_SENTINEL {
1542 entry.next_mask |= mask_for_next_char(next_char);
1543 }
1544 entry.loc_mask |= mask_for_position(position);
1545 }
1546 filters.into_iter().collect()
1547}
1548
1549fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1550 target.next_mask |= filter.next_mask;
1551 target.loc_mask |= filter.loc_mask;
1552}
1553
1554fn mask_for_next_char(next_char: u8) -> u8 {
1555 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1556 1u8 << bit
1557}
1558
/// Folds a run offset onto 8 bloom-filter bits (one bit per offset mod 8).
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
1562
1563fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1564 if patterns.is_empty() {
1565 return Ok(None);
1566 }
1567
1568 let mut builder = GlobSetBuilder::new();
1569 for pattern in patterns {
1570 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1571 builder.add(glob);
1572 }
1573 builder.build().map(Some).map_err(|error| error.to_string())
1574}
1575
/// Reads a fixed-width little-endian `u32`; the mirror of `write_u32`.
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes)?;
    Ok(u32::from_le_bytes(bytes))
}
1581
/// Reads a fixed-width little-endian `u64`; the mirror of `write_u64`.
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut bytes = [0u8; 8];
    reader.read_exact(&mut bytes)?;
    Ok(u64::from_le_bytes(bytes))
}
1587
/// Serializes `value` as 4 little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1591
/// Serializes `value` as 8 little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1595
/// Runs `git -C <root> <args>` and returns the trimmed stdout.
///
/// Returns `None` when git cannot be spawned, exits non-zero, emits invalid
/// UTF-8, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
1614
1615fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1616 let diff_range = format!("{}..{}", from, to);
1617 let output = match Command::new("git")
1618 .arg("-C")
1619 .arg(root)
1620 .args(["diff", "--name-only", &diff_range])
1621 .output()
1622 {
1623 Ok(output) => output,
1624 Err(_) => return false,
1625 };
1626
1627 if !output.status.success() {
1628 return false;
1629 }
1630
1631 let Ok(paths) = String::from_utf8(output.stdout) else {
1632 return false;
1633 };
1634
1635 for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1636 let path = root.join(relative_path);
1637 if path.exists() {
1638 index.update_file(&path);
1639 } else {
1640 index.remove_file(&path);
1641 }
1642 }
1643
1644 true
1645}
1646
1647fn is_binary_path(path: &Path, size: u64) -> bool {
1648 if size == 0 {
1649 return false;
1650 }
1651
1652 let mut file = match File::open(path) {
1653 Ok(file) => file,
1654 Err(_) => return true,
1655 };
1656
1657 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1658 match file.read(&mut preview) {
1659 Ok(read) => is_binary_bytes(&preview[..read]),
1660 Err(_) => true,
1661 }
1662}
1663
/// Byte offset of the start of every line; line 0 always starts at offset 0,
/// and each `\n` opens a new line at the following byte.
fn line_starts_bytes(content: &[u8]) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .iter()
                .enumerate()
                .filter(|&(_, &byte)| byte == b'\n')
                .map(|(index, _)| index + 1),
        )
        .collect()
}
1673
/// Maps a byte `offset` into `content` to `(1-based line, 1-based column,
/// line text)`, given precomputed `line_starts` (see `line_starts_bytes`).
/// Columns count decoded characters, not bytes; a trailing CR is stripped so
/// CRLF files render like LF files.
fn line_details_bytes(content: &[u8], line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // The containing line is the one with the last start <= offset.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let line_start = line_starts.get(line_index).copied().unwrap_or(0);

    // The line runs to the next newline, or to end-of-content on the last line.
    let line_end = content[line_start..]
        .iter()
        .position(|&byte| byte == b'\n')
        .map_or(content.len(), |length| line_start + length);

    let raw_line = &content[line_start..line_end];
    let trimmed = raw_line.strip_suffix(b"\r").unwrap_or(raw_line);
    let line_text = String::from_utf8_lossy(trimmed).into_owned();

    let prefix = String::from_utf8_lossy(&content[line_start..offset]);
    let column = prefix.chars().count() as u32 + 1;

    (line_index as u32 + 1, column, line_text)
}
1696
/// Renders a path for glob matching, using forward slashes on every platform.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}
1700
#[cfg(test)]
mod tests {
    use std::process::Command;

    use super::*;

    // Trigrams are case-folded and carry the byte after the window plus the
    // window's start offset; the last window records the EOF sentinel.
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }

    // Literals around an alternation become required (AND) trigrams; the
    // alternation's branches land together in one OR group.
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }

    // Only files containing ALL of the query's AND trigrams survive the
    // posting-list intersection.
    #[test]
    fn candidates_intersect_posting_lists() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }

    // The next-char/position bloom masks prune postings whose filters don't
    // overlap the query's filter bits.
    #[test]
    fn candidates_apply_bloom_filters() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }

    // write_to_disk / read_from_disk must round-trip postings, file entries,
    // and the stored git HEAD.
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }

    // Persisting must be atomic: final files exist, temp files are removed.
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("postings.bin").is_file());
        assert!(cache_dir.join("lookup.bin").is_file());
        assert!(!cache_dir.join("postings.bin.tmp").exists());
        assert!(!cache_dir.join("lookup.bin.tmp").exists());
    }

    // Two checkouts (clone vs. original) of the same repository must share a
    // cache key, because the key hashes the repo's root commit, not the path.
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        assert_eq!(source_key, clone_key);
    }

    // Searching outside the project root must bypass the index entirely.
    #[test]
    fn resolve_search_scope_disables_index_for_external_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let outside = dir.path().join("outside");
        fs::create_dir_all(&project).expect("create project dir");
        fs::create_dir_all(&outside).expect("create outside dir");

        let scope = resolve_search_scope(&project, outside.to_str());

        assert_eq!(
            scope.root,
            fs::canonicalize(&outside).expect("canonicalize outside")
        );
        assert!(!scope.use_index);
    }

    // A search rooted at a subdirectory must not report matches from sibling
    // directories of the same project.
    #[test]
    fn grep_filters_matches_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let docs = project.join("docs");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&docs).expect("create docs dir");
        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.files_searched, 1);
        assert_eq!(result.files_with_matches, 1);
        assert_eq!(result.matches.len(), 1);
        let expected = fs::canonicalize(src.join("main.rs")).expect("canonicalize");
        assert_eq!(result.matches[0].file, expected);
    }

    // Several hits on one line collapse into a single reported match.
    #[test]
    fn grep_deduplicates_multiple_matches_on_same_line() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.total_matches, 1);
        assert_eq!(result.matches.len(), 1);
    }

    // `total_matches` counts all hits even when `matches` is truncated by
    // the caller-supplied limit.
    #[test]
    fn grep_reports_total_matches_before_truncation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);

        assert_eq!(result.total_matches, 2);
        assert_eq!(result.matches.len(), 1);
        assert!(result.truncated);
    }

    // Glob results, like grep results, are scoped to the search root.
    #[test]
    fn glob_filters_results_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let scripts = project.join("scripts");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&scripts).expect("create scripts dir");
        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.rs", &src);

        assert_eq!(
            files,
            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
        );
    }

    // Unlike grep, glob matches hidden directories and binary files too.
    #[test]
    fn glob_includes_hidden_and_binary_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let hidden_dir = project.join(".hidden");
        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
        let hidden_file = hidden_dir.join("data.bin");
        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.bin", &project);

        assert_eq!(
            files,
            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
        );
    }
}