1use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2use std::fs::{self, File};
3use std::io::{BufReader, BufWriter, Read, Write};
4use std::path::{Component, Path, PathBuf};
5use std::process::Command;
6use std::time::{Duration, SystemTime, UNIX_EPOCH};
7
8use globset::{Glob, GlobSet, GlobSetBuilder};
9use ignore::WalkBuilder;
10use regex::RegexBuilder;
11use regex_syntax::hir::{Hir, HirKind};
12
/// Files larger than this (bytes) are tracked but not trigram-indexed.
const DEFAULT_MAX_FILE_SIZE: u64 = 1_048_576;
/// Magic header identifying the on-disk postings file.
const INDEX_MAGIC: &[u8; 8] = b"AFTIDX01";
/// Magic header identifying the on-disk trigram lookup file.
const LOOKUP_MAGIC: &[u8; 8] = b"AFTLKP01";
/// Bumped whenever the cache format changes; mismatches invalidate the cache.
const INDEX_VERSION: u32 = 1;
// NOTE(review): not referenced in this chunk — presumably used elsewhere in the file.
const PREVIEW_BYTES: usize = 8 * 1024;
/// Sentinel "next char" recorded for the final trigram of a buffer.
const EOF_SENTINEL: u8 = 0;
19
/// In-memory trigram search index over a project tree.
#[derive(Clone, Debug)]
pub struct SearchIndex {
    /// Trigram -> posting list (one posting per file containing the trigram).
    pub postings: HashMap<u32, Vec<Posting>>,
    /// File table indexed by file id; removed files become tombstones
    /// (empty path) so existing ids stay valid.
    pub files: Vec<FileEntry>,
    /// Absolute path -> file id for currently active (non-tombstone) files.
    pub path_to_id: HashMap<PathBuf, u32>,
    /// True once the index is fully built or loaded from disk.
    pub ready: bool,
    // Canonicalized project root; relative paths resolve against this.
    project_root: PathBuf,
    // Git HEAD commit the index was built at, when the root is a repo.
    git_head: Option<String>,
    // Files larger than this are tracked in `unindexed_files` instead.
    max_file_size: u64,
    // file id -> trigrams it contributed; used to strip postings on removal.
    file_trigrams: HashMap<u32, Vec<u32>>,
    // Ids of tracked-but-unindexed files (e.g. oversized); `candidates`
    // always includes them so a scan never misses them.
    unindexed_files: HashSet<u32>,
}
32
/// One file's entry in a trigram posting list.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Posting {
    /// Index into `SearchIndex::files`.
    pub file_id: u32,
    /// Bitmask of hashed "next characters" seen after this trigram.
    pub next_mask: u8,
    /// Bitmask of byte offsets (mod 8) where this trigram occurs.
    pub loc_mask: u8,
}
39
/// Metadata for one tracked file; a tombstone carries an empty `path`.
#[derive(Clone, Debug)]
pub struct FileEntry {
    /// Absolute path (empty once the file has been removed from the index).
    pub path: PathBuf,
    /// Size in bytes at index time.
    pub size: u64,
    /// Last-modified time at index time.
    pub modified: SystemTime,
}
46
/// A single grep hit; at most one is reported per line per file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct GrepMatch {
    /// Path relative to the project root (left absolute if outside it).
    pub file: PathBuf,
    // Line/column as computed by `line_details` (defined elsewhere) —
    // presumably 1-based; confirm against that helper.
    pub line: u32,
    pub column: u32,
    /// Full text of the matching line.
    pub line_text: String,
    /// The exact matched substring.
    pub match_text: String,
}
55
/// Aggregate outcome of one grep run.
#[derive(Clone, Debug)]
pub struct GrepResult {
    /// Up to the caller's `max_results` matches, newest files first.
    pub matches: Vec<GrepMatch>,
    /// Total matching lines found, including those dropped by the cap.
    pub total_matches: usize,
    /// Number of files whose content was actually scanned.
    pub files_searched: usize,
    /// Number of scanned files with at least one match.
    pub files_with_matches: usize,
    /// Index readiness when the search ran.
    pub index_status: IndexStatus,
    /// True when more matches existed than the result cap allowed.
    pub truncated: bool,
}
65
/// Readiness of the trigram index when a query was served.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IndexStatus {
    /// Index fully built; candidate pruning was in effect.
    Ready,
    /// Index still being built when the query ran.
    Building,
    /// Index not used for this query (e.g. scope outside the project root).
    Fallback,
}
72
73impl IndexStatus {
74 pub fn as_str(&self) -> &'static str {
75 match self {
76 IndexStatus::Ready => "Ready",
77 IndexStatus::Building => "Building",
78 IndexStatus::Fallback => "Fallback",
79 }
80 }
81}
82
/// Trigram requirements extracted from a regex pattern.
///
/// A candidate file must contain every trigram in `and_trigrams` and, for
/// each group in `or_groups`, at least one trigram of that group. An empty
/// query places no constraint (matches all files).
#[derive(Clone, Debug, Default)]
pub struct RegexQuery {
    /// Trigrams that must all be present.
    pub and_trigrams: Vec<u32>,
    /// Alternation groups: one member per group must be present.
    pub or_groups: Vec<Vec<u32>>,
    // Per-trigram posting filters for the AND trigrams.
    pub(crate) and_filters: HashMap<u32, PostingFilter>,
    // Posting filters per OR group, parallel to `or_groups`.
    pub(crate) or_filters: Vec<HashMap<u32, PostingFilter>>,
}
90
/// Bitmask constraints used to prune postings that cannot match a query run.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct PostingFilter {
    // Allowed "next character" hash bits; 0 means "no constraint".
    next_mask: u8,
    // Run positions (mod 8) of the trigram. NOTE(review): not consulted by
    // `postings_for_trigram` in this chunk — confirm whether that is intended.
    loc_mask: u8,
}
96
/// Intermediate literal runs gathered while walking a regex HIR.
#[derive(Clone, Debug, Default)]
struct QueryBuild {
    // Literal byte runs that every match must contain (AND semantics).
    and_runs: Vec<Vec<u8>>,
    // Alternation groups: within a group, at least one run must appear.
    or_groups: Vec<Vec<Vec<u8>>>,
}
102
/// Optional include/exclude glob sets applied to root-relative paths.
#[derive(Clone, Debug, Default)]
pub(crate) struct PathFilters {
    // When present, a path must match to be kept.
    includes: Option<GlobSet>,
    // When present, a matching path is rejected.
    excludes: Option<GlobSet>,
}
108
/// Resolved root of a search plus whether the trigram index covers it.
#[derive(Clone, Debug)]
pub(crate) struct SearchScope {
    /// Directory the search is rooted at.
    pub root: PathBuf,
    /// True when `root` lies inside the indexed project root.
    pub use_index: bool,
}
114
115impl SearchIndex {
116 pub fn new() -> Self {
117 SearchIndex {
118 postings: HashMap::new(),
119 files: Vec::new(),
120 path_to_id: HashMap::new(),
121 ready: false,
122 project_root: PathBuf::new(),
123 git_head: None,
124 max_file_size: DEFAULT_MAX_FILE_SIZE,
125 file_trigrams: HashMap::new(),
126 unindexed_files: HashSet::new(),
127 }
128 }
129
130 pub fn build(root: &Path) -> Self {
131 Self::build_with_limit(root, DEFAULT_MAX_FILE_SIZE)
132 }
133
    /// Walks the tree under `root` and indexes every eligible file.
    ///
    /// `root` is canonicalized when possible. Discovery honors the default
    /// ignore rules of `walk_project_files` with no include/exclude globs.
    pub(crate) fn build_with_limit(root: &Path, max_file_size: u64) -> Self {
        let project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
        let mut index = SearchIndex {
            project_root: project_root.clone(),
            max_file_size,
            ..SearchIndex::new()
        };

        // `update_file` itself filters out binary and oversized files.
        let filters = PathFilters::default();
        for path in walk_project_files(&project_root, &filters) {
            index.update_file(&path);
        }

        // Record HEAD so a later refresh can diff instead of rebuilding.
        index.git_head = current_git_head(&project_root);
        index.ready = true;
        index
    }
151
152 pub fn index_file(&mut self, path: &Path, content: &[u8]) {
153 self.remove_file(path);
154
155 let file_id = match self.allocate_file_id(path, content.len() as u64) {
156 Some(file_id) => file_id,
157 None => return,
158 };
159
160 let mut trigram_map: BTreeMap<u32, PostingFilter> = BTreeMap::new();
161 for (trigram, next_char, position) in extract_trigrams(content) {
162 let entry = trigram_map.entry(trigram).or_default();
163 entry.next_mask |= mask_for_next_char(next_char);
164 entry.loc_mask |= mask_for_position(position);
165 }
166
167 let mut file_trigrams = Vec::with_capacity(trigram_map.len());
168 for (trigram, filter) in trigram_map {
169 self.postings.entry(trigram).or_default().push(Posting {
170 file_id,
171 next_mask: filter.next_mask,
172 loc_mask: filter.loc_mask,
173 });
174 file_trigrams.push(trigram);
175 }
176
177 for postings in self.postings.values_mut() {
178 postings.sort_by_key(|posting| posting.file_id);
179 }
180
181 self.file_trigrams.insert(file_id, file_trigrams);
182 self.unindexed_files.remove(&file_id);
183 }
184
185 pub fn remove_file(&mut self, path: &Path) {
186 let Some(file_id) = self.path_to_id.remove(path) else {
187 return;
188 };
189
190 if let Some(trigrams) = self.file_trigrams.remove(&file_id) {
191 for trigram in trigrams {
192 let should_remove = if let Some(postings) = self.postings.get_mut(&trigram) {
193 postings.retain(|posting| posting.file_id != file_id);
194 postings.is_empty()
195 } else {
196 false
197 };
198
199 if should_remove {
200 self.postings.remove(&trigram);
201 }
202 }
203 }
204
205 self.unindexed_files.remove(&file_id);
206 if let Some(file) = self.files.get_mut(file_id as usize) {
207 file.path = PathBuf::new();
208 file.size = 0;
209 file.modified = UNIX_EPOCH;
210 }
211 }
212
    /// Re-reads `path` from disk and refreshes its index entry.
    ///
    /// The file is dropped from the index when it no longer exists, is not a
    /// regular file, or looks binary. Oversized files are tracked as
    /// unindexed so searches still scan them.
    pub fn update_file(&mut self, path: &Path) {
        self.remove_file(path);

        let metadata = match fs::metadata(path) {
            Ok(metadata) if metadata.is_file() => metadata,
            _ => return,
        };

        // `is_binary_path` is defined elsewhere — presumably a cheap
        // name/size-based heuristic run before reading content; confirm.
        if is_binary_path(path, metadata.len()) {
            return;
        }

        if metadata.len() > self.max_file_size {
            // Too large to trigram-index; remember it as a scan candidate.
            self.track_unindexed_file(path, &metadata);
            return;
        }

        let content = match fs::read(path) {
            Ok(content) => content,
            Err(_) => return,
        };

        // Content-based binary sniff on the actual bytes.
        if is_binary_bytes(&content) {
            return;
        }

        self.index_file(path, &content);
    }
241
242 pub fn grep(
243 &self,
244 pattern: &str,
245 case_sensitive: bool,
246 include: &[String],
247 exclude: &[String],
248 search_root: &Path,
249 max_results: usize,
250 ) -> GrepResult {
251 self.search_grep(
252 pattern,
253 case_sensitive,
254 include,
255 exclude,
256 search_root,
257 max_results,
258 )
259 }
260
    /// Runs a regex search over the indexed files.
    ///
    /// The pattern compiles case-insensitively when `case_sensitive` is
    /// false; an invalid pattern yields an empty result rather than an
    /// error. Trigram decomposition prunes the candidate set, then each
    /// surviving file is scanned with the full regex. At most one match per
    /// line is reported, stored matches are capped at `max_results`, and
    /// paths in the result are relative to the project root.
    pub fn search_grep(
        &self,
        pattern: &str,
        case_sensitive: bool,
        include: &[String],
        exclude: &[String],
        search_root: &Path,
        max_results: usize,
    ) -> GrepResult {
        let mut regex_builder = RegexBuilder::new(pattern);
        regex_builder.case_insensitive(!case_sensitive);
        let regex = match regex_builder.build() {
            Ok(regex) => regex,
            Err(_) => {
                // Invalid regex: report an empty result instead of failing.
                return GrepResult {
                    matches: Vec::new(),
                    total_matches: 0,
                    files_searched: 0,
                    files_with_matches: 0,
                    index_status: if self.ready {
                        IndexStatus::Ready
                    } else {
                        IndexStatus::Building
                    },
                    truncated: false,
                };
            }
        };

        // Invalid globs silently fall back to "no path filtering".
        let filters = match build_path_filters(include, exclude) {
            Ok(filters) => filters,
            Err(_) => PathFilters::default(),
        };
        let search_root = canonicalize_or_normalize(search_root);

        // Narrow the scan set via the trigram index.
        let query = decompose_regex(pattern);
        let candidate_ids = self.candidates(&query);

        let mut matches = Vec::new();
        let mut total_matches = 0usize;
        let mut files_searched = 0usize;
        let mut files_with_matches = 0usize;
        let mut truncated = false;

        for file_id in candidate_ids {
            let Some(file) = self.files.get(file_id as usize) else {
                continue;
            };
            // Skip tombstoned (removed) files.
            if file.path.as_os_str().is_empty() {
                continue;
            }
            if !is_within_search_root(&search_root, &file.path) {
                continue;
            }
            if !filters.matches(&self.project_root, &file.path) {
                continue;
            }

            let content = match read_searchable_text(&file.path) {
                Some(content) => content,
                None => continue,
            };

            files_searched += 1;
            // `line_starts`/`line_details` are defined elsewhere in this file.
            let line_starts = line_starts(&content);
            let mut seen_lines = HashSet::new();
            let mut matched_this_file = false;

            for matched in regex.find_iter(&content) {
                let (line, column, line_text) =
                    line_details(&content, &line_starts, matched.start());
                // Report at most one match per line.
                if !seen_lines.insert(line) {
                    continue;
                }

                // Keep counting past the cap so `total_matches` stays exact.
                total_matches += 1;
                if matches.len() < max_results {
                    matches.push(GrepMatch {
                        file: relative_to_root(&self.project_root, &file.path),
                        line,
                        column,
                        line_text,
                        match_text: matched.as_str().to_string(),
                    });
                } else {
                    truncated = true;
                }
                matched_this_file = true;
            }

            if matched_this_file {
                files_with_matches += 1;
            }
        }

        // Present matches from the most recently modified files first.
        sort_grep_matches_by_mtime_desc(&mut matches, &self.project_root);

        GrepResult {
            total_matches,
            matches,
            files_searched,
            files_with_matches,
            index_status: if self.ready {
                IndexStatus::Ready
            } else {
                IndexStatus::Building
            },
            truncated,
        }
    }
371
    /// Lists files under `search_root` matching `pattern`, newest-modified
    /// first. An invalid glob yields an empty list.
    pub fn glob(&self, pattern: &str, search_root: &Path) -> Vec<PathBuf> {
        let filters = match build_path_filters(&[pattern.to_string()], &[]) {
            Ok(filters) => filters,
            Err(_) => return Vec::new(),
        };
        let search_root = canonicalize_or_normalize(search_root);
        // Inside the project, globs are interpreted relative to the project
        // root; outside, relative to the search root itself.
        let filter_root = if search_root.starts_with(&self.project_root) {
            &self.project_root
        } else {
            &search_root
        };

        let mut paths = walk_project_files_from(filter_root, &search_root, &filters);
        sort_paths_by_mtime_desc(&mut paths);
        paths
    }
388
    /// Returns the file ids that could possibly match `query`.
    ///
    /// AND trigram posting sets are intersected; each OR group then keeps
    /// only files containing at least one of its members; finally the
    /// unindexed files (which have no postings) are added back so a scan
    /// never misses them. An empty query matches every active file.
    pub fn candidates(&self, query: &RegexQuery) -> Vec<u32> {
        if query.and_trigrams.is_empty() && query.or_groups.is_empty() {
            return self.active_file_ids();
        }

        let mut current: Option<BTreeSet<u32>> = None;

        // Intersect the postings of every required trigram, bailing early
        // once the running intersection is empty.
        for trigram in &query.and_trigrams {
            let filter = query.and_filters.get(trigram).copied();
            let matches = self.postings_for_trigram(*trigram, filter);
            current = Some(match current.take() {
                Some(existing) => existing.intersection(&matches).copied().collect(),
                None => matches,
            });

            if current.as_ref().is_some_and(|set| set.is_empty()) {
                break;
            }
        }

        // With no AND trigrams at all, start from every active file.
        let mut current = current.unwrap_or_else(|| self.active_file_ids().into_iter().collect());

        // Each OR group restricts to files containing >= 1 group member.
        for (index, group) in query.or_groups.iter().enumerate() {
            let mut group_matches = BTreeSet::new();
            let filters = query.or_filters.get(index);

            for trigram in group {
                let filter = filters.and_then(|filters| filters.get(trigram).copied());
                group_matches.extend(self.postings_for_trigram(*trigram, filter));
            }

            current = current.intersection(&group_matches).copied().collect();
            if current.is_empty() {
                break;
            }
        }

        // Unindexed files carry no postings; always include them.
        for file_id in &self.unindexed_files {
            if self.is_active_file(*file_id) {
                current.insert(*file_id);
            }
        }

        current.into_iter().collect()
    }
434
    /// Atomically persists the index as `postings.bin` + `lookup.bin` in
    /// `cache_dir`, compacting file ids (tombstones dropped) first.
    ///
    /// Failures are silent: temp files are cleaned up and any previously
    /// written cache files are left untouched.
    pub fn write_to_disk(&self, cache_dir: &Path, git_head: Option<&str>) {
        if fs::create_dir_all(cache_dir).is_err() {
            return;
        }

        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");
        // Write to temp names, then rename, so a crash never corrupts the cache.
        let tmp_postings = cache_dir.join("postings.bin.tmp");
        let tmp_lookup = cache_dir.join("lookup.bin.tmp");

        // Remap sparse in-memory ids onto dense 0..n ids for the disk format.
        let active_ids = self.active_file_ids();
        let mut id_map = HashMap::new();
        for (new_id, old_id) in active_ids.iter().enumerate() {
            let Ok(new_id_u32) = u32::try_from(new_id) else {
                return;
            };
            id_map.insert(*old_id, new_id_u32);
        }

        let write_result = (|| -> std::io::Result<()> {
            let mut postings_writer = BufWriter::new(File::create(&tmp_postings)?);

            // Header: magic, version, string lengths, size limit, file count.
            postings_writer.write_all(INDEX_MAGIC)?;
            write_u32(&mut postings_writer, INDEX_VERSION)?;

            let head = git_head.unwrap_or_default();
            let root = self.project_root.to_string_lossy();
            let head_len = u32::try_from(head.len())
                .map_err(|_| std::io::Error::other("git head too large to cache"))?;
            let root_len = u32::try_from(root.len())
                .map_err(|_| std::io::Error::other("project root too large to cache"))?;
            let file_count = u32::try_from(active_ids.len())
                .map_err(|_| std::io::Error::other("too many files to cache"))?;

            write_u32(&mut postings_writer, head_len)?;
            write_u32(&mut postings_writer, root_len)?;
            write_u64(&mut postings_writer, self.max_file_size)?;
            write_u32(&mut postings_writer, file_count)?;
            postings_writer.write_all(head.as_bytes())?;
            postings_writer.write_all(root.as_bytes())?;

            // File table in dense-id order; paths stored relative to root.
            for old_id in &active_ids {
                let Some(file) = self.files.get(*old_id as usize) else {
                    return Err(std::io::Error::other("missing file entry for cache write"));
                };
                let path = relative_to_root(&self.project_root, &file.path);
                let path = path.to_string_lossy();
                let path_len = u32::try_from(path.len())
                    .map_err(|_| std::io::Error::other("cached path too large"))?;
                let modified = file
                    .modified
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or(Duration::ZERO);
                let unindexed = if self.unindexed_files.contains(old_id) {
                    1u8
                } else {
                    0u8
                };

                postings_writer.write_all(&[unindexed])?;
                write_u32(&mut postings_writer, path_len)?;
                write_u64(&mut postings_writer, file.size)?;
                write_u64(&mut postings_writer, modified.as_secs())?;
                write_u32(&mut postings_writer, modified.subsec_nanos())?;
                postings_writer.write_all(path.as_bytes())?;
            }

            // Serialize all postings into one blob (6 bytes per posting:
            // u32 file id + next_mask + loc_mask); the lookup file records
            // the blob offset and posting count per trigram.
            let mut lookup_entries = Vec::new();
            let mut postings_blob = Vec::new();
            let mut sorted_postings: Vec<_> = self.postings.iter().collect();
            sorted_postings.sort_by_key(|(trigram, _)| **trigram);

            for (trigram, postings) in sorted_postings {
                let offset = u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?;
                let mut count = 0u32;

                for posting in postings {
                    // Postings for tombstoned files are silently dropped.
                    let Some(mapped_file_id) = id_map.get(&posting.file_id).copied() else {
                        continue;
                    };

                    postings_blob.extend_from_slice(&mapped_file_id.to_le_bytes());
                    postings_blob.push(posting.next_mask);
                    postings_blob.push(posting.loc_mask);
                    count = count.saturating_add(1);
                }

                if count > 0 {
                    lookup_entries.push((*trigram, offset, count));
                }
            }

            write_u64(
                &mut postings_writer,
                u64::try_from(postings_blob.len())
                    .map_err(|_| std::io::Error::other("postings blob too large"))?,
            )?;
            postings_writer.write_all(&postings_blob)?;
            postings_writer.flush()?;
            drop(postings_writer);

            let mut lookup_writer = BufWriter::new(File::create(&tmp_lookup)?);
            let entry_count = u32::try_from(lookup_entries.len())
                .map_err(|_| std::io::Error::other("too many lookup entries to cache"))?;

            lookup_writer.write_all(LOOKUP_MAGIC)?;
            write_u32(&mut lookup_writer, INDEX_VERSION)?;
            write_u32(&mut lookup_writer, entry_count)?;

            for (trigram, offset, count) in lookup_entries {
                write_u32(&mut lookup_writer, trigram)?;
                write_u64(&mut lookup_writer, offset)?;
                write_u32(&mut lookup_writer, count)?;
            }

            lookup_writer.flush()?;
            drop(lookup_writer);

            // Atomic publish over any previous cache files.
            fs::rename(&tmp_postings, &postings_path)?;
            fs::rename(&tmp_lookup, &lookup_path)?;

            Ok(())
        })();

        if write_result.is_err() {
            let _ = fs::remove_file(&tmp_postings);
            let _ = fs::remove_file(&tmp_lookup);
        }
    }
565
    /// Loads an index previously written by [`Self::write_to_disk`].
    ///
    /// Returns `None` on any structural problem: missing files, wrong
    /// magic, version mismatch, short reads, or lookup entries pointing
    /// outside the postings blob. A successfully loaded index is ready.
    pub fn read_from_disk(cache_dir: &Path) -> Option<Self> {
        let postings_path = cache_dir.join("postings.bin");
        let lookup_path = cache_dir.join("lookup.bin");

        let mut postings_reader = BufReader::new(File::open(postings_path).ok()?);
        let mut lookup_reader = BufReader::new(File::open(lookup_path).ok()?);

        // Validate the postings header (magic + version).
        let mut magic = [0u8; 8];
        postings_reader.read_exact(&mut magic).ok()?;
        if &magic != INDEX_MAGIC {
            return None;
        }
        if read_u32(&mut postings_reader).ok()? != INDEX_VERSION {
            return None;
        }

        let head_len = read_u32(&mut postings_reader).ok()? as usize;
        let root_len = read_u32(&mut postings_reader).ok()? as usize;
        let max_file_size = read_u64(&mut postings_reader).ok()?;
        let file_count = read_u32(&mut postings_reader).ok()? as usize;

        // An empty head string was written when there was no git head.
        let mut head_bytes = vec![0u8; head_len];
        postings_reader.read_exact(&mut head_bytes).ok()?;
        let git_head = String::from_utf8(head_bytes)
            .ok()
            .filter(|head| !head.is_empty());

        let mut root_bytes = vec![0u8; root_len];
        postings_reader.read_exact(&mut root_bytes).ok()?;
        let project_root = PathBuf::from(String::from_utf8(root_bytes).ok()?);

        // File table: on-disk ids are dense 0..file_count.
        let mut files = Vec::with_capacity(file_count);
        let mut path_to_id = HashMap::new();
        let mut unindexed_files = HashSet::new();

        for file_id in 0..file_count {
            let mut unindexed = [0u8; 1];
            postings_reader.read_exact(&mut unindexed).ok()?;
            let path_len = read_u32(&mut postings_reader).ok()? as usize;
            let size = read_u64(&mut postings_reader).ok()?;
            let secs = read_u64(&mut postings_reader).ok()?;
            let nanos = read_u32(&mut postings_reader).ok()?;
            let mut path_bytes = vec![0u8; path_len];
            postings_reader.read_exact(&mut path_bytes).ok()?;
            // Paths were stored relative to the project root.
            let relative_path = PathBuf::from(String::from_utf8(path_bytes).ok()?);
            let full_path = project_root.join(relative_path);
            let file_id_u32 = u32::try_from(file_id).ok()?;

            files.push(FileEntry {
                path: full_path.clone(),
                size,
                modified: UNIX_EPOCH + Duration::new(secs, nanos),
            });
            path_to_id.insert(full_path, file_id_u32);
            if unindexed[0] == 1 {
                unindexed_files.insert(file_id_u32);
            }
        }

        let postings_len = read_u64(&mut postings_reader).ok()? as usize;
        let mut postings_blob = vec![0u8; postings_len];
        postings_reader.read_exact(&mut postings_blob).ok()?;

        // Validate the lookup header.
        let mut lookup_magic = [0u8; 8];
        lookup_reader.read_exact(&mut lookup_magic).ok()?;
        if &lookup_magic != LOOKUP_MAGIC {
            return None;
        }
        if read_u32(&mut lookup_reader).ok()? != INDEX_VERSION {
            return None;
        }
        let entry_count = read_u32(&mut lookup_reader).ok()? as usize;

        let mut postings = HashMap::new();
        let mut file_trigrams: HashMap<u32, Vec<u32>> = HashMap::new();

        for _ in 0..entry_count {
            let trigram = read_u32(&mut lookup_reader).ok()?;
            let offset = read_u64(&mut lookup_reader).ok()? as usize;
            let count = read_u32(&mut lookup_reader).ok()? as usize;
            // Each posting occupies 6 bytes: u32 file id + next/loc masks.
            // Reject entries whose span falls outside the blob.
            let bytes_len = count.checked_mul(6)?;
            let end = offset.checked_add(bytes_len)?;
            if end > postings_blob.len() {
                return None;
            }

            let mut trigram_postings = Vec::with_capacity(count);
            for chunk in postings_blob[offset..end].chunks_exact(6) {
                let file_id = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
                let posting = Posting {
                    file_id,
                    next_mask: chunk[4],
                    loc_mask: chunk[5],
                };
                trigram_postings.push(posting.clone());
                // Rebuild the reverse map used for posting removal.
                file_trigrams.entry(file_id).or_default().push(trigram);
            }
            postings.insert(trigram, trigram_postings);
        }

        Some(SearchIndex {
            postings,
            files,
            path_to_id,
            ready: true,
            project_root,
            git_head,
            max_file_size,
            file_trigrams,
            unindexed_files,
        })
    }
678
    /// Git HEAD commit recorded when this index was built, if any.
    pub(crate) fn stored_git_head(&self) -> Option<&str> {
        self.git_head.as_deref()
    }
682
    /// Flips the readiness flag; while false, searches report `Building`.
    pub(crate) fn set_ready(&mut self, ready: bool) {
        self.ready = ready;
    }
686
    /// Produces an up-to-date index, preferring incremental reuse.
    ///
    /// Outside a git repo (`current_head` is `None`) it always rebuilds.
    /// With a cached `baseline`, the baseline is reused verbatim when HEAD
    /// is unchanged, or patched from the git diff between the two commits;
    /// a full rebuild is the fallback when neither shortcut applies.
    pub(crate) fn rebuild_or_refresh(
        root: &Path,
        max_file_size: u64,
        current_head: Option<String>,
        baseline: Option<SearchIndex>,
    ) -> Self {
        if current_head.is_none() {
            return SearchIndex::build_with_limit(root, max_file_size);
        }

        if let Some(mut baseline) = baseline {
            // Re-anchor the cached index to the (possibly relocated) root.
            baseline.project_root = fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
            baseline.max_file_size = max_file_size;

            if baseline.git_head == current_head {
                baseline.ready = true;
                return baseline;
            }

            if let (Some(previous), Some(current)) =
                (baseline.git_head.clone(), current_head.clone())
            {
                let project_root = baseline.project_root.clone();
                // `apply_git_diff_updates` (defined elsewhere) presumably
                // returns false when the diff could not be applied — in that
                // case fall through to the full rebuild below.
                if apply_git_diff_updates(&mut baseline, &project_root, &previous, &current) {
                    baseline.git_head = Some(current);
                    baseline.ready = true;
                    return baseline;
                }
            }
        }

        SearchIndex::build_with_limit(root, max_file_size)
    }
720
721 fn allocate_file_id(&mut self, path: &Path, size_hint: u64) -> Option<u32> {
722 let file_id = u32::try_from(self.files.len()).ok()?;
723 let metadata = fs::metadata(path).ok();
724 let size = metadata
725 .as_ref()
726 .map_or(size_hint, |metadata| metadata.len());
727 let modified = metadata
728 .and_then(|metadata| metadata.modified().ok())
729 .unwrap_or(UNIX_EPOCH);
730
731 self.files.push(FileEntry {
732 path: path.to_path_buf(),
733 size,
734 modified,
735 });
736 self.path_to_id.insert(path.to_path_buf(), file_id);
737 Some(file_id)
738 }
739
740 fn track_unindexed_file(&mut self, path: &Path, metadata: &fs::Metadata) {
741 let Some(file_id) = self.allocate_file_id(path, metadata.len()) else {
742 return;
743 };
744 self.unindexed_files.insert(file_id);
745 self.file_trigrams.insert(file_id, Vec::new());
746 }
747
748 fn active_file_ids(&self) -> Vec<u32> {
749 let mut ids: Vec<u32> = self.path_to_id.values().copied().collect();
750 ids.sort_unstable();
751 ids
752 }
753
754 fn is_active_file(&self, file_id: u32) -> bool {
755 self.files
756 .get(file_id as usize)
757 .map(|file| !file.path.as_os_str().is_empty())
758 .unwrap_or(false)
759 }
760
    /// Returns the active file ids whose posting for `trigram` passes `filter`.
    fn postings_for_trigram(&self, trigram: u32, filter: Option<PostingFilter>) -> BTreeSet<u32> {
        let mut matches = BTreeSet::new();
        let Some(postings) = self.postings.get(&trigram) else {
            return matches;
        };

        for posting in postings {
            if let Some(filter) = filter {
                // A zero next_mask means "no constraint"; otherwise the file
                // must have seen at least one of the allowed next characters.
                if filter.next_mask != 0 && posting.next_mask & filter.next_mask == 0 {
                    continue;
                }
            }
            // NOTE(review): `loc_mask` is never consulted here — confirm
            // whether position-based pruning was intentionally omitted.
            if self.is_active_file(posting.file_id) {
                matches.insert(posting.file_id);
            }
        }

        matches
    }
786}
787
788pub fn decompose_regex(pattern: &str) -> RegexQuery {
789 let hir = match regex_syntax::parse(pattern) {
790 Ok(hir) => hir,
791 Err(_) => return RegexQuery::default(),
792 };
793
794 let build = build_query(&hir);
795 build.into_query()
796}
797
/// Packs three bytes into one `u32` trigram key, `a` in the highest of the
/// three used bytes (big-endian order).
pub fn pack_trigram(a: u8, b: u8, c: u8) -> u32 {
    u32::from_be_bytes([0, a, b, c])
}
801
/// ASCII case-folds a byte so indexing and lookups are case-insensitive;
/// non-ASCII bytes pass through untouched.
pub fn normalize_char(c: u8) -> u8 {
    if c.is_ascii_uppercase() {
        c + 32
    } else {
        c
    }
}
805
806pub fn extract_trigrams(content: &[u8]) -> Vec<(u32, u8, usize)> {
807 if content.len() < 3 {
808 return Vec::new();
809 }
810
811 let mut trigrams = Vec::with_capacity(content.len().saturating_sub(2));
812 for start in 0..=content.len() - 3 {
813 let trigram = pack_trigram(
814 normalize_char(content[start]),
815 normalize_char(content[start + 1]),
816 normalize_char(content[start + 2]),
817 );
818 let next_char = content.get(start + 3).copied().unwrap_or(EOF_SENTINEL);
819 trigrams.push((trigram, next_char, start));
820 }
821 trigrams
822}
823
824pub fn resolve_cache_dir(project_root: &Path) -> PathBuf {
825 let home = std::env::var_os("HOME")
826 .map(PathBuf::from)
827 .unwrap_or_else(|| PathBuf::from("."));
828 home.join(".cache")
829 .join("aft")
830 .join("index")
831 .join(project_cache_key(project_root))
832}
833
834pub(crate) fn build_path_filters(
835 include: &[String],
836 exclude: &[String],
837) -> Result<PathFilters, String> {
838 Ok(PathFilters {
839 includes: build_globset(include)?,
840 excludes: build_globset(exclude)?,
841 })
842}
843
/// Walks `root` with globs in `filters` interpreted relative to `root` itself.
pub(crate) fn walk_project_files(root: &Path, filters: &PathFilters) -> Vec<PathBuf> {
    walk_project_files_from(root, root, filters)
}
847
/// Recursively lists regular files under `search_root`, honoring
/// gitignore-style rules, pruning well-known build/vendor directories, and
/// applying `filters` relative to `filter_root`. Hidden files are included.
/// Results come back newest-modified first.
pub(crate) fn walk_project_files_from(
    filter_root: &Path,
    search_root: &Path,
    filters: &PathFilters,
) -> Vec<PathBuf> {
    let mut builder = WalkBuilder::new(search_root);
    builder
        .hidden(false)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .filter_entry(|entry| {
            let name = entry.file_name().to_string_lossy();
            // Prune heavyweight directories the index never wants to visit.
            if entry.file_type().map_or(false, |ft| ft.is_dir()) {
                return !matches!(
                    name.as_ref(),
                    "node_modules"
                        | "target"
                        | "venv"
                        | ".venv"
                        | ".git"
                        | "__pycache__"
                        | ".tox"
                        | "dist"
                        | "build"
                );
            }
            true
        });

    let mut files = Vec::new();
    for entry in builder.build().filter_map(|entry| entry.ok()) {
        // Keep regular files only; directories and stat failures are skipped.
        if !entry
            .file_type()
            .map_or(false, |file_type| file_type.is_file())
        {
            continue;
        }
        let path = entry.into_path();
        if filters.matches(filter_root, &path) {
            files.push(path);
        }
    }

    sort_paths_by_mtime_desc(&mut files);
    files
}
895
896pub(crate) fn read_searchable_text(path: &Path) -> Option<String> {
897 let bytes = fs::read(path).ok()?;
898 if is_binary_bytes(&bytes) {
899 return None;
900 }
901 String::from_utf8(bytes).ok()
902}
903
/// Returns `path` relative to `root`, or `path` unchanged when it does not
/// live under `root`.
pub(crate) fn relative_to_root(root: &Path, path: &Path) -> PathBuf {
    match path.strip_prefix(root) {
        Ok(relative) => relative.to_path_buf(),
        Err(_) => path.to_path_buf(),
    }
}
909
910pub(crate) fn sort_paths_by_mtime_desc(paths: &mut [PathBuf]) {
911 paths.sort_by(|left, right| {
912 path_modified_time(right)
913 .cmp(&path_modified_time(left))
914 .then_with(|| left.cmp(right))
915 });
916}
917
918pub(crate) fn sort_grep_matches_by_mtime_desc(matches: &mut [GrepMatch], project_root: &Path) {
919 matches.sort_by(|left, right| {
920 let left_path = resolve_match_path(project_root, &left.file);
921 let right_path = resolve_match_path(project_root, &right.file);
922
923 path_modified_time(&right_path)
924 .cmp(&path_modified_time(&left_path))
925 .then_with(|| left.file.cmp(&right.file))
926 .then_with(|| left.line.cmp(&right.line))
927 .then_with(|| left.column.cmp(&right.column))
928 });
929}
930
931pub(crate) fn resolve_search_scope(project_root: &Path, path: Option<&str>) -> SearchScope {
932 let resolved_project_root = canonicalize_or_normalize(project_root);
933 let root = match path {
934 Some(path) => {
935 let path = PathBuf::from(path);
936 if path.is_absolute() {
937 canonicalize_or_normalize(&path)
938 } else {
939 normalize_path(&resolved_project_root.join(path))
940 }
941 }
942 None => resolved_project_root.clone(),
943 };
944
945 let use_index = is_within_search_root(&resolved_project_root, &root);
946 SearchScope { root, use_index }
947}
948
/// Heuristic binary sniff over raw bytes via the `content_inspector` crate.
pub(crate) fn is_binary_bytes(content: &[u8]) -> bool {
    content_inspector::inspect(content).is_binary()
}
952
/// HEAD commit of the repo at `root` via `git rev-parse HEAD`.
/// (`run_git` is defined elsewhere — presumably returns `None` on failure.)
pub(crate) fn current_git_head(root: &Path) -> Option<String> {
    run_git(root, &["rev-parse", "HEAD"])
}
956
/// Derives a 16-hex-character cache key for a project: SHA-256 over the
/// canonical root path plus, when available, the repository's root commit,
/// keeping keys distinct for unrelated checkouts at the same path.
pub(crate) fn project_cache_key(project_root: &Path) -> String {
    use sha2::{Digest, Sha256};

    let canonical_root = canonicalize_or_normalize(project_root);
    let mut hasher = Sha256::new();
    hasher.update(canonical_root.to_string_lossy().as_bytes());
    // The first (parentless) commit is stable across the repo's history.
    if let Some(root_commit) = run_git(project_root, &["rev-list", "--max-parents=0", "HEAD"]) {
        hasher.update(root_commit.as_bytes());
    }
    let digest = format!("{:x}", hasher.finalize());
    digest[..16].to_string()
}
969
970impl PathFilters {
971 fn matches(&self, root: &Path, path: &Path) -> bool {
972 let relative = to_glob_path(&relative_to_root(root, path));
973 if self
974 .includes
975 .as_ref()
976 .is_some_and(|includes| !includes.is_match(&relative))
977 {
978 return false;
979 }
980 if self
981 .excludes
982 .as_ref()
983 .is_some_and(|excludes| excludes.is_match(&relative))
984 {
985 return false;
986 }
987 true
988 }
989}
990
/// Canonicalizes `path`, falling back to pure lexical normalization when
/// canonicalization fails (e.g. the path does not exist).
fn canonicalize_or_normalize(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| normalize_path(path))
}
994
/// Absolute match paths are used as-is; relative ones resolve against the
/// project root.
fn resolve_match_path(project_root: &Path, path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    project_root.join(path)
}
1002
/// Last-modified time of `path`, or `None` when metadata is unavailable.
fn path_modified_time(path: &Path) -> Option<SystemTime> {
    let metadata = fs::metadata(path).ok()?;
    metadata.modified().ok()
}
1008
/// Lexically normalizes `path`: `.` components are dropped and `..`
/// removes the preceding component when one exists (leading `..` survive).
/// Works purely on the path text — no filesystem access, no symlink logic.
fn normalize_path(path: &Path) -> PathBuf {
    path.components().fold(PathBuf::new(), |mut acc, component| {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                // Nothing to pop (e.g. a leading `..`): keep it literally.
                if !acc.pop() {
                    acc.push(component);
                }
            }
            other => acc.push(other),
        }
        acc
    })
}
1024
/// True when `path` equals `search_root` or is nested beneath it
/// (component-wise prefix test; no filesystem access).
fn is_within_search_root(search_root: &Path, path: &Path) -> bool {
    path.starts_with(search_root)
}
1028
1029impl QueryBuild {
1030 fn into_query(self) -> RegexQuery {
1031 let mut query = RegexQuery::default();
1032
1033 for run in self.and_runs {
1034 add_run_to_and_query(&mut query, &run);
1035 }
1036
1037 for group in self.or_groups {
1038 let mut trigrams = BTreeSet::new();
1039 let mut filters = HashMap::new();
1040 for run in group {
1041 for (trigram, filter) in trigram_filters(&run) {
1042 trigrams.insert(trigram);
1043 merge_filter(filters.entry(trigram).or_default(), filter);
1044 }
1045 }
1046 if !trigrams.is_empty() {
1047 query.or_groups.push(trigrams.into_iter().collect());
1048 query.or_filters.push(filters);
1049 }
1050 }
1051
1052 query
1053 }
1054}
1055
/// Walks a regex HIR collecting literal runs that any match must contain.
///
/// Only constructs that *guarantee* their literals appear contribute:
/// literals of >= 3 bytes (shorter ones yield no trigram), concatenations
/// (requirements add up), capture groups, repetitions with `min >= 1`, and
/// alternations where every branch yields runs (emitted as one OR group).
/// Everything optional or non-literal contributes nothing, keeping the
/// query conservative (never excluding a real match).
fn build_query(hir: &Hir) -> QueryBuild {
    match hir.kind() {
        HirKind::Literal(literal) => {
            if literal.0.len() >= 3 {
                QueryBuild {
                    and_runs: vec![literal.0.to_vec()],
                    or_groups: Vec::new(),
                }
            } else {
                QueryBuild::default()
            }
        }
        HirKind::Capture(capture) => build_query(&capture.sub),
        HirKind::Concat(parts) => {
            // Every part must match, so all parts' requirements accumulate.
            let mut build = QueryBuild::default();
            for part in parts {
                let part_build = build_query(part);
                build.and_runs.extend(part_build.and_runs);
                build.or_groups.extend(part_build.or_groups);
            }
            build
        }
        HirKind::Alternation(parts) => {
            // Usable only when every branch guarantees some literal run —
            // otherwise a match could take a branch with none of them.
            let mut group = Vec::new();
            for part in parts {
                let Some(mut choices) = guaranteed_run_choices(part) else {
                    return QueryBuild::default();
                };
                group.append(&mut choices);
            }
            if group.is_empty() {
                QueryBuild::default()
            } else {
                QueryBuild {
                    and_runs: Vec::new(),
                    or_groups: vec![group],
                }
            }
        }
        HirKind::Repetition(repetition) => {
            // `min == 0` means the sub-pattern may be entirely absent.
            if repetition.min == 0 {
                QueryBuild::default()
            } else {
                build_query(&repetition.sub)
            }
        }
        // Classes, lookarounds and empty matches guarantee no literal bytes.
        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => QueryBuild::default(),
    }
}
1105
/// Returns literal runs such that any match of `hir` is guaranteed to
/// contain at least one of them, or `None` when no such set exists.
///
/// Concatenation unions the runs of whichever parts provide them (a
/// superset of choices is still sound); alternation requires *every*
/// branch to provide runs, since a match only has to satisfy one branch.
fn guaranteed_run_choices(hir: &Hir) -> Option<Vec<Vec<u8>>> {
    match hir.kind() {
        HirKind::Literal(literal) => {
            // Literals shorter than 3 bytes produce no trigrams.
            if literal.0.len() >= 3 {
                Some(vec![literal.0.to_vec()])
            } else {
                None
            }
        }
        HirKind::Capture(capture) => guaranteed_run_choices(&capture.sub),
        HirKind::Concat(parts) => {
            // Collect from the parts that have runs; any part's run still
            // appears in every match of the concatenation.
            let mut runs = Vec::new();
            for part in parts {
                if let Some(mut part_runs) = guaranteed_run_choices(part) {
                    runs.append(&mut part_runs);
                }
            }
            if runs.is_empty() {
                None
            } else {
                Some(runs)
            }
        }
        HirKind::Alternation(parts) => {
            // One branch without runs breaks the guarantee for the whole
            // alternation.
            let mut runs = Vec::new();
            for part in parts {
                let Some(mut part_runs) = guaranteed_run_choices(part) else {
                    return None;
                };
                runs.append(&mut part_runs);
            }
            if runs.is_empty() {
                None
            } else {
                Some(runs)
            }
        }
        HirKind::Repetition(repetition) => {
            // Optional (`min == 0`) sub-patterns guarantee nothing.
            if repetition.min == 0 {
                None
            } else {
                guaranteed_run_choices(&repetition.sub)
            }
        }
        HirKind::Empty | HirKind::Class(_) | HirKind::Look(_) => None,
    }
}
1153
1154fn add_run_to_and_query(query: &mut RegexQuery, run: &[u8]) {
1155 for (trigram, filter) in trigram_filters(run) {
1156 if !query.and_trigrams.contains(&trigram) {
1157 query.and_trigrams.push(trigram);
1158 }
1159 merge_filter(query.and_filters.entry(trigram).or_default(), filter);
1160 }
1161}
1162
1163fn trigram_filters(run: &[u8]) -> Vec<(u32, PostingFilter)> {
1164 let mut filters: BTreeMap<u32, PostingFilter> = BTreeMap::new();
1165 for (trigram, next_char, position) in extract_trigrams(run) {
1166 let entry: &mut PostingFilter = filters.entry(trigram).or_default();
1167 if next_char != EOF_SENTINEL {
1168 entry.next_mask |= mask_for_next_char(next_char);
1169 }
1170 entry.loc_mask |= mask_for_position(position);
1171 }
1172 filters.into_iter().collect()
1173}
1174
1175fn merge_filter(target: &mut PostingFilter, filter: PostingFilter) {
1176 target.next_mask |= filter.next_mask;
1177 target.loc_mask |= filter.loc_mask;
1178}
1179
1180fn mask_for_next_char(next_char: u8) -> u8 {
1181 let bit = (normalize_char(next_char).wrapping_mul(31) & 7) as u32;
1182 1u8 << bit
1183}
1184
/// Maps a trigram's byte offset within its run onto one of eight bloom bits
/// (offset modulo 8; `& 7` is identical for an unsigned value).
fn mask_for_position(position: usize) -> u8 {
    1u8 << (position & 7)
}
1188
1189fn build_globset(patterns: &[String]) -> Result<Option<GlobSet>, String> {
1190 if patterns.is_empty() {
1191 return Ok(None);
1192 }
1193
1194 let mut builder = GlobSetBuilder::new();
1195 for pattern in patterns {
1196 let glob = Glob::new(pattern).map_err(|error| error.to_string())?;
1197 builder.add(glob);
1198 }
1199 builder.build().map(Some).map_err(|error| error.to_string())
1200}
1201
/// Reads exactly four bytes from `reader` and decodes them as a
/// little-endian `u32`, propagating any I/O error (including early EOF).
fn read_u32<R: Read>(reader: &mut R) -> std::io::Result<u32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes)?;
    Ok(u32::from_le_bytes(bytes))
}
1207
/// Reads exactly eight bytes from `reader` and decodes them as a
/// little-endian `u64`, propagating any I/O error (including early EOF).
fn read_u64<R: Read>(reader: &mut R) -> std::io::Result<u64> {
    let mut bytes = [0u8; 8];
    reader.read_exact(&mut bytes)?;
    Ok(u64::from_le_bytes(bytes))
}
1213
/// Writes `value` to `writer` as four little-endian bytes.
fn write_u32<W: Write>(writer: &mut W, value: u32) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1217
/// Writes `value` to `writer` as eight little-endian bytes.
fn write_u64<W: Write>(writer: &mut W, value: u64) -> std::io::Result<()> {
    let bytes = value.to_le_bytes();
    writer.write_all(&bytes)
}
1221
/// Runs `git <args>` against the repository at `root` (via `git -C`) and
/// returns its trimmed stdout.
///
/// Returns `None` when git cannot be spawned, exits non-zero, produces
/// non-UTF-8 output, or prints nothing but whitespace.
fn run_git(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    String::from_utf8(output.stdout)
        .ok()
        .map(|raw| raw.trim().to_string())
        .filter(|trimmed| !trimmed.is_empty())
}
1240
1241fn apply_git_diff_updates(index: &mut SearchIndex, root: &Path, from: &str, to: &str) -> bool {
1242 let diff_range = format!("{}..{}", from, to);
1243 let output = match Command::new("git")
1244 .arg("-C")
1245 .arg(root)
1246 .args(["diff", "--name-only", &diff_range])
1247 .output()
1248 {
1249 Ok(output) => output,
1250 Err(_) => return false,
1251 };
1252
1253 if !output.status.success() {
1254 return false;
1255 }
1256
1257 let Ok(paths) = String::from_utf8(output.stdout) else {
1258 return false;
1259 };
1260
1261 for relative_path in paths.lines().map(str::trim).filter(|path| !path.is_empty()) {
1262 let path = root.join(relative_path);
1263 if path.exists() {
1264 index.update_file(&path);
1265 } else {
1266 index.remove_file(&path);
1267 }
1268 }
1269
1270 true
1271}
1272
1273fn is_binary_path(path: &Path, size: u64) -> bool {
1274 if size == 0 {
1275 return false;
1276 }
1277
1278 let mut file = match File::open(path) {
1279 Ok(file) => file,
1280 Err(_) => return true,
1281 };
1282
1283 let mut preview = vec![0u8; PREVIEW_BYTES.min(size as usize)];
1284 match file.read(&mut preview) {
1285 Ok(read) => is_binary_bytes(&preview[..read]),
1286 Err(_) => true,
1287 }
1288}
1289
/// Returns the byte offset of the start of every line in `content`:
/// always offset 0, plus the offset just past each `\n`.
fn line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            content
                .bytes()
                .enumerate()
                .filter(|&(_, byte)| byte == b'\n')
                .map(|(index, _)| index + 1),
        )
        .collect()
}
1299
/// Resolves a byte `offset` in `content` to a 1-based (line, column) pair
/// plus the text of that line (trailing `\r` stripped, `\n` excluded).
///
/// `line_starts` must be the output of `line_starts(content)`; the column is
/// counted in chars, not bytes.
fn line_details(content: &str, line_starts: &[usize], offset: usize) -> (u32, u32, String) {
    // An exact hit is the line start itself; otherwise the offset lies within
    // the line that begins at the previous entry.
    let line_index = line_starts
        .binary_search(&offset)
        .unwrap_or_else(|insertion| insertion.saturating_sub(1));
    let start = line_starts.get(line_index).copied().unwrap_or(0);
    let end = match content[start..].find('\n') {
        Some(length) => start + length,
        None => content.len(),
    };
    let text = content[start..end].trim_end_matches('\r').to_string();
    let column = 1 + content[start..offset].chars().count() as u32;
    (line_index as u32 + 1, column, text)
}
1316
/// Renders `path` as a forward-slash string so it can be matched against
/// glob patterns regardless of the platform's path separator.
fn to_glob_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}
1320
#[cfg(test)]
mod tests {
    // Unit tests for the trigram index: extraction, regex decomposition,
    // candidate filtering, disk round-trips, cache keys, and the grep/glob
    // entry points. Most tests build real files in a tempdir because the
    // index reads file metadata and content from disk.
    use std::process::Command;

    use super::*;

    // Each trigram pairs with the byte that follows it (EOF_SENTINEL when
    // none) and its position in the run; input is case-normalized.
    #[test]
    fn extract_trigrams_tracks_next_char_and_position() {
        let trigrams = extract_trigrams(b"Rust");
        assert_eq!(trigrams.len(), 2);
        assert_eq!(trigrams[0], (pack_trigram(b'r', b'u', b's'), b't', 0));
        assert_eq!(
            trigrams[1],
            (pack_trigram(b'u', b's', b't'), EOF_SENTINEL, 1)
        );
    }

    // Literals outside the alternation become AND trigrams; the alternation's
    // branches end up together in a single OR group.
    #[test]
    fn decompose_regex_extracts_literals_and_alternations() {
        let query = decompose_regex("abc(def|ghi)xyz");
        assert!(query.and_trigrams.contains(&pack_trigram(b'a', b'b', b'c')));
        assert!(query.and_trigrams.contains(&pack_trigram(b'x', b'y', b'z')));
        assert_eq!(query.or_groups.len(), 1);
        assert!(query.or_groups[0].contains(&pack_trigram(b'd', b'e', b'f')));
        assert!(query.or_groups[0].contains(&pack_trigram(b'g', b'h', b'i')));
    }

    // A file must contain ALL and_trigrams to be a candidate: "abcxyz" lacks
    // "def" and is filtered out, leaving only alpha.txt.
    #[test]
    fn candidates_intersect_posting_lists() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let alpha = dir.path().join("alpha.txt");
        let beta = dir.path().join("beta.txt");
        fs::write(&alpha, "abcdef").expect("write alpha");
        fs::write(&beta, "abcxyz").expect("write beta");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&alpha, b"abcdef");
        index.index_file(&beta, b"abcxyz");

        let query = RegexQuery {
            and_trigrams: vec![
                pack_trigram(b'a', b'b', b'c'),
                pack_trigram(b'd', b'e', b'f'),
            ],
            ..RegexQuery::default()
        };

        let candidates = index.candidates(&query);
        assert_eq!(candidates.len(), 1);
        assert_eq!(index.files[candidates[0] as usize].path, alpha);
    }

    // The next-char/position bloom masks prune postings: a filter expecting
    // 'z' after "abc" must reject a file that only has "abcd".
    #[test]
    fn candidates_apply_bloom_filters() {
        let mut index = SearchIndex::new();
        let dir = tempfile::tempdir().expect("create temp dir");
        let file = dir.path().join("sample.txt");
        fs::write(&file, "abcd efgh").expect("write sample");
        index.project_root = dir.path().to_path_buf();
        index.index_file(&file, b"abcd efgh");

        let trigram = pack_trigram(b'a', b'b', b'c');
        let matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'd'),
            loc_mask: mask_for_position(0),
        };
        let non_matching_filter = PostingFilter {
            next_mask: mask_for_next_char(b'z'),
            loc_mask: mask_for_position(0),
        };

        assert_eq!(
            index
                .postings_for_trigram(trigram, Some(matching_filter))
                .len(),
            1
        );
        assert!(index
            .postings_for_trigram(trigram, Some(non_matching_filter))
            .is_empty());
    }

    // Write the index to a cache dir and reload it: git head, file table
    // (paths relative to the project root), and postings must survive.
    #[test]
    fn disk_round_trip_preserves_postings_and_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        let file = project.join("src.txt");
        fs::write(&file, "abcdef").expect("write source");

        let mut index = SearchIndex::build(&project);
        index.git_head = Some("deadbeef".to_string());
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, index.git_head.as_deref());

        let loaded = SearchIndex::read_from_disk(&cache_dir).expect("load index from disk");
        assert_eq!(loaded.stored_git_head(), Some("deadbeef"));
        assert_eq!(loaded.files.len(), 1);
        assert_eq!(
            relative_to_root(&loaded.project_root, &loaded.files[0].path),
            PathBuf::from("src.txt")
        );
        assert_eq!(loaded.postings.len(), index.postings.len());
        assert!(loaded
            .postings
            .contains_key(&pack_trigram(b'a', b'b', b'c')));
    }

    // Writes go through *.tmp files that must be renamed into place and not
    // left behind in the cache dir.
    #[test]
    fn write_to_disk_uses_temp_files_and_cleans_them_up() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        fs::create_dir_all(&project).expect("create project dir");
        fs::write(project.join("src.txt"), "abcdef").expect("write source");

        let index = SearchIndex::build(&project);
        let cache_dir = dir.path().join("cache");
        index.write_to_disk(&cache_dir, None);

        assert!(cache_dir.join("postings.bin").is_file());
        assert!(cache_dir.join("lookup.bin").is_file());
        assert!(!cache_dir.join("postings.bin.tmp").exists());
        assert!(!cache_dir.join("lookup.bin.tmp").exists());
    }

    // Two checkouts with identical content must get distinct cache keys, so
    // the key has to incorporate the checkout's path, not just its contents.
    #[test]
    fn project_cache_key_includes_checkout_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let source = dir.path().join("source");
        fs::create_dir_all(&source).expect("create source repo dir");
        fs::write(source.join("tracked.txt"), "content\n").expect("write tracked file");

        assert!(Command::new("git")
            .current_dir(&source)
            .args(["init"])
            .status()
            .expect("init git repo")
            .success());
        assert!(Command::new("git")
            .current_dir(&source)
            .args(["add", "."])
            .status()
            .expect("git add")
            .success());
        // Identity set inline via -c so the test works without global config.
        assert!(Command::new("git")
            .current_dir(&source)
            .args([
                "-c",
                "user.name=AFT Tests",
                "-c",
                "user.email=aft-tests@example.com",
                "commit",
                "-m",
                "initial",
            ])
            .status()
            .expect("git commit")
            .success());

        let clone = dir.path().join("clone");
        assert!(Command::new("git")
            .args(["clone", "--quiet"])
            .arg(&source)
            .arg(&clone)
            .status()
            .expect("git clone")
            .success());

        let source_key = project_cache_key(&source);
        let clone_key = project_cache_key(&clone);

        assert_eq!(source_key.len(), 16);
        assert_eq!(clone_key.len(), 16);
        assert_ne!(source_key, clone_key);
    }

    // Searching a path outside the project root must fall back to unindexed
    // scanning (use_index == false) rooted at the canonicalized target.
    #[test]
    fn resolve_search_scope_disables_index_for_external_path() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let outside = dir.path().join("outside");
        fs::create_dir_all(&project).expect("create project dir");
        fs::create_dir_all(&outside).expect("create outside dir");

        let scope = resolve_search_scope(&project, outside.to_str());

        assert_eq!(
            scope.root,
            fs::canonicalize(&outside).expect("canonicalize outside")
        );
        assert!(!scope.use_index);
    }

    // A grep scoped to src/ must not report the docs/ hit even though both
    // files contain the pattern.
    #[test]
    fn grep_filters_matches_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let docs = project.join("docs");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&docs).expect("create docs dir");
        fs::write(src.join("main.rs"), "pub struct SearchIndex;\n").expect("write src file");
        fs::write(docs.join("guide.md"), "SearchIndex guide\n").expect("write docs file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.files_searched, 1);
        assert_eq!(result.files_with_matches, 1);
        assert_eq!(result.matches.len(), 1);
        assert_eq!(result.matches[0].file, PathBuf::from("src/main.rs"));
    }

    // Two hits on the same line collapse into a single reported match.
    #[test]
    fn grep_deduplicates_multiple_matches_on_same_line() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex SearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 10);

        assert_eq!(result.total_matches, 1);
        assert_eq!(result.matches.len(), 1);
    }

    // With max_results == 1, total_matches still counts both hits and the
    // result is flagged as truncated.
    #[test]
    fn grep_reports_total_matches_before_truncation() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        fs::create_dir_all(&src).expect("create src dir");
        fs::write(src.join("main.rs"), "SearchIndex\nSearchIndex\n").expect("write src file");

        let index = SearchIndex::build(&project);
        let result = index.search_grep("SearchIndex", true, &[], &[], &src, 1);

        assert_eq!(result.total_matches, 2);
        assert!(result.truncated);
        assert_eq!(result.matches.len(), 1);
    }

    // glob("**/*.rs") rooted at src/ must exclude matching files under
    // sibling directories.
    #[test]
    fn glob_filters_results_to_search_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let src = project.join("src");
        let scripts = project.join("scripts");
        fs::create_dir_all(&src).expect("create src dir");
        fs::create_dir_all(&scripts).expect("create scripts dir");
        fs::write(src.join("main.rs"), "pub fn main() {}\n").expect("write src file");
        fs::write(scripts.join("tool.rs"), "pub fn tool() {}\n").expect("write scripts file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.rs", &src);

        assert_eq!(
            files,
            vec![fs::canonicalize(src.join("main.rs")).expect("canonicalize src file")]
        );
    }

    // Unlike grep, glob must surface dotfile directories and binary content.
    #[test]
    fn glob_includes_hidden_and_binary_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = dir.path().join("project");
        let hidden_dir = project.join(".hidden");
        fs::create_dir_all(&hidden_dir).expect("create hidden dir");
        let hidden_file = hidden_dir.join("data.bin");
        fs::write(&hidden_file, [0u8, 159, 146, 150]).expect("write binary file");

        let index = SearchIndex::build(&project);
        let files = index.glob("**/*.bin", &project);

        assert_eq!(
            files,
            vec![fs::canonicalize(hidden_file).expect("canonicalize binary file")]
        );
    }
}