1use std::collections::{HashMap, HashSet};
27use std::path::{Path, PathBuf};
28use std::sync::{Arc, Mutex, OnceLock};
29use std::time::{Duration, Instant};
30
31use ignore::WalkBuilder;
32
33use crate::tools::ctx_search::{is_binary_ext, is_generated_file, MAX_FILE_SIZE, MAX_WALK_DEPTH};
34
35const TTL: Duration = Duration::from_secs(15);
38
39const MAX_FILES: usize = 200_000;
41
42const MAX_POSTING_ENTRIES: usize = 12_000_000;
47
48const MAX_TOTAL_ENTRIES: usize = 48_000_000;
52
53const BLOOM_BITS_PER_ITEM: usize = 12;
57const BLOOM_K: usize = 7;
58const BLOOM_MIN_BITS: usize = 64;
60const BLOOM_MAX_BITS: usize = 1 << 20;
61
62fn is_word_byte(b: u8) -> bool {
64 b.is_ascii_alphanumeric() || b == b'_'
65}
66
67fn pack(b0: u8, b1: u8, b2: u8) -> u32 {
68 (u32::from(b0) << 16) | (u32::from(b1) << 8) | u32::from(b2)
69}
70
71enum Narrowing {
80 Postings(HashMap<u32, Vec<u32>>),
81 Blooms(Vec<FileBloom>),
82}
83
84struct FileBloom {
87 bits: Vec<u64>,
89}
90
91#[inline]
94fn mix64(mut x: u64) -> u64 {
95 x = x.wrapping_add(0x9E37_79B9_7F4A_7C15);
96 x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
97 x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
98 x ^ (x >> 31)
99}
100
101impl FileBloom {
102 fn with_capacity(distinct_trigrams: usize) -> Self {
103 let target = distinct_trigrams
104 .saturating_mul(BLOOM_BITS_PER_ITEM)
105 .next_power_of_two()
106 .clamp(BLOOM_MIN_BITS, BLOOM_MAX_BITS);
107 FileBloom {
108 bits: vec![0u64; target / 64],
109 }
110 }
111
112 #[inline]
113 fn m_bits(&self) -> usize {
114 self.bits.len() * 64
115 }
116
117 #[inline]
119 fn probes(&self, trigram: u32) -> impl Iterator<Item = usize> + '_ {
120 let m = self.m_bits();
121 let mask = m - 1; let h = mix64(u64::from(trigram));
123 let h1 = (h & 0xFFFF_FFFF) as usize;
124 let h2 = ((h >> 32) as usize) | 1; (0..BLOOM_K).map(move |i| h1.wrapping_add(i.wrapping_mul(h2)) & mask)
126 }
127
128 fn insert(&mut self, trigram: u32) {
129 for p in self.probes(trigram).collect::<Vec<_>>() {
130 self.bits[p / 64] |= 1u64 << (p % 64);
131 }
132 }
133
134 fn maybe_contains(&self, trigram: u32) -> bool {
135 self.probes(trigram)
136 .all(|p| self.bits[p / 64] & (1u64 << (p % 64)) != 0)
137 }
138}
139
140pub struct SearchIndex {
142 files: Vec<PathBuf>,
143 narrowing: Narrowing,
145 respect_gitignore: bool,
146 allow_secret_paths: bool,
147 built_at: Instant,
148}
149
150impl SearchIndex {
151 pub fn build(root: &str, respect_gitignore: bool, allow_secret_paths: bool) -> Option<Self> {
154 let root_path = Path::new(root);
155 if !root_path.exists() {
156 return None;
157 }
158 if !crate::core::graph_index::is_safe_scan_root_public(root) {
163 return None;
164 }
165
166 let walker = WalkBuilder::new(root_path)
167 .hidden(true)
168 .max_depth(Some(MAX_WALK_DEPTH))
169 .git_ignore(respect_gitignore)
170 .git_global(respect_gitignore)
171 .git_exclude(respect_gitignore)
172 .filter_entry(crate::core::cloud_files::keep_entry)
173 .build();
174
175 let mut files: Vec<PathBuf> = Vec::new();
176 let mut per_file_trigrams: Vec<Vec<u32>> = Vec::new();
180 let mut total_entries: usize = 0;
181 let mut scratch: HashSet<u32> = HashSet::new();
182
183 for entry in walker.filter_map(std::result::Result::ok) {
184 if entry.file_type().is_none_or(|ft| ft.is_dir()) {
185 continue;
186 }
187 if entry.file_type().is_some_and(|ft| ft.is_symlink()) {
188 continue;
189 }
190 let path = entry.path();
191 if is_binary_ext(path) || is_generated_file(path) {
192 continue;
193 }
194 if !allow_secret_paths && crate::core::io_boundary::is_secret_like(path).is_some() {
195 continue;
196 }
197 let state = match std::fs::metadata(path) {
202 Ok(meta) if !meta.file_type().is_file() => continue,
203 Ok(meta) if meta.len() > MAX_FILE_SIZE => continue,
204 Ok(meta) => crate::core::content_cache::FileState::from_metadata(&meta),
205 Err(_) => continue,
206 };
207 let content: std::sync::Arc<str> = if let Some(cached) =
213 state.and_then(|s| crate::core::content_cache::get(path, s))
214 {
215 cached
216 } else {
217 let Ok(text) = std::fs::read_to_string(path) else {
218 continue;
219 };
220 let arc: std::sync::Arc<str> = std::sync::Arc::from(text);
221 if let Some(s) = state {
222 crate::core::content_cache::insert(path, s, std::sync::Arc::clone(&arc));
223 }
224 arc
225 };
226
227 if files.len() >= MAX_FILES {
228 return None; }
230
231 scratch.clear();
232 let bytes = content.as_bytes();
233 if bytes.len() >= 3 {
234 for w in bytes.windows(3) {
235 if is_word_byte(w[0]) && is_word_byte(w[1]) && is_word_byte(w[2]) {
236 scratch.insert(pack(w[0], w[1], w[2]));
237 }
238 }
239 }
240 total_entries += scratch.len();
241 if total_entries > MAX_TOTAL_ENTRIES {
242 return None; }
244 let mut tris: Vec<u32> = scratch.iter().copied().collect();
245 tris.sort_unstable();
246 files.push(path.to_path_buf());
247 per_file_trigrams.push(tris);
248 }
249
250 let narrowing = build_narrowing(&per_file_trigrams, total_entries);
251
252 Some(Self {
253 files,
254 narrowing,
255 respect_gitignore,
256 allow_secret_paths,
257 built_at: Instant::now(),
258 })
259 }
260
261 fn is_fresh(&self) -> bool {
262 self.built_at.elapsed() < TTL
263 }
264
265 fn config_matches(&self, respect_gitignore: bool, allow_secret_paths: bool) -> bool {
266 self.respect_gitignore == respect_gitignore && self.allow_secret_paths == allow_secret_paths
267 }
268
269 pub fn candidate_paths(&self, pattern: &str, ext: Option<&str>) -> CandidateSet {
278 if let Some(ids) = self.literal_candidates(pattern) {
279 let paths = ids
280 .into_iter()
281 .map(|id| self.files[id as usize].clone())
282 .filter(|p| ext_matches(p, ext))
283 .collect();
284 CandidateSet::Narrowed(paths)
285 } else {
286 let paths = self
287 .files
288 .iter()
289 .filter(|p| ext_matches(p, ext))
290 .cloned()
291 .collect();
292 CandidateSet::FullList(paths)
293 }
294 }
295
296 fn literal_candidates(&self, pattern: &str) -> Option<Vec<u32>> {
300 let bytes = pattern.as_bytes();
301 if bytes.len() < 3 || !bytes.iter().all(|&b| is_word_byte(b)) {
302 return None;
303 }
304 let mut tris: Vec<u32> = bytes.windows(3).map(|w| pack(w[0], w[1], w[2])).collect();
306 tris.sort_unstable();
307 tris.dedup();
308
309 match &self.narrowing {
310 Narrowing::Postings(trigrams) => Some(Self::postings_intersect(trigrams, &tris)),
311 Narrowing::Blooms(blooms) => Some(Self::bloom_scan(blooms, &tris)),
312 }
313 }
314
315 fn postings_intersect(trigrams: &HashMap<u32, Vec<u32>>, tris: &[u32]) -> Vec<u32> {
318 let mut lists: Vec<&Vec<u32>> = Vec::with_capacity(tris.len());
319 for &tri in tris {
320 match trigrams.get(&tri) {
321 None => return Vec::new(),
323 Some(list) => lists.push(list),
324 }
325 }
326 lists.sort_by_key(|l| l.len());
327
328 let mut acc: Vec<u32> = lists[0].clone();
329 for list in &lists[1..] {
330 acc = intersect_sorted(&acc, list);
331 if acc.is_empty() {
332 break;
333 }
334 }
335 acc
336 }
337
338 fn bloom_scan(blooms: &[FileBloom], tris: &[u32]) -> Vec<u32> {
342 let mut out = Vec::new();
343 for (fid, bloom) in blooms.iter().enumerate() {
344 if tris.iter().all(|&t| bloom.maybe_contains(t)) {
345 out.push(fid as u32);
346 }
347 }
348 out
349 }
350}
351
352fn build_narrowing(per_file: &[Vec<u32>], total_entries: usize) -> Narrowing {
354 if total_entries <= MAX_POSTING_ENTRIES {
355 let mut trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
356 for (fid, tris) in per_file.iter().enumerate() {
357 for &t in tris {
358 trigrams.entry(t).or_default().push(fid as u32);
360 }
361 }
362 Narrowing::Postings(trigrams)
363 } else {
364 let blooms = per_file
365 .iter()
366 .map(|tris| {
367 let mut b = FileBloom::with_capacity(tris.len());
368 for &t in tris {
369 b.insert(t);
370 }
371 b
372 })
373 .collect();
374 Narrowing::Blooms(blooms)
375 }
376}
377
378pub enum CandidateSet {
380 Narrowed(Vec<PathBuf>),
382 FullList(Vec<PathBuf>),
384}
385
386impl CandidateSet {
387 pub fn into_paths(self) -> Vec<PathBuf> {
388 match self {
389 CandidateSet::Narrowed(p) | CandidateSet::FullList(p) => p,
390 }
391 }
392}
393
394fn ext_matches(path: &Path, ext: Option<&str>) -> bool {
395 match ext {
396 None => true,
397 Some(want) => path.extension().and_then(|e| e.to_str()) == Some(want),
398 }
399}
400
401fn intersect_sorted(a: &[u32], b: &[u32]) -> Vec<u32> {
403 let mut out = Vec::new();
404 let (mut i, mut j) = (0, 0);
405 while i < a.len() && j < b.len() {
406 match a[i].cmp(&b[j]) {
407 std::cmp::Ordering::Less => i += 1,
408 std::cmp::Ordering::Greater => j += 1,
409 std::cmp::Ordering::Equal => {
410 out.push(a[i]);
411 i += 1;
412 j += 1;
413 }
414 }
415 }
416 out
417}
418
419struct CacheEntry {
424 index: Option<Arc<SearchIndex>>,
425 building: bool,
426}
427
428static CACHE: OnceLock<Mutex<HashMap<String, CacheEntry>>> = OnceLock::new();
429
430fn cache() -> &'static Mutex<HashMap<String, CacheEntry>> {
431 CACHE.get_or_init(|| Mutex::new(HashMap::new()))
432}
433
434fn index_disabled() -> bool {
437 std::env::var("LEAN_CTX_DISABLE_SEARCH_INDEX")
438 .is_ok_and(|v| v == "1" || v.eq_ignore_ascii_case("true"))
439}
440
441pub fn get_fresh(
445 root: &str,
446 respect_gitignore: bool,
447 allow_secret_paths: bool,
448) -> Option<Arc<SearchIndex>> {
449 if !respect_gitignore || index_disabled() {
451 return None;
452 }
453
454 let mut needs_build = false;
455 let result = {
456 let mut map = cache()
457 .lock()
458 .unwrap_or_else(std::sync::PoisonError::into_inner);
459 let entry = map.entry(root.to_string()).or_insert(CacheEntry {
460 index: None,
461 building: false,
462 });
463 match &entry.index {
464 Some(idx)
465 if idx.config_matches(respect_gitignore, allow_secret_paths) && idx.is_fresh() =>
466 {
467 Some(Arc::clone(idx))
468 }
469 Some(idx) if idx.config_matches(respect_gitignore, allow_secret_paths) => {
470 needs_build = !entry.building;
472 if needs_build {
473 entry.building = true;
474 }
475 Some(Arc::clone(idx))
476 }
477 _ => {
478 needs_build = !entry.building;
479 if needs_build {
480 entry.building = true;
481 }
482 None
483 }
484 }
485 };
486
487 if needs_build {
488 spawn_build(root.to_string(), respect_gitignore, allow_secret_paths);
489 }
490 result
491}
492
493pub fn ensure_background(root: &str, respect_gitignore: bool, allow_secret_paths: bool) {
496 if !respect_gitignore || index_disabled() {
497 return;
498 }
499 let needs_build = {
500 let mut map = cache()
501 .lock()
502 .unwrap_or_else(std::sync::PoisonError::into_inner);
503 let entry = map.entry(root.to_string()).or_insert(CacheEntry {
504 index: None,
505 building: false,
506 });
507 let fresh = entry.index.as_ref().is_some_and(|idx| {
508 idx.config_matches(respect_gitignore, allow_secret_paths) && idx.is_fresh()
509 });
510 if fresh || entry.building {
511 false
512 } else {
513 entry.building = true;
514 true
515 }
516 };
517 if needs_build {
518 spawn_build(root.to_string(), respect_gitignore, allow_secret_paths);
519 }
520}
521
522pub fn warm_blocking(root: &str, respect_gitignore: bool, allow_secret_paths: bool) -> bool {
526 if !respect_gitignore || index_disabled() {
527 return false;
528 }
529 let Some(idx) = SearchIndex::build(root, respect_gitignore, allow_secret_paths) else {
530 return false;
531 };
532 let mut map = cache()
533 .lock()
534 .unwrap_or_else(std::sync::PoisonError::into_inner);
535 map.insert(
536 root.to_string(),
537 CacheEntry {
538 index: Some(Arc::new(idx)),
539 building: false,
540 },
541 );
542 true
543}
544
545fn spawn_build(root: String, respect_gitignore: bool, allow_secret_paths: bool) {
546 std::thread::spawn(move || {
547 let built = std::panic::catch_unwind(|| {
548 SearchIndex::build(&root, respect_gitignore, allow_secret_paths)
549 })
550 .ok()
551 .flatten();
552
553 let mut map = cache()
554 .lock()
555 .unwrap_or_else(std::sync::PoisonError::into_inner);
556 if let Some(entry) = map.get_mut(&root) {
557 entry.building = false;
558 if let Some(idx) = built {
559 entry.index = Some(Arc::new(idx));
560 }
561 }
562 });
563}
564
565#[cfg(test)]
566mod tests {
567 use super::*;
568
569 fn corpus() -> tempfile::TempDir {
570 let dir = tempfile::tempdir().unwrap();
571 std::fs::write(dir.path().join("a.rs"), "fn handler() {}\nlet x = 1;\n").unwrap();
572 std::fs::write(dir.path().join("b.rs"), "fn other() {}\n// nothing here\n").unwrap();
573 std::fs::write(dir.path().join("c.txt"), "handler appears in text too\n").unwrap();
574 dir
575 }
576
577 #[test]
578 fn build_refuses_to_index_home_directory() {
579 if let Some(home) = dirs::home_dir() {
582 assert!(
583 SearchIndex::build(&home.to_string_lossy(), true, false).is_none(),
584 "search index must never auto-build over the home directory"
585 );
586 }
587 }
588
589 #[test]
590 fn narrows_to_files_containing_literal() {
591 let dir = corpus();
592 let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
593 let cands = idx.candidate_paths("handler", None);
594 let paths = cands.into_paths();
595 assert!(paths.iter().any(|p| p.ends_with("a.rs")));
597 assert!(paths.iter().any(|p| p.ends_with("c.txt")));
598 assert!(!paths.iter().any(|p| p.ends_with("b.rs")));
599 }
600
601 #[test]
602 fn absent_trigram_yields_empty_candidates() {
603 let dir = corpus();
604 let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
605 match idx.candidate_paths("zzzqqq", None) {
606 CandidateSet::Narrowed(p) => assert!(p.is_empty()),
607 CandidateSet::FullList(_) => panic!("pure literal should narrow"),
608 }
609 }
610
611 #[test]
612 fn ext_filter_restricts_candidates() {
613 let dir = corpus();
614 let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
615 let paths = idx.candidate_paths("handler", Some("rs")).into_paths();
616 assert!(paths.iter().all(|p| p.extension().unwrap() == "rs"));
617 assert!(paths.iter().any(|p| p.ends_with("a.rs")));
618 }
619
620 #[test]
621 #[cfg(unix)]
622 fn build_skips_named_pipe_without_hanging() {
623 use std::sync::mpsc;
624 use std::time::Duration;
625 let dir = corpus();
629 let fifo = dir.path().join("pipe.fifo");
630 let c = std::ffi::CString::new(fifo.to_string_lossy().as_bytes()).unwrap();
631 assert_eq!(
632 unsafe { libc::mkfifo(c.as_ptr(), 0o644) },
635 0,
636 "mkfifo failed"
637 );
638
639 let root = dir.path().to_str().unwrap().to_string();
640 let (tx, rx) = mpsc::channel();
641 std::thread::spawn(move || {
642 let built = SearchIndex::build(&root, true, false);
643 let _ = tx.send(built.map(|idx| idx.candidate_paths("handler", None).into_paths()));
644 });
645 let paths = rx
646 .recv_timeout(Duration::from_secs(5))
647 .expect("SearchIndex::build hung on a FIFO (#336 regression)")
648 .expect("index should build");
649 assert!(paths.iter().any(|p| p.ends_with("a.rs")));
650 assert!(!paths.iter().any(|p| p.ends_with("pipe.fifo")));
651 }
652
653 #[test]
654 fn regex_query_falls_back_to_full_list() {
655 let dir = corpus();
656 let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
657 match idx.candidate_paths("fn .*\\(\\)", None) {
658 CandidateSet::FullList(p) => assert!(!p.is_empty()),
659 CandidateSet::Narrowed(_) => panic!("metachar query must not narrow"),
660 }
661 }
662
663 #[test]
664 fn short_query_falls_back_to_full_list() {
665 let dir = corpus();
666 let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
667 assert!(matches!(
668 idx.candidate_paths("fn", None),
669 CandidateSet::FullList(_)
670 ));
671 }
672
673 #[test]
677 fn narrowing_has_identical_recall_to_full_scan() {
678 use regex::Regex;
679 use std::collections::BTreeSet;
680
681 let dir = tempfile::tempdir().unwrap();
682 let samples = [
684 (
685 "auth/login.rs",
686 "fn authenticate(user) {}\nlet token = mint();\n",
687 ),
688 (
689 "auth/session.rs",
690 "struct Session;\n// authenticate again here\n",
691 ),
692 ("db/pool.rs", "fn connect() {}\nlet retries = 3;\n"),
693 (
694 "ui/button.tsx",
695 "export const Button = () => authenticate;\n",
696 ),
697 ("readme.md", "This project uses authenticate flows.\n"),
698 ("unrelated.rs", "fn helper() { let v = 1; }\n"),
699 ];
700 for (rel, content) in samples {
701 let p = dir.path().join(rel);
702 std::fs::create_dir_all(p.parent().unwrap()).unwrap();
703 std::fs::write(p, content).unwrap();
704 }
705 let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
706
707 let full_scan = |pat: &str| -> BTreeSet<String> {
708 let re = Regex::new(pat).unwrap();
709 let mut hits = BTreeSet::new();
710 for (rel, content) in samples {
711 for (i, line) in content.lines().enumerate() {
712 if re.is_match(line) {
713 hits.insert(format!("{rel}:{}", i + 1));
714 }
715 }
716 }
717 hits
718 };
719
720 for query in ["authenticate", "Session", "retries", "token"] {
721 let re = Regex::new(query).unwrap();
722 let candidates = idx.candidate_paths(query, None).into_paths();
723 let mut narrowed = BTreeSet::new();
724 for path in &candidates {
725 let content = std::fs::read_to_string(path).unwrap();
726 let rel = path.strip_prefix(dir.path()).unwrap().to_string_lossy();
727 for (i, line) in content.lines().enumerate() {
728 if re.is_match(line) {
729 narrowed.insert(format!("{}:{}", rel.replace('\\', "/"), i + 1));
730 }
731 }
732 }
733 assert_eq!(
734 narrowed,
735 full_scan(query),
736 "recall mismatch for query {query:?}"
737 );
738 }
739 }
740
741 #[test]
742 fn intersect_sorted_basic() {
743 assert_eq!(
744 intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]),
745 vec![2, 3, 5]
746 );
747 assert_eq!(intersect_sorted(&[1, 2], &[3, 4]), Vec::<u32>::new());
748 }
749
750 fn trigrams_of(s: &str) -> Vec<u32> {
753 let mut set = HashSet::new();
754 let b = s.as_bytes();
755 if b.len() >= 3 {
756 for w in b.windows(3) {
757 if is_word_byte(w[0]) && is_word_byte(w[1]) && is_word_byte(w[2]) {
758 set.insert(pack(w[0], w[1], w[2]));
759 }
760 }
761 }
762 let mut v: Vec<u32> = set.into_iter().collect();
763 v.sort_unstable();
764 v
765 }
766
767 #[test]
768 fn file_bloom_has_no_false_negatives() {
769 let tris = trigrams_of("fn authenticate(user) { let token = mint(); }");
770 let mut bloom = FileBloom::with_capacity(tris.len());
771 for &t in &tris {
772 bloom.insert(t);
773 }
774 assert!(tris.iter().all(|&t| bloom.maybe_contains(t)));
776 }
777
778 #[test]
782 fn bloom_tier_is_superset_of_postings_tier() {
783 let mut seed = 0x1234_5678_9abc_def0u64;
785 let mut rng = || {
786 seed = seed
787 .wrapping_mul(6364136223846793005)
788 .wrapping_add(1442695040888963407);
789 (seed >> 33) as u32
790 };
791 let mut per_file: Vec<Vec<u32>> = Vec::new();
792 for _ in 0..80 {
793 let n = 50 + (rng() % 250) as usize;
794 let mut s = HashSet::new();
795 for _ in 0..n {
796 s.insert(rng() & 0x00FF_FFFF);
797 }
798 let mut v: Vec<u32> = s.into_iter().collect();
799 v.sort_unstable();
800 per_file.push(v);
801 }
802 let total: usize = per_file.iter().map(Vec::len).sum();
803
804 let postings = build_narrowing(&per_file, total); let blooms = build_narrowing(&per_file, MAX_POSTING_ENTRIES + 1); let (Narrowing::Postings(pt), Narrowing::Blooms(bl)) = (&postings, &blooms) else {
807 panic!("unexpected narrowing tiers");
808 };
809
810 for f in &per_file {
813 if f.len() < 3 {
814 continue;
815 }
816 let q = vec![f[0], f[f.len() / 2], f[f.len() - 1]];
817 let exact = SearchIndex::postings_intersect(pt, &q);
818 let bloom: HashSet<u32> = SearchIndex::bloom_scan(bl, &q).into_iter().collect();
819 for id in exact {
820 assert!(
821 bloom.contains(&id),
822 "Bloom tier dropped a true match (false negative) for {q:?}"
823 );
824 }
825 }
826 }
827
828 #[test]
831 fn bloom_tier_end_to_end_recall() {
832 let samples = [
833 (
834 "auth_login.rs",
835 "fn authenticate(user) {}\nlet token = mint();\n",
836 ),
837 (
838 "auth_session.rs",
839 "struct Session;\n// authenticate again here\n",
840 ),
841 ("db_pool.rs", "fn connect() {}\nlet retries = 3;\n"),
842 (
843 "ui_button.tsx",
844 "export const Button = () => authenticate;\n",
845 ),
846 ("readme.md", "This project uses authenticate flows.\n"),
847 ("unrelated.rs", "fn helper() { let v = 1; }\n"),
848 ];
849 let files: Vec<PathBuf> = samples.iter().map(|(rel, _)| PathBuf::from(rel)).collect();
850 let per_file: Vec<Vec<u32>> = samples.iter().map(|(_, c)| trigrams_of(c)).collect();
851
852 let idx = SearchIndex {
853 files,
854 narrowing: build_narrowing(&per_file, MAX_POSTING_ENTRIES + 1),
855 respect_gitignore: true,
856 allow_secret_paths: false,
857 built_at: Instant::now(),
858 };
859 assert!(
860 matches!(idx.narrowing, Narrowing::Blooms(_)),
861 "test must exercise the Bloom tier"
862 );
863
864 for query in ["authenticate", "Session", "retries", "token"] {
865 let cands: HashSet<String> = idx
866 .candidate_paths(query, None)
867 .into_paths()
868 .iter()
869 .map(|p| p.to_string_lossy().to_string())
870 .collect();
871 for (rel, content) in samples {
872 if content.contains(query) {
873 assert!(
874 cands.contains(rel),
875 "Bloom tier dropped real match {rel} for query {query:?}"
876 );
877 }
878 }
879 }
880 }
881}