Skip to main content

lean_ctx/core/
search_index.rs

1//! Resident line-search index for `ctx_search` (Phase 1 of the efficiency epic).
2//!
3//! Historically `ctx_search` walked the filesystem, read every file, and ran a
4//! regex on every line on *every* call — `O(files × lines)`. That is the
5//! 40–200 ms latency floor this module eliminates.
6//!
7//! This module keeps a RAM-resident trigram index (`trigram → file ids`) so the
8//! common case (an identifier / literal query) collapses to: intersect a few
9//! posting lists in memory → read & regex-verify only the handful of candidate
10//! files. The index never decides matches itself; it only *narrows the file
11//! set*, then `ctx_search` verifies candidates with the exact same regex loop,
12//! so the returned `file:line` hits are byte-identical to the walk path.
13//!
14//! Design notes:
15//! - The index is built with the *same* walk config and file filters as
16//!   `ctx_search` (see [`crate::tools::ctx_search`]) so the searchable universe
17//!   is identical — that is what guarantees recall parity.
18//! - Only `[A-Za-z0-9_]` trigrams are indexed. Lookups only ever use trigrams
19//!   from pure-identifier queries, so this is both sufficient and memory-bounded.
20//! - Narrowing is applied *only* for pure `[A-Za-z0-9_]` literal queries (the
21//!   dominant agent case). Any query containing a regex metacharacter falls
22//!   back to scanning the cached file list (still skips the directory walk).
23//! - Freshness uses a short TTL with background rebuild, mirroring
24//!   [`crate::core::bm25_cache`]. A real fs watcher is Phase 5.
25
26use std::collections::{HashMap, HashSet};
27use std::path::{Path, PathBuf};
28use std::sync::{Arc, Mutex, OnceLock};
29use std::time::{Duration, Instant};
30
31use ignore::WalkBuilder;
32
33use crate::tools::ctx_search::{is_binary_ext, is_generated_file, MAX_FILE_SIZE, MAX_WALK_DEPTH};
34
35/// Freshness window before a background rebuild is triggered. Matches the
36/// bounded-staleness model already used by the BM25 cache.
37const TTL: Duration = Duration::from_secs(15);
38
39/// Upper bound on indexed files; larger trees fall back to the walk path.
40const MAX_FILES: usize = 200_000;
41
42/// Posting-entry budget (`file_id` occurrences across all trigrams). Up to this
43/// many entries we keep exact inverted posting lists (fastest, sparse lookups).
44/// Beyond it we switch to the per-file Bloom tier instead of giving up — see
45/// [`Narrowing`]. ~4 bytes each → ~48 MB before the switch.
46const MAX_POSTING_ENTRIES: usize = 12_000_000;
47
48/// Hard ceiling on total trigram entries collected during a build. Past this we
49/// abandon indexing (walk fallback) to avoid pathological memory use even with
50/// the compact Bloom tier.
51const MAX_TOTAL_ENTRIES: usize = 48_000_000;
52
53/// Bloom tuning: bits per distinct trigram and number of hash probes. ~12 bits
54/// with k=7 keeps the false-positive rate well under 1% — and a false positive
55/// only costs one extra regex-verified file read (never a missed match).
56const BLOOM_BITS_PER_ITEM: usize = 12;
57const BLOOM_K: usize = 7;
58/// Per-file Bloom size clamp (in bits): 64 bits min, 1 Mi bits (128 KiB) max.
59const BLOOM_MIN_BITS: usize = 64;
60const BLOOM_MAX_BITS: usize = 1 << 20;
61
62/// A trigram is indexable only if all three bytes are `[A-Za-z0-9_]`.
63fn is_word_byte(b: u8) -> bool {
64    b.is_ascii_alphanumeric() || b == b'_'
65}
66
67fn pack(b0: u8, b1: u8, b2: u8) -> u32 {
68    (u32::from(b0) << 16) | (u32::from(b1) << 8) | u32::from(b2)
69}
70
71/// How candidate files are narrowed for a literal query. Two tiers, chosen by
72/// corpus size, both providing a *superset* of true matches (zero false
73/// negatives) which `ctx_search` then regex-verifies:
74/// - `Postings`: exact inverted lists `trigram → sorted file ids`. Fast, sparse
75///   lookups; used while total entries fit [`MAX_POSTING_ENTRIES`].
76/// - `Blooms`: one compact per-file Bloom filter of the file's trigrams. ~3×
77///   smaller than postings, so monorepos that would otherwise blow the posting
78///   budget still get index-narrowing instead of a full directory walk.
79enum Narrowing {
80    Postings(HashMap<u32, Vec<u32>>),
81    Blooms(Vec<FileBloom>),
82}
83
84/// A per-file Bloom filter over the file's word-trigrams. No false negatives:
85/// if any probed bit for a trigram is unset, the file provably lacks it.
86struct FileBloom {
87    /// Bit storage; the filter width `m = bits.len() * 64` is a power of two.
88    bits: Vec<u64>,
89}
90
91/// 64-bit avalanche mix (splitmix64 finalizer) — spreads a packed trigram into
92/// a well-distributed hash for double-probing.
93#[inline]
94fn mix64(mut x: u64) -> u64 {
95    x = x.wrapping_add(0x9E37_79B9_7F4A_7C15);
96    x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
97    x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
98    x ^ (x >> 31)
99}
100
101impl FileBloom {
102    fn with_capacity(distinct_trigrams: usize) -> Self {
103        let target = distinct_trigrams
104            .saturating_mul(BLOOM_BITS_PER_ITEM)
105            .next_power_of_two()
106            .clamp(BLOOM_MIN_BITS, BLOOM_MAX_BITS);
107        FileBloom {
108            bits: vec![0u64; target / 64],
109        }
110    }
111
112    #[inline]
113    fn m_bits(&self) -> usize {
114        self.bits.len() * 64
115    }
116
117    /// Double hashing: `p_i = h1 + i·h2 (mod m)` with `m` a power of two.
118    #[inline]
119    fn probes(&self, trigram: u32) -> impl Iterator<Item = usize> + '_ {
120        let m = self.m_bits();
121        let mask = m - 1; // m is a power of two
122        let h = mix64(u64::from(trigram));
123        let h1 = (h & 0xFFFF_FFFF) as usize;
124        let h2 = ((h >> 32) as usize) | 1; // odd step → full-period probing
125        (0..BLOOM_K).map(move |i| h1.wrapping_add(i.wrapping_mul(h2)) & mask)
126    }
127
128    fn insert(&mut self, trigram: u32) {
129        for p in self.probes(trigram).collect::<Vec<_>>() {
130            self.bits[p / 64] |= 1u64 << (p % 64);
131        }
132    }
133
134    fn maybe_contains(&self, trigram: u32) -> bool {
135        self.probes(trigram)
136            .all(|p| self.bits[p / 64] & (1u64 << (p % 64)) != 0)
137    }
138}
139
140/// RAM-resident trigram index over one project root.
141pub struct SearchIndex {
142    files: Vec<PathBuf>,
143    /// Candidate-narrowing structure (exact postings or compact per-file Bloom).
144    narrowing: Narrowing,
145    respect_gitignore: bool,
146    allow_secret_paths: bool,
147    built_at: Instant,
148}
149
150impl SearchIndex {
151    /// Build the index by walking `root` with the exact same config and filters
152    /// as `ctx_search`, so the searchable file universe is identical.
153    pub fn build(root: &str, respect_gitignore: bool, allow_secret_paths: bool) -> Option<Self> {
154        let root_path = Path::new(root);
155        if !root_path.exists() {
156            return None;
157        }
158
159        let walker = WalkBuilder::new(root_path)
160            .hidden(true)
161            .max_depth(Some(MAX_WALK_DEPTH))
162            .git_ignore(respect_gitignore)
163            .git_global(respect_gitignore)
164            .git_exclude(respect_gitignore)
165            .build();
166
167        let mut files: Vec<PathBuf> = Vec::new();
168        // Per-file sorted, deduped trigrams. Same memory as the posting lists
169        // would be, but grouped by file so we can materialise *either* tier
170        // afterwards without a second pass over the corpus.
171        let mut per_file_trigrams: Vec<Vec<u32>> = Vec::new();
172        let mut total_entries: usize = 0;
173        let mut scratch: HashSet<u32> = HashSet::new();
174
175        for entry in walker.filter_map(std::result::Result::ok) {
176            if entry.file_type().is_none_or(|ft| ft.is_dir()) {
177                continue;
178            }
179            if entry.file_type().is_some_and(|ft| ft.is_symlink()) {
180                continue;
181            }
182            let path = entry.path();
183            if is_binary_ext(path) || is_generated_file(path) {
184                continue;
185            }
186            if !allow_secret_paths && crate::core::io_boundary::is_secret_like(path).is_some() {
187                continue;
188            }
189            // Only index regular files within the size budget. A FIFO/socket/
190            // device node would block the `read_to_string` below forever (#336),
191            // hanging the background build and starving the fast path. `metadata`
192            // (stat) never opens the file, so it is safe on special files.
193            let state = match std::fs::metadata(path) {
194                Ok(meta) if !meta.file_type().is_file() => continue,
195                Ok(meta) if meta.len() > MAX_FILE_SIZE => continue,
196                Ok(meta) => crate::core::content_cache::FileState::from_metadata(&meta),
197                Err(_) => continue,
198            };
199            // Read the corpus exactly once (issue #148): reuse a fresh cached
200            // copy if a prior `ctx_search`/build already read this file, else
201            // read it now and publish it so the upcoming `ctx_search` verify
202            // pass is an in-memory hit instead of a second disk read. Mirrors
203            // ctx_search: a non-UTF-8 file is never searchable, so it is skipped.
204            let content: std::sync::Arc<str> = if let Some(cached) =
205                state.and_then(|s| crate::core::content_cache::get(path, s))
206            {
207                cached
208            } else {
209                let Ok(text) = std::fs::read_to_string(path) else {
210                    continue;
211                };
212                let arc: std::sync::Arc<str> = std::sync::Arc::from(text);
213                if let Some(s) = state {
214                    crate::core::content_cache::insert(path, s, std::sync::Arc::clone(&arc));
215                }
216                arc
217            };
218
219            if files.len() >= MAX_FILES {
220                return None; // too large even for the Bloom tier — use the walk
221            }
222
223            scratch.clear();
224            let bytes = content.as_bytes();
225            if bytes.len() >= 3 {
226                for w in bytes.windows(3) {
227                    if is_word_byte(w[0]) && is_word_byte(w[1]) && is_word_byte(w[2]) {
228                        scratch.insert(pack(w[0], w[1], w[2]));
229                    }
230                }
231            }
232            total_entries += scratch.len();
233            if total_entries > MAX_TOTAL_ENTRIES {
234                return None; // memory guard — fall back to walk
235            }
236            let mut tris: Vec<u32> = scratch.iter().copied().collect();
237            tris.sort_unstable();
238            files.push(path.to_path_buf());
239            per_file_trigrams.push(tris);
240        }
241
242        let narrowing = build_narrowing(&per_file_trigrams, total_entries);
243
244        Some(Self {
245            files,
246            narrowing,
247            respect_gitignore,
248            allow_secret_paths,
249            built_at: Instant::now(),
250        })
251    }
252
253    fn is_fresh(&self) -> bool {
254        self.built_at.elapsed() < TTL
255    }
256
257    fn config_matches(&self, respect_gitignore: bool, allow_secret_paths: bool) -> bool {
258        self.respect_gitignore == respect_gitignore && self.allow_secret_paths == allow_secret_paths
259    }
260
261    /// Candidate files for `pattern`, filtered by `ext`. `None` means "no safe
262    /// narrowing possible" — the caller should scan the full file list.
263    ///
264    /// Narrowing is applied only for pure `[A-Za-z0-9_]` literals of length ≥ 3.
265    /// For such a literal every match contains it on a single line, hence the
266    /// file contains all of its consecutive trigrams: intersecting their
267    /// posting lists yields a *superset* of matching files (zero false
268    /// negatives), which the caller then regex-verifies.
269    pub fn candidate_paths(&self, pattern: &str, ext: Option<&str>) -> CandidateSet {
270        if let Some(ids) = self.literal_candidates(pattern) {
271            let paths = ids
272                .into_iter()
273                .map(|id| self.files[id as usize].clone())
274                .filter(|p| ext_matches(p, ext))
275                .collect();
276            CandidateSet::Narrowed(paths)
277        } else {
278            let paths = self
279                .files
280                .iter()
281                .filter(|p| ext_matches(p, ext))
282                .cloned()
283                .collect();
284            CandidateSet::FullList(paths)
285        }
286    }
287
288    /// Returns candidate file ids for a pure-literal query, or `None` if the
289    /// query is not a trigram-narrowable pure `[A-Za-z0-9_]` literal. Both tiers
290    /// return a *superset* of true matches (zero false negatives).
291    fn literal_candidates(&self, pattern: &str) -> Option<Vec<u32>> {
292        let bytes = pattern.as_bytes();
293        if bytes.len() < 3 || !bytes.iter().all(|&b| is_word_byte(b)) {
294            return None;
295        }
296        // Distinct trigrams of the literal.
297        let mut tris: Vec<u32> = bytes.windows(3).map(|w| pack(w[0], w[1], w[2])).collect();
298        tris.sort_unstable();
299        tris.dedup();
300
301        match &self.narrowing {
302            Narrowing::Postings(trigrams) => Some(Self::postings_intersect(trigrams, &tris)),
303            Narrowing::Blooms(blooms) => Some(Self::bloom_scan(blooms, &tris)),
304        }
305    }
306
307    /// Exact-tier: intersect the posting lists of every required trigram
308    /// (smallest first for a cheap intersection).
309    fn postings_intersect(trigrams: &HashMap<u32, Vec<u32>>, tris: &[u32]) -> Vec<u32> {
310        let mut lists: Vec<&Vec<u32>> = Vec::with_capacity(tris.len());
311        for &tri in tris {
312            match trigrams.get(&tri) {
313                // A required trigram is absent → provably no match anywhere.
314                None => return Vec::new(),
315                Some(list) => lists.push(list),
316            }
317        }
318        lists.sort_by_key(|l| l.len());
319
320        let mut acc: Vec<u32> = lists[0].clone();
321        for list in &lists[1..] {
322            acc = intersect_sorted(&acc, list);
323            if acc.is_empty() {
324                break;
325            }
326        }
327        acc
328    }
329
330    /// Bloom-tier: a file is a candidate iff its Bloom filter may contain every
331    /// required trigram. No false negatives (an unset probe bit ⇒ the trigram is
332    /// provably absent), so the result is still a superset of true matches.
333    fn bloom_scan(blooms: &[FileBloom], tris: &[u32]) -> Vec<u32> {
334        let mut out = Vec::new();
335        for (fid, bloom) in blooms.iter().enumerate() {
336            if tris.iter().all(|&t| bloom.maybe_contains(t)) {
337                out.push(fid as u32);
338            }
339        }
340        out
341    }
342}
343
344/// Materialise the appropriate narrowing tier for a freshly walked corpus.
345fn build_narrowing(per_file: &[Vec<u32>], total_entries: usize) -> Narrowing {
346    if total_entries <= MAX_POSTING_ENTRIES {
347        let mut trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
348        for (fid, tris) in per_file.iter().enumerate() {
349            for &t in tris {
350                // file ids are appended in ascending order ⇒ lists stay sorted.
351                trigrams.entry(t).or_default().push(fid as u32);
352            }
353        }
354        Narrowing::Postings(trigrams)
355    } else {
356        let blooms = per_file
357            .iter()
358            .map(|tris| {
359                let mut b = FileBloom::with_capacity(tris.len());
360                for &t in tris {
361                    b.insert(t);
362                }
363                b
364            })
365            .collect();
366        Narrowing::Blooms(blooms)
367    }
368}
369
370/// Result of [`SearchIndex::candidate_paths`].
371pub enum CandidateSet {
372    /// Trigram-narrowed candidate files (a superset of real matches).
373    Narrowed(Vec<PathBuf>),
374    /// No safe narrowing — the full cached file list (still skips the walk).
375    FullList(Vec<PathBuf>),
376}
377
378impl CandidateSet {
379    pub fn into_paths(self) -> Vec<PathBuf> {
380        match self {
381            CandidateSet::Narrowed(p) | CandidateSet::FullList(p) => p,
382        }
383    }
384}
385
386fn ext_matches(path: &Path, ext: Option<&str>) -> bool {
387    match ext {
388        None => true,
389        Some(want) => path.extension().and_then(|e| e.to_str()) == Some(want),
390    }
391}
392
393/// Intersection of two ascending, deduped `u32` slices.
394fn intersect_sorted(a: &[u32], b: &[u32]) -> Vec<u32> {
395    let mut out = Vec::new();
396    let (mut i, mut j) = (0, 0);
397    while i < a.len() && j < b.len() {
398        match a[i].cmp(&b[j]) {
399            std::cmp::Ordering::Less => i += 1,
400            std::cmp::Ordering::Greater => j += 1,
401            std::cmp::Ordering::Equal => {
402                out.push(a[i]);
403                i += 1;
404                j += 1;
405            }
406        }
407    }
408    out
409}
410
411// ---------------------------------------------------------------------------
412// Resident cache (one index per project root) with background (re)build.
413// ---------------------------------------------------------------------------
414
415struct CacheEntry {
416    index: Option<Arc<SearchIndex>>,
417    building: bool,
418}
419
420static CACHE: OnceLock<Mutex<HashMap<String, CacheEntry>>> = OnceLock::new();
421
422fn cache() -> &'static Mutex<HashMap<String, CacheEntry>> {
423    CACHE.get_or_init(|| Mutex::new(HashMap::new()))
424}
425
426/// Escape hatch: `LEAN_CTX_DISABLE_SEARCH_INDEX=1` forces the walk path
427/// everywhere (debugging / A-B measurement / opt-out).
428fn index_disabled() -> bool {
429    std::env::var("LEAN_CTX_DISABLE_SEARCH_INDEX")
430        .is_ok_and(|v| v == "1" || v.eq_ignore_ascii_case("true"))
431}
432
433/// Returns a fresh resident index for `root` if one is available for the given
434/// config, otherwise spawns a background (re)build and returns `None` so the
435/// caller uses the walk fallback for this call.
436pub fn get_fresh(
437    root: &str,
438    respect_gitignore: bool,
439    allow_secret_paths: bool,
440) -> Option<Arc<SearchIndex>> {
441    // Privileged "ignore gitignore" scans are rare and bypass the index.
442    if !respect_gitignore || index_disabled() {
443        return None;
444    }
445
446    let mut needs_build = false;
447    let result = {
448        let mut map = cache()
449            .lock()
450            .unwrap_or_else(std::sync::PoisonError::into_inner);
451        let entry = map.entry(root.to_string()).or_insert(CacheEntry {
452            index: None,
453            building: false,
454        });
455        match &entry.index {
456            Some(idx)
457                if idx.config_matches(respect_gitignore, allow_secret_paths) && idx.is_fresh() =>
458            {
459                Some(Arc::clone(idx))
460            }
461            Some(idx) if idx.config_matches(respect_gitignore, allow_secret_paths) => {
462                // Stale but usable: serve it and refresh in the background.
463                needs_build = !entry.building;
464                if needs_build {
465                    entry.building = true;
466                }
467                Some(Arc::clone(idx))
468            }
469            _ => {
470                needs_build = !entry.building;
471                if needs_build {
472                    entry.building = true;
473                }
474                None
475            }
476        }
477    };
478
479    if needs_build {
480        spawn_build(root.to_string(), respect_gitignore, allow_secret_paths);
481    }
482    result
483}
484
485/// Ensure a resident index for `root` is built (or building) in the background.
486/// Safe to call repeatedly; deduped via the per-root `building` flag.
487pub fn ensure_background(root: &str, respect_gitignore: bool, allow_secret_paths: bool) {
488    if !respect_gitignore || index_disabled() {
489        return;
490    }
491    let needs_build = {
492        let mut map = cache()
493            .lock()
494            .unwrap_or_else(std::sync::PoisonError::into_inner);
495        let entry = map.entry(root.to_string()).or_insert(CacheEntry {
496            index: None,
497            building: false,
498        });
499        let fresh = entry.index.as_ref().is_some_and(|idx| {
500            idx.config_matches(respect_gitignore, allow_secret_paths) && idx.is_fresh()
501        });
502        if fresh || entry.building {
503            false
504        } else {
505            entry.building = true;
506            true
507        }
508    };
509    if needs_build {
510        spawn_build(root.to_string(), respect_gitignore, allow_secret_paths);
511    }
512}
513
514/// Build the index synchronously and install it in the resident cache.
515/// Returns `true` on success. Useful for CLI prewarm and benchmarks that need a
516/// guaranteed-warm index. Respects the disable env var.
517pub fn warm_blocking(root: &str, respect_gitignore: bool, allow_secret_paths: bool) -> bool {
518    if !respect_gitignore || index_disabled() {
519        return false;
520    }
521    let Some(idx) = SearchIndex::build(root, respect_gitignore, allow_secret_paths) else {
522        return false;
523    };
524    let mut map = cache()
525        .lock()
526        .unwrap_or_else(std::sync::PoisonError::into_inner);
527    map.insert(
528        root.to_string(),
529        CacheEntry {
530            index: Some(Arc::new(idx)),
531            building: false,
532        },
533    );
534    true
535}
536
537fn spawn_build(root: String, respect_gitignore: bool, allow_secret_paths: bool) {
538    std::thread::spawn(move || {
539        let built = std::panic::catch_unwind(|| {
540            SearchIndex::build(&root, respect_gitignore, allow_secret_paths)
541        })
542        .ok()
543        .flatten();
544
545        let mut map = cache()
546            .lock()
547            .unwrap_or_else(std::sync::PoisonError::into_inner);
548        if let Some(entry) = map.get_mut(&root) {
549            entry.building = false;
550            if let Some(idx) = built {
551                entry.index = Some(Arc::new(idx));
552            }
553        }
554    });
555}
556
557#[cfg(test)]
558mod tests {
559    use super::*;
560
561    fn corpus() -> tempfile::TempDir {
562        let dir = tempfile::tempdir().unwrap();
563        std::fs::write(dir.path().join("a.rs"), "fn handler() {}\nlet x = 1;\n").unwrap();
564        std::fs::write(dir.path().join("b.rs"), "fn other() {}\n// nothing here\n").unwrap();
565        std::fs::write(dir.path().join("c.txt"), "handler appears in text too\n").unwrap();
566        dir
567    }
568
569    #[test]
570    fn narrows_to_files_containing_literal() {
571        let dir = corpus();
572        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
573        let cands = idx.candidate_paths("handler", None);
574        let paths = cands.into_paths();
575        // a.rs and c.txt contain "handler"; b.rs must be excluded.
576        assert!(paths.iter().any(|p| p.ends_with("a.rs")));
577        assert!(paths.iter().any(|p| p.ends_with("c.txt")));
578        assert!(!paths.iter().any(|p| p.ends_with("b.rs")));
579    }
580
581    #[test]
582    fn absent_trigram_yields_empty_candidates() {
583        let dir = corpus();
584        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
585        match idx.candidate_paths("zzzqqq", None) {
586            CandidateSet::Narrowed(p) => assert!(p.is_empty()),
587            CandidateSet::FullList(_) => panic!("pure literal should narrow"),
588        }
589    }
590
591    #[test]
592    fn ext_filter_restricts_candidates() {
593        let dir = corpus();
594        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
595        let paths = idx.candidate_paths("handler", Some("rs")).into_paths();
596        assert!(paths.iter().all(|p| p.extension().unwrap() == "rs"));
597        assert!(paths.iter().any(|p| p.ends_with("a.rs")));
598    }
599
600    #[test]
601    #[cfg(unix)]
602    fn build_skips_named_pipe_without_hanging() {
603        use std::sync::mpsc;
604        use std::time::Duration;
605        // #336: the background index build read every file, so a FIFO in the
606        // corpus blocked the build thread forever. It must be skipped while the
607        // regular files are still indexed, and the build must return.
608        let dir = corpus();
609        let fifo = dir.path().join("pipe.fifo");
610        let c = std::ffi::CString::new(fifo.to_string_lossy().as_bytes()).unwrap();
611        assert_eq!(
612            // SAFETY: `c` is a live CString providing a valid NUL-terminated
613            // path pointer for the duration of the call.
614            unsafe { libc::mkfifo(c.as_ptr(), 0o644) },
615            0,
616            "mkfifo failed"
617        );
618
619        let root = dir.path().to_str().unwrap().to_string();
620        let (tx, rx) = mpsc::channel();
621        std::thread::spawn(move || {
622            let built = SearchIndex::build(&root, true, false);
623            let _ = tx.send(built.map(|idx| idx.candidate_paths("handler", None).into_paths()));
624        });
625        let paths = rx
626            .recv_timeout(Duration::from_secs(5))
627            .expect("SearchIndex::build hung on a FIFO (#336 regression)")
628            .expect("index should build");
629        assert!(paths.iter().any(|p| p.ends_with("a.rs")));
630        assert!(!paths.iter().any(|p| p.ends_with("pipe.fifo")));
631    }
632
633    #[test]
634    fn regex_query_falls_back_to_full_list() {
635        let dir = corpus();
636        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
637        match idx.candidate_paths("fn .*\\(\\)", None) {
638            CandidateSet::FullList(p) => assert!(!p.is_empty()),
639            CandidateSet::Narrowed(_) => panic!("metachar query must not narrow"),
640        }
641    }
642
643    #[test]
644    fn short_query_falls_back_to_full_list() {
645        let dir = corpus();
646        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
647        assert!(matches!(
648            idx.candidate_paths("fn", None),
649            CandidateSet::FullList(_)
650        ));
651    }
652
653    /// The core correctness claim: trigram narrowing never drops a real match.
654    /// For each literal query, the set of `file:line` hits found by scanning only
655    /// the narrowed candidates must equal the set found by scanning every file.
656    #[test]
657    fn narrowing_has_identical_recall_to_full_scan() {
658        use regex::Regex;
659        use std::collections::BTreeSet;
660
661        let dir = tempfile::tempdir().unwrap();
662        // A spread of files; some contain the query tokens, most do not.
663        let samples = [
664            (
665                "auth/login.rs",
666                "fn authenticate(user) {}\nlet token = mint();\n",
667            ),
668            (
669                "auth/session.rs",
670                "struct Session;\n// authenticate again here\n",
671            ),
672            ("db/pool.rs", "fn connect() {}\nlet retries = 3;\n"),
673            (
674                "ui/button.tsx",
675                "export const Button = () => authenticate;\n",
676            ),
677            ("readme.md", "This project uses authenticate flows.\n"),
678            ("unrelated.rs", "fn helper() { let v = 1; }\n"),
679        ];
680        for (rel, content) in samples {
681            let p = dir.path().join(rel);
682            std::fs::create_dir_all(p.parent().unwrap()).unwrap();
683            std::fs::write(p, content).unwrap();
684        }
685        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
686
687        let full_scan = |pat: &str| -> BTreeSet<String> {
688            let re = Regex::new(pat).unwrap();
689            let mut hits = BTreeSet::new();
690            for (rel, content) in samples {
691                for (i, line) in content.lines().enumerate() {
692                    if re.is_match(line) {
693                        hits.insert(format!("{rel}:{}", i + 1));
694                    }
695                }
696            }
697            hits
698        };
699
700        for query in ["authenticate", "Session", "retries", "token"] {
701            let re = Regex::new(query).unwrap();
702            let candidates = idx.candidate_paths(query, None).into_paths();
703            let mut narrowed = BTreeSet::new();
704            for path in &candidates {
705                let content = std::fs::read_to_string(path).unwrap();
706                let rel = path.strip_prefix(dir.path()).unwrap().to_string_lossy();
707                for (i, line) in content.lines().enumerate() {
708                    if re.is_match(line) {
709                        narrowed.insert(format!("{}:{}", rel.replace('\\', "/"), i + 1));
710                    }
711                }
712            }
713            assert_eq!(
714                narrowed,
715                full_scan(query),
716                "recall mismatch for query {query:?}"
717            );
718        }
719    }
720
721    #[test]
722    fn intersect_sorted_basic() {
723        assert_eq!(
724            intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]),
725            vec![2, 3, 5]
726        );
727        assert_eq!(intersect_sorted(&[1, 2], &[3, 4]), Vec::<u32>::new());
728    }
729
730    // ── Bloom tier ────────────────────────────────────────────────────────
731
732    fn trigrams_of(s: &str) -> Vec<u32> {
733        let mut set = HashSet::new();
734        let b = s.as_bytes();
735        if b.len() >= 3 {
736            for w in b.windows(3) {
737                if is_word_byte(w[0]) && is_word_byte(w[1]) && is_word_byte(w[2]) {
738                    set.insert(pack(w[0], w[1], w[2]));
739                }
740            }
741        }
742        let mut v: Vec<u32> = set.into_iter().collect();
743        v.sort_unstable();
744        v
745    }
746
747    #[test]
748    fn file_bloom_has_no_false_negatives() {
749        let tris = trigrams_of("fn authenticate(user) { let token = mint(); }");
750        let mut bloom = FileBloom::with_capacity(tris.len());
751        for &t in &tris {
752            bloom.insert(t);
753        }
754        // Every inserted trigram must be reported present (Bloom guarantee).
755        assert!(tris.iter().all(|&t| bloom.maybe_contains(t)));
756    }
757
758    /// Parity fuzz: the Bloom tier must return a *superset* of the exact posting
759    /// tier for every query (zero false negatives). False positives are allowed
760    /// (and verified away downstream), so we assert containment, not equality.
761    #[test]
762    fn bloom_tier_is_superset_of_postings_tier() {
763        // Deterministic synthetic corpus (LCG → reproducible).
764        let mut seed = 0x1234_5678_9abc_def0u64;
765        let mut rng = || {
766            seed = seed
767                .wrapping_mul(6364136223846793005)
768                .wrapping_add(1442695040888963407);
769            (seed >> 33) as u32
770        };
771        let mut per_file: Vec<Vec<u32>> = Vec::new();
772        for _ in 0..80 {
773            let n = 50 + (rng() % 250) as usize;
774            let mut s = HashSet::new();
775            for _ in 0..n {
776                s.insert(rng() & 0x00FF_FFFF);
777            }
778            let mut v: Vec<u32> = s.into_iter().collect();
779            v.sort_unstable();
780            per_file.push(v);
781        }
782        let total: usize = per_file.iter().map(Vec::len).sum();
783
784        let postings = build_narrowing(&per_file, total); // ≤ cap → postings
785        let blooms = build_narrowing(&per_file, MAX_POSTING_ENTRIES + 1); // forced bloom
786        let (Narrowing::Postings(pt), Narrowing::Blooms(bl)) = (&postings, &blooms) else {
787            panic!("unexpected narrowing tiers");
788        };
789
790        // Queries drawn from real file trigrams (these MUST be found by both),
791        // plus a few that are unlikely to exist anywhere.
792        for f in &per_file {
793            if f.len() < 3 {
794                continue;
795            }
796            let q = vec![f[0], f[f.len() / 2], f[f.len() - 1]];
797            let exact = SearchIndex::postings_intersect(pt, &q);
798            let bloom: HashSet<u32> = SearchIndex::bloom_scan(bl, &q).into_iter().collect();
799            for id in exact {
800                assert!(
801                    bloom.contains(&id),
802                    "Bloom tier dropped a true match (false negative) for {q:?}"
803                );
804            }
805        }
806    }
807
808    /// End-to-end: an index forced onto the Bloom tier must still surface every
809    /// file that actually contains the literal (recall parity with a full scan).
810    #[test]
811    fn bloom_tier_end_to_end_recall() {
812        let samples = [
813            (
814                "auth_login.rs",
815                "fn authenticate(user) {}\nlet token = mint();\n",
816            ),
817            (
818                "auth_session.rs",
819                "struct Session;\n// authenticate again here\n",
820            ),
821            ("db_pool.rs", "fn connect() {}\nlet retries = 3;\n"),
822            (
823                "ui_button.tsx",
824                "export const Button = () => authenticate;\n",
825            ),
826            ("readme.md", "This project uses authenticate flows.\n"),
827            ("unrelated.rs", "fn helper() { let v = 1; }\n"),
828        ];
829        let files: Vec<PathBuf> = samples.iter().map(|(rel, _)| PathBuf::from(rel)).collect();
830        let per_file: Vec<Vec<u32>> = samples.iter().map(|(_, c)| trigrams_of(c)).collect();
831
832        let idx = SearchIndex {
833            files,
834            narrowing: build_narrowing(&per_file, MAX_POSTING_ENTRIES + 1),
835            respect_gitignore: true,
836            allow_secret_paths: false,
837            built_at: Instant::now(),
838        };
839        assert!(
840            matches!(idx.narrowing, Narrowing::Blooms(_)),
841            "test must exercise the Bloom tier"
842        );
843
844        for query in ["authenticate", "Session", "retries", "token"] {
845            let cands: HashSet<String> = idx
846                .candidate_paths(query, None)
847                .into_paths()
848                .iter()
849                .map(|p| p.to_string_lossy().to_string())
850                .collect();
851            for (rel, content) in samples {
852                if content.contains(query) {
853                    assert!(
854                        cands.contains(rel),
855                        "Bloom tier dropped real match {rel} for query {query:?}"
856                    );
857                }
858            }
859        }
860    }
861}