Skip to main content

lean_ctx/core/
search_index.rs

1//! Resident line-search index for `ctx_search` (Phase 1 of the efficiency epic).
2//!
3//! Historically `ctx_search` walked the filesystem, read every file, and ran a
4//! regex on every line on *every* call — `O(files × lines)`. That is the
5//! 40–200 ms latency floor this module eliminates.
6//!
7//! This module keeps a RAM-resident trigram index (`trigram → file ids`) so the
8//! common case (an identifier / literal query) collapses to: intersect a few
9//! posting lists in memory → read & regex-verify only the handful of candidate
10//! files. The index never decides matches itself; it only *narrows the file
11//! set*, then `ctx_search` verifies candidates with the exact same regex loop,
12//! so the returned `file:line` hits are byte-identical to the walk path.
13//!
14//! Design notes:
15//! - The index is built with the *same* walk config and file filters as
16//!   `ctx_search` (see [`crate::tools::ctx_search`]) so the searchable universe
17//!   is identical — that is what guarantees recall parity.
18//! - Only `[A-Za-z0-9_]` trigrams are indexed. Lookups only ever use trigrams
19//!   from pure-identifier queries, so this is both sufficient and memory-bounded.
20//! - Narrowing is applied *only* for pure `[A-Za-z0-9_]` literal queries (the
21//!   dominant agent case). Any query containing a regex metacharacter falls
22//!   back to scanning the cached file list (still skips the directory walk).
23//! - Freshness uses a short TTL with background rebuild, mirroring
24//!   [`crate::core::bm25_cache`]. A real fs watcher is Phase 5.
25
26use std::collections::{HashMap, HashSet};
27use std::path::{Path, PathBuf};
28use std::sync::{Arc, Mutex, OnceLock};
29use std::time::{Duration, Instant};
30
31use ignore::WalkBuilder;
32
33use crate::tools::ctx_search::{is_binary_ext, is_generated_file, MAX_FILE_SIZE, MAX_WALK_DEPTH};
34
35/// Freshness window before a background rebuild is triggered. Matches the
36/// bounded-staleness model already used by the BM25 cache.
37const TTL: Duration = Duration::from_secs(15);
38
39/// Upper bound on indexed files; larger trees fall back to the walk path.
40const MAX_FILES: usize = 200_000;
41
42/// Posting-entry budget (`file_id` occurrences across all trigrams). Up to this
43/// many entries we keep exact inverted posting lists (fastest, sparse lookups).
44/// Beyond it we switch to the per-file Bloom tier instead of giving up — see
45/// [`Narrowing`]. ~4 bytes each → ~48 MB before the switch.
46const MAX_POSTING_ENTRIES: usize = 12_000_000;
47
48/// Hard ceiling on total trigram entries collected during a build. Past this we
49/// abandon indexing (walk fallback) to avoid pathological memory use even with
50/// the compact Bloom tier.
51const MAX_TOTAL_ENTRIES: usize = 48_000_000;
52
53/// Bloom tuning: bits per distinct trigram and number of hash probes. ~12 bits
54/// with k=7 keeps the false-positive rate well under 1% — and a false positive
55/// only costs one extra regex-verified file read (never a missed match).
56const BLOOM_BITS_PER_ITEM: usize = 12;
57const BLOOM_K: usize = 7;
58/// Per-file Bloom size clamp (in bits): 64 bits min, 1 Mi bits (128 KiB) max.
59const BLOOM_MIN_BITS: usize = 64;
60const BLOOM_MAX_BITS: usize = 1 << 20;
61
62/// A trigram is indexable only if all three bytes are `[A-Za-z0-9_]`.
63fn is_word_byte(b: u8) -> bool {
64    b.is_ascii_alphanumeric() || b == b'_'
65}
66
67fn pack(b0: u8, b1: u8, b2: u8) -> u32 {
68    (u32::from(b0) << 16) | (u32::from(b1) << 8) | u32::from(b2)
69}
70
71/// How candidate files are narrowed for a literal query. Two tiers, chosen by
72/// corpus size, both providing a *superset* of true matches (zero false
73/// negatives) which `ctx_search` then regex-verifies:
74/// - `Postings`: exact inverted lists `trigram → sorted file ids`. Fast, sparse
75///   lookups; used while total entries fit [`MAX_POSTING_ENTRIES`].
76/// - `Blooms`: one compact per-file Bloom filter of the file's trigrams. ~3×
77///   smaller than postings, so monorepos that would otherwise blow the posting
78///   budget still get index-narrowing instead of a full directory walk.
79enum Narrowing {
80    Postings(HashMap<u32, Vec<u32>>),
81    Blooms(Vec<FileBloom>),
82}
83
84/// A per-file Bloom filter over the file's word-trigrams. No false negatives:
85/// if any probed bit for a trigram is unset, the file provably lacks it.
86struct FileBloom {
87    /// Bit storage; the filter width `m = bits.len() * 64` is a power of two.
88    bits: Vec<u64>,
89}
90
91/// 64-bit avalanche mix (splitmix64 finalizer) — spreads a packed trigram into
92/// a well-distributed hash for double-probing.
93#[inline]
94fn mix64(mut x: u64) -> u64 {
95    x = x.wrapping_add(0x9E37_79B9_7F4A_7C15);
96    x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
97    x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
98    x ^ (x >> 31)
99}
100
101impl FileBloom {
102    fn with_capacity(distinct_trigrams: usize) -> Self {
103        let target = distinct_trigrams
104            .saturating_mul(BLOOM_BITS_PER_ITEM)
105            .next_power_of_two()
106            .clamp(BLOOM_MIN_BITS, BLOOM_MAX_BITS);
107        FileBloom {
108            bits: vec![0u64; target / 64],
109        }
110    }
111
112    #[inline]
113    fn m_bits(&self) -> usize {
114        self.bits.len() * 64
115    }
116
117    /// Double hashing: `p_i = h1 + i·h2 (mod m)` with `m` a power of two.
118    #[inline]
119    fn probes(&self, trigram: u32) -> impl Iterator<Item = usize> + '_ {
120        let m = self.m_bits();
121        let mask = m - 1; // m is a power of two
122        let h = mix64(u64::from(trigram));
123        let h1 = (h & 0xFFFF_FFFF) as usize;
124        let h2 = ((h >> 32) as usize) | 1; // odd step → full-period probing
125        (0..BLOOM_K).map(move |i| h1.wrapping_add(i.wrapping_mul(h2)) & mask)
126    }
127
128    fn insert(&mut self, trigram: u32) {
129        for p in self.probes(trigram).collect::<Vec<_>>() {
130            self.bits[p / 64] |= 1u64 << (p % 64);
131        }
132    }
133
134    fn maybe_contains(&self, trigram: u32) -> bool {
135        self.probes(trigram)
136            .all(|p| self.bits[p / 64] & (1u64 << (p % 64)) != 0)
137    }
138}
139
140/// RAM-resident trigram index over one project root.
141pub struct SearchIndex {
142    files: Vec<PathBuf>,
143    /// Candidate-narrowing structure (exact postings or compact per-file Bloom).
144    narrowing: Narrowing,
145    respect_gitignore: bool,
146    allow_secret_paths: bool,
147    built_at: Instant,
148}
149
150impl SearchIndex {
151    /// Build the index by walking `root` with the exact same config and filters
152    /// as `ctx_search`, so the searchable file universe is identical.
153    pub fn build(root: &str, respect_gitignore: bool, allow_secret_paths: bool) -> Option<Self> {
154        let root_path = Path::new(root);
155        if !root_path.exists() {
156            return None;
157        }
158        // Never auto-index a broad/unsafe root (HOME, filesystem root, a dir with
159        // dozens of unrelated subtrees). This mirrors the graph/BM25 guard and
160        // stops a background build from walking the whole home directory — which
161        // on Windows would hydrate OneDrive placeholders (#363).
162        if !crate::core::graph_index::is_safe_scan_root_public(root) {
163            return None;
164        }
165
166        let walker = WalkBuilder::new(root_path)
167            .hidden(true)
168            .max_depth(Some(MAX_WALK_DEPTH))
169            .git_ignore(respect_gitignore)
170            .git_global(respect_gitignore)
171            .git_exclude(respect_gitignore)
172            .filter_entry(crate::core::cloud_files::keep_entry)
173            .build();
174
175        let mut files: Vec<PathBuf> = Vec::new();
176        // Per-file sorted, deduped trigrams. Same memory as the posting lists
177        // would be, but grouped by file so we can materialise *either* tier
178        // afterwards without a second pass over the corpus.
179        let mut per_file_trigrams: Vec<Vec<u32>> = Vec::new();
180        let mut total_entries: usize = 0;
181        let mut scratch: HashSet<u32> = HashSet::new();
182
183        for entry in walker.filter_map(std::result::Result::ok) {
184            if entry.file_type().is_none_or(|ft| ft.is_dir()) {
185                continue;
186            }
187            if entry.file_type().is_some_and(|ft| ft.is_symlink()) {
188                continue;
189            }
190            let path = entry.path();
191            if is_binary_ext(path) || is_generated_file(path) {
192                continue;
193            }
194            if !allow_secret_paths && crate::core::io_boundary::is_secret_like(path).is_some() {
195                continue;
196            }
197            // Only index regular files within the size budget. A FIFO/socket/
198            // device node would block the `read_to_string` below forever (#336),
199            // hanging the background build and starving the fast path. `metadata`
200            // (stat) never opens the file, so it is safe on special files.
201            let state = match std::fs::metadata(path) {
202                Ok(meta) if !meta.file_type().is_file() => continue,
203                Ok(meta) if meta.len() > MAX_FILE_SIZE => continue,
204                Ok(meta) => crate::core::content_cache::FileState::from_metadata(&meta),
205                Err(_) => continue,
206            };
207            // Read the corpus exactly once (issue #148): reuse a fresh cached
208            // copy if a prior `ctx_search`/build already read this file, else
209            // read it now and publish it so the upcoming `ctx_search` verify
210            // pass is an in-memory hit instead of a second disk read. Mirrors
211            // ctx_search: a non-UTF-8 file is never searchable, so it is skipped.
212            let content: std::sync::Arc<str> = if let Some(cached) =
213                state.and_then(|s| crate::core::content_cache::get(path, s))
214            {
215                cached
216            } else {
217                let Ok(text) = std::fs::read_to_string(path) else {
218                    continue;
219                };
220                let arc: std::sync::Arc<str> = std::sync::Arc::from(text);
221                if let Some(s) = state {
222                    crate::core::content_cache::insert(path, s, std::sync::Arc::clone(&arc));
223                }
224                arc
225            };
226
227            if files.len() >= MAX_FILES {
228                return None; // too large even for the Bloom tier — use the walk
229            }
230
231            scratch.clear();
232            let bytes = content.as_bytes();
233            if bytes.len() >= 3 {
234                for w in bytes.windows(3) {
235                    if is_word_byte(w[0]) && is_word_byte(w[1]) && is_word_byte(w[2]) {
236                        scratch.insert(pack(w[0], w[1], w[2]));
237                    }
238                }
239            }
240            total_entries += scratch.len();
241            if total_entries > MAX_TOTAL_ENTRIES {
242                return None; // memory guard — fall back to walk
243            }
244            let mut tris: Vec<u32> = scratch.iter().copied().collect();
245            tris.sort_unstable();
246            files.push(path.to_path_buf());
247            per_file_trigrams.push(tris);
248        }
249
250        let narrowing = build_narrowing(&per_file_trigrams, total_entries);
251
252        Some(Self {
253            files,
254            narrowing,
255            respect_gitignore,
256            allow_secret_paths,
257            built_at: Instant::now(),
258        })
259    }
260
261    fn is_fresh(&self) -> bool {
262        self.built_at.elapsed() < TTL
263    }
264
265    fn config_matches(&self, respect_gitignore: bool, allow_secret_paths: bool) -> bool {
266        self.respect_gitignore == respect_gitignore && self.allow_secret_paths == allow_secret_paths
267    }
268
269    /// Candidate files for `pattern`, filtered by `ext`. `None` means "no safe
270    /// narrowing possible" — the caller should scan the full file list.
271    ///
272    /// Narrowing is applied only for pure `[A-Za-z0-9_]` literals of length ≥ 3.
273    /// For such a literal every match contains it on a single line, hence the
274    /// file contains all of its consecutive trigrams: intersecting their
275    /// posting lists yields a *superset* of matching files (zero false
276    /// negatives), which the caller then regex-verifies.
277    pub fn candidate_paths(&self, pattern: &str, ext: Option<&str>) -> CandidateSet {
278        if let Some(ids) = self.literal_candidates(pattern) {
279            let paths = ids
280                .into_iter()
281                .map(|id| self.files[id as usize].clone())
282                .filter(|p| ext_matches(p, ext))
283                .collect();
284            CandidateSet::Narrowed(paths)
285        } else {
286            let paths = self
287                .files
288                .iter()
289                .filter(|p| ext_matches(p, ext))
290                .cloned()
291                .collect();
292            CandidateSet::FullList(paths)
293        }
294    }
295
296    /// Returns candidate file ids for a pure-literal query, or `None` if the
297    /// query is not a trigram-narrowable pure `[A-Za-z0-9_]` literal. Both tiers
298    /// return a *superset* of true matches (zero false negatives).
299    fn literal_candidates(&self, pattern: &str) -> Option<Vec<u32>> {
300        let bytes = pattern.as_bytes();
301        if bytes.len() < 3 || !bytes.iter().all(|&b| is_word_byte(b)) {
302            return None;
303        }
304        // Distinct trigrams of the literal.
305        let mut tris: Vec<u32> = bytes.windows(3).map(|w| pack(w[0], w[1], w[2])).collect();
306        tris.sort_unstable();
307        tris.dedup();
308
309        match &self.narrowing {
310            Narrowing::Postings(trigrams) => Some(Self::postings_intersect(trigrams, &tris)),
311            Narrowing::Blooms(blooms) => Some(Self::bloom_scan(blooms, &tris)),
312        }
313    }
314
315    /// Exact-tier: intersect the posting lists of every required trigram
316    /// (smallest first for a cheap intersection).
317    fn postings_intersect(trigrams: &HashMap<u32, Vec<u32>>, tris: &[u32]) -> Vec<u32> {
318        let mut lists: Vec<&Vec<u32>> = Vec::with_capacity(tris.len());
319        for &tri in tris {
320            match trigrams.get(&tri) {
321                // A required trigram is absent → provably no match anywhere.
322                None => return Vec::new(),
323                Some(list) => lists.push(list),
324            }
325        }
326        lists.sort_by_key(|l| l.len());
327
328        let mut acc: Vec<u32> = lists[0].clone();
329        for list in &lists[1..] {
330            acc = intersect_sorted(&acc, list);
331            if acc.is_empty() {
332                break;
333            }
334        }
335        acc
336    }
337
338    /// Bloom-tier: a file is a candidate iff its Bloom filter may contain every
339    /// required trigram. No false negatives (an unset probe bit ⇒ the trigram is
340    /// provably absent), so the result is still a superset of true matches.
341    fn bloom_scan(blooms: &[FileBloom], tris: &[u32]) -> Vec<u32> {
342        let mut out = Vec::new();
343        for (fid, bloom) in blooms.iter().enumerate() {
344            if tris.iter().all(|&t| bloom.maybe_contains(t)) {
345                out.push(fid as u32);
346            }
347        }
348        out
349    }
350}
351
352/// Materialise the appropriate narrowing tier for a freshly walked corpus.
353fn build_narrowing(per_file: &[Vec<u32>], total_entries: usize) -> Narrowing {
354    if total_entries <= MAX_POSTING_ENTRIES {
355        let mut trigrams: HashMap<u32, Vec<u32>> = HashMap::new();
356        for (fid, tris) in per_file.iter().enumerate() {
357            for &t in tris {
358                // file ids are appended in ascending order ⇒ lists stay sorted.
359                trigrams.entry(t).or_default().push(fid as u32);
360            }
361        }
362        Narrowing::Postings(trigrams)
363    } else {
364        let blooms = per_file
365            .iter()
366            .map(|tris| {
367                let mut b = FileBloom::with_capacity(tris.len());
368                for &t in tris {
369                    b.insert(t);
370                }
371                b
372            })
373            .collect();
374        Narrowing::Blooms(blooms)
375    }
376}
377
378/// Result of [`SearchIndex::candidate_paths`].
379pub enum CandidateSet {
380    /// Trigram-narrowed candidate files (a superset of real matches).
381    Narrowed(Vec<PathBuf>),
382    /// No safe narrowing — the full cached file list (still skips the walk).
383    FullList(Vec<PathBuf>),
384}
385
386impl CandidateSet {
387    pub fn into_paths(self) -> Vec<PathBuf> {
388        match self {
389            CandidateSet::Narrowed(p) | CandidateSet::FullList(p) => p,
390        }
391    }
392}
393
394fn ext_matches(path: &Path, ext: Option<&str>) -> bool {
395    match ext {
396        None => true,
397        Some(want) => path.extension().and_then(|e| e.to_str()) == Some(want),
398    }
399}
400
401/// Intersection of two ascending, deduped `u32` slices.
402fn intersect_sorted(a: &[u32], b: &[u32]) -> Vec<u32> {
403    let mut out = Vec::new();
404    let (mut i, mut j) = (0, 0);
405    while i < a.len() && j < b.len() {
406        match a[i].cmp(&b[j]) {
407            std::cmp::Ordering::Less => i += 1,
408            std::cmp::Ordering::Greater => j += 1,
409            std::cmp::Ordering::Equal => {
410                out.push(a[i]);
411                i += 1;
412                j += 1;
413            }
414        }
415    }
416    out
417}
418
419// ---------------------------------------------------------------------------
420// Resident cache (one index per project root) with background (re)build.
421// ---------------------------------------------------------------------------
422
423struct CacheEntry {
424    index: Option<Arc<SearchIndex>>,
425    building: bool,
426}
427
428static CACHE: OnceLock<Mutex<HashMap<String, CacheEntry>>> = OnceLock::new();
429
430fn cache() -> &'static Mutex<HashMap<String, CacheEntry>> {
431    CACHE.get_or_init(|| Mutex::new(HashMap::new()))
432}
433
434/// Escape hatch: `LEAN_CTX_DISABLE_SEARCH_INDEX=1` forces the walk path
435/// everywhere (debugging / A-B measurement / opt-out).
436fn index_disabled() -> bool {
437    std::env::var("LEAN_CTX_DISABLE_SEARCH_INDEX")
438        .is_ok_and(|v| v == "1" || v.eq_ignore_ascii_case("true"))
439}
440
441/// Returns a fresh resident index for `root` if one is available for the given
442/// config, otherwise spawns a background (re)build and returns `None` so the
443/// caller uses the walk fallback for this call.
444pub fn get_fresh(
445    root: &str,
446    respect_gitignore: bool,
447    allow_secret_paths: bool,
448) -> Option<Arc<SearchIndex>> {
449    // Privileged "ignore gitignore" scans are rare and bypass the index.
450    if !respect_gitignore || index_disabled() {
451        return None;
452    }
453
454    let mut needs_build = false;
455    let result = {
456        let mut map = cache()
457            .lock()
458            .unwrap_or_else(std::sync::PoisonError::into_inner);
459        let entry = map.entry(root.to_string()).or_insert(CacheEntry {
460            index: None,
461            building: false,
462        });
463        match &entry.index {
464            Some(idx)
465                if idx.config_matches(respect_gitignore, allow_secret_paths) && idx.is_fresh() =>
466            {
467                Some(Arc::clone(idx))
468            }
469            Some(idx) if idx.config_matches(respect_gitignore, allow_secret_paths) => {
470                // Stale but usable: serve it and refresh in the background.
471                needs_build = !entry.building;
472                if needs_build {
473                    entry.building = true;
474                }
475                Some(Arc::clone(idx))
476            }
477            _ => {
478                needs_build = !entry.building;
479                if needs_build {
480                    entry.building = true;
481                }
482                None
483            }
484        }
485    };
486
487    if needs_build {
488        spawn_build(root.to_string(), respect_gitignore, allow_secret_paths);
489    }
490    result
491}
492
493/// Ensure a resident index for `root` is built (or building) in the background.
494/// Safe to call repeatedly; deduped via the per-root `building` flag.
495pub fn ensure_background(root: &str, respect_gitignore: bool, allow_secret_paths: bool) {
496    if !respect_gitignore || index_disabled() {
497        return;
498    }
499    let needs_build = {
500        let mut map = cache()
501            .lock()
502            .unwrap_or_else(std::sync::PoisonError::into_inner);
503        let entry = map.entry(root.to_string()).or_insert(CacheEntry {
504            index: None,
505            building: false,
506        });
507        let fresh = entry.index.as_ref().is_some_and(|idx| {
508            idx.config_matches(respect_gitignore, allow_secret_paths) && idx.is_fresh()
509        });
510        if fresh || entry.building {
511            false
512        } else {
513            entry.building = true;
514            true
515        }
516    };
517    if needs_build {
518        spawn_build(root.to_string(), respect_gitignore, allow_secret_paths);
519    }
520}
521
522/// Build the index synchronously and install it in the resident cache.
523/// Returns `true` on success. Useful for CLI prewarm and benchmarks that need a
524/// guaranteed-warm index. Respects the disable env var.
525pub fn warm_blocking(root: &str, respect_gitignore: bool, allow_secret_paths: bool) -> bool {
526    if !respect_gitignore || index_disabled() {
527        return false;
528    }
529    let Some(idx) = SearchIndex::build(root, respect_gitignore, allow_secret_paths) else {
530        return false;
531    };
532    let mut map = cache()
533        .lock()
534        .unwrap_or_else(std::sync::PoisonError::into_inner);
535    map.insert(
536        root.to_string(),
537        CacheEntry {
538            index: Some(Arc::new(idx)),
539            building: false,
540        },
541    );
542    true
543}
544
545fn spawn_build(root: String, respect_gitignore: bool, allow_secret_paths: bool) {
546    std::thread::spawn(move || {
547        let built = std::panic::catch_unwind(|| {
548            SearchIndex::build(&root, respect_gitignore, allow_secret_paths)
549        })
550        .ok()
551        .flatten();
552
553        let mut map = cache()
554            .lock()
555            .unwrap_or_else(std::sync::PoisonError::into_inner);
556        if let Some(entry) = map.get_mut(&root) {
557            entry.building = false;
558            if let Some(idx) = built {
559                entry.index = Some(Arc::new(idx));
560            }
561        }
562    });
563}
564
565#[cfg(test)]
566mod tests {
567    use super::*;
568
569    fn corpus() -> tempfile::TempDir {
570        let dir = tempfile::tempdir().unwrap();
571        std::fs::write(dir.path().join("a.rs"), "fn handler() {}\nlet x = 1;\n").unwrap();
572        std::fs::write(dir.path().join("b.rs"), "fn other() {}\n// nothing here\n").unwrap();
573        std::fs::write(dir.path().join("c.txt"), "handler appears in text too\n").unwrap();
574        dir
575    }
576
577    #[test]
578    fn build_refuses_to_index_home_directory() {
579        // Auto-indexing HOME would walk the entire home tree and, on Windows,
580        // hydrate every OneDrive placeholder (#363). The build must bail out.
581        if let Some(home) = dirs::home_dir() {
582            assert!(
583                SearchIndex::build(&home.to_string_lossy(), true, false).is_none(),
584                "search index must never auto-build over the home directory"
585            );
586        }
587    }
588
589    #[test]
590    fn narrows_to_files_containing_literal() {
591        let dir = corpus();
592        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
593        let cands = idx.candidate_paths("handler", None);
594        let paths = cands.into_paths();
595        // a.rs and c.txt contain "handler"; b.rs must be excluded.
596        assert!(paths.iter().any(|p| p.ends_with("a.rs")));
597        assert!(paths.iter().any(|p| p.ends_with("c.txt")));
598        assert!(!paths.iter().any(|p| p.ends_with("b.rs")));
599    }
600
601    #[test]
602    fn absent_trigram_yields_empty_candidates() {
603        let dir = corpus();
604        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
605        match idx.candidate_paths("zzzqqq", None) {
606            CandidateSet::Narrowed(p) => assert!(p.is_empty()),
607            CandidateSet::FullList(_) => panic!("pure literal should narrow"),
608        }
609    }
610
611    #[test]
612    fn ext_filter_restricts_candidates() {
613        let dir = corpus();
614        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
615        let paths = idx.candidate_paths("handler", Some("rs")).into_paths();
616        assert!(paths.iter().all(|p| p.extension().unwrap() == "rs"));
617        assert!(paths.iter().any(|p| p.ends_with("a.rs")));
618    }
619
620    #[test]
621    #[cfg(unix)]
622    fn build_skips_named_pipe_without_hanging() {
623        use std::sync::mpsc;
624        use std::time::Duration;
625        // #336: the background index build read every file, so a FIFO in the
626        // corpus blocked the build thread forever. It must be skipped while the
627        // regular files are still indexed, and the build must return.
628        let dir = corpus();
629        let fifo = dir.path().join("pipe.fifo");
630        let c = std::ffi::CString::new(fifo.to_string_lossy().as_bytes()).unwrap();
631        assert_eq!(
632            // SAFETY: `c` is a live CString providing a valid NUL-terminated
633            // path pointer for the duration of the call.
634            unsafe { libc::mkfifo(c.as_ptr(), 0o644) },
635            0,
636            "mkfifo failed"
637        );
638
639        let root = dir.path().to_str().unwrap().to_string();
640        let (tx, rx) = mpsc::channel();
641        std::thread::spawn(move || {
642            let built = SearchIndex::build(&root, true, false);
643            let _ = tx.send(built.map(|idx| idx.candidate_paths("handler", None).into_paths()));
644        });
645        let paths = rx
646            .recv_timeout(Duration::from_secs(5))
647            .expect("SearchIndex::build hung on a FIFO (#336 regression)")
648            .expect("index should build");
649        assert!(paths.iter().any(|p| p.ends_with("a.rs")));
650        assert!(!paths.iter().any(|p| p.ends_with("pipe.fifo")));
651    }
652
653    #[test]
654    fn regex_query_falls_back_to_full_list() {
655        let dir = corpus();
656        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
657        match idx.candidate_paths("fn .*\\(\\)", None) {
658            CandidateSet::FullList(p) => assert!(!p.is_empty()),
659            CandidateSet::Narrowed(_) => panic!("metachar query must not narrow"),
660        }
661    }
662
663    #[test]
664    fn short_query_falls_back_to_full_list() {
665        let dir = corpus();
666        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
667        assert!(matches!(
668            idx.candidate_paths("fn", None),
669            CandidateSet::FullList(_)
670        ));
671    }
672
673    /// The core correctness claim: trigram narrowing never drops a real match.
674    /// For each literal query, the set of `file:line` hits found by scanning only
675    /// the narrowed candidates must equal the set found by scanning every file.
676    #[test]
677    fn narrowing_has_identical_recall_to_full_scan() {
678        use regex::Regex;
679        use std::collections::BTreeSet;
680
681        let dir = tempfile::tempdir().unwrap();
682        // A spread of files; some contain the query tokens, most do not.
683        let samples = [
684            (
685                "auth/login.rs",
686                "fn authenticate(user) {}\nlet token = mint();\n",
687            ),
688            (
689                "auth/session.rs",
690                "struct Session;\n// authenticate again here\n",
691            ),
692            ("db/pool.rs", "fn connect() {}\nlet retries = 3;\n"),
693            (
694                "ui/button.tsx",
695                "export const Button = () => authenticate;\n",
696            ),
697            ("readme.md", "This project uses authenticate flows.\n"),
698            ("unrelated.rs", "fn helper() { let v = 1; }\n"),
699        ];
700        for (rel, content) in samples {
701            let p = dir.path().join(rel);
702            std::fs::create_dir_all(p.parent().unwrap()).unwrap();
703            std::fs::write(p, content).unwrap();
704        }
705        let idx = SearchIndex::build(dir.path().to_str().unwrap(), true, false).unwrap();
706
707        let full_scan = |pat: &str| -> BTreeSet<String> {
708            let re = Regex::new(pat).unwrap();
709            let mut hits = BTreeSet::new();
710            for (rel, content) in samples {
711                for (i, line) in content.lines().enumerate() {
712                    if re.is_match(line) {
713                        hits.insert(format!("{rel}:{}", i + 1));
714                    }
715                }
716            }
717            hits
718        };
719
720        for query in ["authenticate", "Session", "retries", "token"] {
721            let re = Regex::new(query).unwrap();
722            let candidates = idx.candidate_paths(query, None).into_paths();
723            let mut narrowed = BTreeSet::new();
724            for path in &candidates {
725                let content = std::fs::read_to_string(path).unwrap();
726                let rel = path.strip_prefix(dir.path()).unwrap().to_string_lossy();
727                for (i, line) in content.lines().enumerate() {
728                    if re.is_match(line) {
729                        narrowed.insert(format!("{}:{}", rel.replace('\\', "/"), i + 1));
730                    }
731                }
732            }
733            assert_eq!(
734                narrowed,
735                full_scan(query),
736                "recall mismatch for query {query:?}"
737            );
738        }
739    }
740
741    #[test]
742    fn intersect_sorted_basic() {
743        assert_eq!(
744            intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]),
745            vec![2, 3, 5]
746        );
747        assert_eq!(intersect_sorted(&[1, 2], &[3, 4]), Vec::<u32>::new());
748    }
749
750    // ── Bloom tier ────────────────────────────────────────────────────────
751
752    fn trigrams_of(s: &str) -> Vec<u32> {
753        let mut set = HashSet::new();
754        let b = s.as_bytes();
755        if b.len() >= 3 {
756            for w in b.windows(3) {
757                if is_word_byte(w[0]) && is_word_byte(w[1]) && is_word_byte(w[2]) {
758                    set.insert(pack(w[0], w[1], w[2]));
759                }
760            }
761        }
762        let mut v: Vec<u32> = set.into_iter().collect();
763        v.sort_unstable();
764        v
765    }
766
767    #[test]
768    fn file_bloom_has_no_false_negatives() {
769        let tris = trigrams_of("fn authenticate(user) { let token = mint(); }");
770        let mut bloom = FileBloom::with_capacity(tris.len());
771        for &t in &tris {
772            bloom.insert(t);
773        }
774        // Every inserted trigram must be reported present (Bloom guarantee).
775        assert!(tris.iter().all(|&t| bloom.maybe_contains(t)));
776    }
777
778    /// Parity fuzz: the Bloom tier must return a *superset* of the exact posting
779    /// tier for every query (zero false negatives). False positives are allowed
780    /// (and verified away downstream), so we assert containment, not equality.
781    #[test]
782    fn bloom_tier_is_superset_of_postings_tier() {
783        // Deterministic synthetic corpus (LCG → reproducible).
784        let mut seed = 0x1234_5678_9abc_def0u64;
785        let mut rng = || {
786            seed = seed
787                .wrapping_mul(6364136223846793005)
788                .wrapping_add(1442695040888963407);
789            (seed >> 33) as u32
790        };
791        let mut per_file: Vec<Vec<u32>> = Vec::new();
792        for _ in 0..80 {
793            let n = 50 + (rng() % 250) as usize;
794            let mut s = HashSet::new();
795            for _ in 0..n {
796                s.insert(rng() & 0x00FF_FFFF);
797            }
798            let mut v: Vec<u32> = s.into_iter().collect();
799            v.sort_unstable();
800            per_file.push(v);
801        }
802        let total: usize = per_file.iter().map(Vec::len).sum();
803
804        let postings = build_narrowing(&per_file, total); // ≤ cap → postings
805        let blooms = build_narrowing(&per_file, MAX_POSTING_ENTRIES + 1); // forced bloom
806        let (Narrowing::Postings(pt), Narrowing::Blooms(bl)) = (&postings, &blooms) else {
807            panic!("unexpected narrowing tiers");
808        };
809
810        // Queries drawn from real file trigrams (these MUST be found by both),
811        // plus a few that are unlikely to exist anywhere.
812        for f in &per_file {
813            if f.len() < 3 {
814                continue;
815            }
816            let q = vec![f[0], f[f.len() / 2], f[f.len() - 1]];
817            let exact = SearchIndex::postings_intersect(pt, &q);
818            let bloom: HashSet<u32> = SearchIndex::bloom_scan(bl, &q).into_iter().collect();
819            for id in exact {
820                assert!(
821                    bloom.contains(&id),
822                    "Bloom tier dropped a true match (false negative) for {q:?}"
823                );
824            }
825        }
826    }
827
828    /// End-to-end: an index forced onto the Bloom tier must still surface every
829    /// file that actually contains the literal (recall parity with a full scan).
830    #[test]
831    fn bloom_tier_end_to_end_recall() {
832        let samples = [
833            (
834                "auth_login.rs",
835                "fn authenticate(user) {}\nlet token = mint();\n",
836            ),
837            (
838                "auth_session.rs",
839                "struct Session;\n// authenticate again here\n",
840            ),
841            ("db_pool.rs", "fn connect() {}\nlet retries = 3;\n"),
842            (
843                "ui_button.tsx",
844                "export const Button = () => authenticate;\n",
845            ),
846            ("readme.md", "This project uses authenticate flows.\n"),
847            ("unrelated.rs", "fn helper() { let v = 1; }\n"),
848        ];
849        let files: Vec<PathBuf> = samples.iter().map(|(rel, _)| PathBuf::from(rel)).collect();
850        let per_file: Vec<Vec<u32>> = samples.iter().map(|(_, c)| trigrams_of(c)).collect();
851
852        let idx = SearchIndex {
853            files,
854            narrowing: build_narrowing(&per_file, MAX_POSTING_ENTRIES + 1),
855            respect_gitignore: true,
856            allow_secret_paths: false,
857            built_at: Instant::now(),
858        };
859        assert!(
860            matches!(idx.narrowing, Narrowing::Blooms(_)),
861            "test must exercise the Bloom tier"
862        );
863
864        for query in ["authenticate", "Session", "retries", "token"] {
865            let cands: HashSet<String> = idx
866                .candidate_paths(query, None)
867                .into_paths()
868                .iter()
869                .map(|p| p.to_string_lossy().to_string())
870                .collect();
871            for (rel, content) in samples {
872                if content.contains(query) {
873                    assert!(
874                        cands.contains(rel),
875                        "Bloom tier dropped real match {rel} for query {query:?}"
876                    );
877                }
878            }
879        }
880    }
881}