Skip to main content

lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the number of files `list_code_files` collects for one index.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// Compression level passed to `zstd::encode_all` when persisting the index.
const ZSTD_LEVEL: i32 = 9;

/// Glob patterns that are always excluded from indexing, in addition to the
/// configured `extra_ignore_patterns`. `list_code_files` matches them against
/// paths relative to the scan root.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
27
28fn max_bm25_cache_bytes() -> u64 {
29    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30        .ok()
31        .and_then(|v| v.parse::<u64>().ok())
32        .unwrap_or_else(|| {
33            let cfg = crate::core::config::Config::load();
34            let profile = crate::core::config::MemoryProfile::effective(&cfg);
35            let profile_mb = profile.bm25_max_cache_mb();
36            if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37                profile_mb
38            } else {
39                cfg.bm25_max_cache_mb
40            }
41        });
42    mb * 1024 * 1024
43}
44
/// One indexed unit of text: a code symbol, a content-defined file segment,
/// or a chunk converted from an external content source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path identifying where the chunk came from. Files indexed from disk
    /// use the path relative to the scan root; `external_chunk_count` treats
    /// paths containing `://` as external.
    pub file_path: String,
    /// Name of the symbol this chunk represents.
    pub symbol_name: String,
    /// Category of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk (the line-based extractor emits 1-based numbers).
    pub start_line: usize,
    /// Last line of the chunk, inclusive.
    pub end_line: usize,
    /// Text of the chunk.
    pub content: String,
    /// Token list; `add_chunk` always stores it empty. `#[serde(default)]`
    /// lets serialized data without this field still deserialize.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens counted for the chunk; `add_chunk` overwrites it with
    /// the token count of the enriched text it indexes.
    pub token_count: usize,
}
57
/// Category of a [`CodeChunk`]: either a kind of code symbol or a kind of
/// externally sourced content.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    // -- Code symbol kinds --
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
    // -- External source kinds (Context Engine) --
    Issue,
    PullRequest,
    WikiPage,
    DbSchema,
    ApiEndpoint,
    Ticket,
    ExternalOther,
}
76
/// Snapshot of a file's metadata taken when it was indexed; two snapshots are
/// compared for equality to decide whether the file changed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
82
83impl IndexedFileState {
84    fn from_path(path: &Path) -> Option<Self> {
85        let meta = path.metadata().ok()?;
86        let size_bytes = meta.len();
87        let mtime_ms = meta
88            .modified()
89            .ok()
90            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
91            .map(|d| d.as_millis() as u64)?;
92        Some(Self {
93            mtime_ms,
94            size_bytes,
95        })
96    }
97}
98
/// In-memory BM25 search index over [`CodeChunk`]s, serializable for
/// persistence on disk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; postings refer to positions in this vector.
    pub chunks: Vec<CodeChunk>,
    /// Lowercased term -> postings of `(chunk index, weight)`. `add_chunk`
    /// appends one posting with weight `1.0` per token occurrence.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, computed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks as of the last `finalize`.
    pub doc_count: usize,
    /// Lowercased term -> number of chunks that contain it.
    pub doc_freqs: HashMap<String, usize>,
    /// Relative file path -> file state recorded when the file was indexed.
    /// `#[serde(default)]` lets serialized data without this field load.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
109
/// One hit returned by [`BM25Index::search`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Score accumulated over all matching query terms (higher is better).
    pub score: f64,
    /// `file_path` of the matching chunk.
    pub file_path: String,
    /// `symbol_name` of the matching chunk.
    pub symbol_name: String,
    /// `kind` of the matching chunk.
    pub kind: ChunkKind,
    /// `start_line` of the matching chunk.
    pub start_line: usize,
    /// `end_line` of the matching chunk.
    pub end_line: usize,
    /// The first five lines of the chunk's content.
    pub snippet: String,
}
121
/// `k1` parameter of the BM25 formula evaluated in `BM25Index::search`.
const BM25_K1: f64 = 1.2;
/// `b` parameter of the BM25 formula: how strongly the normalized chunk
/// length enters the denominator in `BM25Index::search`.
const BM25_B: f64 = 0.75;
124
125impl Default for BM25Index {
126    fn default() -> Self {
127        Self::new()
128    }
129}
130
131impl BM25Index {
132    pub fn new() -> Self {
133        Self {
134            chunks: Vec::new(),
135            inverted: HashMap::new(),
136            avg_doc_len: 0.0,
137            doc_count: 0,
138            doc_freqs: HashMap::new(),
139            files: HashMap::new(),
140        }
141    }
142
143    /// Approximate heap memory used by this index in bytes.
144    pub fn memory_usage_bytes(&self) -> usize {
145        let chunks_size: usize = self
146            .chunks
147            .iter()
148            .map(|c| {
149                c.content.len()
150                    + c.file_path.len()
151                    + c.symbol_name.len()
152                    + c.tokens.iter().map(String::len).sum::<usize>()
153                    + 64
154            })
155            .sum();
156        let inverted_size: usize = self
157            .inverted
158            .iter()
159            .map(|(k, v)| k.len() + v.len() * 16 + 32)
160            .sum();
161        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
162        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
163        chunks_size + inverted_size + files_size + freqs_size
164    }
165
166    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
167    pub fn unload(&mut self) {
168        let usage = self.memory_usage_bytes();
169        self.chunks = Vec::new();
170        self.inverted = HashMap::new();
171        self.doc_freqs = HashMap::new();
172        self.files = HashMap::new();
173        self.avg_doc_len = 0.0;
174        self.doc_count = 0;
175        tracing::info!(
176            "[bm25] unloaded index, freed ~{:.1}MB",
177            usage as f64 / 1_048_576.0
178        );
179    }
180
181    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
182    #[cfg(test)]
183    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
184        let mut index = Self::new();
185        for mut chunk in chunks {
186            if chunk.token_count == 0 {
187                chunk.token_count = tokenize(&chunk.content).len();
188            }
189            index.add_chunk(chunk);
190        }
191        index.finalize();
192        index
193    }
194
195    pub fn build_from_directory(root: &Path) -> Self {
196        Self::build_from_directory_inner(root, &HashMap::new())
197    }
198
199    /// Like `build_from_directory` but reuses file content from a prior scan
200    /// (e.g. the graph index walk) to avoid redundant disk reads.
201    pub fn build_with_content_hint(root: &Path, content_hint: &HashMap<String, String>) -> Self {
202        Self::build_from_directory_inner(root, content_hint)
203    }
204
205    fn build_from_directory_inner(root: &Path, content_hint: &HashMap<String, String>) -> Self {
206        let root_str = root.to_string_lossy();
207        if !super::graph_index::is_safe_scan_root_public(&root_str) {
208            tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
209            return Self::new();
210        }
211        let mut index = Self::new();
212        let files = list_code_files(root);
213        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
214        let mut cache_hits = 0usize;
215
216        for (i, rel) in files.iter().enumerate() {
217            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
218                tracing::warn!(
219                    "[bm25: stopping build at file {i}/{} due to memory pressure]",
220                    files.len()
221                );
222                break;
223            }
224            if crate::core::memory_guard::abort_requested() {
225                tracing::warn!("[bm25: aborting build due to critical memory pressure]");
226                break;
227            }
228
229            let abs = root.join(rel);
230            let Some(state) = IndexedFileState::from_path(&abs) else {
231                continue;
232            };
233            if state.size_bytes > MAX_FILE_SIZE_BYTES {
234                continue;
235            }
236
237            let content = if let Some(cached) = content_hint.get(rel) {
238                cache_hits += 1;
239                std::borrow::Cow::Borrowed(cached.as_str())
240            } else {
241                match std::fs::read_to_string(&abs) {
242                    Ok(c) => std::borrow::Cow::Owned(c),
243                    Err(_) => continue,
244                }
245            };
246
247            let mut chunks = extract_chunks(rel, &content);
248            chunks.sort_by(|a, b| {
249                a.start_line
250                    .cmp(&b.start_line)
251                    .then_with(|| a.end_line.cmp(&b.end_line))
252                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
253            });
254            for chunk in chunks {
255                index.add_chunk(chunk);
256            }
257            index.files.insert(rel.clone(), state);
258        }
259
260        if cache_hits > 0 {
261            tracing::info!(
262                "[bm25: reused {cache_hits}/{} file contents from graph scan cache]",
263                files.len()
264            );
265        }
266
267        index.finalize();
268        index
269    }
270
271    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
272        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
273        for c in &prev.chunks {
274            old_by_file
275                .entry(c.file_path.clone())
276                .or_default()
277                .push(c.clone());
278        }
279        for v in old_by_file.values_mut() {
280            v.sort_by(|a, b| {
281                a.start_line
282                    .cmp(&b.start_line)
283                    .then_with(|| a.end_line.cmp(&b.end_line))
284                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
285            });
286        }
287
288        let mut index = Self::new();
289        let files = list_code_files(root);
290        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
291
292        for (i, rel) in files.iter().enumerate() {
293            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
294                tracing::warn!(
295                    "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
296                    files.len()
297                );
298                break;
299            }
300
301            let abs = root.join(rel);
302            let Some(state) = IndexedFileState::from_path(&abs) else {
303                continue;
304            };
305
306            let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
307            if unchanged {
308                if let Some(chunks) = old_by_file.get(rel) {
309                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
310                        for chunk in chunks {
311                            index.add_chunk(chunk.clone());
312                        }
313                        index.files.insert(rel.clone(), state);
314                        continue;
315                    }
316                }
317            }
318
319            if state.size_bytes > MAX_FILE_SIZE_BYTES {
320                continue;
321            }
322            if let Ok(content) = std::fs::read_to_string(&abs) {
323                let mut chunks = extract_chunks(rel, &content);
324                chunks.sort_by(|a, b| {
325                    a.start_line
326                        .cmp(&b.start_line)
327                        .then_with(|| a.end_line.cmp(&b.end_line))
328                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
329                });
330                for chunk in chunks {
331                    index.add_chunk(chunk);
332                }
333                index.files.insert(rel.clone(), state);
334            }
335        }
336
337        index.finalize();
338        index
339    }
340
341    fn add_chunk(&mut self, chunk: CodeChunk) {
342        let idx = self.chunks.len();
343
344        let enriched = enrich_for_bm25(&chunk);
345        let tokens = tokenize(&enriched);
346        for token in &tokens {
347            let lower = token.to_lowercase();
348            let postings = self.inverted.entry(lower.clone()).or_default();
349            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
350                *self.doc_freqs.entry(lower).or_insert(0) += 1;
351            }
352            postings.push((idx, 1.0));
353        }
354
355        self.chunks.push(CodeChunk {
356            token_count: tokens.len(),
357            tokens: Vec::new(),
358            ..chunk
359        });
360    }
361
362    fn finalize(&mut self) {
363        self.doc_count = self.chunks.len();
364        if self.doc_count == 0 {
365            return;
366        }
367
368        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
369        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
370    }
371
372    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
373        let query_tokens = tokenize(query);
374        if query_tokens.is_empty() || self.doc_count == 0 {
375            return Vec::new();
376        }
377
378        // Pre-allocated score array: O(1) per-access vs HashMap overhead.
379        // Kolmogorov-optimal: minimal allocation for the scoring operation.
380        let n = self.chunks.len();
381        let mut scores = vec![0.0f64; n];
382        let mut touched = Vec::with_capacity(n.min(256));
383
384        for token in &query_tokens {
385            let lower = token.to_lowercase();
386            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
387            if df == 0.0 {
388                continue;
389            }
390
391            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
392
393            if let Some(postings) = self.inverted.get(&lower) {
394                for &(idx, weight) in postings {
395                    let doc_len = self.chunks[idx].token_count as f64;
396                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
397                    let bm25 = idf * (weight * (BM25_K1 + 1.0))
398                        / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
399
400                    if scores[idx] == 0.0 {
401                        touched.push(idx);
402                    }
403                    scores[idx] += bm25;
404                }
405            }
406        }
407
408        let mut results: Vec<SearchResult> = touched
409            .iter()
410            .filter(|&&idx| scores[idx] > 0.0)
411            .map(|&idx| {
412                let chunk = &self.chunks[idx];
413                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
414                SearchResult {
415                    chunk_idx: idx,
416                    score: scores[idx],
417                    file_path: chunk.file_path.clone(),
418                    symbol_name: chunk.symbol_name.clone(),
419                    kind: chunk.kind.clone(),
420                    start_line: chunk.start_line,
421                    end_line: chunk.end_line,
422                    snippet,
423                }
424            })
425            .collect();
426
427        results.sort_by(|a, b| {
428            b.score
429                .partial_cmp(&a.score)
430                .unwrap_or(std::cmp::Ordering::Equal)
431                .then_with(|| a.file_path.cmp(&b.file_path))
432                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
433                .then_with(|| a.start_line.cmp(&b.start_line))
434                .then_with(|| a.end_line.cmp(&b.end_line))
435        });
436        results.truncate(top_k);
437        results
438    }
439
    /// Persists the index for `root` as a zstd-compressed bincode file.
    ///
    /// Returns `Ok(())` without writing anything when the compressed index
    /// exceeds the configured cache limit (a warning is logged instead).
    ///
    /// # Errors
    /// Returns an `io::Error` when the index directory cannot be created,
    /// encoding or compression fails, or the index file cannot be written
    /// or renamed into place.
    pub fn save(&self, root: &Path) -> std::io::Result<()> {
        if self.chunks.len() > CHUNK_COUNT_WARNING {
            tracing::warn!(
                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
                self.chunks.len(),
                CHUNK_COUNT_WARNING
            );
        }

        let dir = index_dir(root);
        std::fs::create_dir_all(&dir)?;
        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
            .map_err(|e| std::io::Error::other(e.to_string()))?;

        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;

        // An oversized index is not an error for the caller: it is simply
        // not persisted.
        let max_bytes = max_bm25_cache_bytes();
        if compressed.len() as u64 > max_bytes {
            tracing::warn!(
                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
                compressed.len() as f64 / 1_048_576.0,
                max_bytes / (1024 * 1024),
                dir.display()
            );
            return Ok(());
        }

        tracing::info!(
            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
            data.len() as f64 / 1_048_576.0,
            compressed.len() as f64 / 1_048_576.0,
            (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
        );

        // Write to a temporary file first, then rename it over the target so
        // the final path never holds a partially written index.
        let target = dir.join("bm25_index.bin.zst");
        let tmp = dir.join("bm25_index.bin.zst.tmp");
        std::fs::write(&tmp, &compressed)?;
        std::fs::rename(&tmp, &target)?;

        // Best-effort cleanup of the older on-disk formats.
        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
        let _ = std::fs::remove_file(dir.join("bm25_index.json"));

        // Best-effort record of which project root this index belongs to.
        let _ = std::fs::write(
            dir.join("project_root.txt"),
            root.to_string_lossy().as_bytes(),
        );

        Ok(())
    }
490
491    pub fn load(root: &Path) -> Option<Self> {
492        let dir = index_dir(root);
493        let max_bytes = max_bm25_cache_bytes();
494
495        let zst_path = dir.join("bm25_index.bin.zst");
496        if zst_path.exists() {
497            let meta = std::fs::metadata(&zst_path).ok()?;
498            if meta.len() > max_bytes {
499                tracing::warn!(
500                    "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
501                    meta.len() as f64 / 1_073_741_824.0,
502                    max_bytes / (1024 * 1024),
503                    zst_path.display()
504                );
505                let quarantined = zst_path.with_extension("zst.quarantined");
506                let _ = std::fs::rename(&zst_path, &quarantined);
507                return None;
508            }
509            let compressed = std::fs::read(&zst_path).ok()?;
510            let max_decompressed = max_bytes * 20; // allow 20x expansion ratio
511            let data = bounded_zstd_decode(&compressed, max_decompressed)?;
512            let (idx, _): (Self, _) =
513                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
514            return Some(idx);
515        }
516
517        let bin_path = dir.join("bm25_index.bin");
518        if bin_path.exists() {
519            let meta = std::fs::metadata(&bin_path).ok()?;
520            if meta.len() > max_bytes {
521                tracing::warn!(
522                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
523                    meta.len() as f64 / 1_073_741_824.0,
524                    max_bytes / (1024 * 1024),
525                    bin_path.display()
526                );
527                let quarantined = bin_path.with_extension("bin.quarantined");
528                let _ = std::fs::rename(&bin_path, &quarantined);
529                return None;
530            }
531            let data = std::fs::read(&bin_path).ok()?;
532            let (idx, _): (Self, _) =
533                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
534            // Auto-migrate: compress legacy .bin to .bin.zst
535            if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
536                let zst_tmp = zst_path.with_extension("zst.tmp");
537                if std::fs::write(&zst_tmp, &compressed).is_ok()
538                    && std::fs::rename(&zst_tmp, &zst_path).is_ok()
539                {
540                    tracing::info!(
541                        "[bm25] migrated {:.1} MB → {:.1} MB zstd",
542                        data.len() as f64 / 1_048_576.0,
543                        compressed.len() as f64 / 1_048_576.0
544                    );
545                    let _ = std::fs::remove_file(&bin_path);
546                }
547            }
548            return Some(idx);
549        }
550
551        let json_path = dir.join("bm25_index.json");
552        if json_path.exists() {
553            let meta = std::fs::metadata(&json_path).ok()?;
554            if meta.len() > max_bytes {
555                tracing::warn!(
556                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
557                    meta.len() as f64 / 1_073_741_824.0,
558                    max_bytes / (1024 * 1024),
559                    json_path.display()
560                );
561                let quarantined = json_path.with_extension("json.quarantined");
562                let _ = std::fs::rename(&json_path, &quarantined);
563                return None;
564            }
565            let data = std::fs::read_to_string(&json_path).ok()?;
566            return serde_json::from_str(&data).ok();
567        }
568
569        None
570    }
571
572    pub fn load_or_build(root: &Path) -> Self {
573        Self::load_or_build_inner(root, false)
574    }
575
576    /// Like `load_or_build` but uses a fast sentinel-sampling staleness check
577    /// that skips the expensive full directory walk for new-file detection.
578    pub fn load_or_build_fast(root: &Path) -> Self {
579        Self::load_or_build_inner(root, true)
580    }
581
582    fn load_or_build_inner(root: &Path, fast_stale: bool) -> Self {
583        if !is_safe_bm25_root(root) {
584            return Self::default();
585        }
586        if let Some(idx) = Self::load(root) {
587            let stale = if fast_stale {
588                bm25_index_looks_stale_fast(&idx, root)
589            } else {
590                bm25_index_looks_stale(&idx, root)
591            };
592            if !stale {
593                return idx;
594            }
595            tracing::debug!(
596                "[bm25_index: stale index detected for {}; rebuilding]",
597                root.display()
598            );
599            let rebuilt = if idx.files.is_empty() {
600                Self::build_from_directory(root)
601            } else {
602                Self::rebuild_incremental(root, &idx)
603            };
604            let _ = rebuilt.save(root);
605            return rebuilt;
606        }
607
608        let built = Self::build_from_directory(root);
609        let _ = built.save(root);
610        built
611    }
612
613    pub fn index_file_path(root: &Path) -> PathBuf {
614        let dir = index_dir(root);
615        let zst = dir.join("bm25_index.bin.zst");
616        if zst.exists() {
617            return zst;
618        }
619        let bin = dir.join("bm25_index.bin");
620        if bin.exists() {
621            return bin;
622        }
623        dir.join("bm25_index.json")
624    }
625
626    /// Ingest external `ContentChunk`s into the BM25 index.
627    /// Converts each chunk to a `CodeChunk` (backward-compatible) and
628    /// rebuilds the inverted index. Returns the number of chunks ingested.
629    pub fn ingest_content_chunks(
630        &mut self,
631        chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
632    ) -> usize {
633        let mut count = 0usize;
634        for cc in chunks {
635            self.add_chunk(cc.into());
636            count += 1;
637        }
638        if count > 0 {
639            self.finalize();
640        }
641        count
642    }
643
644    /// Number of chunks originating from external providers.
645    pub fn external_chunk_count(&self) -> usize {
646        self.chunks
647            .iter()
648            .filter(|c| c.file_path.contains("://"))
649            .count()
650    }
651}
652
653fn is_safe_bm25_root(root: &Path) -> bool {
654    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
655}
656
657fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
658    bm25_index_looks_stale_inner(index, root, false)
659}
660
661/// Fast staleness check: samples a subset of tracked files and skips the
662/// expensive `list_code_files()` walk for new-file detection.
663pub fn bm25_index_looks_stale_fast(index: &BM25Index, root: &Path) -> bool {
664    bm25_index_looks_stale_inner(index, root, true)
665}
666
667fn bm25_index_looks_stale_inner(index: &BM25Index, root: &Path, fast: bool) -> bool {
668    if index.chunks.is_empty() {
669        return false;
670    }
671
672    if index.files.is_empty() {
673        let mut seen = std::collections::HashSet::<&str>::new();
674        for chunk in &index.chunks {
675            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
676            if rel.is_empty() {
677                continue;
678            }
679            if !seen.insert(rel) {
680                continue;
681            }
682            if !root.join(rel).exists() {
683                return true;
684            }
685        }
686        return false;
687    }
688
689    if fast {
690        let sample_size = index.files.len().min(SENTINEL_SAMPLE_SIZE);
691        let step = if index.files.len() > sample_size {
692            index.files.len() / sample_size
693        } else {
694            1
695        };
696        for (i, (rel, old_state)) in index.files.iter().enumerate() {
697            if i % step != 0 {
698                continue;
699            }
700            let abs = root.join(rel);
701            if !abs.exists() {
702                return true;
703            }
704            let Some(cur) = IndexedFileState::from_path(&abs) else {
705                return true;
706            };
707            if &cur != old_state {
708                return true;
709            }
710        }
711        return false;
712    }
713
714    for (rel, old_state) in &index.files {
715        let abs = root.join(rel);
716        if !abs.exists() {
717            return true;
718        }
719        let Some(cur) = IndexedFileState::from_path(&abs) else {
720            return true;
721        };
722        if &cur != old_state {
723            return true;
724        }
725    }
726
727    for rel in list_code_files(root) {
728        if !index.files.contains_key(&rel) {
729            return true;
730        }
731    }
732
733    false
734}
735
/// Target number of tracked files inspected by the fast staleness check.
const SENTINEL_SAMPLE_SIZE: usize = 10;
737
738fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
739    use std::io::Read;
740    let mut decoder = zstd::Decoder::new(compressed).ok()?;
741    let mut buf = Vec::new();
742    let mut chunk = vec![0u8; 65536];
743    let mut total = 0u64;
744    loop {
745        let n = decoder.read(&mut chunk).ok()?;
746        if n == 0 {
747            break;
748        }
749        total += n as u64;
750        if total > max_bytes {
751            tracing::warn!(
752                "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
753                total as f64 / (1024.0 * 1024.0),
754                max_bytes as f64 / (1024.0 * 1024.0)
755            );
756            return None;
757        }
758        buf.extend_from_slice(&chunk[..n]);
759    }
760    Some(buf)
761}
762
763fn index_dir(root: &Path) -> PathBuf {
764    crate::core::index_namespace::vectors_dir(root)
765}
766
767fn list_code_files(root: &Path) -> Vec<String> {
768    let walker = ignore::WalkBuilder::new(root)
769        .hidden(true)
770        .git_ignore(true)
771        .git_global(true)
772        .git_exclude(true)
773        .max_depth(Some(20))
774        .build();
775
776    let cfg = crate::core::config::Config::load();
777    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
778        .iter()
779        .filter_map(|p| glob::Pattern::new(p).ok())
780        .collect();
781    ignore_patterns.extend(
782        cfg.extra_ignore_patterns
783            .iter()
784            .filter_map(|p| glob::Pattern::new(p).ok()),
785    );
786
787    let mut files: Vec<String> = Vec::new();
788    for entry in walker.flatten() {
789        let path = entry.path();
790        if !path.is_file() {
791            continue;
792        }
793        if !is_code_file(path) {
794            continue;
795        }
796        let rel = path
797            .strip_prefix(root)
798            .unwrap_or(path)
799            .to_string_lossy()
800            .to_string();
801        if rel.is_empty() {
802            continue;
803        }
804        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
805            continue;
806        }
807        if files.len() >= MAX_BM25_FILES {
808            tracing::warn!(
809                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
810                root.display()
811            );
812            break;
813        }
814        files.push(rel);
815    }
816
817    files.sort();
818    files.dedup();
819    files
820}
821
/// Returns `true` when the extension of `path`, compared case-insensitively,
/// is one of the source-file extensions that get indexed. Paths without an
/// extension, or with a non-UTF-8 extension, are rejected.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(raw_ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = raw_ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
858
859fn tokenize(text: &str) -> Vec<String> {
860    let mut tokens = Vec::new();
861    let mut current = String::new();
862
863    for ch in text.chars() {
864        if ch.is_alphanumeric() || ch == '_' {
865            current.push(ch);
866        } else {
867            if current.len() >= 2 {
868                tokens.push(current.clone());
869            }
870            current.clear();
871        }
872    }
873    if current.len() >= 2 {
874        tokens.push(current);
875    }
876
877    split_camel_case_tokens(&tokens)
878}
879
880pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
881    tokenize(text)
882}
883
/// Expands each token with its camel-case parts.
///
/// Every input token is kept as-is and followed by the parts obtained by
/// splitting it at camel-case word boundaries; parts shorter than two bytes
/// are dropped, and tokens without a boundary contribute no parts.
///
/// An uppercase letter starts a new word when it follows a non-uppercase
/// character (`fooBar` -> `foo`, `Bar`) or when it is followed by a lowercase
/// letter (`HTTPServer` -> `HTTP`, `Server`). An uppercase run at the end of
/// a token therefore stays together (`parseHTTP` -> `parse`, `HTTP`), and an
/// all-uppercase token is not split.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let follows_non_upper = !chars[i - 1].is_uppercase();
            let precedes_lower = chars.get(i + 1).is_some_and(|c| c.is_lowercase());
            if follows_non_upper || precedes_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Emit the trailing part only when the token was actually split.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }
    result
}
908
909fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
910    #[cfg(feature = "tree-sitter")]
911    {
912        let ext = std::path::Path::new(file_path)
913            .extension()
914            .and_then(|e| e.to_str())
915            .unwrap_or("");
916        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
917            return chunks;
918        }
919    }
920
921    let lines: Vec<&str> = content.lines().collect();
922    if lines.is_empty() {
923        return Vec::new();
924    }
925
926    let mut chunks = Vec::new();
927    let mut i = 0;
928
929    while i < lines.len() {
930        let trimmed = lines[i].trim();
931
932        if let Some((name, kind)) = detect_symbol(trimmed) {
933            let start = i;
934            let end = find_block_end(&lines, i);
935            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
936            let token_count = tokenize(&block).len();
937
938            chunks.push(CodeChunk {
939                file_path: file_path.to_string(),
940                symbol_name: name,
941                kind,
942                start_line: start + 1,
943                end_line: end + 1,
944                content: block,
945                tokens: Vec::new(),
946                token_count,
947            });
948
949            i = end + 1;
950        } else {
951            i += 1;
952        }
953    }
954
955    if chunks.is_empty() && !content.is_empty() {
956        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
957        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
958        //
959        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
960        let bytes = content.as_bytes();
961        let rk_chunks = crate::core::rabin_karp::chunk(content);
962        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
963            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
964                let end = (c.offset + c.length).min(bytes.len());
965                let slice = &bytes[c.offset..end];
966                let chunk_text = String::from_utf8_lossy(slice).into_owned();
967                let token_count = tokenize(&chunk_text).len();
968                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
969                let end_line = start_line + bytecount::count(slice, b'\n');
970                chunks.push(CodeChunk {
971                    file_path: file_path.to_string(),
972                    symbol_name: format!("{file_path}#chunk-{idx}"),
973                    kind: ChunkKind::Module,
974                    start_line,
975                    end_line: end_line.max(start_line),
976                    content: chunk_text,
977                    tokens: Vec::new(),
978                    token_count,
979                });
980            }
981        } else {
982            let token_count = tokenize(content).len();
983            let snippet = lines
984                .iter()
985                .take(50)
986                .copied()
987                .collect::<Vec<_>>()
988                .join("\n");
989            chunks.push(CodeChunk {
990                file_path: file_path.to_string(),
991                symbol_name: file_path.to_string(),
992                kind: ChunkKind::Module,
993                start_line: 1,
994                end_line: lines.len(),
995                content: snippet,
996                tokens: Vec::new(),
997                token_count,
998            });
999        }
1000    }
1001
1002    chunks
1003}
1004
1005fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
1006    let trimmed = line.trim();
1007
1008    let patterns: &[(&str, ChunkKind)] = &[
1009        ("pub async fn ", ChunkKind::Function),
1010        ("async fn ", ChunkKind::Function),
1011        ("pub fn ", ChunkKind::Function),
1012        ("fn ", ChunkKind::Function),
1013        ("pub struct ", ChunkKind::Struct),
1014        ("struct ", ChunkKind::Struct),
1015        ("pub enum ", ChunkKind::Struct),
1016        ("enum ", ChunkKind::Struct),
1017        ("impl ", ChunkKind::Impl),
1018        ("pub trait ", ChunkKind::Struct),
1019        ("trait ", ChunkKind::Struct),
1020        ("export function ", ChunkKind::Function),
1021        ("export async function ", ChunkKind::Function),
1022        ("export default function ", ChunkKind::Function),
1023        ("function ", ChunkKind::Function),
1024        ("async function ", ChunkKind::Function),
1025        ("export class ", ChunkKind::Class),
1026        ("class ", ChunkKind::Class),
1027        ("export interface ", ChunkKind::Struct),
1028        ("interface ", ChunkKind::Struct),
1029        ("def ", ChunkKind::Function),
1030        ("async def ", ChunkKind::Function),
1031        ("class ", ChunkKind::Class),
1032        ("func ", ChunkKind::Function),
1033    ];
1034
1035    for (prefix, kind) in patterns {
1036        if let Some(rest) = trimmed.strip_prefix(prefix) {
1037            let name: String = rest
1038                .chars()
1039                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
1040                .take_while(|c| *c != '<')
1041                .collect();
1042            if !name.is_empty() {
1043                return Some((name, kind.clone()));
1044            }
1045        }
1046    }
1047
1048    None
1049}
1050
/// Find the index of the last line of the block that begins at `lines[start]`.
///
/// This is a language-agnostic heuristic used by the line-based chunker:
///
/// * **Brace-delimited blocks** — the block ends on the line where the first
///   `{` opened outside the signature's parentheses is balanced by its `}`.
///   Parentheses are tracked only to skip over the signature, so
///   `fn f(a: i32) {` or `function f({ a, b }) {` do not end the block at the
///   closing `)`.
/// * **Body-less declarations** — a line ending in `;` before any body has
///   opened (e.g. `struct Foo(u32);`) ends the block on that line.
/// * **Indentation-delimited blocks** — when no `{` has been seen, from the
///   fourth line on the block ends just before the first blank line or the
///   first line indented no deeper than the start line.
///
/// If none of these rules fires before the end of `lines`, the result is
/// capped at `start + 50`, clamped to the last valid index.
///
/// Braces and parentheses inside string literals or comments are not
/// recognised as such and are counted like any other.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    /// Number of leading whitespace bytes of `line`.
    fn indent_width(line: &str) -> usize {
        line.len() - line.trim_start().len()
    }

    let base_indent = lines.get(start).map_or(0, |line| indent_width(line));
    let mut brace_depth = 0usize;
    let mut paren_depth = 0usize;
    let mut body_opened = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        // A line that starts inside an open parameter list (e.g. the `):` that
        // closes a multi-line signature) must not be judged by indentation.
        let began_inside_parens = paren_depth > 0;

        for ch in line.chars() {
            match ch {
                // Braces inside the signature's parentheses (default values,
                // destructuring) are not the body.
                '{' if body_opened || paren_depth == 0 => {
                    brace_depth += 1;
                    body_opened = true;
                }
                '}' if brace_depth > 0 => {
                    brace_depth -= 1;
                    if brace_depth == 0 {
                        return i;
                    }
                }
                '(' if !body_opened => paren_depth += 1,
                ')' if !body_opened && paren_depth > 0 => paren_depth -= 1,
                _ => {}
            }
        }

        if body_opened || paren_depth > 0 {
            continue;
        }

        if line.trim_end().ends_with(';') {
            return i;
        }

        // Allow a few lines for a body to open (brace on its own line,
        // `where` clauses) before falling back to layout-based detection.
        if !began_inside_parens
            && i > start + 2
            && (line.trim().is_empty() || indent_width(line) <= base_indent)
        {
            return i.saturating_sub(1);
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
1088
1089pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
1090    if results.is_empty() {
1091        return "No results found.".to_string();
1092    }
1093
1094    let mut out = String::new();
1095    for (i, r) in results.iter().enumerate() {
1096        let is_external = r.file_path.contains("://");
1097        if compact {
1098            if is_external {
1099                out.push_str(&format!(
1100                    "{}. {:.2} [{:?}] {} — {}\n",
1101                    i + 1,
1102                    r.score,
1103                    r.kind,
1104                    r.file_path,
1105                    r.symbol_name,
1106                ));
1107            } else {
1108                out.push_str(&format!(
1109                    "{}. {:.2} {}:{}-{} {:?} {}\n",
1110                    i + 1,
1111                    r.score,
1112                    r.file_path,
1113                    r.start_line,
1114                    r.end_line,
1115                    r.kind,
1116                    r.symbol_name,
1117                ));
1118            }
1119        } else if is_external {
1120            out.push_str(&format!(
1121                "\n--- Result {} (score: {:.2}) [{:?}] ---\n{} — {}\n{}\n",
1122                i + 1,
1123                r.score,
1124                r.kind,
1125                r.file_path,
1126                r.symbol_name,
1127                r.snippet,
1128            ));
1129        } else {
1130            out.push_str(&format!(
1131                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
1132                i + 1,
1133                r.score,
1134                r.file_path,
1135                r.symbol_name,
1136                r.kind,
1137                r.start_line,
1138                r.end_line,
1139                r.snippet,
1140            ));
1141        }
1142    }
1143    out
1144}
1145
1146/// Enrich chunk content with file-path components for BM25 path-matching.
1147///
1148/// SACL (EMNLP 2025) shows that augmenting code with structural information
1149/// improves retrieval by 7-12.8%. We append the file stem twice (for boost)
1150/// and the immediate parent directory once, enabling queries like "auth handler"
1151/// to match `src/auth/handler.rs`.
1152fn enrich_for_bm25(chunk: &CodeChunk) -> String {
1153    let path = Path::new(&chunk.file_path);
1154    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1155    let dir = path
1156        .parent()
1157        .and_then(|p| p.file_name())
1158        .and_then(|d| d.to_str())
1159        .unwrap_or("");
1160
1161    if stem.is_empty() {
1162        return chunk.content.clone();
1163    }
1164
1165    format!("{} {} {} {}", chunk.content, stem, stem, dir)
1166}
1167
#[cfg(test)]
mod tests {
    //! Unit tests for tokenization, symbol detection, BM25 search, index
    //! persistence and file listing.
    //!
    //! Tests that touch process-wide environment variables take
    //! `test_env_lock()` first and remove every variable they set before
    //! returning, so later tests do not inherit stale values.

    use super::*;
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    // Identifiers and type names in a signature become tokens.
    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        assert!(tokens.contains(&"calculate_total".to_string()));
        assert!(tokens.contains(&"items".to_string()));
        assert!(tokens.contains(&"Vec".to_string()));
    }

    // The original token is kept alongside its camel-case parts.
    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        assert!(tokens.contains(&"calculateTotal".to_string()));
        assert!(tokens.contains(&"calculate".to_string()));
        assert!(tokens.contains(&"Total".to_string()));
    }

    #[test]
    fn detect_rust_function() {
        let (name, kind) =
            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "auth.rs".into(),
            symbol_name: "validate_token".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
            token_count: 8,
        });
        index.add_chunk(CodeChunk {
            file_path: "db.rs".into(),
            symbol_name: "connect_database".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
            tokens: tokenize("fn connect_database url str Pool create_pool url"),
            token_count: 7,
        });
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }

    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();

        // Insert in reverse path order to ensure the sort tie-break matters.
        index.add_chunk(CodeChunk {
            file_path: "b.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }

    #[test]
    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");

        let idx = BM25Index::build_from_directory(root);
        assert!(!bm25_index_looks_stale(&idx, root));

        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
        assert!(bm25_index_looks_stale(&idx, root));
    }

    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
        let a_path = root.join("a.rs");
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        // Change b.rs (size changes) to force a re-read for that file.
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
            .expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");

        // Restore permissions to avoid cleanup surprises.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, perms);
    }

    #[test]
    fn load_quarantines_oversized_index() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("create vectors dir");

        let index_path = dir.join("bm25_index.json");
        // A 0 MB budget makes any on-disk index count as oversized.
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

        let result = BM25Index::load(root);
        assert!(result.is_none(), "oversized index should return None");
        assert!(
            !index_path.exists(),
            "original index should be removed after quarantine"
        );
        assert!(
            dir.join("bm25_index.json.quarantined").exists(),
            "quarantined file should exist"
        );

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    }

    #[test]
    fn save_refuses_oversized_output() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

        let td = tempdir().expect("tempdir");
        let root = td.path();

        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "a".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn a() {}".into(),
            tokens: tokenize("fn a"),
            token_count: 2,
        });
        index.finalize();

        let _ = index.save(root);
        let index_path = BM25Index::index_file_path(root);
        assert!(
            !index_path.exists(),
            "save should refuse to persist oversized index"
        );

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        // Without this, later tests would inherit a data dir that points at
        // this test's already-deleted temp directory.
        std::env::remove_var("LEAN_CTX_DATA_DIR");
    }

    #[test]
    fn save_writes_project_root_marker() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        let index = BM25Index::build_from_directory(root);
        index.save(root).expect("save");

        let dir = crate::core::index_namespace::vectors_dir(root);
        let marker = dir.join("project_root.txt");
        assert!(marker.exists(), "project_root.txt marker should exist");
        let content = std::fs::read_to_string(&marker).expect("read marker");
        assert_eq!(content, root.to_string_lossy());
    }

    #[test]
    fn save_load_roundtrip_uses_zstd() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
        let td = tempdir().expect("tempdir");
        let root = td.path();

        for i in 0..10 {
            std::fs::write(
                root.join(format!("mod{i}.rs")),
                format!(
                    "pub fn handler_{i}() {{\n    println!(\"hello\");\n}}\n\n\
                     pub fn helper_{i}() {{\n    println!(\"world\");\n}}\n"
                ),
            )
            .expect("write");
        }

        let index = BM25Index::build_from_directory(root);
        assert!(index.doc_count > 0, "should have indexed chunks");
        index.save(root).expect("save");

        let dir = crate::core::index_namespace::vectors_dir(root);
        let zst = dir.join("bm25_index.bin.zst");
        assert!(zst.exists(), "should write .bin.zst");
        assert!(
            !dir.join("bm25_index.bin").exists(),
            ".bin should be deleted"
        );

        let loaded = BM25Index::load(root).expect("load compressed index");
        assert_eq!(loaded.doc_count, index.doc_count);
        assert_eq!(loaded.chunks.len(), index.chunks.len());

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        std::env::remove_var("LEAN_CTX_DATA_DIR");
    }

    #[test]
    fn auto_migrate_bin_to_zst() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
        let index = BM25Index::build_from_directory(root);

        // Write an uncompressed legacy `.bin` index by hand.
        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("mkdir");
        let data =
            bincode::serde::encode_to_vec(&index, bincode::config::standard()).expect("encode");
        std::fs::write(dir.join("bm25_index.bin"), &data).expect("write bin");

        let loaded = BM25Index::load(root).expect("load should auto-migrate");
        assert_eq!(loaded.doc_count, index.doc_count);
        assert!(
            dir.join("bm25_index.bin.zst").exists(),
            ".bin.zst should be created"
        );
        assert!(
            !dir.join("bm25_index.bin").exists(),
            ".bin should be removed"
        );

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        std::env::remove_var("LEAN_CTX_DATA_DIR");
    }

    #[test]
    fn list_code_files_skips_default_vendor_ignores() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

        let files = list_code_files(root);
        assert!(
            files.iter().any(|f| f == "main.rs"),
            "main.rs should be included"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("vendor/")),
            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("dist/")),
            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
    }

    #[test]
    fn list_code_files_respects_max_files_cap() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        // Creating more than MAX_BM25_FILES (5000) files is impractical in a
        // unit test, so this only verifies that the returned list stays
        // within the cap.
        for i in 0..10 {
            std::fs::write(
                root.join(format!("f{i}.rs")),
                format!("pub fn f{i}() {{}}\n"),
            )
            .expect("write");
        }
        let files = list_code_files(root);
        assert!(
            files.len() <= MAX_BM25_FILES,
            "file count should not exceed MAX_BM25_FILES"
        );
    }

    #[test]
    fn max_bm25_cache_bytes_reads_env() {
        let _env = crate::core::data_dir::test_env_lock();
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
        let bytes = max_bm25_cache_bytes();
        assert_eq!(bytes, 64 * 1024 * 1024);
        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    }
}