Skip to main content

lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files `list_code_files` collects before it stops walking.
const MAX_BM25_FILES: usize = 5_000;
/// Chunk count above which `BM25Index::save` logs a warning about index size.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// zstd compression level used when persisting the index.
const ZSTD_LEVEL: i32 = 9;
10
/// Glob patterns (matched against root-relative paths) that are always excluded
/// from indexing, in addition to any `extra_ignore_patterns` from the config.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    // Vendored dependencies and build output.
    "vendor/**",
    "dist/**",
    "build/**",
    // Published web assets.
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    // Framework / interpreter caches.
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    // Minified or bundled files.
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
27
28fn max_bm25_cache_bytes() -> u64 {
29    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30        .ok()
31        .and_then(|v| v.parse::<u64>().ok())
32        .unwrap_or_else(|| {
33            let cfg = crate::core::config::Config::load();
34            let profile = crate::core::config::MemoryProfile::effective(&cfg);
35            let profile_mb = profile.bm25_max_cache_mb();
36            if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37                profile_mb
38            } else {
39                cfg.bm25_max_cache_mb
40            }
41        });
42    mb * 1024 * 1024
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct CodeChunk {
47    pub file_path: String,
48    pub symbol_name: String,
49    pub kind: ChunkKind,
50    pub start_line: usize,
51    pub end_line: usize,
52    pub content: String,
53    #[serde(default)]
54    pub tokens: Vec<String>,
55    pub token_count: usize,
56}
57
58#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
59pub enum ChunkKind {
60    Function,
61    Struct,
62    Impl,
63    Module,
64    Class,
65    Method,
66    Other,
67    // -- External source kinds (Context Cortex) --
68    Issue,
69    PullRequest,
70    WikiPage,
71    DbSchema,
72    ApiEndpoint,
73    Ticket,
74    ExternalOther,
75}
76
77#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
78pub struct IndexedFileState {
79    pub mtime_ms: u64,
80    pub size_bytes: u64,
81}
82
83impl IndexedFileState {
84    fn from_path(path: &Path) -> Option<Self> {
85        let meta = path.metadata().ok()?;
86        let size_bytes = meta.len();
87        let mtime_ms = meta
88            .modified()
89            .ok()
90            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
91            .map(|d| d.as_millis() as u64)?;
92        Some(Self {
93            mtime_ms,
94            size_bytes,
95        })
96    }
97}
98
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct BM25Index {
101    pub chunks: Vec<CodeChunk>,
102    pub inverted: HashMap<String, Vec<(usize, f64)>>,
103    pub avg_doc_len: f64,
104    pub doc_count: usize,
105    pub doc_freqs: HashMap<String, usize>,
106    #[serde(default)]
107    pub files: HashMap<String, IndexedFileState>,
108}
109
110#[derive(Debug, Clone, Serialize, Deserialize)]
111pub struct SearchResult {
112    pub chunk_idx: usize,
113    pub score: f64,
114    pub file_path: String,
115    pub symbol_name: String,
116    pub kind: ChunkKind,
117    pub start_line: usize,
118    pub end_line: usize,
119    pub snippet: String,
120}
121
/// BM25 `k1` parameter: controls how quickly term-frequency gains saturate.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter: strength of document-length normalization.
const BM25_B: f64 = 0.75;
124
125impl Default for BM25Index {
126    fn default() -> Self {
127        Self::new()
128    }
129}
130
131impl BM25Index {
132    pub fn new() -> Self {
133        Self {
134            chunks: Vec::new(),
135            inverted: HashMap::new(),
136            avg_doc_len: 0.0,
137            doc_count: 0,
138            doc_freqs: HashMap::new(),
139            files: HashMap::new(),
140        }
141    }
142
143    /// Approximate heap memory used by this index in bytes.
144    pub fn memory_usage_bytes(&self) -> usize {
145        let chunks_size: usize = self
146            .chunks
147            .iter()
148            .map(|c| {
149                c.content.len()
150                    + c.file_path.len()
151                    + c.symbol_name.len()
152                    + c.tokens.iter().map(String::len).sum::<usize>()
153                    + 64
154            })
155            .sum();
156        let inverted_size: usize = self
157            .inverted
158            .iter()
159            .map(|(k, v)| k.len() + v.len() * 16 + 32)
160            .sum();
161        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
162        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
163        chunks_size + inverted_size + files_size + freqs_size
164    }
165
166    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
167    pub fn unload(&mut self) {
168        let usage = self.memory_usage_bytes();
169        self.chunks = Vec::new();
170        self.inverted = HashMap::new();
171        self.doc_freqs = HashMap::new();
172        self.files = HashMap::new();
173        self.avg_doc_len = 0.0;
174        self.doc_count = 0;
175        tracing::info!(
176            "[bm25] unloaded index, freed ~{:.1}MB",
177            usage as f64 / 1_048_576.0
178        );
179    }
180
181    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
182    #[cfg(test)]
183    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
184        let mut index = Self::new();
185        for mut chunk in chunks {
186            if chunk.token_count == 0 {
187                chunk.token_count = tokenize(&chunk.content).len();
188            }
189            index.add_chunk(chunk);
190        }
191        index.finalize();
192        index
193    }
194
195    pub fn build_from_directory(root: &Path) -> Self {
196        let root_str = root.to_string_lossy();
197        if !super::graph_index::is_safe_scan_root_public(&root_str) {
198            tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
199            return Self::new();
200        }
201        let mut index = Self::new();
202        let files = list_code_files(root);
203        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
204
205        for (i, rel) in files.iter().enumerate() {
206            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
207                tracing::warn!(
208                    "[bm25: stopping build at file {i}/{} due to memory pressure]",
209                    files.len()
210                );
211                break;
212            }
213            if crate::core::memory_guard::abort_requested() {
214                tracing::warn!("[bm25: aborting build due to critical memory pressure]");
215                break;
216            }
217
218            let abs = root.join(rel);
219            let Some(state) = IndexedFileState::from_path(&abs) else {
220                continue;
221            };
222            if state.size_bytes > MAX_FILE_SIZE_BYTES {
223                continue;
224            }
225            if let Ok(content) = std::fs::read_to_string(&abs) {
226                let mut chunks = extract_chunks(rel, &content);
227                chunks.sort_by(|a, b| {
228                    a.start_line
229                        .cmp(&b.start_line)
230                        .then_with(|| a.end_line.cmp(&b.end_line))
231                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
232                });
233                for chunk in chunks {
234                    index.add_chunk(chunk);
235                }
236                index.files.insert(rel.clone(), state);
237            }
238        }
239
240        index.finalize();
241        index
242    }
243
244    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
245        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
246        for c in &prev.chunks {
247            old_by_file
248                .entry(c.file_path.clone())
249                .or_default()
250                .push(c.clone());
251        }
252        for v in old_by_file.values_mut() {
253            v.sort_by(|a, b| {
254                a.start_line
255                    .cmp(&b.start_line)
256                    .then_with(|| a.end_line.cmp(&b.end_line))
257                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
258            });
259        }
260
261        let mut index = Self::new();
262        let files = list_code_files(root);
263        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
264
265        for (i, rel) in files.iter().enumerate() {
266            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
267                tracing::warn!(
268                    "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
269                    files.len()
270                );
271                break;
272            }
273
274            let abs = root.join(rel);
275            let Some(state) = IndexedFileState::from_path(&abs) else {
276                continue;
277            };
278
279            let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
280            if unchanged {
281                if let Some(chunks) = old_by_file.get(rel) {
282                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
283                        for chunk in chunks {
284                            index.add_chunk(chunk.clone());
285                        }
286                        index.files.insert(rel.clone(), state);
287                        continue;
288                    }
289                }
290            }
291
292            if state.size_bytes > MAX_FILE_SIZE_BYTES {
293                continue;
294            }
295            if let Ok(content) = std::fs::read_to_string(&abs) {
296                let mut chunks = extract_chunks(rel, &content);
297                chunks.sort_by(|a, b| {
298                    a.start_line
299                        .cmp(&b.start_line)
300                        .then_with(|| a.end_line.cmp(&b.end_line))
301                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
302                });
303                for chunk in chunks {
304                    index.add_chunk(chunk);
305                }
306                index.files.insert(rel.clone(), state);
307            }
308        }
309
310        index.finalize();
311        index
312    }
313
314    fn add_chunk(&mut self, chunk: CodeChunk) {
315        let idx = self.chunks.len();
316
317        let enriched = enrich_for_bm25(&chunk);
318        let tokens = tokenize(&enriched);
319        for token in &tokens {
320            let lower = token.to_lowercase();
321            let postings = self.inverted.entry(lower.clone()).or_default();
322            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
323                *self.doc_freqs.entry(lower).or_insert(0) += 1;
324            }
325            postings.push((idx, 1.0));
326        }
327
328        self.chunks.push(CodeChunk {
329            token_count: tokens.len(),
330            tokens: Vec::new(),
331            ..chunk
332        });
333    }
334
335    fn finalize(&mut self) {
336        self.doc_count = self.chunks.len();
337        if self.doc_count == 0 {
338            return;
339        }
340
341        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
342        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
343    }
344
345    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
346        let query_tokens = tokenize(query);
347        if query_tokens.is_empty() || self.doc_count == 0 {
348            return Vec::new();
349        }
350
351        // Pre-allocated score array: O(1) per-access vs HashMap overhead.
352        // Kolmogorov-optimal: minimal allocation for the scoring operation.
353        let n = self.chunks.len();
354        let mut scores = vec![0.0f64; n];
355        let mut touched = Vec::with_capacity(n.min(256));
356
357        for token in &query_tokens {
358            let lower = token.to_lowercase();
359            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
360            if df == 0.0 {
361                continue;
362            }
363
364            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
365
366            if let Some(postings) = self.inverted.get(&lower) {
367                for &(idx, weight) in postings {
368                    let doc_len = self.chunks[idx].token_count as f64;
369                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
370                    let bm25 = idf * (weight * (BM25_K1 + 1.0))
371                        / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
372
373                    if scores[idx] == 0.0 {
374                        touched.push(idx);
375                    }
376                    scores[idx] += bm25;
377                }
378            }
379        }
380
381        let mut results: Vec<SearchResult> = touched
382            .iter()
383            .filter(|&&idx| scores[idx] > 0.0)
384            .map(|&idx| {
385                let chunk = &self.chunks[idx];
386                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
387                SearchResult {
388                    chunk_idx: idx,
389                    score: scores[idx],
390                    file_path: chunk.file_path.clone(),
391                    symbol_name: chunk.symbol_name.clone(),
392                    kind: chunk.kind.clone(),
393                    start_line: chunk.start_line,
394                    end_line: chunk.end_line,
395                    snippet,
396                }
397            })
398            .collect();
399
400        results.sort_by(|a, b| {
401            b.score
402                .partial_cmp(&a.score)
403                .unwrap_or(std::cmp::Ordering::Equal)
404                .then_with(|| a.file_path.cmp(&b.file_path))
405                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
406                .then_with(|| a.start_line.cmp(&b.start_line))
407                .then_with(|| a.end_line.cmp(&b.end_line))
408        });
409        results.truncate(top_k);
410        results
411    }
412
413    pub fn save(&self, root: &Path) -> std::io::Result<()> {
414        if self.chunks.len() > CHUNK_COUNT_WARNING {
415            tracing::warn!(
416                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
417                self.chunks.len(),
418                CHUNK_COUNT_WARNING
419            );
420        }
421
422        let dir = index_dir(root);
423        std::fs::create_dir_all(&dir)?;
424        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
425            .map_err(|e| std::io::Error::other(e.to_string()))?;
426
427        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
428            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
429
430        let max_bytes = max_bm25_cache_bytes();
431        if compressed.len() as u64 > max_bytes {
432            tracing::warn!(
433                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
434                compressed.len() as f64 / 1_048_576.0,
435                max_bytes / (1024 * 1024),
436                dir.display()
437            );
438            return Ok(());
439        }
440
441        tracing::info!(
442            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
443            data.len() as f64 / 1_048_576.0,
444            compressed.len() as f64 / 1_048_576.0,
445            (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
446        );
447
448        let target = dir.join("bm25_index.bin.zst");
449        let tmp = dir.join("bm25_index.bin.zst.tmp");
450        std::fs::write(&tmp, &compressed)?;
451        std::fs::rename(&tmp, &target)?;
452
453        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
454        let _ = std::fs::remove_file(dir.join("bm25_index.json"));
455
456        let _ = std::fs::write(
457            dir.join("project_root.txt"),
458            root.to_string_lossy().as_bytes(),
459        );
460
461        Ok(())
462    }
463
464    pub fn load(root: &Path) -> Option<Self> {
465        let dir = index_dir(root);
466        let max_bytes = max_bm25_cache_bytes();
467
468        let zst_path = dir.join("bm25_index.bin.zst");
469        if zst_path.exists() {
470            let meta = std::fs::metadata(&zst_path).ok()?;
471            if meta.len() > max_bytes {
472                tracing::warn!(
473                    "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
474                    meta.len() as f64 / 1_073_741_824.0,
475                    max_bytes / (1024 * 1024),
476                    zst_path.display()
477                );
478                let quarantined = zst_path.with_extension("zst.quarantined");
479                let _ = std::fs::rename(&zst_path, &quarantined);
480                return None;
481            }
482            let compressed = std::fs::read(&zst_path).ok()?;
483            let max_decompressed = max_bytes * 20; // allow 20x expansion ratio
484            let data = bounded_zstd_decode(&compressed, max_decompressed)?;
485            let (idx, _): (Self, _) =
486                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
487            return Some(idx);
488        }
489
490        let bin_path = dir.join("bm25_index.bin");
491        if bin_path.exists() {
492            let meta = std::fs::metadata(&bin_path).ok()?;
493            if meta.len() > max_bytes {
494                tracing::warn!(
495                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
496                    meta.len() as f64 / 1_073_741_824.0,
497                    max_bytes / (1024 * 1024),
498                    bin_path.display()
499                );
500                let quarantined = bin_path.with_extension("bin.quarantined");
501                let _ = std::fs::rename(&bin_path, &quarantined);
502                return None;
503            }
504            let data = std::fs::read(&bin_path).ok()?;
505            let (idx, _): (Self, _) =
506                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
507            // Auto-migrate: compress legacy .bin to .bin.zst
508            if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
509                let zst_tmp = zst_path.with_extension("zst.tmp");
510                if std::fs::write(&zst_tmp, &compressed).is_ok()
511                    && std::fs::rename(&zst_tmp, &zst_path).is_ok()
512                {
513                    tracing::info!(
514                        "[bm25] migrated {:.1} MB → {:.1} MB zstd",
515                        data.len() as f64 / 1_048_576.0,
516                        compressed.len() as f64 / 1_048_576.0
517                    );
518                    let _ = std::fs::remove_file(&bin_path);
519                }
520            }
521            return Some(idx);
522        }
523
524        let json_path = dir.join("bm25_index.json");
525        if json_path.exists() {
526            let meta = std::fs::metadata(&json_path).ok()?;
527            if meta.len() > max_bytes {
528                tracing::warn!(
529                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
530                    meta.len() as f64 / 1_073_741_824.0,
531                    max_bytes / (1024 * 1024),
532                    json_path.display()
533                );
534                let quarantined = json_path.with_extension("json.quarantined");
535                let _ = std::fs::rename(&json_path, &quarantined);
536                return None;
537            }
538            let data = std::fs::read_to_string(&json_path).ok()?;
539            return serde_json::from_str(&data).ok();
540        }
541
542        None
543    }
544
545    pub fn load_or_build(root: &Path) -> Self {
546        if !is_safe_bm25_root(root) {
547            return Self::default();
548        }
549        if let Some(idx) = Self::load(root) {
550            if !bm25_index_looks_stale(&idx, root) {
551                return idx;
552            }
553            tracing::debug!(
554                "[bm25_index: stale index detected for {}; rebuilding]",
555                root.display()
556            );
557            let rebuilt = if idx.files.is_empty() {
558                Self::build_from_directory(root)
559            } else {
560                Self::rebuild_incremental(root, &idx)
561            };
562            let _ = rebuilt.save(root);
563            return rebuilt;
564        }
565
566        let built = Self::build_from_directory(root);
567        let _ = built.save(root);
568        built
569    }
570
571    pub fn index_file_path(root: &Path) -> PathBuf {
572        let dir = index_dir(root);
573        let zst = dir.join("bm25_index.bin.zst");
574        if zst.exists() {
575            return zst;
576        }
577        let bin = dir.join("bm25_index.bin");
578        if bin.exists() {
579            return bin;
580        }
581        dir.join("bm25_index.json")
582    }
583
584    /// Ingest external `ContentChunk`s into the BM25 index.
585    /// Converts each chunk to a `CodeChunk` (backward-compatible) and
586    /// rebuilds the inverted index. Returns the number of chunks ingested.
587    pub fn ingest_content_chunks(
588        &mut self,
589        chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
590    ) -> usize {
591        let mut count = 0usize;
592        for cc in chunks {
593            self.add_chunk(cc.into());
594            count += 1;
595        }
596        if count > 0 {
597            self.finalize();
598        }
599        count
600    }
601
602    /// Number of chunks originating from external providers.
603    pub fn external_chunk_count(&self) -> usize {
604        self.chunks
605            .iter()
606            .filter(|c| c.file_path.contains("://"))
607            .count()
608    }
609}
610
611fn is_safe_bm25_root(root: &Path) -> bool {
612    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
613}
614
615fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
616    if index.chunks.is_empty() {
617        return false;
618    }
619
620    if index.files.is_empty() {
621        // Legacy index (pre file-state tracking): only detect missing files.
622        let mut seen = std::collections::HashSet::<&str>::new();
623        for chunk in &index.chunks {
624            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
625            if rel.is_empty() {
626                continue;
627            }
628            if !seen.insert(rel) {
629                continue;
630            }
631            if !root.join(rel).exists() {
632                return true;
633            }
634        }
635        return false;
636    }
637
638    // Missing or modified tracked files.
639    for (rel, old_state) in &index.files {
640        let abs = root.join(rel);
641        if !abs.exists() {
642            return true;
643        }
644        let Some(cur) = IndexedFileState::from_path(&abs) else {
645            return true;
646        };
647        if &cur != old_state {
648            return true;
649        }
650    }
651
652    // New files (present on disk but not in index).
653    for rel in list_code_files(root) {
654        if !index.files.contains_key(&rel) {
655            return true;
656        }
657    }
658
659    false
660}
661
662fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
663    use std::io::Read;
664    let mut decoder = zstd::Decoder::new(compressed).ok()?;
665    let mut buf = Vec::new();
666    let mut chunk = vec![0u8; 65536];
667    let mut total = 0u64;
668    loop {
669        let n = decoder.read(&mut chunk).ok()?;
670        if n == 0 {
671            break;
672        }
673        total += n as u64;
674        if total > max_bytes {
675            tracing::warn!(
676                "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
677                total as f64 / (1024.0 * 1024.0),
678                max_bytes as f64 / (1024.0 * 1024.0)
679            );
680            return None;
681        }
682        buf.extend_from_slice(&chunk[..n]);
683    }
684    Some(buf)
685}
686
687fn index_dir(root: &Path) -> PathBuf {
688    crate::core::index_namespace::vectors_dir(root)
689}
690
691fn list_code_files(root: &Path) -> Vec<String> {
692    let walker = ignore::WalkBuilder::new(root)
693        .hidden(true)
694        .git_ignore(true)
695        .git_global(true)
696        .git_exclude(true)
697        .max_depth(Some(20))
698        .build();
699
700    let cfg = crate::core::config::Config::load();
701    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
702        .iter()
703        .filter_map(|p| glob::Pattern::new(p).ok())
704        .collect();
705    ignore_patterns.extend(
706        cfg.extra_ignore_patterns
707            .iter()
708            .filter_map(|p| glob::Pattern::new(p).ok()),
709    );
710
711    let mut files: Vec<String> = Vec::new();
712    for entry in walker.flatten() {
713        let path = entry.path();
714        if !path.is_file() {
715            continue;
716        }
717        if !is_code_file(path) {
718            continue;
719        }
720        let rel = path
721            .strip_prefix(root)
722            .unwrap_or(path)
723            .to_string_lossy()
724            .to_string();
725        if rel.is_empty() {
726            continue;
727        }
728        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
729            continue;
730        }
731        if files.len() >= MAX_BM25_FILES {
732            tracing::warn!(
733                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
734                root.display()
735            );
736            break;
737        }
738        files.push(rel);
739    }
740
741    files.sort();
742    files.dedup();
743    files
744}
745
/// Returns `true` when `path` has one of the recognized source-code extensions
/// (compared case-insensitively). Paths without an extension, or with a
/// non-UTF-8 extension, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
782
783fn tokenize(text: &str) -> Vec<String> {
784    let mut tokens = Vec::new();
785    let mut current = String::new();
786
787    for ch in text.chars() {
788        if ch.is_alphanumeric() || ch == '_' {
789            current.push(ch);
790        } else {
791            if current.len() >= 2 {
792                tokens.push(current.clone());
793            }
794            current.clear();
795        }
796    }
797    if current.len() >= 2 {
798        tokens.push(current);
799    }
800
801    split_camel_case_tokens(&tokens)
802}
803
804pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
805    tokenize(text)
806}
807
/// Expands tokens with their camel-case parts.
///
/// Every input token is emitted unchanged, followed by the parts obtained by
/// splitting it at camel-case boundaries (parts shorter than two bytes are
/// dropped). A boundary lies before an uppercase character that either
///
/// * follows a non-uppercase character (`parse|JSON`, `BM25|Index`), or
/// * is followed by a lowercase character (`HTTP|Server`).
///
/// A run of uppercase characters therefore stays together, including at the end
/// of a token (`parseJSON` yields `parse` and `JSON`), and an all-uppercase token
/// such as `HTTP` is not split at all.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    fn push_part(out: &mut Vec<String>, part: &[char]) {
        let part: String = part.iter().collect();
        if part.len() >= 2 {
            out.push(part);
        }
    }

    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let after_non_upper = !chars[i - 1].is_uppercase();
            let before_lower = chars.get(i + 1).is_some_and(|c| c.is_lowercase());
            if after_non_upper || before_lower {
                push_part(&mut result, &chars[start..i]);
                start = i;
            }
        }
        // Only emit the tail when the token was actually split; otherwise it
        // would just duplicate the token itself.
        if start > 0 {
            push_part(&mut result, &chars[start..]);
        }
    }
    result
}
832
833fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
834    #[cfg(feature = "tree-sitter")]
835    {
836        let ext = std::path::Path::new(file_path)
837            .extension()
838            .and_then(|e| e.to_str())
839            .unwrap_or("");
840        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
841            return chunks;
842        }
843    }
844
845    let lines: Vec<&str> = content.lines().collect();
846    if lines.is_empty() {
847        return Vec::new();
848    }
849
850    let mut chunks = Vec::new();
851    let mut i = 0;
852
853    while i < lines.len() {
854        let trimmed = lines[i].trim();
855
856        if let Some((name, kind)) = detect_symbol(trimmed) {
857            let start = i;
858            let end = find_block_end(&lines, i);
859            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
860            let token_count = tokenize(&block).len();
861
862            chunks.push(CodeChunk {
863                file_path: file_path.to_string(),
864                symbol_name: name,
865                kind,
866                start_line: start + 1,
867                end_line: end + 1,
868                content: block,
869                tokens: Vec::new(),
870                token_count,
871            });
872
873            i = end + 1;
874        } else {
875            i += 1;
876        }
877    }
878
879    if chunks.is_empty() && !content.is_empty() {
880        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
881        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
882        //
883        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
884        let bytes = content.as_bytes();
885        let rk_chunks = crate::core::rabin_karp::chunk(content);
886        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
887            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
888                let end = (c.offset + c.length).min(bytes.len());
889                let slice = &bytes[c.offset..end];
890                let chunk_text = String::from_utf8_lossy(slice).into_owned();
891                let token_count = tokenize(&chunk_text).len();
892                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
893                let end_line = start_line + bytecount::count(slice, b'\n');
894                chunks.push(CodeChunk {
895                    file_path: file_path.to_string(),
896                    symbol_name: format!("{file_path}#chunk-{idx}"),
897                    kind: ChunkKind::Module,
898                    start_line,
899                    end_line: end_line.max(start_line),
900                    content: chunk_text,
901                    tokens: Vec::new(),
902                    token_count,
903                });
904            }
905        } else {
906            let token_count = tokenize(content).len();
907            let snippet = lines
908                .iter()
909                .take(50)
910                .copied()
911                .collect::<Vec<_>>()
912                .join("\n");
913            chunks.push(CodeChunk {
914                file_path: file_path.to_string(),
915                symbol_name: file_path.to_string(),
916                kind: ChunkKind::Module,
917                start_line: 1,
918                end_line: lines.len(),
919                content: snippet,
920                tokens: Vec::new(),
921                token_count,
922            });
923        }
924    }
925
926    chunks
927}
928
929fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
930    let trimmed = line.trim();
931
932    let patterns: &[(&str, ChunkKind)] = &[
933        ("pub async fn ", ChunkKind::Function),
934        ("async fn ", ChunkKind::Function),
935        ("pub fn ", ChunkKind::Function),
936        ("fn ", ChunkKind::Function),
937        ("pub struct ", ChunkKind::Struct),
938        ("struct ", ChunkKind::Struct),
939        ("pub enum ", ChunkKind::Struct),
940        ("enum ", ChunkKind::Struct),
941        ("impl ", ChunkKind::Impl),
942        ("pub trait ", ChunkKind::Struct),
943        ("trait ", ChunkKind::Struct),
944        ("export function ", ChunkKind::Function),
945        ("export async function ", ChunkKind::Function),
946        ("export default function ", ChunkKind::Function),
947        ("function ", ChunkKind::Function),
948        ("async function ", ChunkKind::Function),
949        ("export class ", ChunkKind::Class),
950        ("class ", ChunkKind::Class),
951        ("export interface ", ChunkKind::Struct),
952        ("interface ", ChunkKind::Struct),
953        ("def ", ChunkKind::Function),
954        ("async def ", ChunkKind::Function),
955        ("class ", ChunkKind::Class),
956        ("func ", ChunkKind::Function),
957    ];
958
959    for (prefix, kind) in patterns {
960        if let Some(rest) = trimmed.strip_prefix(prefix) {
961            let name: String = rest
962                .chars()
963                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
964                .take_while(|c| *c != '<')
965                .collect();
966            if !name.is_empty() {
967                return Some((name, kind.clone()));
968            }
969        }
970    }
971
972    None
973}
974
/// Returns the index of the last line of the declaration starting at `lines[start]`.
///
/// This is a lexical heuristic; it has no notion of strings or comments.
///
/// * Before the body opens, `(` and `[` (plus any braces nested inside them,
///   such as destructured parameters) only count as signature nesting. A
///   balanced parameter list therefore never ends the block.
/// * The first `{` outside that nesting opens the body, and the block ends on
///   the line holding its matching `}`.
/// * A `;` outside any nesting, seen before a body opens, ends the block on
///   that line (`struct Meters(f64);`, `struct Unit;`), so a body-less
///   declaration does not swallow the item that follows it.
/// * While no body has opened, from the fourth line onward the block ends just
///   before the first blank or unindented line. This covers
///   indentation-delimited code.
/// * If the input ends first, the result is `start + 50` capped at the last
///   line index (`0` for an empty slice).
fn find_block_end(lines: &[&str], start: usize) -> usize {
    // Nesting of `(`/`[` (and braces inside them) seen before the body opens.
    let mut sig_depth = 0usize;
    // Brace nesting of the body; non-zero exactly while the body is open,
    // because the function returns as soon as it drops back to zero.
    let mut body_depth = 0usize;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            if body_depth > 0 {
                match ch {
                    '{' => body_depth += 1,
                    '}' => {
                        body_depth -= 1;
                        if body_depth == 0 {
                            return i;
                        }
                    }
                    _ => {}
                }
                continue;
            }

            match ch {
                '(' | '[' => sig_depth += 1,
                // A brace inside the parameter list is part of the signature.
                '{' if sig_depth > 0 => sig_depth += 1,
                '{' => body_depth = 1,
                ')' | ']' | '}' if sig_depth > 0 => sig_depth -= 1,
                // Declaration without a body.
                ';' if sig_depth == 0 => return i,
                _ => {}
            }
        }

        if body_depth == 0 && i > start + 2 {
            // Indentation must be read from the raw line; a trimmed line
            // never starts with whitespace.
            let indented = line.starts_with(' ') || line.starts_with('\t');
            if line.trim().is_empty() || !indented {
                return i - 1;
            }
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
1012
1013pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
1014    if results.is_empty() {
1015        return "No results found.".to_string();
1016    }
1017
1018    let mut out = String::new();
1019    for (i, r) in results.iter().enumerate() {
1020        let is_external = r.file_path.contains("://");
1021        if compact {
1022            if is_external {
1023                out.push_str(&format!(
1024                    "{}. {:.2} [{:?}] {} — {}\n",
1025                    i + 1,
1026                    r.score,
1027                    r.kind,
1028                    r.file_path,
1029                    r.symbol_name,
1030                ));
1031            } else {
1032                out.push_str(&format!(
1033                    "{}. {:.2} {}:{}-{} {:?} {}\n",
1034                    i + 1,
1035                    r.score,
1036                    r.file_path,
1037                    r.start_line,
1038                    r.end_line,
1039                    r.kind,
1040                    r.symbol_name,
1041                ));
1042            }
1043        } else if is_external {
1044            out.push_str(&format!(
1045                "\n--- Result {} (score: {:.2}) [{:?}] ---\n{} — {}\n{}\n",
1046                i + 1,
1047                r.score,
1048                r.kind,
1049                r.file_path,
1050                r.symbol_name,
1051                r.snippet,
1052            ));
1053        } else {
1054            out.push_str(&format!(
1055                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
1056                i + 1,
1057                r.score,
1058                r.file_path,
1059                r.symbol_name,
1060                r.kind,
1061                r.start_line,
1062                r.end_line,
1063                r.snippet,
1064            ));
1065        }
1066    }
1067    out
1068}
1069
1070/// Enrich chunk content with file-path components for BM25 path-matching.
1071///
1072/// SACL (EMNLP 2025) shows that augmenting code with structural information
1073/// improves retrieval by 7-12.8%. We append the file stem twice (for boost)
1074/// and the immediate parent directory once, enabling queries like "auth handler"
1075/// to match `src/auth/handler.rs`.
1076fn enrich_for_bm25(chunk: &CodeChunk) -> String {
1077    let path = Path::new(&chunk.file_path);
1078    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1079    let dir = path
1080        .parent()
1081        .and_then(|p| p.file_name())
1082        .and_then(|d| d.to_str())
1083        .unwrap_or("");
1084
1085    if stem.is_empty() {
1086        return chunk.content.clone();
1087    }
1088
1089    format!("{} {} {} {}", chunk.content, stem, stem, dir)
1090}
1091
#[cfg(test)]
mod tests {
    //! Unit tests for tokenization, symbol detection, BM25 ranking, index
    //! persistence and file discovery.
    //!
    //! Tests that touch process environment variables take
    //! `crate::core::data_dir::test_env_lock()` first and remove every variable
    //! they set before returning, so later tests do not inherit them.

    use super::*;
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    /// Identifiers and type names come out of `tokenize` as separate tokens.
    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        assert!(tokens.contains(&"calculate_total".to_string()));
        assert!(tokens.contains(&"items".to_string()));
        assert!(tokens.contains(&"Vec".to_string()));
    }

    /// camelCase splitting keeps the original token and adds its parts.
    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        assert!(tokens.contains(&"calculateTotal".to_string()));
        assert!(tokens.contains(&"calculate".to_string()));
        assert!(tokens.contains(&"Total".to_string()));
    }

    /// A `pub fn` line is reported as a function with its bare name.
    #[test]
    fn detect_rust_function() {
        let (name, kind) =
            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    /// The chunk sharing terms with the query ranks first.
    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "auth.rs".into(),
            symbol_name: "validate_token".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
            token_count: 8,
        });
        index.add_chunk(CodeChunk {
            file_path: "db.rs".into(),
            symbol_name: "connect_database".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
            tokens: tokenize("fn connect_database url str Pool create_pool url"),
            token_count: 7,
        });
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }

    /// Equal-scoring chunks come back ordered by file path, not insertion order.
    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();

        // Insert in reverse path order to ensure the sort tie-break matters.
        index.add_chunk(CodeChunk {
            file_path: "b.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }

    /// Deleting an indexed file makes the index look stale.
    #[test]
    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");

        let idx = BM25Index::build_from_directory(root);
        assert!(!bm25_index_looks_stale(&idx, root));

        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
        assert!(bm25_index_looks_stale(&idx, root));
    }

    /// An incremental rebuild keeps unchanged files without re-reading them and
    /// re-chunks files whose content changed.
    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
        let a_path = root.join("a.rs");
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        // Change b.rs (size changes) to force a re-read for that file.
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
            .expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");

        // Restore permissions to avoid cleanup surprises.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, perms);
    }

    /// With a 0 MB cache limit, loading an on-disk index returns `None` and
    /// renames the file to `*.quarantined`.
    #[test]
    fn load_quarantines_oversized_index() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("create vectors dir");

        let index_path = dir.join("bm25_index.json");
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

        let result = BM25Index::load(root);
        assert!(result.is_none(), "oversized index should return None");
        assert!(
            !index_path.exists(),
            "original index should be removed after quarantine"
        );
        assert!(
            dir.join("bm25_index.json.quarantined").exists(),
            "quarantined file should exist"
        );

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    }

    /// With a 0 MB cache limit, `save` must not leave an index file behind.
    #[test]
    fn save_refuses_oversized_output() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

        let td = tempdir().expect("tempdir");
        let root = td.path();

        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "a".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn a() {}".into(),
            tokens: tokenize("fn a"),
            token_count: 2,
        });
        index.finalize();

        let _ = index.save(root);
        let index_path = BM25Index::index_file_path(root);
        assert!(
            !index_path.exists(),
            "save should refuse to persist oversized index"
        );

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        // `data_dir` is deleted when this test returns; clear the variable so
        // later tests do not resolve their data directory to a removed path.
        std::env::remove_var("LEAN_CTX_DATA_DIR");
    }

    /// `save` writes a `project_root.txt` marker containing the project root.
    #[test]
    fn save_writes_project_root_marker() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        let index = BM25Index::build_from_directory(root);
        index.save(root).expect("save");

        let dir = crate::core::index_namespace::vectors_dir(root);
        let marker = dir.join("project_root.txt");
        assert!(marker.exists(), "project_root.txt marker should exist");
        let content = std::fs::read_to_string(&marker).expect("read marker");
        assert_eq!(content, root.to_string_lossy());
    }

    /// `save` writes only the `.bin.zst` file and `load` reads it back intact.
    #[test]
    fn save_load_roundtrip_uses_zstd() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
        let td = tempdir().expect("tempdir");
        let root = td.path();

        for i in 0..10 {
            std::fs::write(
                root.join(format!("mod{i}.rs")),
                format!(
                    "pub fn handler_{i}() {{\n    println!(\"hello\");\n}}\n\n\
                     pub fn helper_{i}() {{\n    println!(\"world\");\n}}\n"
                ),
            )
            .expect("write");
        }

        let index = BM25Index::build_from_directory(root);
        assert!(index.doc_count > 0, "should have indexed chunks");
        index.save(root).expect("save");

        let dir = crate::core::index_namespace::vectors_dir(root);
        let zst = dir.join("bm25_index.bin.zst");
        assert!(zst.exists(), "should write .bin.zst");
        assert!(
            !dir.join("bm25_index.bin").exists(),
            ".bin should be deleted"
        );

        let loaded = BM25Index::load(root).expect("load compressed index");
        assert_eq!(loaded.doc_count, index.doc_count);
        assert_eq!(loaded.chunks.len(), index.chunks.len());

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        std::env::remove_var("LEAN_CTX_DATA_DIR");
    }

    /// Loading a legacy uncompressed `.bin` index migrates it to `.bin.zst`.
    #[test]
    fn auto_migrate_bin_to_zst() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
        let index = BM25Index::build_from_directory(root);

        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("mkdir");
        let data =
            bincode::serde::encode_to_vec(&index, bincode::config::standard()).expect("encode");
        std::fs::write(dir.join("bm25_index.bin"), &data).expect("write bin");

        let loaded = BM25Index::load(root).expect("load should auto-migrate");
        assert_eq!(loaded.doc_count, index.doc_count);
        assert!(
            dir.join("bm25_index.bin.zst").exists(),
            ".bin.zst should be created"
        );
        assert!(
            !dir.join("bm25_index.bin").exists(),
            ".bin should be removed"
        );

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        std::env::remove_var("LEAN_CTX_DATA_DIR");
    }

    /// Files under `vendor/` and `dist/` are excluded by the default ignores.
    #[test]
    fn list_code_files_skips_default_vendor_ignores() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

        let files = list_code_files(root);
        assert!(
            files.iter().any(|f| f == "main.rs"),
            "main.rs should be included"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("vendor/")),
            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("dist/")),
            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
    }

    /// The file listing never exceeds `MAX_BM25_FILES`.
    #[test]
    fn list_code_files_respects_max_files_cap() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        // Creating more than MAX_BM25_FILES (5000) files is impractical in a unit
        // test, so this only checks that a small tree stays within the cap.
        for i in 0..10 {
            std::fs::write(
                root.join(format!("f{i}.rs")),
                format!("pub fn f{i}() {{}}\n"),
            )
            .expect("write");
        }
        let files = list_code_files(root);
        assert!(
            files.len() <= MAX_BM25_FILES,
            "file count should not exceed MAX_BM25_FILES"
        );
    }

    /// `LEAN_CTX_BM25_MAX_CACHE_MB` is read as megabytes and converted to bytes.
    #[test]
    fn max_bm25_cache_bytes_reads_env() {
        let _env = crate::core::data_dir::test_env_lock();
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
        let bytes = max_bm25_cache_bytes();
        assert_eq!(bytes, 64 * 1024 * 1024);
        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    }
}