lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files `list_code_files` collects before it stops walking.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning suggesting extra ignore patterns.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// Compression level passed to `zstd::encode_all` when writing or migrating the index.
const ZSTD_LEVEL: i32 = 9;
10
/// Glob patterns that are always excluded from indexing.
///
/// `list_code_files` compiles these with `glob::Pattern` and matches them against each
/// root-relative path, in addition to the `extra_ignore_patterns` from the config.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
27
28fn max_bm25_cache_bytes() -> u64 {
29    // Single source of truth: `Config::bm25_max_cache_mb_effective` (env override
30    // › explicit config › disk-budget › generous default). Decoupled from the RAM
31    // profile so large repos persist instead of rebuilding forever (issue #249).
32    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
33        .ok()
34        .and_then(|v| v.parse::<u64>().ok())
35        .unwrap_or_else(|| crate::core::config::Config::load().bm25_max_cache_mb_effective());
36    mb * 1024 * 1024
37}
38
/// Effective on-disk ceiling (bytes) for the persisted BM25 index. Single source
/// of truth shared with `doctor` so its "oversized index" warning matches what
/// `save`/`load` actually enforce.
///
/// This is a public re-export of the private `max_bm25_cache_bytes` and returns exactly
/// the value that function computes.
pub fn persist_ceiling_bytes() -> u64 {
    max_bm25_cache_bytes()
}
45
/// Outcome of persisting a BM25 index to disk. Distinguishes a real write from a
/// size-capped refusal so callers never mistake "refused to persist" for
/// success (the bug behind the perpetual "index warming" report, issue #249).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SaveOutcome {
    /// Written to disk. Carries the compressed (zstd) size in bytes.
    Persisted { compressed_bytes: u64 },
    /// Built fine but NOT written — the compressed size exceeds the disk
    /// ceiling. The in-memory index is still usable for this process; callers
    /// should surface the remedy (raise the cap / add ignore patterns) instead
    /// of silently rebuilding on every call.
    SkippedTooLarge {
        /// Size in bytes of the zstd-compressed index that was refused.
        compressed_bytes: u64,
        /// Ceiling in bytes that the compressed size exceeded.
        limit_bytes: u64,
    },
}
62
/// One indexed unit of text (a symbol body or an external document) plus its location.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the originating file, relative to the indexed root. Chunks whose path
    /// contains `://` are counted as external by `BM25Index::external_chunk_count`.
    pub file_path: String,
    /// Name of the symbol this chunk was extracted for.
    pub symbol_name: String,
    /// What kind of entity the chunk represents.
    pub kind: ChunkKind,
    /// First line of the chunk.
    /// NOTE(review): whether lines are 0- or 1-based is decided by the extractor — confirm.
    pub start_line: usize,
    /// Last line of the chunk (same numbering as `start_line`).
    pub end_line: usize,
    /// Chunk text; its first five lines become the search-result snippet.
    pub content: String,
    /// Token list; defaults to empty when absent from serialized data. `BM25Index::add_chunk`
    /// always stores an empty vector here and keeps only `token_count`.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens counted for this chunk; used as the document length when scoring.
    pub token_count: usize,
}
75
/// Classifies what a `CodeChunk` represents: a source-code construct or an item that
/// came from an external provider.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    // -- Source-code kinds --
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
    // -- External source kinds (Context Engine) --
    Issue,
    PullRequest,
    WikiPage,
    DbSchema,
    ApiEndpoint,
    Ticket,
    ExternalOther,
}
94
/// Snapshot of a file's metadata at index time; two snapshots are compared for equality
/// to decide whether a file changed since it was indexed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Last-modified time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File length in bytes.
    pub size_bytes: u64,
}
100
101impl IndexedFileState {
102    fn from_path(path: &Path) -> Option<Self> {
103        let meta = path.metadata().ok()?;
104        let size_bytes = meta.len();
105        let mtime_ms = meta
106            .modified()
107            .ok()
108            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
109            .map(|d| d.as_millis() as u64)?;
110        Some(Self {
111            mtime_ms,
112            size_bytes,
113        })
114    }
115}
116
/// In-memory BM25 search index over `CodeChunk`s, serializable for on-disk persistence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; positions in this vector are the chunk indices used in `inverted`.
    pub chunks: Vec<CodeChunk>,
    /// Lowercased token -> postings, each a `(chunk index, term weight)` pair.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` across all chunks, refreshed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks, refreshed by `finalize`.
    pub doc_count: usize,
    /// Lowercased token -> number of distinct chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Root-relative path -> file state recorded when the file was indexed. Defaults to
    /// empty when absent from serialized data.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
127
/// One ranked hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score over all query tokens; higher ranks first.
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// First five lines of the chunk content, joined with `\n`.
    pub snippet: String,
}
139
/// BM25 `k1` parameter used in `BM25Index::search`: scales the term-weight component.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter used in `BM25Index::search`: strength of document-length normalization.
const BM25_B: f64 = 0.75;
142
/// `Default` yields the same empty index as `BM25Index::new`.
impl Default for BM25Index {
    fn default() -> Self {
        Self::new()
    }
}
148
149impl BM25Index {
150    pub fn new() -> Self {
151        Self {
152            chunks: Vec::new(),
153            inverted: HashMap::new(),
154            avg_doc_len: 0.0,
155            doc_count: 0,
156            doc_freqs: HashMap::new(),
157            files: HashMap::new(),
158        }
159    }
160
161    /// Approximate heap memory used by this index in bytes.
162    pub fn memory_usage_bytes(&self) -> usize {
163        let chunks_size: usize = self
164            .chunks
165            .iter()
166            .map(|c| {
167                c.content.len()
168                    + c.file_path.len()
169                    + c.symbol_name.len()
170                    + c.tokens.iter().map(String::len).sum::<usize>()
171                    + 64
172            })
173            .sum();
174        let inverted_size: usize = self
175            .inverted
176            .iter()
177            .map(|(k, v)| k.len() + v.len() * 16 + 32)
178            .sum();
179        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
180        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
181        chunks_size + inverted_size + files_size + freqs_size
182    }
183
184    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
185    pub fn unload(&mut self) {
186        let usage = self.memory_usage_bytes();
187        self.chunks = Vec::new();
188        self.inverted = HashMap::new();
189        self.doc_freqs = HashMap::new();
190        self.files = HashMap::new();
191        self.avg_doc_len = 0.0;
192        self.doc_count = 0;
193        tracing::info!(
194            "[bm25] unloaded index, freed ~{:.1}MB",
195            usage as f64 / 1_048_576.0
196        );
197    }
198
199    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
200    #[cfg(test)]
201    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
202        let mut index = Self::new();
203        for mut chunk in chunks {
204            if chunk.token_count == 0 {
205                chunk.token_count = tokenize(&chunk.content).len();
206            }
207            index.add_chunk(chunk);
208        }
209        index.finalize();
210        index
211    }
212
    /// Builds a fresh index by scanning `root`, reading every file from disk
    /// (no pre-read content is supplied).
    pub fn build_from_directory(root: &Path) -> Self {
        Self::build_from_directory_inner(root, &HashMap::new())
    }
216
    /// Like `build_from_directory` but reuses file content from a prior scan
    /// (e.g. the graph index walk) to avoid redundant disk reads.
    ///
    /// `content_hint` is looked up by the same root-relative path strings that the file
    /// listing produces; files missing from it are read from disk.
    pub fn build_with_content_hint(root: &Path, content_hint: &HashMap<String, String>) -> Self {
        Self::build_from_directory_inner(root, content_hint)
    }
222
223    fn build_from_directory_inner(root: &Path, content_hint: &HashMap<String, String>) -> Self {
224        let root_str = root.to_string_lossy();
225        if !super::graph_index::is_safe_scan_root_public(&root_str) {
226            tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
227            return Self::new();
228        }
229        let mut index = Self::new();
230        let files = list_code_files(root);
231        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
232        let mut cache_hits = 0usize;
233
234        for (i, rel) in files.iter().enumerate() {
235            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
236                tracing::warn!(
237                    "[bm25: stopping build at file {i}/{} due to memory pressure]",
238                    files.len()
239                );
240                break;
241            }
242            if crate::core::memory_guard::abort_requested() {
243                tracing::warn!("[bm25: aborting build due to critical memory pressure]");
244                break;
245            }
246
247            let abs = root.join(rel);
248            let Some(state) = IndexedFileState::from_path(&abs) else {
249                continue;
250            };
251            if state.size_bytes > MAX_FILE_SIZE_BYTES {
252                continue;
253            }
254
255            let content = if let Some(cached) = content_hint.get(rel) {
256                cache_hits += 1;
257                std::borrow::Cow::Borrowed(cached.as_str())
258            } else {
259                match std::fs::read_to_string(&abs) {
260                    Ok(c) => std::borrow::Cow::Owned(c),
261                    Err(_) => continue,
262                }
263            };
264
265            let mut chunks = extract_chunks(rel, &content);
266            chunks.sort_by(|a, b| {
267                a.start_line
268                    .cmp(&b.start_line)
269                    .then_with(|| a.end_line.cmp(&b.end_line))
270                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
271            });
272            for chunk in chunks {
273                index.add_chunk(chunk);
274            }
275            index.files.insert(rel.clone(), state);
276        }
277
278        if cache_hits > 0 {
279            tracing::info!(
280                "[bm25: reused {cache_hits}/{} file contents from graph scan cache]",
281                files.len()
282            );
283        }
284
285        index.finalize();
286        index
287    }
288
289    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
290        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
291        for c in &prev.chunks {
292            old_by_file
293                .entry(c.file_path.clone())
294                .or_default()
295                .push(c.clone());
296        }
297        for v in old_by_file.values_mut() {
298            v.sort_by(|a, b| {
299                a.start_line
300                    .cmp(&b.start_line)
301                    .then_with(|| a.end_line.cmp(&b.end_line))
302                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
303            });
304        }
305
306        let mut index = Self::new();
307        let files = list_code_files(root);
308        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
309
310        for (i, rel) in files.iter().enumerate() {
311            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
312                tracing::warn!(
313                    "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
314                    files.len()
315                );
316                break;
317            }
318
319            let abs = root.join(rel);
320            let Some(state) = IndexedFileState::from_path(&abs) else {
321                continue;
322            };
323
324            let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
325            if unchanged {
326                if let Some(chunks) = old_by_file.get(rel) {
327                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
328                        for chunk in chunks {
329                            index.add_chunk(chunk.clone());
330                        }
331                        index.files.insert(rel.clone(), state);
332                        continue;
333                    }
334                }
335            }
336
337            if state.size_bytes > MAX_FILE_SIZE_BYTES {
338                continue;
339            }
340            if let Ok(content) = std::fs::read_to_string(&abs) {
341                let mut chunks = extract_chunks(rel, &content);
342                chunks.sort_by(|a, b| {
343                    a.start_line
344                        .cmp(&b.start_line)
345                        .then_with(|| a.end_line.cmp(&b.end_line))
346                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
347                });
348                for chunk in chunks {
349                    index.add_chunk(chunk);
350                }
351                index.files.insert(rel.clone(), state);
352            }
353        }
354
355        index.finalize();
356        index
357    }
358
359    fn add_chunk(&mut self, chunk: CodeChunk) {
360        let idx = self.chunks.len();
361
362        let enriched = enrich_for_bm25(&chunk);
363        let tokens = tokenize(&enriched);
364        for token in &tokens {
365            let lower = token.to_lowercase();
366            let postings = self.inverted.entry(lower.clone()).or_default();
367            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
368                *self.doc_freqs.entry(lower).or_insert(0) += 1;
369            }
370            postings.push((idx, 1.0));
371        }
372
373        self.chunks.push(CodeChunk {
374            token_count: tokens.len(),
375            tokens: Vec::new(),
376            ..chunk
377        });
378    }
379
380    fn finalize(&mut self) {
381        self.doc_count = self.chunks.len();
382        if self.doc_count == 0 {
383            return;
384        }
385
386        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
387        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
388    }
389
390    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
391        let query_tokens = tokenize(query);
392        if query_tokens.is_empty() || self.doc_count == 0 {
393            return Vec::new();
394        }
395
396        // Pre-allocated score array: O(1) per-access vs HashMap overhead.
397        // Kolmogorov-optimal: minimal allocation for the scoring operation.
398        let n = self.chunks.len();
399        let mut scores = vec![0.0f64; n];
400        let mut touched = Vec::with_capacity(n.min(256));
401
402        for token in &query_tokens {
403            let lower = token.to_lowercase();
404            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
405            if df == 0.0 {
406                continue;
407            }
408
409            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
410
411            if let Some(postings) = self.inverted.get(&lower) {
412                for &(idx, weight) in postings {
413                    let doc_len = self.chunks[idx].token_count as f64;
414                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
415                    let bm25 = idf * (weight * (BM25_K1 + 1.0))
416                        / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
417
418                    if scores[idx] == 0.0 {
419                        touched.push(idx);
420                    }
421                    scores[idx] += bm25;
422                }
423            }
424        }
425
426        let mut results: Vec<SearchResult> = touched
427            .iter()
428            .filter(|&&idx| scores[idx] > 0.0)
429            .map(|&idx| {
430                let chunk = &self.chunks[idx];
431                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
432                SearchResult {
433                    chunk_idx: idx,
434                    score: scores[idx],
435                    file_path: chunk.file_path.clone(),
436                    symbol_name: chunk.symbol_name.clone(),
437                    kind: chunk.kind.clone(),
438                    start_line: chunk.start_line,
439                    end_line: chunk.end_line,
440                    snippet,
441                }
442            })
443            .collect();
444
445        results.sort_by(|a, b| {
446            b.score
447                .partial_cmp(&a.score)
448                .unwrap_or(std::cmp::Ordering::Equal)
449                .then_with(|| a.file_path.cmp(&b.file_path))
450                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
451                .then_with(|| a.start_line.cmp(&b.start_line))
452                .then_with(|| a.end_line.cmp(&b.end_line))
453        });
454        results.truncate(top_k);
455        results
456    }
457
    /// Serializes the index (bincode, then zstd) and writes it to the index directory for
    /// `root`.
    ///
    /// Returns `SaveOutcome::Persisted` after a successful write, or
    /// `SaveOutcome::SkippedTooLarge` when the compressed size exceeds the configured ceiling
    /// (nothing is written in that case).
    ///
    /// # Errors
    ///
    /// Returns an I/O error when the directory cannot be created, encoding or compression
    /// fails, or the temporary file cannot be written or renamed into place.
    pub fn save(&self, root: &Path) -> std::io::Result<SaveOutcome> {
        // Advisory only: a very large chunk count still gets saved if it fits the ceiling.
        if self.chunks.len() > CHUNK_COUNT_WARNING {
            tracing::warn!(
                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
                self.chunks.len(),
                CHUNK_COUNT_WARNING
            );
        }

        let dir = index_dir(root);
        std::fs::create_dir_all(&dir)?;
        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
            .map_err(|e| std::io::Error::other(e.to_string()))?;

        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
        let compressed_bytes = compressed.len() as u64;

        let max_bytes = max_bm25_cache_bytes();
        if compressed_bytes > max_bytes {
            // Do NOT pretend success: a silent `Ok(())` here made `load` return
            // `None` forever and the index rebuild on every call (issue #249).
            // Report the refusal so the orchestrator can record an actionable
            // note and the agent-facing tools can stop claiming the index will
            // be "ready next call".
            tracing::warn!(
                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
                compressed_bytes as f64 / 1_048_576.0,
                max_bytes / (1024 * 1024),
                dir.display()
            );
            return Ok(SaveOutcome::SkippedTooLarge {
                compressed_bytes,
                limit_bytes: max_bytes,
            });
        }

        tracing::info!(
            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
            data.len() as f64 / 1_048_576.0,
            compressed_bytes as f64 / 1_048_576.0,
            (1.0 - compressed_bytes as f64 / data.len().max(1) as f64) * 100.0
        );

        // Write to a temporary name, then rename, so the target path never holds a
        // partially written file.
        let target = dir.join("bm25_index.bin.zst");
        let tmp = dir.join("bm25_index.bin.zst.tmp");
        std::fs::write(&tmp, &compressed)?;
        std::fs::rename(&tmp, &target)?;

        // Best effort: drop legacy-format files now superseded by the `.bin.zst` file.
        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
        let _ = std::fs::remove_file(dir.join("bm25_index.json"));

        // Best effort: record which project root this index directory belongs to.
        let _ = std::fs::write(
            dir.join("project_root.txt"),
            root.to_string_lossy().as_bytes(),
        );

        Ok(SaveOutcome::Persisted { compressed_bytes })
    }
517
518    pub fn load(root: &Path) -> Option<Self> {
519        let dir = index_dir(root);
520        let max_bytes = max_bm25_cache_bytes();
521
522        let zst_path = dir.join("bm25_index.bin.zst");
523        if zst_path.exists() {
524            let meta = std::fs::metadata(&zst_path).ok()?;
525            if meta.len() > max_bytes {
526                tracing::warn!(
527                    "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
528                    meta.len() as f64 / 1_073_741_824.0,
529                    max_bytes / (1024 * 1024),
530                    zst_path.display()
531                );
532                let quarantined = zst_path.with_extension("zst.quarantined");
533                let _ = std::fs::rename(&zst_path, &quarantined);
534                return None;
535            }
536            let compressed = std::fs::read(&zst_path).ok()?;
537            let max_decompressed = max_bytes * 20; // allow 20x expansion ratio
538            let data = bounded_zstd_decode(&compressed, max_decompressed)?;
539            let (idx, _): (Self, _) =
540                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
541            return Some(idx);
542        }
543
544        let bin_path = dir.join("bm25_index.bin");
545        if bin_path.exists() {
546            let meta = std::fs::metadata(&bin_path).ok()?;
547            if meta.len() > max_bytes {
548                tracing::warn!(
549                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
550                    meta.len() as f64 / 1_073_741_824.0,
551                    max_bytes / (1024 * 1024),
552                    bin_path.display()
553                );
554                let quarantined = bin_path.with_extension("bin.quarantined");
555                let _ = std::fs::rename(&bin_path, &quarantined);
556                return None;
557            }
558            let data = std::fs::read(&bin_path).ok()?;
559            let (idx, _): (Self, _) =
560                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
561            // Auto-migrate: compress legacy .bin to .bin.zst
562            if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
563                let zst_tmp = zst_path.with_extension("zst.tmp");
564                if std::fs::write(&zst_tmp, &compressed).is_ok()
565                    && std::fs::rename(&zst_tmp, &zst_path).is_ok()
566                {
567                    tracing::info!(
568                        "[bm25] migrated {:.1} MB → {:.1} MB zstd",
569                        data.len() as f64 / 1_048_576.0,
570                        compressed.len() as f64 / 1_048_576.0
571                    );
572                    let _ = std::fs::remove_file(&bin_path);
573                }
574            }
575            return Some(idx);
576        }
577
578        let json_path = dir.join("bm25_index.json");
579        if json_path.exists() {
580            let meta = std::fs::metadata(&json_path).ok()?;
581            if meta.len() > max_bytes {
582                tracing::warn!(
583                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
584                    meta.len() as f64 / 1_073_741_824.0,
585                    max_bytes / (1024 * 1024),
586                    json_path.display()
587                );
588                let quarantined = json_path.with_extension("json.quarantined");
589                let _ = std::fs::rename(&json_path, &quarantined);
590                return None;
591            }
592            let data = std::fs::read_to_string(&json_path).ok()?;
593            return serde_json::from_str(&data).ok();
594        }
595
596        None
597    }
598
    /// Returns a usable index for `root`: the persisted one when it is present and not
    /// stale, otherwise a rebuilt one. Uses the full staleness check.
    pub fn load_or_build(root: &Path) -> Self {
        Self::load_or_build_inner(root, false)
    }
602
    /// Like `load_or_build` but uses a fast sentinel-sampling staleness check
    /// that skips the expensive full directory walk for new-file detection.
    ///
    /// Trade-off: because only a sample of tracked files is compared, a change to an
    /// unsampled file or a newly added file is not detected by this check.
    pub fn load_or_build_fast(root: &Path) -> Self {
        Self::load_or_build_inner(root, true)
    }
608
609    fn load_or_build_inner(root: &Path, fast_stale: bool) -> Self {
610        if !is_safe_bm25_root(root) {
611            return Self::default();
612        }
613        if let Some(idx) = Self::load(root) {
614            let stale = if fast_stale {
615                bm25_index_looks_stale_fast(&idx, root)
616            } else {
617                bm25_index_looks_stale(&idx, root)
618            };
619            if !stale {
620                return idx;
621            }
622            tracing::debug!(
623                "[bm25_index: stale index detected for {}; rebuilding]",
624                root.display()
625            );
626            let rebuilt = if idx.files.is_empty() {
627                Self::build_from_directory(root)
628            } else {
629                Self::rebuild_incremental(root, &idx)
630            };
631            let _ = rebuilt.save(root);
632            return rebuilt;
633        }
634
635        let built = Self::build_from_directory(root);
636        let _ = built.save(root);
637        built
638    }
639
640    pub fn index_file_path(root: &Path) -> PathBuf {
641        let dir = index_dir(root);
642        let zst = dir.join("bm25_index.bin.zst");
643        if zst.exists() {
644            return zst;
645        }
646        let bin = dir.join("bm25_index.bin");
647        if bin.exists() {
648            return bin;
649        }
650        dir.join("bm25_index.json")
651    }
652
653    /// Ingest external `ContentChunk`s into the BM25 index.
654    /// Converts each chunk to a `CodeChunk` (backward-compatible) and
655    /// rebuilds the inverted index. Returns the number of chunks ingested.
656    pub fn ingest_content_chunks(
657        &mut self,
658        chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
659    ) -> usize {
660        let mut count = 0usize;
661        for cc in chunks {
662            self.add_chunk(cc.into());
663            count += 1;
664        }
665        if count > 0 {
666            self.finalize();
667        }
668        count
669    }
670
671    /// Number of chunks originating from external providers.
672    pub fn external_chunk_count(&self) -> usize {
673        self.chunks
674            .iter()
675            .filter(|c| c.file_path.contains("://"))
676            .count()
677    }
678}
679
/// Returns whether `root` may be scanned, delegating to the graph index's scan-root
/// safety check with the lossy string form of the path.
fn is_safe_bm25_root(root: &Path) -> bool {
    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
}
683
/// Full staleness check: compares every tracked file and also walks `root` to detect
/// files that are not yet in the index.
fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, false)
}
687
/// Fast staleness check: samples a subset of tracked files and skips the
/// expensive `list_code_files()` walk for new-file detection.
///
/// Returns `true` as soon as a sampled file is missing or its recorded state differs.
pub fn bm25_index_looks_stale_fast(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, true)
}
693
694fn bm25_index_looks_stale_inner(index: &BM25Index, root: &Path, fast: bool) -> bool {
695    if index.chunks.is_empty() {
696        return false;
697    }
698
699    if index.files.is_empty() {
700        let mut seen = std::collections::HashSet::<&str>::new();
701        for chunk in &index.chunks {
702            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
703            if rel.is_empty() {
704                continue;
705            }
706            if !seen.insert(rel) {
707                continue;
708            }
709            if !root.join(rel).exists() {
710                return true;
711            }
712        }
713        return false;
714    }
715
716    if fast {
717        let sample_size = index.files.len().min(SENTINEL_SAMPLE_SIZE);
718        let step = if index.files.len() > sample_size {
719            index.files.len() / sample_size
720        } else {
721            1
722        };
723        for (i, (rel, old_state)) in index.files.iter().enumerate() {
724            if i % step != 0 {
725                continue;
726            }
727            let abs = root.join(rel);
728            if !abs.exists() {
729                return true;
730            }
731            let Some(cur) = IndexedFileState::from_path(&abs) else {
732                return true;
733            };
734            if &cur != old_state {
735                return true;
736            }
737        }
738        return false;
739    }
740
741    for (rel, old_state) in &index.files {
742        let abs = root.join(rel);
743        if !abs.exists() {
744            return true;
745        }
746        let Some(cur) = IndexedFileState::from_path(&abs) else {
747            return true;
748        };
749        if &cur != old_state {
750            return true;
751        }
752    }
753
754    for rel in list_code_files(root) {
755        if !index.files.contains_key(&rel) {
756            return true;
757        }
758    }
759
760    false
761}
762
/// Target number of tracked files inspected by the fast staleness check; the sampling
/// stride is derived from it by integer division, so slightly more files may be visited.
const SENTINEL_SAMPLE_SIZE: usize = 10;
764
765fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
766    use std::io::Read;
767    let mut decoder = zstd::Decoder::new(compressed).ok()?;
768    let mut buf = Vec::new();
769    let mut chunk = vec![0u8; 65536];
770    let mut total = 0u64;
771    loop {
772        let n = decoder.read(&mut chunk).ok()?;
773        if n == 0 {
774            break;
775        }
776        total += n as u64;
777        if total > max_bytes {
778            tracing::warn!(
779                "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
780                total as f64 / (1024.0 * 1024.0),
781                max_bytes as f64 / (1024.0 * 1024.0)
782            );
783            return None;
784        }
785        buf.extend_from_slice(&chunk[..n]);
786    }
787    Some(buf)
788}
789
/// Directory that holds the persisted index files for `root`, as resolved by
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
793
794fn list_code_files(root: &Path) -> Vec<String> {
795    let walker = ignore::WalkBuilder::new(root)
796        .hidden(true)
797        .git_ignore(true)
798        .git_global(true)
799        .git_exclude(true)
800        .max_depth(Some(20))
801        .build();
802
803    let cfg = crate::core::config::Config::load();
804    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
805        .iter()
806        .filter_map(|p| glob::Pattern::new(p).ok())
807        .collect();
808    ignore_patterns.extend(
809        cfg.extra_ignore_patterns
810            .iter()
811            .filter_map(|p| glob::Pattern::new(p).ok()),
812    );
813
814    let mut files: Vec<String> = Vec::new();
815    for entry in walker.flatten() {
816        let path = entry.path();
817        if !path.is_file() {
818            continue;
819        }
820        if !is_code_file(path) {
821            continue;
822        }
823        let rel = path
824            .strip_prefix(root)
825            .unwrap_or(path)
826            .to_string_lossy()
827            .to_string();
828        if rel.is_empty() {
829            continue;
830        }
831        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
832            continue;
833        }
834        if files.len() >= MAX_BM25_FILES {
835            tracing::warn!(
836                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
837                root.display()
838            );
839            break;
840        }
841        files.push(rel);
842    }
843
844    files.sort();
845    files.dedup();
846    files
847}
848
/// Returns `true` when `path` has an extension (compared case-insensitively) that is on
/// the list of recognized source-code extensions. Paths without an extension, or with a
/// non-UTF-8 one, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let lowered = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&lowered.as_str())
}
885
886fn tokenize(text: &str) -> Vec<String> {
887    let mut tokens = Vec::new();
888    let mut current = String::new();
889
890    for ch in text.chars() {
891        if ch.is_alphanumeric() || ch == '_' {
892            current.push(ch);
893        } else {
894            if current.len() >= 2 {
895                tokens.push(current.clone());
896            }
897            current.clear();
898        }
899    }
900    if current.len() >= 2 {
901        tokens.push(current);
902    }
903
904    split_camel_case_tokens(&tokens)
905}
906
907pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
908    tokenize(text)
909}
910
/// Expands identifier tokens with their camelCase / PascalCase components.
///
/// Every input token is emitted verbatim. If it contains word boundaries, the
/// individual words follow it (words shorter than two bytes are dropped).
///
/// An uppercase character starts a new word when either
/// - the previous character is not uppercase (`parse|HTTP`, `calculate|Total`), or
/// - the next character is lowercase, i.e. it is the last capital of an acronym
///   run that begins a regular word (`HTTP|Server`).
///
/// Looking only at the following character (as this function used to) cut
/// trailing acronyms before their last letter (`parseHTTP` produced `parseHTT`)
/// and emitted fragments for all-caps tokens (`ABC` produced `AB`).
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let follows_non_upper = !chars[i - 1].is_uppercase();
            let begins_word_after_acronym = chars.get(i + 1).is_some_and(|c| c.is_lowercase());
            if follows_non_upper || begins_word_after_acronym {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Only emit the tail when at least one boundary was found; otherwise it
        // would duplicate the whole token.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }
    result
}
935
936fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
937    #[cfg(feature = "tree-sitter")]
938    {
939        let ext = std::path::Path::new(file_path)
940            .extension()
941            .and_then(|e| e.to_str())
942            .unwrap_or("");
943        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
944            return chunks;
945        }
946    }
947
948    let lines: Vec<&str> = content.lines().collect();
949    if lines.is_empty() {
950        return Vec::new();
951    }
952
953    let mut chunks = Vec::new();
954    let mut i = 0;
955
956    while i < lines.len() {
957        let trimmed = lines[i].trim();
958
959        if let Some((name, kind)) = detect_symbol(trimmed) {
960            let start = i;
961            let end = find_block_end(&lines, i);
962            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
963            let token_count = tokenize(&block).len();
964
965            chunks.push(CodeChunk {
966                file_path: file_path.to_string(),
967                symbol_name: name,
968                kind,
969                start_line: start + 1,
970                end_line: end + 1,
971                content: block,
972                tokens: Vec::new(),
973                token_count,
974            });
975
976            i = end + 1;
977        } else {
978            i += 1;
979        }
980    }
981
982    if chunks.is_empty() && !content.is_empty() {
983        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
984        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
985        //
986        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
987        let bytes = content.as_bytes();
988        let rk_chunks = crate::core::rabin_karp::chunk(content);
989        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
990            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
991                let end = (c.offset + c.length).min(bytes.len());
992                let slice = &bytes[c.offset..end];
993                let chunk_text = String::from_utf8_lossy(slice).into_owned();
994                let token_count = tokenize(&chunk_text).len();
995                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
996                let end_line = start_line + bytecount::count(slice, b'\n');
997                chunks.push(CodeChunk {
998                    file_path: file_path.to_string(),
999                    symbol_name: format!("{file_path}#chunk-{idx}"),
1000                    kind: ChunkKind::Module,
1001                    start_line,
1002                    end_line: end_line.max(start_line),
1003                    content: chunk_text,
1004                    tokens: Vec::new(),
1005                    token_count,
1006                });
1007            }
1008        } else {
1009            let token_count = tokenize(content).len();
1010            let snippet = lines
1011                .iter()
1012                .take(50)
1013                .copied()
1014                .collect::<Vec<_>>()
1015                .join("\n");
1016            chunks.push(CodeChunk {
1017                file_path: file_path.to_string(),
1018                symbol_name: file_path.to_string(),
1019                kind: ChunkKind::Module,
1020                start_line: 1,
1021                end_line: lines.len(),
1022                content: snippet,
1023                tokens: Vec::new(),
1024                token_count,
1025            });
1026        }
1027    }
1028
1029    chunks
1030}
1031
1032fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
1033    let trimmed = line.trim();
1034
1035    let patterns: &[(&str, ChunkKind)] = &[
1036        ("pub async fn ", ChunkKind::Function),
1037        ("async fn ", ChunkKind::Function),
1038        ("pub fn ", ChunkKind::Function),
1039        ("fn ", ChunkKind::Function),
1040        ("pub struct ", ChunkKind::Struct),
1041        ("struct ", ChunkKind::Struct),
1042        ("pub enum ", ChunkKind::Struct),
1043        ("enum ", ChunkKind::Struct),
1044        ("impl ", ChunkKind::Impl),
1045        ("pub trait ", ChunkKind::Struct),
1046        ("trait ", ChunkKind::Struct),
1047        ("export function ", ChunkKind::Function),
1048        ("export async function ", ChunkKind::Function),
1049        ("export default function ", ChunkKind::Function),
1050        ("function ", ChunkKind::Function),
1051        ("async function ", ChunkKind::Function),
1052        ("export class ", ChunkKind::Class),
1053        ("class ", ChunkKind::Class),
1054        ("export interface ", ChunkKind::Struct),
1055        ("interface ", ChunkKind::Struct),
1056        ("def ", ChunkKind::Function),
1057        ("async def ", ChunkKind::Function),
1058        ("class ", ChunkKind::Class),
1059        ("func ", ChunkKind::Function),
1060    ];
1061
1062    for (prefix, kind) in patterns {
1063        if let Some(rest) = trimmed.strip_prefix(prefix) {
1064            let name: String = rest
1065                .chars()
1066                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
1067                .take_while(|c| *c != '<')
1068                .collect();
1069            if !name.is_empty() {
1070                return Some((name, kind.clone()));
1071            }
1072        }
1073    }
1074
1075    None
1076}
1077
/// Returns the index of the last line that belongs to the declaration starting
/// at `lines[start]`.
///
/// Heuristics, applied line by line from `start`:
/// - `{`/`(` and `}`/`)` share one nesting depth, but only a `}` that brings the
///   depth back to zero ends the block. A closing parenthesis never does, so
///   `fn f(a: u8) {` extends to its matching brace instead of stopping on the
///   signature line (which is what happened when `)` was allowed to end a block).
/// - Before any `{` has been seen, a line that ends in `;` outside all
///   parentheses is a body-less declaration (`struct Id(u32);`) and ends the
///   block on that line.
/// - Before any `{` has been seen and more than two lines past `start`, a blank
///   or non-indented line ends the block on the previous line. This covers
///   indentation-delimited bodies; the check looks at the raw line, since a
///   trimmed line can never start with whitespace.
///
/// Falls back to `start + 50`, clamped to the last line, when no end is found.
/// Braces and parentheses inside string literals or comments are counted like
/// any others.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut depth = 0usize;
    let mut found_open = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    depth += 1;
                    found_open = true;
                }
                '(' => depth += 1,
                '}' if depth > 0 => {
                    depth -= 1;
                    if depth == 0 && found_open {
                        return i;
                    }
                }
                ')' if depth > 0 => depth -= 1,
                _ => {}
            }
        }

        if found_open {
            continue;
        }

        // Body-less declaration terminated on this line.
        if depth == 0 && line.trim_end().ends_with(';') {
            return i;
        }

        // Indentation-delimited body: stop before the first blank or dedented line.
        if i > start + 2 {
            let indented = line.starts_with(' ') || line.starts_with('\t');
            if line.trim().is_empty() || !indented {
                return i.saturating_sub(1);
            }
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
1115
1116pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
1117    if results.is_empty() {
1118        return "No results found.".to_string();
1119    }
1120
1121    let mut out = String::new();
1122    for (i, r) in results.iter().enumerate() {
1123        let is_external = r.file_path.contains("://");
1124        // Forward-slash normalize local paths so Windows backslashes are never
1125        // dropped/escape-mangled by client render layers (issue #324). External
1126        // URIs (provider results, e.g. `github://`) are left untouched.
1127        let normalized;
1128        let file_path: &str = if is_external {
1129            &r.file_path
1130        } else {
1131            normalized = crate::core::protocol::display_path(&r.file_path);
1132            &normalized
1133        };
1134        if compact {
1135            if is_external {
1136                out.push_str(&format!(
1137                    "{}. {:.2} [{:?}] {} — {}\n",
1138                    i + 1,
1139                    r.score,
1140                    r.kind,
1141                    file_path,
1142                    r.symbol_name,
1143                ));
1144            } else {
1145                out.push_str(&format!(
1146                    "{}. {:.2} {}:{}-{} {:?} {}\n",
1147                    i + 1,
1148                    r.score,
1149                    file_path,
1150                    r.start_line,
1151                    r.end_line,
1152                    r.kind,
1153                    r.symbol_name,
1154                ));
1155            }
1156        } else if is_external {
1157            out.push_str(&format!(
1158                "\n--- Result {} (score: {:.2}) [{:?}] ---\n{} — {}\n{}\n",
1159                i + 1,
1160                r.score,
1161                r.kind,
1162                file_path,
1163                r.symbol_name,
1164                r.snippet,
1165            ));
1166        } else {
1167            out.push_str(&format!(
1168                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
1169                i + 1,
1170                r.score,
1171                file_path,
1172                r.symbol_name,
1173                r.kind,
1174                r.start_line,
1175                r.end_line,
1176                r.snippet,
1177            ));
1178        }
1179    }
1180    out
1181}
1182
1183/// Enrich chunk content with file-path components for BM25 path-matching.
1184///
1185/// SACL (EMNLP 2025) shows that augmenting code with structural information
1186/// improves retrieval by 7-12.8%. We append the file stem twice (for boost)
1187/// and the immediate parent directory once, enabling queries like "auth handler"
1188/// to match `src/auth/handler.rs`.
1189fn enrich_for_bm25(chunk: &CodeChunk) -> String {
1190    let path = Path::new(&chunk.file_path);
1191    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1192    let dir = path
1193        .parent()
1194        .and_then(|p| p.file_name())
1195        .and_then(|d| d.to_str())
1196        .unwrap_or("");
1197
1198    if stem.is_empty() {
1199        return chunk.content.clone();
1200    }
1201
1202    format!("{} {} {} {}", chunk.content, stem, stem, dir)
1203}
1204
1205#[cfg(test)]
1206mod tests {
1207    use super::*;
1208    use tempfile::tempdir;
1209
1210    #[cfg(unix)]
1211    use std::os::unix::fs::PermissionsExt;
1212
1213    #[test]
1214    fn tokenize_splits_code() {
1215        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
1216        assert!(tokens.contains(&"calculate_total".to_string()));
1217        assert!(tokens.contains(&"items".to_string()));
1218        assert!(tokens.contains(&"Vec".to_string()));
1219    }
1220
1221    #[test]
1222    fn format_search_results_normalizes_windows_separators() {
1223        // Issue #324: Windows backslash paths in search output were dropped or
1224        // escape-mangled by client render layers. They must come out with
1225        // forward slashes.
1226        let r = SearchResult {
1227            chunk_idx: 0,
1228            score: 1.0,
1229            file_path: r"C:\Users\zir\AppData\Local\Temp\win-build-log.txt".to_string(),
1230            symbol_name: "main".to_string(),
1231            kind: ChunkKind::Function,
1232            start_line: 1,
1233            end_line: 2,
1234            snippet: "x".to_string(),
1235        };
1236        let compact = format_search_results(std::slice::from_ref(&r), true);
1237        assert!(compact.contains("C:/Users/zir/AppData/Local/Temp/win-build-log.txt"));
1238        assert!(!compact.contains('\\'));
1239
1240        let verbose = format_search_results(std::slice::from_ref(&r), false);
1241        assert!(verbose.contains("C:/Users/zir/AppData/Local/Temp/win-build-log.txt"));
1242        assert!(!verbose.contains('\\'));
1243    }
1244
1245    #[test]
1246    fn format_search_results_leaves_external_uris_untouched() {
1247        // Provider results (e.g. github://) are not OS paths and must not be
1248        // rewritten.
1249        let r = SearchResult {
1250            chunk_idx: 0,
1251            score: 1.0,
1252            file_path: "github://owner/repo/issues/42".to_string(),
1253            symbol_name: "issue".to_string(),
1254            kind: ChunkKind::Module,
1255            start_line: 0,
1256            end_line: 0,
1257            snippet: "y".to_string(),
1258        };
1259        let out = format_search_results(std::slice::from_ref(&r), true);
1260        assert!(out.contains("github://owner/repo/issues/42"));
1261    }
1262
1263    #[test]
1264    fn camel_case_splitting() {
1265        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
1266        assert!(tokens.contains(&"calculateTotal".to_string()));
1267        assert!(tokens.contains(&"calculate".to_string()));
1268        assert!(tokens.contains(&"Total".to_string()));
1269    }
1270
1271    #[test]
1272    fn detect_rust_function() {
1273        let (name, kind) =
1274            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
1275        assert_eq!(name, "process_request");
1276        assert_eq!(kind, ChunkKind::Function);
1277    }
1278
1279    #[test]
1280    fn bm25_search_finds_relevant() {
1281        let mut index = BM25Index::new();
1282        index.add_chunk(CodeChunk {
1283            file_path: "auth.rs".into(),
1284            symbol_name: "validate_token".into(),
1285            kind: ChunkKind::Function,
1286            start_line: 1,
1287            end_line: 10,
1288            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
1289            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
1290            token_count: 8,
1291        });
1292        index.add_chunk(CodeChunk {
1293            file_path: "db.rs".into(),
1294            symbol_name: "connect_database".into(),
1295            kind: ChunkKind::Function,
1296            start_line: 1,
1297            end_line: 5,
1298            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
1299            tokens: tokenize("fn connect_database url str Pool create_pool url"),
1300            token_count: 7,
1301        });
1302        index.finalize();
1303
1304        let results = index.search("jwt token validation", 5);
1305        assert!(!results.is_empty());
1306        assert_eq!(results[0].symbol_name, "validate_token");
1307    }
1308
1309    #[test]
1310    fn bm25_search_sorts_ties_deterministically() {
1311        let mut index = BM25Index::new();
1312
1313        // Insert in reverse path order to ensure the sort tie-break matters.
1314        index.add_chunk(CodeChunk {
1315            file_path: "b.rs".into(),
1316            symbol_name: "same".into(),
1317            kind: ChunkKind::Function,
1318            start_line: 1,
1319            end_line: 1,
1320            content: "fn same() {}".into(),
1321            tokens: tokenize("same token"),
1322            token_count: 2,
1323        });
1324        index.add_chunk(CodeChunk {
1325            file_path: "a.rs".into(),
1326            symbol_name: "same".into(),
1327            kind: ChunkKind::Function,
1328            start_line: 1,
1329            end_line: 1,
1330            content: "fn same() {}".into(),
1331            tokens: tokenize("same token"),
1332            token_count: 2,
1333        });
1334        index.finalize();
1335
1336        let results = index.search("same", 10);
1337        assert!(results.len() >= 2);
1338        assert_eq!(results[0].file_path, "a.rs");
1339        assert_eq!(results[1].file_path, "b.rs");
1340    }
1341
1342    #[test]
1343    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
1344        let td = tempdir().expect("tempdir");
1345        let root = td.path();
1346        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
1347
1348        let idx = BM25Index::build_from_directory(root);
1349        assert!(!bm25_index_looks_stale(&idx, root));
1350
1351        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
1352        assert!(bm25_index_looks_stale(&idx, root));
1353    }
1354
1355    #[test]
1356    #[cfg(unix)]
1357    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
1358        let td = tempdir().expect("tempdir");
1359        let root = td.path();
1360
1361        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
1362        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
1363
1364        let idx1 = BM25Index::build_from_directory(root);
1365        assert!(idx1.files.contains_key("a.rs"));
1366        assert!(idx1.files.contains_key("b.rs"));
1367
1368        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
1369        let a_path = root.join("a.rs");
1370        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1371        perms.set_mode(0o000);
1372        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
1373
1374        // Change b.rs (size changes) to force a re-read for that file.
1375        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
1376            .expect("rewrite b.rs");
1377
1378        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
1379        assert!(
1380            idx2.files.contains_key("a.rs"),
1381            "a.rs should be kept via reuse"
1382        );
1383        assert!(idx2.files.contains_key("b.rs"));
1384
1385        let b_has_b2 = idx2
1386            .chunks
1387            .iter()
1388            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
1389        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
1390
1391        // Restore permissions to avoid cleanup surprises.
1392        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1393        perms.set_mode(0o644);
1394        let _ = std::fs::set_permissions(&a_path, perms);
1395    }
1396
1397    #[test]
1398    fn load_quarantines_oversized_index() {
1399        let _env = crate::core::data_dir::test_env_lock();
1400        let td = tempdir().expect("tempdir");
1401        let root = td.path();
1402        let dir = crate::core::index_namespace::vectors_dir(root);
1403        std::fs::create_dir_all(&dir).expect("create vectors dir");
1404
1405        let index_path = dir.join("bm25_index.json");
1406        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1407        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
1408
1409        let result = BM25Index::load(root);
1410        assert!(result.is_none(), "oversized index should return None");
1411        assert!(
1412            !index_path.exists(),
1413            "original index should be removed after quarantine"
1414        );
1415        assert!(
1416            dir.join("bm25_index.json.quarantined").exists(),
1417            "quarantined file should exist"
1418        );
1419
1420        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1421    }
1422
1423    #[test]
1424    fn save_refuses_oversized_output() {
1425        let _env = crate::core::data_dir::test_env_lock();
1426        let data_dir = tempdir().expect("data_dir");
1427        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1428        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1429
1430        let td = tempdir().expect("tempdir");
1431        let root = td.path();
1432
1433        let mut index = BM25Index::new();
1434        index.add_chunk(CodeChunk {
1435            file_path: "a.rs".into(),
1436            symbol_name: "a".into(),
1437            kind: ChunkKind::Function,
1438            start_line: 1,
1439            end_line: 1,
1440            content: "fn a() {}".into(),
1441            tokens: tokenize("fn a"),
1442            token_count: 2,
1443        });
1444        index.finalize();
1445
1446        let outcome = index
1447            .save(root)
1448            .expect("save returns Ok even when refusing");
1449        assert!(
1450            matches!(outcome, SaveOutcome::SkippedTooLarge { .. }),
1451            "oversized save must report SkippedTooLarge (not a silent success), got {outcome:?}"
1452        );
1453        let index_path = BM25Index::index_file_path(root);
1454        assert!(
1455            !index_path.exists(),
1456            "save should refuse to persist oversized index"
1457        );
1458
1459        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1460    }
1461
1462    #[test]
1463    fn save_reports_persisted_outcome() {
1464        let _env = crate::core::data_dir::test_env_lock();
1465        let data_dir = tempdir().expect("data_dir");
1466        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1467        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1468        let td = tempdir().expect("tempdir");
1469        let root = td.path();
1470        std::fs::write(root.join("a.rs"), "pub fn alpha() {}\n").expect("write");
1471
1472        let index = BM25Index::build_from_directory(root);
1473        let outcome = index.save(root).expect("save");
1474        match outcome {
1475            SaveOutcome::Persisted { compressed_bytes } => {
1476                assert!(compressed_bytes > 0, "persisted size should be non-zero");
1477            }
1478            SaveOutcome::SkippedTooLarge { .. } => {
1479                panic!("expected Persisted, got {outcome:?}")
1480            }
1481        }
1482
1483        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1484        std::env::remove_var("LEAN_CTX_DATA_DIR");
1485    }
1486
1487    #[test]
1488    fn persist_ceiling_honors_env_override() {
1489        // The public ceiling accessor (shared with doctor) must honor an explicit
1490        // override exactly, so operators can size it to their monorepo.
1491        let _env = crate::core::data_dir::test_env_lock();
1492        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "777");
1493        assert_eq!(persist_ceiling_bytes(), 777 * 1024 * 1024);
1494        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1495    }
1496
1497    #[test]
1498    fn save_writes_project_root_marker() {
1499        let _env = crate::core::data_dir::test_env_lock();
1500        let td = tempdir().expect("tempdir");
1501        let root = td.path();
1502        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1503
1504        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1505        let index = BM25Index::build_from_directory(root);
1506        index.save(root).expect("save");
1507
1508        let dir = crate::core::index_namespace::vectors_dir(root);
1509        let marker = dir.join("project_root.txt");
1510        assert!(marker.exists(), "project_root.txt marker should exist");
1511        let content = std::fs::read_to_string(&marker).expect("read marker");
1512        assert_eq!(content, root.to_string_lossy());
1513    }
1514
1515    #[test]
1516    fn save_load_roundtrip_uses_zstd() {
1517        let _env = crate::core::data_dir::test_env_lock();
1518        let data_dir = tempdir().expect("data_dir");
1519        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1520        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1521        let td = tempdir().expect("tempdir");
1522        let root = td.path();
1523
1524        for i in 0..10 {
1525            std::fs::write(
1526                root.join(format!("mod{i}.rs")),
1527                format!(
1528                    "pub fn handler_{i}() {{\n    println!(\"hello\");\n}}\n\n\
1529                     pub fn helper_{i}() {{\n    println!(\"world\");\n}}\n"
1530                ),
1531            )
1532            .expect("write");
1533        }
1534
1535        let index = BM25Index::build_from_directory(root);
1536        assert!(index.doc_count > 0, "should have indexed chunks");
1537        index.save(root).expect("save");
1538
1539        let dir = crate::core::index_namespace::vectors_dir(root);
1540        let zst = dir.join("bm25_index.bin.zst");
1541        assert!(zst.exists(), "should write .bin.zst");
1542        assert!(
1543            !dir.join("bm25_index.bin").exists(),
1544            ".bin should be deleted"
1545        );
1546
1547        let loaded = BM25Index::load(root).expect("load compressed index");
1548        assert_eq!(loaded.doc_count, index.doc_count);
1549        assert_eq!(loaded.chunks.len(), index.chunks.len());
1550
1551        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1552        std::env::remove_var("LEAN_CTX_DATA_DIR");
1553    }
1554
1555    #[test]
1556    fn auto_migrate_bin_to_zst() {
1557        let _env = crate::core::data_dir::test_env_lock();
1558        let data_dir = tempdir().expect("data_dir");
1559        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1560        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1561        let td = tempdir().expect("tempdir");
1562        let root = td.path();
1563
1564        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1565        let index = BM25Index::build_from_directory(root);
1566
1567        let dir = crate::core::index_namespace::vectors_dir(root);
1568        std::fs::create_dir_all(&dir).expect("mkdir");
1569        let data =
1570            bincode::serde::encode_to_vec(&index, bincode::config::standard()).expect("encode");
1571        std::fs::write(dir.join("bm25_index.bin"), &data).expect("write bin");
1572
1573        let loaded = BM25Index::load(root).expect("load should auto-migrate");
1574        assert_eq!(loaded.doc_count, index.doc_count);
1575        assert!(
1576            dir.join("bm25_index.bin.zst").exists(),
1577            ".bin.zst should be created"
1578        );
1579        assert!(
1580            !dir.join("bm25_index.bin").exists(),
1581            ".bin should be removed"
1582        );
1583
1584        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1585        std::env::remove_var("LEAN_CTX_DATA_DIR");
1586    }
1587
1588    #[test]
1589    fn list_code_files_skips_default_vendor_ignores() {
1590        let td = tempdir().expect("tempdir");
1591        let root = td.path();
1592
1593        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1594        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1595        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1596        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1597        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1598
1599        let files = list_code_files(root);
1600        assert!(
1601            files.iter().any(|f| f == "main.rs"),
1602            "main.rs should be included"
1603        );
1604        assert!(
1605            !files.iter().any(|f| f.starts_with("vendor/")),
1606            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1607        );
1608        assert!(
1609            !files.iter().any(|f| f.starts_with("dist/")),
1610            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1611        );
1612    }
1613
1614    #[test]
1615    fn list_code_files_respects_max_files_cap() {
1616        let td = tempdir().expect("tempdir");
1617        let root = td.path();
1618
1619        // Create more files than MAX_BM25_FILES wouldn't let us test easily (5000),
1620        // but we can verify the cap constant exists and the function returns a bounded vec.
1621        for i in 0..10 {
1622            std::fs::write(
1623                root.join(format!("f{i}.rs")),
1624                format!("pub fn f{i}() {{}}\n"),
1625            )
1626            .expect("write");
1627        }
1628        let files = list_code_files(root);
1629        assert!(
1630            files.len() <= MAX_BM25_FILES,
1631            "file count should not exceed MAX_BM25_FILES"
1632        );
1633    }
1634
1635    #[test]
1636    fn max_bm25_cache_bytes_reads_env() {
1637        let _env = crate::core::data_dir::test_env_lock();
1638        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1639        let bytes = max_bm25_cache_bytes();
1640        assert_eq!(bytes, 64 * 1024 * 1024);
1641        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1642    }
1643}
lean_ctx/core/bm25_index.rs

lean_ctx/core/
bm25_index.rs