lean_ctx/core/bm25_index/
mod.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6mod chunking;
7pub use chunking::*;
8#[cfg(test)]
9mod tests;
10
/// Maximum number of files `list_code_files` collects before it stops walking.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `save` logs a warning suggesting extra ignore patterns.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// zstd compression level used when writing the persisted index.
const ZSTD_LEVEL: i32 = 9;

/// Glob patterns that `list_code_files` always excludes, matched against paths
/// relative to the scan root; the configured `extra_ignore_patterns` are added
/// on top of these.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
31
32fn max_bm25_cache_bytes() -> u64 {
33    // Single source of truth: `Config::bm25_max_cache_mb_effective` (env override
34    // › explicit config › disk-budget › generous default). Decoupled from the RAM
35    // profile so large repos persist instead of rebuilding forever (issue #249).
36    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
37        .ok()
38        .and_then(|v| v.parse::<u64>().ok())
39        .unwrap_or_else(|| crate::core::config::Config::load().bm25_max_cache_mb_effective());
40    mb * 1024 * 1024
41}
42
/// Effective on-disk ceiling (bytes) for the persisted BM25 index. Single source
/// of truth shared with `doctor` so its "oversized index" warning matches what
/// `save`/`load` actually enforce.
///
/// Returns exactly the value of `max_bm25_cache_bytes()`.
pub fn persist_ceiling_bytes() -> u64 {
    max_bm25_cache_bytes()
}
49
/// Outcome of persisting a BM25 index to disk. Distinguishes a real write from a
/// size-capped refusal so callers never mistake "refused to persist" for
/// success (the bug behind the perpetual "index warming" report, issue #249).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SaveOutcome {
    /// Written to disk. Carries the compressed (zstd) size in bytes.
    Persisted { compressed_bytes: u64 },
    /// Built fine but NOT written — the compressed size exceeds the disk
    /// ceiling. The in-memory index is still usable for this process; callers
    /// should surface the remedy (raise the cap / add ignore patterns) instead
    /// of silently rebuilding on every call.
    SkippedTooLarge {
        /// Size in bytes of the zstd-compressed index that was refused.
        compressed_bytes: u64,
        /// Ceiling in bytes the compressed size was compared against.
        limit_bytes: u64,
    },
}
66
/// One indexed unit of text (a BM25 "document") together with its location.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Source of the chunk. For files found by a directory scan this is the
    /// root-relative path (incremental rebuilds look chunks up by it); paths
    /// containing `://` are counted as external by `external_chunk_count`.
    pub file_path: String,
    /// Name associated with the chunk — presumably the symbol it defines; the
    /// chunker that fills it in is not part of this file.
    pub symbol_name: String,
    /// Category of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk in its source (numbering base set by the chunker).
    pub start_line: usize,
    /// Last line of the chunk in its source.
    pub end_line: usize,
    /// Chunk text; its first five lines become the snippet in search results.
    pub content: String,
    /// Token list; `add_chunk` always stores it empty, and it defaults to
    /// empty when missing from serialized data.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens `add_chunk` produced for the chunk; used as the
    /// document length when scoring.
    pub token_count: usize,
}
79
/// Category of an indexed chunk: code constructs first, then kinds used for
/// content coming from external sources.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
    // -- External source kinds (Context Engine) --
    Issue,
    PullRequest,
    WikiPage,
    DbSchema,
    ApiEndpoint,
    Ticket,
    ExternalOther,
}
98
/// Snapshot of a file's metadata taken when it was indexed; two snapshots are
/// compared for equality to decide whether the file changed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File length in bytes.
    pub size_bytes: u64,
}
104
105impl IndexedFileState {
106    fn from_path(path: &Path) -> Option<Self> {
107        let meta = path.metadata().ok()?;
108        let size_bytes = meta.len();
109        let mtime_ms = meta
110            .modified()
111            .ok()
112            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
113            .map(|d| d.as_millis() as u64)?;
114        Some(Self {
115            mtime_ms,
116            size_bytes,
117        })
118    }
119}
120
/// In-memory BM25 index over code chunks, serializable for on-disk persistence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; postings refer to them by position in this vector.
    pub chunks: Vec<CodeChunk>,
    /// Lowercased token -> postings, each a `(chunk index, weight)` pair.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, recomputed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks as of the last `finalize`.
    pub doc_count: usize,
    /// Lowercased token -> number of chunks that contain it.
    pub doc_freqs: HashMap<String, usize>,
    /// Metadata snapshot of every indexed file, keyed by root-relative path.
    /// Defaults to empty when absent from serialized data.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
131
/// One hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score over all query tokens; higher ranks first.
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// First five lines of the chunk content, joined with `\n`.
    pub snippet: String,
}
143
/// `k1` parameter of the BM25 formula evaluated in `search`.
const BM25_K1: f64 = 1.2;
/// `b` parameter of the BM25 formula: how strongly the length-normalized
/// document length enters the denominator.
const BM25_B: f64 = 0.75;
146
impl Default for BM25Index {
    /// Same as `BM25Index::new`: an empty index.
    fn default() -> Self {
        Self::new()
    }
}
152
153impl BM25Index {
    /// Creates an empty index: no chunks, postings or tracked files, and
    /// zeroed statistics.
    pub fn new() -> Self {
        Self {
            chunks: Vec::new(),
            inverted: HashMap::new(),
            avg_doc_len: 0.0,
            doc_count: 0,
            doc_freqs: HashMap::new(),
            files: HashMap::new(),
        }
    }
164
165    /// Approximate heap memory used by this index in bytes.
166    pub fn memory_usage_bytes(&self) -> usize {
167        let chunks_size: usize = self
168            .chunks
169            .iter()
170            .map(|c| {
171                c.content.len()
172                    + c.file_path.len()
173                    + c.symbol_name.len()
174                    + c.tokens.iter().map(String::len).sum::<usize>()
175                    + 64
176            })
177            .sum();
178        let inverted_size: usize = self
179            .inverted
180            .iter()
181            .map(|(k, v)| k.len() + v.len() * 16 + 32)
182            .sum();
183        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
184        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
185        chunks_size + inverted_size + files_size + freqs_size
186    }
187
188    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
189    pub fn unload(&mut self) {
190        let usage = self.memory_usage_bytes();
191        self.chunks = Vec::new();
192        self.inverted = HashMap::new();
193        self.doc_freqs = HashMap::new();
194        self.files = HashMap::new();
195        self.avg_doc_len = 0.0;
196        self.doc_count = 0;
197        tracing::info!(
198            "[bm25] unloaded index, freed ~{:.1}MB",
199            usage as f64 / 1_048_576.0
200        );
201    }
202
203    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
204    #[cfg(test)]
205    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
206        let mut index = Self::new();
207        for mut chunk in chunks {
208            if chunk.token_count == 0 {
209                chunk.token_count = tokenize(&chunk.content).len();
210            }
211            index.add_chunk(chunk);
212        }
213        index.finalize();
214        index
215    }
216
    /// Builds a fresh index by scanning `root`, reading every file from disk
    /// (no content hint is supplied).
    pub fn build_from_directory(root: &Path) -> Self {
        Self::build_from_directory_inner(root, &HashMap::new())
    }
220
    /// Like `build_from_directory` but reuses file content from a prior scan
    /// (e.g. the graph index walk) to avoid redundant disk reads.
    ///
    /// `content_hint` is looked up by root-relative path; files missing from it
    /// are read from disk as usual.
    pub fn build_with_content_hint(root: &Path, content_hint: &HashMap<String, String>) -> Self {
        Self::build_from_directory_inner(root, content_hint)
    }
226
227    fn build_from_directory_inner(root: &Path, content_hint: &HashMap<String, String>) -> Self {
228        let root_str = root.to_string_lossy();
229        if !super::graph_index::is_safe_scan_root_public(&root_str) {
230            tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
231            return Self::new();
232        }
233        let mut index = Self::new();
234        let files = list_code_files(root);
235        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
236        let mut cache_hits = 0usize;
237
238        for (i, rel) in files.iter().enumerate() {
239            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
240                tracing::warn!(
241                    "[bm25: stopping build at file {i}/{} due to memory pressure]",
242                    files.len()
243                );
244                break;
245            }
246            if crate::core::memory_guard::abort_requested() {
247                tracing::warn!("[bm25: aborting build due to critical memory pressure]");
248                break;
249            }
250
251            let abs = root.join(rel);
252            let Some(state) = IndexedFileState::from_path(&abs) else {
253                continue;
254            };
255            if state.size_bytes > MAX_FILE_SIZE_BYTES {
256                continue;
257            }
258
259            let content = if let Some(cached) = content_hint.get(rel) {
260                cache_hits += 1;
261                std::borrow::Cow::Borrowed(cached.as_str())
262            } else {
263                match std::fs::read_to_string(&abs) {
264                    Ok(c) => std::borrow::Cow::Owned(c),
265                    Err(_) => continue,
266                }
267            };
268
269            let mut chunks = extract_chunks(rel, &content);
270            chunks.sort_by(|a, b| {
271                a.start_line
272                    .cmp(&b.start_line)
273                    .then_with(|| a.end_line.cmp(&b.end_line))
274                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
275            });
276            for chunk in chunks {
277                index.add_chunk(chunk);
278            }
279            index.files.insert(rel.clone(), state);
280        }
281
282        if cache_hits > 0 {
283            tracing::info!(
284                "[bm25: reused {cache_hits}/{} file contents from graph scan cache]",
285                files.len()
286            );
287        }
288
289        index.finalize();
290        index
291    }
292
293    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
294        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
295        for c in &prev.chunks {
296            old_by_file
297                .entry(c.file_path.clone())
298                .or_default()
299                .push(c.clone());
300        }
301        for v in old_by_file.values_mut() {
302            v.sort_by(|a, b| {
303                a.start_line
304                    .cmp(&b.start_line)
305                    .then_with(|| a.end_line.cmp(&b.end_line))
306                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
307            });
308        }
309
310        let mut index = Self::new();
311        let files = list_code_files(root);
312        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
313
314        for (i, rel) in files.iter().enumerate() {
315            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
316                tracing::warn!(
317                    "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
318                    files.len()
319                );
320                break;
321            }
322
323            let abs = root.join(rel);
324            let Some(state) = IndexedFileState::from_path(&abs) else {
325                continue;
326            };
327
328            let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
329            if unchanged {
330                if let Some(chunks) = old_by_file.get(rel) {
331                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
332                        for chunk in chunks {
333                            index.add_chunk(chunk.clone());
334                        }
335                        index.files.insert(rel.clone(), state);
336                        continue;
337                    }
338                }
339            }
340
341            if state.size_bytes > MAX_FILE_SIZE_BYTES {
342                continue;
343            }
344            if let Ok(content) = std::fs::read_to_string(&abs) {
345                let mut chunks = extract_chunks(rel, &content);
346                chunks.sort_by(|a, b| {
347                    a.start_line
348                        .cmp(&b.start_line)
349                        .then_with(|| a.end_line.cmp(&b.end_line))
350                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
351                });
352                for chunk in chunks {
353                    index.add_chunk(chunk);
354                }
355                index.files.insert(rel.clone(), state);
356            }
357        }
358
359        index.finalize();
360        index
361    }
362
363    fn add_chunk(&mut self, chunk: CodeChunk) {
364        let idx = self.chunks.len();
365
366        let enriched = enrich_for_bm25(&chunk);
367        let tokens = tokenize(&enriched);
368        for token in &tokens {
369            let lower = token.to_lowercase();
370            let postings = self.inverted.entry(lower.clone()).or_default();
371            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
372                *self.doc_freqs.entry(lower).or_insert(0) += 1;
373            }
374            postings.push((idx, 1.0));
375        }
376
377        self.chunks.push(CodeChunk {
378            token_count: tokens.len(),
379            tokens: Vec::new(),
380            ..chunk
381        });
382    }
383
384    fn finalize(&mut self) {
385        self.doc_count = self.chunks.len();
386        if self.doc_count == 0 {
387            return;
388        }
389
390        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
391        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
392    }
393
394    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
395        let query_tokens = tokenize(query);
396        if query_tokens.is_empty() || self.doc_count == 0 {
397            return Vec::new();
398        }
399
400        // Pre-allocated score array: O(1) per-access vs HashMap overhead.
401        // Kolmogorov-optimal: minimal allocation for the scoring operation.
402        let n = self.chunks.len();
403        let mut scores = vec![0.0f64; n];
404        let mut touched = Vec::with_capacity(n.min(256));
405
406        for token in &query_tokens {
407            let lower = token.to_lowercase();
408            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
409            if df == 0.0 {
410                continue;
411            }
412
413            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
414
415            if let Some(postings) = self.inverted.get(&lower) {
416                for &(idx, weight) in postings {
417                    let doc_len = self.chunks[idx].token_count as f64;
418                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
419                    let bm25 = idf * (weight * (BM25_K1 + 1.0))
420                        / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
421
422                    if scores[idx] == 0.0 {
423                        touched.push(idx);
424                    }
425                    scores[idx] += bm25;
426                }
427            }
428        }
429
430        let mut results: Vec<SearchResult> = touched
431            .iter()
432            .filter(|&&idx| scores[idx] > 0.0)
433            .map(|&idx| {
434                let chunk = &self.chunks[idx];
435                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
436                SearchResult {
437                    chunk_idx: idx,
438                    score: scores[idx],
439                    file_path: chunk.file_path.clone(),
440                    symbol_name: chunk.symbol_name.clone(),
441                    kind: chunk.kind.clone(),
442                    start_line: chunk.start_line,
443                    end_line: chunk.end_line,
444                    snippet,
445                }
446            })
447            .collect();
448
449        results.sort_by(|a, b| {
450            b.score
451                .partial_cmp(&a.score)
452                .unwrap_or(std::cmp::Ordering::Equal)
453                .then_with(|| a.file_path.cmp(&b.file_path))
454                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
455                .then_with(|| a.start_line.cmp(&b.start_line))
456                .then_with(|| a.end_line.cmp(&b.end_line))
457        });
458        results.truncate(top_k);
459        results
460    }
461
    /// Serializes the index with bincode, compresses it with zstd and writes it
    /// into the index directory for `root`.
    ///
    /// Returns `SaveOutcome::SkippedTooLarge` without writing the index file
    /// when the compressed size exceeds `max_bm25_cache_bytes()`, otherwise
    /// `SaveOutcome::Persisted`. The data is written to a `.tmp` sibling and
    /// then renamed into place. Legacy `bm25_index.bin` / `bm25_index.json`
    /// files are removed and `project_root.txt` is rewritten; failures of
    /// those last steps are ignored.
    ///
    /// # Errors
    /// Fails if the directory cannot be created, if serialization or
    /// compression fails, or if the temp-file write or the rename fails.
    pub fn save(&self, root: &Path) -> std::io::Result<SaveOutcome> {
        if self.chunks.len() > CHUNK_COUNT_WARNING {
            tracing::warn!(
                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
                self.chunks.len(),
                CHUNK_COUNT_WARNING
            );
        }

        let dir = index_dir(root);
        std::fs::create_dir_all(&dir)?;
        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
            .map_err(|e| std::io::Error::other(e.to_string()))?;

        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
        let compressed_bytes = compressed.len() as u64;

        let max_bytes = max_bm25_cache_bytes();
        if compressed_bytes > max_bytes {
            // Do NOT pretend success: a silent `Ok(())` here made `load` return
            // `None` forever and the index rebuild on every call (issue #249).
            // Report the refusal so the orchestrator can record an actionable
            // note and the agent-facing tools can stop claiming the index will
            // be "ready next call".
            tracing::warn!(
                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
                compressed_bytes as f64 / 1_048_576.0,
                max_bytes / (1024 * 1024),
                dir.display()
            );
            return Ok(SaveOutcome::SkippedTooLarge {
                compressed_bytes,
                limit_bytes: max_bytes,
            });
        }

        tracing::info!(
            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
            data.len() as f64 / 1_048_576.0,
            compressed_bytes as f64 / 1_048_576.0,
            (1.0 - compressed_bytes as f64 / data.len().max(1) as f64) * 100.0
        );

        // Write to a temp file, then rename it over the target.
        let target = dir.join("bm25_index.bin.zst");
        let tmp = dir.join("bm25_index.bin.zst.tmp");
        std::fs::write(&tmp, &compressed)?;
        std::fs::rename(&tmp, &target)?;

        // Best-effort cleanup of the legacy formats.
        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
        let _ = std::fs::remove_file(dir.join("bm25_index.json"));

        // Best-effort record of which project root this index belongs to.
        let _ = std::fs::write(
            dir.join("project_root.txt"),
            root.to_string_lossy().as_bytes(),
        );

        Ok(SaveOutcome::Persisted { compressed_bytes })
    }
521
522    pub fn load(root: &Path) -> Option<Self> {
523        let dir = index_dir(root);
524        let max_bytes = max_bm25_cache_bytes();
525
526        let zst_path = dir.join("bm25_index.bin.zst");
527        if zst_path.exists() {
528            let meta = std::fs::metadata(&zst_path).ok()?;
529            if meta.len() > max_bytes {
530                tracing::warn!(
531                    "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
532                    meta.len() as f64 / 1_073_741_824.0,
533                    max_bytes / (1024 * 1024),
534                    zst_path.display()
535                );
536                let quarantined = zst_path.with_extension("zst.quarantined");
537                let _ = std::fs::rename(&zst_path, &quarantined);
538                return None;
539            }
540            let compressed = std::fs::read(&zst_path).ok()?;
541            let max_decompressed = max_bytes * 20; // allow 20x expansion ratio
542            let data = bounded_zstd_decode(&compressed, max_decompressed)?;
543            let (idx, _): (Self, _) =
544                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
545            return Some(idx);
546        }
547
548        let bin_path = dir.join("bm25_index.bin");
549        if bin_path.exists() {
550            let meta = std::fs::metadata(&bin_path).ok()?;
551            if meta.len() > max_bytes {
552                tracing::warn!(
553                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
554                    meta.len() as f64 / 1_073_741_824.0,
555                    max_bytes / (1024 * 1024),
556                    bin_path.display()
557                );
558                let quarantined = bin_path.with_extension("bin.quarantined");
559                let _ = std::fs::rename(&bin_path, &quarantined);
560                return None;
561            }
562            let data = std::fs::read(&bin_path).ok()?;
563            let (idx, _): (Self, _) =
564                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
565            // Auto-migrate: compress legacy .bin to .bin.zst
566            if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
567                let zst_tmp = zst_path.with_extension("zst.tmp");
568                if std::fs::write(&zst_tmp, &compressed).is_ok()
569                    && std::fs::rename(&zst_tmp, &zst_path).is_ok()
570                {
571                    tracing::info!(
572                        "[bm25] migrated {:.1} MB → {:.1} MB zstd",
573                        data.len() as f64 / 1_048_576.0,
574                        compressed.len() as f64 / 1_048_576.0
575                    );
576                    let _ = std::fs::remove_file(&bin_path);
577                }
578            }
579            return Some(idx);
580        }
581
582        let json_path = dir.join("bm25_index.json");
583        if json_path.exists() {
584            let meta = std::fs::metadata(&json_path).ok()?;
585            if meta.len() > max_bytes {
586                tracing::warn!(
587                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
588                    meta.len() as f64 / 1_073_741_824.0,
589                    max_bytes / (1024 * 1024),
590                    json_path.display()
591                );
592                let quarantined = json_path.with_extension("json.quarantined");
593                let _ = std::fs::rename(&json_path, &quarantined);
594                return None;
595            }
596            let data = std::fs::read_to_string(&json_path).ok()?;
597            return serde_json::from_str(&data).ok();
598        }
599
600        None
601    }
602
    /// Returns the persisted index for `root` when it loads and passes the
    /// full staleness check; otherwise builds or rebuilds it and attempts to
    /// persist the result.
    pub fn load_or_build(root: &Path) -> Self {
        Self::load_or_build_inner(root, false)
    }
606
    /// Like `load_or_build` but uses a fast sentinel-sampling staleness check
    /// that skips the expensive full directory walk for new-file detection.
    ///
    /// Because only a sample of tracked files is examined, a change outside
    /// the sample is not detected by this call.
    pub fn load_or_build_fast(root: &Path) -> Self {
        Self::load_or_build_inner(root, true)
    }
612
613    fn load_or_build_inner(root: &Path, fast_stale: bool) -> Self {
614        if !is_safe_bm25_root(root) {
615            return Self::default();
616        }
617        if let Some(idx) = Self::load(root) {
618            let stale = if fast_stale {
619                bm25_index_looks_stale_fast(&idx, root)
620            } else {
621                bm25_index_looks_stale(&idx, root)
622            };
623            if !stale {
624                return idx;
625            }
626            tracing::debug!(
627                "[bm25_index: stale index detected for {}; rebuilding]",
628                root.display()
629            );
630            let rebuilt = if idx.files.is_empty() {
631                Self::build_from_directory(root)
632            } else {
633                Self::rebuild_incremental(root, &idx)
634            };
635            let _ = rebuilt.save(root);
636            return rebuilt;
637        }
638
639        let built = Self::build_from_directory(root);
640        let _ = built.save(root);
641        built
642    }
643
644    pub fn index_file_path(root: &Path) -> PathBuf {
645        let dir = index_dir(root);
646        let zst = dir.join("bm25_index.bin.zst");
647        if zst.exists() {
648            return zst;
649        }
650        let bin = dir.join("bm25_index.bin");
651        if bin.exists() {
652            return bin;
653        }
654        dir.join("bm25_index.json")
655    }
656
657    /// Ingest external `ContentChunk`s into the BM25 index.
658    /// Converts each chunk to a `CodeChunk` (backward-compatible) and
659    /// rebuilds the inverted index. Returns the number of chunks ingested.
660    pub fn ingest_content_chunks(
661        &mut self,
662        chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
663    ) -> usize {
664        let mut count = 0usize;
665        for cc in chunks {
666            self.add_chunk(cc.into());
667            count += 1;
668        }
669        if count > 0 {
670            self.finalize();
671        }
672        count
673    }
674
675    /// Number of chunks originating from external providers.
676    pub fn external_chunk_count(&self) -> usize {
677        self.chunks
678            .iter()
679            .filter(|c| c.file_path.contains("://"))
680            .count()
681    }
682}
683
/// Whether `root` may be scanned: delegates to
/// `graph_index::is_safe_scan_root_public` with the lossy string form of the path.
fn is_safe_bm25_root(root: &Path) -> bool {
    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
}
687
/// Full staleness check: examines every tracked file and walks `root` to
/// detect files the index does not know about.
fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, false)
}
691
/// Fast staleness check: samples a subset of tracked files and skips the
/// expensive `list_code_files()` walk for new-file detection.
///
/// Returns `true` as soon as a sampled file is missing or its size/mtime
/// differs from the recorded state.
pub fn bm25_index_looks_stale_fast(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, true)
}
697
698fn bm25_index_looks_stale_inner(index: &BM25Index, root: &Path, fast: bool) -> bool {
699    if index.chunks.is_empty() {
700        return false;
701    }
702
703    if index.files.is_empty() {
704        let mut seen = std::collections::HashSet::<&str>::new();
705        for chunk in &index.chunks {
706            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
707            if rel.is_empty() {
708                continue;
709            }
710            if !seen.insert(rel) {
711                continue;
712            }
713            if !root.join(rel).exists() {
714                return true;
715            }
716        }
717        return false;
718    }
719
720    if fast {
721        let sample_size = index.files.len().min(SENTINEL_SAMPLE_SIZE);
722        let step = if index.files.len() > sample_size {
723            index.files.len() / sample_size
724        } else {
725            1
726        };
727        for (i, (rel, old_state)) in index.files.iter().enumerate() {
728            if i % step != 0 {
729                continue;
730            }
731            let abs = root.join(rel);
732            if !abs.exists() {
733                return true;
734            }
735            let Some(cur) = IndexedFileState::from_path(&abs) else {
736                return true;
737            };
738            if &cur != old_state {
739                return true;
740            }
741        }
742        return false;
743    }
744
745    for (rel, old_state) in &index.files {
746        let abs = root.join(rel);
747        if !abs.exists() {
748            return true;
749        }
750        let Some(cur) = IndexedFileState::from_path(&abs) else {
751            return true;
752        };
753        if &cur != old_state {
754            return true;
755        }
756    }
757
758    for rel in list_code_files(root) {
759        if !index.files.contains_key(&rel) {
760            return true;
761        }
762    }
763
764    false
765}
766
/// Target sample size from which the fast staleness check derives its
/// sampling stride over the tracked files.
const SENTINEL_SAMPLE_SIZE: usize = 10;
768
769fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
770    use std::io::Read;
771    let mut decoder = zstd::Decoder::new(compressed).ok()?;
772    let mut buf = Vec::new();
773    let mut chunk = vec![0u8; 65536];
774    let mut total = 0u64;
775    loop {
776        let n = decoder.read(&mut chunk).ok()?;
777        if n == 0 {
778            break;
779        }
780        total += n as u64;
781        if total > max_bytes {
782            tracing::warn!(
783                "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
784                total as f64 / (1024.0 * 1024.0),
785                max_bytes as f64 / (1024.0 * 1024.0)
786            );
787            return None;
788        }
789        buf.extend_from_slice(&chunk[..n]);
790    }
791    Some(buf)
792}
793
/// Directory holding the persisted index files for `root`; delegates to
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
797
798fn list_code_files(root: &Path) -> Vec<String> {
799    let walker = ignore::WalkBuilder::new(root)
800        .hidden(true)
801        .git_ignore(true)
802        .git_global(true)
803        .git_exclude(true)
804        .max_depth(Some(20))
805        .build();
806
807    let cfg = crate::core::config::Config::load();
808    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
809        .iter()
810        .filter_map(|p| glob::Pattern::new(p).ok())
811        .collect();
812    ignore_patterns.extend(
813        cfg.extra_ignore_patterns
814            .iter()
815            .filter_map(|p| glob::Pattern::new(p).ok()),
816    );
817
818    let mut files: Vec<String> = Vec::new();
819    for entry in walker.flatten() {
820        let path = entry.path();
821        if !path.is_file() {
822            continue;
823        }
824        if !is_code_file(path) {
825            continue;
826        }
827        let rel = path
828            .strip_prefix(root)
829            .unwrap_or(path)
830            .to_string_lossy()
831            .to_string();
832        if rel.is_empty() {
833            continue;
834        }
835        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
836            continue;
837        }
838        if files.len() >= MAX_BM25_FILES {
839            tracing::warn!(
840                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
841                root.display()
842            );
843            break;
844        }
845        files.push(rel);
846    }
847
848    files.sort();
849    files.dedup();
850    files
851}
852
/// Returns `true` when the extension of `path`, compared case-insensitively,
/// is one of the recognised source-code extensions.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are not
/// code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    CODE_EXTENSIONS.contains(&ext.to_lowercase().as_str())
}
lean_ctx/core/bm25_index/mod.rs

lean_ctx/core/bm25_index/
mod.rs