Skip to main content

lean_ctx/core/bm25_index/
mod.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6mod chunking;
7pub use chunking::*;
8#[cfg(test)]
9mod tests;
10
/// Upper bound on how many files `list_code_files` collects for one index;
/// the walk stops (with a warning) once this many files have been accepted.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning suggesting
/// `extra_ignore_patterns`. The save itself still proceeds.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// Compression level handed to `zstd::encode_all` when persisting the index.
const ZSTD_LEVEL: i32 = 9;
14
/// Built-in glob patterns excluded from BM25 indexing.
///
/// `list_code_files` compiles each entry with `glob::Pattern` and matches it
/// against the root-relative path of every candidate file, in addition to any
/// `extra_ignore_patterns` from the configuration. The entries cover vendored
/// dependencies, build output, framework caches and minified/bundled assets.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
31
32fn max_bm25_cache_bytes() -> u64 {
33    // Single source of truth: `Config::bm25_max_cache_mb_effective` (env override
34    // › explicit config › disk-budget › generous default). Decoupled from the RAM
35    // profile so large repos persist instead of rebuilding forever (issue #249).
36    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
37        .ok()
38        .and_then(|v| v.parse::<u64>().ok())
39        .unwrap_or_else(|| crate::core::config::Config::load().bm25_max_cache_mb_effective());
40    mb * 1024 * 1024
41}
42
/// Effective on-disk ceiling (bytes) for the persisted BM25 index. Single source
/// of truth shared with `doctor` so its "oversized index" warning matches what
/// `save`/`load` actually enforce.
///
/// Thin public wrapper around the private `max_bm25_cache_bytes`, so the value
/// is re-evaluated (environment / configuration lookup) on every call.
pub fn persist_ceiling_bytes() -> u64 {
    max_bm25_cache_bytes()
}
49
/// Outcome of persisting a BM25 index to disk. Distinguishes a real write from a
/// size-capped refusal so callers never mistake "refused to persist" for
/// success (the bug behind the perpetual "index warming" report, issue #249).
///
/// `BM25Index::save` returns this inside `Ok(..)`; genuine I/O or encoding
/// failures are reported through the `Err` side of its `std::io::Result`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SaveOutcome {
    /// Written to disk. Carries the compressed (zstd) size in bytes.
    Persisted { compressed_bytes: u64 },
    /// Built fine but NOT written — the compressed size exceeds the disk
    /// ceiling. The in-memory index is still usable for this process; callers
    /// should surface the remedy (raise the cap / add ignore patterns) instead
    /// of silently rebuilding on every call.
    SkippedTooLarge {
        /// Size the index would have taken on disk after zstd compression.
        compressed_bytes: u64,
        /// The ceiling that was exceeded, in bytes.
        limit_bytes: u64,
    },
}
66
/// One searchable unit of the index together with the metadata needed to
/// locate and display it.
///
/// The struct is serialized as part of the persisted index, so field order
/// and types are part of the on-disk format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the originating file. Incremental rebuilds look chunks up by
    /// the root-relative path, and `external_chunk_count` treats values
    /// containing `://` as external-provider chunks.
    pub file_path: String,
    /// Name of the symbol the chunk covers; also a tie-breaker when sorting.
    pub symbol_name: String,
    /// Category of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk in its source.
    /// NOTE(review): the line base (0 or 1) is decided by the chunker — confirm.
    pub start_line: usize,
    /// Last line of the chunk in its source (same base as `start_line`).
    pub end_line: usize,
    /// Text of the chunk; its first five lines become the search snippet.
    pub content: String,
    /// Token list. `BM25Index::add_chunk` stores chunks with this emptied, and
    /// `#[serde(default)]` lets deserialization fall back to an empty list.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens used for BM25 length normalisation; `add_chunk`
    /// overwrites it with the token count of the enriched chunk text.
    pub token_count: usize,
}
79
/// Category of a [`CodeChunk`]: either a source-code construct or a kind of
/// document pulled in from an external provider.
///
/// NOTE(review): this enum is persisted with the index; reordering or removing
/// variants presumably changes the serialized encoding — confirm before editing.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    // -- Source-code kinds --
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    /// Code that does not fit any of the kinds above.
    Other,
    // -- External source kinds (Context Engine) --
    Issue,
    PullRequest,
    WikiPage,
    DbSchema,
    ApiEndpoint,
    Ticket,
    /// External content that does not fit any of the kinds above.
    ExternalOther,
}
98
/// Cheap change-detection fingerprint of a file at the time it was indexed.
/// Two states compare equal only when both modification time and size match.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Last-modified time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File length in bytes as reported by the filesystem metadata.
    pub size_bytes: u64,
}
104
105impl IndexedFileState {
106    fn from_path(path: &Path) -> Option<Self> {
107        let meta = path.metadata().ok()?;
108        let size_bytes = meta.len();
109        let mtime_ms = meta
110            .modified()
111            .ok()
112            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
113            .map(|d| d.as_millis() as u64)?;
114        Some(Self {
115            mtime_ms,
116            size_bytes,
117        })
118    }
119}
120
/// In-memory BM25 search index over [`CodeChunk`]s, serializable for
/// persistence. Field order is part of the persisted layout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; a chunk's position is the document id used in
    /// `inverted`.
    pub chunks: Vec<CodeChunk>,
    /// Lower-cased term → postings, each a `(chunk index, weight)` pair.
    /// `search` uses the weight as the term-frequency input of the BM25 formula.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, as of the last `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks, as of the last `finalize`.
    pub doc_count: usize,
    /// Lower-cased term → number of distinct chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Root-relative path → file state recorded when the file was indexed.
    /// Defaults to empty on deserialization; an empty map makes
    /// `load_or_build` fall back to a full rebuild when the index is stale.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
131
/// A single hit returned by [`BM25Index::search`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// BM25 score summed over all matching query terms; higher is better.
    pub score: f64,
    /// Copied from the matching chunk's `file_path`.
    pub file_path: String,
    /// Copied from the matching chunk's `symbol_name`.
    pub symbol_name: String,
    /// Copied from the matching chunk's `kind`.
    pub kind: ChunkKind,
    /// Copied from the matching chunk's `start_line`.
    pub start_line: usize,
    /// Copied from the matching chunk's `end_line`.
    pub end_line: usize,
    /// Up to the first five lines of the chunk content, joined with `\n`.
    pub snippet: String,
}
143
/// BM25 `k1` parameter used by `BM25Index::search`: scales how strongly the
/// posting weight (term frequency) influences the score.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter used by `BM25Index::search`: strength of the document
/// length normalisation (0 = none, 1 = full).
const BM25_B: f64 = 0.75;
146
147impl Default for BM25Index {
148    fn default() -> Self {
149        Self::new()
150    }
151}
152
153impl BM25Index {
154    pub fn new() -> Self {
155        Self {
156            chunks: Vec::new(),
157            inverted: HashMap::new(),
158            avg_doc_len: 0.0,
159            doc_count: 0,
160            doc_freqs: HashMap::new(),
161            files: HashMap::new(),
162        }
163    }
164
165    /// Approximate heap memory used by this index in bytes.
166    pub fn memory_usage_bytes(&self) -> usize {
167        let chunks_size: usize = self
168            .chunks
169            .iter()
170            .map(|c| {
171                c.content.len()
172                    + c.file_path.len()
173                    + c.symbol_name.len()
174                    + c.tokens.iter().map(String::len).sum::<usize>()
175                    + 64
176            })
177            .sum();
178        let inverted_size: usize = self
179            .inverted
180            .iter()
181            .map(|(k, v)| k.len() + v.len() * 16 + 32)
182            .sum();
183        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
184        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
185        chunks_size + inverted_size + files_size + freqs_size
186    }
187
188    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
189    pub fn unload(&mut self) {
190        let usage = self.memory_usage_bytes();
191        self.chunks = Vec::new();
192        self.inverted = HashMap::new();
193        self.doc_freqs = HashMap::new();
194        self.files = HashMap::new();
195        self.avg_doc_len = 0.0;
196        self.doc_count = 0;
197        tracing::info!(
198            "[bm25] unloaded index, freed ~{:.1}MB",
199            usage as f64 / 1_048_576.0
200        );
201    }
202
203    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
204    #[cfg(test)]
205    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
206        let mut index = Self::new();
207        for mut chunk in chunks {
208            if chunk.token_count == 0 {
209                chunk.token_count = tokenize(&chunk.content).len();
210            }
211            index.add_chunk(chunk);
212        }
213        index.finalize();
214        index
215    }
216
217    pub fn build_from_directory(root: &Path) -> Self {
218        Self::build_from_directory_inner(root, &HashMap::new())
219    }
220
221    /// Like `build_from_directory` but reuses file content from a prior scan
222    /// (e.g. the graph index walk) to avoid redundant disk reads.
223    pub fn build_with_content_hint(root: &Path, content_hint: &HashMap<String, String>) -> Self {
224        Self::build_from_directory_inner(root, content_hint)
225    }
226
227    fn build_from_directory_inner(root: &Path, content_hint: &HashMap<String, String>) -> Self {
228        let root_str = root.to_string_lossy();
229        if !super::graph_index::is_safe_scan_root_public(&root_str) {
230            tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
231            return Self::new();
232        }
233        let mut index = Self::new();
234        let files = list_code_files(root);
235        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
236        let mut cache_hits = 0usize;
237
238        for (i, rel) in files.iter().enumerate() {
239            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
240                tracing::warn!(
241                    "[bm25: stopping build at file {i}/{} due to memory pressure]",
242                    files.len()
243                );
244                break;
245            }
246            if crate::core::memory_guard::abort_requested() {
247                tracing::warn!("[bm25: aborting build due to critical memory pressure]");
248                break;
249            }
250
251            let abs = root.join(rel);
252            let Some(state) = IndexedFileState::from_path(&abs) else {
253                continue;
254            };
255            if state.size_bytes > MAX_FILE_SIZE_BYTES {
256                continue;
257            }
258
259            // Content sources, cheapest first: an explicit per-build hint, then
260            // the shared resident content cache (populated by the search-index
261            // build / ctx_search, issue #148) validated by `(mtime, size)`, then
262            // a one-time disk read that also publishes into the shared cache.
263            let cache_state = crate::core::content_cache::FileState {
264                mtime_ms: state.mtime_ms,
265                size_bytes: state.size_bytes,
266            };
267            let content = if let Some(cached) = content_hint.get(rel) {
268                cache_hits += 1;
269                std::borrow::Cow::Borrowed(cached.as_str())
270            } else if let Some(arc) = crate::core::content_cache::get(&abs, cache_state) {
271                cache_hits += 1;
272                std::borrow::Cow::Owned(arc.to_string())
273            } else {
274                match std::fs::read_to_string(&abs) {
275                    Ok(c) => {
276                        crate::core::content_cache::insert(
277                            &abs,
278                            cache_state,
279                            std::sync::Arc::from(c.as_str()),
280                        );
281                        std::borrow::Cow::Owned(c)
282                    }
283                    Err(_) => continue,
284                }
285            };
286
287            let mut chunks = extract_chunks(rel, &content);
288            chunks.sort_by(|a, b| {
289                a.start_line
290                    .cmp(&b.start_line)
291                    .then_with(|| a.end_line.cmp(&b.end_line))
292                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
293            });
294            for chunk in chunks {
295                index.add_chunk(chunk);
296            }
297            index.files.insert(rel.clone(), state);
298        }
299
300        if cache_hits > 0 {
301            tracing::info!(
302                "[bm25: reused {cache_hits}/{} file contents from graph scan cache]",
303                files.len()
304            );
305        }
306
307        index.finalize();
308        index
309    }
310
311    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
312        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
313        for c in &prev.chunks {
314            old_by_file
315                .entry(c.file_path.clone())
316                .or_default()
317                .push(c.clone());
318        }
319        for v in old_by_file.values_mut() {
320            v.sort_by(|a, b| {
321                a.start_line
322                    .cmp(&b.start_line)
323                    .then_with(|| a.end_line.cmp(&b.end_line))
324                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
325            });
326        }
327
328        let mut index = Self::new();
329        let files = list_code_files(root);
330        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
331
332        for (i, rel) in files.iter().enumerate() {
333            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
334                tracing::warn!(
335                    "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
336                    files.len()
337                );
338                break;
339            }
340
341            let abs = root.join(rel);
342            let Some(state) = IndexedFileState::from_path(&abs) else {
343                continue;
344            };
345
346            let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
347            if unchanged {
348                if let Some(chunks) = old_by_file.get(rel) {
349                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
350                        for chunk in chunks {
351                            index.add_chunk(chunk.clone());
352                        }
353                        index.files.insert(rel.clone(), state);
354                        continue;
355                    }
356                }
357            }
358
359            if state.size_bytes > MAX_FILE_SIZE_BYTES {
360                continue;
361            }
362            if let Ok(content) = std::fs::read_to_string(&abs) {
363                let mut chunks = extract_chunks(rel, &content);
364                chunks.sort_by(|a, b| {
365                    a.start_line
366                        .cmp(&b.start_line)
367                        .then_with(|| a.end_line.cmp(&b.end_line))
368                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
369                });
370                for chunk in chunks {
371                    index.add_chunk(chunk);
372                }
373                index.files.insert(rel.clone(), state);
374            }
375        }
376
377        index.finalize();
378        index
379    }
380
381    fn add_chunk(&mut self, chunk: CodeChunk) {
382        let idx = self.chunks.len();
383
384        let enriched = enrich_for_bm25(&chunk);
385        let tokens = tokenize(&enriched);
386        for token in &tokens {
387            let lower = token.to_lowercase();
388            let postings = self.inverted.entry(lower.clone()).or_default();
389            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
390                *self.doc_freqs.entry(lower).or_insert(0) += 1;
391            }
392            postings.push((idx, 1.0));
393        }
394
395        self.chunks.push(CodeChunk {
396            token_count: tokens.len(),
397            tokens: Vec::new(),
398            ..chunk
399        });
400    }
401
402    fn finalize(&mut self) {
403        self.doc_count = self.chunks.len();
404        if self.doc_count == 0 {
405            return;
406        }
407
408        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
409        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
410    }
411
412    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
413        let query_tokens = tokenize(query);
414        if query_tokens.is_empty() || self.doc_count == 0 {
415            return Vec::new();
416        }
417
418        // Pre-allocated score array: O(1) per-access vs HashMap overhead.
419        // Kolmogorov-optimal: minimal allocation for the scoring operation.
420        let n = self.chunks.len();
421        let mut scores = vec![0.0f64; n];
422        let mut touched = Vec::with_capacity(n.min(256));
423
424        for token in &query_tokens {
425            let lower = token.to_lowercase();
426            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
427            if df == 0.0 {
428                continue;
429            }
430
431            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
432
433            if let Some(postings) = self.inverted.get(&lower) {
434                for &(idx, weight) in postings {
435                    let doc_len = self.chunks[idx].token_count as f64;
436                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
437                    let bm25 = idf * (weight * (BM25_K1 + 1.0))
438                        / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
439
440                    if scores[idx] == 0.0 {
441                        touched.push(idx);
442                    }
443                    scores[idx] += bm25;
444                }
445            }
446        }
447
448        let mut results: Vec<SearchResult> = touched
449            .iter()
450            .filter(|&&idx| scores[idx] > 0.0)
451            .map(|&idx| {
452                let chunk = &self.chunks[idx];
453                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
454                SearchResult {
455                    chunk_idx: idx,
456                    score: scores[idx],
457                    file_path: chunk.file_path.clone(),
458                    symbol_name: chunk.symbol_name.clone(),
459                    kind: chunk.kind.clone(),
460                    start_line: chunk.start_line,
461                    end_line: chunk.end_line,
462                    snippet,
463                }
464            })
465            .collect();
466
467        results.sort_by(|a, b| {
468            b.score
469                .partial_cmp(&a.score)
470                .unwrap_or(std::cmp::Ordering::Equal)
471                .then_with(|| a.file_path.cmp(&b.file_path))
472                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
473                .then_with(|| a.start_line.cmp(&b.start_line))
474                .then_with(|| a.end_line.cmp(&b.end_line))
475        });
476        results.truncate(top_k);
477        results
478    }
479
480    pub fn save(&self, root: &Path) -> std::io::Result<SaveOutcome> {
481        if self.chunks.len() > CHUNK_COUNT_WARNING {
482            tracing::warn!(
483                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
484                self.chunks.len(),
485                CHUNK_COUNT_WARNING
486            );
487        }
488
489        let dir = index_dir(root);
490        std::fs::create_dir_all(&dir)?;
491        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
492            .map_err(|e| std::io::Error::other(e.to_string()))?;
493
494        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
495            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
496        let compressed_bytes = compressed.len() as u64;
497
498        let max_bytes = max_bm25_cache_bytes();
499        if compressed_bytes > max_bytes {
500            // Do NOT pretend success: a silent `Ok(())` here made `load` return
501            // `None` forever and the index rebuild on every call (issue #249).
502            // Report the refusal so the orchestrator can record an actionable
503            // note and the agent-facing tools can stop claiming the index will
504            // be "ready next call".
505            tracing::warn!(
506                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
507                compressed_bytes as f64 / 1_048_576.0,
508                max_bytes / (1024 * 1024),
509                dir.display()
510            );
511            return Ok(SaveOutcome::SkippedTooLarge {
512                compressed_bytes,
513                limit_bytes: max_bytes,
514            });
515        }
516
517        tracing::info!(
518            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
519            data.len() as f64 / 1_048_576.0,
520            compressed_bytes as f64 / 1_048_576.0,
521            (1.0 - compressed_bytes as f64 / data.len().max(1) as f64) * 100.0
522        );
523
524        let target = dir.join("bm25_index.bin.zst");
525        let tmp = dir.join("bm25_index.bin.zst.tmp");
526        std::fs::write(&tmp, &compressed)?;
527        std::fs::rename(&tmp, &target)?;
528
529        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
530        let _ = std::fs::remove_file(dir.join("bm25_index.json"));
531
532        let _ = std::fs::write(
533            dir.join("project_root.txt"),
534            root.to_string_lossy().as_bytes(),
535        );
536
537        Ok(SaveOutcome::Persisted { compressed_bytes })
538    }
539
540    pub fn load(root: &Path) -> Option<Self> {
541        let dir = index_dir(root);
542        let max_bytes = max_bm25_cache_bytes();
543
544        let zst_path = dir.join("bm25_index.bin.zst");
545        if zst_path.exists() {
546            let meta = std::fs::metadata(&zst_path).ok()?;
547            if meta.len() > max_bytes {
548                tracing::warn!(
549                    "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
550                    meta.len() as f64 / 1_073_741_824.0,
551                    max_bytes / (1024 * 1024),
552                    zst_path.display()
553                );
554                let quarantined = zst_path.with_extension("zst.quarantined");
555                let _ = std::fs::rename(&zst_path, &quarantined);
556                return None;
557            }
558            let compressed = std::fs::read(&zst_path).ok()?;
559            let max_decompressed = max_bytes * 20; // allow 20x expansion ratio
560            let data = bounded_zstd_decode(&compressed, max_decompressed)?;
561            let (idx, _): (Self, _) =
562                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
563            return Some(idx);
564        }
565
566        let bin_path = dir.join("bm25_index.bin");
567        if bin_path.exists() {
568            let meta = std::fs::metadata(&bin_path).ok()?;
569            if meta.len() > max_bytes {
570                tracing::warn!(
571                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
572                    meta.len() as f64 / 1_073_741_824.0,
573                    max_bytes / (1024 * 1024),
574                    bin_path.display()
575                );
576                let quarantined = bin_path.with_extension("bin.quarantined");
577                let _ = std::fs::rename(&bin_path, &quarantined);
578                return None;
579            }
580            let data = std::fs::read(&bin_path).ok()?;
581            let (idx, _): (Self, _) =
582                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
583            // Auto-migrate: compress legacy .bin to .bin.zst
584            if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
585                let zst_tmp = zst_path.with_extension("zst.tmp");
586                if std::fs::write(&zst_tmp, &compressed).is_ok()
587                    && std::fs::rename(&zst_tmp, &zst_path).is_ok()
588                {
589                    tracing::info!(
590                        "[bm25] migrated {:.1} MB → {:.1} MB zstd",
591                        data.len() as f64 / 1_048_576.0,
592                        compressed.len() as f64 / 1_048_576.0
593                    );
594                    let _ = std::fs::remove_file(&bin_path);
595                }
596            }
597            return Some(idx);
598        }
599
600        let json_path = dir.join("bm25_index.json");
601        if json_path.exists() {
602            let meta = std::fs::metadata(&json_path).ok()?;
603            if meta.len() > max_bytes {
604                tracing::warn!(
605                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
606                    meta.len() as f64 / 1_073_741_824.0,
607                    max_bytes / (1024 * 1024),
608                    json_path.display()
609                );
610                let quarantined = json_path.with_extension("json.quarantined");
611                let _ = std::fs::rename(&json_path, &quarantined);
612                return None;
613            }
614            let data = std::fs::read_to_string(&json_path).ok()?;
615            return serde_json::from_str(&data).ok();
616        }
617
618        None
619    }
620
621    pub fn load_or_build(root: &Path) -> Self {
622        Self::load_or_build_inner(root, false)
623    }
624
625    /// Like `load_or_build` but uses a fast sentinel-sampling staleness check
626    /// that skips the expensive full directory walk for new-file detection.
627    pub fn load_or_build_fast(root: &Path) -> Self {
628        Self::load_or_build_inner(root, true)
629    }
630
631    fn load_or_build_inner(root: &Path, fast_stale: bool) -> Self {
632        if !is_safe_bm25_root(root) {
633            return Self::default();
634        }
635        if let Some(idx) = Self::load(root) {
636            let stale = if fast_stale {
637                bm25_index_looks_stale_fast(&idx, root)
638            } else {
639                bm25_index_looks_stale(&idx, root)
640            };
641            if !stale {
642                return idx;
643            }
644            tracing::debug!(
645                "[bm25_index: stale index detected for {}; rebuilding]",
646                root.display()
647            );
648            let rebuilt = if idx.files.is_empty() {
649                Self::build_from_directory(root)
650            } else {
651                Self::rebuild_incremental(root, &idx)
652            };
653            let _ = rebuilt.save(root);
654            return rebuilt;
655        }
656
657        let built = Self::build_from_directory(root);
658        let _ = built.save(root);
659        built
660    }
661
662    pub fn index_file_path(root: &Path) -> PathBuf {
663        let dir = index_dir(root);
664        let zst = dir.join("bm25_index.bin.zst");
665        if zst.exists() {
666            return zst;
667        }
668        let bin = dir.join("bm25_index.bin");
669        if bin.exists() {
670            return bin;
671        }
672        dir.join("bm25_index.json")
673    }
674
675    /// Ingest external `ContentChunk`s into the BM25 index.
676    /// Converts each chunk to a `CodeChunk` (backward-compatible) and
677    /// rebuilds the inverted index. Returns the number of chunks ingested.
678    pub fn ingest_content_chunks(
679        &mut self,
680        chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
681    ) -> usize {
682        let mut count = 0usize;
683        for cc in chunks {
684            self.add_chunk(cc.into());
685            count += 1;
686        }
687        if count > 0 {
688            self.finalize();
689        }
690        count
691    }
692
693    /// Number of chunks originating from external providers.
694    pub fn external_chunk_count(&self) -> usize {
695        self.chunks
696            .iter()
697            .filter(|c| c.file_path.contains("://"))
698            .count()
699    }
700}
701
702fn is_safe_bm25_root(root: &Path) -> bool {
703    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
704}
705
706fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
707    bm25_index_looks_stale_inner(index, root, false)
708}
709
710/// Fast staleness check: samples a subset of tracked files and skips the
711/// expensive `list_code_files()` walk for new-file detection.
712pub fn bm25_index_looks_stale_fast(index: &BM25Index, root: &Path) -> bool {
713    bm25_index_looks_stale_inner(index, root, true)
714}
715
716fn bm25_index_looks_stale_inner(index: &BM25Index, root: &Path, fast: bool) -> bool {
717    if index.chunks.is_empty() {
718        return false;
719    }
720
721    if index.files.is_empty() {
722        let mut seen = std::collections::HashSet::<&str>::new();
723        for chunk in &index.chunks {
724            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
725            if rel.is_empty() {
726                continue;
727            }
728            if !seen.insert(rel) {
729                continue;
730            }
731            if !root.join(rel).exists() {
732                return true;
733            }
734        }
735        return false;
736    }
737
738    if fast {
739        let sample_size = index.files.len().min(SENTINEL_SAMPLE_SIZE);
740        let step = if index.files.len() > sample_size {
741            index.files.len() / sample_size
742        } else {
743            1
744        };
745        for (i, (rel, old_state)) in index.files.iter().enumerate() {
746            if i % step != 0 {
747                continue;
748            }
749            let abs = root.join(rel);
750            if !abs.exists() {
751                return true;
752            }
753            let Some(cur) = IndexedFileState::from_path(&abs) else {
754                return true;
755            };
756            if &cur != old_state {
757                return true;
758            }
759        }
760        return false;
761    }
762
763    for (rel, old_state) in &index.files {
764        let abs = root.join(rel);
765        if !abs.exists() {
766            return true;
767        }
768        let Some(cur) = IndexedFileState::from_path(&abs) else {
769            return true;
770        };
771        if &cur != old_state {
772            return true;
773        }
774    }
775
776    for rel in list_code_files(root) {
777        if !index.files.contains_key(&rel) {
778            return true;
779        }
780    }
781
782    false
783}
784
/// Target number of tracked files the fast staleness check samples; it sets
/// the stride used when iterating over the tracked files.
const SENTINEL_SAMPLE_SIZE: usize = 10;
786
787fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
788    use std::io::Read;
789    let mut decoder = zstd::Decoder::new(compressed).ok()?;
790    let mut buf = Vec::new();
791    let mut chunk = vec![0u8; 65536];
792    let mut total = 0u64;
793    loop {
794        let n = decoder.read(&mut chunk).ok()?;
795        if n == 0 {
796            break;
797        }
798        total += n as u64;
799        if total > max_bytes {
800            tracing::warn!(
801                "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
802                total as f64 / (1024.0 * 1024.0),
803                max_bytes as f64 / (1024.0 * 1024.0)
804            );
805            return None;
806        }
807        buf.extend_from_slice(&chunk[..n]);
808    }
809    Some(buf)
810}
811
812fn index_dir(root: &Path) -> PathBuf {
813    crate::core::index_namespace::vectors_dir(root)
814}
815
816fn list_code_files(root: &Path) -> Vec<String> {
817    let walker = ignore::WalkBuilder::new(root)
818        .hidden(true)
819        .git_ignore(true)
820        .git_global(true)
821        .git_exclude(true)
822        .max_depth(Some(20))
823        .filter_entry(crate::core::cloud_files::keep_entry)
824        .build();
825
826    let cfg = crate::core::config::Config::load();
827    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
828        .iter()
829        .filter_map(|p| glob::Pattern::new(p).ok())
830        .collect();
831    ignore_patterns.extend(
832        cfg.extra_ignore_patterns
833            .iter()
834            .filter_map(|p| glob::Pattern::new(p).ok()),
835    );
836
837    let mut files: Vec<String> = Vec::new();
838    for entry in walker.flatten() {
839        let path = entry.path();
840        if !path.is_file() {
841            continue;
842        }
843        if !is_code_file(path) {
844            continue;
845        }
846        let rel = path
847            .strip_prefix(root)
848            .unwrap_or(path)
849            .to_string_lossy()
850            .to_string();
851        if rel.is_empty() {
852            continue;
853        }
854        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
855            continue;
856        }
857        if files.len() >= MAX_BM25_FILES {
858            tracing::warn!(
859                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
860                root.display()
861            );
862            break;
863        }
864        files.push(rel);
865    }
866
867    files.sort();
868    files.dedup();
869    files
870}
871
/// Returns `true` when the extension of `path`, compared case-insensitively,
/// is one of the source-code extensions the index understands. Paths without
/// an extension, or with a non-UTF-8 extension, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let normalized = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&normalized.as_str())
}