lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
7const MAX_BM25_FILES: usize = 5000;
8const CHUNK_COUNT_WARNING: usize = 50_000;
9const ZSTD_LEVEL: i32 = 9;
10
11const DEFAULT_BM25_IGNORES: &[&str] = &[
12    "vendor/**",
13    "dist/**",
14    "build/**",
15    "public/vendor/**",
16    "public/js/**",
17    "public/css/**",
18    "public/build/**",
19    ".next/**",
20    ".nuxt/**",
21    "__pycache__/**",
22    "*.min.js",
23    "*.min.css",
24    "*.bundle.js",
25    "*.chunk.js",
26];
27
28fn max_bm25_cache_bytes() -> u64 {
29    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30        .ok()
31        .and_then(|v| v.parse::<u64>().ok())
32        .unwrap_or_else(|| {
33            let cfg = crate::core::config::Config::load();
34            let profile = crate::core::config::MemoryProfile::effective(&cfg);
35            let profile_mb = profile.bm25_max_cache_mb();
36            if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37                profile_mb
38            } else {
39                cfg.bm25_max_cache_mb
40            }
41        });
42    mb * 1024 * 1024
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct CodeChunk {
47    pub file_path: String,
48    pub symbol_name: String,
49    pub kind: ChunkKind,
50    pub start_line: usize,
51    pub end_line: usize,
52    pub content: String,
53    #[serde(default)]
54    pub tokens: Vec<String>,
55    pub token_count: usize,
56}
57
58#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
59pub enum ChunkKind {
60    Function,
61    Struct,
62    Impl,
63    Module,
64    Class,
65    Method,
66    Other,
67}
68
69#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
70pub struct IndexedFileState {
71    pub mtime_ms: u64,
72    pub size_bytes: u64,
73}
74
75impl IndexedFileState {
76    fn from_path(path: &Path) -> Option<Self> {
77        let meta = path.metadata().ok()?;
78        let size_bytes = meta.len();
79        let mtime_ms = meta
80            .modified()
81            .ok()
82            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
83            .map(|d| d.as_millis() as u64)?;
84        Some(Self {
85            mtime_ms,
86            size_bytes,
87        })
88    }
89}
90
91#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct BM25Index {
93    pub chunks: Vec<CodeChunk>,
94    pub inverted: HashMap<String, Vec<(usize, f64)>>,
95    pub avg_doc_len: f64,
96    pub doc_count: usize,
97    pub doc_freqs: HashMap<String, usize>,
98    #[serde(default)]
99    pub files: HashMap<String, IndexedFileState>,
100}
101
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct SearchResult {
104    pub chunk_idx: usize,
105    pub score: f64,
106    pub file_path: String,
107    pub symbol_name: String,
108    pub kind: ChunkKind,
109    pub start_line: usize,
110    pub end_line: usize,
111    pub snippet: String,
112}
113
114const BM25_K1: f64 = 1.2;
115const BM25_B: f64 = 0.75;
116
117impl Default for BM25Index {
118    fn default() -> Self {
119        Self::new()
120    }
121}
122
123impl BM25Index {
124    pub fn new() -> Self {
125        Self {
126            chunks: Vec::new(),
127            inverted: HashMap::new(),
128            avg_doc_len: 0.0,
129            doc_count: 0,
130            doc_freqs: HashMap::new(),
131            files: HashMap::new(),
132        }
133    }
134
135    /// Approximate heap memory used by this index in bytes.
136    pub fn memory_usage_bytes(&self) -> usize {
137        let chunks_size: usize = self
138            .chunks
139            .iter()
140            .map(|c| {
141                c.content.len()
142                    + c.file_path.len()
143                    + c.symbol_name.len()
144                    + c.tokens.iter().map(String::len).sum::<usize>()
145                    + 64
146            })
147            .sum();
148        let inverted_size: usize = self
149            .inverted
150            .iter()
151            .map(|(k, v)| k.len() + v.len() * 16 + 32)
152            .sum();
153        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
154        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
155        chunks_size + inverted_size + files_size + freqs_size
156    }
157
158    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
159    pub fn unload(&mut self) {
160        let usage = self.memory_usage_bytes();
161        self.chunks = Vec::new();
162        self.inverted = HashMap::new();
163        self.doc_freqs = HashMap::new();
164        self.files = HashMap::new();
165        self.avg_doc_len = 0.0;
166        self.doc_count = 0;
167        tracing::info!(
168            "[bm25] unloaded index, freed ~{:.1}MB",
169            usage as f64 / 1_048_576.0
170        );
171    }
172
173    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
174    #[cfg(test)]
175    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
176        let mut index = Self::new();
177        for mut chunk in chunks {
178            if chunk.token_count == 0 {
179                chunk.token_count = tokenize(&chunk.content).len();
180            }
181            index.add_chunk(chunk);
182        }
183        index.finalize();
184        index
185    }
186
187    pub fn build_from_directory(root: &Path) -> Self {
188        let mut index = Self::new();
189        let files = list_code_files(root);
190        for rel in files {
191            let abs = root.join(&rel);
192            let Some(state) = IndexedFileState::from_path(&abs) else {
193                continue;
194            };
195            if let Ok(content) = std::fs::read_to_string(&abs) {
196                let mut chunks = extract_chunks(&rel, &content);
197                chunks.sort_by(|a, b| {
198                    a.start_line
199                        .cmp(&b.start_line)
200                        .then_with(|| a.end_line.cmp(&b.end_line))
201                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
202                });
203                for chunk in chunks {
204                    index.add_chunk(chunk);
205                }
206                index.files.insert(rel, state);
207            }
208        }
209
210        index.finalize();
211        index
212    }
213
214    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
215        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
216        for c in &prev.chunks {
217            old_by_file
218                .entry(c.file_path.clone())
219                .or_default()
220                .push(c.clone());
221        }
222        for v in old_by_file.values_mut() {
223            v.sort_by(|a, b| {
224                a.start_line
225                    .cmp(&b.start_line)
226                    .then_with(|| a.end_line.cmp(&b.end_line))
227                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
228            });
229        }
230
231        let mut index = Self::new();
232        let files = list_code_files(root);
233        for rel in files {
234            let abs = root.join(&rel);
235            let Some(state) = IndexedFileState::from_path(&abs) else {
236                continue;
237            };
238
239            let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
240            if unchanged {
241                if let Some(chunks) = old_by_file.get(&rel) {
242                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
243                        for chunk in chunks {
244                            index.add_chunk(chunk.clone());
245                        }
246                        index.files.insert(rel, state);
247                        continue;
248                    }
249                }
250            }
251
252            if let Ok(content) = std::fs::read_to_string(&abs) {
253                let mut chunks = extract_chunks(&rel, &content);
254                chunks.sort_by(|a, b| {
255                    a.start_line
256                        .cmp(&b.start_line)
257                        .then_with(|| a.end_line.cmp(&b.end_line))
258                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
259                });
260                for chunk in chunks {
261                    index.add_chunk(chunk);
262                }
263                index.files.insert(rel, state);
264            }
265        }
266
267        index.finalize();
268        index
269    }
270
271    fn add_chunk(&mut self, chunk: CodeChunk) {
272        let idx = self.chunks.len();
273
274        let tokens = tokenize(&chunk.content);
275        for token in &tokens {
276            let lower = token.to_lowercase();
277            let postings = self.inverted.entry(lower.clone()).or_default();
278            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
279                *self.doc_freqs.entry(lower).or_insert(0) += 1;
280            }
281            postings.push((idx, 1.0));
282        }
283
284        self.chunks.push(CodeChunk {
285            token_count: tokens.len(),
286            tokens: Vec::new(),
287            ..chunk
288        });
289    }
290
291    fn finalize(&mut self) {
292        self.doc_count = self.chunks.len();
293        if self.doc_count == 0 {
294            return;
295        }
296
297        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
298        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
299    }
300
301    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
302        let query_tokens = tokenize(query);
303        if query_tokens.is_empty() || self.doc_count == 0 {
304            return Vec::new();
305        }
306
307        let mut scores: HashMap<usize, f64> = HashMap::new();
308
309        for token in &query_tokens {
310            let lower = token.to_lowercase();
311            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
312            if df == 0.0 {
313                continue;
314            }
315
316            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
317
318            if let Some(postings) = self.inverted.get(&lower) {
319                let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
320                for (idx, weight) in postings {
321                    *doc_tfs.entry(*idx).or_insert(0.0) += weight;
322                }
323
324                for (doc_idx, tf) in &doc_tfs {
325                    let doc_len = self.chunks[*doc_idx].token_count as f64;
326                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
327                    let bm25 = idf * (tf * (BM25_K1 + 1.0))
328                        / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
329
330                    *scores.entry(*doc_idx).or_insert(0.0) += bm25;
331                }
332            }
333        }
334
335        let mut results: Vec<SearchResult> = scores
336            .into_iter()
337            .map(|(idx, score)| {
338                let chunk = &self.chunks[idx];
339                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
340                SearchResult {
341                    chunk_idx: idx,
342                    score,
343                    file_path: chunk.file_path.clone(),
344                    symbol_name: chunk.symbol_name.clone(),
345                    kind: chunk.kind.clone(),
346                    start_line: chunk.start_line,
347                    end_line: chunk.end_line,
348                    snippet,
349                }
350            })
351            .collect();
352
353        results.sort_by(|a, b| {
354            b.score
355                .partial_cmp(&a.score)
356                .unwrap_or(std::cmp::Ordering::Equal)
357                .then_with(|| a.file_path.cmp(&b.file_path))
358                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
359                .then_with(|| a.start_line.cmp(&b.start_line))
360                .then_with(|| a.end_line.cmp(&b.end_line))
361        });
362        results.truncate(top_k);
363        results
364    }
365
366    pub fn save(&self, root: &Path) -> std::io::Result<()> {
367        if self.chunks.len() > CHUNK_COUNT_WARNING {
368            tracing::warn!(
369                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
370                self.chunks.len(),
371                CHUNK_COUNT_WARNING
372            );
373        }
374
375        let dir = index_dir(root);
376        std::fs::create_dir_all(&dir)?;
377        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
378            .map_err(|e| std::io::Error::other(e.to_string()))?;
379
380        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
381            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
382
383        let max_bytes = max_bm25_cache_bytes();
384        if compressed.len() as u64 > max_bytes {
385            tracing::warn!(
386                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
387                compressed.len() as f64 / 1_048_576.0,
388                max_bytes / (1024 * 1024),
389                dir.display()
390            );
391            return Ok(());
392        }
393
394        tracing::info!(
395            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
396            data.len() as f64 / 1_048_576.0,
397            compressed.len() as f64 / 1_048_576.0,
398            (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
399        );
400
401        let target = dir.join("bm25_index.bin.zst");
402        let tmp = dir.join("bm25_index.bin.zst.tmp");
403        std::fs::write(&tmp, &compressed)?;
404        std::fs::rename(&tmp, &target)?;
405
406        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
407        let _ = std::fs::remove_file(dir.join("bm25_index.json"));
408
409        let _ = std::fs::write(
410            dir.join("project_root.txt"),
411            root.to_string_lossy().as_bytes(),
412        );
413
414        Ok(())
415    }
416
417    pub fn load(root: &Path) -> Option<Self> {
418        let dir = index_dir(root);
419        let max_bytes = max_bm25_cache_bytes();
420
421        let zst_path = dir.join("bm25_index.bin.zst");
422        if zst_path.exists() {
423            let meta = std::fs::metadata(&zst_path).ok()?;
424            if meta.len() > max_bytes {
425                tracing::warn!(
426                    "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
427                    meta.len() as f64 / 1_073_741_824.0,
428                    max_bytes / (1024 * 1024),
429                    zst_path.display()
430                );
431                let quarantined = zst_path.with_extension("zst.quarantined");
432                let _ = std::fs::rename(&zst_path, &quarantined);
433                return None;
434            }
435            let compressed = std::fs::read(&zst_path).ok()?;
436            let data = zstd::decode_all(compressed.as_slice()).ok()?;
437            let (idx, _): (Self, _) =
438                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
439            return Some(idx);
440        }
441
442        let bin_path = dir.join("bm25_index.bin");
443        if bin_path.exists() {
444            let meta = std::fs::metadata(&bin_path).ok()?;
445            if meta.len() > max_bytes {
446                tracing::warn!(
447                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
448                    meta.len() as f64 / 1_073_741_824.0,
449                    max_bytes / (1024 * 1024),
450                    bin_path.display()
451                );
452                let quarantined = bin_path.with_extension("bin.quarantined");
453                let _ = std::fs::rename(&bin_path, &quarantined);
454                return None;
455            }
456            let data = std::fs::read(&bin_path).ok()?;
457            let (idx, _): (Self, _) =
458                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
459            // Auto-migrate: compress legacy .bin to .bin.zst
460            if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
461                let zst_tmp = zst_path.with_extension("zst.tmp");
462                if std::fs::write(&zst_tmp, &compressed).is_ok()
463                    && std::fs::rename(&zst_tmp, &zst_path).is_ok()
464                {
465                    tracing::info!(
466                        "[bm25] migrated {:.1} MB → {:.1} MB zstd",
467                        data.len() as f64 / 1_048_576.0,
468                        compressed.len() as f64 / 1_048_576.0
469                    );
470                    let _ = std::fs::remove_file(&bin_path);
471                }
472            }
473            return Some(idx);
474        }
475
476        let json_path = dir.join("bm25_index.json");
477        if json_path.exists() {
478            let meta = std::fs::metadata(&json_path).ok()?;
479            if meta.len() > max_bytes {
480                tracing::warn!(
481                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
482                    meta.len() as f64 / 1_073_741_824.0,
483                    max_bytes / (1024 * 1024),
484                    json_path.display()
485                );
486                let quarantined = json_path.with_extension("json.quarantined");
487                let _ = std::fs::rename(&json_path, &quarantined);
488                return None;
489            }
490            let data = std::fs::read_to_string(&json_path).ok()?;
491            return serde_json::from_str(&data).ok();
492        }
493
494        None
495    }
496
497    pub fn load_or_build(root: &Path) -> Self {
498        if let Some(idx) = Self::load(root) {
499            if !bm25_index_looks_stale(&idx, root) {
500                return idx;
501            }
502            tracing::warn!(
503                "[bm25_index: stale index detected for {}; rebuilding]",
504                root.display()
505            );
506            let rebuilt = if idx.files.is_empty() {
507                Self::build_from_directory(root)
508            } else {
509                Self::rebuild_incremental(root, &idx)
510            };
511            let _ = rebuilt.save(root);
512            return rebuilt;
513        }
514
515        let built = Self::build_from_directory(root);
516        let _ = built.save(root);
517        built
518    }
519
520    pub fn index_file_path(root: &Path) -> PathBuf {
521        let dir = index_dir(root);
522        let zst = dir.join("bm25_index.bin.zst");
523        if zst.exists() {
524            return zst;
525        }
526        let bin = dir.join("bm25_index.bin");
527        if bin.exists() {
528            return bin;
529        }
530        dir.join("bm25_index.json")
531    }
532}
533
534fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
535    if index.chunks.is_empty() {
536        return false;
537    }
538
539    if index.files.is_empty() {
540        // Legacy index (pre file-state tracking): only detect missing files.
541        let mut seen = std::collections::HashSet::<&str>::new();
542        for chunk in &index.chunks {
543            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
544            if rel.is_empty() {
545                continue;
546            }
547            if !seen.insert(rel) {
548                continue;
549            }
550            if !root.join(rel).exists() {
551                return true;
552            }
553        }
554        return false;
555    }
556
557    // Missing or modified tracked files.
558    for (rel, old_state) in &index.files {
559        let abs = root.join(rel);
560        if !abs.exists() {
561            return true;
562        }
563        let Some(cur) = IndexedFileState::from_path(&abs) else {
564            return true;
565        };
566        if &cur != old_state {
567            return true;
568        }
569    }
570
571    // New files (present on disk but not in index).
572    for rel in list_code_files(root) {
573        if !index.files.contains_key(&rel) {
574            return true;
575        }
576    }
577
578    false
579}
580
581fn index_dir(root: &Path) -> PathBuf {
582    crate::core::index_namespace::vectors_dir(root)
583}
584
585fn list_code_files(root: &Path) -> Vec<String> {
586    let walker = ignore::WalkBuilder::new(root)
587        .hidden(true)
588        .git_ignore(true)
589        .git_global(true)
590        .git_exclude(true)
591        .build();
592
593    let cfg = crate::core::config::Config::load();
594    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
595        .iter()
596        .filter_map(|p| glob::Pattern::new(p).ok())
597        .collect();
598    ignore_patterns.extend(
599        cfg.extra_ignore_patterns
600            .iter()
601            .filter_map(|p| glob::Pattern::new(p).ok()),
602    );
603
604    let mut files: Vec<String> = Vec::new();
605    for entry in walker.flatten() {
606        let path = entry.path();
607        if !path.is_file() {
608            continue;
609        }
610        if !is_code_file(path) {
611            continue;
612        }
613        let rel = path
614            .strip_prefix(root)
615            .unwrap_or(path)
616            .to_string_lossy()
617            .to_string();
618        if rel.is_empty() {
619            continue;
620        }
621        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
622            continue;
623        }
624        if files.len() >= MAX_BM25_FILES {
625            tracing::warn!(
626                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
627                root.display()
628            );
629            break;
630        }
631        files.push(rel);
632    }
633
634    files.sort();
635    files.dedup();
636    files
637}
638
639pub fn is_code_file(path: &Path) -> bool {
640    let ext = path
641        .extension()
642        .and_then(|e| e.to_str())
643        .unwrap_or("")
644        .to_lowercase();
645    matches!(
646        ext.as_str(),
647        "rs" | "ts"
648            | "tsx"
649            | "js"
650            | "jsx"
651            | "py"
652            | "go"
653            | "java"
654            | "c"
655            | "cc"
656            | "cpp"
657            | "h"
658            | "hpp"
659            | "rb"
660            | "cs"
661            | "kt"
662            | "swift"
663            | "php"
664            | "scala"
665            | "sql"
666            | "ex"
667            | "exs"
668            | "zig"
669            | "lua"
670            | "dart"
671            | "vue"
672            | "svelte"
673    )
674}
675
676fn tokenize(text: &str) -> Vec<String> {
677    let mut tokens = Vec::new();
678    let mut current = String::new();
679
680    for ch in text.chars() {
681        if ch.is_alphanumeric() || ch == '_' {
682            current.push(ch);
683        } else {
684            if current.len() >= 2 {
685                tokens.push(current.clone());
686            }
687            current.clear();
688        }
689    }
690    if current.len() >= 2 {
691        tokens.push(current);
692    }
693
694    split_camel_case_tokens(&tokens)
695}
696
697pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
698    tokenize(text)
699}
700
701fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
702    let mut result = Vec::new();
703    for token in tokens {
704        result.push(token.clone());
705        let mut start = 0;
706        let chars: Vec<char> = token.chars().collect();
707        for i in 1..chars.len() {
708            if chars[i].is_uppercase() && (i + 1 >= chars.len() || !chars[i + 1].is_uppercase()) {
709                let part: String = chars[start..i].iter().collect();
710                if part.len() >= 2 {
711                    result.push(part);
712                }
713                start = i;
714            }
715        }
716        if start > 0 {
717            let part: String = chars[start..].iter().collect();
718            if part.len() >= 2 {
719                result.push(part);
720            }
721        }
722    }
723    result
724}
725
726fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
727    #[cfg(feature = "tree-sitter")]
728    {
729        let ext = std::path::Path::new(file_path)
730            .extension()
731            .and_then(|e| e.to_str())
732            .unwrap_or("");
733        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
734            return chunks;
735        }
736    }
737
738    let lines: Vec<&str> = content.lines().collect();
739    if lines.is_empty() {
740        return Vec::new();
741    }
742
743    let mut chunks = Vec::new();
744    let mut i = 0;
745
746    while i < lines.len() {
747        let trimmed = lines[i].trim();
748
749        if let Some((name, kind)) = detect_symbol(trimmed) {
750            let start = i;
751            let end = find_block_end(&lines, i);
752            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
753            let token_count = tokenize(&block).len();
754
755            chunks.push(CodeChunk {
756                file_path: file_path.to_string(),
757                symbol_name: name,
758                kind,
759                start_line: start + 1,
760                end_line: end + 1,
761                content: block,
762                tokens: Vec::new(),
763                token_count,
764            });
765
766            i = end + 1;
767        } else {
768            i += 1;
769        }
770    }
771
772    if chunks.is_empty() && !content.is_empty() {
773        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
774        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
775        //
776        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
777        let bytes = content.as_bytes();
778        let rk_chunks = crate::core::rabin_karp::chunk(content);
779        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
780            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
781                let end = (c.offset + c.length).min(bytes.len());
782                let slice = &bytes[c.offset..end];
783                let chunk_text = String::from_utf8_lossy(slice).into_owned();
784                let token_count = tokenize(&chunk_text).len();
785                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
786                let end_line = start_line + bytecount::count(slice, b'\n');
787                chunks.push(CodeChunk {
788                    file_path: file_path.to_string(),
789                    symbol_name: format!("{file_path}#chunk-{idx}"),
790                    kind: ChunkKind::Module,
791                    start_line,
792                    end_line: end_line.max(start_line),
793                    content: chunk_text,
794                    tokens: Vec::new(),
795                    token_count,
796                });
797            }
798        } else {
799            let token_count = tokenize(content).len();
800            let snippet = lines
801                .iter()
802                .take(50)
803                .copied()
804                .collect::<Vec<_>>()
805                .join("\n");
806            chunks.push(CodeChunk {
807                file_path: file_path.to_string(),
808                symbol_name: file_path.to_string(),
809                kind: ChunkKind::Module,
810                start_line: 1,
811                end_line: lines.len(),
812                content: snippet,
813                tokens: Vec::new(),
814                token_count,
815            });
816        }
817    }
818
819    chunks
820}
821
822fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
823    let trimmed = line.trim();
824
825    let patterns: &[(&str, ChunkKind)] = &[
826        ("pub async fn ", ChunkKind::Function),
827        ("async fn ", ChunkKind::Function),
828        ("pub fn ", ChunkKind::Function),
829        ("fn ", ChunkKind::Function),
830        ("pub struct ", ChunkKind::Struct),
831        ("struct ", ChunkKind::Struct),
832        ("pub enum ", ChunkKind::Struct),
833        ("enum ", ChunkKind::Struct),
834        ("impl ", ChunkKind::Impl),
835        ("pub trait ", ChunkKind::Struct),
836        ("trait ", ChunkKind::Struct),
837        ("export function ", ChunkKind::Function),
838        ("export async function ", ChunkKind::Function),
839        ("export default function ", ChunkKind::Function),
840        ("function ", ChunkKind::Function),
841        ("async function ", ChunkKind::Function),
842        ("export class ", ChunkKind::Class),
843        ("class ", ChunkKind::Class),
844        ("export interface ", ChunkKind::Struct),
845        ("interface ", ChunkKind::Struct),
846        ("def ", ChunkKind::Function),
847        ("async def ", ChunkKind::Function),
848        ("class ", ChunkKind::Class),
849        ("func ", ChunkKind::Function),
850    ];
851
852    for (prefix, kind) in patterns {
853        if let Some(rest) = trimmed.strip_prefix(prefix) {
854            let name: String = rest
855                .chars()
856                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
857                .take_while(|c| *c != '<')
858                .collect();
859            if !name.is_empty() {
860                return Some((name, kind.clone()));
861            }
862        }
863    }
864
865    None
866}
867
868fn find_block_end(lines: &[&str], start: usize) -> usize {
869    let mut depth = 0i32;
870    let mut found_open = false;
871
872    for (i, line) in lines.iter().enumerate().skip(start) {
873        for ch in line.chars() {
874            match ch {
875                '{' | '(' if !found_open || depth > 0 => {
876                    depth += 1;
877                    found_open = true;
878                }
879                '}' | ')' if depth > 0 => {
880                    depth -= 1;
881                    if depth == 0 && found_open {
882                        return i;
883                    }
884                }
885                _ => {}
886            }
887        }
888
889        if found_open && depth <= 0 && i > start {
890            return i;
891        }
892
893        if !found_open && i > start + 2 {
894            let trimmed = lines[i].trim();
895            if trimmed.is_empty()
896                || (!trimmed.starts_with(' ') && !trimmed.starts_with('\t') && i > start)
897            {
898                return i.saturating_sub(1);
899            }
900        }
901    }
902
903    (start + 50).min(lines.len().saturating_sub(1))
904}
905
906pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
907    if results.is_empty() {
908        return "No results found.".to_string();
909    }
910
911    let mut out = String::new();
912    for (i, r) in results.iter().enumerate() {
913        if compact {
914            out.push_str(&format!(
915                "{}. {:.2} {}:{}-{} {:?} {}\n",
916                i + 1,
917                r.score,
918                r.file_path,
919                r.start_line,
920                r.end_line,
921                r.kind,
922                r.symbol_name,
923            ));
924        } else {
925            out.push_str(&format!(
926                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
927                i + 1,
928                r.score,
929                r.file_path,
930                r.symbol_name,
931                r.kind,
932                r.start_line,
933                r.end_line,
934                r.snippet,
935            ));
936        }
937    }
938    out
939}
940
941#[cfg(test)]
942mod tests {
943    use super::*;
944    use tempfile::tempdir;
945
946    #[cfg(unix)]
947    use std::os::unix::fs::PermissionsExt;
948
949    #[test]
950    fn tokenize_splits_code() {
951        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
952        assert!(tokens.contains(&"calculate_total".to_string()));
953        assert!(tokens.contains(&"items".to_string()));
954        assert!(tokens.contains(&"Vec".to_string()));
955    }
956
957    #[test]
958    fn camel_case_splitting() {
959        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
960        assert!(tokens.contains(&"calculateTotal".to_string()));
961        assert!(tokens.contains(&"calculate".to_string()));
962        assert!(tokens.contains(&"Total".to_string()));
963    }
964
965    #[test]
966    fn detect_rust_function() {
967        let (name, kind) =
968            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
969        assert_eq!(name, "process_request");
970        assert_eq!(kind, ChunkKind::Function);
971    }
972
973    #[test]
974    fn bm25_search_finds_relevant() {
975        let mut index = BM25Index::new();
976        index.add_chunk(CodeChunk {
977            file_path: "auth.rs".into(),
978            symbol_name: "validate_token".into(),
979            kind: ChunkKind::Function,
980            start_line: 1,
981            end_line: 10,
982            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
983            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
984            token_count: 8,
985        });
986        index.add_chunk(CodeChunk {
987            file_path: "db.rs".into(),
988            symbol_name: "connect_database".into(),
989            kind: ChunkKind::Function,
990            start_line: 1,
991            end_line: 5,
992            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
993            tokens: tokenize("fn connect_database url str Pool create_pool url"),
994            token_count: 7,
995        });
996        index.finalize();
997
998        let results = index.search("jwt token validation", 5);
999        assert!(!results.is_empty());
1000        assert_eq!(results[0].symbol_name, "validate_token");
1001    }
1002
1003    #[test]
1004    fn bm25_search_sorts_ties_deterministically() {
1005        let mut index = BM25Index::new();
1006
1007        // Insert in reverse path order to ensure the sort tie-break matters.
1008        index.add_chunk(CodeChunk {
1009            file_path: "b.rs".into(),
1010            symbol_name: "same".into(),
1011            kind: ChunkKind::Function,
1012            start_line: 1,
1013            end_line: 1,
1014            content: "fn same() {}".into(),
1015            tokens: tokenize("same token"),
1016            token_count: 2,
1017        });
1018        index.add_chunk(CodeChunk {
1019            file_path: "a.rs".into(),
1020            symbol_name: "same".into(),
1021            kind: ChunkKind::Function,
1022            start_line: 1,
1023            end_line: 1,
1024            content: "fn same() {}".into(),
1025            tokens: tokenize("same token"),
1026            token_count: 2,
1027        });
1028        index.finalize();
1029
1030        let results = index.search("same", 10);
1031        assert!(results.len() >= 2);
1032        assert_eq!(results[0].file_path, "a.rs");
1033        assert_eq!(results[1].file_path, "b.rs");
1034    }
1035
1036    #[test]
1037    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
1038        let td = tempdir().expect("tempdir");
1039        let root = td.path();
1040        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
1041
1042        let idx = BM25Index::build_from_directory(root);
1043        assert!(!bm25_index_looks_stale(&idx, root));
1044
1045        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
1046        assert!(bm25_index_looks_stale(&idx, root));
1047    }
1048
1049    #[test]
1050    #[cfg(unix)]
1051    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
1052        let td = tempdir().expect("tempdir");
1053        let root = td.path();
1054
1055        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
1056        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
1057
1058        let idx1 = BM25Index::build_from_directory(root);
1059        assert!(idx1.files.contains_key("a.rs"));
1060        assert!(idx1.files.contains_key("b.rs"));
1061
1062        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
1063        let a_path = root.join("a.rs");
1064        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1065        perms.set_mode(0o000);
1066        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
1067
1068        // Change b.rs (size changes) to force a re-read for that file.
1069        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
1070            .expect("rewrite b.rs");
1071
1072        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
1073        assert!(
1074            idx2.files.contains_key("a.rs"),
1075            "a.rs should be kept via reuse"
1076        );
1077        assert!(idx2.files.contains_key("b.rs"));
1078
1079        let b_has_b2 = idx2
1080            .chunks
1081            .iter()
1082            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
1083        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
1084
1085        // Restore permissions to avoid cleanup surprises.
1086        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1087        perms.set_mode(0o644);
1088        let _ = std::fs::set_permissions(&a_path, perms);
1089    }
1090
1091    #[test]
1092    fn load_quarantines_oversized_index() {
1093        let _env = crate::core::data_dir::test_env_lock();
1094        let td = tempdir().expect("tempdir");
1095        let root = td.path();
1096        let dir = crate::core::index_namespace::vectors_dir(root);
1097        std::fs::create_dir_all(&dir).expect("create vectors dir");
1098
1099        let index_path = dir.join("bm25_index.json");
1100        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1101        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
1102
1103        let result = BM25Index::load(root);
1104        assert!(result.is_none(), "oversized index should return None");
1105        assert!(
1106            !index_path.exists(),
1107            "original index should be removed after quarantine"
1108        );
1109        assert!(
1110            dir.join("bm25_index.json.quarantined").exists(),
1111            "quarantined file should exist"
1112        );
1113
1114        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1115    }
1116
1117    #[test]
1118    fn save_refuses_oversized_output() {
1119        let _env = crate::core::data_dir::test_env_lock();
1120        let data_dir = tempdir().expect("data_dir");
1121        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1122        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1123
1124        let td = tempdir().expect("tempdir");
1125        let root = td.path();
1126
1127        let mut index = BM25Index::new();
1128        index.add_chunk(CodeChunk {
1129            file_path: "a.rs".into(),
1130            symbol_name: "a".into(),
1131            kind: ChunkKind::Function,
1132            start_line: 1,
1133            end_line: 1,
1134            content: "fn a() {}".into(),
1135            tokens: tokenize("fn a"),
1136            token_count: 2,
1137        });
1138        index.finalize();
1139
1140        let _ = index.save(root);
1141        let index_path = BM25Index::index_file_path(root);
1142        assert!(
1143            !index_path.exists(),
1144            "save should refuse to persist oversized index"
1145        );
1146
1147        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1148    }
1149
1150    #[test]
1151    fn save_writes_project_root_marker() {
1152        let _env = crate::core::data_dir::test_env_lock();
1153        let td = tempdir().expect("tempdir");
1154        let root = td.path();
1155        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1156
1157        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1158        let index = BM25Index::build_from_directory(root);
1159        index.save(root).expect("save");
1160
1161        let dir = crate::core::index_namespace::vectors_dir(root);
1162        let marker = dir.join("project_root.txt");
1163        assert!(marker.exists(), "project_root.txt marker should exist");
1164        let content = std::fs::read_to_string(&marker).expect("read marker");
1165        assert_eq!(content, root.to_string_lossy());
1166    }
1167
1168    #[test]
1169    fn save_load_roundtrip_uses_zstd() {
1170        let _env = crate::core::data_dir::test_env_lock();
1171        let data_dir = tempdir().expect("data_dir");
1172        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1173        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1174        let td = tempdir().expect("tempdir");
1175        let root = td.path();
1176
1177        for i in 0..10 {
1178            std::fs::write(
1179                root.join(format!("mod{i}.rs")),
1180                format!(
1181                    "pub fn handler_{i}() {{\n    println!(\"hello\");\n}}\n\n\
1182                     pub fn helper_{i}() {{\n    println!(\"world\");\n}}\n"
1183                ),
1184            )
1185            .expect("write");
1186        }
1187
1188        let index = BM25Index::build_from_directory(root);
1189        assert!(index.doc_count > 0, "should have indexed chunks");
1190        index.save(root).expect("save");
1191
1192        let dir = crate::core::index_namespace::vectors_dir(root);
1193        let zst = dir.join("bm25_index.bin.zst");
1194        assert!(zst.exists(), "should write .bin.zst");
1195        assert!(
1196            !dir.join("bm25_index.bin").exists(),
1197            ".bin should be deleted"
1198        );
1199
1200        let loaded = BM25Index::load(root).expect("load compressed index");
1201        assert_eq!(loaded.doc_count, index.doc_count);
1202        assert_eq!(loaded.chunks.len(), index.chunks.len());
1203
1204        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1205        std::env::remove_var("LEAN_CTX_DATA_DIR");
1206    }
1207
1208    #[test]
1209    fn auto_migrate_bin_to_zst() {
1210        let _env = crate::core::data_dir::test_env_lock();
1211        let data_dir = tempdir().expect("data_dir");
1212        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1213        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1214        let td = tempdir().expect("tempdir");
1215        let root = td.path();
1216
1217        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1218        let index = BM25Index::build_from_directory(root);
1219
1220        let dir = crate::core::index_namespace::vectors_dir(root);
1221        std::fs::create_dir_all(&dir).expect("mkdir");
1222        let data =
1223            bincode::serde::encode_to_vec(&index, bincode::config::standard()).expect("encode");
1224        std::fs::write(dir.join("bm25_index.bin"), &data).expect("write bin");
1225
1226        let loaded = BM25Index::load(root).expect("load should auto-migrate");
1227        assert_eq!(loaded.doc_count, index.doc_count);
1228        assert!(
1229            dir.join("bm25_index.bin.zst").exists(),
1230            ".bin.zst should be created"
1231        );
1232        assert!(
1233            !dir.join("bm25_index.bin").exists(),
1234            ".bin should be removed"
1235        );
1236
1237        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1238        std::env::remove_var("LEAN_CTX_DATA_DIR");
1239    }
1240
1241    #[test]
1242    fn list_code_files_skips_default_vendor_ignores() {
1243        let td = tempdir().expect("tempdir");
1244        let root = td.path();
1245
1246        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1247        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1248        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1249        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1250        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1251
1252        let files = list_code_files(root);
1253        assert!(
1254            files.iter().any(|f| f == "main.rs"),
1255            "main.rs should be included"
1256        );
1257        assert!(
1258            !files.iter().any(|f| f.starts_with("vendor/")),
1259            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1260        );
1261        assert!(
1262            !files.iter().any(|f| f.starts_with("dist/")),
1263            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1264        );
1265    }
1266
1267    #[test]
1268    fn list_code_files_respects_max_files_cap() {
1269        let td = tempdir().expect("tempdir");
1270        let root = td.path();
1271
1272        // Create more files than MAX_BM25_FILES wouldn't let us test easily (5000),
1273        // but we can verify the cap constant exists and the function returns a bounded vec.
1274        for i in 0..10 {
1275            std::fs::write(
1276                root.join(format!("f{i}.rs")),
1277                format!("pub fn f{i}() {{}}\n"),
1278            )
1279            .expect("write");
1280        }
1281        let files = list_code_files(root);
1282        assert!(
1283            files.len() <= MAX_BM25_FILES,
1284            "file count should not exceed MAX_BM25_FILES"
1285        );
1286    }
1287
1288    #[test]
1289    fn max_bm25_cache_bytes_reads_env() {
1290        let _env = crate::core::data_dir::test_env_lock();
1291        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1292        let bytes = max_bm25_cache_bytes();
1293        assert_eq!(bytes, 64 * 1024 * 1024);
1294        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1295    }
1296}
lean_ctx/core/bm25_index.rs

lean_ctx/core/
bm25_index.rs