lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the number of files `list_code_files` collects for one index.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning.
const CHUNK_COUNT_WARNING: usize = 50_000;

/// Glob patterns (matched against root-relative paths) that are always excluded
/// from indexing, before any `extra_ignore_patterns` from the configuration.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    // Dependency and build output directories.
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    // Minified or bundled assets.
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29        .ok()
30        .and_then(|v| v.parse::<u64>().ok())
31        .unwrap_or_else(|| {
32            let cfg = crate::core::config::Config::load();
33            let profile = crate::core::config::MemoryProfile::effective(&cfg);
34            let profile_mb = profile.bm25_max_cache_mb();
35            if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
36                profile_mb
37            } else {
38                cfg.bm25_max_cache_mb
39            }
40        });
41    mb * 1024 * 1024
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct CodeChunk {
46    pub file_path: String,
47    pub symbol_name: String,
48    pub kind: ChunkKind,
49    pub start_line: usize,
50    pub end_line: usize,
51    pub content: String,
52    #[serde(skip_serializing, default)]
53    pub tokens: Vec<String>,
54    pub token_count: usize,
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
58pub enum ChunkKind {
59    Function,
60    Struct,
61    Impl,
62    Module,
63    Class,
64    Method,
65    Other,
66}
67
68#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
69pub struct IndexedFileState {
70    pub mtime_ms: u64,
71    pub size_bytes: u64,
72}
73
74impl IndexedFileState {
75    fn from_path(path: &Path) -> Option<Self> {
76        let meta = path.metadata().ok()?;
77        let size_bytes = meta.len();
78        let mtime_ms = meta
79            .modified()
80            .ok()
81            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
82            .map(|d| d.as_millis() as u64)?;
83        Some(Self {
84            mtime_ms,
85            size_bytes,
86        })
87    }
88}
89
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct BM25Index {
92    pub chunks: Vec<CodeChunk>,
93    pub inverted: HashMap<String, Vec<(usize, f64)>>,
94    pub avg_doc_len: f64,
95    pub doc_count: usize,
96    pub doc_freqs: HashMap<String, usize>,
97    #[serde(default)]
98    pub files: HashMap<String, IndexedFileState>,
99}
100
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct SearchResult {
103    pub chunk_idx: usize,
104    pub score: f64,
105    pub file_path: String,
106    pub symbol_name: String,
107    pub kind: ChunkKind,
108    pub start_line: usize,
109    pub end_line: usize,
110    pub snippet: String,
111}
112
/// BM25 `k1` parameter: controls term-frequency saturation in the scoring formula.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter: controls how strongly scores are normalized by chunk length.
const BM25_B: f64 = 0.75;
115
116impl Default for BM25Index {
117    fn default() -> Self {
118        Self::new()
119    }
120}
121
122impl BM25Index {
123    pub fn new() -> Self {
124        Self {
125            chunks: Vec::new(),
126            inverted: HashMap::new(),
127            avg_doc_len: 0.0,
128            doc_count: 0,
129            doc_freqs: HashMap::new(),
130            files: HashMap::new(),
131        }
132    }
133
134    /// Approximate heap memory used by this index in bytes.
135    pub fn memory_usage_bytes(&self) -> usize {
136        let chunks_size: usize = self
137            .chunks
138            .iter()
139            .map(|c| {
140                c.content.len()
141                    + c.file_path.len()
142                    + c.symbol_name.len()
143                    + c.tokens.iter().map(String::len).sum::<usize>()
144                    + 64
145            })
146            .sum();
147        let inverted_size: usize = self
148            .inverted
149            .iter()
150            .map(|(k, v)| k.len() + v.len() * 16 + 32)
151            .sum();
152        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
153        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
154        chunks_size + inverted_size + files_size + freqs_size
155    }
156
157    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
158    pub fn unload(&mut self) {
159        let usage = self.memory_usage_bytes();
160        self.chunks = Vec::new();
161        self.inverted = HashMap::new();
162        self.doc_freqs = HashMap::new();
163        self.files = HashMap::new();
164        self.avg_doc_len = 0.0;
165        self.doc_count = 0;
166        tracing::info!(
167            "[bm25] unloaded index, freed ~{:.1}MB",
168            usage as f64 / 1_048_576.0
169        );
170    }
171
172    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
173    #[cfg(test)]
174    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
175        let mut index = Self::new();
176        for mut chunk in chunks {
177            if chunk.token_count == 0 {
178                chunk.token_count = tokenize(&chunk.content).len();
179            }
180            index.add_chunk(chunk);
181        }
182        index.finalize();
183        index
184    }
185
186    pub fn build_from_directory(root: &Path) -> Self {
187        let mut index = Self::new();
188        let files = list_code_files(root);
189        for rel in files {
190            let abs = root.join(&rel);
191            let Some(state) = IndexedFileState::from_path(&abs) else {
192                continue;
193            };
194            if let Ok(content) = std::fs::read_to_string(&abs) {
195                let mut chunks = extract_chunks(&rel, &content);
196                chunks.sort_by(|a, b| {
197                    a.start_line
198                        .cmp(&b.start_line)
199                        .then_with(|| a.end_line.cmp(&b.end_line))
200                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
201                });
202                for chunk in chunks {
203                    index.add_chunk(chunk);
204                }
205                index.files.insert(rel, state);
206            }
207        }
208
209        index.finalize();
210        index
211    }
212
213    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
214        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
215        for c in &prev.chunks {
216            old_by_file
217                .entry(c.file_path.clone())
218                .or_default()
219                .push(c.clone());
220        }
221        for v in old_by_file.values_mut() {
222            v.sort_by(|a, b| {
223                a.start_line
224                    .cmp(&b.start_line)
225                    .then_with(|| a.end_line.cmp(&b.end_line))
226                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
227            });
228        }
229
230        let mut index = Self::new();
231        let files = list_code_files(root);
232        for rel in files {
233            let abs = root.join(&rel);
234            let Some(state) = IndexedFileState::from_path(&abs) else {
235                continue;
236            };
237
238            let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
239            if unchanged {
240                if let Some(chunks) = old_by_file.get(&rel) {
241                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
242                        for chunk in chunks {
243                            index.add_chunk(chunk.clone());
244                        }
245                        index.files.insert(rel, state);
246                        continue;
247                    }
248                }
249            }
250
251            if let Ok(content) = std::fs::read_to_string(&abs) {
252                let mut chunks = extract_chunks(&rel, &content);
253                chunks.sort_by(|a, b| {
254                    a.start_line
255                        .cmp(&b.start_line)
256                        .then_with(|| a.end_line.cmp(&b.end_line))
257                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
258                });
259                for chunk in chunks {
260                    index.add_chunk(chunk);
261                }
262                index.files.insert(rel, state);
263            }
264        }
265
266        index.finalize();
267        index
268    }
269
270    fn add_chunk(&mut self, chunk: CodeChunk) {
271        let idx = self.chunks.len();
272
273        let tokens = tokenize(&chunk.content);
274        for token in &tokens {
275            let lower = token.to_lowercase();
276            let postings = self.inverted.entry(lower.clone()).or_default();
277            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
278                *self.doc_freqs.entry(lower).or_insert(0) += 1;
279            }
280            postings.push((idx, 1.0));
281        }
282
283        self.chunks.push(CodeChunk {
284            token_count: tokens.len(),
285            tokens: Vec::new(),
286            ..chunk
287        });
288    }
289
290    fn finalize(&mut self) {
291        self.doc_count = self.chunks.len();
292        if self.doc_count == 0 {
293            return;
294        }
295
296        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
297        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
298    }
299
300    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
301        let query_tokens = tokenize(query);
302        if query_tokens.is_empty() || self.doc_count == 0 {
303            return Vec::new();
304        }
305
306        let mut scores: HashMap<usize, f64> = HashMap::new();
307
308        for token in &query_tokens {
309            let lower = token.to_lowercase();
310            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
311            if df == 0.0 {
312                continue;
313            }
314
315            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
316
317            if let Some(postings) = self.inverted.get(&lower) {
318                let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
319                for (idx, weight) in postings {
320                    *doc_tfs.entry(*idx).or_insert(0.0) += weight;
321                }
322
323                for (doc_idx, tf) in &doc_tfs {
324                    let doc_len = self.chunks[*doc_idx].token_count as f64;
325                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
326                    let bm25 = idf * (tf * (BM25_K1 + 1.0))
327                        / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
328
329                    *scores.entry(*doc_idx).or_insert(0.0) += bm25;
330                }
331            }
332        }
333
334        let mut results: Vec<SearchResult> = scores
335            .into_iter()
336            .map(|(idx, score)| {
337                let chunk = &self.chunks[idx];
338                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
339                SearchResult {
340                    chunk_idx: idx,
341                    score,
342                    file_path: chunk.file_path.clone(),
343                    symbol_name: chunk.symbol_name.clone(),
344                    kind: chunk.kind.clone(),
345                    start_line: chunk.start_line,
346                    end_line: chunk.end_line,
347                    snippet,
348                }
349            })
350            .collect();
351
352        results.sort_by(|a, b| {
353            b.score
354                .partial_cmp(&a.score)
355                .unwrap_or(std::cmp::Ordering::Equal)
356                .then_with(|| a.file_path.cmp(&b.file_path))
357                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
358                .then_with(|| a.start_line.cmp(&b.start_line))
359                .then_with(|| a.end_line.cmp(&b.end_line))
360        });
361        results.truncate(top_k);
362        results
363    }
364
365    pub fn save(&self, root: &Path) -> std::io::Result<()> {
366        if self.chunks.len() > CHUNK_COUNT_WARNING {
367            tracing::warn!(
368                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
369                self.chunks.len(),
370                CHUNK_COUNT_WARNING
371            );
372        }
373
374        let dir = index_dir(root);
375        std::fs::create_dir_all(&dir)?;
376        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
377            .map_err(|e| std::io::Error::other(e.to_string()))?;
378
379        let max_bytes = max_bm25_cache_bytes();
380        if data.len() as u64 > max_bytes {
381            tracing::warn!(
382                "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
383                data.len() as f64 / 1_048_576.0,
384                max_bytes / (1024 * 1024),
385                dir.display()
386            );
387            return Ok(());
388        }
389
390        let target = dir.join("bm25_index.bin");
391        let tmp = dir.join("bm25_index.bin.tmp");
392        std::fs::write(&tmp, &data)?;
393        std::fs::rename(&tmp, &target)?;
394
395        let _ = std::fs::remove_file(dir.join("bm25_index.json"));
396
397        let _ = std::fs::write(
398            dir.join("project_root.txt"),
399            root.to_string_lossy().as_bytes(),
400        );
401
402        Ok(())
403    }
404
405    pub fn load(root: &Path) -> Option<Self> {
406        let dir = index_dir(root);
407        let max_bytes = max_bm25_cache_bytes();
408
409        let bin_path = dir.join("bm25_index.bin");
410        if bin_path.exists() {
411            let meta = std::fs::metadata(&bin_path).ok()?;
412            if meta.len() > max_bytes {
413                tracing::warn!(
414                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
415                    meta.len() as f64 / 1_073_741_824.0,
416                    max_bytes / (1024 * 1024),
417                    bin_path.display()
418                );
419                let quarantined = bin_path.with_extension("bin.quarantined");
420                let _ = std::fs::rename(&bin_path, &quarantined);
421                return None;
422            }
423            let data = std::fs::read(&bin_path).ok()?;
424            let (idx, _): (Self, _) =
425                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
426            return Some(idx);
427        }
428
429        let json_path = dir.join("bm25_index.json");
430        if json_path.exists() {
431            let meta = std::fs::metadata(&json_path).ok()?;
432            if meta.len() > max_bytes {
433                tracing::warn!(
434                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
435                    meta.len() as f64 / 1_073_741_824.0,
436                    max_bytes / (1024 * 1024),
437                    json_path.display()
438                );
439                let quarantined = json_path.with_extension("json.quarantined");
440                let _ = std::fs::rename(&json_path, &quarantined);
441                return None;
442            }
443            let data = std::fs::read_to_string(&json_path).ok()?;
444            return serde_json::from_str(&data).ok();
445        }
446
447        None
448    }
449
450    pub fn load_or_build(root: &Path) -> Self {
451        if let Some(idx) = Self::load(root) {
452            if !bm25_index_looks_stale(&idx, root) {
453                return idx;
454            }
455            tracing::warn!(
456                "[bm25_index: stale index detected for {}; rebuilding]",
457                root.display()
458            );
459            let rebuilt = if idx.files.is_empty() {
460                Self::build_from_directory(root)
461            } else {
462                Self::rebuild_incremental(root, &idx)
463            };
464            let _ = rebuilt.save(root);
465            return rebuilt;
466        }
467
468        let built = Self::build_from_directory(root);
469        let _ = built.save(root);
470        built
471    }
472
473    pub fn index_file_path(root: &Path) -> PathBuf {
474        let dir = index_dir(root);
475        let bin = dir.join("bm25_index.bin");
476        if bin.exists() {
477            return bin;
478        }
479        dir.join("bm25_index.json")
480    }
481}
482
483fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
484    if index.chunks.is_empty() {
485        return false;
486    }
487
488    if index.files.is_empty() {
489        // Legacy index (pre file-state tracking): only detect missing files.
490        let mut seen = std::collections::HashSet::<&str>::new();
491        for chunk in &index.chunks {
492            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
493            if rel.is_empty() {
494                continue;
495            }
496            if !seen.insert(rel) {
497                continue;
498            }
499            if !root.join(rel).exists() {
500                return true;
501            }
502        }
503        return false;
504    }
505
506    // Missing or modified tracked files.
507    for (rel, old_state) in &index.files {
508        let abs = root.join(rel);
509        if !abs.exists() {
510            return true;
511        }
512        let Some(cur) = IndexedFileState::from_path(&abs) else {
513            return true;
514        };
515        if &cur != old_state {
516            return true;
517        }
518    }
519
520    // New files (present on disk but not in index).
521    for rel in list_code_files(root) {
522        if !index.files.contains_key(&rel) {
523            return true;
524        }
525    }
526
527    false
528}
529
530fn index_dir(root: &Path) -> PathBuf {
531    crate::core::index_namespace::vectors_dir(root)
532}
533
534fn list_code_files(root: &Path) -> Vec<String> {
535    let walker = ignore::WalkBuilder::new(root)
536        .hidden(true)
537        .git_ignore(true)
538        .git_global(true)
539        .git_exclude(true)
540        .build();
541
542    let cfg = crate::core::config::Config::load();
543    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
544        .iter()
545        .filter_map(|p| glob::Pattern::new(p).ok())
546        .collect();
547    ignore_patterns.extend(
548        cfg.extra_ignore_patterns
549            .iter()
550            .filter_map(|p| glob::Pattern::new(p).ok()),
551    );
552
553    let mut files: Vec<String> = Vec::new();
554    for entry in walker.flatten() {
555        let path = entry.path();
556        if !path.is_file() {
557            continue;
558        }
559        if !is_code_file(path) {
560            continue;
561        }
562        let rel = path
563            .strip_prefix(root)
564            .unwrap_or(path)
565            .to_string_lossy()
566            .to_string();
567        if rel.is_empty() {
568            continue;
569        }
570        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
571            continue;
572        }
573        if files.len() >= MAX_BM25_FILES {
574            tracing::warn!(
575                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
576                root.display()
577            );
578            break;
579        }
580        files.push(rel);
581    }
582
583    files.sort();
584    files.dedup();
585    files
586}
587
/// Returns `true` when `path` has one of the recognized source-code extensions.
///
/// The extension is compared case-insensitively; paths without an extension, or
/// with one that is not valid UTF-8, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let lowered = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&lowered.as_str())
}
624
625fn tokenize(text: &str) -> Vec<String> {
626    let mut tokens = Vec::new();
627    let mut current = String::new();
628
629    for ch in text.chars() {
630        if ch.is_alphanumeric() || ch == '_' {
631            current.push(ch);
632        } else {
633            if current.len() >= 2 {
634                tokens.push(current.clone());
635            }
636            current.clear();
637        }
638    }
639    if current.len() >= 2 {
640        tokens.push(current);
641    }
642
643    split_camel_case_tokens(&tokens)
644}
645
646pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
647    tokenize(text)
648}
649
/// Expands each token with its camel-case parts.
///
/// Every input token is kept as-is and followed by its parts. A part starts at an
/// uppercase letter that either
///
/// * follows a lowercase letter or a digit (`parseJSON` -> `parse`, `JSON`), or
/// * is followed by a lowercase letter (`HTTPServer` -> `HTTP`, `Server`).
///
/// Uppercase runs such as acronyms and `SCREAMING_SNAKE` names are therefore never
/// broken in the middle. Parts shorter than two bytes are dropped, and a token
/// without any boundary contributes only itself.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let prev = chars[i - 1];
            let after_lower_or_digit = prev.is_lowercase() || prev.is_numeric();
            let before_lower = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if after_lower_or_digit || before_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Only emit the tail when at least one boundary was found; otherwise it
        // would just duplicate the whole token.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }

    result
}
674
675fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
676    #[cfg(feature = "tree-sitter")]
677    {
678        let ext = std::path::Path::new(file_path)
679            .extension()
680            .and_then(|e| e.to_str())
681            .unwrap_or("");
682        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
683            return chunks;
684        }
685    }
686
687    let lines: Vec<&str> = content.lines().collect();
688    if lines.is_empty() {
689        return Vec::new();
690    }
691
692    let mut chunks = Vec::new();
693    let mut i = 0;
694
695    while i < lines.len() {
696        let trimmed = lines[i].trim();
697
698        if let Some((name, kind)) = detect_symbol(trimmed) {
699            let start = i;
700            let end = find_block_end(&lines, i);
701            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
702            let token_count = tokenize(&block).len();
703
704            chunks.push(CodeChunk {
705                file_path: file_path.to_string(),
706                symbol_name: name,
707                kind,
708                start_line: start + 1,
709                end_line: end + 1,
710                content: block,
711                tokens: Vec::new(),
712                token_count,
713            });
714
715            i = end + 1;
716        } else {
717            i += 1;
718        }
719    }
720
721    if chunks.is_empty() && !content.is_empty() {
722        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
723        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
724        //
725        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
726        let bytes = content.as_bytes();
727        let rk_chunks = crate::core::rabin_karp::chunk(content);
728        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
729            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
730                let end = (c.offset + c.length).min(bytes.len());
731                let slice = &bytes[c.offset..end];
732                let chunk_text = String::from_utf8_lossy(slice).into_owned();
733                let token_count = tokenize(&chunk_text).len();
734                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
735                let end_line = start_line + bytecount::count(slice, b'\n');
736                chunks.push(CodeChunk {
737                    file_path: file_path.to_string(),
738                    symbol_name: format!("{file_path}#chunk-{idx}"),
739                    kind: ChunkKind::Module,
740                    start_line,
741                    end_line: end_line.max(start_line),
742                    content: chunk_text,
743                    tokens: Vec::new(),
744                    token_count,
745                });
746            }
747        } else {
748            let token_count = tokenize(content).len();
749            let snippet = lines
750                .iter()
751                .take(50)
752                .copied()
753                .collect::<Vec<_>>()
754                .join("\n");
755            chunks.push(CodeChunk {
756                file_path: file_path.to_string(),
757                symbol_name: file_path.to_string(),
758                kind: ChunkKind::Module,
759                start_line: 1,
760                end_line: lines.len(),
761                content: snippet,
762                tokens: Vec::new(),
763                token_count,
764            });
765        }
766    }
767
768    chunks
769}
770
771fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
772    let trimmed = line.trim();
773
774    let patterns: &[(&str, ChunkKind)] = &[
775        ("pub async fn ", ChunkKind::Function),
776        ("async fn ", ChunkKind::Function),
777        ("pub fn ", ChunkKind::Function),
778        ("fn ", ChunkKind::Function),
779        ("pub struct ", ChunkKind::Struct),
780        ("struct ", ChunkKind::Struct),
781        ("pub enum ", ChunkKind::Struct),
782        ("enum ", ChunkKind::Struct),
783        ("impl ", ChunkKind::Impl),
784        ("pub trait ", ChunkKind::Struct),
785        ("trait ", ChunkKind::Struct),
786        ("export function ", ChunkKind::Function),
787        ("export async function ", ChunkKind::Function),
788        ("export default function ", ChunkKind::Function),
789        ("function ", ChunkKind::Function),
790        ("async function ", ChunkKind::Function),
791        ("export class ", ChunkKind::Class),
792        ("class ", ChunkKind::Class),
793        ("export interface ", ChunkKind::Struct),
794        ("interface ", ChunkKind::Struct),
795        ("def ", ChunkKind::Function),
796        ("async def ", ChunkKind::Function),
797        ("class ", ChunkKind::Class),
798        ("func ", ChunkKind::Function),
799    ];
800
801    for (prefix, kind) in patterns {
802        if let Some(rest) = trimmed.strip_prefix(prefix) {
803            let name: String = rest
804                .chars()
805                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
806                .take_while(|c| *c != '<')
807                .collect();
808            if !name.is_empty() {
809                return Some((name, kind.clone()));
810            }
811        }
812    }
813
814    None
815}
816
/// Finds the index of the last line of the block that starts at `lines[start]`.
///
/// * Brace-delimited blocks end on the line where the first `{` seen from `start`
///   is balanced by its matching `}`. Parentheses never open or close a block, so a
///   parameter list on the signature line does not end the block early.
/// * A header that ends with `;` before any `{` is seen (for example `fn f(&self);`
///   or `struct Unit;`) is a body-less declaration and ends on that line.
/// * If no `{` shows up, the block is treated as indentation-delimited: after a
///   two-line grace window (which leaves room for wrapped headers), it ends just
///   before the first blank line or the first line indented no deeper than the
///   starting line.
///
/// When none of these rules fires, the block is capped at 50 lines after `start`
/// or at the last line, whichever comes first. Braces inside strings or comments
/// are not recognized as such and are counted like any other brace.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    fn indent_width(line: &str) -> usize {
        line.chars().take_while(|c| *c == ' ' || *c == '\t').count()
    }

    let base_indent = lines.get(start).map_or(0, |line| indent_width(line));
    let mut brace_depth = 0usize;
    let mut paren_depth = 0usize;
    let mut found_brace = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    brace_depth += 1;
                    found_brace = true;
                }
                '}' if brace_depth > 0 => {
                    brace_depth -= 1;
                    if brace_depth == 0 {
                        return i;
                    }
                }
                // Parentheses only matter while still looking for the body: they
                // tell whether a (possibly multi-line) header is complete.
                '(' if !found_brace => paren_depth += 1,
                ')' if !found_brace => paren_depth = paren_depth.saturating_sub(1),
                _ => {}
            }
        }

        // Inside a brace-delimited body or an unfinished header: keep scanning.
        if found_brace || paren_depth > 0 {
            continue;
        }

        let trimmed = line.trim();
        if trimmed.ends_with(';') {
            return i;
        }
        if i > start + 2 && (trimmed.is_empty() || indent_width(line) <= base_indent) {
            return i - 1;
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
854
855pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
856    if results.is_empty() {
857        return "No results found.".to_string();
858    }
859
860    let mut out = String::new();
861    for (i, r) in results.iter().enumerate() {
862        if compact {
863            out.push_str(&format!(
864                "{}. {:.2} {}:{}-{} {:?} {}\n",
865                i + 1,
866                r.score,
867                r.file_path,
868                r.start_line,
869                r.end_line,
870                r.kind,
871                r.symbol_name,
872            ));
873        } else {
874            out.push_str(&format!(
875                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
876                i + 1,
877                r.score,
878                r.file_path,
879                r.symbol_name,
880                r.kind,
881                r.start_line,
882                r.end_line,
883                r.snippet,
884            ));
885        }
886    }
887    out
888}
889
890#[cfg(test)]
891mod tests {
892    use super::*;
893    use tempfile::tempdir;
894
895    #[cfg(unix)]
896    use std::os::unix::fs::PermissionsExt;
897
898    #[test]
899    fn tokenize_splits_code() {
900        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
901        assert!(tokens.contains(&"calculate_total".to_string()));
902        assert!(tokens.contains(&"items".to_string()));
903        assert!(tokens.contains(&"Vec".to_string()));
904    }
905
906    #[test]
907    fn camel_case_splitting() {
908        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
909        assert!(tokens.contains(&"calculateTotal".to_string()));
910        assert!(tokens.contains(&"calculate".to_string()));
911        assert!(tokens.contains(&"Total".to_string()));
912    }
913
914    #[test]
915    fn detect_rust_function() {
916        let (name, kind) =
917            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
918        assert_eq!(name, "process_request");
919        assert_eq!(kind, ChunkKind::Function);
920    }
921
922    #[test]
923    fn bm25_search_finds_relevant() {
924        let mut index = BM25Index::new();
925        index.add_chunk(CodeChunk {
926            file_path: "auth.rs".into(),
927            symbol_name: "validate_token".into(),
928            kind: ChunkKind::Function,
929            start_line: 1,
930            end_line: 10,
931            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
932            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
933            token_count: 8,
934        });
935        index.add_chunk(CodeChunk {
936            file_path: "db.rs".into(),
937            symbol_name: "connect_database".into(),
938            kind: ChunkKind::Function,
939            start_line: 1,
940            end_line: 5,
941            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
942            tokens: tokenize("fn connect_database url str Pool create_pool url"),
943            token_count: 7,
944        });
945        index.finalize();
946
947        let results = index.search("jwt token validation", 5);
948        assert!(!results.is_empty());
949        assert_eq!(results[0].symbol_name, "validate_token");
950    }
951
952    #[test]
953    fn bm25_search_sorts_ties_deterministically() {
954        let mut index = BM25Index::new();
955
956        // Insert in reverse path order to ensure the sort tie-break matters.
957        index.add_chunk(CodeChunk {
958            file_path: "b.rs".into(),
959            symbol_name: "same".into(),
960            kind: ChunkKind::Function,
961            start_line: 1,
962            end_line: 1,
963            content: "fn same() {}".into(),
964            tokens: tokenize("same token"),
965            token_count: 2,
966        });
967        index.add_chunk(CodeChunk {
968            file_path: "a.rs".into(),
969            symbol_name: "same".into(),
970            kind: ChunkKind::Function,
971            start_line: 1,
972            end_line: 1,
973            content: "fn same() {}".into(),
974            tokens: tokenize("same token"),
975            token_count: 2,
976        });
977        index.finalize();
978
979        let results = index.search("same", 10);
980        assert!(results.len() >= 2);
981        assert_eq!(results[0].file_path, "a.rs");
982        assert_eq!(results[1].file_path, "b.rs");
983    }
984
985    #[test]
986    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
987        let td = tempdir().expect("tempdir");
988        let root = td.path();
989        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
990
991        let idx = BM25Index::build_from_directory(root);
992        assert!(!bm25_index_looks_stale(&idx, root));
993
994        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
995        assert!(bm25_index_looks_stale(&idx, root));
996    }
997
998    #[test]
999    #[cfg(unix)]
1000    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
1001        let td = tempdir().expect("tempdir");
1002        let root = td.path();
1003
1004        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
1005        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
1006
1007        let idx1 = BM25Index::build_from_directory(root);
1008        assert!(idx1.files.contains_key("a.rs"));
1009        assert!(idx1.files.contains_key("b.rs"));
1010
1011        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
1012        let a_path = root.join("a.rs");
1013        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1014        perms.set_mode(0o000);
1015        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
1016
1017        // Change b.rs (size changes) to force a re-read for that file.
1018        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
1019            .expect("rewrite b.rs");
1020
1021        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
1022        assert!(
1023            idx2.files.contains_key("a.rs"),
1024            "a.rs should be kept via reuse"
1025        );
1026        assert!(idx2.files.contains_key("b.rs"));
1027
1028        let b_has_b2 = idx2
1029            .chunks
1030            .iter()
1031            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
1032        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
1033
1034        // Restore permissions to avoid cleanup surprises.
1035        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1036        perms.set_mode(0o644);
1037        let _ = std::fs::set_permissions(&a_path, perms);
1038    }
1039
1040    #[test]
1041    fn load_quarantines_oversized_index() {
1042        let _env = crate::core::data_dir::test_env_lock();
1043        let td = tempdir().expect("tempdir");
1044        let root = td.path();
1045        let dir = crate::core::index_namespace::vectors_dir(root);
1046        std::fs::create_dir_all(&dir).expect("create vectors dir");
1047
1048        let index_path = dir.join("bm25_index.json");
1049        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1050        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
1051
1052        let result = BM25Index::load(root);
1053        assert!(result.is_none(), "oversized index should return None");
1054        assert!(
1055            !index_path.exists(),
1056            "original index should be removed after quarantine"
1057        );
1058        assert!(
1059            dir.join("bm25_index.json.quarantined").exists(),
1060            "quarantined file should exist"
1061        );
1062
1063        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1064    }
1065
1066    #[test]
1067    fn save_refuses_oversized_output() {
1068        let _env = crate::core::data_dir::test_env_lock();
1069        let data_dir = tempdir().expect("data_dir");
1070        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1071        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1072
1073        let td = tempdir().expect("tempdir");
1074        let root = td.path();
1075
1076        let mut index = BM25Index::new();
1077        index.add_chunk(CodeChunk {
1078            file_path: "a.rs".into(),
1079            symbol_name: "a".into(),
1080            kind: ChunkKind::Function,
1081            start_line: 1,
1082            end_line: 1,
1083            content: "fn a() {}".into(),
1084            tokens: tokenize("fn a"),
1085            token_count: 2,
1086        });
1087        index.finalize();
1088
1089        let _ = index.save(root);
1090        let index_path = BM25Index::index_file_path(root);
1091        assert!(
1092            !index_path.exists(),
1093            "save should refuse to persist oversized index"
1094        );
1095
1096        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1097    }
1098
1099    #[test]
1100    fn save_writes_project_root_marker() {
1101        let _env = crate::core::data_dir::test_env_lock();
1102        let td = tempdir().expect("tempdir");
1103        let root = td.path();
1104        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1105
1106        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1107        let index = BM25Index::build_from_directory(root);
1108        index.save(root).expect("save");
1109
1110        let dir = crate::core::index_namespace::vectors_dir(root);
1111        let marker = dir.join("project_root.txt");
1112        assert!(marker.exists(), "project_root.txt marker should exist");
1113        let content = std::fs::read_to_string(&marker).expect("read marker");
1114        assert_eq!(content, root.to_string_lossy());
1115    }
1116
1117    #[test]
1118    fn list_code_files_skips_default_vendor_ignores() {
1119        let td = tempdir().expect("tempdir");
1120        let root = td.path();
1121
1122        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1123        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1124        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1125        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1126        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1127
1128        let files = list_code_files(root);
1129        assert!(
1130            files.iter().any(|f| f == "main.rs"),
1131            "main.rs should be included"
1132        );
1133        assert!(
1134            !files.iter().any(|f| f.starts_with("vendor/")),
1135            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1136        );
1137        assert!(
1138            !files.iter().any(|f| f.starts_with("dist/")),
1139            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1140        );
1141    }
1142
1143    #[test]
1144    fn list_code_files_respects_max_files_cap() {
1145        let td = tempdir().expect("tempdir");
1146        let root = td.path();
1147
1148        // Create more files than MAX_BM25_FILES wouldn't let us test easily (5000),
1149        // but we can verify the cap constant exists and the function returns a bounded vec.
1150        for i in 0..10 {
1151            std::fs::write(
1152                root.join(format!("f{i}.rs")),
1153                format!("pub fn f{i}() {{}}\n"),
1154            )
1155            .expect("write");
1156        }
1157        let files = list_code_files(root);
1158        assert!(
1159            files.len() <= MAX_BM25_FILES,
1160            "file count should not exceed MAX_BM25_FILES"
1161        );
1162    }
1163
1164    #[test]
1165    fn max_bm25_cache_bytes_reads_env() {
1166        let _env = crate::core::data_dir::test_env_lock();
1167        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1168        let bytes = max_bm25_cache_bytes();
1169        assert_eq!(bytes, 64 * 1024 * 1024);
1170        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1171    }
1172}
lean_ctx/core/bm25_index.rs

lean_ctx/core/
bm25_index.rs