lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the number of files `list_code_files` collects; once reached,
/// the walk stops and a warning is logged.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning suggesting extra
/// ignore patterns. The index is still saved.
const CHUNK_COUNT_WARNING: usize = 50_000;
9
/// Glob patterns that are always excluded from indexing.
///
/// `list_code_files` compiles each entry with `glob::Pattern` and matches it
/// against the root-relative path of every candidate file, in addition to the
/// `extra_ignore_patterns` from the configuration.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28    std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29        .ok()
30        .and_then(|v| v.parse::<u64>().ok())
31        .unwrap_or_else(|| crate::core::config::Config::load().bm25_max_cache_mb)
32        * 1024
33        * 1024
34}
35
/// A region of a source file that is indexed as one BM25 document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the file the chunk was extracted from, as handed to the chunk
    /// extractor (the index builders pass root-relative paths).
    pub file_path: String,
    /// Name of the detected symbol, or a synthetic label for fallback chunks
    /// (`<path>#chunk-<n>` or the file path itself).
    pub symbol_name: String,
    /// Kind of symbol this chunk represents.
    pub kind: ChunkKind,
    /// 1-based number of the first line of the chunk.
    pub start_line: usize,
    /// 1-based number of the last line of the chunk.
    pub end_line: usize,
    /// Text stored for display. For the whole-file fallback chunk this holds
    /// only the first 50 lines, while `tokens` still covers the full file.
    pub content: String,
    /// Tokens produced by `tokenize`; lower-cased when added to the index.
    pub tokens: Vec<String>,
    /// Document length used for BM25 length normalisation; the extractor in
    /// this file sets it to `tokens.len()`.
    pub token_count: usize,
}
47
/// Coarse classification of what a [`CodeChunk`] contains.
///
/// The line-based detector in this file produces `Function`, `Struct`
/// (also used for enums, traits and interfaces), `Impl` and `Class`; `Module`
/// is used for fallback chunks of files without detected symbols. `Method`
/// and `Other` are not produced by the code visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
}
58
/// Snapshot of a file's metadata taken when it was indexed.
///
/// Two snapshots are compared for equality to decide whether a file changed
/// since the index was built.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Last-modified time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
64
65impl IndexedFileState {
66    fn from_path(path: &Path) -> Option<Self> {
67        let meta = path.metadata().ok()?;
68        let size_bytes = meta.len();
69        let mtime_ms = meta
70            .modified()
71            .ok()
72            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
73            .map(|d| d.as_millis() as u64)?;
74        Some(Self {
75            mtime_ms,
76            size_bytes,
77        })
78    }
79}
80
/// In-memory BM25 index over code chunks; serialised to JSON for caching.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; a chunk's position in this vector is its id.
    pub chunks: Vec<CodeChunk>,
    /// Lower-cased term -> postings of `(chunk index, weight)`. One posting
    /// with weight `1.0` is stored per token occurrence.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, computed when the index is finalised.
    pub avg_doc_len: f64,
    /// Number of chunks at the time the index was finalised.
    pub doc_count: usize,
    /// Lower-cased term -> number of distinct chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Root-relative path -> metadata snapshot of each indexed file.
    /// Defaults to empty when absent from the serialised form; an empty map
    /// is treated as a legacy index by the staleness check.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
91
/// One ranked hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Index of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score over all query terms (higher is better).
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// First five lines of the chunk's content, joined with `\n`.
    pub snippet: String,
}
103
/// BM25 `k1` parameter: controls how quickly the term-frequency contribution
/// saturates in the scoring formula used by `search`.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter: weight of the document-length normalisation term in
/// the scoring formula used by `search`.
const BM25_B: f64 = 0.75;
106
107impl Default for BM25Index {
108    fn default() -> Self {
109        Self::new()
110    }
111}
112
113impl BM25Index {
114    pub fn new() -> Self {
115        Self {
116            chunks: Vec::new(),
117            inverted: HashMap::new(),
118            avg_doc_len: 0.0,
119            doc_count: 0,
120            doc_freqs: HashMap::new(),
121            files: HashMap::new(),
122        }
123    }
124
125    pub fn build_from_directory(root: &Path) -> Self {
126        let mut index = Self::new();
127        let files = list_code_files(root);
128        for rel in files {
129            let abs = root.join(&rel);
130            let Some(state) = IndexedFileState::from_path(&abs) else {
131                continue;
132            };
133            if let Ok(content) = std::fs::read_to_string(&abs) {
134                let mut chunks = extract_chunks(&rel, &content);
135                chunks.sort_by(|a, b| {
136                    a.start_line
137                        .cmp(&b.start_line)
138                        .then_with(|| a.end_line.cmp(&b.end_line))
139                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
140                });
141                for chunk in chunks {
142                    index.add_chunk(chunk);
143                }
144                index.files.insert(rel, state);
145            }
146        }
147
148        index.finalize();
149        index
150    }
151
152    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
153        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
154        for c in &prev.chunks {
155            old_by_file
156                .entry(c.file_path.clone())
157                .or_default()
158                .push(c.clone());
159        }
160        for v in old_by_file.values_mut() {
161            v.sort_by(|a, b| {
162                a.start_line
163                    .cmp(&b.start_line)
164                    .then_with(|| a.end_line.cmp(&b.end_line))
165                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
166            });
167        }
168
169        let mut index = Self::new();
170        let files = list_code_files(root);
171        for rel in files {
172            let abs = root.join(&rel);
173            let Some(state) = IndexedFileState::from_path(&abs) else {
174                continue;
175            };
176
177            let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
178            if unchanged {
179                if let Some(chunks) = old_by_file.get(&rel) {
180                    for chunk in chunks {
181                        index.add_chunk(chunk.clone());
182                    }
183                    index.files.insert(rel, state);
184                    continue;
185                }
186            }
187
188            if let Ok(content) = std::fs::read_to_string(&abs) {
189                let mut chunks = extract_chunks(&rel, &content);
190                chunks.sort_by(|a, b| {
191                    a.start_line
192                        .cmp(&b.start_line)
193                        .then_with(|| a.end_line.cmp(&b.end_line))
194                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
195                });
196                for chunk in chunks {
197                    index.add_chunk(chunk);
198                }
199                index.files.insert(rel, state);
200            }
201        }
202
203        index.finalize();
204        index
205    }
206
207    fn add_chunk(&mut self, chunk: CodeChunk) {
208        let idx = self.chunks.len();
209
210        for token in &chunk.tokens {
211            let lower = token.to_lowercase();
212            self.inverted.entry(lower).or_default().push((idx, 1.0));
213        }
214
215        self.chunks.push(chunk);
216    }
217
218    fn finalize(&mut self) {
219        self.doc_count = self.chunks.len();
220        if self.doc_count == 0 {
221            return;
222        }
223
224        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
225        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
226
227        self.doc_freqs.clear();
228        for (term, postings) in &self.inverted {
229            let unique_docs: std::collections::HashSet<usize> =
230                postings.iter().map(|(idx, _)| *idx).collect();
231            self.doc_freqs.insert(term.clone(), unique_docs.len());
232        }
233    }
234
235    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
236        let query_tokens = tokenize(query);
237        if query_tokens.is_empty() || self.doc_count == 0 {
238            return Vec::new();
239        }
240
241        let mut scores: HashMap<usize, f64> = HashMap::new();
242
243        for token in &query_tokens {
244            let lower = token.to_lowercase();
245            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
246            if df == 0.0 {
247                continue;
248            }
249
250            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
251
252            if let Some(postings) = self.inverted.get(&lower) {
253                let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
254                for (idx, weight) in postings {
255                    *doc_tfs.entry(*idx).or_insert(0.0) += weight;
256                }
257
258                for (doc_idx, tf) in &doc_tfs {
259                    let doc_len = self.chunks[*doc_idx].token_count as f64;
260                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
261                    let bm25 = idf * (tf * (BM25_K1 + 1.0))
262                        / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
263
264                    *scores.entry(*doc_idx).or_insert(0.0) += bm25;
265                }
266            }
267        }
268
269        let mut results: Vec<SearchResult> = scores
270            .into_iter()
271            .map(|(idx, score)| {
272                let chunk = &self.chunks[idx];
273                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
274                SearchResult {
275                    chunk_idx: idx,
276                    score,
277                    file_path: chunk.file_path.clone(),
278                    symbol_name: chunk.symbol_name.clone(),
279                    kind: chunk.kind.clone(),
280                    start_line: chunk.start_line,
281                    end_line: chunk.end_line,
282                    snippet,
283                }
284            })
285            .collect();
286
287        results.sort_by(|a, b| {
288            b.score
289                .partial_cmp(&a.score)
290                .unwrap_or(std::cmp::Ordering::Equal)
291                .then_with(|| a.file_path.cmp(&b.file_path))
292                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
293                .then_with(|| a.start_line.cmp(&b.start_line))
294                .then_with(|| a.end_line.cmp(&b.end_line))
295        });
296        results.truncate(top_k);
297        results
298    }
299
300    pub fn save(&self, root: &Path) -> std::io::Result<()> {
301        if self.chunks.len() > CHUNK_COUNT_WARNING {
302            tracing::warn!(
303                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
304                self.chunks.len(),
305                CHUNK_COUNT_WARNING
306            );
307        }
308
309        let dir = index_dir(root);
310        std::fs::create_dir_all(&dir)?;
311        let data = serde_json::to_string(self).map_err(std::io::Error::other)?;
312
313        let max_bytes = max_bm25_cache_bytes();
314        if data.len() as u64 > max_bytes {
315            tracing::warn!(
316                "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
317                data.len() as f64 / 1_048_576.0,
318                max_bytes / (1024 * 1024),
319                dir.display()
320            );
321            return Ok(());
322        }
323
324        let target = dir.join("bm25_index.json");
325        let tmp = dir.join("bm25_index.json.tmp");
326        std::fs::write(&tmp, &data)?;
327        std::fs::rename(&tmp, &target)?;
328
329        let _ = std::fs::write(
330            dir.join("project_root.txt"),
331            root.to_string_lossy().as_bytes(),
332        );
333
334        Ok(())
335    }
336
337    pub fn load(root: &Path) -> Option<Self> {
338        let path = index_dir(root).join("bm25_index.json");
339        let meta = std::fs::metadata(&path).ok()?;
340        let max_bytes = max_bm25_cache_bytes();
341        if meta.len() > max_bytes {
342            tracing::warn!(
343                "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
344                meta.len() as f64 / 1_073_741_824.0,
345                max_bytes / (1024 * 1024),
346                path.display()
347            );
348            let quarantined = path.with_extension("json.quarantined");
349            let _ = std::fs::rename(&path, &quarantined);
350            return None;
351        }
352        let data = std::fs::read_to_string(&path).ok()?;
353        serde_json::from_str(&data).ok()
354    }
355
356    pub fn load_or_build(root: &Path) -> Self {
357        if let Some(idx) = Self::load(root) {
358            if !bm25_index_looks_stale(&idx, root) {
359                return idx;
360            }
361            tracing::warn!(
362                "[bm25_index: stale index detected for {}; rebuilding]",
363                root.display()
364            );
365            let rebuilt = if idx.files.is_empty() {
366                Self::build_from_directory(root)
367            } else {
368                Self::rebuild_incremental(root, &idx)
369            };
370            let _ = rebuilt.save(root);
371            return rebuilt;
372        }
373
374        let built = Self::build_from_directory(root);
375        let _ = built.save(root);
376        built
377    }
378
379    pub fn index_file_path(root: &Path) -> PathBuf {
380        index_dir(root).join("bm25_index.json")
381    }
382}
383
384fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
385    if index.chunks.is_empty() {
386        return false;
387    }
388
389    if index.files.is_empty() {
390        // Legacy index (pre file-state tracking): only detect missing files.
391        let mut seen = std::collections::HashSet::<&str>::new();
392        for chunk in &index.chunks {
393            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
394            if rel.is_empty() {
395                continue;
396            }
397            if !seen.insert(rel) {
398                continue;
399            }
400            if !root.join(rel).exists() {
401                return true;
402            }
403        }
404        return false;
405    }
406
407    // Missing or modified tracked files.
408    for (rel, old_state) in &index.files {
409        let abs = root.join(rel);
410        if !abs.exists() {
411            return true;
412        }
413        let Some(cur) = IndexedFileState::from_path(&abs) else {
414            return true;
415        };
416        if &cur != old_state {
417            return true;
418        }
419    }
420
421    // New files (present on disk but not in index).
422    for rel in list_code_files(root) {
423        if !index.files.contains_key(&rel) {
424            return true;
425        }
426    }
427
428    false
429}
430
/// Directory in which the BM25 index for `root` is stored; delegates to
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
434
435fn list_code_files(root: &Path) -> Vec<String> {
436    let walker = ignore::WalkBuilder::new(root)
437        .hidden(true)
438        .git_ignore(true)
439        .git_global(true)
440        .git_exclude(true)
441        .build();
442
443    let cfg = crate::core::config::Config::load();
444    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
445        .iter()
446        .filter_map(|p| glob::Pattern::new(p).ok())
447        .collect();
448    ignore_patterns.extend(
449        cfg.extra_ignore_patterns
450            .iter()
451            .filter_map(|p| glob::Pattern::new(p).ok()),
452    );
453
454    let mut files: Vec<String> = Vec::new();
455    for entry in walker.flatten() {
456        let path = entry.path();
457        if !path.is_file() {
458            continue;
459        }
460        if !is_code_file(path) {
461            continue;
462        }
463        let rel = path
464            .strip_prefix(root)
465            .unwrap_or(path)
466            .to_string_lossy()
467            .to_string();
468        if rel.is_empty() {
469            continue;
470        }
471        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
472            continue;
473        }
474        if files.len() >= MAX_BM25_FILES {
475            tracing::warn!(
476                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
477                root.display()
478            );
479            break;
480        }
481        files.push(rel);
482    }
483
484    files.sort();
485    files.dedup();
486    files
487}
488
/// Returns `true` when `path` has one of the recognised source-code
/// extensions. The comparison is case-insensitive; paths without an extension
/// or with a non-UTF-8 extension are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    path.extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_lowercase)
        .is_some_and(|ext| CODE_EXTENSIONS.contains(&ext.as_str()))
}
525
526fn tokenize(text: &str) -> Vec<String> {
527    let mut tokens = Vec::new();
528    let mut current = String::new();
529
530    for ch in text.chars() {
531        if ch.is_alphanumeric() || ch == '_' {
532            current.push(ch);
533        } else {
534            if current.len() >= 2 {
535                tokens.push(current.clone());
536            }
537            current.clear();
538        }
539    }
540    if current.len() >= 2 {
541        tokens.push(current);
542    }
543
544    split_camel_case_tokens(&tokens)
545}
546
/// Crate-visible entry point to this module's tokenizer; returns exactly what
/// `tokenize` returns for `text`.
pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
    tokenize(text)
}
550
/// Expands each token with its camel-case parts.
///
/// Every input token is kept as-is and followed by the parts it splits into
/// (parts shorter than two bytes are dropped; tokens without an internal
/// boundary contribute no parts). A boundary is placed before an uppercase
/// character when
///
/// * the previous character is not uppercase (`parse|JSON`, `calculate|Total`), or
/// * it is followed by a lowercase character, i.e. it starts a new word after
///   an acronym (`HTTP|Server`).
///
/// Runs of capitals therefore stay together: `getHTTPResponse` yields `get`,
/// `HTTP` and `Response` rather than fragments such as `getHTTP`.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let follows_non_upper = !chars[i - 1].is_uppercase();
            let begins_word_after_acronym =
                chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if follows_non_upper || begins_word_after_acronym {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Only emit the tail when at least one boundary was found; otherwise it
        // would just duplicate the whole token.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }

    result
}
575
/// Splits the contents of one file into indexable chunks.
///
/// Strategy, in order:
/// 1. With the `tree-sitter` feature enabled, use the tree-sitter extractor
///    when it returns a result for the file's extension.
/// 2. Otherwise scan line by line: every line recognised by `detect_symbol`
///    starts a chunk that extends to the line returned by `find_block_end`.
/// 3. If no symbol was found, fall back to content-defined chunks from the
///    rolling-hash chunker, or to a single whole-file chunk.
///
/// `file_path` is stored verbatim in every chunk. Line numbers in the
/// returned chunks are 1-based. Empty content yields no chunks.
fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
    #[cfg(feature = "tree-sitter")]
    {
        let ext = std::path::Path::new(file_path)
            .extension()
            .and_then(|e| e.to_str())
            .unwrap_or("");
        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
            return chunks;
        }
    }

    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let mut chunks = Vec::new();
    let mut i = 0;

    while i < lines.len() {
        let trimmed = lines[i].trim();

        if let Some((name, kind)) = detect_symbol(trimmed) {
            let start = i;
            let end = find_block_end(&lines, i);
            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
            let tokens = tokenize(&block);
            let token_count = tokens.len();

            chunks.push(CodeChunk {
                file_path: file_path.to_string(),
                symbol_name: name,
                kind,
                // Convert 0-based line indices to 1-based line numbers.
                start_line: start + 1,
                end_line: end + 1,
                content: block,
                tokens,
                token_count,
            });

            // Resume scanning after the block, so symbols inside it do not
            // produce chunks of their own.
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if chunks.is_empty() && !content.is_empty() {
        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
        //
        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
        let bytes = content.as_bytes();
        let rk_chunks = crate::core::rabin_karp::chunk(content);
        // Content-defined chunks are used only when there are between 1 and 200
        // of them, and even then only the first 50 are indexed.
        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
                // Clamp the end so a chunk can never reach past the file.
                let end = (c.offset + c.length).min(bytes.len());
                let slice = &bytes[c.offset..end];
                // Lossy decoding: a chunk boundary may fall inside a multi-byte character.
                let chunk_text = String::from_utf8_lossy(slice).into_owned();
                let tokens = tokenize(&chunk_text);
                let token_count = tokens.len();
                // Line numbers are derived by counting newlines before and inside the slice.
                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
                let end_line = start_line + bytecount::count(slice, b'\n');
                chunks.push(CodeChunk {
                    file_path: file_path.to_string(),
                    symbol_name: format!("{file_path}#chunk-{idx}"),
                    kind: ChunkKind::Module,
                    start_line,
                    end_line: end_line.max(start_line),
                    content: chunk_text,
                    tokens,
                    token_count,
                });
            }
        } else {
            // Single whole-file chunk: tokens cover the entire file, but only
            // the first 50 lines are stored as displayable content.
            let tokens = tokenize(content);
            let token_count = tokens.len();
            let snippet = lines
                .iter()
                .take(50)
                .copied()
                .collect::<Vec<_>>()
                .join("\n");
            chunks.push(CodeChunk {
                file_path: file_path.to_string(),
                symbol_name: file_path.to_string(),
                kind: ChunkKind::Module,
                start_line: 1,
                end_line: lines.len(),
                content: snippet,
                tokens,
                token_count,
            });
        }
    }

    chunks
}
674
675fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
676    let trimmed = line.trim();
677
678    let patterns: &[(&str, ChunkKind)] = &[
679        ("pub async fn ", ChunkKind::Function),
680        ("async fn ", ChunkKind::Function),
681        ("pub fn ", ChunkKind::Function),
682        ("fn ", ChunkKind::Function),
683        ("pub struct ", ChunkKind::Struct),
684        ("struct ", ChunkKind::Struct),
685        ("pub enum ", ChunkKind::Struct),
686        ("enum ", ChunkKind::Struct),
687        ("impl ", ChunkKind::Impl),
688        ("pub trait ", ChunkKind::Struct),
689        ("trait ", ChunkKind::Struct),
690        ("export function ", ChunkKind::Function),
691        ("export async function ", ChunkKind::Function),
692        ("export default function ", ChunkKind::Function),
693        ("function ", ChunkKind::Function),
694        ("async function ", ChunkKind::Function),
695        ("export class ", ChunkKind::Class),
696        ("class ", ChunkKind::Class),
697        ("export interface ", ChunkKind::Struct),
698        ("interface ", ChunkKind::Struct),
699        ("def ", ChunkKind::Function),
700        ("async def ", ChunkKind::Function),
701        ("class ", ChunkKind::Class),
702        ("func ", ChunkKind::Function),
703    ];
704
705    for (prefix, kind) in patterns {
706        if let Some(rest) = trimmed.strip_prefix(prefix) {
707            let name: String = rest
708                .chars()
709                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
710                .take_while(|c| *c != '<')
711                .collect();
712            if !name.is_empty() {
713                return Some((name, kind.clone()));
714            }
715        }
716    }
717
718    None
719}
720
721fn find_block_end(lines: &[&str], start: usize) -> usize {
722    let mut depth = 0i32;
723    let mut found_open = false;
724
725    for (i, line) in lines.iter().enumerate().skip(start) {
726        for ch in line.chars() {
727            match ch {
728                '{' | '(' if !found_open || depth > 0 => {
729                    depth += 1;
730                    found_open = true;
731                }
732                '}' | ')' if depth > 0 => {
733                    depth -= 1;
734                    if depth == 0 && found_open {
735                        return i;
736                    }
737                }
738                _ => {}
739            }
740        }
741
742        if found_open && depth <= 0 && i > start {
743            return i;
744        }
745
746        if !found_open && i > start + 2 {
747            let trimmed = lines[i].trim();
748            if trimmed.is_empty()
749                || (!trimmed.starts_with(' ') && !trimmed.starts_with('\t') && i > start)
750            {
751                return i.saturating_sub(1);
752            }
753        }
754    }
755
756    (start + 50).min(lines.len().saturating_sub(1))
757}
758
759pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
760    if results.is_empty() {
761        return "No results found.".to_string();
762    }
763
764    let mut out = String::new();
765    for (i, r) in results.iter().enumerate() {
766        if compact {
767            out.push_str(&format!(
768                "{}. {:.2} {}:{}-{} {:?} {}\n",
769                i + 1,
770                r.score,
771                r.file_path,
772                r.start_line,
773                r.end_line,
774                r.kind,
775                r.symbol_name,
776            ));
777        } else {
778            out.push_str(&format!(
779                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
780                i + 1,
781                r.score,
782                r.file_path,
783                r.symbol_name,
784                r.kind,
785                r.start_line,
786                r.end_line,
787                r.snippet,
788            ));
789        }
790    }
791    out
792}
793
794#[cfg(test)]
795mod tests {
796    use super::*;
797    use tempfile::tempdir;
798
799    #[cfg(unix)]
800    use std::os::unix::fs::PermissionsExt;
801
802    #[test]
803    fn tokenize_splits_code() {
804        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
805        assert!(tokens.contains(&"calculate_total".to_string()));
806        assert!(tokens.contains(&"items".to_string()));
807        assert!(tokens.contains(&"Vec".to_string()));
808    }
809
810    #[test]
811    fn camel_case_splitting() {
812        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
813        assert!(tokens.contains(&"calculateTotal".to_string()));
814        assert!(tokens.contains(&"calculate".to_string()));
815        assert!(tokens.contains(&"Total".to_string()));
816    }
817
818    #[test]
819    fn detect_rust_function() {
820        let (name, kind) =
821            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
822        assert_eq!(name, "process_request");
823        assert_eq!(kind, ChunkKind::Function);
824    }
825
826    #[test]
827    fn bm25_search_finds_relevant() {
828        let mut index = BM25Index::new();
829        index.add_chunk(CodeChunk {
830            file_path: "auth.rs".into(),
831            symbol_name: "validate_token".into(),
832            kind: ChunkKind::Function,
833            start_line: 1,
834            end_line: 10,
835            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
836            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
837            token_count: 8,
838        });
839        index.add_chunk(CodeChunk {
840            file_path: "db.rs".into(),
841            symbol_name: "connect_database".into(),
842            kind: ChunkKind::Function,
843            start_line: 1,
844            end_line: 5,
845            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
846            tokens: tokenize("fn connect_database url str Pool create_pool url"),
847            token_count: 7,
848        });
849        index.finalize();
850
851        let results = index.search("jwt token validation", 5);
852        assert!(!results.is_empty());
853        assert_eq!(results[0].symbol_name, "validate_token");
854    }
855
856    #[test]
857    fn bm25_search_sorts_ties_deterministically() {
858        let mut index = BM25Index::new();
859
860        // Insert in reverse path order to ensure the sort tie-break matters.
861        index.add_chunk(CodeChunk {
862            file_path: "b.rs".into(),
863            symbol_name: "same".into(),
864            kind: ChunkKind::Function,
865            start_line: 1,
866            end_line: 1,
867            content: "fn same() {}".into(),
868            tokens: tokenize("same token"),
869            token_count: 2,
870        });
871        index.add_chunk(CodeChunk {
872            file_path: "a.rs".into(),
873            symbol_name: "same".into(),
874            kind: ChunkKind::Function,
875            start_line: 1,
876            end_line: 1,
877            content: "fn same() {}".into(),
878            tokens: tokenize("same token"),
879            token_count: 2,
880        });
881        index.finalize();
882
883        let results = index.search("same", 10);
884        assert!(results.len() >= 2);
885        assert_eq!(results[0].file_path, "a.rs");
886        assert_eq!(results[1].file_path, "b.rs");
887    }
888
889    #[test]
890    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
891        let td = tempdir().expect("tempdir");
892        let root = td.path();
893        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
894
895        let idx = BM25Index::build_from_directory(root);
896        assert!(!bm25_index_looks_stale(&idx, root));
897
898        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
899        assert!(bm25_index_looks_stale(&idx, root));
900    }
901
902    #[test]
903    #[cfg(unix)]
904    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
905        let td = tempdir().expect("tempdir");
906        let root = td.path();
907
908        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
909        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
910
911        let idx1 = BM25Index::build_from_directory(root);
912        assert!(idx1.files.contains_key("a.rs"));
913        assert!(idx1.files.contains_key("b.rs"));
914
915        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
916        let a_path = root.join("a.rs");
917        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
918        perms.set_mode(0o000);
919        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
920
921        // Change b.rs (size changes) to force a re-read for that file.
922        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
923            .expect("rewrite b.rs");
924
925        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
926        assert!(
927            idx2.files.contains_key("a.rs"),
928            "a.rs should be kept via reuse"
929        );
930        assert!(idx2.files.contains_key("b.rs"));
931
932        let b_has_b2 = idx2
933            .chunks
934            .iter()
935            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
936        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
937
938        // Restore permissions to avoid cleanup surprises.
939        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
940        perms.set_mode(0o644);
941        let _ = std::fs::set_permissions(&a_path, perms);
942    }
943
944    #[test]
945    fn load_quarantines_oversized_index() {
946        let _env = crate::core::data_dir::test_env_lock();
947        let td = tempdir().expect("tempdir");
948        let root = td.path();
949        let dir = crate::core::index_namespace::vectors_dir(root);
950        std::fs::create_dir_all(&dir).expect("create vectors dir");
951
952        let index_path = dir.join("bm25_index.json");
953        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
954        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
955
956        let result = BM25Index::load(root);
957        assert!(result.is_none(), "oversized index should return None");
958        assert!(
959            !index_path.exists(),
960            "original index should be removed after quarantine"
961        );
962        assert!(
963            dir.join("bm25_index.json.quarantined").exists(),
964            "quarantined file should exist"
965        );
966
967        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
968    }
969
970    #[test]
971    fn save_refuses_oversized_output() {
972        let _env = crate::core::data_dir::test_env_lock();
973        let data_dir = tempdir().expect("data_dir");
974        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
975        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
976
977        let td = tempdir().expect("tempdir");
978        let root = td.path();
979
980        let mut index = BM25Index::new();
981        index.add_chunk(CodeChunk {
982            file_path: "a.rs".into(),
983            symbol_name: "a".into(),
984            kind: ChunkKind::Function,
985            start_line: 1,
986            end_line: 1,
987            content: "fn a() {}".into(),
988            tokens: tokenize("fn a"),
989            token_count: 2,
990        });
991        index.finalize();
992
993        let _ = index.save(root);
994        let index_path = BM25Index::index_file_path(root);
995        assert!(
996            !index_path.exists(),
997            "save should refuse to persist oversized index"
998        );
999
1000        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1001    }
1002
1003    #[test]
1004    fn save_writes_project_root_marker() {
1005        let td = tempdir().expect("tempdir");
1006        let root = td.path();
1007        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1008
1009        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1010        let index = BM25Index::build_from_directory(root);
1011        index.save(root).expect("save");
1012
1013        let dir = crate::core::index_namespace::vectors_dir(root);
1014        let marker = dir.join("project_root.txt");
1015        assert!(marker.exists(), "project_root.txt marker should exist");
1016        let content = std::fs::read_to_string(&marker).expect("read marker");
1017        assert_eq!(content, root.to_string_lossy());
1018    }
1019
1020    #[test]
1021    fn list_code_files_skips_default_vendor_ignores() {
1022        let td = tempdir().expect("tempdir");
1023        let root = td.path();
1024
1025        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1026        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1027        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1028        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1029        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1030
1031        let files = list_code_files(root);
1032        assert!(
1033            files.iter().any(|f| f == "main.rs"),
1034            "main.rs should be included"
1035        );
1036        assert!(
1037            !files.iter().any(|f| f.starts_with("vendor/")),
1038            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1039        );
1040        assert!(
1041            !files.iter().any(|f| f.starts_with("dist/")),
1042            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1043        );
1044    }
1045
1046    #[test]
1047    fn list_code_files_respects_max_files_cap() {
1048        let td = tempdir().expect("tempdir");
1049        let root = td.path();
1050
1051        // Create more files than MAX_BM25_FILES wouldn't let us test easily (5000),
1052        // but we can verify the cap constant exists and the function returns a bounded vec.
1053        for i in 0..10 {
1054            std::fs::write(
1055                root.join(format!("f{i}.rs")),
1056                format!("pub fn f{i}() {{}}\n"),
1057            )
1058            .expect("write");
1059        }
1060        let files = list_code_files(root);
1061        assert!(
1062            files.len() <= MAX_BM25_FILES,
1063            "file count should not exceed MAX_BM25_FILES"
1064        );
1065    }
1066
1067    #[test]
1068    fn max_bm25_cache_bytes_reads_env() {
1069        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1070        let bytes = max_bm25_cache_bytes();
1071        assert_eq!(bytes, 64 * 1024 * 1024);
1072        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1073    }
1074}
lean_ctx/core/bm25_index.rs

lean_ctx/core/
bm25_index.rs