Skip to main content

lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the number of files `list_code_files` collects in one walk.
/// Once reached, a warning is logged and the remaining files are skipped.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning that suggests adding
/// `extra_ignore_patterns`. The warning by itself does not abort the save.
const CHUNK_COUNT_WARNING: usize = 50_000;

/// Glob patterns that `list_code_files` always excludes. They are matched against
/// the root-relative path of each candidate file, in addition to the configured
/// `extra_ignore_patterns`.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29        .ok()
30        .and_then(|v| v.parse::<u64>().ok())
31        .unwrap_or_else(|| {
32            let cfg = crate::core::config::Config::load();
33            let profile = crate::core::config::MemoryProfile::effective(&cfg);
34            let profile_mb = profile.bm25_max_cache_mb();
35            if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
36                profile_mb
37            } else {
38                cfg.bm25_max_cache_mb
39            }
40        });
41    mb * 1024 * 1024
42}
43
/// A span of a source file that is indexed and scored as one BM25 document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the file the chunk was extracted from, as passed to the chunker.
    pub file_path: String,
    /// Name of the detected symbol, or a synthetic label for fallback chunks.
    pub symbol_name: String,
    /// Coarse classification of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk (1-based in the line-based chunker of this file).
    pub start_line: usize,
    /// Last line of the chunk (1-based in the line-based chunker of this file).
    pub end_line: usize,
    /// Text that is tokenized for indexing and used to build result snippets.
    pub content: String,
    /// Never serialized and defaulted to empty on deserialization;
    /// `BM25Index::add_chunk` always stores an empty vector here.
    #[serde(skip_serializing, default)]
    pub tokens: Vec<String>,
    /// Number of tokens in `content`; recomputed by `BM25Index::add_chunk`.
    pub token_count: usize,
}
56
/// Coarse category of a [`CodeChunk`].
///
/// The line-based detector in this file (`detect_symbol`) only produces
/// `Function`, `Struct`, `Impl` and `Class`; fallback chunks use `Module`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    /// Function-like definitions (`fn`, `function`, `def`, `func`).
    Function,
    /// Type-like definitions; `detect_symbol` also maps `enum`, `trait` and
    /// `interface` declarations to this variant.
    Struct,
    /// Rust `impl` blocks.
    Impl,
    /// Whole-file or content-defined fallback chunks.
    Module,
    /// `class` declarations.
    Class,
    /// Not produced by the line-based detector in this file.
    Method,
    /// Not produced by the line-based detector in this file.
    Other,
}
67
/// Snapshot of a file's metadata taken when it was indexed. Two snapshots that
/// compare equal are treated as "file unchanged" by the incremental rebuild and
/// by the staleness check.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
73
74impl IndexedFileState {
75    fn from_path(path: &Path) -> Option<Self> {
76        let meta = path.metadata().ok()?;
77        let size_bytes = meta.len();
78        let mtime_ms = meta
79            .modified()
80            .ok()
81            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
82            .map(|d| d.as_millis() as u64)?;
83        Some(Self {
84            mtime_ms,
85            size_bytes,
86        })
87    }
88}
89
/// In-memory BM25 index over code chunks; serialized as JSON by `save`/`load`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; positions in this vector are the document ids used
    /// by `inverted`.
    pub chunks: Vec<CodeChunk>,
    /// Lower-cased term -> postings. `add_chunk` pushes one `(chunk index, 1.0)`
    /// entry per token occurrence, so a chunk may appear several times per term.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks; set by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks; set by `finalize`.
    pub doc_count: usize,
    /// Lower-cased term -> number of distinct chunks containing it; set by `finalize`.
    pub doc_freqs: HashMap<String, usize>,
    /// Relative file path -> metadata snapshot taken at indexing time. Defaults
    /// to empty when absent from the serialized form (legacy indexes).
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
100
/// One ranked hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Summed BM25 score over all query terms; higher is better.
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// The first five lines of the chunk's content.
    pub snippet: String,
}
112
/// BM25 `k1` parameter used by `BM25Index::search`: controls how quickly
/// repeated occurrences of a term stop adding to a chunk's score.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter used by `BM25Index::search`: weight of the chunk-length
/// normalisation term (`1 - b + b * len / avg_len`).
const BM25_B: f64 = 0.75;
115
116impl Default for BM25Index {
117    fn default() -> Self {
118        Self::new()
119    }
120}
121
122impl BM25Index {
123    pub fn new() -> Self {
124        Self {
125            chunks: Vec::new(),
126            inverted: HashMap::new(),
127            avg_doc_len: 0.0,
128            doc_count: 0,
129            doc_freqs: HashMap::new(),
130            files: HashMap::new(),
131        }
132    }
133
134    pub fn build_from_directory(root: &Path) -> Self {
135        let mut index = Self::new();
136        let files = list_code_files(root);
137        for rel in files {
138            let abs = root.join(&rel);
139            let Some(state) = IndexedFileState::from_path(&abs) else {
140                continue;
141            };
142            if let Ok(content) = std::fs::read_to_string(&abs) {
143                let mut chunks = extract_chunks(&rel, &content);
144                chunks.sort_by(|a, b| {
145                    a.start_line
146                        .cmp(&b.start_line)
147                        .then_with(|| a.end_line.cmp(&b.end_line))
148                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
149                });
150                for chunk in chunks {
151                    index.add_chunk(chunk);
152                }
153                index.files.insert(rel, state);
154            }
155        }
156
157        index.finalize();
158        index
159    }
160
161    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
162        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
163        for c in &prev.chunks {
164            old_by_file
165                .entry(c.file_path.clone())
166                .or_default()
167                .push(c.clone());
168        }
169        for v in old_by_file.values_mut() {
170            v.sort_by(|a, b| {
171                a.start_line
172                    .cmp(&b.start_line)
173                    .then_with(|| a.end_line.cmp(&b.end_line))
174                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
175            });
176        }
177
178        let mut index = Self::new();
179        let files = list_code_files(root);
180        for rel in files {
181            let abs = root.join(&rel);
182            let Some(state) = IndexedFileState::from_path(&abs) else {
183                continue;
184            };
185
186            let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
187            if unchanged {
188                if let Some(chunks) = old_by_file.get(&rel) {
189                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
190                        for chunk in chunks {
191                            index.add_chunk(chunk.clone());
192                        }
193                        index.files.insert(rel, state);
194                        continue;
195                    }
196                }
197            }
198
199            if let Ok(content) = std::fs::read_to_string(&abs) {
200                let mut chunks = extract_chunks(&rel, &content);
201                chunks.sort_by(|a, b| {
202                    a.start_line
203                        .cmp(&b.start_line)
204                        .then_with(|| a.end_line.cmp(&b.end_line))
205                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
206                });
207                for chunk in chunks {
208                    index.add_chunk(chunk);
209                }
210                index.files.insert(rel, state);
211            }
212        }
213
214        index.finalize();
215        index
216    }
217
218    fn add_chunk(&mut self, chunk: CodeChunk) {
219        let idx = self.chunks.len();
220
221        let tokens = tokenize(&chunk.content);
222        for token in &tokens {
223            let lower = token.to_lowercase();
224            self.inverted.entry(lower).or_default().push((idx, 1.0));
225        }
226
227        self.chunks.push(CodeChunk {
228            token_count: tokens.len(),
229            tokens: Vec::new(),
230            ..chunk
231        });
232    }
233
234    fn finalize(&mut self) {
235        self.doc_count = self.chunks.len();
236        if self.doc_count == 0 {
237            return;
238        }
239
240        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
241        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
242
243        self.doc_freqs.clear();
244        for (term, postings) in &self.inverted {
245            let unique_docs: std::collections::HashSet<usize> =
246                postings.iter().map(|(idx, _)| *idx).collect();
247            self.doc_freqs.insert(term.clone(), unique_docs.len());
248        }
249    }
250
251    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
252        let query_tokens = tokenize(query);
253        if query_tokens.is_empty() || self.doc_count == 0 {
254            return Vec::new();
255        }
256
257        let mut scores: HashMap<usize, f64> = HashMap::new();
258
259        for token in &query_tokens {
260            let lower = token.to_lowercase();
261            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
262            if df == 0.0 {
263                continue;
264            }
265
266            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
267
268            if let Some(postings) = self.inverted.get(&lower) {
269                let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
270                for (idx, weight) in postings {
271                    *doc_tfs.entry(*idx).or_insert(0.0) += weight;
272                }
273
274                for (doc_idx, tf) in &doc_tfs {
275                    let doc_len = self.chunks[*doc_idx].token_count as f64;
276                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
277                    let bm25 = idf * (tf * (BM25_K1 + 1.0))
278                        / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
279
280                    *scores.entry(*doc_idx).or_insert(0.0) += bm25;
281                }
282            }
283        }
284
285        let mut results: Vec<SearchResult> = scores
286            .into_iter()
287            .map(|(idx, score)| {
288                let chunk = &self.chunks[idx];
289                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
290                SearchResult {
291                    chunk_idx: idx,
292                    score,
293                    file_path: chunk.file_path.clone(),
294                    symbol_name: chunk.symbol_name.clone(),
295                    kind: chunk.kind.clone(),
296                    start_line: chunk.start_line,
297                    end_line: chunk.end_line,
298                    snippet,
299                }
300            })
301            .collect();
302
303        results.sort_by(|a, b| {
304            b.score
305                .partial_cmp(&a.score)
306                .unwrap_or(std::cmp::Ordering::Equal)
307                .then_with(|| a.file_path.cmp(&b.file_path))
308                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
309                .then_with(|| a.start_line.cmp(&b.start_line))
310                .then_with(|| a.end_line.cmp(&b.end_line))
311        });
312        results.truncate(top_k);
313        results
314    }
315
316    pub fn save(&self, root: &Path) -> std::io::Result<()> {
317        if self.chunks.len() > CHUNK_COUNT_WARNING {
318            tracing::warn!(
319                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
320                self.chunks.len(),
321                CHUNK_COUNT_WARNING
322            );
323        }
324
325        let dir = index_dir(root);
326        std::fs::create_dir_all(&dir)?;
327        let data = serde_json::to_string(self).map_err(std::io::Error::other)?;
328
329        let max_bytes = max_bm25_cache_bytes();
330        if data.len() as u64 > max_bytes {
331            tracing::warn!(
332                "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
333                data.len() as f64 / 1_048_576.0,
334                max_bytes / (1024 * 1024),
335                dir.display()
336            );
337            return Ok(());
338        }
339
340        let target = dir.join("bm25_index.json");
341        let tmp = dir.join("bm25_index.json.tmp");
342        std::fs::write(&tmp, &data)?;
343        std::fs::rename(&tmp, &target)?;
344
345        let _ = std::fs::write(
346            dir.join("project_root.txt"),
347            root.to_string_lossy().as_bytes(),
348        );
349
350        Ok(())
351    }
352
353    pub fn load(root: &Path) -> Option<Self> {
354        let path = index_dir(root).join("bm25_index.json");
355        let meta = std::fs::metadata(&path).ok()?;
356        let max_bytes = max_bm25_cache_bytes();
357        if meta.len() > max_bytes {
358            tracing::warn!(
359                "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
360                meta.len() as f64 / 1_073_741_824.0,
361                max_bytes / (1024 * 1024),
362                path.display()
363            );
364            let quarantined = path.with_extension("json.quarantined");
365            let _ = std::fs::rename(&path, &quarantined);
366            return None;
367        }
368        let data = std::fs::read_to_string(&path).ok()?;
369        serde_json::from_str(&data).ok()
370    }
371
372    pub fn load_or_build(root: &Path) -> Self {
373        if let Some(idx) = Self::load(root) {
374            if !bm25_index_looks_stale(&idx, root) {
375                return idx;
376            }
377            tracing::warn!(
378                "[bm25_index: stale index detected for {}; rebuilding]",
379                root.display()
380            );
381            let rebuilt = if idx.files.is_empty() {
382                Self::build_from_directory(root)
383            } else {
384                Self::rebuild_incremental(root, &idx)
385            };
386            let _ = rebuilt.save(root);
387            return rebuilt;
388        }
389
390        let built = Self::build_from_directory(root);
391        let _ = built.save(root);
392        built
393    }
394
    /// Returns the path of the persisted index file (`bm25_index.json`) inside
    /// the index directory for `root`. The file is not required to exist.
    pub fn index_file_path(root: &Path) -> PathBuf {
        index_dir(root).join("bm25_index.json")
    }
398}
399
400fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
401    if index.chunks.is_empty() {
402        return false;
403    }
404
405    if index.files.is_empty() {
406        // Legacy index (pre file-state tracking): only detect missing files.
407        let mut seen = std::collections::HashSet::<&str>::new();
408        for chunk in &index.chunks {
409            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
410            if rel.is_empty() {
411                continue;
412            }
413            if !seen.insert(rel) {
414                continue;
415            }
416            if !root.join(rel).exists() {
417                return true;
418            }
419        }
420        return false;
421    }
422
423    // Missing or modified tracked files.
424    for (rel, old_state) in &index.files {
425        let abs = root.join(rel);
426        if !abs.exists() {
427            return true;
428        }
429        let Some(cur) = IndexedFileState::from_path(&abs) else {
430            return true;
431        };
432        if &cur != old_state {
433            return true;
434        }
435    }
436
437    // New files (present on disk but not in index).
438    for rel in list_code_files(root) {
439        if !index.files.contains_key(&rel) {
440            return true;
441        }
442    }
443
444    false
445}
446
/// Directory in which the BM25 index files for `root` are stored; delegates to
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
450
451fn list_code_files(root: &Path) -> Vec<String> {
452    let walker = ignore::WalkBuilder::new(root)
453        .hidden(true)
454        .git_ignore(true)
455        .git_global(true)
456        .git_exclude(true)
457        .build();
458
459    let cfg = crate::core::config::Config::load();
460    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
461        .iter()
462        .filter_map(|p| glob::Pattern::new(p).ok())
463        .collect();
464    ignore_patterns.extend(
465        cfg.extra_ignore_patterns
466            .iter()
467            .filter_map(|p| glob::Pattern::new(p).ok()),
468    );
469
470    let mut files: Vec<String> = Vec::new();
471    for entry in walker.flatten() {
472        let path = entry.path();
473        if !path.is_file() {
474            continue;
475        }
476        if !is_code_file(path) {
477            continue;
478        }
479        let rel = path
480            .strip_prefix(root)
481            .unwrap_or(path)
482            .to_string_lossy()
483            .to_string();
484        if rel.is_empty() {
485            continue;
486        }
487        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
488            continue;
489        }
490        if files.len() >= MAX_BM25_FILES {
491            tracing::warn!(
492                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
493                root.display()
494            );
495            break;
496        }
497        files.push(rel);
498    }
499
500    files.sort();
501    files.dedup();
502    files
503}
504
/// Returns `true` when `path` has one of the recognised source-code extensions.
///
/// The comparison is case-insensitive; paths without an extension (or with one
/// that is not valid UTF-8) are rejected.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    CODE_EXTENSIONS.contains(&ext.as_str())
}
541
542fn tokenize(text: &str) -> Vec<String> {
543    let mut tokens = Vec::new();
544    let mut current = String::new();
545
546    for ch in text.chars() {
547        if ch.is_alphanumeric() || ch == '_' {
548            current.push(ch);
549        } else {
550            if current.len() >= 2 {
551                tokens.push(current.clone());
552            }
553            current.clear();
554        }
555    }
556    if current.len() >= 2 {
557        tokens.push(current);
558    }
559
560    split_camel_case_tokens(&tokens)
561}
562
/// Crate-visible entry point to the tokenizer used by this index; forwards to
/// `tokenize` unchanged.
pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
    tokenize(text)
}
566
/// Expands each token with its camel-case components.
///
/// Every input token is emitted unchanged, followed by the words it splits into
/// (only when at least one boundary is found, and only words of two or more
/// bytes). A word boundary lies before an uppercase character that either
///
/// * follows a non-uppercase character (`parse|HTTP`, `calculate|Total`), or
/// * is followed by a lowercase character, i.e. it ends an acronym
///   (`HTTP|Server`).
///
/// Runs of uppercase letters are therefore kept together: `parseHTTP` yields
/// `parse` and `HTTP`, and `ABC` yields no parts at all.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;

        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let follows_non_upper = !chars[i - 1].is_uppercase();
            let precedes_lower = chars.get(i + 1).is_some_and(|c| c.is_lowercase());
            if follows_non_upper || precedes_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Emit the trailing word only if the token was actually split.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }

    result
}
591
592fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
593    #[cfg(feature = "tree-sitter")]
594    {
595        let ext = std::path::Path::new(file_path)
596            .extension()
597            .and_then(|e| e.to_str())
598            .unwrap_or("");
599        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
600            return chunks;
601        }
602    }
603
604    let lines: Vec<&str> = content.lines().collect();
605    if lines.is_empty() {
606        return Vec::new();
607    }
608
609    let mut chunks = Vec::new();
610    let mut i = 0;
611
612    while i < lines.len() {
613        let trimmed = lines[i].trim();
614
615        if let Some((name, kind)) = detect_symbol(trimmed) {
616            let start = i;
617            let end = find_block_end(&lines, i);
618            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
619            let token_count = tokenize(&block).len();
620
621            chunks.push(CodeChunk {
622                file_path: file_path.to_string(),
623                symbol_name: name,
624                kind,
625                start_line: start + 1,
626                end_line: end + 1,
627                content: block,
628                tokens: Vec::new(),
629                token_count,
630            });
631
632            i = end + 1;
633        } else {
634            i += 1;
635        }
636    }
637
638    if chunks.is_empty() && !content.is_empty() {
639        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
640        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
641        //
642        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
643        let bytes = content.as_bytes();
644        let rk_chunks = crate::core::rabin_karp::chunk(content);
645        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
646            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
647                let end = (c.offset + c.length).min(bytes.len());
648                let slice = &bytes[c.offset..end];
649                let chunk_text = String::from_utf8_lossy(slice).into_owned();
650                let token_count = tokenize(&chunk_text).len();
651                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
652                let end_line = start_line + bytecount::count(slice, b'\n');
653                chunks.push(CodeChunk {
654                    file_path: file_path.to_string(),
655                    symbol_name: format!("{file_path}#chunk-{idx}"),
656                    kind: ChunkKind::Module,
657                    start_line,
658                    end_line: end_line.max(start_line),
659                    content: chunk_text,
660                    tokens: Vec::new(),
661                    token_count,
662                });
663            }
664        } else {
665            let token_count = tokenize(content).len();
666            let snippet = lines
667                .iter()
668                .take(50)
669                .copied()
670                .collect::<Vec<_>>()
671                .join("\n");
672            chunks.push(CodeChunk {
673                file_path: file_path.to_string(),
674                symbol_name: file_path.to_string(),
675                kind: ChunkKind::Module,
676                start_line: 1,
677                end_line: lines.len(),
678                content: snippet,
679                tokens: Vec::new(),
680                token_count,
681            });
682        }
683    }
684
685    chunks
686}
687
688fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
689    let trimmed = line.trim();
690
691    let patterns: &[(&str, ChunkKind)] = &[
692        ("pub async fn ", ChunkKind::Function),
693        ("async fn ", ChunkKind::Function),
694        ("pub fn ", ChunkKind::Function),
695        ("fn ", ChunkKind::Function),
696        ("pub struct ", ChunkKind::Struct),
697        ("struct ", ChunkKind::Struct),
698        ("pub enum ", ChunkKind::Struct),
699        ("enum ", ChunkKind::Struct),
700        ("impl ", ChunkKind::Impl),
701        ("pub trait ", ChunkKind::Struct),
702        ("trait ", ChunkKind::Struct),
703        ("export function ", ChunkKind::Function),
704        ("export async function ", ChunkKind::Function),
705        ("export default function ", ChunkKind::Function),
706        ("function ", ChunkKind::Function),
707        ("async function ", ChunkKind::Function),
708        ("export class ", ChunkKind::Class),
709        ("class ", ChunkKind::Class),
710        ("export interface ", ChunkKind::Struct),
711        ("interface ", ChunkKind::Struct),
712        ("def ", ChunkKind::Function),
713        ("async def ", ChunkKind::Function),
714        ("class ", ChunkKind::Class),
715        ("func ", ChunkKind::Function),
716    ];
717
718    for (prefix, kind) in patterns {
719        if let Some(rest) = trimmed.strip_prefix(prefix) {
720            let name: String = rest
721                .chars()
722                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
723                .take_while(|c| *c != '<')
724                .collect();
725            if !name.is_empty() {
726                return Some((name, kind.clone()));
727            }
728        }
729    }
730
731    None
732}
733
/// Estimates the index of the last line of the block that starts at `lines[start]`.
///
/// * Delimited blocks: the first `{` or `(` opens the block, nested
///   openers/closers are counted, and the line on which the nesting depth
///   returns to zero is the result.
/// * Blocks without any delimiter: from the fourth line on, the block ends on
///   the line before the first blank line or the first line that does not start
///   with a space or tab (a dedent to column 0).
/// * If neither rule fires, the result is capped at `start + 50` or the last
///   line, whichever comes first.
///
/// NOTE(review): `(` counts as an opener, so a signature whose parentheses
/// balance on the first line (`fn f(a: u32) {`) ends the block on that line —
/// confirm this is intended.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut depth = 0i32;
    let mut found_open = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' | '(' if !found_open || depth > 0 => {
                    depth += 1;
                    found_open = true;
                }
                '}' | ')' if depth > 0 => {
                    depth -= 1;
                    if depth == 0 && found_open {
                        return i;
                    }
                }
                _ => {}
            }
        }

        // Indentation-based end detection; the first three lines are never
        // treated as the end of the block.
        if !found_open && i > start + 2 {
            let is_blank = line.trim().is_empty();
            // Indentation must be tested on the raw line: trimmed text can never
            // start with whitespace.
            let is_indented = line.starts_with(' ') || line.starts_with('\t');
            if is_blank || !is_indented {
                return i - 1;
            }
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
771
772pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
773    if results.is_empty() {
774        return "No results found.".to_string();
775    }
776
777    let mut out = String::new();
778    for (i, r) in results.iter().enumerate() {
779        if compact {
780            out.push_str(&format!(
781                "{}. {:.2} {}:{}-{} {:?} {}\n",
782                i + 1,
783                r.score,
784                r.file_path,
785                r.start_line,
786                r.end_line,
787                r.kind,
788                r.symbol_name,
789            ));
790        } else {
791            out.push_str(&format!(
792                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
793                i + 1,
794                r.score,
795                r.file_path,
796                r.symbol_name,
797                r.kind,
798                r.start_line,
799                r.end_line,
800                r.snippet,
801            ));
802        }
803    }
804    out
805}
806
807#[cfg(test)]
808mod tests {
809    use super::*;
810    use tempfile::tempdir;
811
812    #[cfg(unix)]
813    use std::os::unix::fs::PermissionsExt;
814
815    #[test]
816    fn tokenize_splits_code() {
817        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
818        assert!(tokens.contains(&"calculate_total".to_string()));
819        assert!(tokens.contains(&"items".to_string()));
820        assert!(tokens.contains(&"Vec".to_string()));
821    }
822
823    #[test]
824    fn camel_case_splitting() {
825        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
826        assert!(tokens.contains(&"calculateTotal".to_string()));
827        assert!(tokens.contains(&"calculate".to_string()));
828        assert!(tokens.contains(&"Total".to_string()));
829    }
830
831    #[test]
832    fn detect_rust_function() {
833        let (name, kind) =
834            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
835        assert_eq!(name, "process_request");
836        assert_eq!(kind, ChunkKind::Function);
837    }
838
839    #[test]
840    fn bm25_search_finds_relevant() {
841        let mut index = BM25Index::new();
842        index.add_chunk(CodeChunk {
843            file_path: "auth.rs".into(),
844            symbol_name: "validate_token".into(),
845            kind: ChunkKind::Function,
846            start_line: 1,
847            end_line: 10,
848            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
849            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
850            token_count: 8,
851        });
852        index.add_chunk(CodeChunk {
853            file_path: "db.rs".into(),
854            symbol_name: "connect_database".into(),
855            kind: ChunkKind::Function,
856            start_line: 1,
857            end_line: 5,
858            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
859            tokens: tokenize("fn connect_database url str Pool create_pool url"),
860            token_count: 7,
861        });
862        index.finalize();
863
864        let results = index.search("jwt token validation", 5);
865        assert!(!results.is_empty());
866        assert_eq!(results[0].symbol_name, "validate_token");
867    }
868
869    #[test]
870    fn bm25_search_sorts_ties_deterministically() {
871        let mut index = BM25Index::new();
872
873        // Insert in reverse path order to ensure the sort tie-break matters.
874        index.add_chunk(CodeChunk {
875            file_path: "b.rs".into(),
876            symbol_name: "same".into(),
877            kind: ChunkKind::Function,
878            start_line: 1,
879            end_line: 1,
880            content: "fn same() {}".into(),
881            tokens: tokenize("same token"),
882            token_count: 2,
883        });
884        index.add_chunk(CodeChunk {
885            file_path: "a.rs".into(),
886            symbol_name: "same".into(),
887            kind: ChunkKind::Function,
888            start_line: 1,
889            end_line: 1,
890            content: "fn same() {}".into(),
891            tokens: tokenize("same token"),
892            token_count: 2,
893        });
894        index.finalize();
895
896        let results = index.search("same", 10);
897        assert!(results.len() >= 2);
898        assert_eq!(results[0].file_path, "a.rs");
899        assert_eq!(results[1].file_path, "b.rs");
900    }
901
902    #[test]
903    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
904        let td = tempdir().expect("tempdir");
905        let root = td.path();
906        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
907
908        let idx = BM25Index::build_from_directory(root);
909        assert!(!bm25_index_looks_stale(&idx, root));
910
911        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
912        assert!(bm25_index_looks_stale(&idx, root));
913    }
914
915    #[test]
916    #[cfg(unix)]
917    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
918        let td = tempdir().expect("tempdir");
919        let root = td.path();
920
921        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
922        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
923
924        let idx1 = BM25Index::build_from_directory(root);
925        assert!(idx1.files.contains_key("a.rs"));
926        assert!(idx1.files.contains_key("b.rs"));
927
928        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
929        let a_path = root.join("a.rs");
930        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
931        perms.set_mode(0o000);
932        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
933
934        // Change b.rs (size changes) to force a re-read for that file.
935        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
936            .expect("rewrite b.rs");
937
938        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
939        assert!(
940            idx2.files.contains_key("a.rs"),
941            "a.rs should be kept via reuse"
942        );
943        assert!(idx2.files.contains_key("b.rs"));
944
945        let b_has_b2 = idx2
946            .chunks
947            .iter()
948            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
949        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
950
951        // Restore permissions to avoid cleanup surprises.
952        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
953        perms.set_mode(0o644);
954        let _ = std::fs::set_permissions(&a_path, perms);
955    }
956
957    #[test]
958    fn load_quarantines_oversized_index() {
959        let _env = crate::core::data_dir::test_env_lock();
960        let td = tempdir().expect("tempdir");
961        let root = td.path();
962        let dir = crate::core::index_namespace::vectors_dir(root);
963        std::fs::create_dir_all(&dir).expect("create vectors dir");
964
965        let index_path = dir.join("bm25_index.json");
966        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
967        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
968
969        let result = BM25Index::load(root);
970        assert!(result.is_none(), "oversized index should return None");
971        assert!(
972            !index_path.exists(),
973            "original index should be removed after quarantine"
974        );
975        assert!(
976            dir.join("bm25_index.json.quarantined").exists(),
977            "quarantined file should exist"
978        );
979
980        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
981    }
982
983    #[test]
984    fn save_refuses_oversized_output() {
985        let _env = crate::core::data_dir::test_env_lock();
986        let data_dir = tempdir().expect("data_dir");
987        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
988        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
989
990        let td = tempdir().expect("tempdir");
991        let root = td.path();
992
993        let mut index = BM25Index::new();
994        index.add_chunk(CodeChunk {
995            file_path: "a.rs".into(),
996            symbol_name: "a".into(),
997            kind: ChunkKind::Function,
998            start_line: 1,
999            end_line: 1,
1000            content: "fn a() {}".into(),
1001            tokens: tokenize("fn a"),
1002            token_count: 2,
1003        });
1004        index.finalize();
1005
1006        let _ = index.save(root);
1007        let index_path = BM25Index::index_file_path(root);
1008        assert!(
1009            !index_path.exists(),
1010            "save should refuse to persist oversized index"
1011        );
1012
1013        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1014    }
1015
1016    #[test]
1017    fn save_writes_project_root_marker() {
1018        let td = tempdir().expect("tempdir");
1019        let root = td.path();
1020        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1021
1022        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1023        let index = BM25Index::build_from_directory(root);
1024        index.save(root).expect("save");
1025
1026        let dir = crate::core::index_namespace::vectors_dir(root);
1027        let marker = dir.join("project_root.txt");
1028        assert!(marker.exists(), "project_root.txt marker should exist");
1029        let content = std::fs::read_to_string(&marker).expect("read marker");
1030        assert_eq!(content, root.to_string_lossy());
1031    }
1032
1033    #[test]
1034    fn list_code_files_skips_default_vendor_ignores() {
1035        let td = tempdir().expect("tempdir");
1036        let root = td.path();
1037
1038        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1039        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1040        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1041        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1042        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1043
1044        let files = list_code_files(root);
1045        assert!(
1046            files.iter().any(|f| f == "main.rs"),
1047            "main.rs should be included"
1048        );
1049        assert!(
1050            !files.iter().any(|f| f.starts_with("vendor/")),
1051            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1052        );
1053        assert!(
1054            !files.iter().any(|f| f.starts_with("dist/")),
1055            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1056        );
1057    }
1058
1059    #[test]
1060    fn list_code_files_respects_max_files_cap() {
1061        let td = tempdir().expect("tempdir");
1062        let root = td.path();
1063
1064        // Create more files than MAX_BM25_FILES wouldn't let us test easily (5000),
1065        // but we can verify the cap constant exists and the function returns a bounded vec.
1066        for i in 0..10 {
1067            std::fs::write(
1068                root.join(format!("f{i}.rs")),
1069                format!("pub fn f{i}() {{}}\n"),
1070            )
1071            .expect("write");
1072        }
1073        let files = list_code_files(root);
1074        assert!(
1075            files.len() <= MAX_BM25_FILES,
1076            "file count should not exceed MAX_BM25_FILES"
1077        );
1078    }
1079
1080    #[test]
1081    fn max_bm25_cache_bytes_reads_env() {
1082        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1083        let bytes = max_bm25_cache_bytes();
1084        assert_eq!(bytes, 64 * 1024 * 1024);
1085        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1086    }
1087}