Skip to main content

lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files `list_code_files` collects for a single index; the walk stops
/// (with a warning) once this many files have been gathered.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning suggesting extra ignore patterns.
const CHUNK_COUNT_WARNING: usize = 50_000;

/// Glob patterns that are always excluded from indexing. `list_code_files` matches them
/// against each root-relative path, together with `extra_ignore_patterns` from the config.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29        .ok()
30        .and_then(|v| v.parse::<u64>().ok())
31        .unwrap_or_else(|| {
32            let cfg = crate::core::config::Config::load();
33            let profile = crate::core::config::MemoryProfile::effective(&cfg);
34            let profile_mb = profile.bm25_max_cache_mb();
35            if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
36                profile_mb
37            } else {
38                cfg.bm25_max_cache_mb
39            }
40        });
41    mb * 1024 * 1024
42}
43
/// One indexed unit of source text (a detected symbol's block or a fallback segment).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the file the chunk came from (root-relative when built by `BM25Index`).
    pub file_path: String,
    /// Name of the detected symbol, or a synthetic name for fallback chunks.
    pub symbol_name: String,
    /// Category of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk (1-based in the chunks produced by `extract_chunks`).
    pub start_line: usize,
    /// Last line of the chunk (1-based in the chunks produced by `extract_chunks`).
    pub end_line: usize,
    /// Text of the chunk; this is what gets tokenized and searched.
    pub content: String,
    /// Never serialized and defaulted on load; `BM25Index::add_chunk` stores it empty.
    #[serde(skip_serializing, default)]
    pub tokens: Vec<String>,
    /// Number of tokens in `content`; recomputed by `BM25Index::add_chunk`.
    pub token_count: usize,
}
56
/// Category assigned to a [`CodeChunk`].
///
/// The line-based detector in this file maps enums, traits and interfaces to `Struct`
/// and uses `Module` for fallback chunks of files without detected symbols.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
}
67
/// Snapshot of a file's metadata taken at indexing time; two equal snapshots are treated
/// as "file unchanged" by the staleness check and the incremental rebuild.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
73
74impl IndexedFileState {
75    fn from_path(path: &Path) -> Option<Self> {
76        let meta = path.metadata().ok()?;
77        let size_bytes = meta.len();
78        let mtime_ms = meta
79            .modified()
80            .ok()
81            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
82            .map(|d| d.as_millis() as u64)?;
83        Some(Self {
84            mtime_ms,
85            size_bytes,
86        })
87    }
88}
89
/// BM25 index over code chunks, persisted to disk by `save` and restored by `load`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; positions in this vector are the chunk indices used below.
    pub chunks: Vec<CodeChunk>,
    /// Lower-cased token -> postings `(chunk index, weight)`. `add_chunk` pushes one
    /// posting with weight `1.0` per occurrence; `search` sums them per chunk.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, computed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks, set by `finalize`.
    pub doc_count: usize,
    /// Lower-cased token -> number of chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Per-file metadata snapshot keyed by root-relative path. Defaults to empty when
    /// absent from a serialized index; an empty map is treated as a legacy index.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
100
/// One ranked hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Index of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score over all query tokens (higher is better).
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// The first five lines of the chunk's content.
    pub snippet: String,
}
112
/// BM25 `k1` parameter: controls how quickly repeated occurrences of a term saturate.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter: weight of the chunk-length normalisation term in `search`.
const BM25_B: f64 = 0.75;
115
/// `Default` is the empty index, identical to [`BM25Index::new`].
impl Default for BM25Index {
    fn default() -> Self {
        Self::new()
    }
}
121
122impl BM25Index {
123    pub fn new() -> Self {
124        Self {
125            chunks: Vec::new(),
126            inverted: HashMap::new(),
127            avg_doc_len: 0.0,
128            doc_count: 0,
129            doc_freqs: HashMap::new(),
130            files: HashMap::new(),
131        }
132    }
133
134    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
135    #[cfg(test)]
136    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
137        let mut index = Self::new();
138        for mut chunk in chunks {
139            if chunk.token_count == 0 {
140                chunk.token_count = tokenize(&chunk.content).len();
141            }
142            index.add_chunk(chunk);
143        }
144        index.finalize();
145        index
146    }
147
148    pub fn build_from_directory(root: &Path) -> Self {
149        let mut index = Self::new();
150        let files = list_code_files(root);
151        for rel in files {
152            let abs = root.join(&rel);
153            let Some(state) = IndexedFileState::from_path(&abs) else {
154                continue;
155            };
156            if let Ok(content) = std::fs::read_to_string(&abs) {
157                let mut chunks = extract_chunks(&rel, &content);
158                chunks.sort_by(|a, b| {
159                    a.start_line
160                        .cmp(&b.start_line)
161                        .then_with(|| a.end_line.cmp(&b.end_line))
162                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
163                });
164                for chunk in chunks {
165                    index.add_chunk(chunk);
166                }
167                index.files.insert(rel, state);
168            }
169        }
170
171        index.finalize();
172        index
173    }
174
175    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
176        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
177        for c in &prev.chunks {
178            old_by_file
179                .entry(c.file_path.clone())
180                .or_default()
181                .push(c.clone());
182        }
183        for v in old_by_file.values_mut() {
184            v.sort_by(|a, b| {
185                a.start_line
186                    .cmp(&b.start_line)
187                    .then_with(|| a.end_line.cmp(&b.end_line))
188                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
189            });
190        }
191
192        let mut index = Self::new();
193        let files = list_code_files(root);
194        for rel in files {
195            let abs = root.join(&rel);
196            let Some(state) = IndexedFileState::from_path(&abs) else {
197                continue;
198            };
199
200            let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
201            if unchanged {
202                if let Some(chunks) = old_by_file.get(&rel) {
203                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
204                        for chunk in chunks {
205                            index.add_chunk(chunk.clone());
206                        }
207                        index.files.insert(rel, state);
208                        continue;
209                    }
210                }
211            }
212
213            if let Ok(content) = std::fs::read_to_string(&abs) {
214                let mut chunks = extract_chunks(&rel, &content);
215                chunks.sort_by(|a, b| {
216                    a.start_line
217                        .cmp(&b.start_line)
218                        .then_with(|| a.end_line.cmp(&b.end_line))
219                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
220                });
221                for chunk in chunks {
222                    index.add_chunk(chunk);
223                }
224                index.files.insert(rel, state);
225            }
226        }
227
228        index.finalize();
229        index
230    }
231
232    fn add_chunk(&mut self, chunk: CodeChunk) {
233        let idx = self.chunks.len();
234
235        let tokens = tokenize(&chunk.content);
236        for token in &tokens {
237            let lower = token.to_lowercase();
238            let postings = self.inverted.entry(lower.clone()).or_default();
239            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
240                *self.doc_freqs.entry(lower).or_insert(0) += 1;
241            }
242            postings.push((idx, 1.0));
243        }
244
245        self.chunks.push(CodeChunk {
246            token_count: tokens.len(),
247            tokens: Vec::new(),
248            ..chunk
249        });
250    }
251
252    fn finalize(&mut self) {
253        self.doc_count = self.chunks.len();
254        if self.doc_count == 0 {
255            return;
256        }
257
258        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
259        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
260    }
261
262    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
263        let query_tokens = tokenize(query);
264        if query_tokens.is_empty() || self.doc_count == 0 {
265            return Vec::new();
266        }
267
268        let mut scores: HashMap<usize, f64> = HashMap::new();
269
270        for token in &query_tokens {
271            let lower = token.to_lowercase();
272            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
273            if df == 0.0 {
274                continue;
275            }
276
277            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
278
279            if let Some(postings) = self.inverted.get(&lower) {
280                let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
281                for (idx, weight) in postings {
282                    *doc_tfs.entry(*idx).or_insert(0.0) += weight;
283                }
284
285                for (doc_idx, tf) in &doc_tfs {
286                    let doc_len = self.chunks[*doc_idx].token_count as f64;
287                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
288                    let bm25 = idf * (tf * (BM25_K1 + 1.0))
289                        / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
290
291                    *scores.entry(*doc_idx).or_insert(0.0) += bm25;
292                }
293            }
294        }
295
296        let mut results: Vec<SearchResult> = scores
297            .into_iter()
298            .map(|(idx, score)| {
299                let chunk = &self.chunks[idx];
300                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
301                SearchResult {
302                    chunk_idx: idx,
303                    score,
304                    file_path: chunk.file_path.clone(),
305                    symbol_name: chunk.symbol_name.clone(),
306                    kind: chunk.kind.clone(),
307                    start_line: chunk.start_line,
308                    end_line: chunk.end_line,
309                    snippet,
310                }
311            })
312            .collect();
313
314        results.sort_by(|a, b| {
315            b.score
316                .partial_cmp(&a.score)
317                .unwrap_or(std::cmp::Ordering::Equal)
318                .then_with(|| a.file_path.cmp(&b.file_path))
319                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
320                .then_with(|| a.start_line.cmp(&b.start_line))
321                .then_with(|| a.end_line.cmp(&b.end_line))
322        });
323        results.truncate(top_k);
324        results
325    }
326
327    pub fn save(&self, root: &Path) -> std::io::Result<()> {
328        if self.chunks.len() > CHUNK_COUNT_WARNING {
329            tracing::warn!(
330                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
331                self.chunks.len(),
332                CHUNK_COUNT_WARNING
333            );
334        }
335
336        let dir = index_dir(root);
337        std::fs::create_dir_all(&dir)?;
338        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
339            .map_err(|e| std::io::Error::other(e.to_string()))?;
340
341        let max_bytes = max_bm25_cache_bytes();
342        if data.len() as u64 > max_bytes {
343            tracing::warn!(
344                "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
345                data.len() as f64 / 1_048_576.0,
346                max_bytes / (1024 * 1024),
347                dir.display()
348            );
349            return Ok(());
350        }
351
352        let target = dir.join("bm25_index.bin");
353        let tmp = dir.join("bm25_index.bin.tmp");
354        std::fs::write(&tmp, &data)?;
355        std::fs::rename(&tmp, &target)?;
356
357        let _ = std::fs::remove_file(dir.join("bm25_index.json"));
358
359        let _ = std::fs::write(
360            dir.join("project_root.txt"),
361            root.to_string_lossy().as_bytes(),
362        );
363
364        Ok(())
365    }
366
367    pub fn load(root: &Path) -> Option<Self> {
368        let dir = index_dir(root);
369        let max_bytes = max_bm25_cache_bytes();
370
371        let bin_path = dir.join("bm25_index.bin");
372        if bin_path.exists() {
373            let meta = std::fs::metadata(&bin_path).ok()?;
374            if meta.len() > max_bytes {
375                tracing::warn!(
376                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
377                    meta.len() as f64 / 1_073_741_824.0,
378                    max_bytes / (1024 * 1024),
379                    bin_path.display()
380                );
381                let quarantined = bin_path.with_extension("bin.quarantined");
382                let _ = std::fs::rename(&bin_path, &quarantined);
383                return None;
384            }
385            let data = std::fs::read(&bin_path).ok()?;
386            let (idx, _): (Self, _) =
387                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
388            return Some(idx);
389        }
390
391        let json_path = dir.join("bm25_index.json");
392        if json_path.exists() {
393            let meta = std::fs::metadata(&json_path).ok()?;
394            if meta.len() > max_bytes {
395                tracing::warn!(
396                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
397                    meta.len() as f64 / 1_073_741_824.0,
398                    max_bytes / (1024 * 1024),
399                    json_path.display()
400                );
401                let quarantined = json_path.with_extension("json.quarantined");
402                let _ = std::fs::rename(&json_path, &quarantined);
403                return None;
404            }
405            let data = std::fs::read_to_string(&json_path).ok()?;
406            return serde_json::from_str(&data).ok();
407        }
408
409        None
410    }
411
412    pub fn load_or_build(root: &Path) -> Self {
413        if let Some(idx) = Self::load(root) {
414            if !bm25_index_looks_stale(&idx, root) {
415                return idx;
416            }
417            tracing::warn!(
418                "[bm25_index: stale index detected for {}; rebuilding]",
419                root.display()
420            );
421            let rebuilt = if idx.files.is_empty() {
422                Self::build_from_directory(root)
423            } else {
424                Self::rebuild_incremental(root, &idx)
425            };
426            let _ = rebuilt.save(root);
427            return rebuilt;
428        }
429
430        let built = Self::build_from_directory(root);
431        let _ = built.save(root);
432        built
433    }
434
435    pub fn index_file_path(root: &Path) -> PathBuf {
436        let dir = index_dir(root);
437        let bin = dir.join("bm25_index.bin");
438        if bin.exists() {
439            return bin;
440        }
441        dir.join("bm25_index.json")
442    }
443}
444
445fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
446    if index.chunks.is_empty() {
447        return false;
448    }
449
450    if index.files.is_empty() {
451        // Legacy index (pre file-state tracking): only detect missing files.
452        let mut seen = std::collections::HashSet::<&str>::new();
453        for chunk in &index.chunks {
454            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
455            if rel.is_empty() {
456                continue;
457            }
458            if !seen.insert(rel) {
459                continue;
460            }
461            if !root.join(rel).exists() {
462                return true;
463            }
464        }
465        return false;
466    }
467
468    // Missing or modified tracked files.
469    for (rel, old_state) in &index.files {
470        let abs = root.join(rel);
471        if !abs.exists() {
472            return true;
473        }
474        let Some(cur) = IndexedFileState::from_path(&abs) else {
475            return true;
476        };
477        if &cur != old_state {
478            return true;
479        }
480    }
481
482    // New files (present on disk but not in index).
483    for rel in list_code_files(root) {
484        if !index.files.contains_key(&rel) {
485            return true;
486        }
487    }
488
489    false
490}
491
/// Directory in which the index files for `root` are stored; delegates to
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
495
496fn list_code_files(root: &Path) -> Vec<String> {
497    let walker = ignore::WalkBuilder::new(root)
498        .hidden(true)
499        .git_ignore(true)
500        .git_global(true)
501        .git_exclude(true)
502        .build();
503
504    let cfg = crate::core::config::Config::load();
505    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
506        .iter()
507        .filter_map(|p| glob::Pattern::new(p).ok())
508        .collect();
509    ignore_patterns.extend(
510        cfg.extra_ignore_patterns
511            .iter()
512            .filter_map(|p| glob::Pattern::new(p).ok()),
513    );
514
515    let mut files: Vec<String> = Vec::new();
516    for entry in walker.flatten() {
517        let path = entry.path();
518        if !path.is_file() {
519            continue;
520        }
521        if !is_code_file(path) {
522            continue;
523        }
524        let rel = path
525            .strip_prefix(root)
526            .unwrap_or(path)
527            .to_string_lossy()
528            .to_string();
529        if rel.is_empty() {
530            continue;
531        }
532        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
533            continue;
534        }
535        if files.len() >= MAX_BM25_FILES {
536            tracing::warn!(
537                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
538                root.display()
539            );
540            break;
541        }
542        files.push(rel);
543    }
544
545    files.sort();
546    files.dedup();
547    files
548}
549
/// Returns `true` when the extension of `path` (compared case-insensitively) is one of
/// the source-code extensions the index understands. Paths without an extension, or with
/// a non-UTF-8 extension, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
586
587fn tokenize(text: &str) -> Vec<String> {
588    let mut tokens = Vec::new();
589    let mut current = String::new();
590
591    for ch in text.chars() {
592        if ch.is_alphanumeric() || ch == '_' {
593            current.push(ch);
594        } else {
595            if current.len() >= 2 {
596                tokens.push(current.clone());
597            }
598            current.clear();
599        }
600    }
601    if current.len() >= 2 {
602        tokens.push(current);
603    }
604
605    split_camel_case_tokens(&tokens)
606}
607
/// Crate-visible entry point to the index tokenizer; returns exactly what `tokenize`
/// returns for `text`.
pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
    tokenize(text)
}
611
/// Expands each token with its camel-case components.
///
/// Every input token is kept as is and followed by its parts, e.g.
/// `calculateTotal` -> `calculateTotal`, `calculate`, `Total`. Parts shorter than two
/// bytes are dropped, and tokens without an internal boundary contribute no parts.
///
/// A boundary lies before an uppercase character when either
/// * the previous character is not uppercase (`parseHTTP` -> `parse` | `HTTP`), or
/// * the next character is lowercase, i.e. an acronym ends and a new word starts
///   (`HTTPServer` -> `HTTP` | `Server`).
///
/// Runs of uppercase letters are therefore kept together, so `HTTP` and `getID` no longer
/// produce truncated fragments such as `HTT` or `getI`.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;

        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let follows_non_upper = !chars[i - 1].is_uppercase();
            let precedes_lower = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if follows_non_upper || precedes_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Emit the tail only when the token was actually split somewhere.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }

    result
}
636
637fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
638    #[cfg(feature = "tree-sitter")]
639    {
640        let ext = std::path::Path::new(file_path)
641            .extension()
642            .and_then(|e| e.to_str())
643            .unwrap_or("");
644        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
645            return chunks;
646        }
647    }
648
649    let lines: Vec<&str> = content.lines().collect();
650    if lines.is_empty() {
651        return Vec::new();
652    }
653
654    let mut chunks = Vec::new();
655    let mut i = 0;
656
657    while i < lines.len() {
658        let trimmed = lines[i].trim();
659
660        if let Some((name, kind)) = detect_symbol(trimmed) {
661            let start = i;
662            let end = find_block_end(&lines, i);
663            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
664            let token_count = tokenize(&block).len();
665
666            chunks.push(CodeChunk {
667                file_path: file_path.to_string(),
668                symbol_name: name,
669                kind,
670                start_line: start + 1,
671                end_line: end + 1,
672                content: block,
673                tokens: Vec::new(),
674                token_count,
675            });
676
677            i = end + 1;
678        } else {
679            i += 1;
680        }
681    }
682
683    if chunks.is_empty() && !content.is_empty() {
684        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
685        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
686        //
687        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
688        let bytes = content.as_bytes();
689        let rk_chunks = crate::core::rabin_karp::chunk(content);
690        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
691            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
692                let end = (c.offset + c.length).min(bytes.len());
693                let slice = &bytes[c.offset..end];
694                let chunk_text = String::from_utf8_lossy(slice).into_owned();
695                let token_count = tokenize(&chunk_text).len();
696                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
697                let end_line = start_line + bytecount::count(slice, b'\n');
698                chunks.push(CodeChunk {
699                    file_path: file_path.to_string(),
700                    symbol_name: format!("{file_path}#chunk-{idx}"),
701                    kind: ChunkKind::Module,
702                    start_line,
703                    end_line: end_line.max(start_line),
704                    content: chunk_text,
705                    tokens: Vec::new(),
706                    token_count,
707                });
708            }
709        } else {
710            let token_count = tokenize(content).len();
711            let snippet = lines
712                .iter()
713                .take(50)
714                .copied()
715                .collect::<Vec<_>>()
716                .join("\n");
717            chunks.push(CodeChunk {
718                file_path: file_path.to_string(),
719                symbol_name: file_path.to_string(),
720                kind: ChunkKind::Module,
721                start_line: 1,
722                end_line: lines.len(),
723                content: snippet,
724                tokens: Vec::new(),
725                token_count,
726            });
727        }
728    }
729
730    chunks
731}
732
733fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
734    let trimmed = line.trim();
735
736    let patterns: &[(&str, ChunkKind)] = &[
737        ("pub async fn ", ChunkKind::Function),
738        ("async fn ", ChunkKind::Function),
739        ("pub fn ", ChunkKind::Function),
740        ("fn ", ChunkKind::Function),
741        ("pub struct ", ChunkKind::Struct),
742        ("struct ", ChunkKind::Struct),
743        ("pub enum ", ChunkKind::Struct),
744        ("enum ", ChunkKind::Struct),
745        ("impl ", ChunkKind::Impl),
746        ("pub trait ", ChunkKind::Struct),
747        ("trait ", ChunkKind::Struct),
748        ("export function ", ChunkKind::Function),
749        ("export async function ", ChunkKind::Function),
750        ("export default function ", ChunkKind::Function),
751        ("function ", ChunkKind::Function),
752        ("async function ", ChunkKind::Function),
753        ("export class ", ChunkKind::Class),
754        ("class ", ChunkKind::Class),
755        ("export interface ", ChunkKind::Struct),
756        ("interface ", ChunkKind::Struct),
757        ("def ", ChunkKind::Function),
758        ("async def ", ChunkKind::Function),
759        ("class ", ChunkKind::Class),
760        ("func ", ChunkKind::Function),
761    ];
762
763    for (prefix, kind) in patterns {
764        if let Some(rest) = trimmed.strip_prefix(prefix) {
765            let name: String = rest
766                .chars()
767                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
768                .take_while(|c| *c != '<')
769                .collect();
770            if !name.is_empty() {
771                return Some((name, kind.clone()));
772            }
773        }
774    }
775
776    None
777}
778
/// Returns the index (into `lines`) of the last line of the block that starts at `start`.
///
/// Heuristics, in order:
/// * Brace-delimited bodies: the block ends on the line where the first `{` opened at or
///   after `start` is closed again. Parentheses and square brackets are tracked only
///   while looking for that opening brace, so a parameter list such as `fn f(a: i32) {`
///   neither ends the block nor hides a multi-line signature; braces nested inside them
///   (default arguments and the like) are ignored.
/// * Body-less declarations: a `;` seen before any body was opened (and outside
///   parentheses or brackets) ends the block on that line, e.g. `struct Meters(u32);`.
/// * Indentation-delimited bodies: after a grace window of two lines, the block ends
///   before the first blank line or the first line that is not indented deeper than the
///   line at `start`.
///
/// If none of these fire, the block is capped at 50 lines past `start` (or the last line).
fn find_block_end(lines: &[&str], start: usize) -> usize {
    fn indent_width(line: &str) -> usize {
        line.chars().take_while(|c| *c == ' ' || *c == '\t').count()
    }

    let base_indent = lines.get(start).map_or(0, |line| indent_width(line));
    let mut brace_depth = 0usize;
    // Open `(` / `[` count; only maintained until the body's opening brace is found.
    let mut nesting = 0usize;
    let mut found_open = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' if found_open || nesting == 0 => {
                    brace_depth += 1;
                    found_open = true;
                }
                '}' if brace_depth > 0 => {
                    brace_depth -= 1;
                    if brace_depth == 0 {
                        return i;
                    }
                }
                '(' | '[' if !found_open => nesting += 1,
                ')' | ']' if !found_open => nesting = nesting.saturating_sub(1),
                ';' if !found_open && nesting == 0 => return i,
                _ => {}
            }
        }

        // Indentation heuristic for bodies without braces. It is skipped while a
        // parameter list is still open and during the first lines after `start`, which
        // may hold a multi-line signature or clauses at the definition's own indentation.
        if !found_open && nesting == 0 && i > start + 2 {
            let is_blank = line.trim().is_empty();
            if is_blank || indent_width(line) <= base_indent {
                return i.saturating_sub(1);
            }
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
816
817pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
818    if results.is_empty() {
819        return "No results found.".to_string();
820    }
821
822    let mut out = String::new();
823    for (i, r) in results.iter().enumerate() {
824        if compact {
825            out.push_str(&format!(
826                "{}. {:.2} {}:{}-{} {:?} {}\n",
827                i + 1,
828                r.score,
829                r.file_path,
830                r.start_line,
831                r.end_line,
832                r.kind,
833                r.symbol_name,
834            ));
835        } else {
836            out.push_str(&format!(
837                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
838                i + 1,
839                r.score,
840                r.file_path,
841                r.symbol_name,
842                r.kind,
843                r.start_line,
844                r.end_line,
845                r.snippet,
846            ));
847        }
848    }
849    out
850}
851
852#[cfg(test)]
853mod tests {
854    use super::*;
855    use tempfile::tempdir;
856
857    #[cfg(unix)]
858    use std::os::unix::fs::PermissionsExt;
859
860    #[test]
861    fn tokenize_splits_code() {
862        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
863        assert!(tokens.contains(&"calculate_total".to_string()));
864        assert!(tokens.contains(&"items".to_string()));
865        assert!(tokens.contains(&"Vec".to_string()));
866    }
867
868    #[test]
869    fn camel_case_splitting() {
870        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
871        assert!(tokens.contains(&"calculateTotal".to_string()));
872        assert!(tokens.contains(&"calculate".to_string()));
873        assert!(tokens.contains(&"Total".to_string()));
874    }
875
876    #[test]
877    fn detect_rust_function() {
878        let (name, kind) =
879            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
880        assert_eq!(name, "process_request");
881        assert_eq!(kind, ChunkKind::Function);
882    }
883
884    #[test]
885    fn bm25_search_finds_relevant() {
886        let mut index = BM25Index::new();
887        index.add_chunk(CodeChunk {
888            file_path: "auth.rs".into(),
889            symbol_name: "validate_token".into(),
890            kind: ChunkKind::Function,
891            start_line: 1,
892            end_line: 10,
893            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
894            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
895            token_count: 8,
896        });
897        index.add_chunk(CodeChunk {
898            file_path: "db.rs".into(),
899            symbol_name: "connect_database".into(),
900            kind: ChunkKind::Function,
901            start_line: 1,
902            end_line: 5,
903            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
904            tokens: tokenize("fn connect_database url str Pool create_pool url"),
905            token_count: 7,
906        });
907        index.finalize();
908
909        let results = index.search("jwt token validation", 5);
910        assert!(!results.is_empty());
911        assert_eq!(results[0].symbol_name, "validate_token");
912    }
913
914    #[test]
915    fn bm25_search_sorts_ties_deterministically() {
916        let mut index = BM25Index::new();
917
918        // Insert in reverse path order to ensure the sort tie-break matters.
919        index.add_chunk(CodeChunk {
920            file_path: "b.rs".into(),
921            symbol_name: "same".into(),
922            kind: ChunkKind::Function,
923            start_line: 1,
924            end_line: 1,
925            content: "fn same() {}".into(),
926            tokens: tokenize("same token"),
927            token_count: 2,
928        });
929        index.add_chunk(CodeChunk {
930            file_path: "a.rs".into(),
931            symbol_name: "same".into(),
932            kind: ChunkKind::Function,
933            start_line: 1,
934            end_line: 1,
935            content: "fn same() {}".into(),
936            tokens: tokenize("same token"),
937            token_count: 2,
938        });
939        index.finalize();
940
941        let results = index.search("same", 10);
942        assert!(results.len() >= 2);
943        assert_eq!(results[0].file_path, "a.rs");
944        assert_eq!(results[1].file_path, "b.rs");
945    }
946
947    #[test]
948    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
949        let td = tempdir().expect("tempdir");
950        let root = td.path();
951        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
952
953        let idx = BM25Index::build_from_directory(root);
954        assert!(!bm25_index_looks_stale(&idx, root));
955
956        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
957        assert!(bm25_index_looks_stale(&idx, root));
958    }
959
960    #[test]
961    #[cfg(unix)]
962    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
963        let td = tempdir().expect("tempdir");
964        let root = td.path();
965
966        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
967        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
968
969        let idx1 = BM25Index::build_from_directory(root);
970        assert!(idx1.files.contains_key("a.rs"));
971        assert!(idx1.files.contains_key("b.rs"));
972
973        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
974        let a_path = root.join("a.rs");
975        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
976        perms.set_mode(0o000);
977        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
978
979        // Change b.rs (size changes) to force a re-read for that file.
980        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
981            .expect("rewrite b.rs");
982
983        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
984        assert!(
985            idx2.files.contains_key("a.rs"),
986            "a.rs should be kept via reuse"
987        );
988        assert!(idx2.files.contains_key("b.rs"));
989
990        let b_has_b2 = idx2
991            .chunks
992            .iter()
993            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
994        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
995
996        // Restore permissions to avoid cleanup surprises.
997        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
998        perms.set_mode(0o644);
999        let _ = std::fs::set_permissions(&a_path, perms);
1000    }
1001
1002    #[test]
1003    fn load_quarantines_oversized_index() {
1004        let _env = crate::core::data_dir::test_env_lock();
1005        let td = tempdir().expect("tempdir");
1006        let root = td.path();
1007        let dir = crate::core::index_namespace::vectors_dir(root);
1008        std::fs::create_dir_all(&dir).expect("create vectors dir");
1009
1010        let index_path = dir.join("bm25_index.json");
1011        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1012        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
1013
1014        let result = BM25Index::load(root);
1015        assert!(result.is_none(), "oversized index should return None");
1016        assert!(
1017            !index_path.exists(),
1018            "original index should be removed after quarantine"
1019        );
1020        assert!(
1021            dir.join("bm25_index.json.quarantined").exists(),
1022            "quarantined file should exist"
1023        );
1024
1025        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1026    }
1027
1028    #[test]
1029    fn save_refuses_oversized_output() {
1030        let _env = crate::core::data_dir::test_env_lock();
1031        let data_dir = tempdir().expect("data_dir");
1032        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1033        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1034
1035        let td = tempdir().expect("tempdir");
1036        let root = td.path();
1037
1038        let mut index = BM25Index::new();
1039        index.add_chunk(CodeChunk {
1040            file_path: "a.rs".into(),
1041            symbol_name: "a".into(),
1042            kind: ChunkKind::Function,
1043            start_line: 1,
1044            end_line: 1,
1045            content: "fn a() {}".into(),
1046            tokens: tokenize("fn a"),
1047            token_count: 2,
1048        });
1049        index.finalize();
1050
1051        let _ = index.save(root);
1052        let index_path = BM25Index::index_file_path(root);
1053        assert!(
1054            !index_path.exists(),
1055            "save should refuse to persist oversized index"
1056        );
1057
1058        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1059    }
1060
1061    #[test]
1062    fn save_writes_project_root_marker() {
1063        let _env = crate::core::data_dir::test_env_lock();
1064        let td = tempdir().expect("tempdir");
1065        let root = td.path();
1066        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1067
1068        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1069        let index = BM25Index::build_from_directory(root);
1070        index.save(root).expect("save");
1071
1072        let dir = crate::core::index_namespace::vectors_dir(root);
1073        let marker = dir.join("project_root.txt");
1074        assert!(marker.exists(), "project_root.txt marker should exist");
1075        let content = std::fs::read_to_string(&marker).expect("read marker");
1076        assert_eq!(content, root.to_string_lossy());
1077    }
1078
1079    #[test]
1080    fn list_code_files_skips_default_vendor_ignores() {
1081        let td = tempdir().expect("tempdir");
1082        let root = td.path();
1083
1084        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1085        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1086        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1087        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1088        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1089
1090        let files = list_code_files(root);
1091        assert!(
1092            files.iter().any(|f| f == "main.rs"),
1093            "main.rs should be included"
1094        );
1095        assert!(
1096            !files.iter().any(|f| f.starts_with("vendor/")),
1097            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1098        );
1099        assert!(
1100            !files.iter().any(|f| f.starts_with("dist/")),
1101            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1102        );
1103    }
1104
1105    #[test]
1106    fn list_code_files_respects_max_files_cap() {
1107        let td = tempdir().expect("tempdir");
1108        let root = td.path();
1109
1110        // Create more files than MAX_BM25_FILES wouldn't let us test easily (5000),
1111        // but we can verify the cap constant exists and the function returns a bounded vec.
1112        for i in 0..10 {
1113            std::fs::write(
1114                root.join(format!("f{i}.rs")),
1115                format!("pub fn f{i}() {{}}\n"),
1116            )
1117            .expect("write");
1118        }
1119        let files = list_code_files(root);
1120        assert!(
1121            files.len() <= MAX_BM25_FILES,
1122            "file count should not exceed MAX_BM25_FILES"
1123        );
1124    }
1125
1126    #[test]
1127    fn max_bm25_cache_bytes_reads_env() {
1128        let _env = crate::core::data_dir::test_env_lock();
1129        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1130        let bytes = max_bm25_cache_bytes();
1131        assert_eq!(bytes, 64 * 1024 * 1024);
1132        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1133    }
1134}