Skip to main content

lean_ctx/core/
bm25_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
7const MAX_BM25_FILES: usize = 5000;
8const CHUNK_COUNT_WARNING: usize = 50_000;
9const ZSTD_LEVEL: i32 = 9;
10
11const DEFAULT_BM25_IGNORES: &[&str] = &[
12    "vendor/**",
13    "dist/**",
14    "build/**",
15    "public/vendor/**",
16    "public/js/**",
17    "public/css/**",
18    "public/build/**",
19    ".next/**",
20    ".nuxt/**",
21    "__pycache__/**",
22    "*.min.js",
23    "*.min.css",
24    "*.bundle.js",
25    "*.chunk.js",
26];
27
28fn max_bm25_cache_bytes() -> u64 {
29    let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30        .ok()
31        .and_then(|v| v.parse::<u64>().ok())
32        .unwrap_or_else(|| {
33            let cfg = crate::core::config::Config::load();
34            let profile = crate::core::config::MemoryProfile::effective(&cfg);
35            let profile_mb = profile.bm25_max_cache_mb();
36            if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37                profile_mb
38            } else {
39                cfg.bm25_max_cache_mb
40            }
41        });
42    mb * 1024 * 1024
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct CodeChunk {
47    pub file_path: String,
48    pub symbol_name: String,
49    pub kind: ChunkKind,
50    pub start_line: usize,
51    pub end_line: usize,
52    pub content: String,
53    #[serde(default)]
54    pub tokens: Vec<String>,
55    pub token_count: usize,
56}
57
58#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
59pub enum ChunkKind {
60    Function,
61    Struct,
62    Impl,
63    Module,
64    Class,
65    Method,
66    Other,
67}
68
69#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
70pub struct IndexedFileState {
71    pub mtime_ms: u64,
72    pub size_bytes: u64,
73}
74
75impl IndexedFileState {
76    fn from_path(path: &Path) -> Option<Self> {
77        let meta = path.metadata().ok()?;
78        let size_bytes = meta.len();
79        let mtime_ms = meta
80            .modified()
81            .ok()
82            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
83            .map(|d| d.as_millis() as u64)?;
84        Some(Self {
85            mtime_ms,
86            size_bytes,
87        })
88    }
89}
90
91#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct BM25Index {
93    pub chunks: Vec<CodeChunk>,
94    pub inverted: HashMap<String, Vec<(usize, f64)>>,
95    pub avg_doc_len: f64,
96    pub doc_count: usize,
97    pub doc_freqs: HashMap<String, usize>,
98    #[serde(default)]
99    pub files: HashMap<String, IndexedFileState>,
100}
101
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct SearchResult {
104    pub chunk_idx: usize,
105    pub score: f64,
106    pub file_path: String,
107    pub symbol_name: String,
108    pub kind: ChunkKind,
109    pub start_line: usize,
110    pub end_line: usize,
111    pub snippet: String,
112}
113
114const BM25_K1: f64 = 1.2;
115const BM25_B: f64 = 0.75;
116
117impl Default for BM25Index {
118    fn default() -> Self {
119        Self::new()
120    }
121}
122
123impl BM25Index {
124    pub fn new() -> Self {
125        Self {
126            chunks: Vec::new(),
127            inverted: HashMap::new(),
128            avg_doc_len: 0.0,
129            doc_count: 0,
130            doc_freqs: HashMap::new(),
131            files: HashMap::new(),
132        }
133    }
134
135    /// Approximate heap memory used by this index in bytes.
136    pub fn memory_usage_bytes(&self) -> usize {
137        let chunks_size: usize = self
138            .chunks
139            .iter()
140            .map(|c| {
141                c.content.len()
142                    + c.file_path.len()
143                    + c.symbol_name.len()
144                    + c.tokens.iter().map(String::len).sum::<usize>()
145                    + 64
146            })
147            .sum();
148        let inverted_size: usize = self
149            .inverted
150            .iter()
151            .map(|(k, v)| k.len() + v.len() * 16 + 32)
152            .sum();
153        let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
154        let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
155        chunks_size + inverted_size + files_size + freqs_size
156    }
157
158    /// Drops all in-memory data, effectively freeing heap. Index can be re-loaded from disk.
159    pub fn unload(&mut self) {
160        let usage = self.memory_usage_bytes();
161        self.chunks = Vec::new();
162        self.inverted = HashMap::new();
163        self.doc_freqs = HashMap::new();
164        self.files = HashMap::new();
165        self.avg_doc_len = 0.0;
166        self.doc_count = 0;
167        tracing::info!(
168            "[bm25] unloaded index, freed ~{:.1}MB",
169            usage as f64 / 1_048_576.0
170        );
171    }
172
173    /// Builds an index from explicit chunks (unit tests; avoids filesystem walking).
174    #[cfg(test)]
175    pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
176        let mut index = Self::new();
177        for mut chunk in chunks {
178            if chunk.token_count == 0 {
179                chunk.token_count = tokenize(&chunk.content).len();
180            }
181            index.add_chunk(chunk);
182        }
183        index.finalize();
184        index
185    }
186
187    pub fn build_from_directory(root: &Path) -> Self {
188        let root_str = root.to_string_lossy();
189        if !super::graph_index::is_safe_scan_root_public(&root_str) {
190            tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
191            return Self::new();
192        }
193        let mut index = Self::new();
194        let files = list_code_files(root);
195        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
196
197        for (i, rel) in files.iter().enumerate() {
198            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
199                tracing::warn!(
200                    "[bm25: stopping build at file {i}/{} due to memory pressure]",
201                    files.len()
202                );
203                break;
204            }
205            if crate::core::memory_guard::abort_requested() {
206                tracing::warn!("[bm25: aborting build due to critical memory pressure]");
207                break;
208            }
209
210            let abs = root.join(rel);
211            let Some(state) = IndexedFileState::from_path(&abs) else {
212                continue;
213            };
214            if state.size_bytes > MAX_FILE_SIZE_BYTES {
215                continue;
216            }
217            if let Ok(content) = std::fs::read_to_string(&abs) {
218                let mut chunks = extract_chunks(rel, &content);
219                chunks.sort_by(|a, b| {
220                    a.start_line
221                        .cmp(&b.start_line)
222                        .then_with(|| a.end_line.cmp(&b.end_line))
223                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
224                });
225                for chunk in chunks {
226                    index.add_chunk(chunk);
227                }
228                index.files.insert(rel.clone(), state);
229            }
230        }
231
232        index.finalize();
233        index
234    }
235
236    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
237        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
238        for c in &prev.chunks {
239            old_by_file
240                .entry(c.file_path.clone())
241                .or_default()
242                .push(c.clone());
243        }
244        for v in old_by_file.values_mut() {
245            v.sort_by(|a, b| {
246                a.start_line
247                    .cmp(&b.start_line)
248                    .then_with(|| a.end_line.cmp(&b.end_line))
249                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
250            });
251        }
252
253        let mut index = Self::new();
254        let files = list_code_files(root);
255        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
256
257        for (i, rel) in files.iter().enumerate() {
258            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
259                tracing::warn!(
260                    "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
261                    files.len()
262                );
263                break;
264            }
265
266            let abs = root.join(rel);
267            let Some(state) = IndexedFileState::from_path(&abs) else {
268                continue;
269            };
270
271            let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
272            if unchanged {
273                if let Some(chunks) = old_by_file.get(rel) {
274                    if chunks.first().is_some_and(|c| !c.content.is_empty()) {
275                        for chunk in chunks {
276                            index.add_chunk(chunk.clone());
277                        }
278                        index.files.insert(rel.clone(), state);
279                        continue;
280                    }
281                }
282            }
283
284            if state.size_bytes > MAX_FILE_SIZE_BYTES {
285                continue;
286            }
287            if let Ok(content) = std::fs::read_to_string(&abs) {
288                let mut chunks = extract_chunks(rel, &content);
289                chunks.sort_by(|a, b| {
290                    a.start_line
291                        .cmp(&b.start_line)
292                        .then_with(|| a.end_line.cmp(&b.end_line))
293                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
294                });
295                for chunk in chunks {
296                    index.add_chunk(chunk);
297                }
298                index.files.insert(rel.clone(), state);
299            }
300        }
301
302        index.finalize();
303        index
304    }
305
306    fn add_chunk(&mut self, chunk: CodeChunk) {
307        let idx = self.chunks.len();
308
309        let enriched = enrich_for_bm25(&chunk);
310        let tokens = tokenize(&enriched);
311        for token in &tokens {
312            let lower = token.to_lowercase();
313            let postings = self.inverted.entry(lower.clone()).or_default();
314            if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
315                *self.doc_freqs.entry(lower).or_insert(0) += 1;
316            }
317            postings.push((idx, 1.0));
318        }
319
320        self.chunks.push(CodeChunk {
321            token_count: tokens.len(),
322            tokens: Vec::new(),
323            ..chunk
324        });
325    }
326
327    fn finalize(&mut self) {
328        self.doc_count = self.chunks.len();
329        if self.doc_count == 0 {
330            return;
331        }
332
333        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
334        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
335    }
336
337    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
338        let query_tokens = tokenize(query);
339        if query_tokens.is_empty() || self.doc_count == 0 {
340            return Vec::new();
341        }
342
343        // Pre-allocated score array: O(1) per-access vs HashMap overhead.
344        // Kolmogorov-optimal: minimal allocation for the scoring operation.
345        let n = self.chunks.len();
346        let mut scores = vec![0.0f64; n];
347        let mut touched = Vec::with_capacity(n.min(256));
348
349        for token in &query_tokens {
350            let lower = token.to_lowercase();
351            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
352            if df == 0.0 {
353                continue;
354            }
355
356            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
357
358            if let Some(postings) = self.inverted.get(&lower) {
359                for &(idx, weight) in postings {
360                    let doc_len = self.chunks[idx].token_count as f64;
361                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
362                    let bm25 = idf * (weight * (BM25_K1 + 1.0))
363                        / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
364
365                    if scores[idx] == 0.0 {
366                        touched.push(idx);
367                    }
368                    scores[idx] += bm25;
369                }
370            }
371        }
372
373        let mut results: Vec<SearchResult> = touched
374            .iter()
375            .filter(|&&idx| scores[idx] > 0.0)
376            .map(|&idx| {
377                let chunk = &self.chunks[idx];
378                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
379                SearchResult {
380                    chunk_idx: idx,
381                    score: scores[idx],
382                    file_path: chunk.file_path.clone(),
383                    symbol_name: chunk.symbol_name.clone(),
384                    kind: chunk.kind.clone(),
385                    start_line: chunk.start_line,
386                    end_line: chunk.end_line,
387                    snippet,
388                }
389            })
390            .collect();
391
392        results.sort_by(|a, b| {
393            b.score
394                .partial_cmp(&a.score)
395                .unwrap_or(std::cmp::Ordering::Equal)
396                .then_with(|| a.file_path.cmp(&b.file_path))
397                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
398                .then_with(|| a.start_line.cmp(&b.start_line))
399                .then_with(|| a.end_line.cmp(&b.end_line))
400        });
401        results.truncate(top_k);
402        results
403    }
404
405    pub fn save(&self, root: &Path) -> std::io::Result<()> {
406        if self.chunks.len() > CHUNK_COUNT_WARNING {
407            tracing::warn!(
408                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
409                self.chunks.len(),
410                CHUNK_COUNT_WARNING
411            );
412        }
413
414        let dir = index_dir(root);
415        std::fs::create_dir_all(&dir)?;
416        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
417            .map_err(|e| std::io::Error::other(e.to_string()))?;
418
419        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
420            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
421
422        let max_bytes = max_bm25_cache_bytes();
423        if compressed.len() as u64 > max_bytes {
424            tracing::warn!(
425                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
426                compressed.len() as f64 / 1_048_576.0,
427                max_bytes / (1024 * 1024),
428                dir.display()
429            );
430            return Ok(());
431        }
432
433        tracing::info!(
434            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
435            data.len() as f64 / 1_048_576.0,
436            compressed.len() as f64 / 1_048_576.0,
437            (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
438        );
439
440        let target = dir.join("bm25_index.bin.zst");
441        let tmp = dir.join("bm25_index.bin.zst.tmp");
442        std::fs::write(&tmp, &compressed)?;
443        std::fs::rename(&tmp, &target)?;
444
445        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
446        let _ = std::fs::remove_file(dir.join("bm25_index.json"));
447
448        let _ = std::fs::write(
449            dir.join("project_root.txt"),
450            root.to_string_lossy().as_bytes(),
451        );
452
453        Ok(())
454    }
455
456    pub fn load(root: &Path) -> Option<Self> {
457        let dir = index_dir(root);
458        let max_bytes = max_bm25_cache_bytes();
459
460        let zst_path = dir.join("bm25_index.bin.zst");
461        if zst_path.exists() {
462            let meta = std::fs::metadata(&zst_path).ok()?;
463            if meta.len() > max_bytes {
464                tracing::warn!(
465                    "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
466                    meta.len() as f64 / 1_073_741_824.0,
467                    max_bytes / (1024 * 1024),
468                    zst_path.display()
469                );
470                let quarantined = zst_path.with_extension("zst.quarantined");
471                let _ = std::fs::rename(&zst_path, &quarantined);
472                return None;
473            }
474            let compressed = std::fs::read(&zst_path).ok()?;
475            let data = zstd::decode_all(compressed.as_slice()).ok()?;
476            let (idx, _): (Self, _) =
477                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
478            return Some(idx);
479        }
480
481        let bin_path = dir.join("bm25_index.bin");
482        if bin_path.exists() {
483            let meta = std::fs::metadata(&bin_path).ok()?;
484            if meta.len() > max_bytes {
485                tracing::warn!(
486                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
487                    meta.len() as f64 / 1_073_741_824.0,
488                    max_bytes / (1024 * 1024),
489                    bin_path.display()
490                );
491                let quarantined = bin_path.with_extension("bin.quarantined");
492                let _ = std::fs::rename(&bin_path, &quarantined);
493                return None;
494            }
495            let data = std::fs::read(&bin_path).ok()?;
496            let (idx, _): (Self, _) =
497                bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
498            // Auto-migrate: compress legacy .bin to .bin.zst
499            if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
500                let zst_tmp = zst_path.with_extension("zst.tmp");
501                if std::fs::write(&zst_tmp, &compressed).is_ok()
502                    && std::fs::rename(&zst_tmp, &zst_path).is_ok()
503                {
504                    tracing::info!(
505                        "[bm25] migrated {:.1} MB → {:.1} MB zstd",
506                        data.len() as f64 / 1_048_576.0,
507                        compressed.len() as f64 / 1_048_576.0
508                    );
509                    let _ = std::fs::remove_file(&bin_path);
510                }
511            }
512            return Some(idx);
513        }
514
515        let json_path = dir.join("bm25_index.json");
516        if json_path.exists() {
517            let meta = std::fs::metadata(&json_path).ok()?;
518            if meta.len() > max_bytes {
519                tracing::warn!(
520                    "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
521                    meta.len() as f64 / 1_073_741_824.0,
522                    max_bytes / (1024 * 1024),
523                    json_path.display()
524                );
525                let quarantined = json_path.with_extension("json.quarantined");
526                let _ = std::fs::rename(&json_path, &quarantined);
527                return None;
528            }
529            let data = std::fs::read_to_string(&json_path).ok()?;
530            return serde_json::from_str(&data).ok();
531        }
532
533        None
534    }
535
536    pub fn load_or_build(root: &Path) -> Self {
537        if !is_safe_bm25_root(root) {
538            return Self::default();
539        }
540        if let Some(idx) = Self::load(root) {
541            if !bm25_index_looks_stale(&idx, root) {
542                return idx;
543            }
544            tracing::warn!(
545                "[bm25_index: stale index detected for {}; rebuilding]",
546                root.display()
547            );
548            let rebuilt = if idx.files.is_empty() {
549                Self::build_from_directory(root)
550            } else {
551                Self::rebuild_incremental(root, &idx)
552            };
553            let _ = rebuilt.save(root);
554            return rebuilt;
555        }
556
557        let built = Self::build_from_directory(root);
558        let _ = built.save(root);
559        built
560    }
561
562    pub fn index_file_path(root: &Path) -> PathBuf {
563        let dir = index_dir(root);
564        let zst = dir.join("bm25_index.bin.zst");
565        if zst.exists() {
566            return zst;
567        }
568        let bin = dir.join("bm25_index.bin");
569        if bin.exists() {
570            return bin;
571        }
572        dir.join("bm25_index.json")
573    }
574}
575
576fn is_safe_bm25_root(root: &Path) -> bool {
577    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
578}
579
580fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
581    if index.chunks.is_empty() {
582        return false;
583    }
584
585    if index.files.is_empty() {
586        // Legacy index (pre file-state tracking): only detect missing files.
587        let mut seen = std::collections::HashSet::<&str>::new();
588        for chunk in &index.chunks {
589            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
590            if rel.is_empty() {
591                continue;
592            }
593            if !seen.insert(rel) {
594                continue;
595            }
596            if !root.join(rel).exists() {
597                return true;
598            }
599        }
600        return false;
601    }
602
603    // Missing or modified tracked files.
604    for (rel, old_state) in &index.files {
605        let abs = root.join(rel);
606        if !abs.exists() {
607            return true;
608        }
609        let Some(cur) = IndexedFileState::from_path(&abs) else {
610            return true;
611        };
612        if &cur != old_state {
613            return true;
614        }
615    }
616
617    // New files (present on disk but not in index).
618    for rel in list_code_files(root) {
619        if !index.files.contains_key(&rel) {
620            return true;
621        }
622    }
623
624    false
625}
626
627fn index_dir(root: &Path) -> PathBuf {
628    crate::core::index_namespace::vectors_dir(root)
629}
630
631fn list_code_files(root: &Path) -> Vec<String> {
632    let walker = ignore::WalkBuilder::new(root)
633        .hidden(true)
634        .git_ignore(true)
635        .git_global(true)
636        .git_exclude(true)
637        .max_depth(Some(20))
638        .build();
639
640    let cfg = crate::core::config::Config::load();
641    let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
642        .iter()
643        .filter_map(|p| glob::Pattern::new(p).ok())
644        .collect();
645    ignore_patterns.extend(
646        cfg.extra_ignore_patterns
647            .iter()
648            .filter_map(|p| glob::Pattern::new(p).ok()),
649    );
650
651    let mut files: Vec<String> = Vec::new();
652    for entry in walker.flatten() {
653        let path = entry.path();
654        if !path.is_file() {
655            continue;
656        }
657        if !is_code_file(path) {
658            continue;
659        }
660        let rel = path
661            .strip_prefix(root)
662            .unwrap_or(path)
663            .to_string_lossy()
664            .to_string();
665        if rel.is_empty() {
666            continue;
667        }
668        if ignore_patterns.iter().any(|p| p.matches(&rel)) {
669            continue;
670        }
671        if files.len() >= MAX_BM25_FILES {
672            tracing::warn!(
673                "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
674                root.display()
675            );
676            break;
677        }
678        files.push(rel);
679    }
680
681    files.sort();
682    files.dedup();
683    files
684}
685
686pub fn is_code_file(path: &Path) -> bool {
687    let ext = path
688        .extension()
689        .and_then(|e| e.to_str())
690        .unwrap_or("")
691        .to_lowercase();
692    matches!(
693        ext.as_str(),
694        "rs" | "ts"
695            | "tsx"
696            | "js"
697            | "jsx"
698            | "py"
699            | "go"
700            | "java"
701            | "c"
702            | "cc"
703            | "cpp"
704            | "h"
705            | "hpp"
706            | "rb"
707            | "cs"
708            | "kt"
709            | "swift"
710            | "php"
711            | "scala"
712            | "sql"
713            | "ex"
714            | "exs"
715            | "zig"
716            | "lua"
717            | "dart"
718            | "vue"
719            | "svelte"
720    )
721}
722
723fn tokenize(text: &str) -> Vec<String> {
724    let mut tokens = Vec::new();
725    let mut current = String::new();
726
727    for ch in text.chars() {
728        if ch.is_alphanumeric() || ch == '_' {
729            current.push(ch);
730        } else {
731            if current.len() >= 2 {
732                tokens.push(current.clone());
733            }
734            current.clear();
735        }
736    }
737    if current.len() >= 2 {
738        tokens.push(current);
739    }
740
741    split_camel_case_tokens(&tokens)
742}
743
744pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
745    tokenize(text)
746}
747
748fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
749    let mut result = Vec::new();
750    for token in tokens {
751        result.push(token.clone());
752        let mut start = 0;
753        let chars: Vec<char> = token.chars().collect();
754        for i in 1..chars.len() {
755            if chars[i].is_uppercase() && (i + 1 >= chars.len() || !chars[i + 1].is_uppercase()) {
756                let part: String = chars[start..i].iter().collect();
757                if part.len() >= 2 {
758                    result.push(part);
759                }
760                start = i;
761            }
762        }
763        if start > 0 {
764            let part: String = chars[start..].iter().collect();
765            if part.len() >= 2 {
766                result.push(part);
767            }
768        }
769    }
770    result
771}
772
773fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
774    #[cfg(feature = "tree-sitter")]
775    {
776        let ext = std::path::Path::new(file_path)
777            .extension()
778            .and_then(|e| e.to_str())
779            .unwrap_or("");
780        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
781            return chunks;
782        }
783    }
784
785    let lines: Vec<&str> = content.lines().collect();
786    if lines.is_empty() {
787        return Vec::new();
788    }
789
790    let mut chunks = Vec::new();
791    let mut i = 0;
792
793    while i < lines.len() {
794        let trimmed = lines[i].trim();
795
796        if let Some((name, kind)) = detect_symbol(trimmed) {
797            let start = i;
798            let end = find_block_end(&lines, i);
799            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
800            let token_count = tokenize(&block).len();
801
802            chunks.push(CodeChunk {
803                file_path: file_path.to_string(),
804                symbol_name: name,
805                kind,
806                start_line: start + 1,
807                end_line: end + 1,
808                content: block,
809                tokens: Vec::new(),
810                token_count,
811            });
812
813            i = end + 1;
814        } else {
815            i += 1;
816        }
817    }
818
819    if chunks.is_empty() && !content.is_empty() {
820        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
821        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
822        //
823        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
824        let bytes = content.as_bytes();
825        let rk_chunks = crate::core::rabin_karp::chunk(content);
826        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
827            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
828                let end = (c.offset + c.length).min(bytes.len());
829                let slice = &bytes[c.offset..end];
830                let chunk_text = String::from_utf8_lossy(slice).into_owned();
831                let token_count = tokenize(&chunk_text).len();
832                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
833                let end_line = start_line + bytecount::count(slice, b'\n');
834                chunks.push(CodeChunk {
835                    file_path: file_path.to_string(),
836                    symbol_name: format!("{file_path}#chunk-{idx}"),
837                    kind: ChunkKind::Module,
838                    start_line,
839                    end_line: end_line.max(start_line),
840                    content: chunk_text,
841                    tokens: Vec::new(),
842                    token_count,
843                });
844            }
845        } else {
846            let token_count = tokenize(content).len();
847            let snippet = lines
848                .iter()
849                .take(50)
850                .copied()
851                .collect::<Vec<_>>()
852                .join("\n");
853            chunks.push(CodeChunk {
854                file_path: file_path.to_string(),
855                symbol_name: file_path.to_string(),
856                kind: ChunkKind::Module,
857                start_line: 1,
858                end_line: lines.len(),
859                content: snippet,
860                tokens: Vec::new(),
861                token_count,
862            });
863        }
864    }
865
866    chunks
867}
868
869fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
870    let trimmed = line.trim();
871
872    let patterns: &[(&str, ChunkKind)] = &[
873        ("pub async fn ", ChunkKind::Function),
874        ("async fn ", ChunkKind::Function),
875        ("pub fn ", ChunkKind::Function),
876        ("fn ", ChunkKind::Function),
877        ("pub struct ", ChunkKind::Struct),
878        ("struct ", ChunkKind::Struct),
879        ("pub enum ", ChunkKind::Struct),
880        ("enum ", ChunkKind::Struct),
881        ("impl ", ChunkKind::Impl),
882        ("pub trait ", ChunkKind::Struct),
883        ("trait ", ChunkKind::Struct),
884        ("export function ", ChunkKind::Function),
885        ("export async function ", ChunkKind::Function),
886        ("export default function ", ChunkKind::Function),
887        ("function ", ChunkKind::Function),
888        ("async function ", ChunkKind::Function),
889        ("export class ", ChunkKind::Class),
890        ("class ", ChunkKind::Class),
891        ("export interface ", ChunkKind::Struct),
892        ("interface ", ChunkKind::Struct),
893        ("def ", ChunkKind::Function),
894        ("async def ", ChunkKind::Function),
895        ("class ", ChunkKind::Class),
896        ("func ", ChunkKind::Function),
897    ];
898
899    for (prefix, kind) in patterns {
900        if let Some(rest) = trimmed.strip_prefix(prefix) {
901            let name: String = rest
902                .chars()
903                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
904                .take_while(|c| *c != '<')
905                .collect();
906            if !name.is_empty() {
907                return Some((name, kind.clone()));
908            }
909        }
910    }
911
912    None
913}
914
915fn find_block_end(lines: &[&str], start: usize) -> usize {
916    let mut depth = 0i32;
917    let mut found_open = false;
918
919    for (i, line) in lines.iter().enumerate().skip(start) {
920        for ch in line.chars() {
921            match ch {
922                '{' | '(' if !found_open || depth > 0 => {
923                    depth += 1;
924                    found_open = true;
925                }
926                '}' | ')' if depth > 0 => {
927                    depth -= 1;
928                    if depth == 0 && found_open {
929                        return i;
930                    }
931                }
932                _ => {}
933            }
934        }
935
936        if found_open && depth <= 0 && i > start {
937            return i;
938        }
939
940        if !found_open && i > start + 2 {
941            let trimmed = lines[i].trim();
942            if trimmed.is_empty()
943                || (!trimmed.starts_with(' ') && !trimmed.starts_with('\t') && i > start)
944            {
945                return i.saturating_sub(1);
946            }
947        }
948    }
949
950    (start + 50).min(lines.len().saturating_sub(1))
951}
952
953pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
954    if results.is_empty() {
955        return "No results found.".to_string();
956    }
957
958    let mut out = String::new();
959    for (i, r) in results.iter().enumerate() {
960        if compact {
961            out.push_str(&format!(
962                "{}. {:.2} {}:{}-{} {:?} {}\n",
963                i + 1,
964                r.score,
965                r.file_path,
966                r.start_line,
967                r.end_line,
968                r.kind,
969                r.symbol_name,
970            ));
971        } else {
972            out.push_str(&format!(
973                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
974                i + 1,
975                r.score,
976                r.file_path,
977                r.symbol_name,
978                r.kind,
979                r.start_line,
980                r.end_line,
981                r.snippet,
982            ));
983        }
984    }
985    out
986}
987
988/// Enrich chunk content with file-path components for BM25 path-matching.
989///
990/// SACL (EMNLP 2025) shows that augmenting code with structural information
991/// improves retrieval by 7-12.8%. We append the file stem twice (for boost)
992/// and the immediate parent directory once, enabling queries like "auth handler"
993/// to match `src/auth/handler.rs`.
994fn enrich_for_bm25(chunk: &CodeChunk) -> String {
995    let path = Path::new(&chunk.file_path);
996    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
997    let dir = path
998        .parent()
999        .and_then(|p| p.file_name())
1000        .and_then(|d| d.to_str())
1001        .unwrap_or("");
1002
1003    if stem.is_empty() {
1004        return chunk.content.clone();
1005    }
1006
1007    format!("{} {} {} {}", chunk.content, stem, stem, dir)
1008}
1009
1010#[cfg(test)]
1011mod tests {
1012    use super::*;
1013    use tempfile::tempdir;
1014
1015    #[cfg(unix)]
1016    use std::os::unix::fs::PermissionsExt;
1017
1018    #[test]
1019    fn tokenize_splits_code() {
1020        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
1021        assert!(tokens.contains(&"calculate_total".to_string()));
1022        assert!(tokens.contains(&"items".to_string()));
1023        assert!(tokens.contains(&"Vec".to_string()));
1024    }
1025
1026    #[test]
1027    fn camel_case_splitting() {
1028        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
1029        assert!(tokens.contains(&"calculateTotal".to_string()));
1030        assert!(tokens.contains(&"calculate".to_string()));
1031        assert!(tokens.contains(&"Total".to_string()));
1032    }
1033
1034    #[test]
1035    fn detect_rust_function() {
1036        let (name, kind) =
1037            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
1038        assert_eq!(name, "process_request");
1039        assert_eq!(kind, ChunkKind::Function);
1040    }
1041
1042    #[test]
1043    fn bm25_search_finds_relevant() {
1044        let mut index = BM25Index::new();
1045        index.add_chunk(CodeChunk {
1046            file_path: "auth.rs".into(),
1047            symbol_name: "validate_token".into(),
1048            kind: ChunkKind::Function,
1049            start_line: 1,
1050            end_line: 10,
1051            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
1052            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
1053            token_count: 8,
1054        });
1055        index.add_chunk(CodeChunk {
1056            file_path: "db.rs".into(),
1057            symbol_name: "connect_database".into(),
1058            kind: ChunkKind::Function,
1059            start_line: 1,
1060            end_line: 5,
1061            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
1062            tokens: tokenize("fn connect_database url str Pool create_pool url"),
1063            token_count: 7,
1064        });
1065        index.finalize();
1066
1067        let results = index.search("jwt token validation", 5);
1068        assert!(!results.is_empty());
1069        assert_eq!(results[0].symbol_name, "validate_token");
1070    }
1071
1072    #[test]
1073    fn bm25_search_sorts_ties_deterministically() {
1074        let mut index = BM25Index::new();
1075
1076        // Insert in reverse path order to ensure the sort tie-break matters.
1077        index.add_chunk(CodeChunk {
1078            file_path: "b.rs".into(),
1079            symbol_name: "same".into(),
1080            kind: ChunkKind::Function,
1081            start_line: 1,
1082            end_line: 1,
1083            content: "fn same() {}".into(),
1084            tokens: tokenize("same token"),
1085            token_count: 2,
1086        });
1087        index.add_chunk(CodeChunk {
1088            file_path: "a.rs".into(),
1089            symbol_name: "same".into(),
1090            kind: ChunkKind::Function,
1091            start_line: 1,
1092            end_line: 1,
1093            content: "fn same() {}".into(),
1094            tokens: tokenize("same token"),
1095            token_count: 2,
1096        });
1097        index.finalize();
1098
1099        let results = index.search("same", 10);
1100        assert!(results.len() >= 2);
1101        assert_eq!(results[0].file_path, "a.rs");
1102        assert_eq!(results[1].file_path, "b.rs");
1103    }
1104
1105    #[test]
1106    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
1107        let td = tempdir().expect("tempdir");
1108        let root = td.path();
1109        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
1110
1111        let idx = BM25Index::build_from_directory(root);
1112        assert!(!bm25_index_looks_stale(&idx, root));
1113
1114        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
1115        assert!(bm25_index_looks_stale(&idx, root));
1116    }
1117
1118    #[test]
1119    #[cfg(unix)]
1120    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
1121        let td = tempdir().expect("tempdir");
1122        let root = td.path();
1123
1124        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
1125        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
1126
1127        let idx1 = BM25Index::build_from_directory(root);
1128        assert!(idx1.files.contains_key("a.rs"));
1129        assert!(idx1.files.contains_key("b.rs"));
1130
1131        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
1132        let a_path = root.join("a.rs");
1133        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1134        perms.set_mode(0o000);
1135        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
1136
1137        // Change b.rs (size changes) to force a re-read for that file.
1138        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
1139            .expect("rewrite b.rs");
1140
1141        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
1142        assert!(
1143            idx2.files.contains_key("a.rs"),
1144            "a.rs should be kept via reuse"
1145        );
1146        assert!(idx2.files.contains_key("b.rs"));
1147
1148        let b_has_b2 = idx2
1149            .chunks
1150            .iter()
1151            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
1152        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
1153
1154        // Restore permissions to avoid cleanup surprises.
1155        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1156        perms.set_mode(0o644);
1157        let _ = std::fs::set_permissions(&a_path, perms);
1158    }
1159
1160    #[test]
1161    fn load_quarantines_oversized_index() {
1162        let _env = crate::core::data_dir::test_env_lock();
1163        let td = tempdir().expect("tempdir");
1164        let root = td.path();
1165        let dir = crate::core::index_namespace::vectors_dir(root);
1166        std::fs::create_dir_all(&dir).expect("create vectors dir");
1167
1168        let index_path = dir.join("bm25_index.json");
1169        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1170        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
1171
1172        let result = BM25Index::load(root);
1173        assert!(result.is_none(), "oversized index should return None");
1174        assert!(
1175            !index_path.exists(),
1176            "original index should be removed after quarantine"
1177        );
1178        assert!(
1179            dir.join("bm25_index.json.quarantined").exists(),
1180            "quarantined file should exist"
1181        );
1182
1183        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1184    }
1185
1186    #[test]
1187    fn save_refuses_oversized_output() {
1188        let _env = crate::core::data_dir::test_env_lock();
1189        let data_dir = tempdir().expect("data_dir");
1190        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1191        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1192
1193        let td = tempdir().expect("tempdir");
1194        let root = td.path();
1195
1196        let mut index = BM25Index::new();
1197        index.add_chunk(CodeChunk {
1198            file_path: "a.rs".into(),
1199            symbol_name: "a".into(),
1200            kind: ChunkKind::Function,
1201            start_line: 1,
1202            end_line: 1,
1203            content: "fn a() {}".into(),
1204            tokens: tokenize("fn a"),
1205            token_count: 2,
1206        });
1207        index.finalize();
1208
1209        let _ = index.save(root);
1210        let index_path = BM25Index::index_file_path(root);
1211        assert!(
1212            !index_path.exists(),
1213            "save should refuse to persist oversized index"
1214        );
1215
1216        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1217    }
1218
1219    #[test]
1220    fn save_writes_project_root_marker() {
1221        let _env = crate::core::data_dir::test_env_lock();
1222        let td = tempdir().expect("tempdir");
1223        let root = td.path();
1224        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1225
1226        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1227        let index = BM25Index::build_from_directory(root);
1228        index.save(root).expect("save");
1229
1230        let dir = crate::core::index_namespace::vectors_dir(root);
1231        let marker = dir.join("project_root.txt");
1232        assert!(marker.exists(), "project_root.txt marker should exist");
1233        let content = std::fs::read_to_string(&marker).expect("read marker");
1234        assert_eq!(content, root.to_string_lossy());
1235    }
1236
1237    #[test]
1238    fn save_load_roundtrip_uses_zstd() {
1239        let _env = crate::core::data_dir::test_env_lock();
1240        let data_dir = tempdir().expect("data_dir");
1241        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1242        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1243        let td = tempdir().expect("tempdir");
1244        let root = td.path();
1245
1246        for i in 0..10 {
1247            std::fs::write(
1248                root.join(format!("mod{i}.rs")),
1249                format!(
1250                    "pub fn handler_{i}() {{\n    println!(\"hello\");\n}}\n\n\
1251                     pub fn helper_{i}() {{\n    println!(\"world\");\n}}\n"
1252                ),
1253            )
1254            .expect("write");
1255        }
1256
1257        let index = BM25Index::build_from_directory(root);
1258        assert!(index.doc_count > 0, "should have indexed chunks");
1259        index.save(root).expect("save");
1260
1261        let dir = crate::core::index_namespace::vectors_dir(root);
1262        let zst = dir.join("bm25_index.bin.zst");
1263        assert!(zst.exists(), "should write .bin.zst");
1264        assert!(
1265            !dir.join("bm25_index.bin").exists(),
1266            ".bin should be deleted"
1267        );
1268
1269        let loaded = BM25Index::load(root).expect("load compressed index");
1270        assert_eq!(loaded.doc_count, index.doc_count);
1271        assert_eq!(loaded.chunks.len(), index.chunks.len());
1272
1273        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1274        std::env::remove_var("LEAN_CTX_DATA_DIR");
1275    }
1276
1277    #[test]
1278    fn auto_migrate_bin_to_zst() {
1279        let _env = crate::core::data_dir::test_env_lock();
1280        let data_dir = tempdir().expect("data_dir");
1281        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1282        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1283        let td = tempdir().expect("tempdir");
1284        let root = td.path();
1285
1286        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1287        let index = BM25Index::build_from_directory(root);
1288
1289        let dir = crate::core::index_namespace::vectors_dir(root);
1290        std::fs::create_dir_all(&dir).expect("mkdir");
1291        let data =
1292            bincode::serde::encode_to_vec(&index, bincode::config::standard()).expect("encode");
1293        std::fs::write(dir.join("bm25_index.bin"), &data).expect("write bin");
1294
1295        let loaded = BM25Index::load(root).expect("load should auto-migrate");
1296        assert_eq!(loaded.doc_count, index.doc_count);
1297        assert!(
1298            dir.join("bm25_index.bin.zst").exists(),
1299            ".bin.zst should be created"
1300        );
1301        assert!(
1302            !dir.join("bm25_index.bin").exists(),
1303            ".bin should be removed"
1304        );
1305
1306        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1307        std::env::remove_var("LEAN_CTX_DATA_DIR");
1308    }
1309
1310    #[test]
1311    fn list_code_files_skips_default_vendor_ignores() {
1312        let td = tempdir().expect("tempdir");
1313        let root = td.path();
1314
1315        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1316        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1317        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1318        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1319        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1320
1321        let files = list_code_files(root);
1322        assert!(
1323            files.iter().any(|f| f == "main.rs"),
1324            "main.rs should be included"
1325        );
1326        assert!(
1327            !files.iter().any(|f| f.starts_with("vendor/")),
1328            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1329        );
1330        assert!(
1331            !files.iter().any(|f| f.starts_with("dist/")),
1332            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1333        );
1334    }
1335
1336    #[test]
1337    fn list_code_files_respects_max_files_cap() {
1338        let td = tempdir().expect("tempdir");
1339        let root = td.path();
1340
1341        // Create more files than MAX_BM25_FILES wouldn't let us test easily (5000),
1342        // but we can verify the cap constant exists and the function returns a bounded vec.
1343        for i in 0..10 {
1344            std::fs::write(
1345                root.join(format!("f{i}.rs")),
1346                format!("pub fn f{i}() {{}}\n"),
1347            )
1348            .expect("write");
1349        }
1350        let files = list_code_files(root);
1351        assert!(
1352            files.len() <= MAX_BM25_FILES,
1353            "file count should not exceed MAX_BM25_FILES"
1354        );
1355    }
1356
1357    #[test]
1358    fn max_bm25_cache_bytes_reads_env() {
1359        let _env = crate::core::data_dir::test_env_lock();
1360        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1361        let bytes = max_bm25_cache_bytes();
1362        assert_eq!(bytes, 64 * 1024 * 1024);
1363        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1364    }
1365}