1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the number of files `list_code_files` collects for one index.
const MAX_BM25_FILES: usize = 5_000;
/// Chunk count above which `BM25Index::save` logs a warning suggesting extra ignore patterns.
const CHUNK_COUNT_WARNING: usize = 50_000;
9
/// Glob patterns that are always excluded from indexing, in addition to the
/// user-configured `extra_ignore_patterns`.
///
/// Patterns are matched against the path relative to the index root.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    // Third-party and build output directories.
    "vendor/**",
    "dist/**",
    "build/**",
    // Published web assets.
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    // Framework and interpreter caches.
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    // Minified or bundled artifacts.
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29 .ok()
30 .and_then(|v| v.parse::<u64>().ok())
31 .unwrap_or_else(|| {
32 let cfg = crate::core::config::Config::load();
33 let profile = crate::core::config::MemoryProfile::effective(&cfg);
34 let profile_mb = profile.bm25_max_cache_mb();
35 if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
36 profile_mb
37 } else {
38 cfg.bm25_max_cache_mb
39 }
40 });
41 mb * 1024 * 1024
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct CodeChunk {
46 pub file_path: String,
47 pub symbol_name: String,
48 pub kind: ChunkKind,
49 pub start_line: usize,
50 pub end_line: usize,
51 pub content: String,
52 #[serde(skip_serializing, default)]
53 pub tokens: Vec<String>,
54 pub token_count: usize,
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
58pub enum ChunkKind {
59 Function,
60 Struct,
61 Impl,
62 Module,
63 Class,
64 Method,
65 Other,
66}
67
68#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
69pub struct IndexedFileState {
70 pub mtime_ms: u64,
71 pub size_bytes: u64,
72}
73
74impl IndexedFileState {
75 fn from_path(path: &Path) -> Option<Self> {
76 let meta = path.metadata().ok()?;
77 let size_bytes = meta.len();
78 let mtime_ms = meta
79 .modified()
80 .ok()
81 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
82 .map(|d| d.as_millis() as u64)?;
83 Some(Self {
84 mtime_ms,
85 size_bytes,
86 })
87 }
88}
89
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct BM25Index {
92 pub chunks: Vec<CodeChunk>,
93 pub inverted: HashMap<String, Vec<(usize, f64)>>,
94 pub avg_doc_len: f64,
95 pub doc_count: usize,
96 pub doc_freqs: HashMap<String, usize>,
97 #[serde(default)]
98 pub files: HashMap<String, IndexedFileState>,
99}
100
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct SearchResult {
103 pub chunk_idx: usize,
104 pub score: f64,
105 pub file_path: String,
106 pub symbol_name: String,
107 pub kind: ChunkKind,
108 pub start_line: usize,
109 pub end_line: usize,
110 pub snippet: String,
111}
112
/// BM25 `k1` parameter used in the scoring formula (term-frequency term).
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter used in the scoring formula (document-length term).
const BM25_B: f64 = 0.75;
115
116impl Default for BM25Index {
117 fn default() -> Self {
118 Self::new()
119 }
120}
121
122impl BM25Index {
123 pub fn new() -> Self {
124 Self {
125 chunks: Vec::new(),
126 inverted: HashMap::new(),
127 avg_doc_len: 0.0,
128 doc_count: 0,
129 doc_freqs: HashMap::new(),
130 files: HashMap::new(),
131 }
132 }
133
134 pub fn build_from_directory(root: &Path) -> Self {
135 let mut index = Self::new();
136 let files = list_code_files(root);
137 for rel in files {
138 let abs = root.join(&rel);
139 let Some(state) = IndexedFileState::from_path(&abs) else {
140 continue;
141 };
142 if let Ok(content) = std::fs::read_to_string(&abs) {
143 let mut chunks = extract_chunks(&rel, &content);
144 chunks.sort_by(|a, b| {
145 a.start_line
146 .cmp(&b.start_line)
147 .then_with(|| a.end_line.cmp(&b.end_line))
148 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
149 });
150 for chunk in chunks {
151 index.add_chunk(chunk);
152 }
153 index.files.insert(rel, state);
154 }
155 }
156
157 index.finalize();
158 index
159 }
160
161 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
162 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
163 for c in &prev.chunks {
164 old_by_file
165 .entry(c.file_path.clone())
166 .or_default()
167 .push(c.clone());
168 }
169 for v in old_by_file.values_mut() {
170 v.sort_by(|a, b| {
171 a.start_line
172 .cmp(&b.start_line)
173 .then_with(|| a.end_line.cmp(&b.end_line))
174 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
175 });
176 }
177
178 let mut index = Self::new();
179 let files = list_code_files(root);
180 for rel in files {
181 let abs = root.join(&rel);
182 let Some(state) = IndexedFileState::from_path(&abs) else {
183 continue;
184 };
185
186 let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
187 if unchanged {
188 if let Some(chunks) = old_by_file.get(&rel) {
189 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
190 for chunk in chunks {
191 index.add_chunk(chunk.clone());
192 }
193 index.files.insert(rel, state);
194 continue;
195 }
196 }
197 }
198
199 if let Ok(content) = std::fs::read_to_string(&abs) {
200 let mut chunks = extract_chunks(&rel, &content);
201 chunks.sort_by(|a, b| {
202 a.start_line
203 .cmp(&b.start_line)
204 .then_with(|| a.end_line.cmp(&b.end_line))
205 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
206 });
207 for chunk in chunks {
208 index.add_chunk(chunk);
209 }
210 index.files.insert(rel, state);
211 }
212 }
213
214 index.finalize();
215 index
216 }
217
218 fn add_chunk(&mut self, chunk: CodeChunk) {
219 let idx = self.chunks.len();
220
221 let tokens = tokenize(&chunk.content);
222 for token in &tokens {
223 let lower = token.to_lowercase();
224 self.inverted.entry(lower).or_default().push((idx, 1.0));
225 }
226
227 self.chunks.push(CodeChunk {
228 token_count: tokens.len(),
229 tokens: Vec::new(),
230 ..chunk
231 });
232 }
233
234 fn finalize(&mut self) {
235 self.doc_count = self.chunks.len();
236 if self.doc_count == 0 {
237 return;
238 }
239
240 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
241 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
242
243 self.doc_freqs.clear();
244 for (term, postings) in &self.inverted {
245 let unique_docs: std::collections::HashSet<usize> =
246 postings.iter().map(|(idx, _)| *idx).collect();
247 self.doc_freqs.insert(term.clone(), unique_docs.len());
248 }
249 }
250
251 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
252 let query_tokens = tokenize(query);
253 if query_tokens.is_empty() || self.doc_count == 0 {
254 return Vec::new();
255 }
256
257 let mut scores: HashMap<usize, f64> = HashMap::new();
258
259 for token in &query_tokens {
260 let lower = token.to_lowercase();
261 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
262 if df == 0.0 {
263 continue;
264 }
265
266 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
267
268 if let Some(postings) = self.inverted.get(&lower) {
269 let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
270 for (idx, weight) in postings {
271 *doc_tfs.entry(*idx).or_insert(0.0) += weight;
272 }
273
274 for (doc_idx, tf) in &doc_tfs {
275 let doc_len = self.chunks[*doc_idx].token_count as f64;
276 let norm_len = doc_len / self.avg_doc_len.max(1.0);
277 let bm25 = idf * (tf * (BM25_K1 + 1.0))
278 / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
279
280 *scores.entry(*doc_idx).or_insert(0.0) += bm25;
281 }
282 }
283 }
284
285 let mut results: Vec<SearchResult> = scores
286 .into_iter()
287 .map(|(idx, score)| {
288 let chunk = &self.chunks[idx];
289 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
290 SearchResult {
291 chunk_idx: idx,
292 score,
293 file_path: chunk.file_path.clone(),
294 symbol_name: chunk.symbol_name.clone(),
295 kind: chunk.kind.clone(),
296 start_line: chunk.start_line,
297 end_line: chunk.end_line,
298 snippet,
299 }
300 })
301 .collect();
302
303 results.sort_by(|a, b| {
304 b.score
305 .partial_cmp(&a.score)
306 .unwrap_or(std::cmp::Ordering::Equal)
307 .then_with(|| a.file_path.cmp(&b.file_path))
308 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
309 .then_with(|| a.start_line.cmp(&b.start_line))
310 .then_with(|| a.end_line.cmp(&b.end_line))
311 });
312 results.truncate(top_k);
313 results
314 }
315
316 pub fn save(&self, root: &Path) -> std::io::Result<()> {
317 if self.chunks.len() > CHUNK_COUNT_WARNING {
318 tracing::warn!(
319 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
320 self.chunks.len(),
321 CHUNK_COUNT_WARNING
322 );
323 }
324
325 let dir = index_dir(root);
326 std::fs::create_dir_all(&dir)?;
327 let data = serde_json::to_string(self).map_err(std::io::Error::other)?;
328
329 let max_bytes = max_bm25_cache_bytes();
330 if data.len() as u64 > max_bytes {
331 tracing::warn!(
332 "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
333 data.len() as f64 / 1_048_576.0,
334 max_bytes / (1024 * 1024),
335 dir.display()
336 );
337 return Ok(());
338 }
339
340 let target = dir.join("bm25_index.json");
341 let tmp = dir.join("bm25_index.json.tmp");
342 std::fs::write(&tmp, &data)?;
343 std::fs::rename(&tmp, &target)?;
344
345 let _ = std::fs::write(
346 dir.join("project_root.txt"),
347 root.to_string_lossy().as_bytes(),
348 );
349
350 Ok(())
351 }
352
353 pub fn load(root: &Path) -> Option<Self> {
354 let path = index_dir(root).join("bm25_index.json");
355 let meta = std::fs::metadata(&path).ok()?;
356 let max_bytes = max_bm25_cache_bytes();
357 if meta.len() > max_bytes {
358 tracing::warn!(
359 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
360 meta.len() as f64 / 1_073_741_824.0,
361 max_bytes / (1024 * 1024),
362 path.display()
363 );
364 let quarantined = path.with_extension("json.quarantined");
365 let _ = std::fs::rename(&path, &quarantined);
366 return None;
367 }
368 let data = std::fs::read_to_string(&path).ok()?;
369 serde_json::from_str(&data).ok()
370 }
371
372 pub fn load_or_build(root: &Path) -> Self {
373 if let Some(idx) = Self::load(root) {
374 if !bm25_index_looks_stale(&idx, root) {
375 return idx;
376 }
377 tracing::warn!(
378 "[bm25_index: stale index detected for {}; rebuilding]",
379 root.display()
380 );
381 let rebuilt = if idx.files.is_empty() {
382 Self::build_from_directory(root)
383 } else {
384 Self::rebuild_incremental(root, &idx)
385 };
386 let _ = rebuilt.save(root);
387 return rebuilt;
388 }
389
390 let built = Self::build_from_directory(root);
391 let _ = built.save(root);
392 built
393 }
394
395 pub fn index_file_path(root: &Path) -> PathBuf {
396 index_dir(root).join("bm25_index.json")
397 }
398}
399
400fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
401 if index.chunks.is_empty() {
402 return false;
403 }
404
405 if index.files.is_empty() {
406 let mut seen = std::collections::HashSet::<&str>::new();
408 for chunk in &index.chunks {
409 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
410 if rel.is_empty() {
411 continue;
412 }
413 if !seen.insert(rel) {
414 continue;
415 }
416 if !root.join(rel).exists() {
417 return true;
418 }
419 }
420 return false;
421 }
422
423 for (rel, old_state) in &index.files {
425 let abs = root.join(rel);
426 if !abs.exists() {
427 return true;
428 }
429 let Some(cur) = IndexedFileState::from_path(&abs) else {
430 return true;
431 };
432 if &cur != old_state {
433 return true;
434 }
435 }
436
437 for rel in list_code_files(root) {
439 if !index.files.contains_key(&rel) {
440 return true;
441 }
442 }
443
444 false
445}
446
447fn index_dir(root: &Path) -> PathBuf {
448 crate::core::index_namespace::vectors_dir(root)
449}
450
451fn list_code_files(root: &Path) -> Vec<String> {
452 let walker = ignore::WalkBuilder::new(root)
453 .hidden(true)
454 .git_ignore(true)
455 .git_global(true)
456 .git_exclude(true)
457 .build();
458
459 let cfg = crate::core::config::Config::load();
460 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
461 .iter()
462 .filter_map(|p| glob::Pattern::new(p).ok())
463 .collect();
464 ignore_patterns.extend(
465 cfg.extra_ignore_patterns
466 .iter()
467 .filter_map(|p| glob::Pattern::new(p).ok()),
468 );
469
470 let mut files: Vec<String> = Vec::new();
471 for entry in walker.flatten() {
472 let path = entry.path();
473 if !path.is_file() {
474 continue;
475 }
476 if !is_code_file(path) {
477 continue;
478 }
479 let rel = path
480 .strip_prefix(root)
481 .unwrap_or(path)
482 .to_string_lossy()
483 .to_string();
484 if rel.is_empty() {
485 continue;
486 }
487 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
488 continue;
489 }
490 if files.len() >= MAX_BM25_FILES {
491 tracing::warn!(
492 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
493 root.display()
494 );
495 break;
496 }
497 files.push(rel);
498 }
499
500 files.sort();
501 files.dedup();
502 files
503}
504
/// Returns `true` when the extension of `path` (compared case-insensitively)
/// is one of the source-code extensions that get indexed.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are not
/// code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(raw_ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = raw_ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
541
542fn tokenize(text: &str) -> Vec<String> {
543 let mut tokens = Vec::new();
544 let mut current = String::new();
545
546 for ch in text.chars() {
547 if ch.is_alphanumeric() || ch == '_' {
548 current.push(ch);
549 } else {
550 if current.len() >= 2 {
551 tokens.push(current.clone());
552 }
553 current.clear();
554 }
555 }
556 if current.len() >= 2 {
557 tokens.push(current);
558 }
559
560 split_camel_case_tokens(&tokens)
561}
562
563pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
564 tokenize(text)
565}
566
/// Expands each token with its camel-case parts.
///
/// Every input token is kept as-is and followed by its parts (parts shorter
/// than two bytes are dropped; tokens without an internal boundary add
/// nothing). A boundary is placed before an uppercase character that either
///
/// * follows a non-uppercase character (`parse|JSON`, `calculate|Total`), or
/// * is followed by a lowercase character, which ends an acronym
///   (`HTTP|Server`).
///
/// Runs of uppercase characters therefore stay together, so `parseJSON`
/// yields `parse` and `JSON` rather than a truncated `parseJSO`.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let after_non_upper = !chars[i - 1].is_uppercase();
            let before_lower = chars.get(i + 1).is_some_and(|c| c.is_lowercase());
            if after_non_upper || before_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Emit the tail only when at least one boundary was found; otherwise
        // it would just repeat the whole token.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }
    result
}
591
592fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
593 #[cfg(feature = "tree-sitter")]
594 {
595 let ext = std::path::Path::new(file_path)
596 .extension()
597 .and_then(|e| e.to_str())
598 .unwrap_or("");
599 if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
600 return chunks;
601 }
602 }
603
604 let lines: Vec<&str> = content.lines().collect();
605 if lines.is_empty() {
606 return Vec::new();
607 }
608
609 let mut chunks = Vec::new();
610 let mut i = 0;
611
612 while i < lines.len() {
613 let trimmed = lines[i].trim();
614
615 if let Some((name, kind)) = detect_symbol(trimmed) {
616 let start = i;
617 let end = find_block_end(&lines, i);
618 let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
619 let token_count = tokenize(&block).len();
620
621 chunks.push(CodeChunk {
622 file_path: file_path.to_string(),
623 symbol_name: name,
624 kind,
625 start_line: start + 1,
626 end_line: end + 1,
627 content: block,
628 tokens: Vec::new(),
629 token_count,
630 });
631
632 i = end + 1;
633 } else {
634 i += 1;
635 }
636 }
637
638 if chunks.is_empty() && !content.is_empty() {
639 let bytes = content.as_bytes();
644 let rk_chunks = crate::core::rabin_karp::chunk(content);
645 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
646 for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
647 let end = (c.offset + c.length).min(bytes.len());
648 let slice = &bytes[c.offset..end];
649 let chunk_text = String::from_utf8_lossy(slice).into_owned();
650 let token_count = tokenize(&chunk_text).len();
651 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
652 let end_line = start_line + bytecount::count(slice, b'\n');
653 chunks.push(CodeChunk {
654 file_path: file_path.to_string(),
655 symbol_name: format!("{file_path}#chunk-{idx}"),
656 kind: ChunkKind::Module,
657 start_line,
658 end_line: end_line.max(start_line),
659 content: chunk_text,
660 tokens: Vec::new(),
661 token_count,
662 });
663 }
664 } else {
665 let token_count = tokenize(content).len();
666 let snippet = lines
667 .iter()
668 .take(50)
669 .copied()
670 .collect::<Vec<_>>()
671 .join("\n");
672 chunks.push(CodeChunk {
673 file_path: file_path.to_string(),
674 symbol_name: file_path.to_string(),
675 kind: ChunkKind::Module,
676 start_line: 1,
677 end_line: lines.len(),
678 content: snippet,
679 tokens: Vec::new(),
680 token_count,
681 });
682 }
683 }
684
685 chunks
686}
687
688fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
689 let trimmed = line.trim();
690
691 let patterns: &[(&str, ChunkKind)] = &[
692 ("pub async fn ", ChunkKind::Function),
693 ("async fn ", ChunkKind::Function),
694 ("pub fn ", ChunkKind::Function),
695 ("fn ", ChunkKind::Function),
696 ("pub struct ", ChunkKind::Struct),
697 ("struct ", ChunkKind::Struct),
698 ("pub enum ", ChunkKind::Struct),
699 ("enum ", ChunkKind::Struct),
700 ("impl ", ChunkKind::Impl),
701 ("pub trait ", ChunkKind::Struct),
702 ("trait ", ChunkKind::Struct),
703 ("export function ", ChunkKind::Function),
704 ("export async function ", ChunkKind::Function),
705 ("export default function ", ChunkKind::Function),
706 ("function ", ChunkKind::Function),
707 ("async function ", ChunkKind::Function),
708 ("export class ", ChunkKind::Class),
709 ("class ", ChunkKind::Class),
710 ("export interface ", ChunkKind::Struct),
711 ("interface ", ChunkKind::Struct),
712 ("def ", ChunkKind::Function),
713 ("async def ", ChunkKind::Function),
714 ("class ", ChunkKind::Class),
715 ("func ", ChunkKind::Function),
716 ];
717
718 for (prefix, kind) in patterns {
719 if let Some(rest) = trimmed.strip_prefix(prefix) {
720 let name: String = rest
721 .chars()
722 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
723 .take_while(|c| *c != '<')
724 .collect();
725 if !name.is_empty() {
726 return Some((name, kind.clone()));
727 }
728 }
729 }
730
731 None
732}
733
/// Returns the index of the last line of the block that starts at
/// `lines[start]`.
///
/// * Brace-delimited blocks end on the line where the first opened `{` is
///   closed again. Parentheses never end a block, so a parameter list such as
///   `fn f(a: i32) {` does not cut the block short at the signature.
/// * A brace-less declaration that ends in `;` outside of any parentheses
///   (for example `struct Unit;` or `struct Pair(u8, u8);`) ends on that line.
/// * Other brace-less blocks (for example Python) end before the first blank
///   or unindented line found more than two lines after `start`; lines inside
///   a still-open parameter list are never treated as the end.
/// * If no end is found, at most 50 lines after `start` are included, clamped
///   to the last line.
///
/// Braces inside string literals or comments are not recognised as such.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut brace_depth = 0usize;
    let mut paren_depth = 0usize;
    let mut found_brace = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        // A line that begins inside an open `(` belongs to the declaration
        // header even when it is unindented (e.g. a closing `):`).
        let in_parens_at_line_start = paren_depth > 0;

        for ch in line.chars() {
            match ch {
                '(' => paren_depth += 1,
                ')' => paren_depth = paren_depth.saturating_sub(1),
                '{' => {
                    brace_depth += 1;
                    found_brace = true;
                }
                '}' if brace_depth > 0 => {
                    brace_depth -= 1;
                    if brace_depth == 0 {
                        return i;
                    }
                }
                _ => {}
            }
        }

        if found_brace {
            continue;
        }

        // Brace-less declaration terminated by a semicolon.
        if paren_depth == 0 && line.trim_end().ends_with(';') {
            return i;
        }

        // Indentation heuristic; it must look at the raw line, because a
        // trimmed line never starts with whitespace.
        if !in_parens_at_line_start && i > start + 2 {
            let unindented = !line.starts_with([' ', '\t']);
            if line.trim().is_empty() || unindented {
                return i - 1;
            }
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
771
772pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
773 if results.is_empty() {
774 return "No results found.".to_string();
775 }
776
777 let mut out = String::new();
778 for (i, r) in results.iter().enumerate() {
779 if compact {
780 out.push_str(&format!(
781 "{}. {:.2} {}:{}-{} {:?} {}\n",
782 i + 1,
783 r.score,
784 r.file_path,
785 r.start_line,
786 r.end_line,
787 r.kind,
788 r.symbol_name,
789 ));
790 } else {
791 out.push_str(&format!(
792 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
793 i + 1,
794 r.score,
795 r.file_path,
796 r.symbol_name,
797 r.kind,
798 r.start_line,
799 r.end_line,
800 r.snippet,
801 ));
802 }
803 }
804 out
805}
806
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    /// Sets an environment variable and removes it again when dropped, so a
    /// failing assertion cannot leak the setting into other tests.
    ///
    /// Declare it after the guard returned by `test_env_lock()`: locals are
    /// dropped in reverse order, so the variable is removed while the guard
    /// is still alive.
    struct EnvVarGuard(&'static str);

    impl EnvVarGuard {
        fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
            std::env::set_var(key, value);
            Self(key)
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            std::env::remove_var(self.0);
        }
    }

    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        assert!(tokens.contains(&"calculate_total".to_string()));
        assert!(tokens.contains(&"items".to_string()));
        assert!(tokens.contains(&"Vec".to_string()));
    }

    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        assert!(tokens.contains(&"calculateTotal".to_string()));
        assert!(tokens.contains(&"calculate".to_string()));
        assert!(tokens.contains(&"Total".to_string()));
    }

    #[test]
    fn detect_rust_function() {
        let (name, kind) =
            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "auth.rs".into(),
            symbol_name: "validate_token".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
            token_count: 8,
        });
        index.add_chunk(CodeChunk {
            file_path: "db.rs".into(),
            symbol_name: "connect_database".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
            tokens: tokenize("fn connect_database url str Pool create_pool url"),
            token_count: 7,
        });
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }

    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();

        // Insert "b.rs" first so the expected order cannot come from
        // insertion order.
        index.add_chunk(CodeChunk {
            file_path: "b.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }

    #[test]
    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");

        let idx = BM25Index::build_from_directory(root);
        assert!(!bm25_index_looks_stale(&idx, root));

        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
        assert!(bm25_index_looks_stale(&idx, root));
    }

    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        // Make a.rs unreadable: it can then only stay indexed through reuse.
        let a_path = root.join("a.rs");
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
            .expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);

        // Restore permissions before asserting so cleanup is unaffected by a
        // failing assertion.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, perms);

        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(
            idx2.chunks.iter().any(|c| c.file_path == "a.rs"),
            "chunks of a.rs should be carried over from the previous index"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
    }

    #[test]
    fn load_quarantines_oversized_index() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("create vectors dir");

        let index_path = dir.join("bm25_index.json");
        let _limit = EnvVarGuard::set("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

        let result = BM25Index::load(root);
        assert!(result.is_none(), "oversized index should return None");
        assert!(
            !index_path.exists(),
            "original index should be removed after quarantine"
        );
        assert!(
            dir.join("bm25_index.json.quarantined").exists(),
            "quarantined file should exist"
        );
    }

    #[test]
    fn save_refuses_oversized_output() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        // Both variables are removed on drop, before `data_dir` is deleted, so
        // no other test sees a data dir that no longer exists.
        let _data_dir_var = EnvVarGuard::set("LEAN_CTX_DATA_DIR", data_dir.path());
        let _limit = EnvVarGuard::set("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

        let td = tempdir().expect("tempdir");
        let root = td.path();

        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "a".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn a() {}".into(),
            tokens: tokenize("fn a"),
            token_count: 2,
        });
        index.finalize();

        let _ = index.save(root);
        let index_path = BM25Index::index_file_path(root);
        assert!(
            !index_path.exists(),
            "save should refuse to persist oversized index"
        );
    }

    #[test]
    fn save_writes_project_root_marker() {
        // `save` reads the cache-size variable, so hold the env lock to avoid
        // racing with tests that change it.
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        let index = BM25Index::build_from_directory(root);
        index.save(root).expect("save");

        let dir = crate::core::index_namespace::vectors_dir(root);
        let marker = dir.join("project_root.txt");
        assert!(marker.exists(), "project_root.txt marker should exist");
        let content = std::fs::read_to_string(&marker).expect("read marker");
        assert_eq!(content, root.to_string_lossy());
    }

    #[test]
    fn list_code_files_skips_default_vendor_ignores() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

        let files = list_code_files(root);
        assert!(
            files.iter().any(|f| f == "main.rs"),
            "main.rs should be included"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("vendor/")),
            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("dist/")),
            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
    }

    #[test]
    fn list_code_files_respects_max_files_cap() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        // Only a handful of files: this checks the listing stays within the
        // cap, it does not exercise the cap itself.
        for i in 0..10 {
            std::fs::write(
                root.join(format!("f{i}.rs")),
                format!("pub fn f{i}() {{}}\n"),
            )
            .expect("write");
        }
        let files = list_code_files(root);
        assert_eq!(files.len(), 10, "all files below the cap should be listed");
        assert!(
            files.len() <= MAX_BM25_FILES,
            "file count should not exceed MAX_BM25_FILES"
        );
    }

    #[test]
    fn max_bm25_cache_bytes_reads_env() {
        let _env = crate::core::data_dir::test_env_lock();
        let _limit = EnvVarGuard::set("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
        let bytes = max_bm25_cache_bytes();
        assert_eq!(bytes, 64 * 1024 * 1024);
    }
}