1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files `list_code_files` collects for one index; the walk
/// stops (with a warning) once this many files have been gathered.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning suggesting that
/// extra ignore patterns be configured.
const CHUNK_COUNT_WARNING: usize = 50_000;
9
/// Glob patterns that are always excluded from indexing.
///
/// `list_code_files` compiles these with `glob::Pattern` and matches them
/// against each file's path relative to the project root, in addition to the
/// `extra_ignore_patterns` taken from the loaded configuration.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29 .ok()
30 .and_then(|v| v.parse::<u64>().ok())
31 .unwrap_or_else(|| {
32 let cfg = crate::core::config::Config::load();
33 let profile = crate::core::config::MemoryProfile::effective(&cfg);
34 let profile_mb = profile.bm25_max_cache_mb();
35 if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
36 profile_mb
37 } else {
38 cfg.bm25_max_cache_mb
39 }
40 });
41 mb * 1024 * 1024
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct CodeChunk {
46 pub file_path: String,
47 pub symbol_name: String,
48 pub kind: ChunkKind,
49 pub start_line: usize,
50 pub end_line: usize,
51 pub content: String,
52 #[serde(skip_serializing, default)]
53 pub tokens: Vec<String>,
54 pub token_count: usize,
55}
56
/// Coarse classification of a [`CodeChunk`].
///
/// The line-based detector in this file produces `Function`, `Struct` (also
/// used for enums, traits and interfaces), `Impl` and `Class`; whole-file and
/// content-defined fallback chunks are tagged `Module`. `Method` and `Other`
/// are not produced by the fallback extractor visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
}
67
/// Cheap change-detection fingerprint of an indexed file.
///
/// Two states compare equal when both the modification time and the size match.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Last modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
73
74impl IndexedFileState {
75 fn from_path(path: &Path) -> Option<Self> {
76 let meta = path.metadata().ok()?;
77 let size_bytes = meta.len();
78 let mtime_ms = meta
79 .modified()
80 .ok()
81 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
82 .map(|d| d.as_millis() as u64)?;
83 Some(Self {
84 mtime_ms,
85 size_bytes,
86 })
87 }
88}
89
/// In-memory BM25 index over code chunks; also the on-disk cache format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; postings refer to positions in this vector.
    pub chunks: Vec<CodeChunk>,
    /// Lower-cased token -> postings `(chunk index, weight)`. `add_chunk`
    /// pushes one posting of weight `1.0` per token occurrence.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, computed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks, computed by `finalize`.
    pub doc_count: usize,
    /// Lower-cased token -> number of chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Relative file path -> file state recorded at indexing time. Defaults to
    /// empty when the field is missing from serialized data.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
100
/// One hit returned by [`BM25Index::search`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score over all matching query tokens.
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// First five lines of the chunk's content.
    pub snippet: String,
}
112
/// BM25 `k1` parameter: scales the term-frequency part of the score in `search`.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter: weight of the chunk-length normalisation in `search`.
const BM25_B: f64 = 0.75;
115
impl Default for BM25Index {
    /// Returns an empty index; identical to [`BM25Index::new`].
    fn default() -> Self {
        Self::new()
    }
}
121
122impl BM25Index {
123 pub fn new() -> Self {
124 Self {
125 chunks: Vec::new(),
126 inverted: HashMap::new(),
127 avg_doc_len: 0.0,
128 doc_count: 0,
129 doc_freqs: HashMap::new(),
130 files: HashMap::new(),
131 }
132 }
133
134 #[cfg(test)]
136 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
137 let mut index = Self::new();
138 for mut chunk in chunks {
139 if chunk.token_count == 0 {
140 chunk.token_count = tokenize(&chunk.content).len();
141 }
142 index.add_chunk(chunk);
143 }
144 index.finalize();
145 index
146 }
147
148 pub fn build_from_directory(root: &Path) -> Self {
149 let mut index = Self::new();
150 let files = list_code_files(root);
151 for rel in files {
152 let abs = root.join(&rel);
153 let Some(state) = IndexedFileState::from_path(&abs) else {
154 continue;
155 };
156 if let Ok(content) = std::fs::read_to_string(&abs) {
157 let mut chunks = extract_chunks(&rel, &content);
158 chunks.sort_by(|a, b| {
159 a.start_line
160 .cmp(&b.start_line)
161 .then_with(|| a.end_line.cmp(&b.end_line))
162 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
163 });
164 for chunk in chunks {
165 index.add_chunk(chunk);
166 }
167 index.files.insert(rel, state);
168 }
169 }
170
171 index.finalize();
172 index
173 }
174
175 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
176 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
177 for c in &prev.chunks {
178 old_by_file
179 .entry(c.file_path.clone())
180 .or_default()
181 .push(c.clone());
182 }
183 for v in old_by_file.values_mut() {
184 v.sort_by(|a, b| {
185 a.start_line
186 .cmp(&b.start_line)
187 .then_with(|| a.end_line.cmp(&b.end_line))
188 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
189 });
190 }
191
192 let mut index = Self::new();
193 let files = list_code_files(root);
194 for rel in files {
195 let abs = root.join(&rel);
196 let Some(state) = IndexedFileState::from_path(&abs) else {
197 continue;
198 };
199
200 let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
201 if unchanged {
202 if let Some(chunks) = old_by_file.get(&rel) {
203 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
204 for chunk in chunks {
205 index.add_chunk(chunk.clone());
206 }
207 index.files.insert(rel, state);
208 continue;
209 }
210 }
211 }
212
213 if let Ok(content) = std::fs::read_to_string(&abs) {
214 let mut chunks = extract_chunks(&rel, &content);
215 chunks.sort_by(|a, b| {
216 a.start_line
217 .cmp(&b.start_line)
218 .then_with(|| a.end_line.cmp(&b.end_line))
219 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
220 });
221 for chunk in chunks {
222 index.add_chunk(chunk);
223 }
224 index.files.insert(rel, state);
225 }
226 }
227
228 index.finalize();
229 index
230 }
231
232 fn add_chunk(&mut self, chunk: CodeChunk) {
233 let idx = self.chunks.len();
234
235 let tokens = tokenize(&chunk.content);
236 for token in &tokens {
237 let lower = token.to_lowercase();
238 let postings = self.inverted.entry(lower.clone()).or_default();
239 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
240 *self.doc_freqs.entry(lower).or_insert(0) += 1;
241 }
242 postings.push((idx, 1.0));
243 }
244
245 self.chunks.push(CodeChunk {
246 token_count: tokens.len(),
247 tokens: Vec::new(),
248 ..chunk
249 });
250 }
251
252 fn finalize(&mut self) {
253 self.doc_count = self.chunks.len();
254 if self.doc_count == 0 {
255 return;
256 }
257
258 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
259 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
260 }
261
262 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
263 let query_tokens = tokenize(query);
264 if query_tokens.is_empty() || self.doc_count == 0 {
265 return Vec::new();
266 }
267
268 let mut scores: HashMap<usize, f64> = HashMap::new();
269
270 for token in &query_tokens {
271 let lower = token.to_lowercase();
272 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
273 if df == 0.0 {
274 continue;
275 }
276
277 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
278
279 if let Some(postings) = self.inverted.get(&lower) {
280 let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
281 for (idx, weight) in postings {
282 *doc_tfs.entry(*idx).or_insert(0.0) += weight;
283 }
284
285 for (doc_idx, tf) in &doc_tfs {
286 let doc_len = self.chunks[*doc_idx].token_count as f64;
287 let norm_len = doc_len / self.avg_doc_len.max(1.0);
288 let bm25 = idf * (tf * (BM25_K1 + 1.0))
289 / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
290
291 *scores.entry(*doc_idx).or_insert(0.0) += bm25;
292 }
293 }
294 }
295
296 let mut results: Vec<SearchResult> = scores
297 .into_iter()
298 .map(|(idx, score)| {
299 let chunk = &self.chunks[idx];
300 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
301 SearchResult {
302 chunk_idx: idx,
303 score,
304 file_path: chunk.file_path.clone(),
305 symbol_name: chunk.symbol_name.clone(),
306 kind: chunk.kind.clone(),
307 start_line: chunk.start_line,
308 end_line: chunk.end_line,
309 snippet,
310 }
311 })
312 .collect();
313
314 results.sort_by(|a, b| {
315 b.score
316 .partial_cmp(&a.score)
317 .unwrap_or(std::cmp::Ordering::Equal)
318 .then_with(|| a.file_path.cmp(&b.file_path))
319 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
320 .then_with(|| a.start_line.cmp(&b.start_line))
321 .then_with(|| a.end_line.cmp(&b.end_line))
322 });
323 results.truncate(top_k);
324 results
325 }
326
327 pub fn save(&self, root: &Path) -> std::io::Result<()> {
328 if self.chunks.len() > CHUNK_COUNT_WARNING {
329 tracing::warn!(
330 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
331 self.chunks.len(),
332 CHUNK_COUNT_WARNING
333 );
334 }
335
336 let dir = index_dir(root);
337 std::fs::create_dir_all(&dir)?;
338 let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
339 .map_err(|e| std::io::Error::other(e.to_string()))?;
340
341 let max_bytes = max_bm25_cache_bytes();
342 if data.len() as u64 > max_bytes {
343 tracing::warn!(
344 "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
345 data.len() as f64 / 1_048_576.0,
346 max_bytes / (1024 * 1024),
347 dir.display()
348 );
349 return Ok(());
350 }
351
352 let target = dir.join("bm25_index.bin");
353 let tmp = dir.join("bm25_index.bin.tmp");
354 std::fs::write(&tmp, &data)?;
355 std::fs::rename(&tmp, &target)?;
356
357 let _ = std::fs::remove_file(dir.join("bm25_index.json"));
358
359 let _ = std::fs::write(
360 dir.join("project_root.txt"),
361 root.to_string_lossy().as_bytes(),
362 );
363
364 Ok(())
365 }
366
367 pub fn load(root: &Path) -> Option<Self> {
368 let dir = index_dir(root);
369 let max_bytes = max_bm25_cache_bytes();
370
371 let bin_path = dir.join("bm25_index.bin");
372 if bin_path.exists() {
373 let meta = std::fs::metadata(&bin_path).ok()?;
374 if meta.len() > max_bytes {
375 tracing::warn!(
376 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
377 meta.len() as f64 / 1_073_741_824.0,
378 max_bytes / (1024 * 1024),
379 bin_path.display()
380 );
381 let quarantined = bin_path.with_extension("bin.quarantined");
382 let _ = std::fs::rename(&bin_path, &quarantined);
383 return None;
384 }
385 let data = std::fs::read(&bin_path).ok()?;
386 let (idx, _): (Self, _) =
387 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
388 return Some(idx);
389 }
390
391 let json_path = dir.join("bm25_index.json");
392 if json_path.exists() {
393 let meta = std::fs::metadata(&json_path).ok()?;
394 if meta.len() > max_bytes {
395 tracing::warn!(
396 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
397 meta.len() as f64 / 1_073_741_824.0,
398 max_bytes / (1024 * 1024),
399 json_path.display()
400 );
401 let quarantined = json_path.with_extension("json.quarantined");
402 let _ = std::fs::rename(&json_path, &quarantined);
403 return None;
404 }
405 let data = std::fs::read_to_string(&json_path).ok()?;
406 return serde_json::from_str(&data).ok();
407 }
408
409 None
410 }
411
412 pub fn load_or_build(root: &Path) -> Self {
413 if let Some(idx) = Self::load(root) {
414 if !bm25_index_looks_stale(&idx, root) {
415 return idx;
416 }
417 tracing::warn!(
418 "[bm25_index: stale index detected for {}; rebuilding]",
419 root.display()
420 );
421 let rebuilt = if idx.files.is_empty() {
422 Self::build_from_directory(root)
423 } else {
424 Self::rebuild_incremental(root, &idx)
425 };
426 let _ = rebuilt.save(root);
427 return rebuilt;
428 }
429
430 let built = Self::build_from_directory(root);
431 let _ = built.save(root);
432 built
433 }
434
435 pub fn index_file_path(root: &Path) -> PathBuf {
436 let dir = index_dir(root);
437 let bin = dir.join("bm25_index.bin");
438 if bin.exists() {
439 return bin;
440 }
441 dir.join("bm25_index.json")
442 }
443}
444
445fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
446 if index.chunks.is_empty() {
447 return false;
448 }
449
450 if index.files.is_empty() {
451 let mut seen = std::collections::HashSet::<&str>::new();
453 for chunk in &index.chunks {
454 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
455 if rel.is_empty() {
456 continue;
457 }
458 if !seen.insert(rel) {
459 continue;
460 }
461 if !root.join(rel).exists() {
462 return true;
463 }
464 }
465 return false;
466 }
467
468 for (rel, old_state) in &index.files {
470 let abs = root.join(rel);
471 if !abs.exists() {
472 return true;
473 }
474 let Some(cur) = IndexedFileState::from_path(&abs) else {
475 return true;
476 };
477 if &cur != old_state {
478 return true;
479 }
480 }
481
482 for rel in list_code_files(root) {
484 if !index.files.contains_key(&rel) {
485 return true;
486 }
487 }
488
489 false
490}
491
/// Directory that holds the persisted index files for `root`; delegates to
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
495
496fn list_code_files(root: &Path) -> Vec<String> {
497 let walker = ignore::WalkBuilder::new(root)
498 .hidden(true)
499 .git_ignore(true)
500 .git_global(true)
501 .git_exclude(true)
502 .build();
503
504 let cfg = crate::core::config::Config::load();
505 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
506 .iter()
507 .filter_map(|p| glob::Pattern::new(p).ok())
508 .collect();
509 ignore_patterns.extend(
510 cfg.extra_ignore_patterns
511 .iter()
512 .filter_map(|p| glob::Pattern::new(p).ok()),
513 );
514
515 let mut files: Vec<String> = Vec::new();
516 for entry in walker.flatten() {
517 let path = entry.path();
518 if !path.is_file() {
519 continue;
520 }
521 if !is_code_file(path) {
522 continue;
523 }
524 let rel = path
525 .strip_prefix(root)
526 .unwrap_or(path)
527 .to_string_lossy()
528 .to_string();
529 if rel.is_empty() {
530 continue;
531 }
532 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
533 continue;
534 }
535 if files.len() >= MAX_BM25_FILES {
536 tracing::warn!(
537 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
538 root.display()
539 );
540 break;
541 }
542 files.push(rel);
543 }
544
545 files.sort();
546 files.dedup();
547 files
548}
549
/// Returns `true` when `path` has one of the recognised source-code
/// extensions (compared case-insensitively).
///
/// Paths without an extension, or with a non-UTF-8 extension, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    path.extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_lowercase)
        .is_some_and(|ext| CODE_EXTENSIONS.contains(&ext.as_str()))
}
586
587fn tokenize(text: &str) -> Vec<String> {
588 let mut tokens = Vec::new();
589 let mut current = String::new();
590
591 for ch in text.chars() {
592 if ch.is_alphanumeric() || ch == '_' {
593 current.push(ch);
594 } else {
595 if current.len() >= 2 {
596 tokens.push(current.clone());
597 }
598 current.clear();
599 }
600 }
601 if current.len() >= 2 {
602 tokens.push(current);
603 }
604
605 split_camel_case_tokens(&tokens)
606}
607
/// Crate-visible entry point to the tokenizer used by the index; returns
/// exactly what `tokenize` returns for `text`.
pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
    tokenize(text)
}
611
/// Expands each token with its camel-case parts.
///
/// Every input token is kept as is and followed by the parts obtained by
/// cutting it at word boundaries; parts shorter than two bytes are dropped.
/// A boundary lies before an uppercase character when
///
/// * the previous character is not uppercase (`calculateTotal` ->
///   `calculate`, `Total`; `getID` -> `get`, `ID`), or
/// * the next character is lowercase, i.e. the uppercase character starts a
///   word after an acronym (`HTTPServer` -> `HTTP`, `Server`).
///
/// An uppercase run is therefore never cut in the middle (`HTTP2` stays whole)
/// and a trailing acronym is kept together instead of losing its last letter.
/// Tokens without any boundary contribute only themselves.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let after_non_upper = !chars[i - 1].is_uppercase();
            let before_lower = chars.get(i + 1).is_some_and(|c| c.is_lowercase());
            if after_non_upper || before_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Only emit the tail when at least one boundary was found; otherwise
        // it would duplicate the whole token.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }

    result
}
636
637fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
638 #[cfg(feature = "tree-sitter")]
639 {
640 let ext = std::path::Path::new(file_path)
641 .extension()
642 .and_then(|e| e.to_str())
643 .unwrap_or("");
644 if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
645 return chunks;
646 }
647 }
648
649 let lines: Vec<&str> = content.lines().collect();
650 if lines.is_empty() {
651 return Vec::new();
652 }
653
654 let mut chunks = Vec::new();
655 let mut i = 0;
656
657 while i < lines.len() {
658 let trimmed = lines[i].trim();
659
660 if let Some((name, kind)) = detect_symbol(trimmed) {
661 let start = i;
662 let end = find_block_end(&lines, i);
663 let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
664 let token_count = tokenize(&block).len();
665
666 chunks.push(CodeChunk {
667 file_path: file_path.to_string(),
668 symbol_name: name,
669 kind,
670 start_line: start + 1,
671 end_line: end + 1,
672 content: block,
673 tokens: Vec::new(),
674 token_count,
675 });
676
677 i = end + 1;
678 } else {
679 i += 1;
680 }
681 }
682
683 if chunks.is_empty() && !content.is_empty() {
684 let bytes = content.as_bytes();
689 let rk_chunks = crate::core::rabin_karp::chunk(content);
690 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
691 for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
692 let end = (c.offset + c.length).min(bytes.len());
693 let slice = &bytes[c.offset..end];
694 let chunk_text = String::from_utf8_lossy(slice).into_owned();
695 let token_count = tokenize(&chunk_text).len();
696 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
697 let end_line = start_line + bytecount::count(slice, b'\n');
698 chunks.push(CodeChunk {
699 file_path: file_path.to_string(),
700 symbol_name: format!("{file_path}#chunk-{idx}"),
701 kind: ChunkKind::Module,
702 start_line,
703 end_line: end_line.max(start_line),
704 content: chunk_text,
705 tokens: Vec::new(),
706 token_count,
707 });
708 }
709 } else {
710 let token_count = tokenize(content).len();
711 let snippet = lines
712 .iter()
713 .take(50)
714 .copied()
715 .collect::<Vec<_>>()
716 .join("\n");
717 chunks.push(CodeChunk {
718 file_path: file_path.to_string(),
719 symbol_name: file_path.to_string(),
720 kind: ChunkKind::Module,
721 start_line: 1,
722 end_line: lines.len(),
723 content: snippet,
724 tokens: Vec::new(),
725 token_count,
726 });
727 }
728 }
729
730 chunks
731}
732
733fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
734 let trimmed = line.trim();
735
736 let patterns: &[(&str, ChunkKind)] = &[
737 ("pub async fn ", ChunkKind::Function),
738 ("async fn ", ChunkKind::Function),
739 ("pub fn ", ChunkKind::Function),
740 ("fn ", ChunkKind::Function),
741 ("pub struct ", ChunkKind::Struct),
742 ("struct ", ChunkKind::Struct),
743 ("pub enum ", ChunkKind::Struct),
744 ("enum ", ChunkKind::Struct),
745 ("impl ", ChunkKind::Impl),
746 ("pub trait ", ChunkKind::Struct),
747 ("trait ", ChunkKind::Struct),
748 ("export function ", ChunkKind::Function),
749 ("export async function ", ChunkKind::Function),
750 ("export default function ", ChunkKind::Function),
751 ("function ", ChunkKind::Function),
752 ("async function ", ChunkKind::Function),
753 ("export class ", ChunkKind::Class),
754 ("class ", ChunkKind::Class),
755 ("export interface ", ChunkKind::Struct),
756 ("interface ", ChunkKind::Struct),
757 ("def ", ChunkKind::Function),
758 ("async def ", ChunkKind::Function),
759 ("class ", ChunkKind::Class),
760 ("func ", ChunkKind::Function),
761 ];
762
763 for (prefix, kind) in patterns {
764 if let Some(rest) = trimmed.strip_prefix(prefix) {
765 let name: String = rest
766 .chars()
767 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
768 .take_while(|c| *c != '<')
769 .collect();
770 if !name.is_empty() {
771 return Some((name, kind.clone()));
772 }
773 }
774 }
775
776 None
777}
778
/// Returns the index of the last line of the block whose header is
/// `lines[start]`.
///
/// This is a heuristic that does not parse the language:
///
/// * Header phase: parentheses and square brackets are tracked only to find
///   where the signature ends; they never terminate the block. A `;` outside
///   of them ends a body-less declaration (`struct Foo(u32);`).
/// * Brace-delimited bodies: the first `{` outside parentheses opens the body
///   and the block ends on the line where the braces balance again.
/// * Indentation-delimited bodies: when the header ends with `:`, braces are
///   ignored and the block ends at the last non-blank line before the first
///   line that is not indented deeper than the header.
/// * While no body has been opened, a non-blank line that is not indented
///   deeper than the header ends the block, unless it starts with `{` or
///   `where` (brace on its own line, Rust `where` clauses).
///
/// If an opened brace never closes, or the header never completes, the result
/// is capped at `start + 50` (clamped to the last line). Braces inside string
/// literals or comments are not recognised as such.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    /// Number of leading space / tab characters of `line`.
    fn indent_width(line: &str) -> usize {
        line.chars().take_while(|c| matches!(c, ' ' | '\t')).count()
    }

    let capped_end = (start + 50).min(lines.len().saturating_sub(1));
    let Some(header) = lines.get(start) else {
        return capped_end;
    };
    let base_indent = indent_width(header);

    let mut nest_depth = 0usize; // open `(` / `[` while still in the header
    let mut brace_depth = 0usize;
    let mut in_braces = false; // a `{` opened the body
    let mut header_done = false; // the signature's brackets have balanced once
    let mut indent_delimited = false; // header ended with `:`
    let mut last_content = start; // last non-blank line seen so far

    for (i, line) in lines.iter().enumerate().skip(start) {
        let trimmed = line.trim();

        // Without an open brace, a line at or left of the header's
        // indentation starts the next item.
        if i > start && !in_braces && nest_depth == 0 {
            let continues_header =
                !indent_delimited && (trimmed.starts_with('{') || trimmed.starts_with("where"));
            if !trimmed.is_empty() && indent_width(line) <= base_indent && !continues_header {
                return last_content;
            }
        }
        if !trimmed.is_empty() {
            last_content = i;
        }

        if !indent_delimited {
            for ch in line.chars() {
                if in_braces {
                    match ch {
                        '{' => brace_depth += 1,
                        '}' => {
                            // `brace_depth` is at least 1 while `in_braces` is set.
                            brace_depth -= 1;
                            if brace_depth == 0 {
                                return i;
                            }
                        }
                        _ => {}
                    }
                } else {
                    match ch {
                        '(' | '[' => nest_depth += 1,
                        ')' | ']' => nest_depth = nest_depth.saturating_sub(1),
                        '{' if nest_depth == 0 => {
                            in_braces = true;
                            brace_depth = 1;
                        }
                        ';' if nest_depth == 0 => return i,
                        _ => {}
                    }
                }
            }
        }

        if !header_done && !in_braces && nest_depth == 0 {
            header_done = true;
            indent_delimited = trimmed.ends_with(':');
        }
    }

    if header_done && !in_braces {
        last_content
    } else {
        capped_end
    }
}
816
817pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
818 if results.is_empty() {
819 return "No results found.".to_string();
820 }
821
822 let mut out = String::new();
823 for (i, r) in results.iter().enumerate() {
824 if compact {
825 out.push_str(&format!(
826 "{}. {:.2} {}:{}-{} {:?} {}\n",
827 i + 1,
828 r.score,
829 r.file_path,
830 r.start_line,
831 r.end_line,
832 r.kind,
833 r.symbol_name,
834 ));
835 } else {
836 out.push_str(&format!(
837 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
838 i + 1,
839 r.score,
840 r.file_path,
841 r.symbol_name,
842 r.kind,
843 r.start_line,
844 r.end_line,
845 r.snippet,
846 ));
847 }
848 }
849 out
850}
851
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    /// Identifiers and type names survive tokenization as whole tokens.
    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        assert!(tokens.contains(&"calculate_total".to_string()));
        assert!(tokens.contains(&"items".to_string()));
        assert!(tokens.contains(&"Vec".to_string()));
    }

    /// A camel-case token is kept and also split into its words.
    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        assert!(tokens.contains(&"calculateTotal".to_string()));
        assert!(tokens.contains(&"calculate".to_string()));
        assert!(tokens.contains(&"Total".to_string()));
    }

    /// A Rust function signature is recognised with its name and kind.
    #[test]
    fn detect_rust_function() {
        let (name, kind) =
            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    /// The chunk sharing a token with the query ranks first.
    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "auth.rs".into(),
            symbol_name: "validate_token".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
            token_count: 8,
        });
        index.add_chunk(CodeChunk {
            file_path: "db.rs".into(),
            symbol_name: "connect_database".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
            tokens: tokenize("fn connect_database url str Pool create_pool url"),
            token_count: 7,
        });
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }

    /// Equal scores are ordered by file path regardless of insertion order.
    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();

        // Inserted in reverse path order on purpose.
        index.add_chunk(CodeChunk {
            file_path: "b.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }

    /// Removing an indexed file makes the index stale.
    #[test]
    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");

        let idx = BM25Index::build_from_directory(root);
        assert!(!bm25_index_looks_stale(&idx, root));

        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
        assert!(bm25_index_looks_stale(&idx, root));
    }

    /// An unchanged file is reused from the previous index even when it can
    /// no longer be read, while a modified file is re-read.
    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        let a_path = root.join("a.rs");
        // Make a.rs unreadable: it can only end up in the new index via reuse.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        // Change b.rs so that it has to be re-read.
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
            .expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");

        // Restore permissions so the temp directory can be cleaned up.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, perms);
    }

    /// An index file above the size limit is renamed and not loaded.
    #[test]
    fn load_quarantines_oversized_index() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("create vectors dir");

        let index_path = dir.join("bm25_index.json");
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

        let result = BM25Index::load(root);
        let original_exists = index_path.exists();
        let quarantined_exists = dir.join("bm25_index.json.quarantined").exists();

        // Clean up before asserting so a failure cannot leak the override
        // into other tests.
        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");

        assert!(result.is_none(), "oversized index should return None");
        assert!(
            !original_exists,
            "original index should be removed after quarantine"
        );
        assert!(quarantined_exists, "quarantined file should exist");
    }

    /// An index above the size limit is not written to disk.
    #[test]
    fn save_refuses_oversized_output() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        let previous_data_dir = std::env::var_os("LEAN_CTX_DATA_DIR");
        std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

        let td = tempdir().expect("tempdir");
        let root = td.path();

        let mut index = BM25Index::new();
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "a".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn a() {}".into(),
            tokens: tokenize("fn a"),
            token_count: 2,
        });
        index.finalize();

        let _ = index.save(root);
        let index_path = BM25Index::index_file_path(root);
        let persisted = index_path.exists();

        // Restore the environment before asserting; the data-dir override
        // points at a directory that is deleted when this test ends.
        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        match previous_data_dir {
            Some(value) => std::env::set_var("LEAN_CTX_DATA_DIR", value),
            None => std::env::remove_var("LEAN_CTX_DATA_DIR"),
        }

        assert!(!persisted, "save should refuse to persist oversized index");
    }

    /// Saving writes a marker file containing the project root path.
    #[test]
    fn save_writes_project_root_marker() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        let index = BM25Index::build_from_directory(root);
        index.save(root).expect("save");

        let dir = crate::core::index_namespace::vectors_dir(root);
        let marker = dir.join("project_root.txt");
        assert!(marker.exists(), "project_root.txt marker should exist");
        let content = std::fs::read_to_string(&marker).expect("read marker");
        assert_eq!(content, root.to_string_lossy());
    }

    /// Files under `vendor/` and `dist/` are excluded by the default ignores.
    #[test]
    fn list_code_files_skips_default_vendor_ignores() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

        let files = list_code_files(root);
        assert!(
            files.iter().any(|f| f == "main.rs"),
            "main.rs should be included"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("vendor/")),
            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("dist/")),
            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
    }

    /// The listing never exceeds the file cap. Only the upper bound is
    /// checked; creating more files than the cap would be too slow here.
    #[test]
    fn list_code_files_respects_max_files_cap() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        for i in 0..10 {
            std::fs::write(
                root.join(format!("f{i}.rs")),
                format!("pub fn f{i}() {{}}\n"),
            )
            .expect("write");
        }
        let files = list_code_files(root);
        assert!(
            files.len() <= MAX_BM25_FILES,
            "file count should not exceed MAX_BM25_FILES"
        );
    }

    /// The environment variable overrides the cache limit, in megabytes.
    #[test]
    fn max_bm25_cache_bytes_reads_env() {
        let _env = crate::core::data_dir::test_env_lock();
        std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
        let bytes = max_bm25_cache_bytes();
        // Clean up before asserting so a failure cannot leak the override.
        std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
        assert_eq!(bytes, 64 * 1024 * 1024);
    }
}