1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files `list_code_files` will return for one project root.
const MAX_BM25_FILES: usize = 5_000;

/// Chunk count above which `BM25Index::save` logs a warning about index size.
const CHUNK_COUNT_WARNING: usize = 50_000;

/// Glob patterns, matched against root-relative paths, that are always excluded
/// from indexing in addition to any user-configured `extra_ignore_patterns`.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    // Third-party and build output directories.
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    // Minified or bundled assets.
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29 .ok()
30 .and_then(|v| v.parse::<u64>().ok())
31 .unwrap_or_else(|| {
32 let cfg = crate::core::config::Config::load();
33 let profile = crate::core::config::MemoryProfile::effective(&cfg);
34 let profile_mb = profile.bm25_max_cache_mb();
35 if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
36 profile_mb
37 } else {
38 cfg.bm25_max_cache_mb
39 }
40 });
41 mb * 1024 * 1024
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct CodeChunk {
46 pub file_path: String,
47 pub symbol_name: String,
48 pub kind: ChunkKind,
49 pub start_line: usize,
50 pub end_line: usize,
51 pub content: String,
52 #[serde(skip_serializing, default)]
53 pub tokens: Vec<String>,
54 pub token_count: usize,
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
58pub enum ChunkKind {
59 Function,
60 Struct,
61 Impl,
62 Module,
63 Class,
64 Method,
65 Other,
66}
67
68#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
69pub struct IndexedFileState {
70 pub mtime_ms: u64,
71 pub size_bytes: u64,
72}
73
74impl IndexedFileState {
75 fn from_path(path: &Path) -> Option<Self> {
76 let meta = path.metadata().ok()?;
77 let size_bytes = meta.len();
78 let mtime_ms = meta
79 .modified()
80 .ok()
81 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
82 .map(|d| d.as_millis() as u64)?;
83 Some(Self {
84 mtime_ms,
85 size_bytes,
86 })
87 }
88}
89
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct BM25Index {
92 pub chunks: Vec<CodeChunk>,
93 pub inverted: HashMap<String, Vec<(usize, f64)>>,
94 pub avg_doc_len: f64,
95 pub doc_count: usize,
96 pub doc_freqs: HashMap<String, usize>,
97 #[serde(default)]
98 pub files: HashMap<String, IndexedFileState>,
99}
100
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct SearchResult {
103 pub chunk_idx: usize,
104 pub score: f64,
105 pub file_path: String,
106 pub symbol_name: String,
107 pub kind: ChunkKind,
108 pub start_line: usize,
109 pub end_line: usize,
110 pub snippet: String,
111}
112
/// BM25 `k1`: controls how quickly repeated occurrences of a term saturate.
const BM25_K1: f64 = 1.2;
/// BM25 `b`: strength of the document-length normalisation (0 = none, 1 = full).
const BM25_B: f64 = 0.75;
115
116impl Default for BM25Index {
117 fn default() -> Self {
118 Self::new()
119 }
120}
121
122impl BM25Index {
123 pub fn new() -> Self {
124 Self {
125 chunks: Vec::new(),
126 inverted: HashMap::new(),
127 avg_doc_len: 0.0,
128 doc_count: 0,
129 doc_freqs: HashMap::new(),
130 files: HashMap::new(),
131 }
132 }
133
134 pub fn memory_usage_bytes(&self) -> usize {
136 let chunks_size: usize = self
137 .chunks
138 .iter()
139 .map(|c| {
140 c.content.len()
141 + c.file_path.len()
142 + c.symbol_name.len()
143 + c.tokens.iter().map(String::len).sum::<usize>()
144 + 64
145 })
146 .sum();
147 let inverted_size: usize = self
148 .inverted
149 .iter()
150 .map(|(k, v)| k.len() + v.len() * 16 + 32)
151 .sum();
152 let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
153 let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
154 chunks_size + inverted_size + files_size + freqs_size
155 }
156
157 pub fn unload(&mut self) {
159 let usage = self.memory_usage_bytes();
160 self.chunks = Vec::new();
161 self.inverted = HashMap::new();
162 self.doc_freqs = HashMap::new();
163 self.files = HashMap::new();
164 self.avg_doc_len = 0.0;
165 self.doc_count = 0;
166 tracing::info!(
167 "[bm25] unloaded index, freed ~{:.1}MB",
168 usage as f64 / 1_048_576.0
169 );
170 }
171
172 #[cfg(test)]
174 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
175 let mut index = Self::new();
176 for mut chunk in chunks {
177 if chunk.token_count == 0 {
178 chunk.token_count = tokenize(&chunk.content).len();
179 }
180 index.add_chunk(chunk);
181 }
182 index.finalize();
183 index
184 }
185
186 pub fn build_from_directory(root: &Path) -> Self {
187 let mut index = Self::new();
188 let files = list_code_files(root);
189 for rel in files {
190 let abs = root.join(&rel);
191 let Some(state) = IndexedFileState::from_path(&abs) else {
192 continue;
193 };
194 if let Ok(content) = std::fs::read_to_string(&abs) {
195 let mut chunks = extract_chunks(&rel, &content);
196 chunks.sort_by(|a, b| {
197 a.start_line
198 .cmp(&b.start_line)
199 .then_with(|| a.end_line.cmp(&b.end_line))
200 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
201 });
202 for chunk in chunks {
203 index.add_chunk(chunk);
204 }
205 index.files.insert(rel, state);
206 }
207 }
208
209 index.finalize();
210 index
211 }
212
213 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
214 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
215 for c in &prev.chunks {
216 old_by_file
217 .entry(c.file_path.clone())
218 .or_default()
219 .push(c.clone());
220 }
221 for v in old_by_file.values_mut() {
222 v.sort_by(|a, b| {
223 a.start_line
224 .cmp(&b.start_line)
225 .then_with(|| a.end_line.cmp(&b.end_line))
226 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
227 });
228 }
229
230 let mut index = Self::new();
231 let files = list_code_files(root);
232 for rel in files {
233 let abs = root.join(&rel);
234 let Some(state) = IndexedFileState::from_path(&abs) else {
235 continue;
236 };
237
238 let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
239 if unchanged {
240 if let Some(chunks) = old_by_file.get(&rel) {
241 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
242 for chunk in chunks {
243 index.add_chunk(chunk.clone());
244 }
245 index.files.insert(rel, state);
246 continue;
247 }
248 }
249 }
250
251 if let Ok(content) = std::fs::read_to_string(&abs) {
252 let mut chunks = extract_chunks(&rel, &content);
253 chunks.sort_by(|a, b| {
254 a.start_line
255 .cmp(&b.start_line)
256 .then_with(|| a.end_line.cmp(&b.end_line))
257 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
258 });
259 for chunk in chunks {
260 index.add_chunk(chunk);
261 }
262 index.files.insert(rel, state);
263 }
264 }
265
266 index.finalize();
267 index
268 }
269
270 fn add_chunk(&mut self, chunk: CodeChunk) {
271 let idx = self.chunks.len();
272
273 let tokens = tokenize(&chunk.content);
274 for token in &tokens {
275 let lower = token.to_lowercase();
276 let postings = self.inverted.entry(lower.clone()).or_default();
277 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
278 *self.doc_freqs.entry(lower).or_insert(0) += 1;
279 }
280 postings.push((idx, 1.0));
281 }
282
283 self.chunks.push(CodeChunk {
284 token_count: tokens.len(),
285 tokens: Vec::new(),
286 ..chunk
287 });
288 }
289
290 fn finalize(&mut self) {
291 self.doc_count = self.chunks.len();
292 if self.doc_count == 0 {
293 return;
294 }
295
296 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
297 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
298 }
299
300 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
301 let query_tokens = tokenize(query);
302 if query_tokens.is_empty() || self.doc_count == 0 {
303 return Vec::new();
304 }
305
306 let mut scores: HashMap<usize, f64> = HashMap::new();
307
308 for token in &query_tokens {
309 let lower = token.to_lowercase();
310 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
311 if df == 0.0 {
312 continue;
313 }
314
315 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
316
317 if let Some(postings) = self.inverted.get(&lower) {
318 let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
319 for (idx, weight) in postings {
320 *doc_tfs.entry(*idx).or_insert(0.0) += weight;
321 }
322
323 for (doc_idx, tf) in &doc_tfs {
324 let doc_len = self.chunks[*doc_idx].token_count as f64;
325 let norm_len = doc_len / self.avg_doc_len.max(1.0);
326 let bm25 = idf * (tf * (BM25_K1 + 1.0))
327 / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
328
329 *scores.entry(*doc_idx).or_insert(0.0) += bm25;
330 }
331 }
332 }
333
334 let mut results: Vec<SearchResult> = scores
335 .into_iter()
336 .map(|(idx, score)| {
337 let chunk = &self.chunks[idx];
338 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
339 SearchResult {
340 chunk_idx: idx,
341 score,
342 file_path: chunk.file_path.clone(),
343 symbol_name: chunk.symbol_name.clone(),
344 kind: chunk.kind.clone(),
345 start_line: chunk.start_line,
346 end_line: chunk.end_line,
347 snippet,
348 }
349 })
350 .collect();
351
352 results.sort_by(|a, b| {
353 b.score
354 .partial_cmp(&a.score)
355 .unwrap_or(std::cmp::Ordering::Equal)
356 .then_with(|| a.file_path.cmp(&b.file_path))
357 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
358 .then_with(|| a.start_line.cmp(&b.start_line))
359 .then_with(|| a.end_line.cmp(&b.end_line))
360 });
361 results.truncate(top_k);
362 results
363 }
364
365 pub fn save(&self, root: &Path) -> std::io::Result<()> {
366 if self.chunks.len() > CHUNK_COUNT_WARNING {
367 tracing::warn!(
368 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
369 self.chunks.len(),
370 CHUNK_COUNT_WARNING
371 );
372 }
373
374 let dir = index_dir(root);
375 std::fs::create_dir_all(&dir)?;
376 let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
377 .map_err(|e| std::io::Error::other(e.to_string()))?;
378
379 let max_bytes = max_bm25_cache_bytes();
380 if data.len() as u64 > max_bytes {
381 tracing::warn!(
382 "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
383 data.len() as f64 / 1_048_576.0,
384 max_bytes / (1024 * 1024),
385 dir.display()
386 );
387 return Ok(());
388 }
389
390 let target = dir.join("bm25_index.bin");
391 let tmp = dir.join("bm25_index.bin.tmp");
392 std::fs::write(&tmp, &data)?;
393 std::fs::rename(&tmp, &target)?;
394
395 let _ = std::fs::remove_file(dir.join("bm25_index.json"));
396
397 let _ = std::fs::write(
398 dir.join("project_root.txt"),
399 root.to_string_lossy().as_bytes(),
400 );
401
402 Ok(())
403 }
404
405 pub fn load(root: &Path) -> Option<Self> {
406 let dir = index_dir(root);
407 let max_bytes = max_bm25_cache_bytes();
408
409 let bin_path = dir.join("bm25_index.bin");
410 if bin_path.exists() {
411 let meta = std::fs::metadata(&bin_path).ok()?;
412 if meta.len() > max_bytes {
413 tracing::warn!(
414 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
415 meta.len() as f64 / 1_073_741_824.0,
416 max_bytes / (1024 * 1024),
417 bin_path.display()
418 );
419 let quarantined = bin_path.with_extension("bin.quarantined");
420 let _ = std::fs::rename(&bin_path, &quarantined);
421 return None;
422 }
423 let data = std::fs::read(&bin_path).ok()?;
424 let (idx, _): (Self, _) =
425 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
426 return Some(idx);
427 }
428
429 let json_path = dir.join("bm25_index.json");
430 if json_path.exists() {
431 let meta = std::fs::metadata(&json_path).ok()?;
432 if meta.len() > max_bytes {
433 tracing::warn!(
434 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
435 meta.len() as f64 / 1_073_741_824.0,
436 max_bytes / (1024 * 1024),
437 json_path.display()
438 );
439 let quarantined = json_path.with_extension("json.quarantined");
440 let _ = std::fs::rename(&json_path, &quarantined);
441 return None;
442 }
443 let data = std::fs::read_to_string(&json_path).ok()?;
444 return serde_json::from_str(&data).ok();
445 }
446
447 None
448 }
449
450 pub fn load_or_build(root: &Path) -> Self {
451 if let Some(idx) = Self::load(root) {
452 if !bm25_index_looks_stale(&idx, root) {
453 return idx;
454 }
455 tracing::warn!(
456 "[bm25_index: stale index detected for {}; rebuilding]",
457 root.display()
458 );
459 let rebuilt = if idx.files.is_empty() {
460 Self::build_from_directory(root)
461 } else {
462 Self::rebuild_incremental(root, &idx)
463 };
464 let _ = rebuilt.save(root);
465 return rebuilt;
466 }
467
468 let built = Self::build_from_directory(root);
469 let _ = built.save(root);
470 built
471 }
472
473 pub fn index_file_path(root: &Path) -> PathBuf {
474 let dir = index_dir(root);
475 let bin = dir.join("bm25_index.bin");
476 if bin.exists() {
477 return bin;
478 }
479 dir.join("bm25_index.json")
480 }
481}
482
483fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
484 if index.chunks.is_empty() {
485 return false;
486 }
487
488 if index.files.is_empty() {
489 let mut seen = std::collections::HashSet::<&str>::new();
491 for chunk in &index.chunks {
492 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
493 if rel.is_empty() {
494 continue;
495 }
496 if !seen.insert(rel) {
497 continue;
498 }
499 if !root.join(rel).exists() {
500 return true;
501 }
502 }
503 return false;
504 }
505
506 for (rel, old_state) in &index.files {
508 let abs = root.join(rel);
509 if !abs.exists() {
510 return true;
511 }
512 let Some(cur) = IndexedFileState::from_path(&abs) else {
513 return true;
514 };
515 if &cur != old_state {
516 return true;
517 }
518 }
519
520 for rel in list_code_files(root) {
522 if !index.files.contains_key(&rel) {
523 return true;
524 }
525 }
526
527 false
528}
529
530fn index_dir(root: &Path) -> PathBuf {
531 crate::core::index_namespace::vectors_dir(root)
532}
533
534fn list_code_files(root: &Path) -> Vec<String> {
535 let walker = ignore::WalkBuilder::new(root)
536 .hidden(true)
537 .git_ignore(true)
538 .git_global(true)
539 .git_exclude(true)
540 .build();
541
542 let cfg = crate::core::config::Config::load();
543 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
544 .iter()
545 .filter_map(|p| glob::Pattern::new(p).ok())
546 .collect();
547 ignore_patterns.extend(
548 cfg.extra_ignore_patterns
549 .iter()
550 .filter_map(|p| glob::Pattern::new(p).ok()),
551 );
552
553 let mut files: Vec<String> = Vec::new();
554 for entry in walker.flatten() {
555 let path = entry.path();
556 if !path.is_file() {
557 continue;
558 }
559 if !is_code_file(path) {
560 continue;
561 }
562 let rel = path
563 .strip_prefix(root)
564 .unwrap_or(path)
565 .to_string_lossy()
566 .to_string();
567 if rel.is_empty() {
568 continue;
569 }
570 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
571 continue;
572 }
573 if files.len() >= MAX_BM25_FILES {
574 tracing::warn!(
575 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
576 root.display()
577 );
578 break;
579 }
580 files.push(rel);
581 }
582
583 files.sort();
584 files.dedup();
585 files
586}
587
/// Returns `true` when `path` has one of the recognised source-code file
/// extensions. The comparison ignores case; paths without an extension are
/// never code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
624
625fn tokenize(text: &str) -> Vec<String> {
626 let mut tokens = Vec::new();
627 let mut current = String::new();
628
629 for ch in text.chars() {
630 if ch.is_alphanumeric() || ch == '_' {
631 current.push(ch);
632 } else {
633 if current.len() >= 2 {
634 tokens.push(current.clone());
635 }
636 current.clear();
637 }
638 }
639 if current.len() >= 2 {
640 tokens.push(current);
641 }
642
643 split_camel_case_tokens(&tokens)
644}
645
646pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
647 tokenize(text)
648}
649
/// Expands each token into itself followed by its camel-case parts.
///
/// A new part starts at an uppercase letter that either follows a lowercase
/// letter or digit (`parse|HTTP`, `base64|Encode`) or precedes a lowercase
/// letter (`HTTP|Server`). Runs of capitals therefore stay together as one
/// acronym, and all-caps identifiers such as `HTTP` or `MAX_FILES` are not
/// split at all. Parts shorter than two bytes are dropped, and a token without
/// any boundary contributes only itself.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut push_part = |part: &[char]| {
            let part: String = part.iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        };

        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let prev = chars[i - 1];
            let follows_word = prev.is_lowercase() || prev.is_numeric();
            let starts_word = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if follows_word || starts_word {
                push_part(&chars[start..i]);
                start = i;
            }
        }

        // Only emit the tail when the token was actually split; otherwise the
        // tail is the whole token, which has already been pushed.
        if start > 0 {
            push_part(&chars[start..]);
        }
    }

    result
}
674
675fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
676 #[cfg(feature = "tree-sitter")]
677 {
678 let ext = std::path::Path::new(file_path)
679 .extension()
680 .and_then(|e| e.to_str())
681 .unwrap_or("");
682 if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
683 return chunks;
684 }
685 }
686
687 let lines: Vec<&str> = content.lines().collect();
688 if lines.is_empty() {
689 return Vec::new();
690 }
691
692 let mut chunks = Vec::new();
693 let mut i = 0;
694
695 while i < lines.len() {
696 let trimmed = lines[i].trim();
697
698 if let Some((name, kind)) = detect_symbol(trimmed) {
699 let start = i;
700 let end = find_block_end(&lines, i);
701 let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
702 let token_count = tokenize(&block).len();
703
704 chunks.push(CodeChunk {
705 file_path: file_path.to_string(),
706 symbol_name: name,
707 kind,
708 start_line: start + 1,
709 end_line: end + 1,
710 content: block,
711 tokens: Vec::new(),
712 token_count,
713 });
714
715 i = end + 1;
716 } else {
717 i += 1;
718 }
719 }
720
721 if chunks.is_empty() && !content.is_empty() {
722 let bytes = content.as_bytes();
727 let rk_chunks = crate::core::rabin_karp::chunk(content);
728 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
729 for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
730 let end = (c.offset + c.length).min(bytes.len());
731 let slice = &bytes[c.offset..end];
732 let chunk_text = String::from_utf8_lossy(slice).into_owned();
733 let token_count = tokenize(&chunk_text).len();
734 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
735 let end_line = start_line + bytecount::count(slice, b'\n');
736 chunks.push(CodeChunk {
737 file_path: file_path.to_string(),
738 symbol_name: format!("{file_path}#chunk-{idx}"),
739 kind: ChunkKind::Module,
740 start_line,
741 end_line: end_line.max(start_line),
742 content: chunk_text,
743 tokens: Vec::new(),
744 token_count,
745 });
746 }
747 } else {
748 let token_count = tokenize(content).len();
749 let snippet = lines
750 .iter()
751 .take(50)
752 .copied()
753 .collect::<Vec<_>>()
754 .join("\n");
755 chunks.push(CodeChunk {
756 file_path: file_path.to_string(),
757 symbol_name: file_path.to_string(),
758 kind: ChunkKind::Module,
759 start_line: 1,
760 end_line: lines.len(),
761 content: snippet,
762 tokens: Vec::new(),
763 token_count,
764 });
765 }
766 }
767
768 chunks
769}
770
771fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
772 let trimmed = line.trim();
773
774 let patterns: &[(&str, ChunkKind)] = &[
775 ("pub async fn ", ChunkKind::Function),
776 ("async fn ", ChunkKind::Function),
777 ("pub fn ", ChunkKind::Function),
778 ("fn ", ChunkKind::Function),
779 ("pub struct ", ChunkKind::Struct),
780 ("struct ", ChunkKind::Struct),
781 ("pub enum ", ChunkKind::Struct),
782 ("enum ", ChunkKind::Struct),
783 ("impl ", ChunkKind::Impl),
784 ("pub trait ", ChunkKind::Struct),
785 ("trait ", ChunkKind::Struct),
786 ("export function ", ChunkKind::Function),
787 ("export async function ", ChunkKind::Function),
788 ("export default function ", ChunkKind::Function),
789 ("function ", ChunkKind::Function),
790 ("async function ", ChunkKind::Function),
791 ("export class ", ChunkKind::Class),
792 ("class ", ChunkKind::Class),
793 ("export interface ", ChunkKind::Struct),
794 ("interface ", ChunkKind::Struct),
795 ("def ", ChunkKind::Function),
796 ("async def ", ChunkKind::Function),
797 ("class ", ChunkKind::Class),
798 ("func ", ChunkKind::Function),
799 ];
800
801 for (prefix, kind) in patterns {
802 if let Some(rest) = trimmed.strip_prefix(prefix) {
803 let name: String = rest
804 .chars()
805 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
806 .take_while(|c| *c != '<')
807 .collect();
808 if !name.is_empty() {
809 return Some((name, kind.clone()));
810 }
811 }
812 }
813
814 None
815}
816
/// Returns the index of the last line of the block that starts at `start`.
///
/// This is a heuristic that works on raw text:
///
/// * The block body is delimited by braces. Only a `{` seen outside any
///   parentheses or brackets opens the body, so parameter lists (including
///   multi-line ones and destructured `{ .. }` parameters) never end the
///   block; the block ends on the line where the body's brace closes.
/// * A `;` seen before any body opens (and outside parentheses/brackets) ends
///   a body-less declaration such as `struct Id(u64);` on that line.
/// * When no body has opened after more than two lines, the block is treated
///   as indentation-delimited and ends just before the first blank line or the
///   first line indented no deeper than the starting line.
/// * If nothing terminates the block, it is capped at 50 lines past `start`
///   or the last line, whichever comes first.
///
/// Delimiters inside string literals and comments are counted like any other.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    fn indent_of(line: &str) -> usize {
        line.chars().take_while(|c| c.is_whitespace()).count()
    }

    let base_indent = lines.get(start).map_or(0, |line| indent_of(line));
    // Combined nesting depth of parentheses, brackets and braces.
    let mut depth = 0usize;
    let mut body_opened = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    if depth == 0 {
                        body_opened = true;
                    }
                    depth += 1;
                }
                '(' | '[' => depth += 1,
                '}' if depth > 0 => {
                    depth -= 1;
                    if depth == 0 && body_opened {
                        return i;
                    }
                }
                ')' | ']' => depth = depth.saturating_sub(1),
                ';' if depth == 0 && !body_opened => return i,
                _ => {}
            }
        }

        // Indentation fallback for brace-less blocks. The two-line grace
        // period leaves room for a body brace that opens a little later
        // (e.g. after a `where` clause).
        if !body_opened && depth == 0 && i > start + 2 {
            let is_blank = line.trim().is_empty();
            if is_blank || indent_of(line) <= base_indent {
                return i - 1;
            }
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
854
855pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
856 if results.is_empty() {
857 return "No results found.".to_string();
858 }
859
860 let mut out = String::new();
861 for (i, r) in results.iter().enumerate() {
862 if compact {
863 out.push_str(&format!(
864 "{}. {:.2} {}:{}-{} {:?} {}\n",
865 i + 1,
866 r.score,
867 r.file_path,
868 r.start_line,
869 r.end_line,
870 r.kind,
871 r.symbol_name,
872 ));
873 } else {
874 out.push_str(&format!(
875 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
876 i + 1,
877 r.score,
878 r.file_path,
879 r.symbol_name,
880 r.kind,
881 r.start_line,
882 r.end_line,
883 r.snippet,
884 ));
885 }
886 }
887 out
888}
889
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    const MAX_CACHE_ENV: &str = "LEAN_CTX_BM25_MAX_CACHE_MB";
    const DATA_DIR_ENV: &str = "LEAN_CTX_DATA_DIR";

    /// Changes one environment variable for the lifetime of the guard and
    /// restores its previous value on drop, including when the test panics.
    ///
    /// Declare guards after acquiring the env lock so they are dropped, and
    /// the environment restored, before the lock is released.
    struct EnvVarGuard {
        key: &'static str,
        previous: Option<std::ffi::OsString>,
    }

    impl EnvVarGuard {
        fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
            let previous = std::env::var_os(key);
            std::env::set_var(key, value);
            Self { key, previous }
        }

        fn unset(key: &'static str) -> Self {
            let previous = std::env::var_os(key);
            std::env::remove_var(key);
            Self { key, previous }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            match self.previous.take() {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
    }

    /// Builds a single-line function chunk starting at line 1.
    fn function_chunk(
        file_path: &str,
        symbol_name: &str,
        end_line: usize,
        content: &str,
        token_source: &str,
        token_count: usize,
    ) -> CodeChunk {
        CodeChunk {
            file_path: file_path.into(),
            symbol_name: symbol_name.into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line,
            content: content.into(),
            tokens: tokenize(token_source),
            token_count,
        }
    }

    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        for expected in ["calculate_total", "items", "Vec"] {
            assert!(
                tokens.iter().any(|t| t == expected),
                "missing token {expected:?}"
            );
        }
    }

    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        for expected in ["calculateTotal", "calculate", "Total"] {
            assert!(
                tokens.iter().any(|t| t == expected),
                "missing token {expected:?}"
            );
        }
    }

    #[test]
    fn detect_rust_function() {
        let detected = detect_symbol("pub fn process_request(req: Request) -> Response {");
        let (name, kind) = detected.unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        index.add_chunk(function_chunk(
            "auth.rs",
            "validate_token",
            10,
            "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }",
            "fn validate_token token str bool check_jwt_expiry token",
            8,
        ));
        index.add_chunk(function_chunk(
            "db.rs",
            "connect_database",
            5,
            "fn connect_database(url: &str) -> Pool { create_pool(url) }",
            "fn connect_database url str Pool create_pool url",
            7,
        ));
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }

    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();
        // Insert in reverse path order so the tie-break, not insertion order,
        // has to produce the expected ranking.
        for file in ["b.rs", "a.rs"] {
            index.add_chunk(function_chunk(
                file,
                "same",
                1,
                "fn same() {}",
                "same token",
                2,
            ));
        }
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }

    #[test]
    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let source = root.join("a.rs");
        std::fs::write(&source, "pub fn a() {}\n").expect("write a.rs");

        let idx = BM25Index::build_from_directory(root);
        assert!(!bm25_index_looks_stale(&idx, root));

        std::fs::remove_file(&source).expect("remove a.rs");
        assert!(bm25_index_looks_stale(&idx, root));
    }

    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let a_path = root.join("a.rs");
        let b_path = root.join("b.rs");

        std::fs::write(&a_path, "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(&b_path, "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        // Make a.rs unreadable: it can then only end up in the new index by
        // reusing the previous chunks.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        std::fs::write(&b_path, "pub fn b() { println!(\"B2\"); }\n").expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);

        // Restore permissions before asserting so a failing assertion does not
        // leave an unreadable file behind.
        if let Ok(meta) = std::fs::metadata(&a_path) {
            let mut perms = meta.permissions();
            perms.set_mode(0o644);
            let _ = std::fs::set_permissions(&a_path, perms);
        }

        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(
            idx2.chunks.iter().any(|c| c.file_path == "a.rs"),
            "chunks of a.rs should be carried over from the previous index"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
    }

    #[test]
    fn load_quarantines_oversized_index() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("create vectors dir");

        let index_path = dir.join("bm25_index.json");
        // A limit of 0 MB makes any non-empty index file oversized.
        let _limit = EnvVarGuard::set(MAX_CACHE_ENV, "0");
        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

        let result = BM25Index::load(root);
        assert!(result.is_none(), "oversized index should return None");
        assert!(
            !index_path.exists(),
            "original index should be removed after quarantine"
        );
        assert!(
            dir.join("bm25_index.json.quarantined").exists(),
            "quarantined file should exist"
        );
    }

    #[test]
    fn save_refuses_oversized_output() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        let _data_dir_var = EnvVarGuard::set(DATA_DIR_ENV, data_dir.path());
        let _limit = EnvVarGuard::set(MAX_CACHE_ENV, "0");

        let td = tempdir().expect("tempdir");
        let root = td.path();

        let mut index = BM25Index::new();
        index.add_chunk(function_chunk("a.rs", "a", 1, "fn a() {}", "fn a", 2));
        index.finalize();

        let _ = index.save(root);
        let index_path = BM25Index::index_file_path(root);
        assert!(
            !index_path.exists(),
            "save should refuse to persist oversized index"
        );
    }

    #[test]
    fn save_writes_project_root_marker() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

        let _limit = EnvVarGuard::unset(MAX_CACHE_ENV);
        let index = BM25Index::build_from_directory(root);
        index.save(root).expect("save");

        let marker = crate::core::index_namespace::vectors_dir(root).join("project_root.txt");
        assert!(marker.exists(), "project_root.txt marker should exist");
        let content = std::fs::read_to_string(&marker).expect("read marker");
        assert_eq!(content, root.to_string_lossy());
    }

    #[test]
    fn list_code_files_skips_default_vendor_ignores() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

        let files = list_code_files(root);
        assert!(
            files.iter().any(|f| f == "main.rs"),
            "main.rs should be included"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("vendor/")),
            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("dist/")),
            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
    }

    #[test]
    fn list_code_files_respects_max_files_cap() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        for i in 0..10 {
            let path = root.join(format!("f{i}.rs"));
            std::fs::write(path, format!("pub fn f{i}() {{}}\n")).expect("write");
        }

        let files = list_code_files(root);
        assert!(
            files.len() <= MAX_BM25_FILES,
            "file count should not exceed MAX_BM25_FILES"
        );
    }

    #[test]
    fn max_bm25_cache_bytes_reads_env() {
        let _env = crate::core::data_dir::test_env_lock();
        let _limit = EnvVarGuard::set(MAX_CACHE_ENV, "64");
        assert_eq!(max_bm25_cache_bytes(), 64 * 1024 * 1024);
    }
}