1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files collected by a single `list_code_files` walk;
/// once reached, the remaining files are skipped with a warning.
const MAX_BM25_FILES: usize = 5_000;

/// Chunk count above which `BM25Index::save` logs a warning suggesting
/// additional ignore patterns.
const CHUNK_COUNT_WARNING: usize = 50_000;
9
/// Glob patterns (matched against root-relative paths) that are always
/// excluded from indexing, in addition to any user-configured patterns.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    // Third-party and build output directories.
    "vendor/**",
    "dist/**",
    "build/**",
    // Published web assets.
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    // Framework and interpreter caches.
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    // Minified or bundled artifacts.
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
26
27fn max_bm25_cache_bytes() -> u64 {
28 std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
29 .ok()
30 .and_then(|v| v.parse::<u64>().ok())
31 .unwrap_or_else(|| crate::core::config::Config::load().bm25_max_cache_mb)
32 * 1024
33 * 1024
34}
35
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct CodeChunk {
38 pub file_path: String,
39 pub symbol_name: String,
40 pub kind: ChunkKind,
41 pub start_line: usize,
42 pub end_line: usize,
43 pub content: String,
44 pub tokens: Vec<String>,
45 pub token_count: usize,
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
49pub enum ChunkKind {
50 Function,
51 Struct,
52 Impl,
53 Module,
54 Class,
55 Method,
56 Other,
57}
58
59#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
60pub struct IndexedFileState {
61 pub mtime_ms: u64,
62 pub size_bytes: u64,
63}
64
65impl IndexedFileState {
66 fn from_path(path: &Path) -> Option<Self> {
67 let meta = path.metadata().ok()?;
68 let size_bytes = meta.len();
69 let mtime_ms = meta
70 .modified()
71 .ok()
72 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
73 .map(|d| d.as_millis() as u64)?;
74 Some(Self {
75 mtime_ms,
76 size_bytes,
77 })
78 }
79}
80
81#[derive(Debug, Clone, Serialize, Deserialize)]
82pub struct BM25Index {
83 pub chunks: Vec<CodeChunk>,
84 pub inverted: HashMap<String, Vec<(usize, f64)>>,
85 pub avg_doc_len: f64,
86 pub doc_count: usize,
87 pub doc_freqs: HashMap<String, usize>,
88 #[serde(default)]
89 pub files: HashMap<String, IndexedFileState>,
90}
91
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct SearchResult {
94 pub chunk_idx: usize,
95 pub score: f64,
96 pub file_path: String,
97 pub symbol_name: String,
98 pub kind: ChunkKind,
99 pub start_line: usize,
100 pub end_line: usize,
101 pub snippet: String,
102}
103
/// BM25 `k1` parameter: controls how quickly term frequency saturates.
const BM25_K1: f64 = 1.2;

/// BM25 `b` parameter: strength of document-length normalization.
const BM25_B: f64 = 0.75;
106
107impl Default for BM25Index {
108 fn default() -> Self {
109 Self::new()
110 }
111}
112
113impl BM25Index {
114 pub fn new() -> Self {
115 Self {
116 chunks: Vec::new(),
117 inverted: HashMap::new(),
118 avg_doc_len: 0.0,
119 doc_count: 0,
120 doc_freqs: HashMap::new(),
121 files: HashMap::new(),
122 }
123 }
124
125 pub fn build_from_directory(root: &Path) -> Self {
126 let mut index = Self::new();
127 let files = list_code_files(root);
128 for rel in files {
129 let abs = root.join(&rel);
130 let Some(state) = IndexedFileState::from_path(&abs) else {
131 continue;
132 };
133 if let Ok(content) = std::fs::read_to_string(&abs) {
134 let mut chunks = extract_chunks(&rel, &content);
135 chunks.sort_by(|a, b| {
136 a.start_line
137 .cmp(&b.start_line)
138 .then_with(|| a.end_line.cmp(&b.end_line))
139 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
140 });
141 for chunk in chunks {
142 index.add_chunk(chunk);
143 }
144 index.files.insert(rel, state);
145 }
146 }
147
148 index.finalize();
149 index
150 }
151
152 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
153 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
154 for c in &prev.chunks {
155 old_by_file
156 .entry(c.file_path.clone())
157 .or_default()
158 .push(c.clone());
159 }
160 for v in old_by_file.values_mut() {
161 v.sort_by(|a, b| {
162 a.start_line
163 .cmp(&b.start_line)
164 .then_with(|| a.end_line.cmp(&b.end_line))
165 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
166 });
167 }
168
169 let mut index = Self::new();
170 let files = list_code_files(root);
171 for rel in files {
172 let abs = root.join(&rel);
173 let Some(state) = IndexedFileState::from_path(&abs) else {
174 continue;
175 };
176
177 let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
178 if unchanged {
179 if let Some(chunks) = old_by_file.get(&rel) {
180 for chunk in chunks {
181 index.add_chunk(chunk.clone());
182 }
183 index.files.insert(rel, state);
184 continue;
185 }
186 }
187
188 if let Ok(content) = std::fs::read_to_string(&abs) {
189 let mut chunks = extract_chunks(&rel, &content);
190 chunks.sort_by(|a, b| {
191 a.start_line
192 .cmp(&b.start_line)
193 .then_with(|| a.end_line.cmp(&b.end_line))
194 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
195 });
196 for chunk in chunks {
197 index.add_chunk(chunk);
198 }
199 index.files.insert(rel, state);
200 }
201 }
202
203 index.finalize();
204 index
205 }
206
207 fn add_chunk(&mut self, chunk: CodeChunk) {
208 let idx = self.chunks.len();
209
210 for token in &chunk.tokens {
211 let lower = token.to_lowercase();
212 self.inverted.entry(lower).or_default().push((idx, 1.0));
213 }
214
215 self.chunks.push(chunk);
216 }
217
218 fn finalize(&mut self) {
219 self.doc_count = self.chunks.len();
220 if self.doc_count == 0 {
221 return;
222 }
223
224 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
225 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
226
227 self.doc_freqs.clear();
228 for (term, postings) in &self.inverted {
229 let unique_docs: std::collections::HashSet<usize> =
230 postings.iter().map(|(idx, _)| *idx).collect();
231 self.doc_freqs.insert(term.clone(), unique_docs.len());
232 }
233 }
234
235 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
236 let query_tokens = tokenize(query);
237 if query_tokens.is_empty() || self.doc_count == 0 {
238 return Vec::new();
239 }
240
241 let mut scores: HashMap<usize, f64> = HashMap::new();
242
243 for token in &query_tokens {
244 let lower = token.to_lowercase();
245 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
246 if df == 0.0 {
247 continue;
248 }
249
250 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
251
252 if let Some(postings) = self.inverted.get(&lower) {
253 let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
254 for (idx, weight) in postings {
255 *doc_tfs.entry(*idx).or_insert(0.0) += weight;
256 }
257
258 for (doc_idx, tf) in &doc_tfs {
259 let doc_len = self.chunks[*doc_idx].token_count as f64;
260 let norm_len = doc_len / self.avg_doc_len.max(1.0);
261 let bm25 = idf * (tf * (BM25_K1 + 1.0))
262 / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
263
264 *scores.entry(*doc_idx).or_insert(0.0) += bm25;
265 }
266 }
267 }
268
269 let mut results: Vec<SearchResult> = scores
270 .into_iter()
271 .map(|(idx, score)| {
272 let chunk = &self.chunks[idx];
273 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
274 SearchResult {
275 chunk_idx: idx,
276 score,
277 file_path: chunk.file_path.clone(),
278 symbol_name: chunk.symbol_name.clone(),
279 kind: chunk.kind.clone(),
280 start_line: chunk.start_line,
281 end_line: chunk.end_line,
282 snippet,
283 }
284 })
285 .collect();
286
287 results.sort_by(|a, b| {
288 b.score
289 .partial_cmp(&a.score)
290 .unwrap_or(std::cmp::Ordering::Equal)
291 .then_with(|| a.file_path.cmp(&b.file_path))
292 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
293 .then_with(|| a.start_line.cmp(&b.start_line))
294 .then_with(|| a.end_line.cmp(&b.end_line))
295 });
296 results.truncate(top_k);
297 results
298 }
299
300 pub fn save(&self, root: &Path) -> std::io::Result<()> {
301 if self.chunks.len() > CHUNK_COUNT_WARNING {
302 tracing::warn!(
303 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
304 self.chunks.len(),
305 CHUNK_COUNT_WARNING
306 );
307 }
308
309 let dir = index_dir(root);
310 std::fs::create_dir_all(&dir)?;
311 let data = serde_json::to_string(self).map_err(std::io::Error::other)?;
312
313 let max_bytes = max_bm25_cache_bytes();
314 if data.len() as u64 > max_bytes {
315 tracing::warn!(
316 "[bm25] serialized index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
317 data.len() as f64 / 1_048_576.0,
318 max_bytes / (1024 * 1024),
319 dir.display()
320 );
321 return Ok(());
322 }
323
324 let target = dir.join("bm25_index.json");
325 let tmp = dir.join("bm25_index.json.tmp");
326 std::fs::write(&tmp, &data)?;
327 std::fs::rename(&tmp, &target)?;
328
329 let _ = std::fs::write(
330 dir.join("project_root.txt"),
331 root.to_string_lossy().as_bytes(),
332 );
333
334 Ok(())
335 }
336
337 pub fn load(root: &Path) -> Option<Self> {
338 let path = index_dir(root).join("bm25_index.json");
339 let meta = std::fs::metadata(&path).ok()?;
340 let max_bytes = max_bm25_cache_bytes();
341 if meta.len() > max_bytes {
342 tracing::warn!(
343 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
344 meta.len() as f64 / 1_073_741_824.0,
345 max_bytes / (1024 * 1024),
346 path.display()
347 );
348 let quarantined = path.with_extension("json.quarantined");
349 let _ = std::fs::rename(&path, &quarantined);
350 return None;
351 }
352 let data = std::fs::read_to_string(&path).ok()?;
353 serde_json::from_str(&data).ok()
354 }
355
356 pub fn load_or_build(root: &Path) -> Self {
357 if let Some(idx) = Self::load(root) {
358 if !bm25_index_looks_stale(&idx, root) {
359 return idx;
360 }
361 tracing::warn!(
362 "[bm25_index: stale index detected for {}; rebuilding]",
363 root.display()
364 );
365 let rebuilt = if idx.files.is_empty() {
366 Self::build_from_directory(root)
367 } else {
368 Self::rebuild_incremental(root, &idx)
369 };
370 let _ = rebuilt.save(root);
371 return rebuilt;
372 }
373
374 let built = Self::build_from_directory(root);
375 let _ = built.save(root);
376 built
377 }
378
379 pub fn index_file_path(root: &Path) -> PathBuf {
380 index_dir(root).join("bm25_index.json")
381 }
382}
383
384fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
385 if index.chunks.is_empty() {
386 return false;
387 }
388
389 if index.files.is_empty() {
390 let mut seen = std::collections::HashSet::<&str>::new();
392 for chunk in &index.chunks {
393 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
394 if rel.is_empty() {
395 continue;
396 }
397 if !seen.insert(rel) {
398 continue;
399 }
400 if !root.join(rel).exists() {
401 return true;
402 }
403 }
404 return false;
405 }
406
407 for (rel, old_state) in &index.files {
409 let abs = root.join(rel);
410 if !abs.exists() {
411 return true;
412 }
413 let Some(cur) = IndexedFileState::from_path(&abs) else {
414 return true;
415 };
416 if &cur != old_state {
417 return true;
418 }
419 }
420
421 for rel in list_code_files(root) {
423 if !index.files.contains_key(&rel) {
424 return true;
425 }
426 }
427
428 false
429}
430
431fn index_dir(root: &Path) -> PathBuf {
432 crate::core::index_namespace::vectors_dir(root)
433}
434
435fn list_code_files(root: &Path) -> Vec<String> {
436 let walker = ignore::WalkBuilder::new(root)
437 .hidden(true)
438 .git_ignore(true)
439 .git_global(true)
440 .git_exclude(true)
441 .build();
442
443 let cfg = crate::core::config::Config::load();
444 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
445 .iter()
446 .filter_map(|p| glob::Pattern::new(p).ok())
447 .collect();
448 ignore_patterns.extend(
449 cfg.extra_ignore_patterns
450 .iter()
451 .filter_map(|p| glob::Pattern::new(p).ok()),
452 );
453
454 let mut files: Vec<String> = Vec::new();
455 for entry in walker.flatten() {
456 let path = entry.path();
457 if !path.is_file() {
458 continue;
459 }
460 if !is_code_file(path) {
461 continue;
462 }
463 let rel = path
464 .strip_prefix(root)
465 .unwrap_or(path)
466 .to_string_lossy()
467 .to_string();
468 if rel.is_empty() {
469 continue;
470 }
471 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
472 continue;
473 }
474 if files.len() >= MAX_BM25_FILES {
475 tracing::warn!(
476 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
477 root.display()
478 );
479 break;
480 }
481 files.push(rel);
482 }
483
484 files.sort();
485 files.dedup();
486 files
487}
488
/// Returns `true` when the extension of `path`, compared case-insensitively,
/// is one of the source-code extensions the indexer handles.
///
/// Paths without an extension, or with a non-UTF-8 extension, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(raw_ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = raw_ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
525
526fn tokenize(text: &str) -> Vec<String> {
527 let mut tokens = Vec::new();
528 let mut current = String::new();
529
530 for ch in text.chars() {
531 if ch.is_alphanumeric() || ch == '_' {
532 current.push(ch);
533 } else {
534 if current.len() >= 2 {
535 tokens.push(current.clone());
536 }
537 current.clear();
538 }
539 }
540 if current.len() >= 2 {
541 tokens.push(current);
542 }
543
544 split_camel_case_tokens(&tokens)
545}
546
547pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
548 tokenize(text)
549}
550
/// Expands each token with its camelCase / PascalCase components.
///
/// Every input token is kept as-is, followed by its components when the token
/// contains at least one word boundary. A boundary lies before an uppercase
/// character that either
///
/// * follows a lowercase letter or a digit (`parse|HTTP`, `base64|Encode`), or
/// * is followed by a lowercase letter (`HTTP|Server`).
///
/// Runs of uppercase letters therefore stay together: `getID` yields `get`
/// and `ID`, and all-caps or SCREAMING_SNAKE tokens are not split at all.
/// Components shorter than two bytes are dropped.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let prev = chars[i - 1];
            let follows_word = prev.is_lowercase() || prev.is_numeric();
            let starts_word = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if follows_word || starts_word {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Emit the tail only when the token was actually split.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }

    result
}
575
576fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
577 #[cfg(feature = "tree-sitter")]
578 {
579 let ext = std::path::Path::new(file_path)
580 .extension()
581 .and_then(|e| e.to_str())
582 .unwrap_or("");
583 if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
584 return chunks;
585 }
586 }
587
588 let lines: Vec<&str> = content.lines().collect();
589 if lines.is_empty() {
590 return Vec::new();
591 }
592
593 let mut chunks = Vec::new();
594 let mut i = 0;
595
596 while i < lines.len() {
597 let trimmed = lines[i].trim();
598
599 if let Some((name, kind)) = detect_symbol(trimmed) {
600 let start = i;
601 let end = find_block_end(&lines, i);
602 let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
603 let tokens = tokenize(&block);
604 let token_count = tokens.len();
605
606 chunks.push(CodeChunk {
607 file_path: file_path.to_string(),
608 symbol_name: name,
609 kind,
610 start_line: start + 1,
611 end_line: end + 1,
612 content: block,
613 tokens,
614 token_count,
615 });
616
617 i = end + 1;
618 } else {
619 i += 1;
620 }
621 }
622
623 if chunks.is_empty() && !content.is_empty() {
624 let bytes = content.as_bytes();
629 let rk_chunks = crate::core::rabin_karp::chunk(content);
630 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
631 for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
632 let end = (c.offset + c.length).min(bytes.len());
633 let slice = &bytes[c.offset..end];
634 let chunk_text = String::from_utf8_lossy(slice).into_owned();
635 let tokens = tokenize(&chunk_text);
636 let token_count = tokens.len();
637 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
638 let end_line = start_line + bytecount::count(slice, b'\n');
639 chunks.push(CodeChunk {
640 file_path: file_path.to_string(),
641 symbol_name: format!("{file_path}#chunk-{idx}"),
642 kind: ChunkKind::Module,
643 start_line,
644 end_line: end_line.max(start_line),
645 content: chunk_text,
646 tokens,
647 token_count,
648 });
649 }
650 } else {
651 let tokens = tokenize(content);
652 let token_count = tokens.len();
653 let snippet = lines
654 .iter()
655 .take(50)
656 .copied()
657 .collect::<Vec<_>>()
658 .join("\n");
659 chunks.push(CodeChunk {
660 file_path: file_path.to_string(),
661 symbol_name: file_path.to_string(),
662 kind: ChunkKind::Module,
663 start_line: 1,
664 end_line: lines.len(),
665 content: snippet,
666 tokens,
667 token_count,
668 });
669 }
670 }
671
672 chunks
673}
674
675fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
676 let trimmed = line.trim();
677
678 let patterns: &[(&str, ChunkKind)] = &[
679 ("pub async fn ", ChunkKind::Function),
680 ("async fn ", ChunkKind::Function),
681 ("pub fn ", ChunkKind::Function),
682 ("fn ", ChunkKind::Function),
683 ("pub struct ", ChunkKind::Struct),
684 ("struct ", ChunkKind::Struct),
685 ("pub enum ", ChunkKind::Struct),
686 ("enum ", ChunkKind::Struct),
687 ("impl ", ChunkKind::Impl),
688 ("pub trait ", ChunkKind::Struct),
689 ("trait ", ChunkKind::Struct),
690 ("export function ", ChunkKind::Function),
691 ("export async function ", ChunkKind::Function),
692 ("export default function ", ChunkKind::Function),
693 ("function ", ChunkKind::Function),
694 ("async function ", ChunkKind::Function),
695 ("export class ", ChunkKind::Class),
696 ("class ", ChunkKind::Class),
697 ("export interface ", ChunkKind::Struct),
698 ("interface ", ChunkKind::Struct),
699 ("def ", ChunkKind::Function),
700 ("async def ", ChunkKind::Function),
701 ("class ", ChunkKind::Class),
702 ("func ", ChunkKind::Function),
703 ];
704
705 for (prefix, kind) in patterns {
706 if let Some(rest) = trimmed.strip_prefix(prefix) {
707 let name: String = rest
708 .chars()
709 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
710 .take_while(|c| *c != '<')
711 .collect();
712 if !name.is_empty() {
713 return Some((name, kind.clone()));
714 }
715 }
716 }
717
718 None
719}
720
/// Returns the index of the last line of the definition that starts at
/// `lines[start]`. This is a heuristic scan; strings and comments are not
/// parsed.
///
/// * **Brace-delimited blocks**: the block ends on the line where the first
///   `{` is balanced by its matching `}`. Parentheses are tracked only to skip
///   over a (possibly multi-line) signature and never terminate a block, so a
///   parameter list no longer ends the block on the signature line.
/// * **Body-less declarations**: when the signature ends with `;` before any
///   `{` is seen (e.g. `struct Foo(u32);`), the block ends on that line.
/// * **Indentation-delimited blocks**: when the signature ends with `:`,
///   braces are ignored and the block runs until the first non-blank line
///   indented no deeper than the definition line. Blank lines do not end the
///   block. The same dedent rule applies to other brace-less definitions
///   until a `{` is encountered.
///
/// If braces never balance or the signature never closes, the result is
/// capped at `start + 50`. The returned index is always within `lines`
/// (0 for an empty slice).
fn find_block_end(lines: &[&str], start: usize) -> usize {
    /// Number of leading space/tab characters on `line`.
    fn indent_of(line: &str) -> usize {
        line.chars().take_while(|c| *c == ' ' || *c == '\t').count()
    }

    let last = lines.len().saturating_sub(1);
    let base_indent = lines.get(start).map_or(0, |line| indent_of(line));

    let mut brace_depth = 0usize;
    let mut paren_depth = 0usize;
    // True while still inside the definition's signature.
    let mut in_header = true;
    // True for `:`-terminated signatures, where braces are ordinary characters.
    let mut indent_mode = false;
    let mut saw_brace = false;
    // Last non-blank line known to belong to a brace-less block.
    let mut last_content = start;

    for (i, line) in lines.iter().enumerate().skip(start) {
        let trimmed = line.trim();

        // Dedent ends a brace-less block. A line that opens with `{` is the
        // block opener of a brace-on-next-line style, not a sibling.
        if !in_header
            && !saw_brace
            && !trimmed.is_empty()
            && !trimmed.starts_with('{')
            && indent_of(line) <= base_indent
        {
            return last_content;
        }

        for ch in line.chars() {
            match ch {
                '(' if in_header => paren_depth += 1,
                ')' if in_header => paren_depth = paren_depth.saturating_sub(1),
                // Braces inside the signature's parentheses (default values
                // and the like) are not block openers.
                '{' if !indent_mode && (!in_header || paren_depth == 0) => {
                    brace_depth += 1;
                    saw_brace = true;
                    in_header = false;
                }
                '}' if brace_depth > 0 => {
                    brace_depth -= 1;
                    if brace_depth == 0 {
                        return i;
                    }
                }
                _ => {}
            }
        }

        if !trimmed.is_empty() {
            last_content = i;
        }

        // The signature is complete once its parentheses are balanced at the
        // end of a line; decide how the body (if any) is delimited.
        if in_header && paren_depth == 0 {
            in_header = false;
            if trimmed.ends_with(';') {
                return i;
            }
            indent_mode = trimmed.ends_with(':');
        }
    }

    if in_header || brace_depth > 0 {
        (start + 50).min(last)
    } else {
        last_content.min(last)
    }
}
758
759pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
760 if results.is_empty() {
761 return "No results found.".to_string();
762 }
763
764 let mut out = String::new();
765 for (i, r) in results.iter().enumerate() {
766 if compact {
767 out.push_str(&format!(
768 "{}. {:.2} {}:{}-{} {:?} {}\n",
769 i + 1,
770 r.score,
771 r.file_path,
772 r.start_line,
773 r.end_line,
774 r.kind,
775 r.symbol_name,
776 ));
777 } else {
778 out.push_str(&format!(
779 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
780 i + 1,
781 r.score,
782 r.file_path,
783 r.symbol_name,
784 r.kind,
785 r.start_line,
786 r.end_line,
787 r.snippet,
788 ));
789 }
790 }
791 out
792}
793
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::{OsStr, OsString};
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    /// Environment variable that overrides the cache size limit (in MiB).
    const CACHE_CAP_VAR: &str = "LEAN_CTX_BM25_MAX_CACHE_MB";

    /// Overrides an environment variable for the lifetime of the guard and
    /// restores the previous value on drop, including when the test panics.
    ///
    /// The environment is process-global, so a guard must only be created
    /// while the lock returned by `test_env_lock` is held.
    struct EnvVarGuard {
        key: &'static str,
        previous: Option<OsString>,
    }

    impl EnvVarGuard {
        fn set(key: &'static str, value: impl AsRef<OsStr>) -> Self {
            let previous = std::env::var_os(key);
            std::env::set_var(key, value);
            Self { key, previous }
        }

        fn unset(key: &'static str) -> Self {
            let previous = std::env::var_os(key);
            std::env::remove_var(key);
            Self { key, previous }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            match self.previous.take() {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
    }

    /// Builds a function chunk starting at line 1 whose tokens come from
    /// tokenizing `token_source`.
    fn function_chunk(
        file_path: &str,
        symbol_name: &str,
        end_line: usize,
        content: &str,
        token_source: &str,
        token_count: usize,
    ) -> CodeChunk {
        CodeChunk {
            file_path: file_path.into(),
            symbol_name: symbol_name.into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line,
            content: content.into(),
            tokens: tokenize(token_source),
            token_count,
        }
    }

    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        assert!(tokens.contains(&"calculate_total".to_string()));
        assert!(tokens.contains(&"items".to_string()));
        assert!(tokens.contains(&"Vec".to_string()));
    }

    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        assert!(tokens.contains(&"calculateTotal".to_string()));
        assert!(tokens.contains(&"calculate".to_string()));
        assert!(tokens.contains(&"Total".to_string()));
    }

    #[test]
    fn detect_rust_function() {
        let (name, kind) =
            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        index.add_chunk(function_chunk(
            "auth.rs",
            "validate_token",
            10,
            "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }",
            "fn validate_token token str bool check_jwt_expiry token",
            8,
        ));
        index.add_chunk(function_chunk(
            "db.rs",
            "connect_database",
            5,
            "fn connect_database(url: &str) -> Pool { create_pool(url) }",
            "fn connect_database url str Pool create_pool url",
            7,
        ));
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }

    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();

        // Insert "b.rs" first so the result order cannot come from insertion order.
        for file in ["b.rs", "a.rs"] {
            index.add_chunk(function_chunk(
                file,
                "same",
                1,
                "fn same() {}",
                "same token",
                2,
            ));
        }
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }

    #[test]
    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");

        let idx = BM25Index::build_from_directory(root);
        assert!(!bm25_index_looks_stale(&idx, root));

        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
        assert!(bm25_index_looks_stale(&idx, root));
    }

    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        // Make a.rs unreadable: it can only survive the rebuild via reuse.
        let a_path = root.join("a.rs");
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
            .expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);

        // Restore permissions before asserting so a failure cannot leave an
        // unreadable file behind.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, perms);

        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");
    }

    #[test]
    fn load_quarantines_oversized_index() {
        let _env = crate::core::data_dir::test_env_lock();
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let dir = crate::core::index_namespace::vectors_dir(root);
        std::fs::create_dir_all(&dir).expect("create vectors dir");

        let index_path = dir.join("bm25_index.json");
        // A zero-MiB limit makes any non-empty file oversized.
        let _cap = EnvVarGuard::set(CACHE_CAP_VAR, "0");
        std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

        let result = BM25Index::load(root);
        assert!(result.is_none(), "oversized index should return None");
        assert!(
            !index_path.exists(),
            "original index should be removed after quarantine"
        );
        assert!(
            dir.join("bm25_index.json.quarantined").exists(),
            "quarantined file should exist"
        );
    }

    #[test]
    fn save_refuses_oversized_output() {
        let _env = crate::core::data_dir::test_env_lock();
        let data_dir = tempdir().expect("data_dir");
        // Guards are declared after `data_dir`, so the variables are restored
        // before the temporary directory is deleted.
        let _data_dir_var = EnvVarGuard::set("LEAN_CTX_DATA_DIR", data_dir.path());
        let _cap = EnvVarGuard::set(CACHE_CAP_VAR, "0");

        let td = tempdir().expect("tempdir");
        let root = td.path();

        let mut index = BM25Index::new();
        index.add_chunk(function_chunk("a.rs", "a", 1, "fn a() {}", "fn a", 2));
        index.finalize();

        let _ = index.save(root);
        let index_path = BM25Index::index_file_path(root);
        assert!(
            !index_path.exists(),
            "save should refuse to persist oversized index"
        );
    }

    #[test]
    fn save_writes_project_root_marker() {
        // This test changes the process environment, so it must be serialized
        // with the other environment-dependent tests.
        let _env = crate::core::data_dir::test_env_lock();
        let _cap = EnvVarGuard::unset(CACHE_CAP_VAR);

        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

        let index = BM25Index::build_from_directory(root);
        index.save(root).expect("save");

        let dir = crate::core::index_namespace::vectors_dir(root);
        let marker = dir.join("project_root.txt");
        assert!(marker.exists(), "project_root.txt marker should exist");
        let content = std::fs::read_to_string(&marker).expect("read marker");
        assert_eq!(content, root.to_string_lossy());
    }

    #[test]
    fn list_code_files_skips_default_vendor_ignores() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
        std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
        std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
        std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
        std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

        let files = list_code_files(root);
        assert!(
            files.iter().any(|f| f == "main.rs"),
            "main.rs should be included"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("vendor/")),
            "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
        assert!(
            !files.iter().any(|f| f.starts_with("dist/")),
            "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
        );
    }

    #[test]
    fn list_code_files_respects_max_files_cap() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        for i in 0..10 {
            std::fs::write(
                root.join(format!("f{i}.rs")),
                format!("pub fn f{i}() {{}}\n"),
            )
            .expect("write");
        }
        let files = list_code_files(root);
        assert_eq!(files.len(), 10, "all files below the cap should be listed");
        assert!(
            files.len() <= MAX_BM25_FILES,
            "file count should not exceed MAX_BM25_FILES"
        );
    }

    #[test]
    fn max_bm25_cache_bytes_reads_env() {
        // Serialized with the other tests that read or write this variable.
        let _env = crate::core::data_dir::test_env_lock();
        let _cap = EnvVarGuard::set(CACHE_CAP_VAR, "64");

        let bytes = max_bm25_cache_bytes();
        assert_eq!(bytes, 64 * 1024 * 1024);
    }
}