1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
7const MAX_BM25_FILES: usize = 5000;
8const CHUNK_COUNT_WARNING: usize = 50_000;
9const ZSTD_LEVEL: i32 = 9;
10
11const DEFAULT_BM25_IGNORES: &[&str] = &[
12 "vendor/**",
13 "dist/**",
14 "build/**",
15 "public/vendor/**",
16 "public/js/**",
17 "public/css/**",
18 "public/build/**",
19 ".next/**",
20 ".nuxt/**",
21 "__pycache__/**",
22 "*.min.js",
23 "*.min.css",
24 "*.bundle.js",
25 "*.chunk.js",
26];
27
28fn max_bm25_cache_bytes() -> u64 {
29 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30 .ok()
31 .and_then(|v| v.parse::<u64>().ok())
32 .unwrap_or_else(|| {
33 let cfg = crate::core::config::Config::load();
34 let profile = crate::core::config::MemoryProfile::effective(&cfg);
35 let profile_mb = profile.bm25_max_cache_mb();
36 if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37 profile_mb
38 } else {
39 cfg.bm25_max_cache_mb
40 }
41 });
42 mb * 1024 * 1024
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct CodeChunk {
47 pub file_path: String,
48 pub symbol_name: String,
49 pub kind: ChunkKind,
50 pub start_line: usize,
51 pub end_line: usize,
52 pub content: String,
53 #[serde(default)]
54 pub tokens: Vec<String>,
55 pub token_count: usize,
56}
57
58#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
59pub enum ChunkKind {
60 Function,
61 Struct,
62 Impl,
63 Module,
64 Class,
65 Method,
66 Other,
67}
68
69#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
70pub struct IndexedFileState {
71 pub mtime_ms: u64,
72 pub size_bytes: u64,
73}
74
75impl IndexedFileState {
76 fn from_path(path: &Path) -> Option<Self> {
77 let meta = path.metadata().ok()?;
78 let size_bytes = meta.len();
79 let mtime_ms = meta
80 .modified()
81 .ok()
82 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
83 .map(|d| d.as_millis() as u64)?;
84 Some(Self {
85 mtime_ms,
86 size_bytes,
87 })
88 }
89}
90
91#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct BM25Index {
93 pub chunks: Vec<CodeChunk>,
94 pub inverted: HashMap<String, Vec<(usize, f64)>>,
95 pub avg_doc_len: f64,
96 pub doc_count: usize,
97 pub doc_freqs: HashMap<String, usize>,
98 #[serde(default)]
99 pub files: HashMap<String, IndexedFileState>,
100}
101
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct SearchResult {
104 pub chunk_idx: usize,
105 pub score: f64,
106 pub file_path: String,
107 pub symbol_name: String,
108 pub kind: ChunkKind,
109 pub start_line: usize,
110 pub end_line: usize,
111 pub snippet: String,
112}
113
114const BM25_K1: f64 = 1.2;
115const BM25_B: f64 = 0.75;
116
117impl Default for BM25Index {
118 fn default() -> Self {
119 Self::new()
120 }
121}
122
123impl BM25Index {
124 pub fn new() -> Self {
125 Self {
126 chunks: Vec::new(),
127 inverted: HashMap::new(),
128 avg_doc_len: 0.0,
129 doc_count: 0,
130 doc_freqs: HashMap::new(),
131 files: HashMap::new(),
132 }
133 }
134
135 pub fn memory_usage_bytes(&self) -> usize {
137 let chunks_size: usize = self
138 .chunks
139 .iter()
140 .map(|c| {
141 c.content.len()
142 + c.file_path.len()
143 + c.symbol_name.len()
144 + c.tokens.iter().map(String::len).sum::<usize>()
145 + 64
146 })
147 .sum();
148 let inverted_size: usize = self
149 .inverted
150 .iter()
151 .map(|(k, v)| k.len() + v.len() * 16 + 32)
152 .sum();
153 let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
154 let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
155 chunks_size + inverted_size + files_size + freqs_size
156 }
157
158 pub fn unload(&mut self) {
160 let usage = self.memory_usage_bytes();
161 self.chunks = Vec::new();
162 self.inverted = HashMap::new();
163 self.doc_freqs = HashMap::new();
164 self.files = HashMap::new();
165 self.avg_doc_len = 0.0;
166 self.doc_count = 0;
167 tracing::info!(
168 "[bm25] unloaded index, freed ~{:.1}MB",
169 usage as f64 / 1_048_576.0
170 );
171 }
172
173 #[cfg(test)]
175 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
176 let mut index = Self::new();
177 for mut chunk in chunks {
178 if chunk.token_count == 0 {
179 chunk.token_count = tokenize(&chunk.content).len();
180 }
181 index.add_chunk(chunk);
182 }
183 index.finalize();
184 index
185 }
186
187 pub fn build_from_directory(root: &Path) -> Self {
188 let mut index = Self::new();
189 let files = list_code_files(root);
190 for rel in files {
191 let abs = root.join(&rel);
192 let Some(state) = IndexedFileState::from_path(&abs) else {
193 continue;
194 };
195 if let Ok(content) = std::fs::read_to_string(&abs) {
196 let mut chunks = extract_chunks(&rel, &content);
197 chunks.sort_by(|a, b| {
198 a.start_line
199 .cmp(&b.start_line)
200 .then_with(|| a.end_line.cmp(&b.end_line))
201 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
202 });
203 for chunk in chunks {
204 index.add_chunk(chunk);
205 }
206 index.files.insert(rel, state);
207 }
208 }
209
210 index.finalize();
211 index
212 }
213
214 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
215 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
216 for c in &prev.chunks {
217 old_by_file
218 .entry(c.file_path.clone())
219 .or_default()
220 .push(c.clone());
221 }
222 for v in old_by_file.values_mut() {
223 v.sort_by(|a, b| {
224 a.start_line
225 .cmp(&b.start_line)
226 .then_with(|| a.end_line.cmp(&b.end_line))
227 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
228 });
229 }
230
231 let mut index = Self::new();
232 let files = list_code_files(root);
233 for rel in files {
234 let abs = root.join(&rel);
235 let Some(state) = IndexedFileState::from_path(&abs) else {
236 continue;
237 };
238
239 let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
240 if unchanged {
241 if let Some(chunks) = old_by_file.get(&rel) {
242 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
243 for chunk in chunks {
244 index.add_chunk(chunk.clone());
245 }
246 index.files.insert(rel, state);
247 continue;
248 }
249 }
250 }
251
252 if let Ok(content) = std::fs::read_to_string(&abs) {
253 let mut chunks = extract_chunks(&rel, &content);
254 chunks.sort_by(|a, b| {
255 a.start_line
256 .cmp(&b.start_line)
257 .then_with(|| a.end_line.cmp(&b.end_line))
258 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
259 });
260 for chunk in chunks {
261 index.add_chunk(chunk);
262 }
263 index.files.insert(rel, state);
264 }
265 }
266
267 index.finalize();
268 index
269 }
270
271 fn add_chunk(&mut self, chunk: CodeChunk) {
272 let idx = self.chunks.len();
273
274 let tokens = tokenize(&chunk.content);
275 for token in &tokens {
276 let lower = token.to_lowercase();
277 let postings = self.inverted.entry(lower.clone()).or_default();
278 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
279 *self.doc_freqs.entry(lower).or_insert(0) += 1;
280 }
281 postings.push((idx, 1.0));
282 }
283
284 self.chunks.push(CodeChunk {
285 token_count: tokens.len(),
286 tokens: Vec::new(),
287 ..chunk
288 });
289 }
290
291 fn finalize(&mut self) {
292 self.doc_count = self.chunks.len();
293 if self.doc_count == 0 {
294 return;
295 }
296
297 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
298 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
299 }
300
301 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
302 let query_tokens = tokenize(query);
303 if query_tokens.is_empty() || self.doc_count == 0 {
304 return Vec::new();
305 }
306
307 let mut scores: HashMap<usize, f64> = HashMap::new();
308
309 for token in &query_tokens {
310 let lower = token.to_lowercase();
311 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
312 if df == 0.0 {
313 continue;
314 }
315
316 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
317
318 if let Some(postings) = self.inverted.get(&lower) {
319 let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
320 for (idx, weight) in postings {
321 *doc_tfs.entry(*idx).or_insert(0.0) += weight;
322 }
323
324 for (doc_idx, tf) in &doc_tfs {
325 let doc_len = self.chunks[*doc_idx].token_count as f64;
326 let norm_len = doc_len / self.avg_doc_len.max(1.0);
327 let bm25 = idf * (tf * (BM25_K1 + 1.0))
328 / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
329
330 *scores.entry(*doc_idx).or_insert(0.0) += bm25;
331 }
332 }
333 }
334
335 let mut results: Vec<SearchResult> = scores
336 .into_iter()
337 .map(|(idx, score)| {
338 let chunk = &self.chunks[idx];
339 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
340 SearchResult {
341 chunk_idx: idx,
342 score,
343 file_path: chunk.file_path.clone(),
344 symbol_name: chunk.symbol_name.clone(),
345 kind: chunk.kind.clone(),
346 start_line: chunk.start_line,
347 end_line: chunk.end_line,
348 snippet,
349 }
350 })
351 .collect();
352
353 results.sort_by(|a, b| {
354 b.score
355 .partial_cmp(&a.score)
356 .unwrap_or(std::cmp::Ordering::Equal)
357 .then_with(|| a.file_path.cmp(&b.file_path))
358 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
359 .then_with(|| a.start_line.cmp(&b.start_line))
360 .then_with(|| a.end_line.cmp(&b.end_line))
361 });
362 results.truncate(top_k);
363 results
364 }
365
366 pub fn save(&self, root: &Path) -> std::io::Result<()> {
367 if self.chunks.len() > CHUNK_COUNT_WARNING {
368 tracing::warn!(
369 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
370 self.chunks.len(),
371 CHUNK_COUNT_WARNING
372 );
373 }
374
375 let dir = index_dir(root);
376 std::fs::create_dir_all(&dir)?;
377 let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
378 .map_err(|e| std::io::Error::other(e.to_string()))?;
379
380 let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
381 .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
382
383 let max_bytes = max_bm25_cache_bytes();
384 if compressed.len() as u64 > max_bytes {
385 tracing::warn!(
386 "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
387 compressed.len() as f64 / 1_048_576.0,
388 max_bytes / (1024 * 1024),
389 dir.display()
390 );
391 return Ok(());
392 }
393
394 tracing::info!(
395 "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
396 data.len() as f64 / 1_048_576.0,
397 compressed.len() as f64 / 1_048_576.0,
398 (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
399 );
400
401 let target = dir.join("bm25_index.bin.zst");
402 let tmp = dir.join("bm25_index.bin.zst.tmp");
403 std::fs::write(&tmp, &compressed)?;
404 std::fs::rename(&tmp, &target)?;
405
406 let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
407 let _ = std::fs::remove_file(dir.join("bm25_index.json"));
408
409 let _ = std::fs::write(
410 dir.join("project_root.txt"),
411 root.to_string_lossy().as_bytes(),
412 );
413
414 Ok(())
415 }
416
417 pub fn load(root: &Path) -> Option<Self> {
418 let dir = index_dir(root);
419 let max_bytes = max_bm25_cache_bytes();
420
421 let zst_path = dir.join("bm25_index.bin.zst");
422 if zst_path.exists() {
423 let meta = std::fs::metadata(&zst_path).ok()?;
424 if meta.len() > max_bytes {
425 tracing::warn!(
426 "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
427 meta.len() as f64 / 1_073_741_824.0,
428 max_bytes / (1024 * 1024),
429 zst_path.display()
430 );
431 let quarantined = zst_path.with_extension("zst.quarantined");
432 let _ = std::fs::rename(&zst_path, &quarantined);
433 return None;
434 }
435 let compressed = std::fs::read(&zst_path).ok()?;
436 let data = zstd::decode_all(compressed.as_slice()).ok()?;
437 let (idx, _): (Self, _) =
438 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
439 return Some(idx);
440 }
441
442 let bin_path = dir.join("bm25_index.bin");
443 if bin_path.exists() {
444 let meta = std::fs::metadata(&bin_path).ok()?;
445 if meta.len() > max_bytes {
446 tracing::warn!(
447 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
448 meta.len() as f64 / 1_073_741_824.0,
449 max_bytes / (1024 * 1024),
450 bin_path.display()
451 );
452 let quarantined = bin_path.with_extension("bin.quarantined");
453 let _ = std::fs::rename(&bin_path, &quarantined);
454 return None;
455 }
456 let data = std::fs::read(&bin_path).ok()?;
457 let (idx, _): (Self, _) =
458 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
459 if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
461 let zst_tmp = zst_path.with_extension("zst.tmp");
462 if std::fs::write(&zst_tmp, &compressed).is_ok()
463 && std::fs::rename(&zst_tmp, &zst_path).is_ok()
464 {
465 tracing::info!(
466 "[bm25] migrated {:.1} MB → {:.1} MB zstd",
467 data.len() as f64 / 1_048_576.0,
468 compressed.len() as f64 / 1_048_576.0
469 );
470 let _ = std::fs::remove_file(&bin_path);
471 }
472 }
473 return Some(idx);
474 }
475
476 let json_path = dir.join("bm25_index.json");
477 if json_path.exists() {
478 let meta = std::fs::metadata(&json_path).ok()?;
479 if meta.len() > max_bytes {
480 tracing::warn!(
481 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
482 meta.len() as f64 / 1_073_741_824.0,
483 max_bytes / (1024 * 1024),
484 json_path.display()
485 );
486 let quarantined = json_path.with_extension("json.quarantined");
487 let _ = std::fs::rename(&json_path, &quarantined);
488 return None;
489 }
490 let data = std::fs::read_to_string(&json_path).ok()?;
491 return serde_json::from_str(&data).ok();
492 }
493
494 None
495 }
496
497 pub fn load_or_build(root: &Path) -> Self {
498 if let Some(idx) = Self::load(root) {
499 if !bm25_index_looks_stale(&idx, root) {
500 return idx;
501 }
502 tracing::warn!(
503 "[bm25_index: stale index detected for {}; rebuilding]",
504 root.display()
505 );
506 let rebuilt = if idx.files.is_empty() {
507 Self::build_from_directory(root)
508 } else {
509 Self::rebuild_incremental(root, &idx)
510 };
511 let _ = rebuilt.save(root);
512 return rebuilt;
513 }
514
515 let built = Self::build_from_directory(root);
516 let _ = built.save(root);
517 built
518 }
519
520 pub fn index_file_path(root: &Path) -> PathBuf {
521 let dir = index_dir(root);
522 let zst = dir.join("bm25_index.bin.zst");
523 if zst.exists() {
524 return zst;
525 }
526 let bin = dir.join("bm25_index.bin");
527 if bin.exists() {
528 return bin;
529 }
530 dir.join("bm25_index.json")
531 }
532}
533
534fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
535 if index.chunks.is_empty() {
536 return false;
537 }
538
539 if index.files.is_empty() {
540 let mut seen = std::collections::HashSet::<&str>::new();
542 for chunk in &index.chunks {
543 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
544 if rel.is_empty() {
545 continue;
546 }
547 if !seen.insert(rel) {
548 continue;
549 }
550 if !root.join(rel).exists() {
551 return true;
552 }
553 }
554 return false;
555 }
556
557 for (rel, old_state) in &index.files {
559 let abs = root.join(rel);
560 if !abs.exists() {
561 return true;
562 }
563 let Some(cur) = IndexedFileState::from_path(&abs) else {
564 return true;
565 };
566 if &cur != old_state {
567 return true;
568 }
569 }
570
571 for rel in list_code_files(root) {
573 if !index.files.contains_key(&rel) {
574 return true;
575 }
576 }
577
578 false
579}
580
581fn index_dir(root: &Path) -> PathBuf {
582 crate::core::index_namespace::vectors_dir(root)
583}
584
585fn list_code_files(root: &Path) -> Vec<String> {
586 let walker = ignore::WalkBuilder::new(root)
587 .hidden(true)
588 .git_ignore(true)
589 .git_global(true)
590 .git_exclude(true)
591 .build();
592
593 let cfg = crate::core::config::Config::load();
594 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
595 .iter()
596 .filter_map(|p| glob::Pattern::new(p).ok())
597 .collect();
598 ignore_patterns.extend(
599 cfg.extra_ignore_patterns
600 .iter()
601 .filter_map(|p| glob::Pattern::new(p).ok()),
602 );
603
604 let mut files: Vec<String> = Vec::new();
605 for entry in walker.flatten() {
606 let path = entry.path();
607 if !path.is_file() {
608 continue;
609 }
610 if !is_code_file(path) {
611 continue;
612 }
613 let rel = path
614 .strip_prefix(root)
615 .unwrap_or(path)
616 .to_string_lossy()
617 .to_string();
618 if rel.is_empty() {
619 continue;
620 }
621 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
622 continue;
623 }
624 if files.len() >= MAX_BM25_FILES {
625 tracing::warn!(
626 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
627 root.display()
628 );
629 break;
630 }
631 files.push(rel);
632 }
633
634 files.sort();
635 files.dedup();
636 files
637}
638
639pub fn is_code_file(path: &Path) -> bool {
640 let ext = path
641 .extension()
642 .and_then(|e| e.to_str())
643 .unwrap_or("")
644 .to_lowercase();
645 matches!(
646 ext.as_str(),
647 "rs" | "ts"
648 | "tsx"
649 | "js"
650 | "jsx"
651 | "py"
652 | "go"
653 | "java"
654 | "c"
655 | "cc"
656 | "cpp"
657 | "h"
658 | "hpp"
659 | "rb"
660 | "cs"
661 | "kt"
662 | "swift"
663 | "php"
664 | "scala"
665 | "sql"
666 | "ex"
667 | "exs"
668 | "zig"
669 | "lua"
670 | "dart"
671 | "vue"
672 | "svelte"
673 )
674}
675
676fn tokenize(text: &str) -> Vec<String> {
677 let mut tokens = Vec::new();
678 let mut current = String::new();
679
680 for ch in text.chars() {
681 if ch.is_alphanumeric() || ch == '_' {
682 current.push(ch);
683 } else {
684 if current.len() >= 2 {
685 tokens.push(current.clone());
686 }
687 current.clear();
688 }
689 }
690 if current.len() >= 2 {
691 tokens.push(current);
692 }
693
694 split_camel_case_tokens(&tokens)
695}
696
697pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
698 tokenize(text)
699}
700
701fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
702 let mut result = Vec::new();
703 for token in tokens {
704 result.push(token.clone());
705 let mut start = 0;
706 let chars: Vec<char> = token.chars().collect();
707 for i in 1..chars.len() {
708 if chars[i].is_uppercase() && (i + 1 >= chars.len() || !chars[i + 1].is_uppercase()) {
709 let part: String = chars[start..i].iter().collect();
710 if part.len() >= 2 {
711 result.push(part);
712 }
713 start = i;
714 }
715 }
716 if start > 0 {
717 let part: String = chars[start..].iter().collect();
718 if part.len() >= 2 {
719 result.push(part);
720 }
721 }
722 }
723 result
724}
725
726fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
727 #[cfg(feature = "tree-sitter")]
728 {
729 let ext = std::path::Path::new(file_path)
730 .extension()
731 .and_then(|e| e.to_str())
732 .unwrap_or("");
733 if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
734 return chunks;
735 }
736 }
737
738 let lines: Vec<&str> = content.lines().collect();
739 if lines.is_empty() {
740 return Vec::new();
741 }
742
743 let mut chunks = Vec::new();
744 let mut i = 0;
745
746 while i < lines.len() {
747 let trimmed = lines[i].trim();
748
749 if let Some((name, kind)) = detect_symbol(trimmed) {
750 let start = i;
751 let end = find_block_end(&lines, i);
752 let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
753 let token_count = tokenize(&block).len();
754
755 chunks.push(CodeChunk {
756 file_path: file_path.to_string(),
757 symbol_name: name,
758 kind,
759 start_line: start + 1,
760 end_line: end + 1,
761 content: block,
762 tokens: Vec::new(),
763 token_count,
764 });
765
766 i = end + 1;
767 } else {
768 i += 1;
769 }
770 }
771
772 if chunks.is_empty() && !content.is_empty() {
773 let bytes = content.as_bytes();
778 let rk_chunks = crate::core::rabin_karp::chunk(content);
779 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
780 for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
781 let end = (c.offset + c.length).min(bytes.len());
782 let slice = &bytes[c.offset..end];
783 let chunk_text = String::from_utf8_lossy(slice).into_owned();
784 let token_count = tokenize(&chunk_text).len();
785 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
786 let end_line = start_line + bytecount::count(slice, b'\n');
787 chunks.push(CodeChunk {
788 file_path: file_path.to_string(),
789 symbol_name: format!("{file_path}#chunk-{idx}"),
790 kind: ChunkKind::Module,
791 start_line,
792 end_line: end_line.max(start_line),
793 content: chunk_text,
794 tokens: Vec::new(),
795 token_count,
796 });
797 }
798 } else {
799 let token_count = tokenize(content).len();
800 let snippet = lines
801 .iter()
802 .take(50)
803 .copied()
804 .collect::<Vec<_>>()
805 .join("\n");
806 chunks.push(CodeChunk {
807 file_path: file_path.to_string(),
808 symbol_name: file_path.to_string(),
809 kind: ChunkKind::Module,
810 start_line: 1,
811 end_line: lines.len(),
812 content: snippet,
813 tokens: Vec::new(),
814 token_count,
815 });
816 }
817 }
818
819 chunks
820}
821
822fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
823 let trimmed = line.trim();
824
825 let patterns: &[(&str, ChunkKind)] = &[
826 ("pub async fn ", ChunkKind::Function),
827 ("async fn ", ChunkKind::Function),
828 ("pub fn ", ChunkKind::Function),
829 ("fn ", ChunkKind::Function),
830 ("pub struct ", ChunkKind::Struct),
831 ("struct ", ChunkKind::Struct),
832 ("pub enum ", ChunkKind::Struct),
833 ("enum ", ChunkKind::Struct),
834 ("impl ", ChunkKind::Impl),
835 ("pub trait ", ChunkKind::Struct),
836 ("trait ", ChunkKind::Struct),
837 ("export function ", ChunkKind::Function),
838 ("export async function ", ChunkKind::Function),
839 ("export default function ", ChunkKind::Function),
840 ("function ", ChunkKind::Function),
841 ("async function ", ChunkKind::Function),
842 ("export class ", ChunkKind::Class),
843 ("class ", ChunkKind::Class),
844 ("export interface ", ChunkKind::Struct),
845 ("interface ", ChunkKind::Struct),
846 ("def ", ChunkKind::Function),
847 ("async def ", ChunkKind::Function),
848 ("class ", ChunkKind::Class),
849 ("func ", ChunkKind::Function),
850 ];
851
852 for (prefix, kind) in patterns {
853 if let Some(rest) = trimmed.strip_prefix(prefix) {
854 let name: String = rest
855 .chars()
856 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
857 .take_while(|c| *c != '<')
858 .collect();
859 if !name.is_empty() {
860 return Some((name, kind.clone()));
861 }
862 }
863 }
864
865 None
866}
867
868fn find_block_end(lines: &[&str], start: usize) -> usize {
869 let mut depth = 0i32;
870 let mut found_open = false;
871
872 for (i, line) in lines.iter().enumerate().skip(start) {
873 for ch in line.chars() {
874 match ch {
875 '{' | '(' if !found_open || depth > 0 => {
876 depth += 1;
877 found_open = true;
878 }
879 '}' | ')' if depth > 0 => {
880 depth -= 1;
881 if depth == 0 && found_open {
882 return i;
883 }
884 }
885 _ => {}
886 }
887 }
888
889 if found_open && depth <= 0 && i > start {
890 return i;
891 }
892
893 if !found_open && i > start + 2 {
894 let trimmed = lines[i].trim();
895 if trimmed.is_empty()
896 || (!trimmed.starts_with(' ') && !trimmed.starts_with('\t') && i > start)
897 {
898 return i.saturating_sub(1);
899 }
900 }
901 }
902
903 (start + 50).min(lines.len().saturating_sub(1))
904}
905
906pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
907 if results.is_empty() {
908 return "No results found.".to_string();
909 }
910
911 let mut out = String::new();
912 for (i, r) in results.iter().enumerate() {
913 if compact {
914 out.push_str(&format!(
915 "{}. {:.2} {}:{}-{} {:?} {}\n",
916 i + 1,
917 r.score,
918 r.file_path,
919 r.start_line,
920 r.end_line,
921 r.kind,
922 r.symbol_name,
923 ));
924 } else {
925 out.push_str(&format!(
926 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
927 i + 1,
928 r.score,
929 r.file_path,
930 r.symbol_name,
931 r.kind,
932 r.start_line,
933 r.end_line,
934 r.snippet,
935 ));
936 }
937 }
938 out
939}
940
941#[cfg(test)]
942mod tests {
943 use super::*;
944 use tempfile::tempdir;
945
946 #[cfg(unix)]
947 use std::os::unix::fs::PermissionsExt;
948
949 #[test]
950 fn tokenize_splits_code() {
951 let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
952 assert!(tokens.contains(&"calculate_total".to_string()));
953 assert!(tokens.contains(&"items".to_string()));
954 assert!(tokens.contains(&"Vec".to_string()));
955 }
956
957 #[test]
958 fn camel_case_splitting() {
959 let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
960 assert!(tokens.contains(&"calculateTotal".to_string()));
961 assert!(tokens.contains(&"calculate".to_string()));
962 assert!(tokens.contains(&"Total".to_string()));
963 }
964
965 #[test]
966 fn detect_rust_function() {
967 let (name, kind) =
968 detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
969 assert_eq!(name, "process_request");
970 assert_eq!(kind, ChunkKind::Function);
971 }
972
973 #[test]
974 fn bm25_search_finds_relevant() {
975 let mut index = BM25Index::new();
976 index.add_chunk(CodeChunk {
977 file_path: "auth.rs".into(),
978 symbol_name: "validate_token".into(),
979 kind: ChunkKind::Function,
980 start_line: 1,
981 end_line: 10,
982 content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
983 tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
984 token_count: 8,
985 });
986 index.add_chunk(CodeChunk {
987 file_path: "db.rs".into(),
988 symbol_name: "connect_database".into(),
989 kind: ChunkKind::Function,
990 start_line: 1,
991 end_line: 5,
992 content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
993 tokens: tokenize("fn connect_database url str Pool create_pool url"),
994 token_count: 7,
995 });
996 index.finalize();
997
998 let results = index.search("jwt token validation", 5);
999 assert!(!results.is_empty());
1000 assert_eq!(results[0].symbol_name, "validate_token");
1001 }
1002
1003 #[test]
1004 fn bm25_search_sorts_ties_deterministically() {
1005 let mut index = BM25Index::new();
1006
1007 index.add_chunk(CodeChunk {
1009 file_path: "b.rs".into(),
1010 symbol_name: "same".into(),
1011 kind: ChunkKind::Function,
1012 start_line: 1,
1013 end_line: 1,
1014 content: "fn same() {}".into(),
1015 tokens: tokenize("same token"),
1016 token_count: 2,
1017 });
1018 index.add_chunk(CodeChunk {
1019 file_path: "a.rs".into(),
1020 symbol_name: "same".into(),
1021 kind: ChunkKind::Function,
1022 start_line: 1,
1023 end_line: 1,
1024 content: "fn same() {}".into(),
1025 tokens: tokenize("same token"),
1026 token_count: 2,
1027 });
1028 index.finalize();
1029
1030 let results = index.search("same", 10);
1031 assert!(results.len() >= 2);
1032 assert_eq!(results[0].file_path, "a.rs");
1033 assert_eq!(results[1].file_path, "b.rs");
1034 }
1035
1036 #[test]
1037 fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
1038 let td = tempdir().expect("tempdir");
1039 let root = td.path();
1040 std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
1041
1042 let idx = BM25Index::build_from_directory(root);
1043 assert!(!bm25_index_looks_stale(&idx, root));
1044
1045 std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
1046 assert!(bm25_index_looks_stale(&idx, root));
1047 }
1048
1049 #[test]
1050 #[cfg(unix)]
1051 fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
1052 let td = tempdir().expect("tempdir");
1053 let root = td.path();
1054
1055 std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
1056 std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
1057
1058 let idx1 = BM25Index::build_from_directory(root);
1059 assert!(idx1.files.contains_key("a.rs"));
1060 assert!(idx1.files.contains_key("b.rs"));
1061
1062 let a_path = root.join("a.rs");
1064 let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1065 perms.set_mode(0o000);
1066 std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
1067
1068 std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
1070 .expect("rewrite b.rs");
1071
1072 let idx2 = BM25Index::rebuild_incremental(root, &idx1);
1073 assert!(
1074 idx2.files.contains_key("a.rs"),
1075 "a.rs should be kept via reuse"
1076 );
1077 assert!(idx2.files.contains_key("b.rs"));
1078
1079 let b_has_b2 = idx2
1080 .chunks
1081 .iter()
1082 .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
1083 assert!(b_has_b2, "b.rs should be re-read and re-chunked");
1084
1085 let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1087 perms.set_mode(0o644);
1088 let _ = std::fs::set_permissions(&a_path, perms);
1089 }
1090
1091 #[test]
1092 fn load_quarantines_oversized_index() {
1093 let _env = crate::core::data_dir::test_env_lock();
1094 let td = tempdir().expect("tempdir");
1095 let root = td.path();
1096 let dir = crate::core::index_namespace::vectors_dir(root);
1097 std::fs::create_dir_all(&dir).expect("create vectors dir");
1098
1099 let index_path = dir.join("bm25_index.json");
1100 std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1101 std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
1102
1103 let result = BM25Index::load(root);
1104 assert!(result.is_none(), "oversized index should return None");
1105 assert!(
1106 !index_path.exists(),
1107 "original index should be removed after quarantine"
1108 );
1109 assert!(
1110 dir.join("bm25_index.json.quarantined").exists(),
1111 "quarantined file should exist"
1112 );
1113
1114 std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1115 }
1116
1117 #[test]
1118 fn save_refuses_oversized_output() {
1119 let _env = crate::core::data_dir::test_env_lock();
1120 let data_dir = tempdir().expect("data_dir");
1121 std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1122 std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1123
1124 let td = tempdir().expect("tempdir");
1125 let root = td.path();
1126
1127 let mut index = BM25Index::new();
1128 index.add_chunk(CodeChunk {
1129 file_path: "a.rs".into(),
1130 symbol_name: "a".into(),
1131 kind: ChunkKind::Function,
1132 start_line: 1,
1133 end_line: 1,
1134 content: "fn a() {}".into(),
1135 tokens: tokenize("fn a"),
1136 token_count: 2,
1137 });
1138 index.finalize();
1139
1140 let _ = index.save(root);
1141 let index_path = BM25Index::index_file_path(root);
1142 assert!(
1143 !index_path.exists(),
1144 "save should refuse to persist oversized index"
1145 );
1146
1147 std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1148 }
1149
1150 #[test]
1151 fn save_writes_project_root_marker() {
1152 let _env = crate::core::data_dir::test_env_lock();
1153 let td = tempdir().expect("tempdir");
1154 let root = td.path();
1155 std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1156
1157 std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1158 let index = BM25Index::build_from_directory(root);
1159 index.save(root).expect("save");
1160
1161 let dir = crate::core::index_namespace::vectors_dir(root);
1162 let marker = dir.join("project_root.txt");
1163 assert!(marker.exists(), "project_root.txt marker should exist");
1164 let content = std::fs::read_to_string(&marker).expect("read marker");
1165 assert_eq!(content, root.to_string_lossy());
1166 }
1167
1168 #[test]
1169 fn save_load_roundtrip_uses_zstd() {
1170 let _env = crate::core::data_dir::test_env_lock();
1171 let data_dir = tempdir().expect("data_dir");
1172 std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1173 std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1174 let td = tempdir().expect("tempdir");
1175 let root = td.path();
1176
1177 for i in 0..10 {
1178 std::fs::write(
1179 root.join(format!("mod{i}.rs")),
1180 format!(
1181 "pub fn handler_{i}() {{\n println!(\"hello\");\n}}\n\n\
1182 pub fn helper_{i}() {{\n println!(\"world\");\n}}\n"
1183 ),
1184 )
1185 .expect("write");
1186 }
1187
1188 let index = BM25Index::build_from_directory(root);
1189 assert!(index.doc_count > 0, "should have indexed chunks");
1190 index.save(root).expect("save");
1191
1192 let dir = crate::core::index_namespace::vectors_dir(root);
1193 let zst = dir.join("bm25_index.bin.zst");
1194 assert!(zst.exists(), "should write .bin.zst");
1195 assert!(
1196 !dir.join("bm25_index.bin").exists(),
1197 ".bin should be deleted"
1198 );
1199
1200 let loaded = BM25Index::load(root).expect("load compressed index");
1201 assert_eq!(loaded.doc_count, index.doc_count);
1202 assert_eq!(loaded.chunks.len(), index.chunks.len());
1203
1204 std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1205 std::env::remove_var("LEAN_CTX_DATA_DIR");
1206 }
1207
1208 #[test]
1209 fn auto_migrate_bin_to_zst() {
1210 let _env = crate::core::data_dir::test_env_lock();
1211 let data_dir = tempdir().expect("data_dir");
1212 std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
1213 std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
1214 let td = tempdir().expect("tempdir");
1215 let root = td.path();
1216
1217 std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
1218 let index = BM25Index::build_from_directory(root);
1219
1220 let dir = crate::core::index_namespace::vectors_dir(root);
1221 std::fs::create_dir_all(&dir).expect("mkdir");
1222 let data =
1223 bincode::serde::encode_to_vec(&index, bincode::config::standard()).expect("encode");
1224 std::fs::write(dir.join("bm25_index.bin"), &data).expect("write bin");
1225
1226 let loaded = BM25Index::load(root).expect("load should auto-migrate");
1227 assert_eq!(loaded.doc_count, index.doc_count);
1228 assert!(
1229 dir.join("bm25_index.bin.zst").exists(),
1230 ".bin.zst should be created"
1231 );
1232 assert!(
1233 !dir.join("bm25_index.bin").exists(),
1234 ".bin should be removed"
1235 );
1236
1237 std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1238 std::env::remove_var("LEAN_CTX_DATA_DIR");
1239 }
1240
1241 #[test]
1242 fn list_code_files_skips_default_vendor_ignores() {
1243 let td = tempdir().expect("tempdir");
1244 let root = td.path();
1245
1246 std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
1247 std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
1248 std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
1249 std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
1250 std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");
1251
1252 let files = list_code_files(root);
1253 assert!(
1254 files.iter().any(|f| f == "main.rs"),
1255 "main.rs should be included"
1256 );
1257 assert!(
1258 !files.iter().any(|f| f.starts_with("vendor/")),
1259 "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
1260 );
1261 assert!(
1262 !files.iter().any(|f| f.starts_with("dist/")),
1263 "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
1264 );
1265 }
1266
1267 #[test]
1268 fn list_code_files_respects_max_files_cap() {
1269 let td = tempdir().expect("tempdir");
1270 let root = td.path();
1271
1272 for i in 0..10 {
1275 std::fs::write(
1276 root.join(format!("f{i}.rs")),
1277 format!("pub fn f{i}() {{}}\n"),
1278 )
1279 .expect("write");
1280 }
1281 let files = list_code_files(root);
1282 assert!(
1283 files.len() <= MAX_BM25_FILES,
1284 "file count should not exceed MAX_BM25_FILES"
1285 );
1286 }
1287
1288 #[test]
1289 fn max_bm25_cache_bytes_reads_env() {
1290 let _env = crate::core::data_dir::test_env_lock();
1291 std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
1292 let bytes = max_bm25_cache_bytes();
1293 assert_eq!(bytes, 64 * 1024 * 1024);
1294 std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1295 }
1296}