1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files `list_code_files` collects before it stops walking.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning that suggests
/// adding `extra_ignore_patterns`.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// Compression level passed to `zstd::encode_all` when persisting the index.
const ZSTD_LEVEL: i32 = 9;

/// Glob patterns that `list_code_files` always excludes, matched against the
/// path relative to the scan root. Patterns from the configuration's
/// `extra_ignore_patterns` are applied in addition to these.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
27
28fn max_bm25_cache_bytes() -> u64 {
29 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30 .ok()
31 .and_then(|v| v.parse::<u64>().ok())
32 .unwrap_or_else(|| {
33 let cfg = crate::core::config::Config::load();
34 let profile = crate::core::config::MemoryProfile::effective(&cfg);
35 let profile_mb = profile.bm25_max_cache_mb();
36 if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37 profile_mb
38 } else {
39 cfg.bm25_max_cache_mb
40 }
41 });
42 mb * 1024 * 1024
43}
44
/// One indexed unit of source text (a detected symbol block or a fallback
/// slice of a file) together with its location.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the file the chunk was extracted from, as passed to the chunker.
    pub file_path: String,
    /// Name of the detected symbol, or a synthetic name for fallback chunks.
    pub symbol_name: String,
    /// Coarse classification of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk (the line-based chunker emits 1-based numbers).
    pub start_line: usize,
    /// Last line of the chunk.
    pub end_line: usize,
    /// Text of the chunk.
    pub content: String,
    /// Token list; `BM25Index::add_chunk` always stores this empty, and
    /// `#[serde(default)]` lets it be absent when deserializing.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens attributed to the chunk; `BM25Index::add_chunk`
    /// overwrites it with the token count of the enriched text.
    pub token_count: usize,
}
57
/// Coarse category of a [`CodeChunk`].
///
/// The line-based chunker in this file only produces `Function`, `Struct`
/// (also used for enums, traits and interfaces), `Impl`, `Class` and `Module`
/// (whole-file / fallback chunks).
// NOTE(review): `Method` and `Other` are presumably produced by the
// tree-sitter chunker — confirm against `chunks_ts`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
}
68
/// File fingerprint recorded at indexing time and compared later to decide
/// whether a file changed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
74
75impl IndexedFileState {
76 fn from_path(path: &Path) -> Option<Self> {
77 let meta = path.metadata().ok()?;
78 let size_bytes = meta.len();
79 let mtime_ms = meta
80 .modified()
81 .ok()
82 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
83 .map(|d| d.as_millis() as u64)?;
84 Some(Self {
85 mtime_ms,
86 size_bytes,
87 })
88 }
89}
90
/// In-memory BM25 index over code chunks; also the on-disk serialization format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; postings refer to chunks by their position here.
    pub chunks: Vec<CodeChunk>,
    /// Lowercased token -> postings `(chunk index, weight)`. `add_chunk`
    /// pushes one entry with weight `1.0` per token occurrence.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, computed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks, computed by `finalize`.
    pub doc_count: usize,
    /// Lowercased token -> number of chunks that contain it.
    pub doc_freqs: HashMap<String, usize>,
    /// Indexed file path -> fingerprint at indexing time. Defaults to empty
    /// when missing from serialized data.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
101
/// One ranked hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score over all query tokens.
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// First five lines of the chunk's content.
    pub snippet: String,
}
113
/// BM25 `k1` parameter used by `BM25Index::search` (term-frequency saturation).
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter used by `BM25Index::search` (document-length normalization).
const BM25_B: f64 = 0.75;
116
/// `Default` yields the same empty index as [`BM25Index::new`].
impl Default for BM25Index {
    fn default() -> Self {
        Self::new()
    }
}
122
123impl BM25Index {
124 pub fn new() -> Self {
125 Self {
126 chunks: Vec::new(),
127 inverted: HashMap::new(),
128 avg_doc_len: 0.0,
129 doc_count: 0,
130 doc_freqs: HashMap::new(),
131 files: HashMap::new(),
132 }
133 }
134
135 pub fn memory_usage_bytes(&self) -> usize {
137 let chunks_size: usize = self
138 .chunks
139 .iter()
140 .map(|c| {
141 c.content.len()
142 + c.file_path.len()
143 + c.symbol_name.len()
144 + c.tokens.iter().map(String::len).sum::<usize>()
145 + 64
146 })
147 .sum();
148 let inverted_size: usize = self
149 .inverted
150 .iter()
151 .map(|(k, v)| k.len() + v.len() * 16 + 32)
152 .sum();
153 let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
154 let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
155 chunks_size + inverted_size + files_size + freqs_size
156 }
157
158 pub fn unload(&mut self) {
160 let usage = self.memory_usage_bytes();
161 self.chunks = Vec::new();
162 self.inverted = HashMap::new();
163 self.doc_freqs = HashMap::new();
164 self.files = HashMap::new();
165 self.avg_doc_len = 0.0;
166 self.doc_count = 0;
167 tracing::info!(
168 "[bm25] unloaded index, freed ~{:.1}MB",
169 usage as f64 / 1_048_576.0
170 );
171 }
172
173 #[cfg(test)]
175 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
176 let mut index = Self::new();
177 for mut chunk in chunks {
178 if chunk.token_count == 0 {
179 chunk.token_count = tokenize(&chunk.content).len();
180 }
181 index.add_chunk(chunk);
182 }
183 index.finalize();
184 index
185 }
186
187 pub fn build_from_directory(root: &Path) -> Self {
188 let root_str = root.to_string_lossy();
189 if !super::graph_index::is_safe_scan_root_public(&root_str) {
190 tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
191 return Self::new();
192 }
193 let mut index = Self::new();
194 let files = list_code_files(root);
195 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
196
197 for (i, rel) in files.iter().enumerate() {
198 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
199 tracing::warn!(
200 "[bm25: stopping build at file {i}/{} due to memory pressure]",
201 files.len()
202 );
203 break;
204 }
205 if crate::core::memory_guard::abort_requested() {
206 tracing::warn!("[bm25: aborting build due to critical memory pressure]");
207 break;
208 }
209
210 let abs = root.join(rel);
211 let Some(state) = IndexedFileState::from_path(&abs) else {
212 continue;
213 };
214 if state.size_bytes > MAX_FILE_SIZE_BYTES {
215 continue;
216 }
217 if let Ok(content) = std::fs::read_to_string(&abs) {
218 let mut chunks = extract_chunks(rel, &content);
219 chunks.sort_by(|a, b| {
220 a.start_line
221 .cmp(&b.start_line)
222 .then_with(|| a.end_line.cmp(&b.end_line))
223 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
224 });
225 for chunk in chunks {
226 index.add_chunk(chunk);
227 }
228 index.files.insert(rel.clone(), state);
229 }
230 }
231
232 index.finalize();
233 index
234 }
235
236 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
237 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
238 for c in &prev.chunks {
239 old_by_file
240 .entry(c.file_path.clone())
241 .or_default()
242 .push(c.clone());
243 }
244 for v in old_by_file.values_mut() {
245 v.sort_by(|a, b| {
246 a.start_line
247 .cmp(&b.start_line)
248 .then_with(|| a.end_line.cmp(&b.end_line))
249 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
250 });
251 }
252
253 let mut index = Self::new();
254 let files = list_code_files(root);
255 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
256
257 for (i, rel) in files.iter().enumerate() {
258 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
259 tracing::warn!(
260 "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
261 files.len()
262 );
263 break;
264 }
265
266 let abs = root.join(rel);
267 let Some(state) = IndexedFileState::from_path(&abs) else {
268 continue;
269 };
270
271 let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
272 if unchanged {
273 if let Some(chunks) = old_by_file.get(rel) {
274 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
275 for chunk in chunks {
276 index.add_chunk(chunk.clone());
277 }
278 index.files.insert(rel.clone(), state);
279 continue;
280 }
281 }
282 }
283
284 if state.size_bytes > MAX_FILE_SIZE_BYTES {
285 continue;
286 }
287 if let Ok(content) = std::fs::read_to_string(&abs) {
288 let mut chunks = extract_chunks(rel, &content);
289 chunks.sort_by(|a, b| {
290 a.start_line
291 .cmp(&b.start_line)
292 .then_with(|| a.end_line.cmp(&b.end_line))
293 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
294 });
295 for chunk in chunks {
296 index.add_chunk(chunk);
297 }
298 index.files.insert(rel.clone(), state);
299 }
300 }
301
302 index.finalize();
303 index
304 }
305
306 fn add_chunk(&mut self, chunk: CodeChunk) {
307 let idx = self.chunks.len();
308
309 let enriched = enrich_for_bm25(&chunk);
310 let tokens = tokenize(&enriched);
311 for token in &tokens {
312 let lower = token.to_lowercase();
313 let postings = self.inverted.entry(lower.clone()).or_default();
314 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
315 *self.doc_freqs.entry(lower).or_insert(0) += 1;
316 }
317 postings.push((idx, 1.0));
318 }
319
320 self.chunks.push(CodeChunk {
321 token_count: tokens.len(),
322 tokens: Vec::new(),
323 ..chunk
324 });
325 }
326
327 fn finalize(&mut self) {
328 self.doc_count = self.chunks.len();
329 if self.doc_count == 0 {
330 return;
331 }
332
333 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
334 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
335 }
336
337 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
338 let query_tokens = tokenize(query);
339 if query_tokens.is_empty() || self.doc_count == 0 {
340 return Vec::new();
341 }
342
343 let mut scores: HashMap<usize, f64> = HashMap::new();
344
345 for token in &query_tokens {
346 let lower = token.to_lowercase();
347 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
348 if df == 0.0 {
349 continue;
350 }
351
352 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
353
354 if let Some(postings) = self.inverted.get(&lower) {
355 let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
356 for (idx, weight) in postings {
357 *doc_tfs.entry(*idx).or_insert(0.0) += weight;
358 }
359
360 for (doc_idx, tf) in &doc_tfs {
361 let doc_len = self.chunks[*doc_idx].token_count as f64;
362 let norm_len = doc_len / self.avg_doc_len.max(1.0);
363 let bm25 = idf * (tf * (BM25_K1 + 1.0))
364 / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
365
366 *scores.entry(*doc_idx).or_insert(0.0) += bm25;
367 }
368 }
369 }
370
371 let mut results: Vec<SearchResult> = scores
372 .into_iter()
373 .map(|(idx, score)| {
374 let chunk = &self.chunks[idx];
375 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
376 SearchResult {
377 chunk_idx: idx,
378 score,
379 file_path: chunk.file_path.clone(),
380 symbol_name: chunk.symbol_name.clone(),
381 kind: chunk.kind.clone(),
382 start_line: chunk.start_line,
383 end_line: chunk.end_line,
384 snippet,
385 }
386 })
387 .collect();
388
389 results.sort_by(|a, b| {
390 b.score
391 .partial_cmp(&a.score)
392 .unwrap_or(std::cmp::Ordering::Equal)
393 .then_with(|| a.file_path.cmp(&b.file_path))
394 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
395 .then_with(|| a.start_line.cmp(&b.start_line))
396 .then_with(|| a.end_line.cmp(&b.end_line))
397 });
398 results.truncate(top_k);
399 results
400 }
401
402 pub fn save(&self, root: &Path) -> std::io::Result<()> {
403 if self.chunks.len() > CHUNK_COUNT_WARNING {
404 tracing::warn!(
405 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
406 self.chunks.len(),
407 CHUNK_COUNT_WARNING
408 );
409 }
410
411 let dir = index_dir(root);
412 std::fs::create_dir_all(&dir)?;
413 let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
414 .map_err(|e| std::io::Error::other(e.to_string()))?;
415
416 let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
417 .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
418
419 let max_bytes = max_bm25_cache_bytes();
420 if compressed.len() as u64 > max_bytes {
421 tracing::warn!(
422 "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
423 compressed.len() as f64 / 1_048_576.0,
424 max_bytes / (1024 * 1024),
425 dir.display()
426 );
427 return Ok(());
428 }
429
430 tracing::info!(
431 "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
432 data.len() as f64 / 1_048_576.0,
433 compressed.len() as f64 / 1_048_576.0,
434 (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
435 );
436
437 let target = dir.join("bm25_index.bin.zst");
438 let tmp = dir.join("bm25_index.bin.zst.tmp");
439 std::fs::write(&tmp, &compressed)?;
440 std::fs::rename(&tmp, &target)?;
441
442 let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
443 let _ = std::fs::remove_file(dir.join("bm25_index.json"));
444
445 let _ = std::fs::write(
446 dir.join("project_root.txt"),
447 root.to_string_lossy().as_bytes(),
448 );
449
450 Ok(())
451 }
452
453 pub fn load(root: &Path) -> Option<Self> {
454 let dir = index_dir(root);
455 let max_bytes = max_bm25_cache_bytes();
456
457 let zst_path = dir.join("bm25_index.bin.zst");
458 if zst_path.exists() {
459 let meta = std::fs::metadata(&zst_path).ok()?;
460 if meta.len() > max_bytes {
461 tracing::warn!(
462 "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
463 meta.len() as f64 / 1_073_741_824.0,
464 max_bytes / (1024 * 1024),
465 zst_path.display()
466 );
467 let quarantined = zst_path.with_extension("zst.quarantined");
468 let _ = std::fs::rename(&zst_path, &quarantined);
469 return None;
470 }
471 let compressed = std::fs::read(&zst_path).ok()?;
472 let data = zstd::decode_all(compressed.as_slice()).ok()?;
473 let (idx, _): (Self, _) =
474 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
475 return Some(idx);
476 }
477
478 let bin_path = dir.join("bm25_index.bin");
479 if bin_path.exists() {
480 let meta = std::fs::metadata(&bin_path).ok()?;
481 if meta.len() > max_bytes {
482 tracing::warn!(
483 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
484 meta.len() as f64 / 1_073_741_824.0,
485 max_bytes / (1024 * 1024),
486 bin_path.display()
487 );
488 let quarantined = bin_path.with_extension("bin.quarantined");
489 let _ = std::fs::rename(&bin_path, &quarantined);
490 return None;
491 }
492 let data = std::fs::read(&bin_path).ok()?;
493 let (idx, _): (Self, _) =
494 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
495 if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
497 let zst_tmp = zst_path.with_extension("zst.tmp");
498 if std::fs::write(&zst_tmp, &compressed).is_ok()
499 && std::fs::rename(&zst_tmp, &zst_path).is_ok()
500 {
501 tracing::info!(
502 "[bm25] migrated {:.1} MB → {:.1} MB zstd",
503 data.len() as f64 / 1_048_576.0,
504 compressed.len() as f64 / 1_048_576.0
505 );
506 let _ = std::fs::remove_file(&bin_path);
507 }
508 }
509 return Some(idx);
510 }
511
512 let json_path = dir.join("bm25_index.json");
513 if json_path.exists() {
514 let meta = std::fs::metadata(&json_path).ok()?;
515 if meta.len() > max_bytes {
516 tracing::warn!(
517 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
518 meta.len() as f64 / 1_073_741_824.0,
519 max_bytes / (1024 * 1024),
520 json_path.display()
521 );
522 let quarantined = json_path.with_extension("json.quarantined");
523 let _ = std::fs::rename(&json_path, &quarantined);
524 return None;
525 }
526 let data = std::fs::read_to_string(&json_path).ok()?;
527 return serde_json::from_str(&data).ok();
528 }
529
530 None
531 }
532
533 pub fn load_or_build(root: &Path) -> Self {
534 if !is_safe_bm25_root(root) {
535 return Self::default();
536 }
537 if let Some(idx) = Self::load(root) {
538 if !bm25_index_looks_stale(&idx, root) {
539 return idx;
540 }
541 tracing::warn!(
542 "[bm25_index: stale index detected for {}; rebuilding]",
543 root.display()
544 );
545 let rebuilt = if idx.files.is_empty() {
546 Self::build_from_directory(root)
547 } else {
548 Self::rebuild_incremental(root, &idx)
549 };
550 let _ = rebuilt.save(root);
551 return rebuilt;
552 }
553
554 let built = Self::build_from_directory(root);
555 let _ = built.save(root);
556 built
557 }
558
559 pub fn index_file_path(root: &Path) -> PathBuf {
560 let dir = index_dir(root);
561 let zst = dir.join("bm25_index.bin.zst");
562 if zst.exists() {
563 return zst;
564 }
565 let bin = dir.join("bm25_index.bin");
566 if bin.exists() {
567 return bin;
568 }
569 dir.join("bm25_index.json")
570 }
571}
572
/// Returns whether `root` passes the shared scan-root safety check
/// (`graph_index::is_safe_scan_root_public`), applied to its lossy string form.
fn is_safe_bm25_root(root: &Path) -> bool {
    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
}
576
577fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
578 if index.chunks.is_empty() {
579 return false;
580 }
581
582 if index.files.is_empty() {
583 let mut seen = std::collections::HashSet::<&str>::new();
585 for chunk in &index.chunks {
586 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
587 if rel.is_empty() {
588 continue;
589 }
590 if !seen.insert(rel) {
591 continue;
592 }
593 if !root.join(rel).exists() {
594 return true;
595 }
596 }
597 return false;
598 }
599
600 for (rel, old_state) in &index.files {
602 let abs = root.join(rel);
603 if !abs.exists() {
604 return true;
605 }
606 let Some(cur) = IndexedFileState::from_path(&abs) else {
607 return true;
608 };
609 if &cur != old_state {
610 return true;
611 }
612 }
613
614 for rel in list_code_files(root) {
616 if !index.files.contains_key(&rel) {
617 return true;
618 }
619 }
620
621 false
622}
623
/// Directory in which the index files for `root` are stored, as resolved by
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
627
628fn list_code_files(root: &Path) -> Vec<String> {
629 let walker = ignore::WalkBuilder::new(root)
630 .hidden(true)
631 .git_ignore(true)
632 .git_global(true)
633 .git_exclude(true)
634 .max_depth(Some(20))
635 .build();
636
637 let cfg = crate::core::config::Config::load();
638 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
639 .iter()
640 .filter_map(|p| glob::Pattern::new(p).ok())
641 .collect();
642 ignore_patterns.extend(
643 cfg.extra_ignore_patterns
644 .iter()
645 .filter_map(|p| glob::Pattern::new(p).ok()),
646 );
647
648 let mut files: Vec<String> = Vec::new();
649 for entry in walker.flatten() {
650 let path = entry.path();
651 if !path.is_file() {
652 continue;
653 }
654 if !is_code_file(path) {
655 continue;
656 }
657 let rel = path
658 .strip_prefix(root)
659 .unwrap_or(path)
660 .to_string_lossy()
661 .to_string();
662 if rel.is_empty() {
663 continue;
664 }
665 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
666 continue;
667 }
668 if files.len() >= MAX_BM25_FILES {
669 tracing::warn!(
670 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
671 root.display()
672 );
673 break;
674 }
675 files.push(rel);
676 }
677
678 files.sort();
679 files.dedup();
680 files
681}
682
/// Returns `true` when the extension of `path`, compared case-insensitively,
/// is one of the recognized source-code extensions.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are not
/// code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    CODE_EXTENSIONS.contains(&ext.to_lowercase().as_str())
}
719
720fn tokenize(text: &str) -> Vec<String> {
721 let mut tokens = Vec::new();
722 let mut current = String::new();
723
724 for ch in text.chars() {
725 if ch.is_alphanumeric() || ch == '_' {
726 current.push(ch);
727 } else {
728 if current.len() >= 2 {
729 tokens.push(current.clone());
730 }
731 current.clear();
732 }
733 }
734 if current.len() >= 2 {
735 tokens.push(current);
736 }
737
738 split_camel_case_tokens(&tokens)
739}
740
/// Crate-visible entry point to the tokenizer used by the BM25 index; returns
/// exactly what `tokenize` returns for `text`.
pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
    tokenize(text)
}
744
745fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
746 let mut result = Vec::new();
747 for token in tokens {
748 result.push(token.clone());
749 let mut start = 0;
750 let chars: Vec<char> = token.chars().collect();
751 for i in 1..chars.len() {
752 if chars[i].is_uppercase() && (i + 1 >= chars.len() || !chars[i + 1].is_uppercase()) {
753 let part: String = chars[start..i].iter().collect();
754 if part.len() >= 2 {
755 result.push(part);
756 }
757 start = i;
758 }
759 }
760 if start > 0 {
761 let part: String = chars[start..].iter().collect();
762 if part.len() >= 2 {
763 result.push(part);
764 }
765 }
766 }
767 result
768}
769
/// Splits the text of one file into chunks for indexing.
///
/// Strategy, in order:
/// 1. With the `tree-sitter` feature enabled, use the tree-sitter chunker when
///    it returns a result for the file's extension.
/// 2. Otherwise scan line by line: every line on which `detect_symbol` finds a
///    definition starts a chunk that runs to the line `find_block_end` reports.
/// 3. If that produced nothing and the content is non-empty, fall back to
///    content-defined chunks from `rabin_karp::chunk`, or to a single
///    whole-file chunk.
///
/// `file_path` is copied into every chunk; line numbers are 1-based.
fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
    #[cfg(feature = "tree-sitter")]
    {
        let ext = std::path::Path::new(file_path)
            .extension()
            .and_then(|e| e.to_str())
            .unwrap_or("");
        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
            return chunks;
        }
    }

    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let mut chunks = Vec::new();
    let mut i = 0;

    while i < lines.len() {
        let trimmed = lines[i].trim();

        if let Some((name, kind)) = detect_symbol(trimmed) {
            let start = i;
            let end = find_block_end(&lines, i);
            // Clamp defensively so the slice never runs past the last line.
            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
            let token_count = tokenize(&block).len();

            chunks.push(CodeChunk {
                file_path: file_path.to_string(),
                symbol_name: name,
                kind,
                start_line: start + 1,
                end_line: end + 1,
                content: block,
                tokens: Vec::new(),
                token_count,
            });

            // Resume after the block: definitions nested inside it do not get
            // chunks of their own.
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if chunks.is_empty() && !content.is_empty() {
        let bytes = content.as_bytes();
        let rk_chunks = crate::core::rabin_karp::chunk(content);
        // Content-defined chunks are used only when there are between 1 and
        // 200 of them, and at most the first 50 are indexed.
        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
                // NOTE(review): assumes `c.offset <= bytes.len()`; a larger
                // offset would make this slice panic — confirm against
                // `rabin_karp::chunk`.
                let end = (c.offset + c.length).min(bytes.len());
                let slice = &bytes[c.offset..end];
                // Byte offsets may fall inside a multi-byte character, hence
                // the lossy conversion.
                let chunk_text = String::from_utf8_lossy(slice).into_owned();
                let token_count = tokenize(&chunk_text).len();
                // Line numbers are derived by counting newlines before and
                // inside the slice.
                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
                let end_line = start_line + bytecount::count(slice, b'\n');
                chunks.push(CodeChunk {
                    file_path: file_path.to_string(),
                    symbol_name: format!("{file_path}#chunk-{idx}"),
                    kind: ChunkKind::Module,
                    start_line,
                    end_line: end_line.max(start_line),
                    content: chunk_text,
                    tokens: Vec::new(),
                    token_count,
                });
            }
        } else {
            // Single whole-file chunk: the stored content is only the first
            // 50 lines, while `token_count` and `end_line` cover the whole file.
            let token_count = tokenize(content).len();
            let snippet = lines
                .iter()
                .take(50)
                .copied()
                .collect::<Vec<_>>()
                .join("\n");
            chunks.push(CodeChunk {
                file_path: file_path.to_string(),
                symbol_name: file_path.to_string(),
                kind: ChunkKind::Module,
                start_line: 1,
                end_line: lines.len(),
                content: snippet,
                tokens: Vec::new(),
                token_count,
            });
        }
    }

    chunks
}
865
866fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
867 let trimmed = line.trim();
868
869 let patterns: &[(&str, ChunkKind)] = &[
870 ("pub async fn ", ChunkKind::Function),
871 ("async fn ", ChunkKind::Function),
872 ("pub fn ", ChunkKind::Function),
873 ("fn ", ChunkKind::Function),
874 ("pub struct ", ChunkKind::Struct),
875 ("struct ", ChunkKind::Struct),
876 ("pub enum ", ChunkKind::Struct),
877 ("enum ", ChunkKind::Struct),
878 ("impl ", ChunkKind::Impl),
879 ("pub trait ", ChunkKind::Struct),
880 ("trait ", ChunkKind::Struct),
881 ("export function ", ChunkKind::Function),
882 ("export async function ", ChunkKind::Function),
883 ("export default function ", ChunkKind::Function),
884 ("function ", ChunkKind::Function),
885 ("async function ", ChunkKind::Function),
886 ("export class ", ChunkKind::Class),
887 ("class ", ChunkKind::Class),
888 ("export interface ", ChunkKind::Struct),
889 ("interface ", ChunkKind::Struct),
890 ("def ", ChunkKind::Function),
891 ("async def ", ChunkKind::Function),
892 ("class ", ChunkKind::Class),
893 ("func ", ChunkKind::Function),
894 ];
895
896 for (prefix, kind) in patterns {
897 if let Some(rest) = trimmed.strip_prefix(prefix) {
898 let name: String = rest
899 .chars()
900 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
901 .take_while(|c| *c != '<')
902 .collect();
903 if !name.is_empty() {
904 return Some((name, kind.clone()));
905 }
906 }
907 }
908
909 None
910}
911
/// Finds the index of the last line of the definition that starts at
/// `lines[start]`.
///
/// Rules, applied line by line from `start`:
/// * Curly braces delimit the body: the block ends on the line where the brace
///   depth returns to zero after the first `{`. Parentheses are ignored, so a
///   parameter list does not end the block.
/// * Before any `{` has been seen, a line ending in `;` is a body-less
///   declaration and ends the block on that line.
/// * Before any `{` has been seen and more than two lines past `start`, an
///   empty line or a line that starts in column 0 ends the block on the
///   previous line (covers brace-less, indentation-based definitions).
///
/// If none of these fire before the input ends, the block is capped at
/// `start + 50`, clamped to the last line. Braces inside strings or comments
/// are not recognized as such.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut depth = 0usize;
    let mut found_open = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    depth += 1;
                    found_open = true;
                }
                '}' if depth > 0 => {
                    depth -= 1;
                    if depth == 0 {
                        return i;
                    }
                }
                _ => {}
            }
        }

        if !found_open {
            let trimmed = line.trim();
            if trimmed.ends_with(';') {
                return i;
            }
            if i > start + 2 {
                // Test the raw line: the trimmed text never starts with whitespace.
                let dedented = !line.starts_with(' ') && !line.starts_with('\t');
                if trimmed.is_empty() || dedented {
                    return i.saturating_sub(1);
                }
            }
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
949
950pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
951 if results.is_empty() {
952 return "No results found.".to_string();
953 }
954
955 let mut out = String::new();
956 for (i, r) in results.iter().enumerate() {
957 if compact {
958 out.push_str(&format!(
959 "{}. {:.2} {}:{}-{} {:?} {}\n",
960 i + 1,
961 r.score,
962 r.file_path,
963 r.start_line,
964 r.end_line,
965 r.kind,
966 r.symbol_name,
967 ));
968 } else {
969 out.push_str(&format!(
970 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
971 i + 1,
972 r.score,
973 r.file_path,
974 r.symbol_name,
975 r.kind,
976 r.start_line,
977 r.end_line,
978 r.snippet,
979 ));
980 }
981 }
982 out
983}
984
985fn enrich_for_bm25(chunk: &CodeChunk) -> String {
992 let path = Path::new(&chunk.file_path);
993 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
994 let dir = path
995 .parent()
996 .and_then(|p| p.file_name())
997 .and_then(|d| d.to_str())
998 .unwrap_or("");
999
1000 if stem.is_empty() {
1001 return chunk.content.clone();
1002 }
1003
1004 format!("{} {} {} {}", chunk.content, stem, stem, dir)
1005}
1006
1007#[cfg(test)]
1008mod tests {
1009 use super::*;
1010 use tempfile::tempdir;
1011
1012 #[cfg(unix)]
1013 use std::os::unix::fs::PermissionsExt;
1014
1015 #[test]
1016 fn tokenize_splits_code() {
1017 let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
1018 assert!(tokens.contains(&"calculate_total".to_string()));
1019 assert!(tokens.contains(&"items".to_string()));
1020 assert!(tokens.contains(&"Vec".to_string()));
1021 }
1022
1023 #[test]
1024 fn camel_case_splitting() {
1025 let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
1026 assert!(tokens.contains(&"calculateTotal".to_string()));
1027 assert!(tokens.contains(&"calculate".to_string()));
1028 assert!(tokens.contains(&"Total".to_string()));
1029 }
1030
1031 #[test]
1032 fn detect_rust_function() {
1033 let (name, kind) =
1034 detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
1035 assert_eq!(name, "process_request");
1036 assert_eq!(kind, ChunkKind::Function);
1037 }
1038
1039 #[test]
1040 fn bm25_search_finds_relevant() {
1041 let mut index = BM25Index::new();
1042 index.add_chunk(CodeChunk {
1043 file_path: "auth.rs".into(),
1044 symbol_name: "validate_token".into(),
1045 kind: ChunkKind::Function,
1046 start_line: 1,
1047 end_line: 10,
1048 content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
1049 tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
1050 token_count: 8,
1051 });
1052 index.add_chunk(CodeChunk {
1053 file_path: "db.rs".into(),
1054 symbol_name: "connect_database".into(),
1055 kind: ChunkKind::Function,
1056 start_line: 1,
1057 end_line: 5,
1058 content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
1059 tokens: tokenize("fn connect_database url str Pool create_pool url"),
1060 token_count: 7,
1061 });
1062 index.finalize();
1063
1064 let results = index.search("jwt token validation", 5);
1065 assert!(!results.is_empty());
1066 assert_eq!(results[0].symbol_name, "validate_token");
1067 }
1068
1069 #[test]
1070 fn bm25_search_sorts_ties_deterministically() {
1071 let mut index = BM25Index::new();
1072
1073 index.add_chunk(CodeChunk {
1075 file_path: "b.rs".into(),
1076 symbol_name: "same".into(),
1077 kind: ChunkKind::Function,
1078 start_line: 1,
1079 end_line: 1,
1080 content: "fn same() {}".into(),
1081 tokens: tokenize("same token"),
1082 token_count: 2,
1083 });
1084 index.add_chunk(CodeChunk {
1085 file_path: "a.rs".into(),
1086 symbol_name: "same".into(),
1087 kind: ChunkKind::Function,
1088 start_line: 1,
1089 end_line: 1,
1090 content: "fn same() {}".into(),
1091 tokens: tokenize("same token"),
1092 token_count: 2,
1093 });
1094 index.finalize();
1095
1096 let results = index.search("same", 10);
1097 assert!(results.len() >= 2);
1098 assert_eq!(results[0].file_path, "a.rs");
1099 assert_eq!(results[1].file_path, "b.rs");
1100 }
1101
1102 #[test]
1103 fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
1104 let td = tempdir().expect("tempdir");
1105 let root = td.path();
1106 std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");
1107
1108 let idx = BM25Index::build_from_directory(root);
1109 assert!(!bm25_index_looks_stale(&idx, root));
1110
1111 std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
1112 assert!(bm25_index_looks_stale(&idx, root));
1113 }
1114
1115 #[test]
1116 #[cfg(unix)]
1117 fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
1118 let td = tempdir().expect("tempdir");
1119 let root = td.path();
1120
1121 std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
1122 std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");
1123
1124 let idx1 = BM25Index::build_from_directory(root);
1125 assert!(idx1.files.contains_key("a.rs"));
1126 assert!(idx1.files.contains_key("b.rs"));
1127
1128 let a_path = root.join("a.rs");
1130 let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1131 perms.set_mode(0o000);
1132 std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");
1133
1134 std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
1136 .expect("rewrite b.rs");
1137
1138 let idx2 = BM25Index::rebuild_incremental(root, &idx1);
1139 assert!(
1140 idx2.files.contains_key("a.rs"),
1141 "a.rs should be kept via reuse"
1142 );
1143 assert!(idx2.files.contains_key("b.rs"));
1144
1145 let b_has_b2 = idx2
1146 .chunks
1147 .iter()
1148 .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
1149 assert!(b_has_b2, "b.rs should be re-read and re-chunked");
1150
1151 let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
1153 perms.set_mode(0o644);
1154 let _ = std::fs::set_permissions(&a_path, perms);
1155 }
1156
1157 #[test]
1158 fn load_quarantines_oversized_index() {
1159 let _env = crate::core::data_dir::test_env_lock();
1160 let td = tempdir().expect("tempdir");
1161 let root = td.path();
1162 let dir = crate::core::index_namespace::vectors_dir(root);
1163 std::fs::create_dir_all(&dir).expect("create vectors dir");
1164
1165 let index_path = dir.join("bm25_index.json");
1166 std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
1167 std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");
1168
1169 let result = BM25Index::load(root);
1170 assert!(result.is_none(), "oversized index should return None");
1171 assert!(
1172 !index_path.exists(),
1173 "original index should be removed after quarantine"
1174 );
1175 assert!(
1176 dir.join("bm25_index.json.quarantined").exists(),
1177 "quarantined file should exist"
1178 );
1179
1180 std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
1181 }
1182
/// With the cache budget forced to 0 MB, `save` must not leave an index file
/// on disk for the project.
#[test]
fn save_refuses_oversized_output() {
    // Hold the shared env lock while this test mutates process-wide variables.
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

    let td = tempdir().expect("tempdir");
    let root = td.path();

    // Minimal single-chunk index: enough to produce non-empty output.
    let mut index = BM25Index::new();
    index.add_chunk(CodeChunk {
        file_path: "a.rs".into(),
        symbol_name: "a".into(),
        kind: ChunkKind::Function,
        start_line: 1,
        end_line: 1,
        content: "fn a() {}".into(),
        tokens: tokenize("fn a"),
        token_count: 2,
    });
    index.finalize();

    // The return value is deliberately ignored; this test pins only the
    // on-disk outcome.
    let _ = index.save(root);
    let index_path = BM25Index::index_file_path(root);
    assert!(
        !index_path.exists(),
        "save should refuse to persist oversized index"
    );

    // Clear BOTH overrides set above. Leaving LEAN_CTX_DATA_DIR behind would
    // leak a path into later tests (presumably one that no longer exists once
    // `data_dir` is dropped — confirm `tempdir` cleanup semantics).
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1215
/// Saving an index must also write a `project_root.txt` marker into the
/// vectors directory whose content is the project root path.
#[test]
fn save_writes_project_root_marker() {
    let _env = crate::core::data_dir::test_env_lock();
    // Drop any size-limit override a previous test may have left behind.
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");

    let project = tempdir().expect("tempdir");
    let root = project.path();
    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

    BM25Index::build_from_directory(root)
        .save(root)
        .expect("save");

    let marker_path = crate::core::index_namespace::vectors_dir(root).join("project_root.txt");
    assert!(
        marker_path.exists(),
        "project_root.txt marker should exist"
    );
    let marker_text = std::fs::read_to_string(&marker_path).expect("read marker");
    assert_eq!(marker_text, root.to_string_lossy());
}
1233
/// A saved index is stored as `bm25_index.bin.zst` (with no plain `.bin` left
/// next to it) and loads back with the same document and chunk counts.
#[test]
fn save_load_roundtrip_uses_zstd() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_home = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_home.path());
    // Generous budget so the size limit never interferes with this test.
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
    let project = tempdir().expect("tempdir");
    let root = project.path();

    // Ten small modules, two functions each, as indexing input.
    for i in 0..10 {
        let file_name = format!("mod{i}.rs");
        let source = format!(
            "pub fn handler_{i}() {{\n println!(\"hello\");\n}}\n\n\
             pub fn helper_{i}() {{\n println!(\"world\");\n}}\n"
        );
        std::fs::write(root.join(file_name), source).expect("write");
    }

    let built = BM25Index::build_from_directory(root);
    assert!(built.doc_count > 0, "should have indexed chunks");
    built.save(root).expect("save");

    let vectors = crate::core::index_namespace::vectors_dir(root);
    let compressed = vectors.join("bm25_index.bin.zst");
    let uncompressed = vectors.join("bm25_index.bin");
    assert!(compressed.exists(), "should write .bin.zst");
    assert!(!uncompressed.exists(), ".bin should be deleted");

    let reloaded = BM25Index::load(root).expect("load compressed index");
    assert_eq!(reloaded.doc_count, built.doc_count);
    assert_eq!(reloaded.chunks.len(), built.chunks.len());

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1273
/// Loading a project that only has a bincode-encoded `bm25_index.bin` must
/// succeed, create `bm25_index.bin.zst`, and remove the old `.bin` file.
#[test]
fn auto_migrate_bin_to_zst() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_home = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_home.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
    let project = tempdir().expect("tempdir");
    let root = project.path();

    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
    let built = BM25Index::build_from_directory(root);

    // Seed the uncompressed file by hand instead of going through `save`.
    let vectors = crate::core::index_namespace::vectors_dir(root);
    std::fs::create_dir_all(&vectors).expect("mkdir");
    let legacy_bin = vectors.join("bm25_index.bin");
    let encoded =
        bincode::serde::encode_to_vec(&built, bincode::config::standard()).expect("encode");
    std::fs::write(&legacy_bin, &encoded).expect("write bin");

    let migrated = BM25Index::load(root).expect("load should auto-migrate");
    assert_eq!(migrated.doc_count, built.doc_count);
    assert!(
        vectors.join("bm25_index.bin.zst").exists(),
        ".bin.zst should be created"
    );
    assert!(!legacy_bin.exists(), ".bin should be removed");

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1306
/// Files under `vendor/` and `dist/` must be left out of the listing while a
/// regular top-level source file is kept.
#[test]
fn list_code_files_skips_default_vendor_ignores() {
    let project = tempdir().expect("tempdir");
    let root = project.path();

    // One ordinary source file plus one file in each ignored directory.
    std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
    let vendor_dir = root.join("vendor/lib");
    std::fs::create_dir_all(&vendor_dir).expect("mkdir vendor");
    std::fs::write(vendor_dir.join("dep.rs"), "pub fn dep() {}\n").expect("write vendor");
    let dist_dir = root.join("dist");
    std::fs::create_dir_all(&dist_dir).expect("mkdir dist");
    std::fs::write(dist_dir.join("bundle.js"), "function x() {}").expect("write dist");

    let listed = list_code_files(root);
    let any_under = |prefix: &str| listed.iter().any(|f| f.starts_with(prefix));

    assert!(
        listed.iter().any(|f| f == "main.rs"),
        "main.rs should be included"
    );
    assert!(
        !any_under("vendor/"),
        "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
    );
    assert!(
        !any_under("dist/"),
        "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
    );
}
1332
/// The listing never returns more than `MAX_BM25_FILES` entries.
///
/// Only the upper bound is checked, using a small ten-file fixture; the test
/// does not create enough files to actually hit the cap.
#[test]
fn list_code_files_respects_max_files_cap() {
    let project = tempdir().expect("tempdir");
    let root = project.path();

    for i in 0..10 {
        let file_name = format!("f{i}.rs");
        let source = format!("pub fn f{i}() {{}}\n");
        std::fs::write(root.join(file_name), source).expect("write");
    }

    let listed = list_code_files(root);
    assert!(
        listed.len() <= MAX_BM25_FILES,
        "file count should not exceed MAX_BM25_FILES"
    );
}
1353
/// `LEAN_CTX_BM25_MAX_CACHE_MB` is read as a megabyte count and converted to
/// bytes.
#[test]
fn max_bm25_cache_bytes_reads_env() {
    let _env = crate::core::data_dir::test_env_lock();
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");

    // 64 MB expressed in bytes.
    assert_eq!(max_bm25_cache_bytes(), 64 * 1024 * 1024);

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
}
1362}