1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the number of files `list_code_files` returns for one root.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning (it still saves).
const CHUNK_COUNT_WARNING: usize = 50_000;
/// Compression level passed to `zstd::encode_all` when persisting the index.
const ZSTD_LEVEL: i32 = 9;

/// Glob patterns that are always excluded from indexing.
///
/// `list_code_files` compiles these with `glob::Pattern` and matches them
/// against each file's path relative to the scan root, in addition to the
/// user's `extra_ignore_patterns`.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
27
28fn max_bm25_cache_bytes() -> u64 {
29 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30 .ok()
31 .and_then(|v| v.parse::<u64>().ok())
32 .unwrap_or_else(|| {
33 let cfg = crate::core::config::Config::load();
34 let profile = crate::core::config::MemoryProfile::effective(&cfg);
35 let profile_mb = profile.bm25_max_cache_mb();
36 if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37 profile_mb
38 } else {
39 cfg.bm25_max_cache_mb
40 }
41 });
42 mb * 1024 * 1024
43}
44
/// One indexed unit of text: a symbol-sized slice of a file (or an ingested
/// external document) together with its location metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the originating file, relative to the scan root for files
    /// discovered by `list_code_files`.
    pub file_path: String,
    /// Name of the symbol this chunk was cut around, or a synthetic name for
    /// whole-file / content-defined chunks.
    pub symbol_name: String,
    /// Coarse classification of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk (1-based as produced by `extract_chunks`).
    pub start_line: usize,
    /// Last line of the chunk (1-based as produced by `extract_chunks`).
    pub end_line: usize,
    /// The chunk text that is tokenized and shown in result snippets.
    pub content: String,
    /// Optional token list; `BM25Index::add_chunk` always stores it empty, and
    /// it defaults to empty when absent from serialized data.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens; `BM25Index::add_chunk` overwrites this with the token
    /// count of the enriched chunk text.
    pub token_count: usize,
}
57
/// Classification of a [`CodeChunk`].
///
/// NOTE(review): this enum is serialized as part of the persisted index, so
/// reordering or removing variants presumably invalidates existing index
/// files — confirm before changing the variant list.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    /// A free function (any language recognised by `detect_symbol`).
    Function,
    /// A struct; `detect_symbol` also uses this for enums, traits and interfaces.
    Struct,
    /// A Rust `impl` block.
    Impl,
    /// A whole-file or content-defined chunk with no specific symbol.
    Module,
    /// A class declaration.
    Class,
    Method,
    Other,
    // The remaining variants are not produced anywhere in this file;
    // presumably they label externally ingested content — confirm against the
    // `ContentChunk` conversion.
    Issue,
    PullRequest,
    WikiPage,
    DbSchema,
    ApiEndpoint,
    Ticket,
    ExternalOther,
}
76
/// Cheap change-detection fingerprint of a file at indexing time.
///
/// Two states compare equal when both the modification time and the size match.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
82
83impl IndexedFileState {
84 fn from_path(path: &Path) -> Option<Self> {
85 let meta = path.metadata().ok()?;
86 let size_bytes = meta.len();
87 let mtime_ms = meta
88 .modified()
89 .ok()
90 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
91 .map(|d| d.as_millis() as u64)?;
92 Some(Self {
93 mtime_ms,
94 size_bytes,
95 })
96 }
97}
98
/// In-memory BM25 index over [`CodeChunk`]s, serializable for on-disk caching.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; postings refer to chunks by their position here.
    pub chunks: Vec<CodeChunk>,
    /// Lower-cased term → postings `(chunk index, weight)`. `add_chunk` pushes
    /// one posting with weight `1.0` per token occurrence.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, computed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks as of the last `finalize`.
    pub doc_count: usize,
    /// Lower-cased term → number of distinct chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Root-relative file path → fingerprint at indexing time. Defaults to
    /// empty when missing from serialized data.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
109
/// One hit returned by [`BM25Index::search`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score over all query terms (higher is better).
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// The first five lines of the chunk's content.
    pub snippet: String,
}
121
/// BM25 `k1` parameter used in the scoring formula of `BM25Index::search`.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter: how strongly the length-normalisation term is weighted
/// in `BM25Index::search`.
const BM25_B: f64 = 0.75;
124
125impl Default for BM25Index {
126 fn default() -> Self {
127 Self::new()
128 }
129}
130
131impl BM25Index {
132 pub fn new() -> Self {
133 Self {
134 chunks: Vec::new(),
135 inverted: HashMap::new(),
136 avg_doc_len: 0.0,
137 doc_count: 0,
138 doc_freqs: HashMap::new(),
139 files: HashMap::new(),
140 }
141 }
142
143 pub fn memory_usage_bytes(&self) -> usize {
145 let chunks_size: usize = self
146 .chunks
147 .iter()
148 .map(|c| {
149 c.content.len()
150 + c.file_path.len()
151 + c.symbol_name.len()
152 + c.tokens.iter().map(String::len).sum::<usize>()
153 + 64
154 })
155 .sum();
156 let inverted_size: usize = self
157 .inverted
158 .iter()
159 .map(|(k, v)| k.len() + v.len() * 16 + 32)
160 .sum();
161 let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
162 let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
163 chunks_size + inverted_size + files_size + freqs_size
164 }
165
166 pub fn unload(&mut self) {
168 let usage = self.memory_usage_bytes();
169 self.chunks = Vec::new();
170 self.inverted = HashMap::new();
171 self.doc_freqs = HashMap::new();
172 self.files = HashMap::new();
173 self.avg_doc_len = 0.0;
174 self.doc_count = 0;
175 tracing::info!(
176 "[bm25] unloaded index, freed ~{:.1}MB",
177 usage as f64 / 1_048_576.0
178 );
179 }
180
181 #[cfg(test)]
183 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
184 let mut index = Self::new();
185 for mut chunk in chunks {
186 if chunk.token_count == 0 {
187 chunk.token_count = tokenize(&chunk.content).len();
188 }
189 index.add_chunk(chunk);
190 }
191 index.finalize();
192 index
193 }
194
195 pub fn build_from_directory(root: &Path) -> Self {
196 let root_str = root.to_string_lossy();
197 if !super::graph_index::is_safe_scan_root_public(&root_str) {
198 tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
199 return Self::new();
200 }
201 let mut index = Self::new();
202 let files = list_code_files(root);
203 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
204
205 for (i, rel) in files.iter().enumerate() {
206 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
207 tracing::warn!(
208 "[bm25: stopping build at file {i}/{} due to memory pressure]",
209 files.len()
210 );
211 break;
212 }
213 if crate::core::memory_guard::abort_requested() {
214 tracing::warn!("[bm25: aborting build due to critical memory pressure]");
215 break;
216 }
217
218 let abs = root.join(rel);
219 let Some(state) = IndexedFileState::from_path(&abs) else {
220 continue;
221 };
222 if state.size_bytes > MAX_FILE_SIZE_BYTES {
223 continue;
224 }
225 if let Ok(content) = std::fs::read_to_string(&abs) {
226 let mut chunks = extract_chunks(rel, &content);
227 chunks.sort_by(|a, b| {
228 a.start_line
229 .cmp(&b.start_line)
230 .then_with(|| a.end_line.cmp(&b.end_line))
231 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
232 });
233 for chunk in chunks {
234 index.add_chunk(chunk);
235 }
236 index.files.insert(rel.clone(), state);
237 }
238 }
239
240 index.finalize();
241 index
242 }
243
244 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
245 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
246 for c in &prev.chunks {
247 old_by_file
248 .entry(c.file_path.clone())
249 .or_default()
250 .push(c.clone());
251 }
252 for v in old_by_file.values_mut() {
253 v.sort_by(|a, b| {
254 a.start_line
255 .cmp(&b.start_line)
256 .then_with(|| a.end_line.cmp(&b.end_line))
257 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
258 });
259 }
260
261 let mut index = Self::new();
262 let files = list_code_files(root);
263 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
264
265 for (i, rel) in files.iter().enumerate() {
266 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
267 tracing::warn!(
268 "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
269 files.len()
270 );
271 break;
272 }
273
274 let abs = root.join(rel);
275 let Some(state) = IndexedFileState::from_path(&abs) else {
276 continue;
277 };
278
279 let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
280 if unchanged {
281 if let Some(chunks) = old_by_file.get(rel) {
282 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
283 for chunk in chunks {
284 index.add_chunk(chunk.clone());
285 }
286 index.files.insert(rel.clone(), state);
287 continue;
288 }
289 }
290 }
291
292 if state.size_bytes > MAX_FILE_SIZE_BYTES {
293 continue;
294 }
295 if let Ok(content) = std::fs::read_to_string(&abs) {
296 let mut chunks = extract_chunks(rel, &content);
297 chunks.sort_by(|a, b| {
298 a.start_line
299 .cmp(&b.start_line)
300 .then_with(|| a.end_line.cmp(&b.end_line))
301 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
302 });
303 for chunk in chunks {
304 index.add_chunk(chunk);
305 }
306 index.files.insert(rel.clone(), state);
307 }
308 }
309
310 index.finalize();
311 index
312 }
313
314 fn add_chunk(&mut self, chunk: CodeChunk) {
315 let idx = self.chunks.len();
316
317 let enriched = enrich_for_bm25(&chunk);
318 let tokens = tokenize(&enriched);
319 for token in &tokens {
320 let lower = token.to_lowercase();
321 let postings = self.inverted.entry(lower.clone()).or_default();
322 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
323 *self.doc_freqs.entry(lower).or_insert(0) += 1;
324 }
325 postings.push((idx, 1.0));
326 }
327
328 self.chunks.push(CodeChunk {
329 token_count: tokens.len(),
330 tokens: Vec::new(),
331 ..chunk
332 });
333 }
334
335 fn finalize(&mut self) {
336 self.doc_count = self.chunks.len();
337 if self.doc_count == 0 {
338 return;
339 }
340
341 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
342 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
343 }
344
345 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
346 let query_tokens = tokenize(query);
347 if query_tokens.is_empty() || self.doc_count == 0 {
348 return Vec::new();
349 }
350
351 let n = self.chunks.len();
354 let mut scores = vec![0.0f64; n];
355 let mut touched = Vec::with_capacity(n.min(256));
356
357 for token in &query_tokens {
358 let lower = token.to_lowercase();
359 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
360 if df == 0.0 {
361 continue;
362 }
363
364 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
365
366 if let Some(postings) = self.inverted.get(&lower) {
367 for &(idx, weight) in postings {
368 let doc_len = self.chunks[idx].token_count as f64;
369 let norm_len = doc_len / self.avg_doc_len.max(1.0);
370 let bm25 = idf * (weight * (BM25_K1 + 1.0))
371 / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
372
373 if scores[idx] == 0.0 {
374 touched.push(idx);
375 }
376 scores[idx] += bm25;
377 }
378 }
379 }
380
381 let mut results: Vec<SearchResult> = touched
382 .iter()
383 .filter(|&&idx| scores[idx] > 0.0)
384 .map(|&idx| {
385 let chunk = &self.chunks[idx];
386 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
387 SearchResult {
388 chunk_idx: idx,
389 score: scores[idx],
390 file_path: chunk.file_path.clone(),
391 symbol_name: chunk.symbol_name.clone(),
392 kind: chunk.kind.clone(),
393 start_line: chunk.start_line,
394 end_line: chunk.end_line,
395 snippet,
396 }
397 })
398 .collect();
399
400 results.sort_by(|a, b| {
401 b.score
402 .partial_cmp(&a.score)
403 .unwrap_or(std::cmp::Ordering::Equal)
404 .then_with(|| a.file_path.cmp(&b.file_path))
405 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
406 .then_with(|| a.start_line.cmp(&b.start_line))
407 .then_with(|| a.end_line.cmp(&b.end_line))
408 });
409 results.truncate(top_k);
410 results
411 }
412
413 pub fn save(&self, root: &Path) -> std::io::Result<()> {
414 if self.chunks.len() > CHUNK_COUNT_WARNING {
415 tracing::warn!(
416 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
417 self.chunks.len(),
418 CHUNK_COUNT_WARNING
419 );
420 }
421
422 let dir = index_dir(root);
423 std::fs::create_dir_all(&dir)?;
424 let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
425 .map_err(|e| std::io::Error::other(e.to_string()))?;
426
427 let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
428 .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
429
430 let max_bytes = max_bm25_cache_bytes();
431 if compressed.len() as u64 > max_bytes {
432 tracing::warn!(
433 "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
434 compressed.len() as f64 / 1_048_576.0,
435 max_bytes / (1024 * 1024),
436 dir.display()
437 );
438 return Ok(());
439 }
440
441 tracing::info!(
442 "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
443 data.len() as f64 / 1_048_576.0,
444 compressed.len() as f64 / 1_048_576.0,
445 (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
446 );
447
448 let target = dir.join("bm25_index.bin.zst");
449 let tmp = dir.join("bm25_index.bin.zst.tmp");
450 std::fs::write(&tmp, &compressed)?;
451 std::fs::rename(&tmp, &target)?;
452
453 let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
454 let _ = std::fs::remove_file(dir.join("bm25_index.json"));
455
456 let _ = std::fs::write(
457 dir.join("project_root.txt"),
458 root.to_string_lossy().as_bytes(),
459 );
460
461 Ok(())
462 }
463
464 pub fn load(root: &Path) -> Option<Self> {
465 let dir = index_dir(root);
466 let max_bytes = max_bm25_cache_bytes();
467
468 let zst_path = dir.join("bm25_index.bin.zst");
469 if zst_path.exists() {
470 let meta = std::fs::metadata(&zst_path).ok()?;
471 if meta.len() > max_bytes {
472 tracing::warn!(
473 "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
474 meta.len() as f64 / 1_073_741_824.0,
475 max_bytes / (1024 * 1024),
476 zst_path.display()
477 );
478 let quarantined = zst_path.with_extension("zst.quarantined");
479 let _ = std::fs::rename(&zst_path, &quarantined);
480 return None;
481 }
482 let compressed = std::fs::read(&zst_path).ok()?;
483 let max_decompressed = max_bytes * 20; let data = bounded_zstd_decode(&compressed, max_decompressed)?;
485 let (idx, _): (Self, _) =
486 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
487 return Some(idx);
488 }
489
490 let bin_path = dir.join("bm25_index.bin");
491 if bin_path.exists() {
492 let meta = std::fs::metadata(&bin_path).ok()?;
493 if meta.len() > max_bytes {
494 tracing::warn!(
495 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
496 meta.len() as f64 / 1_073_741_824.0,
497 max_bytes / (1024 * 1024),
498 bin_path.display()
499 );
500 let quarantined = bin_path.with_extension("bin.quarantined");
501 let _ = std::fs::rename(&bin_path, &quarantined);
502 return None;
503 }
504 let data = std::fs::read(&bin_path).ok()?;
505 let (idx, _): (Self, _) =
506 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
507 if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
509 let zst_tmp = zst_path.with_extension("zst.tmp");
510 if std::fs::write(&zst_tmp, &compressed).is_ok()
511 && std::fs::rename(&zst_tmp, &zst_path).is_ok()
512 {
513 tracing::info!(
514 "[bm25] migrated {:.1} MB → {:.1} MB zstd",
515 data.len() as f64 / 1_048_576.0,
516 compressed.len() as f64 / 1_048_576.0
517 );
518 let _ = std::fs::remove_file(&bin_path);
519 }
520 }
521 return Some(idx);
522 }
523
524 let json_path = dir.join("bm25_index.json");
525 if json_path.exists() {
526 let meta = std::fs::metadata(&json_path).ok()?;
527 if meta.len() > max_bytes {
528 tracing::warn!(
529 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
530 meta.len() as f64 / 1_073_741_824.0,
531 max_bytes / (1024 * 1024),
532 json_path.display()
533 );
534 let quarantined = json_path.with_extension("json.quarantined");
535 let _ = std::fs::rename(&json_path, &quarantined);
536 return None;
537 }
538 let data = std::fs::read_to_string(&json_path).ok()?;
539 return serde_json::from_str(&data).ok();
540 }
541
542 None
543 }
544
545 pub fn load_or_build(root: &Path) -> Self {
546 if !is_safe_bm25_root(root) {
547 return Self::default();
548 }
549 if let Some(idx) = Self::load(root) {
550 if !bm25_index_looks_stale(&idx, root) {
551 return idx;
552 }
553 tracing::debug!(
554 "[bm25_index: stale index detected for {}; rebuilding]",
555 root.display()
556 );
557 let rebuilt = if idx.files.is_empty() {
558 Self::build_from_directory(root)
559 } else {
560 Self::rebuild_incremental(root, &idx)
561 };
562 let _ = rebuilt.save(root);
563 return rebuilt;
564 }
565
566 let built = Self::build_from_directory(root);
567 let _ = built.save(root);
568 built
569 }
570
571 pub fn index_file_path(root: &Path) -> PathBuf {
572 let dir = index_dir(root);
573 let zst = dir.join("bm25_index.bin.zst");
574 if zst.exists() {
575 return zst;
576 }
577 let bin = dir.join("bm25_index.bin");
578 if bin.exists() {
579 return bin;
580 }
581 dir.join("bm25_index.json")
582 }
583
584 pub fn ingest_content_chunks(
588 &mut self,
589 chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
590 ) -> usize {
591 let mut count = 0usize;
592 for cc in chunks {
593 self.add_chunk(cc.into());
594 count += 1;
595 }
596 if count > 0 {
597 self.finalize();
598 }
599 count
600 }
601
602 pub fn external_chunk_count(&self) -> usize {
604 self.chunks
605 .iter()
606 .filter(|c| c.file_path.contains("://"))
607 .count()
608 }
609}
610
611fn is_safe_bm25_root(root: &Path) -> bool {
612 super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
613}
614
615fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
616 if index.chunks.is_empty() {
617 return false;
618 }
619
620 if index.files.is_empty() {
621 let mut seen = std::collections::HashSet::<&str>::new();
623 for chunk in &index.chunks {
624 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
625 if rel.is_empty() {
626 continue;
627 }
628 if !seen.insert(rel) {
629 continue;
630 }
631 if !root.join(rel).exists() {
632 return true;
633 }
634 }
635 return false;
636 }
637
638 for (rel, old_state) in &index.files {
640 let abs = root.join(rel);
641 if !abs.exists() {
642 return true;
643 }
644 let Some(cur) = IndexedFileState::from_path(&abs) else {
645 return true;
646 };
647 if &cur != old_state {
648 return true;
649 }
650 }
651
652 for rel in list_code_files(root) {
654 if !index.files.contains_key(&rel) {
655 return true;
656 }
657 }
658
659 false
660}
661
662fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
663 use std::io::Read;
664 let mut decoder = zstd::Decoder::new(compressed).ok()?;
665 let mut buf = Vec::new();
666 let mut chunk = vec![0u8; 65536];
667 let mut total = 0u64;
668 loop {
669 let n = decoder.read(&mut chunk).ok()?;
670 if n == 0 {
671 break;
672 }
673 total += n as u64;
674 if total > max_bytes {
675 tracing::warn!(
676 "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
677 total as f64 / (1024.0 * 1024.0),
678 max_bytes as f64 / (1024.0 * 1024.0)
679 );
680 return None;
681 }
682 buf.extend_from_slice(&chunk[..n]);
683 }
684 Some(buf)
685}
686
687fn index_dir(root: &Path) -> PathBuf {
688 crate::core::index_namespace::vectors_dir(root)
689}
690
691fn list_code_files(root: &Path) -> Vec<String> {
692 let walker = ignore::WalkBuilder::new(root)
693 .hidden(true)
694 .git_ignore(true)
695 .git_global(true)
696 .git_exclude(true)
697 .max_depth(Some(20))
698 .build();
699
700 let cfg = crate::core::config::Config::load();
701 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
702 .iter()
703 .filter_map(|p| glob::Pattern::new(p).ok())
704 .collect();
705 ignore_patterns.extend(
706 cfg.extra_ignore_patterns
707 .iter()
708 .filter_map(|p| glob::Pattern::new(p).ok()),
709 );
710
711 let mut files: Vec<String> = Vec::new();
712 for entry in walker.flatten() {
713 let path = entry.path();
714 if !path.is_file() {
715 continue;
716 }
717 if !is_code_file(path) {
718 continue;
719 }
720 let rel = path
721 .strip_prefix(root)
722 .unwrap_or(path)
723 .to_string_lossy()
724 .to_string();
725 if rel.is_empty() {
726 continue;
727 }
728 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
729 continue;
730 }
731 if files.len() >= MAX_BM25_FILES {
732 tracing::warn!(
733 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
734 root.display()
735 );
736 break;
737 }
738 files.push(rel);
739 }
740
741 files.sort();
742 files.dedup();
743 files
744}
745
/// Returns whether `path` has a file extension that is indexed as source code.
///
/// The comparison is case-insensitive. Paths without an extension, or with an
/// extension that is not valid UTF-8, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(raw_ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = raw_ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
782
783fn tokenize(text: &str) -> Vec<String> {
784 let mut tokens = Vec::new();
785 let mut current = String::new();
786
787 for ch in text.chars() {
788 if ch.is_alphanumeric() || ch == '_' {
789 current.push(ch);
790 } else {
791 if current.len() >= 2 {
792 tokens.push(current.clone());
793 }
794 current.clear();
795 }
796 }
797 if current.len() >= 2 {
798 tokens.push(current);
799 }
800
801 split_camel_case_tokens(&tokens)
802}
803
804pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
805 tokenize(text)
806}
807
/// Expands tokens with their camel-case parts.
///
/// Every input token is kept as is and followed by its parts, if it has any.
/// A new part starts at an uppercase character that either
/// * follows a non-uppercase character (`parseHTTP` → `parse`, `HTTP`), or
/// * follows an uppercase character and precedes a lowercase one, i.e. it is
///   the last letter of an acronym that begins the next word
///   (`HTTPServer` → `HTTP`, `Server`).
///
/// A run of uppercase letters is therefore kept together instead of being cut
/// before its final letter. Parts shorter than two bytes are dropped, and
/// tokens without any boundary contribute only themselves.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());

    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let follows_non_upper = !chars[i - 1].is_uppercase();
            let ends_acronym = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if follows_non_upper || ends_acronym {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Emit the tail only if the token was actually split somewhere.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }

    result
}
832
833fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
834 #[cfg(feature = "tree-sitter")]
835 {
836 let ext = std::path::Path::new(file_path)
837 .extension()
838 .and_then(|e| e.to_str())
839 .unwrap_or("");
840 if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
841 return chunks;
842 }
843 }
844
845 let lines: Vec<&str> = content.lines().collect();
846 if lines.is_empty() {
847 return Vec::new();
848 }
849
850 let mut chunks = Vec::new();
851 let mut i = 0;
852
853 while i < lines.len() {
854 let trimmed = lines[i].trim();
855
856 if let Some((name, kind)) = detect_symbol(trimmed) {
857 let start = i;
858 let end = find_block_end(&lines, i);
859 let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
860 let token_count = tokenize(&block).len();
861
862 chunks.push(CodeChunk {
863 file_path: file_path.to_string(),
864 symbol_name: name,
865 kind,
866 start_line: start + 1,
867 end_line: end + 1,
868 content: block,
869 tokens: Vec::new(),
870 token_count,
871 });
872
873 i = end + 1;
874 } else {
875 i += 1;
876 }
877 }
878
879 if chunks.is_empty() && !content.is_empty() {
880 let bytes = content.as_bytes();
885 let rk_chunks = crate::core::rabin_karp::chunk(content);
886 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
887 for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
888 let end = (c.offset + c.length).min(bytes.len());
889 let slice = &bytes[c.offset..end];
890 let chunk_text = String::from_utf8_lossy(slice).into_owned();
891 let token_count = tokenize(&chunk_text).len();
892 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
893 let end_line = start_line + bytecount::count(slice, b'\n');
894 chunks.push(CodeChunk {
895 file_path: file_path.to_string(),
896 symbol_name: format!("{file_path}#chunk-{idx}"),
897 kind: ChunkKind::Module,
898 start_line,
899 end_line: end_line.max(start_line),
900 content: chunk_text,
901 tokens: Vec::new(),
902 token_count,
903 });
904 }
905 } else {
906 let token_count = tokenize(content).len();
907 let snippet = lines
908 .iter()
909 .take(50)
910 .copied()
911 .collect::<Vec<_>>()
912 .join("\n");
913 chunks.push(CodeChunk {
914 file_path: file_path.to_string(),
915 symbol_name: file_path.to_string(),
916 kind: ChunkKind::Module,
917 start_line: 1,
918 end_line: lines.len(),
919 content: snippet,
920 tokens: Vec::new(),
921 token_count,
922 });
923 }
924 }
925
926 chunks
927}
928
929fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
930 let trimmed = line.trim();
931
932 let patterns: &[(&str, ChunkKind)] = &[
933 ("pub async fn ", ChunkKind::Function),
934 ("async fn ", ChunkKind::Function),
935 ("pub fn ", ChunkKind::Function),
936 ("fn ", ChunkKind::Function),
937 ("pub struct ", ChunkKind::Struct),
938 ("struct ", ChunkKind::Struct),
939 ("pub enum ", ChunkKind::Struct),
940 ("enum ", ChunkKind::Struct),
941 ("impl ", ChunkKind::Impl),
942 ("pub trait ", ChunkKind::Struct),
943 ("trait ", ChunkKind::Struct),
944 ("export function ", ChunkKind::Function),
945 ("export async function ", ChunkKind::Function),
946 ("export default function ", ChunkKind::Function),
947 ("function ", ChunkKind::Function),
948 ("async function ", ChunkKind::Function),
949 ("export class ", ChunkKind::Class),
950 ("class ", ChunkKind::Class),
951 ("export interface ", ChunkKind::Struct),
952 ("interface ", ChunkKind::Struct),
953 ("def ", ChunkKind::Function),
954 ("async def ", ChunkKind::Function),
955 ("class ", ChunkKind::Class),
956 ("func ", ChunkKind::Function),
957 ];
958
959 for (prefix, kind) in patterns {
960 if let Some(rest) = trimmed.strip_prefix(prefix) {
961 let name: String = rest
962 .chars()
963 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
964 .take_while(|c| *c != '<')
965 .collect();
966 if !name.is_empty() {
967 return Some((name, kind.clone()));
968 }
969 }
970 }
971
972 None
973}
974
/// Finds the index of the last line of the block that starts at `lines[start]`.
///
/// * Brace-delimited blocks: braces are counted from `start`, and the line on
///   which the first opened brace is closed again is returned. Parentheses are
///   ignored, so a parameter list does not end the block at its own signature.
/// * Body-less items: if a line ends with `;` before any brace was opened
///   (e.g. `struct Foo(u32);`), that line is the end.
/// * Brace-less blocks (indentation-based code): from the fourth line on, the
///   block ends before the first blank line or the first line that does not
///   start with a space or tab.
///
/// If none of these applies before the input ends, the block is capped at 50
/// lines after `start`, or at the last line, whichever is smaller. Braces in
/// strings and comments are not recognised as such and are counted too.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut depth = 0usize;
    let mut found_open = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    depth += 1;
                    found_open = true;
                }
                '}' if depth > 0 => {
                    depth -= 1;
                    if depth == 0 {
                        return i;
                    }
                }
                _ => {}
            }
        }

        if found_open {
            continue;
        }

        // Declaration terminated without ever opening a body.
        if line.trim_end().ends_with(';') {
            return i;
        }

        // Indentation heuristic. It must look at the raw line: a trimmed line
        // never starts with whitespace, which would end every block here.
        if i > start + 2 && (line.trim().is_empty() || !line.starts_with([' ', '\t'])) {
            return i - 1;
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
1012
1013pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
1014 if results.is_empty() {
1015 return "No results found.".to_string();
1016 }
1017
1018 let mut out = String::new();
1019 for (i, r) in results.iter().enumerate() {
1020 let is_external = r.file_path.contains("://");
1021 if compact {
1022 if is_external {
1023 out.push_str(&format!(
1024 "{}. {:.2} [{:?}] {} — {}\n",
1025 i + 1,
1026 r.score,
1027 r.kind,
1028 r.file_path,
1029 r.symbol_name,
1030 ));
1031 } else {
1032 out.push_str(&format!(
1033 "{}. {:.2} {}:{}-{} {:?} {}\n",
1034 i + 1,
1035 r.score,
1036 r.file_path,
1037 r.start_line,
1038 r.end_line,
1039 r.kind,
1040 r.symbol_name,
1041 ));
1042 }
1043 } else if is_external {
1044 out.push_str(&format!(
1045 "\n--- Result {} (score: {:.2}) [{:?}] ---\n{} — {}\n{}\n",
1046 i + 1,
1047 r.score,
1048 r.kind,
1049 r.file_path,
1050 r.symbol_name,
1051 r.snippet,
1052 ));
1053 } else {
1054 out.push_str(&format!(
1055 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
1056 i + 1,
1057 r.score,
1058 r.file_path,
1059 r.symbol_name,
1060 r.kind,
1061 r.start_line,
1062 r.end_line,
1063 r.snippet,
1064 ));
1065 }
1066 }
1067 out
1068}
1069
1070fn enrich_for_bm25(chunk: &CodeChunk) -> String {
1077 let path = Path::new(&chunk.file_path);
1078 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1079 let dir = path
1080 .parent()
1081 .and_then(|p| p.file_name())
1082 .and_then(|d| d.to_str())
1083 .unwrap_or("");
1084
1085 if stem.is_empty() {
1086 return chunk.content.clone();
1087 }
1088
1089 format!("{} {} {} {}", chunk.content, stem, stem, dir)
1090}
1091
1092#[cfg(test)]
1093mod tests {
1094 use super::*;
1095 use tempfile::tempdir;
1096
1097 #[cfg(unix)]
1098 use std::os::unix::fs::PermissionsExt;
1099
1100 #[test]
1101 fn tokenize_splits_code() {
1102 let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
1103 assert!(tokens.contains(&"calculate_total".to_string()));
1104 assert!(tokens.contains(&"items".to_string()));
1105 assert!(tokens.contains(&"Vec".to_string()));
1106 }
1107
1108 #[test]
1109 fn camel_case_splitting() {
1110 let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
1111 assert!(tokens.contains(&"calculateTotal".to_string()));
1112 assert!(tokens.contains(&"calculate".to_string()));
1113 assert!(tokens.contains(&"Total".to_string()));
1114 }
1115
1116 #[test]
1117 fn detect_rust_function() {
1118 let (name, kind) =
1119 detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
1120 assert_eq!(name, "process_request");
1121 assert_eq!(kind, ChunkKind::Function);
1122 }
1123
    /// The chunk that mentions the query terms ranks first.
    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        // `tokens` and `token_count` given here are replaced by `add_chunk`,
        // which re-tokenizes the enriched content.
        index.add_chunk(CodeChunk {
            file_path: "auth.rs".into(),
            symbol_name: "validate_token".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
            token_count: 8,
        });
        index.add_chunk(CodeChunk {
            file_path: "db.rs".into(),
            symbol_name: "connect_database".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
            tokens: tokenize("fn connect_database url str Pool create_pool url"),
            token_count: 7,
        });
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }
1153
    /// Equal scores are ordered by file path, independent of insertion order.
    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();

        // Insert "b.rs" before "a.rs" so insertion order differs from the
        // expected result order.
        index.add_chunk(CodeChunk {
            file_path: "b.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.add_chunk(CodeChunk {
            file_path: "a.rs".into(),
            symbol_name: "same".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            content: "fn same() {}".into(),
            tokens: tokenize("same token"),
            token_count: 2,
        });
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }
1186
    /// Deleting an indexed file makes a previously fresh index stale.
    #[test]
    fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write a.rs");

        // Freshly built: must not be reported stale.
        let idx = BM25Index::build_from_directory(root);
        assert!(!bm25_index_looks_stale(&idx, root));

        std::fs::remove_file(root.join("a.rs")).expect("remove a.rs");
        assert!(bm25_index_looks_stale(&idx, root));
    }
1199
    /// An unchanged file is carried over from the previous index without being
    /// read again, while a modified file is re-read and re-chunked.
    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();

        std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        // Remove all permissions from a.rs: if the rebuild tried to read it,
        // the read would presumably fail and a.rs would drop out of the index.
        let a_path = root.join("a.rs");
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        // Change b.rs so that it must be re-read.
        std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B2\"); }\n")
            .expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .any(|c| c.file_path == "b.rs" && c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");

        // Restore permissions (best-effort) after the assertions.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, perms);
    }
1241
/// With the cache limit forced to 0 MB through `LEAN_CTX_BM25_MAX_CACHE_MB`,
/// `BM25Index::load` must return `None` for an existing `bm25_index.json`.
/// Afterwards the original file must be gone and a
/// `bm25_index.json.quarantined` file must exist in the same directory.
#[test]
fn load_quarantines_oversized_index() {
    // Held for the whole test because process-wide environment is mutated.
    let _env = crate::core::data_dir::test_env_lock();
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    let vectors = crate::core::index_namespace::vectors_dir(root);
    std::fs::create_dir_all(&vectors).expect("create vectors dir");
    let index_path = vectors.join("bm25_index.json");
    let quarantined_path = vectors.join("bm25_index.json.quarantined");

    // A zero-megabyte limit makes even this tiny file count as oversized.
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
    std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

    let result = BM25Index::load(root);
    assert!(result.is_none(), "oversized index should return None");
    assert!(
        !index_path.exists(),
        "original index should be removed after quarantine"
    );
    assert!(quarantined_path.exists(), "quarantined file should exist");

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
}
1267
/// With the cache limit forced to 0 MB through `LEAN_CTX_BM25_MAX_CACHE_MB`,
/// `BM25Index::save` must not leave an index file at
/// `BM25Index::index_file_path(root)`.
///
/// Both environment overrides set here are removed before the test returns.
/// Otherwise later tests would inherit a `LEAN_CTX_DATA_DIR` that points at
/// this test's temporary directory, which is deleted when `data_dir` drops.
#[test]
fn save_refuses_oversized_output() {
    // Held for the whole test because process-wide environment is mutated.
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

    let td = tempdir().expect("tempdir");
    let root = td.path();

    let mut index = BM25Index::new();
    index.add_chunk(CodeChunk {
        file_path: "a.rs".into(),
        symbol_name: "a".into(),
        kind: ChunkKind::Function,
        start_line: 1,
        end_line: 1,
        content: "fn a() {}".into(),
        tokens: tokenize("fn a"),
        token_count: 2,
    });
    index.finalize();

    // The result of `save` is discarded: the test only checks that no index
    // file was written.
    let _ = index.save(root);
    let index_path = BM25Index::index_file_path(root);
    assert!(
        !index_path.exists(),
        "save should refuse to persist oversized index"
    );

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1300
/// Saving an index must create a `project_root.txt` file in the vectors
/// directory whose content equals the project root path.
#[test]
fn save_writes_project_root_marker() {
    let _env = crate::core::data_dir::test_env_lock();
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

    // Drop any cache-size override that may still be set in the environment.
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    BM25Index::build_from_directory(root)
        .save(root)
        .expect("save");

    let marker = crate::core::index_namespace::vectors_dir(root).join("project_root.txt");
    assert!(marker.exists(), "project_root.txt marker should exist");

    let content = std::fs::read_to_string(&marker).expect("read marker");
    assert_eq!(content, root.to_string_lossy());
}
1318
/// Round-trips an index through `save` and `load`.
///
/// After saving, `bm25_index.bin.zst` must exist and `bm25_index.bin` must
/// not; the loaded index must report the same document and chunk counts as
/// the one that was saved.
#[test]
fn save_load_roundtrip_uses_zstd() {
    // Held for the whole test because process-wide environment is mutated.
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    // Ten small modules, two functions each, give the index something to hold.
    for i in 0..10 {
        let source = format!(
            "pub fn handler_{i}() {{\n println!(\"hello\");\n}}\n\n\
             pub fn helper_{i}() {{\n println!(\"world\");\n}}\n"
        );
        std::fs::write(root.join(format!("mod{i}.rs")), source).expect("write");
    }

    let index = BM25Index::build_from_directory(root);
    assert!(index.doc_count > 0, "should have indexed chunks");
    index.save(root).expect("save");

    let vectors = crate::core::index_namespace::vectors_dir(root);
    let compressed = vectors.join("bm25_index.bin.zst");
    let uncompressed = vectors.join("bm25_index.bin");
    assert!(compressed.exists(), "should write .bin.zst");
    assert!(!uncompressed.exists(), ".bin should be deleted");

    let loaded = BM25Index::load(root).expect("load compressed index");
    assert_eq!(loaded.doc_count, index.doc_count);
    assert_eq!(loaded.chunks.len(), index.chunks.len());

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1358
/// Loading from a directory that only holds a bincode-encoded
/// `bm25_index.bin` must succeed, and afterwards `bm25_index.bin.zst` must
/// exist while `bm25_index.bin` must be gone.
#[test]
fn auto_migrate_bin_to_zst() {
    // Held for the whole test because process-wide environment is mutated.
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
    let index = BM25Index::build_from_directory(root);

    let vectors = crate::core::index_namespace::vectors_dir(root);
    std::fs::create_dir_all(&vectors).expect("mkdir");
    let legacy_bin = vectors.join("bm25_index.bin");
    let compressed = vectors.join("bm25_index.bin.zst");

    // Write the index by hand as plain bincode, bypassing `save`.
    let encoded =
        bincode::serde::encode_to_vec(&index, bincode::config::standard()).expect("encode");
    std::fs::write(&legacy_bin, &encoded).expect("write bin");

    let loaded = BM25Index::load(root).expect("load should auto-migrate");
    assert_eq!(loaded.doc_count, index.doc_count);
    assert!(compressed.exists(), ".bin.zst should be created");
    assert!(!legacy_bin.exists(), ".bin should be removed");

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1391
/// `list_code_files` must return the top-level `main.rs` but nothing under
/// `vendor/` or `dist/`, both of which appear in `DEFAULT_BM25_IGNORES`.
#[test]
fn list_code_files_skips_default_vendor_ignores() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    // One file that should be listed...
    std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
    // ...and one file in each of two ignored directories.
    std::fs::create_dir_all(root.join("vendor/lib")).expect("mkdir vendor");
    std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
    std::fs::create_dir_all(root.join("dist")).expect("mkdir dist");
    std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

    let files = list_code_files(root);

    let has_main = files.iter().any(|f| f == "main.rs");
    assert!(has_main, "main.rs should be included");

    let no_vendor = files.iter().all(|f| !f.starts_with("vendor/"));
    assert!(
        no_vendor,
        "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
    );

    let no_dist = files.iter().all(|f| !f.starts_with("dist/"));
    assert!(
        no_dist,
        "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
    );
}
1417
/// The number of files returned by `list_code_files` must never exceed
/// `MAX_BM25_FILES`.
///
/// Only ten files are created here, so this checks the upper bound itself
/// rather than exercising truncation of a larger listing.
#[test]
fn list_code_files_respects_max_files_cap() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    for i in 0..10 {
        let path = root.join(format!("f{i}.rs"));
        let body = format!("pub fn f{i}() {{}}\n");
        std::fs::write(path, body).expect("write");
    }

    let files = list_code_files(root);
    let within_cap = files.len() <= MAX_BM25_FILES;
    assert!(within_cap, "file count should not exceed MAX_BM25_FILES");
}
1438
/// `max_bm25_cache_bytes` must honor `LEAN_CTX_BM25_MAX_CACHE_MB`: a value
/// of `64` has to come back as 64 MiB expressed in bytes.
#[test]
fn max_bm25_cache_bytes_reads_env() {
    const LIMIT_VAR: &str = "LEAN_CTX_BM25_MAX_CACHE_MB";

    // Held for the whole test because process-wide environment is mutated.
    let _env = crate::core::data_dir::test_env_lock();

    std::env::set_var(LIMIT_VAR, "64");
    let bytes = max_bm25_cache_bytes();
    assert_eq!(bytes, 64 * 1024 * 1024);

    std::env::remove_var(LIMIT_VAR);
}
1447}