1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum number of files `list_code_files` collects before it stops walking the tree.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// Compression level handed to `zstd::encode_all` when the index is written.
const ZSTD_LEVEL: i32 = 9;

/// Glob patterns that `list_code_files` always excludes.
///
/// They are matched against root-relative paths and are combined with the
/// `extra_ignore_patterns` read from the configuration.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
27
28fn max_bm25_cache_bytes() -> u64 {
29 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
33 .ok()
34 .and_then(|v| v.parse::<u64>().ok())
35 .unwrap_or_else(|| crate::core::config::Config::load().bm25_max_cache_mb_effective());
36 mb * 1024 * 1024
37}
38
/// Public accessor for the persistence size limit, in bytes.
///
/// Returns exactly what `max_bm25_cache_bytes` returns.
pub fn persist_ceiling_bytes() -> u64 {
    max_bm25_cache_bytes()
}
45
/// Outcome of `BM25Index::save` when no I/O error occurred.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SaveOutcome {
    /// The compressed index was written to disk; `compressed_bytes` is its size.
    Persisted { compressed_bytes: u64 },
    /// The compressed index exceeded the size limit and was not written.
    SkippedTooLarge {
        /// Size of the compressed index that was rejected.
        compressed_bytes: u64,
        /// The limit it was compared against.
        limit_bytes: u64,
    },
}
62
/// One indexed unit of text: a detected symbol, a fallback slice of a file,
/// or an ingested content chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path the chunk came from. File chunks built by this module use the
    /// root-relative path; paths containing `://` are treated as external.
    pub file_path: String,
    /// Name of the symbol, or a synthetic name for fallback chunks.
    pub symbol_name: String,
    /// Category of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk (1-based for chunks produced by `extract_chunks`).
    pub start_line: usize,
    /// Last line of the chunk (1-based for chunks produced by `extract_chunks`).
    pub end_line: usize,
    /// Text of the chunk.
    pub content: String,
    /// Token list; defaults to empty when absent from serialized data, and
    /// `BM25Index::add_chunk` always stores it empty.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens; `BM25Index::add_chunk` overwrites it with the token
    /// count of the enriched content.
    pub token_count: usize,
}
75
/// Category assigned to a `CodeChunk`.
///
/// `detect_symbol` only produces `Function`, `Struct` (also used for enums,
/// traits and interfaces), `Impl` and `Class`; fallback chunks use `Module`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
    // NOTE(review): the variants below look like kinds for externally ingested
    // content (see `ingest_content_chunks`) — confirm against `ContentChunk`.
    Issue,
    PullRequest,
    WikiPage,
    DbSchema,
    ApiEndpoint,
    Ticket,
    ExternalOther,
}
94
/// Filesystem fingerprint of a file at the time it was indexed.
///
/// Two states are compared for equality to decide whether a file changed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Modification time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
100
101impl IndexedFileState {
102 fn from_path(path: &Path) -> Option<Self> {
103 let meta = path.metadata().ok()?;
104 let size_bytes = meta.len();
105 let mtime_ms = meta
106 .modified()
107 .ok()
108 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
109 .map(|d| d.as_millis() as u64)?;
110 Some(Self {
111 mtime_ms,
112 size_bytes,
113 })
114 }
115}
116
/// In-memory BM25 index over `CodeChunk`s, serializable for on-disk caching.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; a chunk's position is its document id.
    pub chunks: Vec<CodeChunk>,
    /// Lowercased token -> postings of `(chunk index, weight)`. `add_chunk`
    /// pushes one posting with weight `1.0` per token occurrence.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` over all chunks, computed by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks, set by `finalize`.
    pub doc_count: usize,
    /// Lowercased token -> number of chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Root-relative file path -> state recorded when the file was indexed.
    /// Defaults to empty when absent from serialized data.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
127
/// One hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Index of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score; higher ranks first.
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// The first five lines of the chunk's content.
    pub snippet: String,
}
139
/// BM25 `k1` parameter used by `BM25Index::search` in the term-weight formula.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter used by `BM25Index::search` to scale length normalization.
const BM25_B: f64 = 0.75;
142
/// The default index is the empty index produced by `BM25Index::new`.
impl Default for BM25Index {
    fn default() -> Self {
        Self::new()
    }
}
148
149impl BM25Index {
150 pub fn new() -> Self {
151 Self {
152 chunks: Vec::new(),
153 inverted: HashMap::new(),
154 avg_doc_len: 0.0,
155 doc_count: 0,
156 doc_freqs: HashMap::new(),
157 files: HashMap::new(),
158 }
159 }
160
161 pub fn memory_usage_bytes(&self) -> usize {
163 let chunks_size: usize = self
164 .chunks
165 .iter()
166 .map(|c| {
167 c.content.len()
168 + c.file_path.len()
169 + c.symbol_name.len()
170 + c.tokens.iter().map(String::len).sum::<usize>()
171 + 64
172 })
173 .sum();
174 let inverted_size: usize = self
175 .inverted
176 .iter()
177 .map(|(k, v)| k.len() + v.len() * 16 + 32)
178 .sum();
179 let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
180 let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
181 chunks_size + inverted_size + files_size + freqs_size
182 }
183
184 pub fn unload(&mut self) {
186 let usage = self.memory_usage_bytes();
187 self.chunks = Vec::new();
188 self.inverted = HashMap::new();
189 self.doc_freqs = HashMap::new();
190 self.files = HashMap::new();
191 self.avg_doc_len = 0.0;
192 self.doc_count = 0;
193 tracing::info!(
194 "[bm25] unloaded index, freed ~{:.1}MB",
195 usage as f64 / 1_048_576.0
196 );
197 }
198
199 #[cfg(test)]
201 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
202 let mut index = Self::new();
203 for mut chunk in chunks {
204 if chunk.token_count == 0 {
205 chunk.token_count = tokenize(&chunk.content).len();
206 }
207 index.add_chunk(chunk);
208 }
209 index.finalize();
210 index
211 }
212
    /// Builds a fresh index by scanning `root`, reading every file from disk
    /// (an empty content hint is passed to the shared implementation).
    pub fn build_from_directory(root: &Path) -> Self {
        Self::build_from_directory_inner(root, &HashMap::new())
    }
216
    /// Builds a fresh index by scanning `root`, taking file contents from
    /// `content_hint` (keyed by root-relative path) when present instead of
    /// reading the file from disk.
    pub fn build_with_content_hint(root: &Path, content_hint: &HashMap<String, String>) -> Self {
        Self::build_from_directory_inner(root, content_hint)
    }
222
223 fn build_from_directory_inner(root: &Path, content_hint: &HashMap<String, String>) -> Self {
224 let root_str = root.to_string_lossy();
225 if !super::graph_index::is_safe_scan_root_public(&root_str) {
226 tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
227 return Self::new();
228 }
229 let mut index = Self::new();
230 let files = list_code_files(root);
231 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
232 let mut cache_hits = 0usize;
233
234 for (i, rel) in files.iter().enumerate() {
235 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
236 tracing::warn!(
237 "[bm25: stopping build at file {i}/{} due to memory pressure]",
238 files.len()
239 );
240 break;
241 }
242 if crate::core::memory_guard::abort_requested() {
243 tracing::warn!("[bm25: aborting build due to critical memory pressure]");
244 break;
245 }
246
247 let abs = root.join(rel);
248 let Some(state) = IndexedFileState::from_path(&abs) else {
249 continue;
250 };
251 if state.size_bytes > MAX_FILE_SIZE_BYTES {
252 continue;
253 }
254
255 let content = if let Some(cached) = content_hint.get(rel) {
256 cache_hits += 1;
257 std::borrow::Cow::Borrowed(cached.as_str())
258 } else {
259 match std::fs::read_to_string(&abs) {
260 Ok(c) => std::borrow::Cow::Owned(c),
261 Err(_) => continue,
262 }
263 };
264
265 let mut chunks = extract_chunks(rel, &content);
266 chunks.sort_by(|a, b| {
267 a.start_line
268 .cmp(&b.start_line)
269 .then_with(|| a.end_line.cmp(&b.end_line))
270 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
271 });
272 for chunk in chunks {
273 index.add_chunk(chunk);
274 }
275 index.files.insert(rel.clone(), state);
276 }
277
278 if cache_hits > 0 {
279 tracing::info!(
280 "[bm25: reused {cache_hits}/{} file contents from graph scan cache]",
281 files.len()
282 );
283 }
284
285 index.finalize();
286 index
287 }
288
289 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
290 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
291 for c in &prev.chunks {
292 old_by_file
293 .entry(c.file_path.clone())
294 .or_default()
295 .push(c.clone());
296 }
297 for v in old_by_file.values_mut() {
298 v.sort_by(|a, b| {
299 a.start_line
300 .cmp(&b.start_line)
301 .then_with(|| a.end_line.cmp(&b.end_line))
302 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
303 });
304 }
305
306 let mut index = Self::new();
307 let files = list_code_files(root);
308 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
309
310 for (i, rel) in files.iter().enumerate() {
311 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
312 tracing::warn!(
313 "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
314 files.len()
315 );
316 break;
317 }
318
319 let abs = root.join(rel);
320 let Some(state) = IndexedFileState::from_path(&abs) else {
321 continue;
322 };
323
324 let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
325 if unchanged {
326 if let Some(chunks) = old_by_file.get(rel) {
327 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
328 for chunk in chunks {
329 index.add_chunk(chunk.clone());
330 }
331 index.files.insert(rel.clone(), state);
332 continue;
333 }
334 }
335 }
336
337 if state.size_bytes > MAX_FILE_SIZE_BYTES {
338 continue;
339 }
340 if let Ok(content) = std::fs::read_to_string(&abs) {
341 let mut chunks = extract_chunks(rel, &content);
342 chunks.sort_by(|a, b| {
343 a.start_line
344 .cmp(&b.start_line)
345 .then_with(|| a.end_line.cmp(&b.end_line))
346 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
347 });
348 for chunk in chunks {
349 index.add_chunk(chunk);
350 }
351 index.files.insert(rel.clone(), state);
352 }
353 }
354
355 index.finalize();
356 index
357 }
358
359 fn add_chunk(&mut self, chunk: CodeChunk) {
360 let idx = self.chunks.len();
361
362 let enriched = enrich_for_bm25(&chunk);
363 let tokens = tokenize(&enriched);
364 for token in &tokens {
365 let lower = token.to_lowercase();
366 let postings = self.inverted.entry(lower.clone()).or_default();
367 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
368 *self.doc_freqs.entry(lower).or_insert(0) += 1;
369 }
370 postings.push((idx, 1.0));
371 }
372
373 self.chunks.push(CodeChunk {
374 token_count: tokens.len(),
375 tokens: Vec::new(),
376 ..chunk
377 });
378 }
379
380 fn finalize(&mut self) {
381 self.doc_count = self.chunks.len();
382 if self.doc_count == 0 {
383 return;
384 }
385
386 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
387 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
388 }
389
390 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
391 let query_tokens = tokenize(query);
392 if query_tokens.is_empty() || self.doc_count == 0 {
393 return Vec::new();
394 }
395
396 let n = self.chunks.len();
399 let mut scores = vec![0.0f64; n];
400 let mut touched = Vec::with_capacity(n.min(256));
401
402 for token in &query_tokens {
403 let lower = token.to_lowercase();
404 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
405 if df == 0.0 {
406 continue;
407 }
408
409 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
410
411 if let Some(postings) = self.inverted.get(&lower) {
412 for &(idx, weight) in postings {
413 let doc_len = self.chunks[idx].token_count as f64;
414 let norm_len = doc_len / self.avg_doc_len.max(1.0);
415 let bm25 = idf * (weight * (BM25_K1 + 1.0))
416 / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
417
418 if scores[idx] == 0.0 {
419 touched.push(idx);
420 }
421 scores[idx] += bm25;
422 }
423 }
424 }
425
426 let mut results: Vec<SearchResult> = touched
427 .iter()
428 .filter(|&&idx| scores[idx] > 0.0)
429 .map(|&idx| {
430 let chunk = &self.chunks[idx];
431 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
432 SearchResult {
433 chunk_idx: idx,
434 score: scores[idx],
435 file_path: chunk.file_path.clone(),
436 symbol_name: chunk.symbol_name.clone(),
437 kind: chunk.kind.clone(),
438 start_line: chunk.start_line,
439 end_line: chunk.end_line,
440 snippet,
441 }
442 })
443 .collect();
444
445 results.sort_by(|a, b| {
446 b.score
447 .partial_cmp(&a.score)
448 .unwrap_or(std::cmp::Ordering::Equal)
449 .then_with(|| a.file_path.cmp(&b.file_path))
450 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
451 .then_with(|| a.start_line.cmp(&b.start_line))
452 .then_with(|| a.end_line.cmp(&b.end_line))
453 });
454 results.truncate(top_k);
455 results
456 }
457
    /// Serializes the index (bincode, then zstd) and writes it into the index
    /// directory for `root`.
    ///
    /// Returns `SaveOutcome::SkippedTooLarge` without writing anything when
    /// the compressed size exceeds `max_bm25_cache_bytes()`, and
    /// `SaveOutcome::Persisted` after a successful write.
    ///
    /// # Errors
    ///
    /// Fails when the directory cannot be created, encoding or compression
    /// fails, or the temporary file cannot be written or renamed.
    pub fn save(&self, root: &Path) -> std::io::Result<SaveOutcome> {
        if self.chunks.len() > CHUNK_COUNT_WARNING {
            tracing::warn!(
                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
                self.chunks.len(),
                CHUNK_COUNT_WARNING
            );
        }

        let dir = index_dir(root);
        std::fs::create_dir_all(&dir)?;
        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
            .map_err(|e| std::io::Error::other(e.to_string()))?;

        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
        let compressed_bytes = compressed.len() as u64;

        // The limit applies to the compressed size, so it can only be checked
        // after compression.
        let max_bytes = max_bm25_cache_bytes();
        if compressed_bytes > max_bytes {
            tracing::warn!(
                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
                compressed_bytes as f64 / 1_048_576.0,
                max_bytes / (1024 * 1024),
                dir.display()
            );
            return Ok(SaveOutcome::SkippedTooLarge {
                compressed_bytes,
                limit_bytes: max_bytes,
            });
        }

        tracing::info!(
            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
            data.len() as f64 / 1_048_576.0,
            compressed_bytes as f64 / 1_048_576.0,
            (1.0 - compressed_bytes as f64 / data.len().max(1) as f64) * 100.0
        );

        // Write to a temporary file and rename it over the target so readers
        // never see a partially written index.
        let target = dir.join("bm25_index.bin.zst");
        let tmp = dir.join("bm25_index.bin.zst.tmp");
        std::fs::write(&tmp, &compressed)?;
        std::fs::rename(&tmp, &target)?;

        // Best effort: remove the older formats that `load` would fall back to.
        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
        let _ = std::fs::remove_file(dir.join("bm25_index.json"));

        // Best effort: record which project root this index directory belongs to.
        let _ = std::fs::write(
            dir.join("project_root.txt"),
            root.to_string_lossy().as_bytes(),
        );

        Ok(SaveOutcome::Persisted { compressed_bytes })
    }
517
518 pub fn load(root: &Path) -> Option<Self> {
519 let dir = index_dir(root);
520 let max_bytes = max_bm25_cache_bytes();
521
522 let zst_path = dir.join("bm25_index.bin.zst");
523 if zst_path.exists() {
524 let meta = std::fs::metadata(&zst_path).ok()?;
525 if meta.len() > max_bytes {
526 tracing::warn!(
527 "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
528 meta.len() as f64 / 1_073_741_824.0,
529 max_bytes / (1024 * 1024),
530 zst_path.display()
531 );
532 let quarantined = zst_path.with_extension("zst.quarantined");
533 let _ = std::fs::rename(&zst_path, &quarantined);
534 return None;
535 }
536 let compressed = std::fs::read(&zst_path).ok()?;
537 let max_decompressed = max_bytes * 20; let data = bounded_zstd_decode(&compressed, max_decompressed)?;
539 let (idx, _): (Self, _) =
540 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
541 return Some(idx);
542 }
543
544 let bin_path = dir.join("bm25_index.bin");
545 if bin_path.exists() {
546 let meta = std::fs::metadata(&bin_path).ok()?;
547 if meta.len() > max_bytes {
548 tracing::warn!(
549 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
550 meta.len() as f64 / 1_073_741_824.0,
551 max_bytes / (1024 * 1024),
552 bin_path.display()
553 );
554 let quarantined = bin_path.with_extension("bin.quarantined");
555 let _ = std::fs::rename(&bin_path, &quarantined);
556 return None;
557 }
558 let data = std::fs::read(&bin_path).ok()?;
559 let (idx, _): (Self, _) =
560 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
561 if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
563 let zst_tmp = zst_path.with_extension("zst.tmp");
564 if std::fs::write(&zst_tmp, &compressed).is_ok()
565 && std::fs::rename(&zst_tmp, &zst_path).is_ok()
566 {
567 tracing::info!(
568 "[bm25] migrated {:.1} MB → {:.1} MB zstd",
569 data.len() as f64 / 1_048_576.0,
570 compressed.len() as f64 / 1_048_576.0
571 );
572 let _ = std::fs::remove_file(&bin_path);
573 }
574 }
575 return Some(idx);
576 }
577
578 let json_path = dir.join("bm25_index.json");
579 if json_path.exists() {
580 let meta = std::fs::metadata(&json_path).ok()?;
581 if meta.len() > max_bytes {
582 tracing::warn!(
583 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
584 meta.len() as f64 / 1_073_741_824.0,
585 max_bytes / (1024 * 1024),
586 json_path.display()
587 );
588 let quarantined = json_path.with_extension("json.quarantined");
589 let _ = std::fs::rename(&json_path, &quarantined);
590 return None;
591 }
592 let data = std::fs::read_to_string(&json_path).ok()?;
593 return serde_json::from_str(&data).ok();
594 }
595
596 None
597 }
598
    /// Loads the cached index for `root` or builds one, using the full
    /// staleness check to decide whether a cached index must be rebuilt.
    pub fn load_or_build(root: &Path) -> Self {
        Self::load_or_build_inner(root, false)
    }
602
    /// Like `load_or_build`, but uses the sampled ("fast") staleness check,
    /// which inspects only a subset of the indexed files.
    pub fn load_or_build_fast(root: &Path) -> Self {
        Self::load_or_build_inner(root, true)
    }
608
609 fn load_or_build_inner(root: &Path, fast_stale: bool) -> Self {
610 if !is_safe_bm25_root(root) {
611 return Self::default();
612 }
613 if let Some(idx) = Self::load(root) {
614 let stale = if fast_stale {
615 bm25_index_looks_stale_fast(&idx, root)
616 } else {
617 bm25_index_looks_stale(&idx, root)
618 };
619 if !stale {
620 return idx;
621 }
622 tracing::debug!(
623 "[bm25_index: stale index detected for {}; rebuilding]",
624 root.display()
625 );
626 let rebuilt = if idx.files.is_empty() {
627 Self::build_from_directory(root)
628 } else {
629 Self::rebuild_incremental(root, &idx)
630 };
631 let _ = rebuilt.save(root);
632 return rebuilt;
633 }
634
635 let built = Self::build_from_directory(root);
636 let _ = built.save(root);
637 built
638 }
639
640 pub fn index_file_path(root: &Path) -> PathBuf {
641 let dir = index_dir(root);
642 let zst = dir.join("bm25_index.bin.zst");
643 if zst.exists() {
644 return zst;
645 }
646 let bin = dir.join("bm25_index.bin");
647 if bin.exists() {
648 return bin;
649 }
650 dir.join("bm25_index.json")
651 }
652
653 pub fn ingest_content_chunks(
657 &mut self,
658 chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
659 ) -> usize {
660 let mut count = 0usize;
661 for cc in chunks {
662 self.add_chunk(cc.into());
663 count += 1;
664 }
665 if count > 0 {
666 self.finalize();
667 }
668 count
669 }
670
671 pub fn external_chunk_count(&self) -> usize {
673 self.chunks
674 .iter()
675 .filter(|c| c.file_path.contains("://"))
676 .count()
677 }
678}
679
/// Returns whether `root` passes the scan-root safety check shared with the
/// graph index (`is_safe_scan_root_public`).
fn is_safe_bm25_root(root: &Path) -> bool {
    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
}
683
/// Full staleness check: compares every indexed file against the filesystem
/// and also looks for files that are not in the index yet.
fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, false)
}
687
/// Fast staleness check: compares only a sample of the indexed files against
/// the filesystem and does not look for new files.
pub fn bm25_index_looks_stale_fast(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, true)
}
693
694fn bm25_index_looks_stale_inner(index: &BM25Index, root: &Path, fast: bool) -> bool {
695 if index.chunks.is_empty() {
696 return false;
697 }
698
699 if index.files.is_empty() {
700 let mut seen = std::collections::HashSet::<&str>::new();
701 for chunk in &index.chunks {
702 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
703 if rel.is_empty() {
704 continue;
705 }
706 if !seen.insert(rel) {
707 continue;
708 }
709 if !root.join(rel).exists() {
710 return true;
711 }
712 }
713 return false;
714 }
715
716 if fast {
717 let sample_size = index.files.len().min(SENTINEL_SAMPLE_SIZE);
718 let step = if index.files.len() > sample_size {
719 index.files.len() / sample_size
720 } else {
721 1
722 };
723 for (i, (rel, old_state)) in index.files.iter().enumerate() {
724 if i % step != 0 {
725 continue;
726 }
727 let abs = root.join(rel);
728 if !abs.exists() {
729 return true;
730 }
731 let Some(cur) = IndexedFileState::from_path(&abs) else {
732 return true;
733 };
734 if &cur != old_state {
735 return true;
736 }
737 }
738 return false;
739 }
740
741 for (rel, old_state) in &index.files {
742 let abs = root.join(rel);
743 if !abs.exists() {
744 return true;
745 }
746 let Some(cur) = IndexedFileState::from_path(&abs) else {
747 return true;
748 };
749 if &cur != old_state {
750 return true;
751 }
752 }
753
754 for rel in list_code_files(root) {
755 if !index.files.contains_key(&rel) {
756 return true;
757 }
758 }
759
760 false
761}
762
/// Target number of indexed files the fast staleness check inspects; it
/// determines the sampling step in `bm25_index_looks_stale_inner`.
const SENTINEL_SAMPLE_SIZE: usize = 10;
764
765fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
766 use std::io::Read;
767 let mut decoder = zstd::Decoder::new(compressed).ok()?;
768 let mut buf = Vec::new();
769 let mut chunk = vec![0u8; 65536];
770 let mut total = 0u64;
771 loop {
772 let n = decoder.read(&mut chunk).ok()?;
773 if n == 0 {
774 break;
775 }
776 total += n as u64;
777 if total > max_bytes {
778 tracing::warn!(
779 "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
780 total as f64 / (1024.0 * 1024.0),
781 max_bytes as f64 / (1024.0 * 1024.0)
782 );
783 return None;
784 }
785 buf.extend_from_slice(&chunk[..n]);
786 }
787 Some(buf)
788}
789
/// Directory in which the index files for `root` are stored, as provided by
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
793
794fn list_code_files(root: &Path) -> Vec<String> {
795 let walker = ignore::WalkBuilder::new(root)
796 .hidden(true)
797 .git_ignore(true)
798 .git_global(true)
799 .git_exclude(true)
800 .max_depth(Some(20))
801 .build();
802
803 let cfg = crate::core::config::Config::load();
804 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
805 .iter()
806 .filter_map(|p| glob::Pattern::new(p).ok())
807 .collect();
808 ignore_patterns.extend(
809 cfg.extra_ignore_patterns
810 .iter()
811 .filter_map(|p| glob::Pattern::new(p).ok()),
812 );
813
814 let mut files: Vec<String> = Vec::new();
815 for entry in walker.flatten() {
816 let path = entry.path();
817 if !path.is_file() {
818 continue;
819 }
820 if !is_code_file(path) {
821 continue;
822 }
823 let rel = path
824 .strip_prefix(root)
825 .unwrap_or(path)
826 .to_string_lossy()
827 .to_string();
828 if rel.is_empty() {
829 continue;
830 }
831 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
832 continue;
833 }
834 if files.len() >= MAX_BM25_FILES {
835 tracing::warn!(
836 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
837 root.display()
838 );
839 break;
840 }
841 files.push(rel);
842 }
843
844 files.sort();
845 files.dedup();
846 files
847}
848
/// Returns whether `path` has one of the recognised source-code extensions.
///
/// The comparison is case-insensitive. Paths without an extension, or with
/// an extension that is not valid UTF-8, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
885
886fn tokenize(text: &str) -> Vec<String> {
887 let mut tokens = Vec::new();
888 let mut current = String::new();
889
890 for ch in text.chars() {
891 if ch.is_alphanumeric() || ch == '_' {
892 current.push(ch);
893 } else {
894 if current.len() >= 2 {
895 tokens.push(current.clone());
896 }
897 current.clear();
898 }
899 }
900 if current.len() >= 2 {
901 tokens.push(current);
902 }
903
904 split_camel_case_tokens(&tokens)
905}
906
/// Crate-visible entry point to the tokenizer used by the index; returns
/// exactly what `tokenize` returns.
pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
    tokenize(text)
}
910
/// Expands tokens with their camelCase / PascalCase components.
///
/// Every input token is kept as is, followed by its parts when it contains
/// at least one boundary. A boundary lies before an uppercase character that
/// either
/// * follows a character that is not uppercase (`parse|URL`, `Vec2|D`), or
/// * is followed by a lowercase character (`HTTP|Server`), which ends an
///   acronym run.
///
/// A run of capitals therefore stays together (`parseURL` yields `parse` and
/// `URL`; `ABC` yields no parts). Parts shorter than two bytes are dropped.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let after_non_upper = !chars[i - 1].is_uppercase();
            let before_lower = chars.get(i + 1).is_some_and(|c| c.is_lowercase());
            if after_non_upper || before_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }

        // Emit the tail only when the token was actually split.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }
    result
}
935
/// Splits the content of one file into `CodeChunk`s.
///
/// With the `tree-sitter` feature enabled, the tree-sitter based extractor is
/// tried first and its result is returned when it yields one. Otherwise the
/// file is scanned line by line: every line recognised by `detect_symbol`
/// starts a chunk that extends to the line reported by `find_block_end`, and
/// scanning resumes after that line.
///
/// When no symbol is found in a non-empty file, the content is handed to the
/// Rabin-Karp chunker; if that yields between 1 and 200 chunks, the first 50
/// become `Module` chunks. Failing that, a single `Module` chunk holding the
/// first 50 lines represents the whole file.
fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
    #[cfg(feature = "tree-sitter")]
    {
        let ext = std::path::Path::new(file_path)
            .extension()
            .and_then(|e| e.to_str())
            .unwrap_or("");
        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
            return chunks;
        }
    }

    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let mut chunks = Vec::new();
    let mut i = 0;

    while i < lines.len() {
        let trimmed = lines[i].trim();

        if let Some((name, kind)) = detect_symbol(trimmed) {
            let start = i;
            let end = find_block_end(&lines, i);
            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
            let token_count = tokenize(&block).len();

            chunks.push(CodeChunk {
                file_path: file_path.to_string(),
                symbol_name: name,
                kind,
                // Line numbers are reported 1-based.
                start_line: start + 1,
                end_line: end + 1,
                content: block,
                tokens: Vec::new(),
                token_count,
            });

            // Symbols nested inside the block are not chunked separately.
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if chunks.is_empty() && !content.is_empty() {
        let bytes = content.as_bytes();
        let rk_chunks = crate::core::rabin_karp::chunk(content);
        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
                let end = (c.offset + c.length).min(bytes.len());
                let slice = &bytes[c.offset..end];
                // Lossy conversion: a byte slice may not end on a character boundary.
                let chunk_text = String::from_utf8_lossy(slice).into_owned();
                let token_count = tokenize(&chunk_text).len();
                // Derive line numbers by counting newlines before and inside the slice.
                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
                let end_line = start_line + bytecount::count(slice, b'\n');
                chunks.push(CodeChunk {
                    file_path: file_path.to_string(),
                    symbol_name: format!("{file_path}#chunk-{idx}"),
                    kind: ChunkKind::Module,
                    start_line,
                    end_line: end_line.max(start_line),
                    content: chunk_text,
                    tokens: Vec::new(),
                    token_count,
                });
            }
        } else {
            // The token count covers the whole file, the stored content only
            // its first 50 lines.
            let token_count = tokenize(content).len();
            let snippet = lines
                .iter()
                .take(50)
                .copied()
                .collect::<Vec<_>>()
                .join("\n");
            chunks.push(CodeChunk {
                file_path: file_path.to_string(),
                symbol_name: file_path.to_string(),
                kind: ChunkKind::Module,
                start_line: 1,
                end_line: lines.len(),
                content: snippet,
                tokens: Vec::new(),
                token_count,
            });
        }
    }

    chunks
}
1031
1032fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
1033 let trimmed = line.trim();
1034
1035 let patterns: &[(&str, ChunkKind)] = &[
1036 ("pub async fn ", ChunkKind::Function),
1037 ("async fn ", ChunkKind::Function),
1038 ("pub fn ", ChunkKind::Function),
1039 ("fn ", ChunkKind::Function),
1040 ("pub struct ", ChunkKind::Struct),
1041 ("struct ", ChunkKind::Struct),
1042 ("pub enum ", ChunkKind::Struct),
1043 ("enum ", ChunkKind::Struct),
1044 ("impl ", ChunkKind::Impl),
1045 ("pub trait ", ChunkKind::Struct),
1046 ("trait ", ChunkKind::Struct),
1047 ("export function ", ChunkKind::Function),
1048 ("export async function ", ChunkKind::Function),
1049 ("export default function ", ChunkKind::Function),
1050 ("function ", ChunkKind::Function),
1051 ("async function ", ChunkKind::Function),
1052 ("export class ", ChunkKind::Class),
1053 ("class ", ChunkKind::Class),
1054 ("export interface ", ChunkKind::Struct),
1055 ("interface ", ChunkKind::Struct),
1056 ("def ", ChunkKind::Function),
1057 ("async def ", ChunkKind::Function),
1058 ("class ", ChunkKind::Class),
1059 ("func ", ChunkKind::Function),
1060 ];
1061
1062 for (prefix, kind) in patterns {
1063 if let Some(rest) = trimmed.strip_prefix(prefix) {
1064 let name: String = rest
1065 .chars()
1066 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
1067 .take_while(|c| *c != '<')
1068 .collect();
1069 if !name.is_empty() {
1070 return Some((name, kind.clone()));
1071 }
1072 }
1073 }
1074
1075 None
1076}
1077
/// Returns the index of the line on which the block starting at `start` ends.
///
/// Brackets are counted from `start` onwards, with `{` and `(` as openers and
/// `}` and `)` as closers; closers seen while nothing is open are ignored.
/// The block ends on the line where the count returns to zero. If nothing is
/// open at the end of the fourth scanned line (or any later one), the line
/// before it is returned. If the lines run out first, the result is
/// `start + 50`, capped to the last line.
///
/// NOTE(review): because `(` counts as an opener, a signature such as
/// `fn f(a) {` ends the block on its own line as soon as `)` closes — this
/// may not be what callers expect; confirm before relying on chunk extents.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut open = 0i32;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' | '(' => open += 1,
                '}' | ')' if open > 0 => {
                    open -= 1;
                    if open == 0 {
                        return i;
                    }
                }
                _ => {}
            }
        }

        // Nothing opened within the first lines: treat it as a short block.
        if open == 0 && i > start + 2 {
            return i - 1;
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
1115
1116pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
1117 if results.is_empty() {
1118 return "No results found.".to_string();
1119 }
1120
1121 let mut out = String::new();
1122 for (i, r) in results.iter().enumerate() {
1123 let is_external = r.file_path.contains("://");
1124 let normalized;
1128 let file_path: &str = if is_external {
1129 &r.file_path
1130 } else {
1131 normalized = crate::core::protocol::display_path(&r.file_path);
1132 &normalized
1133 };
1134 if compact {
1135 if is_external {
1136 out.push_str(&format!(
1137 "{}. {:.2} [{:?}] {} — {}\n",
1138 i + 1,
1139 r.score,
1140 r.kind,
1141 file_path,
1142 r.symbol_name,
1143 ));
1144 } else {
1145 out.push_str(&format!(
1146 "{}. {:.2} {}:{}-{} {:?} {}\n",
1147 i + 1,
1148 r.score,
1149 file_path,
1150 r.start_line,
1151 r.end_line,
1152 r.kind,
1153 r.symbol_name,
1154 ));
1155 }
1156 } else if is_external {
1157 out.push_str(&format!(
1158 "\n--- Result {} (score: {:.2}) [{:?}] ---\n{} — {}\n{}\n",
1159 i + 1,
1160 r.score,
1161 r.kind,
1162 file_path,
1163 r.symbol_name,
1164 r.snippet,
1165 ));
1166 } else {
1167 out.push_str(&format!(
1168 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
1169 i + 1,
1170 r.score,
1171 file_path,
1172 r.symbol_name,
1173 r.kind,
1174 r.start_line,
1175 r.end_line,
1176 r.snippet,
1177 ));
1178 }
1179 }
1180 out
1181}
1182
1183fn enrich_for_bm25(chunk: &CodeChunk) -> String {
1190 let path = Path::new(&chunk.file_path);
1191 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1192 let dir = path
1193 .parent()
1194 .and_then(|p| p.file_name())
1195 .and_then(|d| d.to_str())
1196 .unwrap_or("");
1197
1198 if stem.is_empty() {
1199 return chunk.content.clone();
1200 }
1201
1202 format!("{} {} {} {}", chunk.content, stem, stem, dir)
1203}
1204
1205#[cfg(test)]
1206mod tests {
1207 use super::*;
1208 use tempfile::tempdir;
1209
1210 #[cfg(unix)]
1211 use std::os::unix::fs::PermissionsExt;
1212
    /// The tokenizer yields identifiers and type names from a Rust signature.
    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        assert!(tokens.contains(&"calculate_total".to_string()));
        assert!(tokens.contains(&"items".to_string()));
        assert!(tokens.contains(&"Vec".to_string()));
    }
1220
    /// Local paths with backslashes are printed with forward slashes in both
    /// the compact and the verbose format.
    #[test]
    fn format_search_results_normalizes_windows_separators() {
        let r = SearchResult {
            chunk_idx: 0,
            score: 1.0,
            file_path: r"C:\Users\zir\AppData\Local\Temp\win-build-log.txt".to_string(),
            symbol_name: "main".to_string(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 2,
            snippet: "x".to_string(),
        };
        let compact = format_search_results(std::slice::from_ref(&r), true);
        assert!(compact.contains("C:/Users/zir/AppData/Local/Temp/win-build-log.txt"));
        assert!(!compact.contains('\\'));

        let verbose = format_search_results(std::slice::from_ref(&r), false);
        assert!(verbose.contains("C:/Users/zir/AppData/Local/Temp/win-build-log.txt"));
        assert!(!verbose.contains('\\'));
    }
1244
    /// Paths containing `://` are printed verbatim.
    #[test]
    fn format_search_results_leaves_external_uris_untouched() {
        let r = SearchResult {
            chunk_idx: 0,
            score: 1.0,
            file_path: "github://owner/repo/issues/42".to_string(),
            symbol_name: "issue".to_string(),
            kind: ChunkKind::Module,
            start_line: 0,
            end_line: 0,
            snippet: "y".to_string(),
        };
        let out = format_search_results(std::slice::from_ref(&r), true);
        assert!(out.contains("github://owner/repo/issues/42"));
    }
1262
    /// A camelCase token is kept and its parts are appended.
    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        assert!(tokens.contains(&"calculateTotal".to_string()));
        assert!(tokens.contains(&"calculate".to_string()));
        assert!(tokens.contains(&"Total".to_string()));
    }
1270
    /// A `pub fn` line is detected as a function with the right name.
    #[test]
    fn detect_rust_function() {
        let (name, kind) =
            detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }
1278
/// With one auth-related and one database-related chunk indexed, a query about
/// JWT token validation must rank the auth chunk first.
#[test]
fn bm25_search_finds_relevant() {
    let fixtures = vec![
        CodeChunk {
            file_path: "auth.rs".into(),
            symbol_name: "validate_token".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
            tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
            token_count: 8,
        },
        CodeChunk {
            file_path: "db.rs".into(),
            symbol_name: "connect_database".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
            tokens: tokenize("fn connect_database url str Pool create_pool url"),
            token_count: 7,
        },
    ];

    let mut index = BM25Index::new();
    for chunk in fixtures {
        index.add_chunk(chunk);
    }
    index.finalize();

    let hits = index.search("jwt token validation", 5);
    assert!(!hits.is_empty());
    assert_eq!(hits[0].symbol_name, "validate_token");
}
1308
/// Two chunks that differ only in file path must come back ordered by path
/// (`a.rs` before `b.rs`) even though `b.rs` was added to the index first.
#[test]
fn bm25_search_sorts_ties_deterministically() {
    // Builds the shared fixture; only the file path varies between the two chunks.
    let same_chunk_in = |path: &str| CodeChunk {
        file_path: path.into(),
        symbol_name: "same".into(),
        kind: ChunkKind::Function,
        start_line: 1,
        end_line: 1,
        content: "fn same() {}".into(),
        tokens: tokenize("same token"),
        token_count: 2,
    };

    let mut index = BM25Index::new();
    // Insert in reverse path order so insertion order alone cannot satisfy the checks.
    index.add_chunk(same_chunk_in("b.rs"));
    index.add_chunk(same_chunk_in("a.rs"));
    index.finalize();

    let ranked = index.search("same", 10);
    assert!(ranked.len() >= 2);
    assert_eq!(ranked[0].file_path, "a.rs");
    assert_eq!(ranked[1].file_path, "b.rs");
}
1341
/// A freshly built index is not stale; deleting one of its indexed files must
/// make `bm25_index_looks_stale` report it as stale.
#[test]
fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    let source_file = root.join("a.rs");
    std::fs::write(&source_file, "pub fn a() {}\n").expect("write a.rs");

    let index = BM25Index::build_from_directory(root);
    let stale_when_fresh = bm25_index_looks_stale(&index, root);
    assert!(!stale_when_fresh);

    std::fs::remove_file(&source_file).expect("remove a.rs");
    let stale_after_delete = bm25_index_looks_stale(&index, root);
    assert!(stale_after_delete);
}
1354
/// After an initial build, `a.rs` has all permission bits cleared and `b.rs` is
/// rewritten. The incremental rebuild must still list `a.rs` and must pick up the
/// new contents of `b.rs`.
#[test]
#[cfg(unix)]
fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    let a_path = root.join("a.rs");
    let b_path = root.join("b.rs");

    std::fs::write(&a_path, "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
    std::fs::write(&b_path, "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

    let baseline = BM25Index::build_from_directory(root);
    for name in ["a.rs", "b.rs"] {
        assert!(baseline.files.contains_key(name));
    }

    // Mode 0o000 on a.rs: presumably this makes the file unreadable for an
    // unprivileged user, so it can only survive the rebuild through reuse of the
    // previous index entry — confirm against `rebuild_incremental`.
    let mut locked = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
    locked.set_mode(0o000);
    std::fs::set_permissions(&a_path, locked).expect("chmod a.rs");

    std::fs::write(&b_path, "pub fn b() { println!(\"B2\"); }\n").expect("rewrite b.rs");

    let rebuilt = BM25Index::rebuild_incremental(root, &baseline);
    assert!(
        rebuilt.files.contains_key("a.rs"),
        "a.rs should be kept via reuse"
    );
    assert!(rebuilt.files.contains_key("b.rs"));

    let mut saw_b2 = false;
    for chunk in rebuilt.chunks.iter() {
        if chunk.file_path == "b.rs" && chunk.content.contains("B2") {
            saw_b2 = true;
            break;
        }
    }
    assert!(saw_b2, "b.rs should be re-read and re-chunked");

    // Best-effort restore of ordinary permissions; a failure here is ignored.
    let mut restored = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
    restored.set_mode(0o644);
    let _ = std::fs::set_permissions(&a_path, restored);
}
1396
/// With the cache ceiling forced to 0 MB, loading an existing `bm25_index.json`
/// must return `None`, remove the original file and leave a
/// `bm25_index.json.quarantined` file in the vectors directory.
#[test]
fn load_quarantines_oversized_index() {
    let _env = crate::core::data_dir::test_env_lock();
    // Isolate the data dir the same way the sibling save/load tests do, so this
    // test neither writes into an ambient data dir nor depends on a value left
    // behind by another test.
    // NOTE(review): assumes `vectors_dir` resolves under `LEAN_CTX_DATA_DIR` — confirm.
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());

    let td = tempdir().expect("tempdir");
    let root = td.path();
    let dir = crate::core::index_namespace::vectors_dir(root);
    std::fs::create_dir_all(&dir).expect("create vectors dir");

    let index_path = dir.join("bm25_index.json");
    // A 0 MB ceiling makes any non-empty index file exceed the limit.
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
    std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

    let result = BM25Index::load(root);
    assert!(result.is_none(), "oversized index should return None");
    assert!(
        !index_path.exists(),
        "original index should be removed after quarantine"
    );
    assert!(
        dir.join("bm25_index.json.quarantined").exists(),
        "quarantined file should exist"
    );

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1422
/// With the cache ceiling forced to 0 MB, `save` must return
/// `Ok(SaveOutcome::SkippedTooLarge { .. })` and must not leave an index file on disk.
#[test]
fn save_refuses_oversized_output() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

    let td = tempdir().expect("tempdir");
    let root = td.path();

    let mut index = BM25Index::new();
    index.add_chunk(CodeChunk {
        file_path: "a.rs".into(),
        symbol_name: "a".into(),
        kind: ChunkKind::Function,
        start_line: 1,
        end_line: 1,
        content: "fn a() {}".into(),
        tokens: tokenize("fn a"),
        token_count: 2,
    });
    index.finalize();

    let outcome = index
        .save(root)
        .expect("save returns Ok even when refusing");
    assert!(
        matches!(outcome, SaveOutcome::SkippedTooLarge { .. }),
        "oversized save must report SkippedTooLarge (not a silent success), got {outcome:?}"
    );
    let index_path = BM25Index::index_file_path(root);
    assert!(
        !index_path.exists(),
        "save should refuse to persist oversized index"
    );

    // Undo both overrides. Leaving LEAN_CTX_DATA_DIR set would keep it pointing
    // at `data_dir` after this test ends (presumably a `tempfile` directory that
    // is deleted on drop), which later tests would then inherit.
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1461
/// With a generous 512 MB ceiling, saving a one-file index must report
/// `SaveOutcome::Persisted` with a non-zero compressed size.
#[test]
fn save_reports_persisted_outcome() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");

    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    std::fs::write(root.join("a.rs"), "pub fn alpha() {}\n").expect("write");

    let outcome = BM25Index::build_from_directory(root)
        .save(root)
        .expect("save");
    if let SaveOutcome::Persisted { compressed_bytes } = outcome {
        assert!(compressed_bytes > 0, "persisted size should be non-zero");
    } else {
        panic!("expected Persisted, got {outcome:?}")
    }

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1486
/// `persist_ceiling_bytes` must convert the `LEAN_CTX_BM25_MAX_CACHE_MB` override
/// from megabytes to bytes.
#[test]
fn persist_ceiling_honors_env_override() {
    const VAR: &str = "LEAN_CTX_BM25_MAX_CACHE_MB";
    let _env = crate::core::data_dir::test_env_lock();

    std::env::set_var(VAR, "777");
    let ceiling = persist_ceiling_bytes();
    assert_eq!(ceiling, 777 * 1024 * 1024);
    std::env::remove_var(VAR);
}
1496
/// Saving an index must create a `project_root.txt` file inside
/// `vectors_dir(root)` whose content is the project root path.
#[test]
fn save_writes_project_root_marker() {
    let _env = crate::core::data_dir::test_env_lock();
    // Isolate the data dir the same way the sibling save/load tests do, so the
    // saved index and marker land in a scratch location rather than in whatever
    // data dir happens to be ambient.
    // NOTE(review): assumes `vectors_dir` resolves under `LEAN_CTX_DATA_DIR` — confirm.
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());

    let td = tempdir().expect("tempdir");
    let root = td.path();
    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

    // Clear any cache-size override so the configured default ceiling applies.
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    let index = BM25Index::build_from_directory(root);
    index.save(root).expect("save");

    let dir = crate::core::index_namespace::vectors_dir(root);
    let marker = dir.join("project_root.txt");
    assert!(marker.exists(), "project_root.txt marker should exist");
    let content = std::fs::read_to_string(&marker).expect("read marker");
    assert_eq!(content, root.to_string_lossy());

    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1514
/// Saving must produce `bm25_index.bin.zst` (and no plain `bm25_index.bin`) in the
/// vectors directory, and loading it back must preserve the document and chunk counts.
#[test]
fn save_load_roundtrip_uses_zstd() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");

    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    // Ten small modules, each with two functions, so the index is non-trivial.
    (0..10).for_each(|i| {
        let module_path = root.join(format!("mod{i}.rs"));
        let module_source = format!(
            "pub fn handler_{i}() {{\n println!(\"hello\");\n}}\n\n\
             pub fn helper_{i}() {{\n println!(\"world\");\n}}\n"
        );
        std::fs::write(module_path, module_source).expect("write");
    });

    let built = BM25Index::build_from_directory(root);
    assert!(built.doc_count > 0, "should have indexed chunks");
    built.save(root).expect("save");

    let vectors = crate::core::index_namespace::vectors_dir(root);
    let compressed = vectors.join("bm25_index.bin.zst");
    let uncompressed = vectors.join("bm25_index.bin");
    assert!(compressed.exists(), "should write .bin.zst");
    assert!(!uncompressed.exists(), ".bin should be deleted");

    let reloaded = BM25Index::load(root).expect("load compressed index");
    assert_eq!(reloaded.doc_count, built.doc_count);
    assert_eq!(reloaded.chunks.len(), built.chunks.len());

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1554
/// When only a bincode-encoded `bm25_index.bin` exists, `load` must succeed, leave a
/// `bm25_index.bin.zst` behind and remove the plain `.bin` file.
#[test]
fn auto_migrate_bin_to_zst() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");

    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
    let built = BM25Index::build_from_directory(root);

    let vectors = crate::core::index_namespace::vectors_dir(root);
    std::fs::create_dir_all(&vectors).expect("mkdir");
    let legacy_path = vectors.join("bm25_index.bin");
    let migrated_path = vectors.join("bm25_index.bin.zst");

    // Hand-write the uncompressed form directly, bypassing `save`.
    let encoded =
        bincode::serde::encode_to_vec(&built, bincode::config::standard()).expect("encode");
    std::fs::write(&legacy_path, &encoded).expect("write bin");

    let reloaded = BM25Index::load(root).expect("load should auto-migrate");
    assert_eq!(reloaded.doc_count, built.doc_count);
    assert!(migrated_path.exists(), ".bin.zst should be created");
    assert!(!legacy_path.exists(), ".bin should be removed");

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");
}
1587
/// `list_code_files` must return the top-level source file while omitting files
/// under `vendor/` and `dist/`, which are covered by `DEFAULT_BM25_IGNORES`.
#[test]
fn list_code_files_skips_default_vendor_ignores() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    let vendor_dir = root.join("vendor/lib");
    let dist_dir = root.join("dist");

    std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");
    std::fs::create_dir_all(&vendor_dir).expect("mkdir vendor");
    std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");
    std::fs::create_dir_all(&dist_dir).expect("mkdir dist");
    std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

    let listed = list_code_files(root);
    let any_under = |prefix: &str| listed.iter().any(|f| f.starts_with(prefix));

    assert!(
        listed.iter().any(|f| f == "main.rs"),
        "main.rs should be included"
    );
    assert!(
        !any_under("vendor/"),
        "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
    );
    assert!(
        !any_under("dist/"),
        "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
    );
}
1613
/// Checks that `list_code_files` never returns more than `MAX_BM25_FILES` entries.
///
/// NOTE(review): only 10 files are created, far below the cap, so this only checks
/// the upper bound on a small directory; it does not exercise truncation.
#[test]
fn list_code_files_respects_max_files_cap() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    (0..10).for_each(|i| {
        let path = root.join(format!("f{i}.rs"));
        let source = format!("pub fn f{i}() {{}}\n");
        std::fs::write(path, source).expect("write");
    });

    let listed = list_code_files(root);
    let within_cap = listed.len() <= MAX_BM25_FILES;
    assert!(within_cap, "file count should not exceed MAX_BM25_FILES");
}
1634
/// `max_bm25_cache_bytes` must read `LEAN_CTX_BM25_MAX_CACHE_MB` and return the
/// value converted from megabytes to bytes.
#[test]
fn max_bm25_cache_bytes_reads_env() {
    const VAR: &str = "LEAN_CTX_BM25_MAX_CACHE_MB";
    let _env = crate::core::data_dir::test_env_lock();

    std::env::set_var(VAR, "64");
    let limit = max_bm25_cache_bytes();
    assert_eq!(limit, 64 * 1024 * 1024);
    std::env::remove_var(VAR);
}
1643}