1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the number of files collected by `list_code_files`.
const MAX_BM25_FILES: usize = 5_000;

/// Chunk count above which `BM25Index::save` logs a warning.
const CHUNK_COUNT_WARNING: usize = 50_000;

/// Compression level passed to `zstd::encode_all` when persisting the index.
const ZSTD_LEVEL: i32 = 9;

/// Glob patterns (matched against root-relative paths) that are always
/// excluded from indexing, in addition to the user-configured patterns.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    // Dependency and build output directories.
    "vendor/**",
    "dist/**",
    "build/**",
    // Published web assets.
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    // Framework / interpreter caches.
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    // Minified or bundled artifacts.
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
27
28fn max_bm25_cache_bytes() -> u64 {
29 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
30 .ok()
31 .and_then(|v| v.parse::<u64>().ok())
32 .unwrap_or_else(|| {
33 let cfg = crate::core::config::Config::load();
34 let profile = crate::core::config::MemoryProfile::effective(&cfg);
35 let profile_mb = profile.bm25_max_cache_mb();
36 if cfg.bm25_max_cache_mb == crate::core::config::default_bm25_max_cache_mb() {
37 profile_mb
38 } else {
39 cfg.bm25_max_cache_mb
40 }
41 });
42 mb * 1024 * 1024
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct CodeChunk {
47 pub file_path: String,
48 pub symbol_name: String,
49 pub kind: ChunkKind,
50 pub start_line: usize,
51 pub end_line: usize,
52 pub content: String,
53 #[serde(default)]
54 pub tokens: Vec<String>,
55 pub token_count: usize,
56}
57
58#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
59pub enum ChunkKind {
60 Function,
61 Struct,
62 Impl,
63 Module,
64 Class,
65 Method,
66 Other,
67 Issue,
69 PullRequest,
70 WikiPage,
71 DbSchema,
72 ApiEndpoint,
73 Ticket,
74 ExternalOther,
75}
76
77#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
78pub struct IndexedFileState {
79 pub mtime_ms: u64,
80 pub size_bytes: u64,
81}
82
83impl IndexedFileState {
84 fn from_path(path: &Path) -> Option<Self> {
85 let meta = path.metadata().ok()?;
86 let size_bytes = meta.len();
87 let mtime_ms = meta
88 .modified()
89 .ok()
90 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
91 .map(|d| d.as_millis() as u64)?;
92 Some(Self {
93 mtime_ms,
94 size_bytes,
95 })
96 }
97}
98
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct BM25Index {
101 pub chunks: Vec<CodeChunk>,
102 pub inverted: HashMap<String, Vec<(usize, f64)>>,
103 pub avg_doc_len: f64,
104 pub doc_count: usize,
105 pub doc_freqs: HashMap<String, usize>,
106 #[serde(default)]
107 pub files: HashMap<String, IndexedFileState>,
108}
109
110#[derive(Debug, Clone, Serialize, Deserialize)]
111pub struct SearchResult {
112 pub chunk_idx: usize,
113 pub score: f64,
114 pub file_path: String,
115 pub symbol_name: String,
116 pub kind: ChunkKind,
117 pub start_line: usize,
118 pub end_line: usize,
119 pub snippet: String,
120}
121
/// BM25 `k1` parameter (term-frequency saturation) used by the scorer.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter (document-length normalisation) used by the scorer.
const BM25_B: f64 = 0.75;
124
125impl Default for BM25Index {
126 fn default() -> Self {
127 Self::new()
128 }
129}
130
131impl BM25Index {
132 pub fn new() -> Self {
133 Self {
134 chunks: Vec::new(),
135 inverted: HashMap::new(),
136 avg_doc_len: 0.0,
137 doc_count: 0,
138 doc_freqs: HashMap::new(),
139 files: HashMap::new(),
140 }
141 }
142
143 pub fn memory_usage_bytes(&self) -> usize {
145 let chunks_size: usize = self
146 .chunks
147 .iter()
148 .map(|c| {
149 c.content.len()
150 + c.file_path.len()
151 + c.symbol_name.len()
152 + c.tokens.iter().map(String::len).sum::<usize>()
153 + 64
154 })
155 .sum();
156 let inverted_size: usize = self
157 .inverted
158 .iter()
159 .map(|(k, v)| k.len() + v.len() * 16 + 32)
160 .sum();
161 let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
162 let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
163 chunks_size + inverted_size + files_size + freqs_size
164 }
165
166 pub fn unload(&mut self) {
168 let usage = self.memory_usage_bytes();
169 self.chunks = Vec::new();
170 self.inverted = HashMap::new();
171 self.doc_freqs = HashMap::new();
172 self.files = HashMap::new();
173 self.avg_doc_len = 0.0;
174 self.doc_count = 0;
175 tracing::info!(
176 "[bm25] unloaded index, freed ~{:.1}MB",
177 usage as f64 / 1_048_576.0
178 );
179 }
180
181 #[cfg(test)]
183 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
184 let mut index = Self::new();
185 for mut chunk in chunks {
186 if chunk.token_count == 0 {
187 chunk.token_count = tokenize(&chunk.content).len();
188 }
189 index.add_chunk(chunk);
190 }
191 index.finalize();
192 index
193 }
194
195 pub fn build_from_directory(root: &Path) -> Self {
196 Self::build_from_directory_inner(root, &HashMap::new())
197 }
198
199 pub fn build_with_content_hint(root: &Path, content_hint: &HashMap<String, String>) -> Self {
202 Self::build_from_directory_inner(root, content_hint)
203 }
204
205 fn build_from_directory_inner(root: &Path, content_hint: &HashMap<String, String>) -> Self {
206 let root_str = root.to_string_lossy();
207 if !super::graph_index::is_safe_scan_root_public(&root_str) {
208 tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
209 return Self::new();
210 }
211 let mut index = Self::new();
212 let files = list_code_files(root);
213 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
214 let mut cache_hits = 0usize;
215
216 for (i, rel) in files.iter().enumerate() {
217 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
218 tracing::warn!(
219 "[bm25: stopping build at file {i}/{} due to memory pressure]",
220 files.len()
221 );
222 break;
223 }
224 if crate::core::memory_guard::abort_requested() {
225 tracing::warn!("[bm25: aborting build due to critical memory pressure]");
226 break;
227 }
228
229 let abs = root.join(rel);
230 let Some(state) = IndexedFileState::from_path(&abs) else {
231 continue;
232 };
233 if state.size_bytes > MAX_FILE_SIZE_BYTES {
234 continue;
235 }
236
237 let content = if let Some(cached) = content_hint.get(rel) {
238 cache_hits += 1;
239 std::borrow::Cow::Borrowed(cached.as_str())
240 } else {
241 match std::fs::read_to_string(&abs) {
242 Ok(c) => std::borrow::Cow::Owned(c),
243 Err(_) => continue,
244 }
245 };
246
247 let mut chunks = extract_chunks(rel, &content);
248 chunks.sort_by(|a, b| {
249 a.start_line
250 .cmp(&b.start_line)
251 .then_with(|| a.end_line.cmp(&b.end_line))
252 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
253 });
254 for chunk in chunks {
255 index.add_chunk(chunk);
256 }
257 index.files.insert(rel.clone(), state);
258 }
259
260 if cache_hits > 0 {
261 tracing::info!(
262 "[bm25: reused {cache_hits}/{} file contents from graph scan cache]",
263 files.len()
264 );
265 }
266
267 index.finalize();
268 index
269 }
270
271 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
272 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
273 for c in &prev.chunks {
274 old_by_file
275 .entry(c.file_path.clone())
276 .or_default()
277 .push(c.clone());
278 }
279 for v in old_by_file.values_mut() {
280 v.sort_by(|a, b| {
281 a.start_line
282 .cmp(&b.start_line)
283 .then_with(|| a.end_line.cmp(&b.end_line))
284 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
285 });
286 }
287
288 let mut index = Self::new();
289 let files = list_code_files(root);
290 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
291
292 for (i, rel) in files.iter().enumerate() {
293 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
294 tracing::warn!(
295 "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
296 files.len()
297 );
298 break;
299 }
300
301 let abs = root.join(rel);
302 let Some(state) = IndexedFileState::from_path(&abs) else {
303 continue;
304 };
305
306 let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
307 if unchanged {
308 if let Some(chunks) = old_by_file.get(rel) {
309 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
310 for chunk in chunks {
311 index.add_chunk(chunk.clone());
312 }
313 index.files.insert(rel.clone(), state);
314 continue;
315 }
316 }
317 }
318
319 if state.size_bytes > MAX_FILE_SIZE_BYTES {
320 continue;
321 }
322 if let Ok(content) = std::fs::read_to_string(&abs) {
323 let mut chunks = extract_chunks(rel, &content);
324 chunks.sort_by(|a, b| {
325 a.start_line
326 .cmp(&b.start_line)
327 .then_with(|| a.end_line.cmp(&b.end_line))
328 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
329 });
330 for chunk in chunks {
331 index.add_chunk(chunk);
332 }
333 index.files.insert(rel.clone(), state);
334 }
335 }
336
337 index.finalize();
338 index
339 }
340
341 fn add_chunk(&mut self, chunk: CodeChunk) {
342 let idx = self.chunks.len();
343
344 let enriched = enrich_for_bm25(&chunk);
345 let tokens = tokenize(&enriched);
346 for token in &tokens {
347 let lower = token.to_lowercase();
348 let postings = self.inverted.entry(lower.clone()).or_default();
349 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
350 *self.doc_freqs.entry(lower).or_insert(0) += 1;
351 }
352 postings.push((idx, 1.0));
353 }
354
355 self.chunks.push(CodeChunk {
356 token_count: tokens.len(),
357 tokens: Vec::new(),
358 ..chunk
359 });
360 }
361
362 fn finalize(&mut self) {
363 self.doc_count = self.chunks.len();
364 if self.doc_count == 0 {
365 return;
366 }
367
368 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
369 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
370 }
371
372 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
373 let query_tokens = tokenize(query);
374 if query_tokens.is_empty() || self.doc_count == 0 {
375 return Vec::new();
376 }
377
378 let n = self.chunks.len();
381 let mut scores = vec![0.0f64; n];
382 let mut touched = Vec::with_capacity(n.min(256));
383
384 for token in &query_tokens {
385 let lower = token.to_lowercase();
386 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
387 if df == 0.0 {
388 continue;
389 }
390
391 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
392
393 if let Some(postings) = self.inverted.get(&lower) {
394 for &(idx, weight) in postings {
395 let doc_len = self.chunks[idx].token_count as f64;
396 let norm_len = doc_len / self.avg_doc_len.max(1.0);
397 let bm25 = idf * (weight * (BM25_K1 + 1.0))
398 / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
399
400 if scores[idx] == 0.0 {
401 touched.push(idx);
402 }
403 scores[idx] += bm25;
404 }
405 }
406 }
407
408 let mut results: Vec<SearchResult> = touched
409 .iter()
410 .filter(|&&idx| scores[idx] > 0.0)
411 .map(|&idx| {
412 let chunk = &self.chunks[idx];
413 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
414 SearchResult {
415 chunk_idx: idx,
416 score: scores[idx],
417 file_path: chunk.file_path.clone(),
418 symbol_name: chunk.symbol_name.clone(),
419 kind: chunk.kind.clone(),
420 start_line: chunk.start_line,
421 end_line: chunk.end_line,
422 snippet,
423 }
424 })
425 .collect();
426
427 results.sort_by(|a, b| {
428 b.score
429 .partial_cmp(&a.score)
430 .unwrap_or(std::cmp::Ordering::Equal)
431 .then_with(|| a.file_path.cmp(&b.file_path))
432 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
433 .then_with(|| a.start_line.cmp(&b.start_line))
434 .then_with(|| a.end_line.cmp(&b.end_line))
435 });
436 results.truncate(top_k);
437 results
438 }
439
440 pub fn save(&self, root: &Path) -> std::io::Result<()> {
441 if self.chunks.len() > CHUNK_COUNT_WARNING {
442 tracing::warn!(
443 "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
444 self.chunks.len(),
445 CHUNK_COUNT_WARNING
446 );
447 }
448
449 let dir = index_dir(root);
450 std::fs::create_dir_all(&dir)?;
451 let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
452 .map_err(|e| std::io::Error::other(e.to_string()))?;
453
454 let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
455 .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
456
457 let max_bytes = max_bm25_cache_bytes();
458 if compressed.len() as u64 > max_bytes {
459 tracing::warn!(
460 "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
461 compressed.len() as f64 / 1_048_576.0,
462 max_bytes / (1024 * 1024),
463 dir.display()
464 );
465 return Ok(());
466 }
467
468 tracing::info!(
469 "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
470 data.len() as f64 / 1_048_576.0,
471 compressed.len() as f64 / 1_048_576.0,
472 (1.0 - compressed.len() as f64 / data.len().max(1) as f64) * 100.0
473 );
474
475 let target = dir.join("bm25_index.bin.zst");
476 let tmp = dir.join("bm25_index.bin.zst.tmp");
477 std::fs::write(&tmp, &compressed)?;
478 std::fs::rename(&tmp, &target)?;
479
480 let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
481 let _ = std::fs::remove_file(dir.join("bm25_index.json"));
482
483 let _ = std::fs::write(
484 dir.join("project_root.txt"),
485 root.to_string_lossy().as_bytes(),
486 );
487
488 Ok(())
489 }
490
491 pub fn load(root: &Path) -> Option<Self> {
492 let dir = index_dir(root);
493 let max_bytes = max_bm25_cache_bytes();
494
495 let zst_path = dir.join("bm25_index.bin.zst");
496 if zst_path.exists() {
497 let meta = std::fs::metadata(&zst_path).ok()?;
498 if meta.len() > max_bytes {
499 tracing::warn!(
500 "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
501 meta.len() as f64 / 1_073_741_824.0,
502 max_bytes / (1024 * 1024),
503 zst_path.display()
504 );
505 let quarantined = zst_path.with_extension("zst.quarantined");
506 let _ = std::fs::rename(&zst_path, &quarantined);
507 return None;
508 }
509 let compressed = std::fs::read(&zst_path).ok()?;
510 let max_decompressed = max_bytes * 20; let data = bounded_zstd_decode(&compressed, max_decompressed)?;
512 let (idx, _): (Self, _) =
513 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
514 return Some(idx);
515 }
516
517 let bin_path = dir.join("bm25_index.bin");
518 if bin_path.exists() {
519 let meta = std::fs::metadata(&bin_path).ok()?;
520 if meta.len() > max_bytes {
521 tracing::warn!(
522 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
523 meta.len() as f64 / 1_073_741_824.0,
524 max_bytes / (1024 * 1024),
525 bin_path.display()
526 );
527 let quarantined = bin_path.with_extension("bin.quarantined");
528 let _ = std::fs::rename(&bin_path, &quarantined);
529 return None;
530 }
531 let data = std::fs::read(&bin_path).ok()?;
532 let (idx, _): (Self, _) =
533 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
534 if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
536 let zst_tmp = zst_path.with_extension("zst.tmp");
537 if std::fs::write(&zst_tmp, &compressed).is_ok()
538 && std::fs::rename(&zst_tmp, &zst_path).is_ok()
539 {
540 tracing::info!(
541 "[bm25] migrated {:.1} MB → {:.1} MB zstd",
542 data.len() as f64 / 1_048_576.0,
543 compressed.len() as f64 / 1_048_576.0
544 );
545 let _ = std::fs::remove_file(&bin_path);
546 }
547 }
548 return Some(idx);
549 }
550
551 let json_path = dir.join("bm25_index.json");
552 if json_path.exists() {
553 let meta = std::fs::metadata(&json_path).ok()?;
554 if meta.len() > max_bytes {
555 tracing::warn!(
556 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
557 meta.len() as f64 / 1_073_741_824.0,
558 max_bytes / (1024 * 1024),
559 json_path.display()
560 );
561 let quarantined = json_path.with_extension("json.quarantined");
562 let _ = std::fs::rename(&json_path, &quarantined);
563 return None;
564 }
565 let data = std::fs::read_to_string(&json_path).ok()?;
566 return serde_json::from_str(&data).ok();
567 }
568
569 None
570 }
571
572 pub fn load_or_build(root: &Path) -> Self {
573 Self::load_or_build_inner(root, false)
574 }
575
576 pub fn load_or_build_fast(root: &Path) -> Self {
579 Self::load_or_build_inner(root, true)
580 }
581
582 fn load_or_build_inner(root: &Path, fast_stale: bool) -> Self {
583 if !is_safe_bm25_root(root) {
584 return Self::default();
585 }
586 if let Some(idx) = Self::load(root) {
587 let stale = if fast_stale {
588 bm25_index_looks_stale_fast(&idx, root)
589 } else {
590 bm25_index_looks_stale(&idx, root)
591 };
592 if !stale {
593 return idx;
594 }
595 tracing::debug!(
596 "[bm25_index: stale index detected for {}; rebuilding]",
597 root.display()
598 );
599 let rebuilt = if idx.files.is_empty() {
600 Self::build_from_directory(root)
601 } else {
602 Self::rebuild_incremental(root, &idx)
603 };
604 let _ = rebuilt.save(root);
605 return rebuilt;
606 }
607
608 let built = Self::build_from_directory(root);
609 let _ = built.save(root);
610 built
611 }
612
613 pub fn index_file_path(root: &Path) -> PathBuf {
614 let dir = index_dir(root);
615 let zst = dir.join("bm25_index.bin.zst");
616 if zst.exists() {
617 return zst;
618 }
619 let bin = dir.join("bm25_index.bin");
620 if bin.exists() {
621 return bin;
622 }
623 dir.join("bm25_index.json")
624 }
625
626 pub fn ingest_content_chunks(
630 &mut self,
631 chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
632 ) -> usize {
633 let mut count = 0usize;
634 for cc in chunks {
635 self.add_chunk(cc.into());
636 count += 1;
637 }
638 if count > 0 {
639 self.finalize();
640 }
641 count
642 }
643
644 pub fn external_chunk_count(&self) -> usize {
646 self.chunks
647 .iter()
648 .filter(|c| c.file_path.contains("://"))
649 .count()
650 }
651}
652
653fn is_safe_bm25_root(root: &Path) -> bool {
654 super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
655}
656
657fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
658 bm25_index_looks_stale_inner(index, root, false)
659}
660
661pub fn bm25_index_looks_stale_fast(index: &BM25Index, root: &Path) -> bool {
664 bm25_index_looks_stale_inner(index, root, true)
665}
666
667fn bm25_index_looks_stale_inner(index: &BM25Index, root: &Path, fast: bool) -> bool {
668 if index.chunks.is_empty() {
669 return false;
670 }
671
672 if index.files.is_empty() {
673 let mut seen = std::collections::HashSet::<&str>::new();
674 for chunk in &index.chunks {
675 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
676 if rel.is_empty() {
677 continue;
678 }
679 if !seen.insert(rel) {
680 continue;
681 }
682 if !root.join(rel).exists() {
683 return true;
684 }
685 }
686 return false;
687 }
688
689 if fast {
690 let sample_size = index.files.len().min(SENTINEL_SAMPLE_SIZE);
691 let step = if index.files.len() > sample_size {
692 index.files.len() / sample_size
693 } else {
694 1
695 };
696 for (i, (rel, old_state)) in index.files.iter().enumerate() {
697 if i % step != 0 {
698 continue;
699 }
700 let abs = root.join(rel);
701 if !abs.exists() {
702 return true;
703 }
704 let Some(cur) = IndexedFileState::from_path(&abs) else {
705 return true;
706 };
707 if &cur != old_state {
708 return true;
709 }
710 }
711 return false;
712 }
713
714 for (rel, old_state) in &index.files {
715 let abs = root.join(rel);
716 if !abs.exists() {
717 return true;
718 }
719 let Some(cur) = IndexedFileState::from_path(&abs) else {
720 return true;
721 };
722 if &cur != old_state {
723 return true;
724 }
725 }
726
727 for rel in list_code_files(root) {
728 if !index.files.contains_key(&rel) {
729 return true;
730 }
731 }
732
733 false
734}
735
/// Sample size used by the fast staleness check to space out the tracked
/// files it inspects.
const SENTINEL_SAMPLE_SIZE: usize = 10;
737
738fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
739 use std::io::Read;
740 let mut decoder = zstd::Decoder::new(compressed).ok()?;
741 let mut buf = Vec::new();
742 let mut chunk = vec![0u8; 65536];
743 let mut total = 0u64;
744 loop {
745 let n = decoder.read(&mut chunk).ok()?;
746 if n == 0 {
747 break;
748 }
749 total += n as u64;
750 if total > max_bytes {
751 tracing::warn!(
752 "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
753 total as f64 / (1024.0 * 1024.0),
754 max_bytes as f64 / (1024.0 * 1024.0)
755 );
756 return None;
757 }
758 buf.extend_from_slice(&chunk[..n]);
759 }
760 Some(buf)
761}
762
763fn index_dir(root: &Path) -> PathBuf {
764 crate::core::index_namespace::vectors_dir(root)
765}
766
767fn list_code_files(root: &Path) -> Vec<String> {
768 let walker = ignore::WalkBuilder::new(root)
769 .hidden(true)
770 .git_ignore(true)
771 .git_global(true)
772 .git_exclude(true)
773 .max_depth(Some(20))
774 .build();
775
776 let cfg = crate::core::config::Config::load();
777 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
778 .iter()
779 .filter_map(|p| glob::Pattern::new(p).ok())
780 .collect();
781 ignore_patterns.extend(
782 cfg.extra_ignore_patterns
783 .iter()
784 .filter_map(|p| glob::Pattern::new(p).ok()),
785 );
786
787 let mut files: Vec<String> = Vec::new();
788 for entry in walker.flatten() {
789 let path = entry.path();
790 if !path.is_file() {
791 continue;
792 }
793 if !is_code_file(path) {
794 continue;
795 }
796 let rel = path
797 .strip_prefix(root)
798 .unwrap_or(path)
799 .to_string_lossy()
800 .to_string();
801 if rel.is_empty() {
802 continue;
803 }
804 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
805 continue;
806 }
807 if files.len() >= MAX_BM25_FILES {
808 tracing::warn!(
809 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
810 root.display()
811 );
812 break;
813 }
814 files.push(rel);
815 }
816
817 files.sort();
818 files.dedup();
819 files
820}
821
/// Reports whether `path` has one of the recognised source-code extensions.
///
/// The comparison is case-insensitive. Paths without an extension, or with a
/// non-UTF-8 extension, are not code files.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
858
859fn tokenize(text: &str) -> Vec<String> {
860 let mut tokens = Vec::new();
861 let mut current = String::new();
862
863 for ch in text.chars() {
864 if ch.is_alphanumeric() || ch == '_' {
865 current.push(ch);
866 } else {
867 if current.len() >= 2 {
868 tokens.push(current.clone());
869 }
870 current.clear();
871 }
872 }
873 if current.len() >= 2 {
874 tokens.push(current);
875 }
876
877 split_camel_case_tokens(&tokens)
878}
879
880pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
881 tokenize(text)
882}
883
/// Expands each token with its camel-case parts.
///
/// Every input token is kept as-is and followed by its parts, when it has
/// more than one. A part boundary is placed before an uppercase letter that
/// either follows a non-uppercase character (`fooBar` -> `foo`, `Bar`) or
/// ends an acronym by preceding a lowercase letter (`HTTPServer` -> `HTTP`,
/// `Server`). A trailing uppercase run stays together (`userID` -> `user`,
/// `ID`). Parts shorter than two bytes are discarded.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let after_non_upper = !chars[i - 1].is_uppercase();
            let before_lower = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if after_non_upper || before_lower {
                let part: String = chars[start..i].iter().collect();
                if part.len() >= 2 {
                    result.push(part);
                }
                start = i;
            }
        }
        // Emit the tail only if the token was actually split.
        if start > 0 {
            let part: String = chars[start..].iter().collect();
            if part.len() >= 2 {
                result.push(part);
            }
        }
    }
    result
}
908
909fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
910 #[cfg(feature = "tree-sitter")]
911 {
912 let ext = std::path::Path::new(file_path)
913 .extension()
914 .and_then(|e| e.to_str())
915 .unwrap_or("");
916 if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
917 return chunks;
918 }
919 }
920
921 let lines: Vec<&str> = content.lines().collect();
922 if lines.is_empty() {
923 return Vec::new();
924 }
925
926 let mut chunks = Vec::new();
927 let mut i = 0;
928
929 while i < lines.len() {
930 let trimmed = lines[i].trim();
931
932 if let Some((name, kind)) = detect_symbol(trimmed) {
933 let start = i;
934 let end = find_block_end(&lines, i);
935 let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
936 let token_count = tokenize(&block).len();
937
938 chunks.push(CodeChunk {
939 file_path: file_path.to_string(),
940 symbol_name: name,
941 kind,
942 start_line: start + 1,
943 end_line: end + 1,
944 content: block,
945 tokens: Vec::new(),
946 token_count,
947 });
948
949 i = end + 1;
950 } else {
951 i += 1;
952 }
953 }
954
955 if chunks.is_empty() && !content.is_empty() {
956 let bytes = content.as_bytes();
961 let rk_chunks = crate::core::rabin_karp::chunk(content);
962 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
963 for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
964 let end = (c.offset + c.length).min(bytes.len());
965 let slice = &bytes[c.offset..end];
966 let chunk_text = String::from_utf8_lossy(slice).into_owned();
967 let token_count = tokenize(&chunk_text).len();
968 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
969 let end_line = start_line + bytecount::count(slice, b'\n');
970 chunks.push(CodeChunk {
971 file_path: file_path.to_string(),
972 symbol_name: format!("{file_path}#chunk-{idx}"),
973 kind: ChunkKind::Module,
974 start_line,
975 end_line: end_line.max(start_line),
976 content: chunk_text,
977 tokens: Vec::new(),
978 token_count,
979 });
980 }
981 } else {
982 let token_count = tokenize(content).len();
983 let snippet = lines
984 .iter()
985 .take(50)
986 .copied()
987 .collect::<Vec<_>>()
988 .join("\n");
989 chunks.push(CodeChunk {
990 file_path: file_path.to_string(),
991 symbol_name: file_path.to_string(),
992 kind: ChunkKind::Module,
993 start_line: 1,
994 end_line: lines.len(),
995 content: snippet,
996 tokens: Vec::new(),
997 token_count,
998 });
999 }
1000 }
1001
1002 chunks
1003}
1004
1005fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
1006 let trimmed = line.trim();
1007
1008 let patterns: &[(&str, ChunkKind)] = &[
1009 ("pub async fn ", ChunkKind::Function),
1010 ("async fn ", ChunkKind::Function),
1011 ("pub fn ", ChunkKind::Function),
1012 ("fn ", ChunkKind::Function),
1013 ("pub struct ", ChunkKind::Struct),
1014 ("struct ", ChunkKind::Struct),
1015 ("pub enum ", ChunkKind::Struct),
1016 ("enum ", ChunkKind::Struct),
1017 ("impl ", ChunkKind::Impl),
1018 ("pub trait ", ChunkKind::Struct),
1019 ("trait ", ChunkKind::Struct),
1020 ("export function ", ChunkKind::Function),
1021 ("export async function ", ChunkKind::Function),
1022 ("export default function ", ChunkKind::Function),
1023 ("function ", ChunkKind::Function),
1024 ("async function ", ChunkKind::Function),
1025 ("export class ", ChunkKind::Class),
1026 ("class ", ChunkKind::Class),
1027 ("export interface ", ChunkKind::Struct),
1028 ("interface ", ChunkKind::Struct),
1029 ("def ", ChunkKind::Function),
1030 ("async def ", ChunkKind::Function),
1031 ("class ", ChunkKind::Class),
1032 ("func ", ChunkKind::Function),
1033 ];
1034
1035 for (prefix, kind) in patterns {
1036 if let Some(rest) = trimmed.strip_prefix(prefix) {
1037 let name: String = rest
1038 .chars()
1039 .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
1040 .take_while(|c| *c != '<')
1041 .collect();
1042 if !name.is_empty() {
1043 return Some((name, kind.clone()));
1044 }
1045 }
1046 }
1047
1048 None
1049}
1050
/// Finds the index of the last line of the block that starts at `start`.
///
/// Brackets (`{`/`(` open, `}`/`)` close) are counted from `start` onward and
/// the block ends on the line where the count first returns to zero. When no
/// opening bracket has been seen by the end of the fourth line, the block is
/// cut after the third line (`start + 2`). If neither rule fires, the result
/// is `start + 50`, capped at the last line.
///
/// NOTE(review): parentheses and braces share one counter, so a signature
/// such as `fn f() {` already balances at its `)` and the block ends on the
/// signature line — confirm this is intended.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    // Number of currently open brackets; zero at the end of a line means no
    // bracket has been opened yet, because a return to zero exits at once.
    let mut open_brackets = 0usize;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' | '(' => open_brackets += 1,
                '}' | ')' if open_brackets > 0 => {
                    open_brackets -= 1;
                    if open_brackets == 0 {
                        return i;
                    }
                }
                _ => {}
            }
        }

        if open_brackets == 0 && i > start + 2 {
            return i - 1;
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
1088
1089pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
1090 if results.is_empty() {
1091 return "No results found.".to_string();
1092 }
1093
1094 let mut out = String::new();
1095 for (i, r) in results.iter().enumerate() {
1096 let is_external = r.file_path.contains("://");
1097 if compact {
1098 if is_external {
1099 out.push_str(&format!(
1100 "{}. {:.2} [{:?}] {} — {}\n",
1101 i + 1,
1102 r.score,
1103 r.kind,
1104 r.file_path,
1105 r.symbol_name,
1106 ));
1107 } else {
1108 out.push_str(&format!(
1109 "{}. {:.2} {}:{}-{} {:?} {}\n",
1110 i + 1,
1111 r.score,
1112 r.file_path,
1113 r.start_line,
1114 r.end_line,
1115 r.kind,
1116 r.symbol_name,
1117 ));
1118 }
1119 } else if is_external {
1120 out.push_str(&format!(
1121 "\n--- Result {} (score: {:.2}) [{:?}] ---\n{} — {}\n{}\n",
1122 i + 1,
1123 r.score,
1124 r.kind,
1125 r.file_path,
1126 r.symbol_name,
1127 r.snippet,
1128 ));
1129 } else {
1130 out.push_str(&format!(
1131 "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
1132 i + 1,
1133 r.score,
1134 r.file_path,
1135 r.symbol_name,
1136 r.kind,
1137 r.start_line,
1138 r.end_line,
1139 r.snippet,
1140 ));
1141 }
1142 }
1143 out
1144}
1145
1146fn enrich_for_bm25(chunk: &CodeChunk) -> String {
1153 let path = Path::new(&chunk.file_path);
1154 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1155 let dir = path
1156 .parent()
1157 .and_then(|p| p.file_name())
1158 .and_then(|d| d.to_str())
1159 .unwrap_or("");
1160
1161 if stem.is_empty() {
1162 return chunk.content.clone();
1163 }
1164
1165 format!("{} {} {} {}", chunk.content, stem, stem, dir)
1166}
1167
1168#[cfg(test)]
1169mod tests {
1170 use super::*;
1171 use tempfile::tempdir;
1172
1173 #[cfg(unix)]
1174 use std::os::unix::fs::PermissionsExt;
1175
1176 #[test]
1177 fn tokenize_splits_code() {
1178 let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
1179 assert!(tokens.contains(&"calculate_total".to_string()));
1180 assert!(tokens.contains(&"items".to_string()));
1181 assert!(tokens.contains(&"Vec".to_string()));
1182 }
1183
1184 #[test]
1185 fn camel_case_splitting() {
1186 let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
1187 assert!(tokens.contains(&"calculateTotal".to_string()));
1188 assert!(tokens.contains(&"calculate".to_string()));
1189 assert!(tokens.contains(&"Total".to_string()));
1190 }
1191
1192 #[test]
1193 fn detect_rust_function() {
1194 let (name, kind) =
1195 detect_symbol("pub fn process_request(req: Request) -> Response {").unwrap();
1196 assert_eq!(name, "process_request");
1197 assert_eq!(kind, ChunkKind::Function);
1198 }
1199
1200 #[test]
1201 fn bm25_search_finds_relevant() {
1202 let mut index = BM25Index::new();
1203 index.add_chunk(CodeChunk {
1204 file_path: "auth.rs".into(),
1205 symbol_name: "validate_token".into(),
1206 kind: ChunkKind::Function,
1207 start_line: 1,
1208 end_line: 10,
1209 content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }".into(),
1210 tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
1211 token_count: 8,
1212 });
1213 index.add_chunk(CodeChunk {
1214 file_path: "db.rs".into(),
1215 symbol_name: "connect_database".into(),
1216 kind: ChunkKind::Function,
1217 start_line: 1,
1218 end_line: 5,
1219 content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
1220 tokens: tokenize("fn connect_database url str Pool create_pool url"),
1221 token_count: 7,
1222 });
1223 index.finalize();
1224
1225 let results = index.search("jwt token validation", 5);
1226 assert!(!results.is_empty());
1227 assert_eq!(results[0].symbol_name, "validate_token");
1228 }
1229
/// Two chunks that differ only in `file_path` are added in the order `b.rs`, `a.rs`;
/// the search is expected to return them as `a.rs`, `b.rs`.
#[test]
fn bm25_search_sorts_ties_deterministically() {
    let mut index = BM25Index::new();

    // Identical in every field except the path.
    let same_chunk_in = |path: &str| CodeChunk {
        file_path: path.into(),
        symbol_name: "same".into(),
        kind: ChunkKind::Function,
        start_line: 1,
        end_line: 1,
        content: "fn same() {}".into(),
        tokens: tokenize("same token"),
        token_count: 2,
    };

    // Insert in the opposite order to the one asserted below.
    for path in ["b.rs", "a.rs"] {
        index.add_chunk(same_chunk_in(path));
    }
    index.finalize();

    let ranked = index.search("same", 10);
    assert!(ranked.len() >= 2);
    assert_eq!(ranked[0].file_path, "a.rs");
    assert_eq!(ranked[1].file_path, "b.rs");
}
1262
/// A freshly built index is not stale; deleting the only indexed file makes it stale.
#[test]
fn bm25_index_is_stale_when_any_indexed_file_is_missing() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    let source_file = root.join("a.rs");
    std::fs::write(&source_file, "pub fn a() {}\n").expect("write a.rs");

    let index = BM25Index::build_from_directory(root);
    let stale_before_removal = bm25_index_looks_stale(&index, root);
    assert!(!stale_before_removal);

    std::fs::remove_file(&source_file).expect("remove a.rs");
    let stale_after_removal = bm25_index_looks_stale(&index, root);
    assert!(stale_after_removal);
}
1275
/// Builds an index over `a.rs` and `b.rs`, strips all permission bits from `a.rs`,
/// rewrites `b.rs`, and checks that an incremental rebuild still contains `a.rs` while
/// picking up the new content of `b.rs`.
#[test]
#[cfg(unix)]
fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    std::fs::write(root.join("a.rs"), "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
    std::fs::write(root.join("b.rs"), "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

    let initial = BM25Index::build_from_directory(root);
    for indexed in ["a.rs", "b.rs"] {
        assert!(initial.files.contains_key(indexed));
    }

    // Mode 000 on a.rs: presumably any attempt to re-read it during the rebuild would
    // fail, so its presence afterwards indicates it was carried over from `initial`.
    let a_path = root.join("a.rs");
    let mut locked = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
    locked.set_mode(0o000);
    std::fs::set_permissions(&a_path, locked).expect("chmod a.rs");

    // Change b.rs so the rebuild has something it must re-process.
    let b_rewritten = "pub fn b() { println!(\"B2\"); }\n";
    std::fs::write(root.join("b.rs"), b_rewritten).expect("rewrite b.rs");

    let rebuilt = BM25Index::rebuild_incremental(root, &initial);
    assert!(
        rebuilt.files.contains_key("a.rs"),
        "a.rs should be kept via reuse"
    );
    assert!(rebuilt.files.contains_key("b.rs"));

    let b_has_b2 = rebuilt
        .chunks
        .iter()
        .filter(|chunk| chunk.file_path == "b.rs")
        .any(|chunk| chunk.content.contains("B2"));
    assert!(b_has_b2, "b.rs should be re-read and re-chunked");

    // Best-effort restore of a normal mode; a failure here is deliberately ignored.
    let mut restored = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
    restored.set_mode(0o644);
    let _ = std::fs::set_permissions(&a_path, restored);
}
1317
/// With the cache limit set to 0 MB, loading an existing `bm25_index.json` must return
/// `None`, remove the original file, and leave a `.quarantined` copy next to it.
#[test]
fn load_quarantines_oversized_index() {
    let _env = crate::core::data_dir::test_env_lock();
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    let vectors = crate::core::index_namespace::vectors_dir(root);
    std::fs::create_dir_all(&vectors).expect("create vectors dir");

    let index_path = vectors.join("bm25_index.json");
    let quarantined_path = vectors.join("bm25_index.json.quarantined");
    // "0" yields a byte limit of zero, so the small file below counts as oversized.
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");
    std::fs::write(&index_path, r#"{"chunks":[]}"#).expect("write index");

    let loaded = BM25Index::load(root);
    assert!(loaded.is_none(), "oversized index should return None");
    assert!(
        !index_path.exists(),
        "original index should be removed after quarantine"
    );
    assert!(quarantined_path.exists(), "quarantined file should exist");

    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
}
1343
/// With the cache limit forced to 0 MB, `save` must not leave an index file on disk.
///
/// `LEAN_CTX_DATA_DIR` is pointed at a throwaway directory for the duration of the test.
/// Both environment overrides are cleared before the final assertion so that neither a
/// passing nor a failing run leaves them set for later tests; in particular the data-dir
/// override would otherwise keep naming a temp directory that no longer exists once
/// `data_dir` is dropped (assuming `tempdir` removes its directory on drop).
#[test]
fn save_refuses_oversized_output() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "0");

    let td = tempdir().expect("tempdir");
    let root = td.path();

    let mut index = BM25Index::new();
    index.add_chunk(CodeChunk {
        file_path: "a.rs".into(),
        symbol_name: "a".into(),
        kind: ChunkKind::Function,
        start_line: 1,
        end_line: 1,
        content: "fn a() {}".into(),
        tokens: tokenize("fn a"),
        token_count: 2,
    });
    index.finalize();

    // The return value is intentionally ignored: the test only cares whether a file
    // ended up on disk, not how `save` reports the refusal.
    let _ = index.save(root);
    let index_path = BM25Index::index_file_path(root);
    let persisted = index_path.exists();

    // Undo every override this test installed before asserting.
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    std::env::remove_var("LEAN_CTX_DATA_DIR");

    assert!(!persisted, "save should refuse to persist oversized index");
}
1376
/// Saving an index must write `project_root.txt` into the vectors directory, containing
/// the project root path.
#[test]
fn save_writes_project_root_marker() {
    let _env = crate::core::data_dir::test_env_lock();
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();
    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");

    // Make sure no size override from the environment is in effect for the save.
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
    let built = BM25Index::build_from_directory(root);
    built.save(root).expect("save");

    let marker = crate::core::index_namespace::vectors_dir(root).join("project_root.txt");
    assert!(marker.exists(), "project_root.txt marker should exist");
    let recorded_root = std::fs::read_to_string(&marker).expect("read marker");
    assert_eq!(recorded_root, root.to_string_lossy());
}
1394
/// Saves an index built from ten generated source files and checks that only
/// `bm25_index.bin.zst` is written and that loading it yields the same document and
/// chunk counts.
#[test]
fn save_load_roundtrip_uses_zstd() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    // Ten small modules, each with two functions.
    (0..10).for_each(|i| {
        let file_name = format!("mod{i}.rs");
        let source = format!(
            "pub fn handler_{i}() {{\n println!(\"hello\");\n}}\n\n\
             pub fn helper_{i}() {{\n println!(\"world\");\n}}\n"
        );
        std::fs::write(root.join(file_name), source).expect("write");
    });

    let built = BM25Index::build_from_directory(root);
    assert!(built.doc_count > 0, "should have indexed chunks");
    built.save(root).expect("save");

    let vectors = crate::core::index_namespace::vectors_dir(root);
    let compressed = vectors.join("bm25_index.bin.zst");
    let uncompressed = vectors.join("bm25_index.bin");
    assert!(compressed.exists(), "should write .bin.zst");
    assert!(!uncompressed.exists(), ".bin should be deleted");

    let reloaded = BM25Index::load(root).expect("load compressed index");
    assert_eq!(reloaded.doc_count, built.doc_count);
    assert_eq!(reloaded.chunks.len(), built.chunks.len());

    for var in ["LEAN_CTX_BM25_MAX_CACHE_MB", "LEAN_CTX_DATA_DIR"] {
        std::env::remove_var(var);
    }
}
1434
/// Writes an uncompressed bincode `bm25_index.bin` by hand and checks that `load`
/// returns it, creates `bm25_index.bin.zst`, and removes the `.bin` file.
#[test]
fn auto_migrate_bin_to_zst() {
    let _env = crate::core::data_dir::test_env_lock();
    let data_dir = tempdir().expect("data_dir");
    std::env::set_var("LEAN_CTX_DATA_DIR", data_dir.path());
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "512");
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    std::fs::write(root.join("a.rs"), "pub fn a() {}\n").expect("write");
    let built = BM25Index::build_from_directory(root);

    // Lay down the uncompressed file directly instead of going through `save`.
    let vectors = crate::core::index_namespace::vectors_dir(root);
    std::fs::create_dir_all(&vectors).expect("mkdir");
    let legacy_path = vectors.join("bm25_index.bin");
    let encoded =
        bincode::serde::encode_to_vec(&built, bincode::config::standard()).expect("encode");
    std::fs::write(&legacy_path, &encoded).expect("write bin");

    let migrated = BM25Index::load(root).expect("load should auto-migrate");
    assert_eq!(migrated.doc_count, built.doc_count);
    let compressed_path = vectors.join("bm25_index.bin.zst");
    assert!(compressed_path.exists(), ".bin.zst should be created");
    assert!(!legacy_path.exists(), ".bin should be removed");

    for var in ["LEAN_CTX_BM25_MAX_CACHE_MB", "LEAN_CTX_DATA_DIR"] {
        std::env::remove_var(var);
    }
}
1467
/// Files under `vendor/` and `dist/` must be absent from the listing while a top-level
/// `main.rs` is present.
#[test]
fn list_code_files_skips_default_vendor_ignores() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    std::fs::write(root.join("main.rs"), "pub fn main() {}\n").expect("write main");

    let vendor_dir = root.join("vendor/lib");
    std::fs::create_dir_all(&vendor_dir).expect("mkdir vendor");
    std::fs::write(root.join("vendor/lib/dep.rs"), "pub fn dep() {}\n").expect("write vendor");

    let dist_dir = root.join("dist");
    std::fs::create_dir_all(&dist_dir).expect("mkdir dist");
    std::fs::write(root.join("dist/bundle.js"), "function x() {}").expect("write dist");

    let listed = list_code_files(root);
    let has_main = listed.iter().any(|f| f == "main.rs");
    assert!(has_main, "main.rs should be included");

    let no_vendor = listed.iter().all(|f| !f.starts_with("vendor/"));
    assert!(
        no_vendor,
        "vendor/ files should be excluded by DEFAULT_BM25_IGNORES"
    );

    let no_dist = listed.iter().all(|f| !f.starts_with("dist/"));
    assert!(
        no_dist,
        "dist/ files should be excluded by DEFAULT_BM25_IGNORES"
    );
}
1493
/// The listing must never contain more than `MAX_BM25_FILES` entries.
///
/// Only ten files are created here, so this checks the bound itself rather than
/// exercising truncation.
#[test]
fn list_code_files_respects_max_files_cap() {
    let workspace = tempdir().expect("tempdir");
    let root = workspace.path();

    let fixtures = (0..10).map(|i| (format!("f{i}.rs"), format!("pub fn f{i}() {{}}\n")));
    for (file_name, source) in fixtures {
        std::fs::write(root.join(file_name), source).expect("write");
    }

    let listed_count = list_code_files(root).len();
    assert!(
        listed_count <= MAX_BM25_FILES,
        "file count should not exceed MAX_BM25_FILES"
    );
}
1514
/// `LEAN_CTX_BM25_MAX_CACHE_MB=64` must translate to a limit of 64 MiB expressed in bytes.
#[test]
fn max_bm25_cache_bytes_reads_env() {
    const BYTES_PER_MIB: u64 = 1024 * 1024;

    let _env = crate::core::data_dir::test_env_lock();
    std::env::set_var("LEAN_CTX_BM25_MAX_CACHE_MB", "64");
    let limit = max_bm25_cache_bytes();
    assert_eq!(limit, 64 * BYTES_PER_MIB);
    std::env::remove_var("LEAN_CTX_BM25_MAX_CACHE_MB");
}
1523}