1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6mod chunking;
7pub use chunking::*;
8#[cfg(test)]
9mod tests;
10
/// Upper bound on the number of files `list_code_files` collects for one index.
const MAX_BM25_FILES: usize = 5000;
/// Chunk count above which `BM25Index::save` logs a warning suggesting extra ignore patterns.
const CHUNK_COUNT_WARNING: usize = 50_000;
/// zstd compression level used whenever the index is compressed for persistence.
const ZSTD_LEVEL: i32 = 9;
14
/// Glob patterns that `list_code_files` always excludes, in addition to the
/// `extra_ignore_patterns` taken from the loaded config. Patterns are matched
/// against each file's path relative to the scan root.
const DEFAULT_BM25_IGNORES: &[&str] = &[
    "vendor/**",
    "dist/**",
    "build/**",
    "public/vendor/**",
    "public/js/**",
    "public/css/**",
    "public/build/**",
    ".next/**",
    ".nuxt/**",
    "__pycache__/**",
    "*.min.js",
    "*.min.css",
    "*.bundle.js",
    "*.chunk.js",
];
31
32fn max_bm25_cache_bytes() -> u64 {
33 let mb = std::env::var("LEAN_CTX_BM25_MAX_CACHE_MB")
37 .ok()
38 .and_then(|v| v.parse::<u64>().ok())
39 .unwrap_or_else(|| crate::core::config::Config::load().bm25_max_cache_mb_effective());
40 mb * 1024 * 1024
41}
42
/// Public accessor for the size ceiling, in bytes, applied when persisting an
/// index. Forwards to `max_bm25_cache_bytes`.
pub fn persist_ceiling_bytes() -> u64 {
    max_bm25_cache_bytes()
}
49
/// Non-error result of `BM25Index::save`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SaveOutcome {
    /// The compressed index was written to disk; carries its compressed size.
    Persisted { compressed_bytes: u64 },
    /// The compressed index exceeded the size limit and was not written.
    SkippedTooLarge {
        /// Size of the zstd-compressed index that was rejected.
        compressed_bytes: u64,
        /// The limit it was compared against.
        limit_bytes: u64,
    },
}
66
/// One indexed unit of text (a "document" in BM25 terms).
///
/// NOTE(review): this struct is persisted through serde/bincode by
/// `BM25Index::save`; field order presumably affects the on-disk format, so
/// confirm before reordering fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeChunk {
    /// Path of the file the chunk came from. For chunks built from a directory
    /// scan this is the path relative to the scan root.
    pub file_path: String,
    /// Name of the symbol this chunk represents.
    pub symbol_name: String,
    /// Category of the chunk.
    pub kind: ChunkKind,
    /// First line of the chunk in its file.
    pub start_line: usize,
    /// Last line of the chunk in its file.
    pub end_line: usize,
    /// Raw text of the chunk; search snippets are cut from this.
    pub content: String,
    /// Token list; defaults to empty when absent from serialized data.
    /// `BM25Index::add_chunk` always stores this empty and keeps only the count.
    #[serde(default)]
    pub tokens: Vec<String>,
    /// Number of tokens the chunk contributed to the index; used as the
    /// document length in BM25 scoring.
    pub token_count: usize,
}
79
/// Category of a `CodeChunk`.
///
/// NOTE(review): values of this enum are persisted via serde/bincode as part
/// of the index; bincode presumably encodes variants by position, so confirm
/// before reordering or inserting variants in the middle.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkKind {
    Function,
    Struct,
    Impl,
    Module,
    Class,
    Method,
    Other,
    Issue,
    PullRequest,
    WikiPage,
    DbSchema,
    ApiEndpoint,
    Ticket,
    ExternalOther,
}
98
/// Snapshot of a file's metadata taken when it was indexed. Two snapshots are
/// compared for equality to decide whether the file changed since indexing.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub struct IndexedFileState {
    /// Last-modified time in milliseconds since the Unix epoch.
    pub mtime_ms: u64,
    /// File size in bytes.
    pub size_bytes: u64,
}
104
105impl IndexedFileState {
106 fn from_path(path: &Path) -> Option<Self> {
107 let meta = path.metadata().ok()?;
108 let size_bytes = meta.len();
109 let mtime_ms = meta
110 .modified()
111 .ok()
112 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
113 .map(|d| d.as_millis() as u64)?;
114 Some(Self {
115 mtime_ms,
116 size_bytes,
117 })
118 }
119}
120
/// In-memory BM25 index over `CodeChunk`s, serializable for on-disk caching.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BM25Index {
    /// All indexed chunks; a chunk's position here is its document id.
    pub chunks: Vec<CodeChunk>,
    /// Lowercased term -> postings of `(chunk index, weight)`. `add_chunk`
    /// appends one posting with weight `1.0` per token occurrence.
    pub inverted: HashMap<String, Vec<(usize, f64)>>,
    /// Mean `token_count` across all chunks, set by `finalize`.
    pub avg_doc_len: f64,
    /// Number of chunks at the time of the last `finalize`.
    pub doc_count: usize,
    /// Lowercased term -> number of distinct chunks containing it.
    pub doc_freqs: HashMap<String, usize>,
    /// Relative file path -> file state recorded when the file was indexed.
    /// Defaults to empty when absent from serialized data.
    #[serde(default)]
    pub files: HashMap<String, IndexedFileState>,
}
131
/// One ranked hit returned by `BM25Index::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching chunk in `BM25Index::chunks`.
    pub chunk_idx: usize,
    /// Accumulated BM25 score across all query tokens (higher is better).
    pub score: f64,
    /// Copied from the matching chunk.
    pub file_path: String,
    /// Copied from the matching chunk.
    pub symbol_name: String,
    /// Copied from the matching chunk.
    pub kind: ChunkKind,
    /// Copied from the matching chunk.
    pub start_line: usize,
    /// Copied from the matching chunk.
    pub end_line: usize,
    /// The first five lines of the chunk's content, joined with `\n`.
    pub snippet: String,
}
143
/// BM25 `k1` parameter used in the scoring formula in `BM25Index::search`.
const BM25_K1: f64 = 1.2;
/// BM25 `b` parameter: how strongly the score is normalized by chunk length.
const BM25_B: f64 = 0.75;
146
/// The default index is the empty index produced by `BM25Index::new`.
impl Default for BM25Index {
    fn default() -> Self {
        Self::new()
    }
}
152
153impl BM25Index {
154 pub fn new() -> Self {
155 Self {
156 chunks: Vec::new(),
157 inverted: HashMap::new(),
158 avg_doc_len: 0.0,
159 doc_count: 0,
160 doc_freqs: HashMap::new(),
161 files: HashMap::new(),
162 }
163 }
164
165 pub fn memory_usage_bytes(&self) -> usize {
167 let chunks_size: usize = self
168 .chunks
169 .iter()
170 .map(|c| {
171 c.content.len()
172 + c.file_path.len()
173 + c.symbol_name.len()
174 + c.tokens.iter().map(String::len).sum::<usize>()
175 + 64
176 })
177 .sum();
178 let inverted_size: usize = self
179 .inverted
180 .iter()
181 .map(|(k, v)| k.len() + v.len() * 16 + 32)
182 .sum();
183 let files_size: usize = self.files.keys().map(|k| k.len() + 24).sum();
184 let freqs_size: usize = self.doc_freqs.keys().map(|k| k.len() + 16).sum();
185 chunks_size + inverted_size + files_size + freqs_size
186 }
187
188 pub fn unload(&mut self) {
190 let usage = self.memory_usage_bytes();
191 self.chunks = Vec::new();
192 self.inverted = HashMap::new();
193 self.doc_freqs = HashMap::new();
194 self.files = HashMap::new();
195 self.avg_doc_len = 0.0;
196 self.doc_count = 0;
197 tracing::info!(
198 "[bm25] unloaded index, freed ~{:.1}MB",
199 usage as f64 / 1_048_576.0
200 );
201 }
202
203 #[cfg(test)]
205 pub(crate) fn from_chunks_for_test(chunks: Vec<CodeChunk>) -> Self {
206 let mut index = Self::new();
207 for mut chunk in chunks {
208 if chunk.token_count == 0 {
209 chunk.token_count = tokenize(&chunk.content).len();
210 }
211 index.add_chunk(chunk);
212 }
213 index.finalize();
214 index
215 }
216
    /// Builds a fresh index by scanning `root`, with no pre-supplied file
    /// contents.
    pub fn build_from_directory(root: &Path) -> Self {
        Self::build_from_directory_inner(root, &HashMap::new())
    }
220
    /// Builds a fresh index by scanning `root`, taking file contents from
    /// `content_hint` (keyed by path relative to `root`) where available
    /// instead of reading them again.
    pub fn build_with_content_hint(root: &Path, content_hint: &HashMap<String, String>) -> Self {
        Self::build_from_directory_inner(root, content_hint)
    }
226
    /// Scans `root` and builds a finalized index from every file returned by
    /// `list_code_files`.
    ///
    /// Returns an empty index when `root` is rejected by the scan-root safety
    /// check. Files that cannot be stat'ed or read, or that exceed 2 MiB, are
    /// skipped. The build stops early (keeping what was indexed so far) when
    /// the memory guard reports pressure or requests an abort.
    ///
    /// For each file, content is taken from `content_hint` first, then from
    /// the shared content cache, and only then read from disk.
    fn build_from_directory_inner(root: &Path, content_hint: &HashMap<String, String>) -> Self {
        let root_str = root.to_string_lossy();
        if !super::graph_index::is_safe_scan_root_public(&root_str) {
            tracing::warn!("[bm25: scan aborted for unsafe root {root_str}]");
            return Self::new();
        }
        let mut index = Self::new();
        let files = list_code_files(root);
        const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
        let mut cache_hits = 0usize;

        for (i, rel) in files.iter().enumerate() {
            // The pressure probe runs only every 500th file; the abort flag is
            // checked on every file.
            if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
                tracing::warn!(
                    "[bm25: stopping build at file {i}/{} due to memory pressure]",
                    files.len()
                );
                break;
            }
            if crate::core::memory_guard::abort_requested() {
                tracing::warn!("[bm25: aborting build due to critical memory pressure]");
                break;
            }

            let abs = root.join(rel);
            let Some(state) = IndexedFileState::from_path(&abs) else {
                continue;
            };
            if state.size_bytes > MAX_FILE_SIZE_BYTES {
                continue;
            }

            // The content cache is keyed by path plus this (mtime, size) state.
            let cache_state = crate::core::content_cache::FileState {
                mtime_ms: state.mtime_ms,
                size_bytes: state.size_bytes,
            };
            let content = if let Some(cached) = content_hint.get(rel) {
                cache_hits += 1;
                std::borrow::Cow::Borrowed(cached.as_str())
            } else if let Some(arc) = crate::core::content_cache::get(&abs, cache_state) {
                cache_hits += 1;
                std::borrow::Cow::Owned(arc.to_string())
            } else {
                match std::fs::read_to_string(&abs) {
                    Ok(c) => {
                        // Populate the shared cache with what was just read.
                        crate::core::content_cache::insert(
                            &abs,
                            cache_state,
                            std::sync::Arc::from(c.as_str()),
                        );
                        std::borrow::Cow::Owned(c)
                    }
                    Err(_) => continue,
                }
            };

            // Sort chunks so their order within a file is deterministic.
            let mut chunks = extract_chunks(rel, &content);
            chunks.sort_by(|a, b| {
                a.start_line
                    .cmp(&b.start_line)
                    .then_with(|| a.end_line.cmp(&b.end_line))
                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
            });
            for chunk in chunks {
                index.add_chunk(chunk);
            }
            index.files.insert(rel.clone(), state);
        }

        if cache_hits > 0 {
            tracing::info!(
                "[bm25: reused {cache_hits}/{} file contents from graph scan cache]",
                files.len()
            );
        }

        index.finalize();
        index
    }
310
311 pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
312 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
313 for c in &prev.chunks {
314 old_by_file
315 .entry(c.file_path.clone())
316 .or_default()
317 .push(c.clone());
318 }
319 for v in old_by_file.values_mut() {
320 v.sort_by(|a, b| {
321 a.start_line
322 .cmp(&b.start_line)
323 .then_with(|| a.end_line.cmp(&b.end_line))
324 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
325 });
326 }
327
328 let mut index = Self::new();
329 let files = list_code_files(root);
330 const MAX_FILE_SIZE_BYTES: u64 = 2 * 1024 * 1024;
331
332 for (i, rel) in files.iter().enumerate() {
333 if i.is_multiple_of(500) && crate::core::memory_guard::is_under_pressure() {
334 tracing::warn!(
335 "[bm25: stopping incremental rebuild at file {i}/{} due to memory pressure]",
336 files.len()
337 );
338 break;
339 }
340
341 let abs = root.join(rel);
342 let Some(state) = IndexedFileState::from_path(&abs) else {
343 continue;
344 };
345
346 let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
347 if unchanged {
348 if let Some(chunks) = old_by_file.get(rel) {
349 if chunks.first().is_some_and(|c| !c.content.is_empty()) {
350 for chunk in chunks {
351 index.add_chunk(chunk.clone());
352 }
353 index.files.insert(rel.clone(), state);
354 continue;
355 }
356 }
357 }
358
359 if state.size_bytes > MAX_FILE_SIZE_BYTES {
360 continue;
361 }
362 if let Ok(content) = std::fs::read_to_string(&abs) {
363 let mut chunks = extract_chunks(rel, &content);
364 chunks.sort_by(|a, b| {
365 a.start_line
366 .cmp(&b.start_line)
367 .then_with(|| a.end_line.cmp(&b.end_line))
368 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
369 });
370 for chunk in chunks {
371 index.add_chunk(chunk);
372 }
373 index.files.insert(rel.clone(), state);
374 }
375 }
376
377 index.finalize();
378 index
379 }
380
381 fn add_chunk(&mut self, chunk: CodeChunk) {
382 let idx = self.chunks.len();
383
384 let enriched = enrich_for_bm25(&chunk);
385 let tokens = tokenize(&enriched);
386 for token in &tokens {
387 let lower = token.to_lowercase();
388 let postings = self.inverted.entry(lower.clone()).or_default();
389 if postings.last().map(|(last_idx, _)| *last_idx) != Some(idx) {
390 *self.doc_freqs.entry(lower).or_insert(0) += 1;
391 }
392 postings.push((idx, 1.0));
393 }
394
395 self.chunks.push(CodeChunk {
396 token_count: tokens.len(),
397 tokens: Vec::new(),
398 ..chunk
399 });
400 }
401
402 fn finalize(&mut self) {
403 self.doc_count = self.chunks.len();
404 if self.doc_count == 0 {
405 return;
406 }
407
408 let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
409 self.avg_doc_len = total_len as f64 / self.doc_count as f64;
410 }
411
412 pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
413 let query_tokens = tokenize(query);
414 if query_tokens.is_empty() || self.doc_count == 0 {
415 return Vec::new();
416 }
417
418 let n = self.chunks.len();
421 let mut scores = vec![0.0f64; n];
422 let mut touched = Vec::with_capacity(n.min(256));
423
424 for token in &query_tokens {
425 let lower = token.to_lowercase();
426 let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
427 if df == 0.0 {
428 continue;
429 }
430
431 let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
432
433 if let Some(postings) = self.inverted.get(&lower) {
434 for &(idx, weight) in postings {
435 let doc_len = self.chunks[idx].token_count as f64;
436 let norm_len = doc_len / self.avg_doc_len.max(1.0);
437 let bm25 = idf * (weight * (BM25_K1 + 1.0))
438 / (weight + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
439
440 if scores[idx] == 0.0 {
441 touched.push(idx);
442 }
443 scores[idx] += bm25;
444 }
445 }
446 }
447
448 let mut results: Vec<SearchResult> = touched
449 .iter()
450 .filter(|&&idx| scores[idx] > 0.0)
451 .map(|&idx| {
452 let chunk = &self.chunks[idx];
453 let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
454 SearchResult {
455 chunk_idx: idx,
456 score: scores[idx],
457 file_path: chunk.file_path.clone(),
458 symbol_name: chunk.symbol_name.clone(),
459 kind: chunk.kind.clone(),
460 start_line: chunk.start_line,
461 end_line: chunk.end_line,
462 snippet,
463 }
464 })
465 .collect();
466
467 results.sort_by(|a, b| {
468 b.score
469 .partial_cmp(&a.score)
470 .unwrap_or(std::cmp::Ordering::Equal)
471 .then_with(|| a.file_path.cmp(&b.file_path))
472 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
473 .then_with(|| a.start_line.cmp(&b.start_line))
474 .then_with(|| a.end_line.cmp(&b.end_line))
475 });
476 results.truncate(top_k);
477 results
478 }
479
    /// Serializes the index with bincode, compresses it with zstd and writes
    /// it into the index directory for `root`.
    ///
    /// Returns `SaveOutcome::SkippedTooLarge` without writing anything when
    /// the compressed size exceeds `max_bm25_cache_bytes()`; otherwise
    /// `SaveOutcome::Persisted`.
    ///
    /// # Errors
    /// Returns an `io::Error` when the directory cannot be created, when
    /// encoding or compression fails, or when writing or renaming the index
    /// file fails.
    pub fn save(&self, root: &Path) -> std::io::Result<SaveOutcome> {
        if self.chunks.len() > CHUNK_COUNT_WARNING {
            tracing::warn!(
                "[bm25] index has {} chunks (threshold {}), consider adding extra_ignore_patterns",
                self.chunks.len(),
                CHUNK_COUNT_WARNING
            );
        }

        let dir = index_dir(root);
        std::fs::create_dir_all(&dir)?;
        let data = bincode::serde::encode_to_vec(self, bincode::config::standard())
            .map_err(|e| std::io::Error::other(e.to_string()))?;

        let compressed = zstd::encode_all(data.as_slice(), ZSTD_LEVEL)
            .map_err(|e| std::io::Error::other(format!("zstd compress: {e}")))?;
        let compressed_bytes = compressed.len() as u64;

        // The size limit applies to the compressed artifact, checked before
        // anything is written.
        let max_bytes = max_bm25_cache_bytes();
        if compressed_bytes > max_bytes {
            tracing::warn!(
                "[bm25] compressed index too large ({:.1} MB, limit {:.0} MB), refusing to persist: {}",
                compressed_bytes as f64 / 1_048_576.0,
                max_bytes / (1024 * 1024),
                dir.display()
            );
            return Ok(SaveOutcome::SkippedTooLarge {
                compressed_bytes,
                limit_bytes: max_bytes,
            });
        }

        tracing::info!(
            "[bm25] index: {:.1} MB bincode → {:.1} MB zstd ({:.0}% saved)",
            data.len() as f64 / 1_048_576.0,
            compressed_bytes as f64 / 1_048_576.0,
            (1.0 - compressed_bytes as f64 / data.len().max(1) as f64) * 100.0
        );

        // Write to a temp file and rename it over the target, so the target
        // path never holds a partially written index.
        let target = dir.join("bm25_index.bin.zst");
        let tmp = dir.join("bm25_index.bin.zst.tmp");
        std::fs::write(&tmp, &compressed)?;
        std::fs::rename(&tmp, &target)?;

        // Best-effort removal of the older uncompressed/JSON index files.
        let _ = std::fs::remove_file(dir.join("bm25_index.bin"));
        let _ = std::fs::remove_file(dir.join("bm25_index.json"));

        // Best-effort marker recording which project root this directory belongs to.
        let _ = std::fs::write(
            dir.join("project_root.txt"),
            root.to_string_lossy().as_bytes(),
        );

        Ok(SaveOutcome::Persisted { compressed_bytes })
    }
539
540 pub fn load(root: &Path) -> Option<Self> {
541 let dir = index_dir(root);
542 let max_bytes = max_bm25_cache_bytes();
543
544 let zst_path = dir.join("bm25_index.bin.zst");
545 if zst_path.exists() {
546 let meta = std::fs::metadata(&zst_path).ok()?;
547 if meta.len() > max_bytes {
548 tracing::warn!(
549 "[bm25] compressed index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
550 meta.len() as f64 / 1_073_741_824.0,
551 max_bytes / (1024 * 1024),
552 zst_path.display()
553 );
554 let quarantined = zst_path.with_extension("zst.quarantined");
555 let _ = std::fs::rename(&zst_path, &quarantined);
556 return None;
557 }
558 let compressed = std::fs::read(&zst_path).ok()?;
559 let max_decompressed = max_bytes * 20; let data = bounded_zstd_decode(&compressed, max_decompressed)?;
561 let (idx, _): (Self, _) =
562 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
563 return Some(idx);
564 }
565
566 let bin_path = dir.join("bm25_index.bin");
567 if bin_path.exists() {
568 let meta = std::fs::metadata(&bin_path).ok()?;
569 if meta.len() > max_bytes {
570 tracing::warn!(
571 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
572 meta.len() as f64 / 1_073_741_824.0,
573 max_bytes / (1024 * 1024),
574 bin_path.display()
575 );
576 let quarantined = bin_path.with_extension("bin.quarantined");
577 let _ = std::fs::rename(&bin_path, &quarantined);
578 return None;
579 }
580 let data = std::fs::read(&bin_path).ok()?;
581 let (idx, _): (Self, _) =
582 bincode::serde::decode_from_slice(&data, bincode::config::standard()).ok()?;
583 if let Ok(compressed) = zstd::encode_all(data.as_slice(), ZSTD_LEVEL) {
585 let zst_tmp = zst_path.with_extension("zst.tmp");
586 if std::fs::write(&zst_tmp, &compressed).is_ok()
587 && std::fs::rename(&zst_tmp, &zst_path).is_ok()
588 {
589 tracing::info!(
590 "[bm25] migrated {:.1} MB → {:.1} MB zstd",
591 data.len() as f64 / 1_048_576.0,
592 compressed.len() as f64 / 1_048_576.0
593 );
594 let _ = std::fs::remove_file(&bin_path);
595 }
596 }
597 return Some(idx);
598 }
599
600 let json_path = dir.join("bm25_index.json");
601 if json_path.exists() {
602 let meta = std::fs::metadata(&json_path).ok()?;
603 if meta.len() > max_bytes {
604 tracing::warn!(
605 "[bm25] index too large ({:.1} GB, limit {:.0} MB), quarantining: {}",
606 meta.len() as f64 / 1_073_741_824.0,
607 max_bytes / (1024 * 1024),
608 json_path.display()
609 );
610 let quarantined = json_path.with_extension("json.quarantined");
611 let _ = std::fs::rename(&json_path, &quarantined);
612 return None;
613 }
614 let data = std::fs::read_to_string(&json_path).ok()?;
615 return serde_json::from_str(&data).ok();
616 }
617
618 None
619 }
620
    /// Loads the persisted index for `root`, rebuilding it when it is missing
    /// or stale. Uses the full staleness check.
    pub fn load_or_build(root: &Path) -> Self {
        Self::load_or_build_inner(root, false)
    }
624
    /// Like `load_or_build`, but uses the sampled ("fast") staleness check
    /// instead of checking every tracked file.
    pub fn load_or_build_fast(root: &Path) -> Self {
        Self::load_or_build_inner(root, true)
    }
630
631 fn load_or_build_inner(root: &Path, fast_stale: bool) -> Self {
632 if !is_safe_bm25_root(root) {
633 return Self::default();
634 }
635 if let Some(idx) = Self::load(root) {
636 let stale = if fast_stale {
637 bm25_index_looks_stale_fast(&idx, root)
638 } else {
639 bm25_index_looks_stale(&idx, root)
640 };
641 if !stale {
642 return idx;
643 }
644 tracing::debug!(
645 "[bm25_index: stale index detected for {}; rebuilding]",
646 root.display()
647 );
648 let rebuilt = if idx.files.is_empty() {
649 Self::build_from_directory(root)
650 } else {
651 Self::rebuild_incremental(root, &idx)
652 };
653 let _ = rebuilt.save(root);
654 return rebuilt;
655 }
656
657 let built = Self::build_from_directory(root);
658 let _ = built.save(root);
659 built
660 }
661
662 pub fn index_file_path(root: &Path) -> PathBuf {
663 let dir = index_dir(root);
664 let zst = dir.join("bm25_index.bin.zst");
665 if zst.exists() {
666 return zst;
667 }
668 let bin = dir.join("bm25_index.bin");
669 if bin.exists() {
670 return bin;
671 }
672 dir.join("bm25_index.json")
673 }
674
675 pub fn ingest_content_chunks(
679 &mut self,
680 chunks: impl IntoIterator<Item = super::content_chunk::ContentChunk>,
681 ) -> usize {
682 let mut count = 0usize;
683 for cc in chunks {
684 self.add_chunk(cc.into());
685 count += 1;
686 }
687 if count > 0 {
688 self.finalize();
689 }
690 count
691 }
692
693 pub fn external_chunk_count(&self) -> usize {
695 self.chunks
696 .iter()
697 .filter(|c| c.file_path.contains("://"))
698 .count()
699 }
700}
701
/// Returns whether `root` passes the scan-root safety check in `graph_index`.
fn is_safe_bm25_root(root: &Path) -> bool {
    super::graph_index::is_safe_scan_root_public(&root.to_string_lossy())
}
705
/// Full staleness check: compares every tracked file against disk and also
/// looks for new files under `root`.
fn bm25_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, false)
}
709
/// Fast staleness check: compares only a sample of the tracked files against
/// disk and does not look for new files.
pub fn bm25_index_looks_stale_fast(index: &BM25Index, root: &Path) -> bool {
    bm25_index_looks_stale_inner(index, root, true)
}
715
716fn bm25_index_looks_stale_inner(index: &BM25Index, root: &Path, fast: bool) -> bool {
717 if index.chunks.is_empty() {
718 return false;
719 }
720
721 if index.files.is_empty() {
722 let mut seen = std::collections::HashSet::<&str>::new();
723 for chunk in &index.chunks {
724 let rel = chunk.file_path.trim_start_matches(['/', '\\']);
725 if rel.is_empty() {
726 continue;
727 }
728 if !seen.insert(rel) {
729 continue;
730 }
731 if !root.join(rel).exists() {
732 return true;
733 }
734 }
735 return false;
736 }
737
738 if fast {
739 let sample_size = index.files.len().min(SENTINEL_SAMPLE_SIZE);
740 let step = if index.files.len() > sample_size {
741 index.files.len() / sample_size
742 } else {
743 1
744 };
745 for (i, (rel, old_state)) in index.files.iter().enumerate() {
746 if i % step != 0 {
747 continue;
748 }
749 let abs = root.join(rel);
750 if !abs.exists() {
751 return true;
752 }
753 let Some(cur) = IndexedFileState::from_path(&abs) else {
754 return true;
755 };
756 if &cur != old_state {
757 return true;
758 }
759 }
760 return false;
761 }
762
763 for (rel, old_state) in &index.files {
764 let abs = root.join(rel);
765 if !abs.exists() {
766 return true;
767 }
768 let Some(cur) = IndexedFileState::from_path(&abs) else {
769 return true;
770 };
771 if &cur != old_state {
772 return true;
773 }
774 }
775
776 for rel in list_code_files(root) {
777 if !index.files.contains_key(&rel) {
778 return true;
779 }
780 }
781
782 false
783}
784
/// Target number of tracked files sampled by the fast staleness check; the
/// sampling stride is `tracked_files / SENTINEL_SAMPLE_SIZE`.
const SENTINEL_SAMPLE_SIZE: usize = 10;
786
787fn bounded_zstd_decode(compressed: &[u8], max_bytes: u64) -> Option<Vec<u8>> {
788 use std::io::Read;
789 let mut decoder = zstd::Decoder::new(compressed).ok()?;
790 let mut buf = Vec::new();
791 let mut chunk = vec![0u8; 65536];
792 let mut total = 0u64;
793 loop {
794 let n = decoder.read(&mut chunk).ok()?;
795 if n == 0 {
796 break;
797 }
798 total += n as u64;
799 if total > max_bytes {
800 tracing::warn!(
801 "[bm25] decompressed index exceeds limit ({:.0} MB > {:.0} MB), aborting load",
802 total as f64 / (1024.0 * 1024.0),
803 max_bytes as f64 / (1024.0 * 1024.0)
804 );
805 return None;
806 }
807 buf.extend_from_slice(&chunk[..n]);
808 }
809 Some(buf)
810}
811
/// Directory where the index files for `root` are stored, as provided by
/// `index_namespace::vectors_dir`.
fn index_dir(root: &Path) -> PathBuf {
    crate::core::index_namespace::vectors_dir(root)
}
815
816fn list_code_files(root: &Path) -> Vec<String> {
817 let walker = ignore::WalkBuilder::new(root)
818 .hidden(true)
819 .git_ignore(true)
820 .git_global(true)
821 .git_exclude(true)
822 .max_depth(Some(20))
823 .filter_entry(crate::core::cloud_files::keep_entry)
824 .build();
825
826 let cfg = crate::core::config::Config::load();
827 let mut ignore_patterns: Vec<glob::Pattern> = DEFAULT_BM25_IGNORES
828 .iter()
829 .filter_map(|p| glob::Pattern::new(p).ok())
830 .collect();
831 ignore_patterns.extend(
832 cfg.extra_ignore_patterns
833 .iter()
834 .filter_map(|p| glob::Pattern::new(p).ok()),
835 );
836
837 let mut files: Vec<String> = Vec::new();
838 for entry in walker.flatten() {
839 let path = entry.path();
840 if !path.is_file() {
841 continue;
842 }
843 if !is_code_file(path) {
844 continue;
845 }
846 let rel = path
847 .strip_prefix(root)
848 .unwrap_or(path)
849 .to_string_lossy()
850 .to_string();
851 if rel.is_empty() {
852 continue;
853 }
854 if ignore_patterns.iter().any(|p| p.matches(&rel)) {
855 continue;
856 }
857 if files.len() >= MAX_BM25_FILES {
858 tracing::warn!(
859 "[bm25] file cap reached ({MAX_BM25_FILES}), skipping remaining files in {}",
860 root.display()
861 );
862 break;
863 }
864 files.push(rel);
865 }
866
867 files.sort();
868 files.dedup();
869 files
870}
871
/// Returns `true` when `path` has one of the recognized source-code file
/// extensions. The comparison ignores case; a path with no extension, or
/// with an extension that is not valid UTF-8, is not a code file.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(raw_ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = raw_ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}