Skip to main content

lean_ctx/core/
vector_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct CodeChunk {
9    pub file_path: String,
10    pub symbol_name: String,
11    pub kind: ChunkKind,
12    pub start_line: usize,
13    pub end_line: usize,
14    pub content: String,
15    pub tokens: Vec<String>,
16    pub token_count: usize,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
20pub enum ChunkKind {
21    Function,
22    Struct,
23    Impl,
24    Module,
25    Class,
26    Method,
27    Other,
28}
29
30#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
31pub struct IndexedFileState {
32    pub mtime_ms: u64,
33    pub size_bytes: u64,
34}
35
36impl IndexedFileState {
37    fn from_path(path: &Path) -> Option<Self> {
38        let meta = path.metadata().ok()?;
39        let size_bytes = meta.len();
40        let mtime_ms = meta
41            .modified()
42            .ok()
43            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
44            .map(|d| d.as_millis() as u64)?;
45        Some(Self {
46            mtime_ms,
47            size_bytes,
48        })
49    }
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct BM25Index {
54    pub chunks: Vec<CodeChunk>,
55    pub inverted: HashMap<String, Vec<(usize, f64)>>,
56    pub avg_doc_len: f64,
57    pub doc_count: usize,
58    pub doc_freqs: HashMap<String, usize>,
59    #[serde(default)]
60    pub files: HashMap<String, IndexedFileState>,
61}
62
63#[derive(Debug, Clone, Serialize, Deserialize)]
64pub struct SearchResult {
65    pub chunk_idx: usize,
66    pub score: f64,
67    pub file_path: String,
68    pub symbol_name: String,
69    pub kind: ChunkKind,
70    pub start_line: usize,
71    pub end_line: usize,
72    pub snippet: String,
73}
74
/// BM25 `k1`: controls how quickly repeated occurrences of a term saturate.
const BM25_K1: f64 = 1.2;
/// BM25 `b`: strength of the document-length normalisation.
const BM25_B: f64 = 0.75;
77
78impl Default for BM25Index {
79    fn default() -> Self {
80        Self::new()
81    }
82}
83
84impl BM25Index {
85    pub fn new() -> Self {
86        Self {
87            chunks: Vec::new(),
88            inverted: HashMap::new(),
89            avg_doc_len: 0.0,
90            doc_count: 0,
91            doc_freqs: HashMap::new(),
92            files: HashMap::new(),
93        }
94    }
95
96    pub fn build_from_directory(root: &Path) -> Self {
97        let mut index = Self::new();
98        let files = list_code_files(root);
99        for rel in files {
100            let abs = root.join(&rel);
101            let Some(state) = IndexedFileState::from_path(&abs) else {
102                continue;
103            };
104            if let Ok(content) = std::fs::read_to_string(&abs) {
105                let mut chunks = extract_chunks(&rel, &content);
106                chunks.sort_by(|a, b| {
107                    a.start_line
108                        .cmp(&b.start_line)
109                        .then_with(|| a.end_line.cmp(&b.end_line))
110                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
111                });
112                for chunk in chunks {
113                    index.add_chunk(chunk);
114                }
115                index.files.insert(rel, state);
116            }
117        }
118
119        index.finalize();
120        index
121    }
122
123    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
124        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
125        for c in &prev.chunks {
126            old_by_file
127                .entry(c.file_path.clone())
128                .or_default()
129                .push(c.clone());
130        }
131        for v in old_by_file.values_mut() {
132            v.sort_by(|a, b| {
133                a.start_line
134                    .cmp(&b.start_line)
135                    .then_with(|| a.end_line.cmp(&b.end_line))
136                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
137            });
138        }
139
140        let mut index = Self::new();
141        let files = list_code_files(root);
142        for rel in files {
143            let abs = root.join(&rel);
144            let Some(state) = IndexedFileState::from_path(&abs) else {
145                continue;
146            };
147
148            let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
149            if unchanged {
150                if let Some(chunks) = old_by_file.get(&rel) {
151                    for chunk in chunks {
152                        index.add_chunk(chunk.clone());
153                    }
154                    index.files.insert(rel, state);
155                    continue;
156                }
157            }
158
159            if let Ok(content) = std::fs::read_to_string(&abs) {
160                let mut chunks = extract_chunks(&rel, &content);
161                chunks.sort_by(|a, b| {
162                    a.start_line
163                        .cmp(&b.start_line)
164                        .then_with(|| a.end_line.cmp(&b.end_line))
165                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
166                });
167                for chunk in chunks {
168                    index.add_chunk(chunk);
169                }
170                index.files.insert(rel, state);
171            }
172        }
173
174        index.finalize();
175        index
176    }
177
178    fn add_chunk(&mut self, chunk: CodeChunk) {
179        let idx = self.chunks.len();
180
181        for token in &chunk.tokens {
182            let lower = token.to_lowercase();
183            self.inverted.entry(lower).or_default().push((idx, 1.0));
184        }
185
186        self.chunks.push(chunk);
187    }
188
189    fn finalize(&mut self) {
190        self.doc_count = self.chunks.len();
191        if self.doc_count == 0 {
192            return;
193        }
194
195        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
196        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
197
198        self.doc_freqs.clear();
199        for (term, postings) in &self.inverted {
200            let unique_docs: std::collections::HashSet<usize> =
201                postings.iter().map(|(idx, _)| *idx).collect();
202            self.doc_freqs.insert(term.clone(), unique_docs.len());
203        }
204    }
205
206    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
207        let query_tokens = tokenize(query);
208        if query_tokens.is_empty() || self.doc_count == 0 {
209            return Vec::new();
210        }
211
212        let mut scores: HashMap<usize, f64> = HashMap::new();
213
214        for token in &query_tokens {
215            let lower = token.to_lowercase();
216            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
217            if df == 0.0 {
218                continue;
219            }
220
221            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
222
223            if let Some(postings) = self.inverted.get(&lower) {
224                let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
225                for (idx, weight) in postings {
226                    *doc_tfs.entry(*idx).or_insert(0.0) += weight;
227                }
228
229                for (doc_idx, tf) in &doc_tfs {
230                    let doc_len = self.chunks[*doc_idx].token_count as f64;
231                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
232                    let bm25 = idf * (tf * (BM25_K1 + 1.0))
233                        / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
234
235                    *scores.entry(*doc_idx).or_insert(0.0) += bm25;
236                }
237            }
238        }
239
240        let mut results: Vec<SearchResult> = scores
241            .into_iter()
242            .map(|(idx, score)| {
243                let chunk = &self.chunks[idx];
244                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
245                SearchResult {
246                    chunk_idx: idx,
247                    score,
248                    file_path: chunk.file_path.clone(),
249                    symbol_name: chunk.symbol_name.clone(),
250                    kind: chunk.kind.clone(),
251                    start_line: chunk.start_line,
252                    end_line: chunk.end_line,
253                    snippet,
254                }
255            })
256            .collect();
257
258        results.sort_by(|a, b| {
259            b.score
260                .partial_cmp(&a.score)
261                .unwrap_or(std::cmp::Ordering::Equal)
262                .then_with(|| a.file_path.cmp(&b.file_path))
263                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
264                .then_with(|| a.start_line.cmp(&b.start_line))
265                .then_with(|| a.end_line.cmp(&b.end_line))
266        });
267        results.truncate(top_k);
268        results
269    }
270
271    pub fn save(&self, root: &Path) -> std::io::Result<()> {
272        let dir = index_dir(root);
273        std::fs::create_dir_all(&dir)?;
274        let data = serde_json::to_string(self).map_err(std::io::Error::other)?;
275        let target = dir.join("bm25_index.json");
276        let tmp = dir.join("bm25_index.json.tmp");
277        std::fs::write(&tmp, data)?;
278        std::fs::rename(&tmp, &target)?;
279        Ok(())
280    }
281
282    pub fn load(root: &Path) -> Option<Self> {
283        let path = index_dir(root).join("bm25_index.json");
284        let data = std::fs::read_to_string(path).ok()?;
285        serde_json::from_str(&data).ok()
286    }
287
288    pub fn load_or_build(root: &Path) -> Self {
289        if let Some(idx) = Self::load(root) {
290            if !vector_index_looks_stale(&idx, root) {
291                return idx;
292            }
293            tracing::warn!(
294                "[vector_index: stale index detected for {}; rebuilding]",
295                root.display()
296            );
297            let rebuilt = if idx.files.is_empty() {
298                Self::build_from_directory(root)
299            } else {
300                Self::rebuild_incremental(root, &idx)
301            };
302            let _ = rebuilt.save(root);
303            return rebuilt;
304        }
305
306        let built = Self::build_from_directory(root);
307        let _ = built.save(root);
308        built
309    }
310
311    pub fn index_file_path(root: &Path) -> PathBuf {
312        index_dir(root).join("bm25_index.json")
313    }
314}
315
316fn vector_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
317    if index.chunks.is_empty() {
318        return false;
319    }
320
321    if index.files.is_empty() {
322        // Legacy index (pre file-state tracking): only detect missing files.
323        let mut seen = std::collections::HashSet::<&str>::new();
324        for chunk in &index.chunks {
325            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
326            if rel.is_empty() {
327                continue;
328            }
329            if !seen.insert(rel) {
330                continue;
331            }
332            if !root.join(rel).exists() {
333                return true;
334            }
335        }
336        return false;
337    }
338
339    // Missing or modified tracked files.
340    for (rel, old_state) in &index.files {
341        let abs = root.join(rel);
342        if !abs.exists() {
343            return true;
344        }
345        let Some(cur) = IndexedFileState::from_path(&abs) else {
346            return true;
347        };
348        if &cur != old_state {
349            return true;
350        }
351    }
352
353    // New files (present on disk but not in index).
354    for rel in list_code_files(root) {
355        if !index.files.contains_key(&rel) {
356            return true;
357        }
358    }
359
360    false
361}
362
363fn index_dir(root: &Path) -> PathBuf {
364    crate::core::index_namespace::vectors_dir(root)
365}
366
367fn list_code_files(root: &Path) -> Vec<String> {
368    let walker = ignore::WalkBuilder::new(root)
369        .hidden(true)
370        .git_ignore(true)
371        .git_global(true)
372        .git_exclude(true)
373        .build();
374
375    let mut files: Vec<String> = Vec::new();
376    for entry in walker.flatten() {
377        let path = entry.path();
378        if !path.is_file() {
379            continue;
380        }
381        if !is_code_file(path) {
382            continue;
383        }
384        let rel = path
385            .strip_prefix(root)
386            .unwrap_or(path)
387            .to_string_lossy()
388            .to_string();
389        if rel.is_empty() {
390            continue;
391        }
392        files.push(rel);
393    }
394
395    files.sort();
396    files.dedup();
397    files
398}
399
/// Returns `true` when the extension of `path` (compared case-insensitively)
/// is one of the source-file extensions the index understands.
///
/// Paths without an extension, or with a non-UTF-8 extension, are rejected.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    CODE_EXTENSIONS.contains(&ext.to_lowercase().as_str())
}
436
437fn tokenize(text: &str) -> Vec<String> {
438    let mut tokens = Vec::new();
439    let mut current = String::new();
440
441    for ch in text.chars() {
442        if ch.is_alphanumeric() || ch == '_' {
443            current.push(ch);
444        } else {
445            if current.len() >= 2 {
446                tokens.push(current.clone());
447            }
448            current.clear();
449        }
450    }
451    if current.len() >= 2 {
452        tokens.push(current);
453    }
454
455    split_camel_case_tokens(&tokens)
456}
457
458pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
459    tokenize(text)
460}
461
/// Expands each token with its camel-case parts.
///
/// Every input token is kept as is and, if it contains word boundaries, is
/// followed by its parts (parts shorter than two bytes are dropped). A
/// boundary lies before an uppercase character that either
///
/// * follows a non-uppercase character (`parse|HTTP`, `calculate|Total`), or
/// * is followed by a lowercase character, i.e. it ends an acronym
///   (`HTTP|Server`).
///
/// All-uppercase tokens such as `HTTP` therefore produce no parts.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    /// Pushes `part` as a token unless it is shorter than two bytes.
    fn push_part(out: &mut Vec<String>, part: &[char]) {
        let part: String = part.iter().collect();
        if part.len() >= 2 {
            out.push(part);
        }
    }

    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        result.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let mut start = 0;
        for i in 1..chars.len() {
            if !chars[i].is_uppercase() {
                continue;
            }
            let follows_non_upper = !chars[i - 1].is_uppercase();
            let ends_acronym = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if follows_non_upper || ends_acronym {
                push_part(&mut result, &chars[start..i]);
                start = i;
            }
        }
        // The tail is a part only if at least one boundary was found;
        // otherwise it would just repeat the whole token.
        if start > 0 {
            push_part(&mut result, &chars[start..]);
        }
    }
    result
}
486
487fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
488    #[cfg(feature = "tree-sitter")]
489    {
490        let ext = std::path::Path::new(file_path)
491            .extension()
492            .and_then(|e| e.to_str())
493            .unwrap_or("");
494        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
495            return chunks;
496        }
497    }
498
499    let lines: Vec<&str> = content.lines().collect();
500    if lines.is_empty() {
501        return Vec::new();
502    }
503
504    let mut chunks = Vec::new();
505    let mut i = 0;
506
507    while i < lines.len() {
508        let trimmed = lines[i].trim();
509
510        if let Some((name, kind)) = detect_symbol(trimmed) {
511            let start = i;
512            let end = find_block_end(&lines, i);
513            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
514            let tokens = tokenize(&block);
515            let token_count = tokens.len();
516
517            chunks.push(CodeChunk {
518                file_path: file_path.to_string(),
519                symbol_name: name,
520                kind,
521                start_line: start + 1,
522                end_line: end + 1,
523                content: block,
524                tokens,
525                token_count,
526            });
527
528            i = end + 1;
529        } else {
530            i += 1;
531        }
532    }
533
534    if chunks.is_empty() && !content.is_empty() {
535        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
536        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
537        //
538        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
539        let bytes = content.as_bytes();
540        let rk_chunks = crate::core::rabin_karp::chunk(content);
541        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
542            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
543                let end = (c.offset + c.length).min(bytes.len());
544                let slice = &bytes[c.offset..end];
545                let chunk_text = String::from_utf8_lossy(slice).into_owned();
546                let tokens = tokenize(&chunk_text);
547                let token_count = tokens.len();
548                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
549                let end_line = start_line + bytecount::count(slice, b'\n');
550                chunks.push(CodeChunk {
551                    file_path: file_path.to_string(),
552                    symbol_name: format!("{file_path}#chunk-{idx}"),
553                    kind: ChunkKind::Module,
554                    start_line,
555                    end_line: end_line.max(start_line),
556                    content: chunk_text,
557                    tokens,
558                    token_count,
559                });
560            }
561        } else {
562            let tokens = tokenize(content);
563            let token_count = tokens.len();
564            let snippet = lines
565                .iter()
566                .take(50)
567                .copied()
568                .collect::<Vec<_>>()
569                .join("\n");
570            chunks.push(CodeChunk {
571                file_path: file_path.to_string(),
572                symbol_name: file_path.to_string(),
573                kind: ChunkKind::Module,
574                start_line: 1,
575                end_line: lines.len(),
576                content: snippet,
577                tokens,
578                token_count,
579            });
580        }
581    }
582
583    chunks
584}
585
586fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
587    let trimmed = line.trim();
588
589    let patterns: &[(&str, ChunkKind)] = &[
590        ("pub async fn ", ChunkKind::Function),
591        ("async fn ", ChunkKind::Function),
592        ("pub fn ", ChunkKind::Function),
593        ("fn ", ChunkKind::Function),
594        ("pub struct ", ChunkKind::Struct),
595        ("struct ", ChunkKind::Struct),
596        ("pub enum ", ChunkKind::Struct),
597        ("enum ", ChunkKind::Struct),
598        ("impl ", ChunkKind::Impl),
599        ("pub trait ", ChunkKind::Struct),
600        ("trait ", ChunkKind::Struct),
601        ("export function ", ChunkKind::Function),
602        ("export async function ", ChunkKind::Function),
603        ("export default function ", ChunkKind::Function),
604        ("function ", ChunkKind::Function),
605        ("async function ", ChunkKind::Function),
606        ("export class ", ChunkKind::Class),
607        ("class ", ChunkKind::Class),
608        ("export interface ", ChunkKind::Struct),
609        ("interface ", ChunkKind::Struct),
610        ("def ", ChunkKind::Function),
611        ("async def ", ChunkKind::Function),
612        ("class ", ChunkKind::Class),
613        ("func ", ChunkKind::Function),
614    ];
615
616    for (prefix, kind) in patterns {
617        if let Some(rest) = trimmed.strip_prefix(prefix) {
618            let name: String = rest
619                .chars()
620                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
621                .take_while(|c| *c != '<')
622                .collect();
623            if !name.is_empty() {
624                return Some((name, kind.clone()));
625            }
626        }
627    }
628
629    None
630}
631
/// Returns the index of the last line of the block that starts at `lines[start]`.
///
/// The end is determined by the first rule that applies:
///
/// 1. **Brace block** - once a `{` has been seen, the block ends on the line
///    where the braces balance again.
/// 2. **Terminated declaration** - with no brace open and no parenthesis
///    pending, a line ending in `;` ends the block (`struct Foo;`, prototypes).
/// 3. **Indentation block** - from the fourth line on, a blank line or a line
///    indented no deeper than the start line ends the block on the line
///    before it. Lines inside, or closing, a multi-line parenthesised list
///    are treated as part of the header and never end the block.
///
/// If the input runs out first, the result is `start + 50`, clamped to the
/// last line. Parentheses never count as block delimiters: a signature such as
/// `fn f(x: i32) {` opens its block at `{`, not at `(`.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    /// Number of leading whitespace characters of `line`.
    fn indent_width(line: &str) -> usize {
        line.chars().take_while(|c| c.is_whitespace()).count()
    }

    let base_indent = lines.get(start).map_or(0, |line| indent_width(line));
    let mut brace_depth = 0usize;
    let mut paren_depth = 0usize;

    for (i, line) in lines.iter().enumerate().skip(start) {
        let continues_paren_group = paren_depth > 0;

        for ch in line.chars() {
            match ch {
                '{' => brace_depth += 1,
                '}' if brace_depth > 0 => {
                    brace_depth -= 1;
                    if brace_depth == 0 {
                        return i;
                    }
                }
                // Parentheses only matter while looking for the block opener.
                '(' if brace_depth == 0 => paren_depth += 1,
                ')' if brace_depth == 0 => paren_depth = paren_depth.saturating_sub(1),
                _ => {}
            }
        }

        // Inside a brace block or an unfinished parameter list: keep going.
        if brace_depth > 0 || paren_depth > 0 {
            continue;
        }

        let trimmed = line.trim();
        if trimmed.ends_with(';') {
            return i;
        }
        // The line that closes a multi-line parameter list belongs to the header.
        if continues_paren_group {
            continue;
        }
        if i > start + 2 && (trimmed.is_empty() || indent_width(line) <= base_indent) {
            return i - 1;
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
669
670pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
671    if results.is_empty() {
672        return "No results found.".to_string();
673    }
674
675    let mut out = String::new();
676    for (i, r) in results.iter().enumerate() {
677        if compact {
678            out.push_str(&format!(
679                "{}. {:.2} {}:{}-{} {:?} {}\n",
680                i + 1,
681                r.score,
682                r.file_path,
683                r.start_line,
684                r.end_line,
685                r.kind,
686                r.symbol_name,
687            ));
688        } else {
689            out.push_str(&format!(
690                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
691                i + 1,
692                r.score,
693                r.file_path,
694                r.symbol_name,
695                r.kind,
696                r.start_line,
697                r.end_line,
698                r.snippet,
699            ));
700        }
701    }
702    out
703}
704
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    /// Builds a `Function` chunk starting at line 1 whose tokens are derived
    /// from `token_source`; `token_count` is set independently on purpose.
    fn function_chunk(
        file_path: &str,
        symbol_name: &str,
        end_line: usize,
        content: &str,
        token_source: &str,
        token_count: usize,
    ) -> CodeChunk {
        CodeChunk {
            file_path: file_path.into(),
            symbol_name: symbol_name.into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line,
            content: content.into(),
            tokens: tokenize(token_source),
            token_count,
        }
    }

    /// The tokenizer keeps identifiers and type names found in a signature.
    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        for expected in ["calculate_total", "items", "Vec"] {
            assert!(tokens.contains(&expected.to_string()));
        }
    }

    /// A camelCase token yields itself plus each of its parts.
    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        for expected in ["calculateTotal", "calculate", "Total"] {
            assert!(tokens.contains(&expected.to_string()));
        }
    }

    /// A `pub fn` line is detected as a function with the right name.
    #[test]
    fn detect_rust_function() {
        let detected = detect_symbol("pub fn process_request(req: Request) -> Response {");
        let (name, kind) = detected.unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    /// The chunk sharing terms with the query outranks an unrelated one.
    #[test]
    fn bm25_search_finds_relevant() {
        let mut index = BM25Index::new();
        index.add_chunk(function_chunk(
            "auth.rs",
            "validate_token",
            10,
            "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }",
            "fn validate_token token str bool check_jwt_expiry token",
            8,
        ));
        index.add_chunk(function_chunk(
            "db.rs",
            "connect_database",
            5,
            "fn connect_database(url: &str) -> Pool { create_pool(url) }",
            "fn connect_database url str Pool create_pool url",
            7,
        ));
        index.finalize();

        let results = index.search("jwt token validation", 5);
        assert!(!results.is_empty());
        assert_eq!(results[0].symbol_name, "validate_token");
    }

    /// Equal scores are ordered by file path, independent of insertion order.
    #[test]
    fn bm25_search_sorts_ties_deterministically() {
        let mut index = BM25Index::new();

        // Insert in reverse path order to ensure the sort tie-break matters.
        for path in ["b.rs", "a.rs"] {
            index.add_chunk(function_chunk(path, "same", 1, "fn same() {}", "same token", 2));
        }
        index.finalize();

        let results = index.search("same", 10);
        assert!(results.len() >= 2);
        assert_eq!(results[0].file_path, "a.rs");
        assert_eq!(results[1].file_path, "b.rs");
    }

    /// Deleting an indexed file makes a previously fresh index stale.
    #[test]
    fn vector_index_is_stale_when_any_indexed_file_is_missing() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let source = root.join("a.rs");
        std::fs::write(&source, "pub fn a() {}\n").expect("write a.rs");

        let idx = BM25Index::build_from_directory(root);
        assert!(!vector_index_looks_stale(&idx, root));

        std::fs::remove_file(&source).expect("remove a.rs");
        assert!(vector_index_looks_stale(&idx, root));
    }

    /// An unchanged file is carried over from the previous index without being
    /// read again, while a changed file is re-read and re-chunked.
    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let td = tempdir().expect("tempdir");
        let root = td.path();
        let a_path = root.join("a.rs");
        let b_path = root.join("b.rs");

        std::fs::write(&a_path, "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(&b_path, "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let idx1 = BM25Index::build_from_directory(root);
        assert!(idx1.files.contains_key("a.rs"));
        assert!(idx1.files.contains_key("b.rs"));

        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o000);
        std::fs::set_permissions(&a_path, perms).expect("chmod a.rs");

        // Change b.rs (size changes) to force a re-read for that file.
        std::fs::write(&b_path, "pub fn b() { println!(\"B2\"); }\n").expect("rewrite b.rs");

        let idx2 = BM25Index::rebuild_incremental(root, &idx1);
        assert!(
            idx2.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(idx2.files.contains_key("b.rs"));

        let b_has_b2 = idx2
            .chunks
            .iter()
            .filter(|c| c.file_path == "b.rs")
            .any(|c| c.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");

        // Restore permissions to avoid cleanup surprises.
        let mut perms = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        perms.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, perms);
    }
}