// lean_ctx/core/vector_index.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use serde::{Deserialize, Serialize};
6
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct CodeChunk {
9    pub file_path: String,
10    pub symbol_name: String,
11    pub kind: ChunkKind,
12    pub start_line: usize,
13    pub end_line: usize,
14    pub content: String,
15    pub tokens: Vec<String>,
16    pub token_count: usize,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
20pub enum ChunkKind {
21    Function,
22    Struct,
23    Impl,
24    Module,
25    Class,
26    Method,
27    Other,
28}
29
30#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
31pub struct IndexedFileState {
32    pub mtime_ms: u64,
33    pub size_bytes: u64,
34}
35
36impl IndexedFileState {
37    fn from_path(path: &Path) -> Option<Self> {
38        let meta = path.metadata().ok()?;
39        let size_bytes = meta.len();
40        let mtime_ms = meta
41            .modified()
42            .ok()
43            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
44            .map(|d| d.as_millis() as u64)?;
45        Some(Self {
46            mtime_ms,
47            size_bytes,
48        })
49    }
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct BM25Index {
54    pub chunks: Vec<CodeChunk>,
55    pub inverted: HashMap<String, Vec<(usize, f64)>>,
56    pub avg_doc_len: f64,
57    pub doc_count: usize,
58    pub doc_freqs: HashMap<String, usize>,
59    #[serde(default)]
60    pub files: HashMap<String, IndexedFileState>,
61}
62
63#[derive(Debug, Clone)]
64pub struct SearchResult {
65    pub chunk_idx: usize,
66    pub score: f64,
67    pub file_path: String,
68    pub symbol_name: String,
69    pub kind: ChunkKind,
70    pub start_line: usize,
71    pub end_line: usize,
72    pub snippet: String,
73}
74
/// BM25 `k1`: controls how quickly repeated occurrences of a term stop adding score.
const BM25_K1: f64 = 1.2;
/// BM25 `b`: strength of the document-length normalization (0 = none, 1 = full).
const BM25_B: f64 = 0.75;
77
78impl Default for BM25Index {
79    fn default() -> Self {
80        Self::new()
81    }
82}
83
84impl BM25Index {
85    pub fn new() -> Self {
86        Self {
87            chunks: Vec::new(),
88            inverted: HashMap::new(),
89            avg_doc_len: 0.0,
90            doc_count: 0,
91            doc_freqs: HashMap::new(),
92            files: HashMap::new(),
93        }
94    }
95
96    pub fn build_from_directory(root: &Path) -> Self {
97        let mut index = Self::new();
98        let files = list_code_files(root);
99        for rel in files {
100            let abs = root.join(&rel);
101            let Some(state) = IndexedFileState::from_path(&abs) else {
102                continue;
103            };
104            if let Ok(content) = std::fs::read_to_string(&abs) {
105                let mut chunks = extract_chunks(&rel, &content);
106                chunks.sort_by(|a, b| {
107                    a.start_line
108                        .cmp(&b.start_line)
109                        .then_with(|| a.end_line.cmp(&b.end_line))
110                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
111                });
112                for chunk in chunks {
113                    index.add_chunk(chunk);
114                }
115                index.files.insert(rel, state);
116            }
117        }
118
119        index.finalize();
120        index
121    }
122
123    pub fn rebuild_incremental(root: &Path, prev: &BM25Index) -> Self {
124        let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
125        for c in &prev.chunks {
126            old_by_file
127                .entry(c.file_path.clone())
128                .or_default()
129                .push(c.clone());
130        }
131        for v in old_by_file.values_mut() {
132            v.sort_by(|a, b| {
133                a.start_line
134                    .cmp(&b.start_line)
135                    .then_with(|| a.end_line.cmp(&b.end_line))
136                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
137            });
138        }
139
140        let mut index = Self::new();
141        let files = list_code_files(root);
142        for rel in files {
143            let abs = root.join(&rel);
144            let Some(state) = IndexedFileState::from_path(&abs) else {
145                continue;
146            };
147
148            let unchanged = prev.files.get(&rel).is_some_and(|old| *old == state);
149            if unchanged {
150                if let Some(chunks) = old_by_file.get(&rel) {
151                    for chunk in chunks {
152                        index.add_chunk(chunk.clone());
153                    }
154                    index.files.insert(rel, state);
155                    continue;
156                }
157            }
158
159            if let Ok(content) = std::fs::read_to_string(&abs) {
160                let mut chunks = extract_chunks(&rel, &content);
161                chunks.sort_by(|a, b| {
162                    a.start_line
163                        .cmp(&b.start_line)
164                        .then_with(|| a.end_line.cmp(&b.end_line))
165                        .then_with(|| a.symbol_name.cmp(&b.symbol_name))
166                });
167                for chunk in chunks {
168                    index.add_chunk(chunk);
169                }
170                index.files.insert(rel, state);
171            }
172        }
173
174        index.finalize();
175        index
176    }
177
178    fn add_chunk(&mut self, chunk: CodeChunk) {
179        let idx = self.chunks.len();
180
181        for token in &chunk.tokens {
182            let lower = token.to_lowercase();
183            self.inverted.entry(lower).or_default().push((idx, 1.0));
184        }
185
186        self.chunks.push(chunk);
187    }
188
189    fn finalize(&mut self) {
190        self.doc_count = self.chunks.len();
191        if self.doc_count == 0 {
192            return;
193        }
194
195        let total_len: usize = self.chunks.iter().map(|c| c.token_count).sum();
196        self.avg_doc_len = total_len as f64 / self.doc_count as f64;
197
198        self.doc_freqs.clear();
199        for (term, postings) in &self.inverted {
200            let unique_docs: std::collections::HashSet<usize> =
201                postings.iter().map(|(idx, _)| *idx).collect();
202            self.doc_freqs.insert(term.clone(), unique_docs.len());
203        }
204    }
205
206    pub fn search(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
207        let query_tokens = tokenize(query);
208        if query_tokens.is_empty() || self.doc_count == 0 {
209            return Vec::new();
210        }
211
212        let mut scores: HashMap<usize, f64> = HashMap::new();
213
214        for token in &query_tokens {
215            let lower = token.to_lowercase();
216            let df = *self.doc_freqs.get(&lower).unwrap_or(&0) as f64;
217            if df == 0.0 {
218                continue;
219            }
220
221            let idf = ((self.doc_count as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
222
223            if let Some(postings) = self.inverted.get(&lower) {
224                let mut doc_tfs: HashMap<usize, f64> = HashMap::new();
225                for (idx, weight) in postings {
226                    *doc_tfs.entry(*idx).or_insert(0.0) += weight;
227                }
228
229                for (doc_idx, tf) in &doc_tfs {
230                    let doc_len = self.chunks[*doc_idx].token_count as f64;
231                    let norm_len = doc_len / self.avg_doc_len.max(1.0);
232                    let bm25 = idf * (tf * (BM25_K1 + 1.0))
233                        / (tf + BM25_K1 * (1.0 - BM25_B + BM25_B * norm_len));
234
235                    *scores.entry(*doc_idx).or_insert(0.0) += bm25;
236                }
237            }
238        }
239
240        let mut results: Vec<SearchResult> = scores
241            .into_iter()
242            .map(|(idx, score)| {
243                let chunk = &self.chunks[idx];
244                let snippet = chunk.content.lines().take(5).collect::<Vec<_>>().join("\n");
245                SearchResult {
246                    chunk_idx: idx,
247                    score,
248                    file_path: chunk.file_path.clone(),
249                    symbol_name: chunk.symbol_name.clone(),
250                    kind: chunk.kind.clone(),
251                    start_line: chunk.start_line,
252                    end_line: chunk.end_line,
253                    snippet,
254                }
255            })
256            .collect();
257
258        results.sort_by(|a, b| {
259            b.score
260                .partial_cmp(&a.score)
261                .unwrap_or(std::cmp::Ordering::Equal)
262        });
263        results.truncate(top_k);
264        results
265    }
266
267    pub fn save(&self, root: &Path) -> std::io::Result<()> {
268        let dir = index_dir(root);
269        std::fs::create_dir_all(&dir)?;
270        let data = serde_json::to_string(self).map_err(std::io::Error::other)?;
271        let target = dir.join("bm25_index.json");
272        let tmp = dir.join("bm25_index.json.tmp");
273        std::fs::write(&tmp, data)?;
274        std::fs::rename(&tmp, &target)?;
275        Ok(())
276    }
277
278    pub fn load(root: &Path) -> Option<Self> {
279        let path = index_dir(root).join("bm25_index.json");
280        let data = std::fs::read_to_string(path).ok()?;
281        serde_json::from_str(&data).ok()
282    }
283
284    pub fn load_or_build(root: &Path) -> Self {
285        if let Some(idx) = Self::load(root) {
286            if !vector_index_looks_stale(&idx, root) {
287                return idx;
288            }
289            tracing::warn!(
290                "[vector_index: stale index detected for {}; rebuilding]",
291                root.display()
292            );
293            let rebuilt = if idx.files.is_empty() {
294                Self::build_from_directory(root)
295            } else {
296                Self::rebuild_incremental(root, &idx)
297            };
298            let _ = rebuilt.save(root);
299            return rebuilt;
300        }
301
302        let built = Self::build_from_directory(root);
303        let _ = built.save(root);
304        built
305    }
306
307    pub fn index_file_path(root: &Path) -> PathBuf {
308        index_dir(root).join("bm25_index.json")
309    }
310}
311
312fn vector_index_looks_stale(index: &BM25Index, root: &Path) -> bool {
313    if index.chunks.is_empty() {
314        return false;
315    }
316
317    if index.files.is_empty() {
318        // Legacy index (pre file-state tracking): only detect missing files.
319        let mut seen = std::collections::HashSet::<&str>::new();
320        for chunk in &index.chunks {
321            let rel = chunk.file_path.trim_start_matches(['/', '\\']);
322            if rel.is_empty() {
323                continue;
324            }
325            if !seen.insert(rel) {
326                continue;
327            }
328            if !root.join(rel).exists() {
329                return true;
330            }
331        }
332        return false;
333    }
334
335    // Missing or modified tracked files.
336    for (rel, old_state) in &index.files {
337        let abs = root.join(rel);
338        if !abs.exists() {
339            return true;
340        }
341        let Some(cur) = IndexedFileState::from_path(&abs) else {
342            return true;
343        };
344        if &cur != old_state {
345            return true;
346        }
347    }
348
349    // New files (present on disk but not in index).
350    for rel in list_code_files(root) {
351        if !index.files.contains_key(&rel) {
352            return true;
353        }
354    }
355
356    false
357}
358
359fn index_dir(root: &Path) -> PathBuf {
360    crate::core::index_namespace::vectors_dir(root)
361}
362
363fn list_code_files(root: &Path) -> Vec<String> {
364    let walker = ignore::WalkBuilder::new(root)
365        .hidden(true)
366        .git_ignore(true)
367        .git_global(true)
368        .git_exclude(true)
369        .build();
370
371    let mut files: Vec<String> = Vec::new();
372    for entry in walker.flatten() {
373        let path = entry.path();
374        if !path.is_file() {
375            continue;
376        }
377        if !is_code_file(path) {
378            continue;
379        }
380        let rel = path
381            .strip_prefix(root)
382            .unwrap_or(path)
383            .to_string_lossy()
384            .to_string();
385        if rel.is_empty() {
386            continue;
387        }
388        files.push(rel);
389    }
390
391    files.sort();
392    files.dedup();
393    files
394}
395
/// Reports whether `path` has one of the source-code extensions this index handles.
///
/// The comparison ignores case. Paths without an extension, or whose extension is not
/// valid UTF-8, are rejected.
pub fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "rb",
        "cs", "kt", "swift", "php", "scala", "sql", "ex", "exs", "zig", "lua", "dart", "vue",
        "svelte",
    ];

    let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
        return false;
    };
    let ext = ext.to_lowercase();
    CODE_EXTENSIONS.contains(&ext.as_str())
}
432
433fn tokenize(text: &str) -> Vec<String> {
434    let mut tokens = Vec::new();
435    let mut current = String::new();
436
437    for ch in text.chars() {
438        if ch.is_alphanumeric() || ch == '_' {
439            current.push(ch);
440        } else {
441            if current.len() >= 2 {
442                tokens.push(current.clone());
443            }
444            current.clear();
445        }
446    }
447    if current.len() >= 2 {
448        tokens.push(current);
449    }
450
451    split_camel_case_tokens(&tokens)
452}
453
454pub(crate) fn tokenize_for_index(text: &str) -> Vec<String> {
455    tokenize(text)
456}
457
/// Expands every token with the pieces obtained by splitting it at camelCase boundaries.
///
/// Each input token is kept as is. A boundary is an uppercase character (not the first
/// character) that is not followed by another uppercase character, so a run of capitals
/// stays together and is cut before its last letter only when a non-uppercase character
/// follows. Pieces shorter than two bytes are dropped. Tokens without a boundary contribute
/// nothing but themselves.
fn split_camel_case_tokens(tokens: &[String]) -> Vec<String> {
    let mut expanded = Vec::new();

    for token in tokens {
        expanded.push(token.clone());

        let chars: Vec<char> = token.chars().collect();
        let boundaries: Vec<usize> = (1..chars.len())
            .filter(|&i| {
                chars[i].is_uppercase()
                    && chars.get(i + 1).map_or(true, |next| !next.is_uppercase())
            })
            .collect();
        let Some(&last_boundary) = boundaries.last() else {
            continue;
        };

        let mut piece_start = 0;
        for &cut in &boundaries {
            let piece: String = chars[piece_start..cut].iter().collect();
            if piece.len() >= 2 {
                expanded.push(piece);
            }
            piece_start = cut;
        }

        let tail: String = chars[last_boundary..].iter().collect();
        if tail.len() >= 2 {
            expanded.push(tail);
        }
    }

    expanded
}
482
483fn extract_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
484    #[cfg(feature = "tree-sitter")]
485    {
486        let ext = std::path::Path::new(file_path)
487            .extension()
488            .and_then(|e| e.to_str())
489            .unwrap_or("");
490        if let Some(chunks) = crate::core::chunks_ts::extract_chunks_ts(file_path, content, ext) {
491            return chunks;
492        }
493    }
494
495    let lines: Vec<&str> = content.lines().collect();
496    if lines.is_empty() {
497        return Vec::new();
498    }
499
500    let mut chunks = Vec::new();
501    let mut i = 0;
502
503    while i < lines.len() {
504        let trimmed = lines[i].trim();
505
506        if let Some((name, kind)) = detect_symbol(trimmed) {
507            let start = i;
508            let end = find_block_end(&lines, i);
509            let block: String = lines[start..=end.min(lines.len() - 1)].to_vec().join("\n");
510            let tokens = tokenize(&block);
511            let token_count = tokens.len();
512
513            chunks.push(CodeChunk {
514                file_path: file_path.to_string(),
515                symbol_name: name,
516                kind,
517                start_line: start + 1,
518                end_line: end + 1,
519                content: block,
520                tokens,
521                token_count,
522            });
523
524            i = end + 1;
525        } else {
526            i += 1;
527        }
528    }
529
530    if chunks.is_empty() && !content.is_empty() {
531        // Fallback: when no symbols are detected, chunk the file into stable, content-defined
532        // segments (rolling-hash) to enable meaningful semantic search over non-code assets.
533        //
534        // Safety note: rabin_karp uses byte offsets; we must slice bytes and decode safely.
535        let bytes = content.as_bytes();
536        let rk_chunks = crate::core::rabin_karp::chunk(content);
537        if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
538            for (idx, c) in rk_chunks.into_iter().take(50).enumerate() {
539                let end = (c.offset + c.length).min(bytes.len());
540                let slice = &bytes[c.offset..end];
541                let chunk_text = String::from_utf8_lossy(slice).into_owned();
542                let tokens = tokenize(&chunk_text);
543                let token_count = tokens.len();
544                let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
545                let end_line = start_line + bytecount::count(slice, b'\n');
546                chunks.push(CodeChunk {
547                    file_path: file_path.to_string(),
548                    symbol_name: format!("{file_path}#chunk-{idx}"),
549                    kind: ChunkKind::Module,
550                    start_line,
551                    end_line: end_line.max(start_line),
552                    content: chunk_text,
553                    tokens,
554                    token_count,
555                });
556            }
557        } else {
558            let tokens = tokenize(content);
559            let token_count = tokens.len();
560            let snippet = lines
561                .iter()
562                .take(50)
563                .copied()
564                .collect::<Vec<_>>()
565                .join("\n");
566            chunks.push(CodeChunk {
567                file_path: file_path.to_string(),
568                symbol_name: file_path.to_string(),
569                kind: ChunkKind::Module,
570                start_line: 1,
571                end_line: lines.len(),
572                content: snippet,
573                tokens,
574                token_count,
575            });
576        }
577    }
578
579    chunks
580}
581
582fn detect_symbol(line: &str) -> Option<(String, ChunkKind)> {
583    let trimmed = line.trim();
584
585    let patterns: &[(&str, ChunkKind)] = &[
586        ("pub async fn ", ChunkKind::Function),
587        ("async fn ", ChunkKind::Function),
588        ("pub fn ", ChunkKind::Function),
589        ("fn ", ChunkKind::Function),
590        ("pub struct ", ChunkKind::Struct),
591        ("struct ", ChunkKind::Struct),
592        ("pub enum ", ChunkKind::Struct),
593        ("enum ", ChunkKind::Struct),
594        ("impl ", ChunkKind::Impl),
595        ("pub trait ", ChunkKind::Struct),
596        ("trait ", ChunkKind::Struct),
597        ("export function ", ChunkKind::Function),
598        ("export async function ", ChunkKind::Function),
599        ("export default function ", ChunkKind::Function),
600        ("function ", ChunkKind::Function),
601        ("async function ", ChunkKind::Function),
602        ("export class ", ChunkKind::Class),
603        ("class ", ChunkKind::Class),
604        ("export interface ", ChunkKind::Struct),
605        ("interface ", ChunkKind::Struct),
606        ("def ", ChunkKind::Function),
607        ("async def ", ChunkKind::Function),
608        ("class ", ChunkKind::Class),
609        ("func ", ChunkKind::Function),
610    ];
611
612    for (prefix, kind) in patterns {
613        if let Some(rest) = trimmed.strip_prefix(prefix) {
614            let name: String = rest
615                .chars()
616                .take_while(|c| c.is_alphanumeric() || *c == '_' || *c == '<')
617                .take_while(|c| *c != '<')
618                .collect();
619            if !name.is_empty() {
620                return Some((name, kind.clone()));
621            }
622        }
623    }
624
625    None
626}
627
/// Finds the index of the last line of the definition that starts at `lines[start]`.
///
/// Heuristic, applied line by line from `start`:
/// * Once a `{` has been seen, the block ends on the line where the braces balance again.
/// * Before any brace, a line ending in `;` ends the definition (e.g. `struct Foo(u32);`).
/// * Before any brace and more than two lines past `start`, a blank line or a line indented
///   no deeper than the start line ends the block on the previous line. This covers
///   indentation-based definitions.
/// * If the input ends first, the result is `start + 50`, capped to the last line.
///
/// Parentheses are deliberately not treated as block delimiters. Counting them made every
/// definition with a parameter list end on its own signature line. Indentation is measured
/// on the raw line; the former check ran on an already-trimmed line and was always true.
///
/// Braces inside string literals or comments are not recognized as such and are counted.
/// The returned index is always `>= start` for a valid `start`.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let leading_ws = |line: &str| line.chars().take_while(|c| *c == ' ' || *c == '\t').count();
    let base_indent = lines.get(start).map_or(0, |line| leading_ws(line));

    let mut depth = 0usize;
    let mut saw_brace = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    depth += 1;
                    saw_brace = true;
                }
                '}' if depth > 0 => {
                    depth -= 1;
                    if depth == 0 {
                        return i;
                    }
                }
                _ => {}
            }
        }

        if saw_brace {
            // Inside a brace-delimited body: only the matching `}` can end the block.
            continue;
        }

        // Brace-less declaration terminated on this line.
        if line.trim_end().ends_with(';') {
            return i;
        }

        // Indentation-based body: stop before the first blank or dedented line. The first
        // lines after `start` are always included, as multi-line headers are common.
        if i > start + 2 && (line.trim().is_empty() || leading_ws(line) <= base_indent) {
            return i - 1;
        }
    }

    (start + 50).min(lines.len().saturating_sub(1))
}
665
666pub fn format_search_results(results: &[SearchResult], compact: bool) -> String {
667    if results.is_empty() {
668        return "No results found.".to_string();
669    }
670
671    let mut out = String::new();
672    for (i, r) in results.iter().enumerate() {
673        if compact {
674            out.push_str(&format!(
675                "{}. {:.2} {}:{}-{} {:?} {}\n",
676                i + 1,
677                r.score,
678                r.file_path,
679                r.start_line,
680                r.end_line,
681                r.kind,
682                r.symbol_name,
683            ));
684        } else {
685            out.push_str(&format!(
686                "\n--- Result {} (score: {:.2}) ---\n{} :: {} [{:?}] (L{}-{})\n{}\n",
687                i + 1,
688                r.score,
689                r.file_path,
690                r.symbol_name,
691                r.kind,
692                r.start_line,
693                r.end_line,
694                r.snippet,
695            ));
696        }
697    }
698    out
699}
700
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;

    /// Identifiers, parameter names and type names all survive tokenization.
    #[test]
    fn tokenize_splits_code() {
        let tokens = tokenize("fn calculate_total(items: Vec<Item>) -> f64");
        for expected in ["calculate_total", "items", "Vec"] {
            assert!(tokens.contains(&expected.to_string()));
        }
    }

    /// A camelCase word is kept whole and additionally split into its parts.
    #[test]
    fn camel_case_splitting() {
        let tokens = split_camel_case_tokens(&["calculateTotal".to_string()]);
        for expected in ["calculateTotal", "calculate", "Total"] {
            assert!(tokens.contains(&expected.to_string()));
        }
    }

    /// A `pub fn` line is reported as a function with the bare identifier as its name.
    #[test]
    fn detect_rust_function() {
        let detected = detect_symbol("pub fn process_request(req: Request) -> Response {");
        let (name, kind) = detected.unwrap();
        assert_eq!(name, "process_request");
        assert_eq!(kind, ChunkKind::Function);
    }

    /// The chunk that shares terms with the query must rank first.
    #[test]
    fn bm25_search_finds_relevant() {
        let fixtures = vec![
            CodeChunk {
                file_path: "auth.rs".into(),
                symbol_name: "validate_token".into(),
                kind: ChunkKind::Function,
                start_line: 1,
                end_line: 10,
                content: "fn validate_token(token: &str) -> bool { check_jwt_expiry(token) }"
                    .into(),
                tokens: tokenize("fn validate_token token str bool check_jwt_expiry token"),
                token_count: 8,
            },
            CodeChunk {
                file_path: "db.rs".into(),
                symbol_name: "connect_database".into(),
                kind: ChunkKind::Function,
                start_line: 1,
                end_line: 5,
                content: "fn connect_database(url: &str) -> Pool { create_pool(url) }".into(),
                tokens: tokenize("fn connect_database url str Pool create_pool url"),
                token_count: 7,
            },
        ];

        let mut index = BM25Index::new();
        for chunk in fixtures {
            index.add_chunk(chunk);
        }
        index.finalize();

        let hits = index.search("jwt token validation", 5);
        assert!(!hits.is_empty());
        assert_eq!(hits[0].symbol_name, "validate_token");
    }

    /// Deleting an indexed file must flip a fresh index to stale.
    #[test]
    fn vector_index_is_stale_when_any_indexed_file_is_missing() {
        let workspace = tempdir().expect("tempdir");
        let root = workspace.path();
        let source = root.join("a.rs");
        std::fs::write(&source, "pub fn a() {}\n").expect("write a.rs");

        let index = BM25Index::build_from_directory(root);
        assert!(!vector_index_looks_stale(&index, root));

        std::fs::remove_file(&source).expect("remove a.rs");
        assert!(vector_index_looks_stale(&index, root));
    }

    /// Unchanged files are carried over from the previous index without being opened,
    /// while changed files are read and chunked again.
    #[test]
    #[cfg(unix)]
    fn bm25_incremental_rebuild_reuses_unchanged_files_without_reading() {
        let workspace = tempdir().expect("tempdir");
        let root = workspace.path();
        let a_path = root.join("a.rs");
        let b_path = root.join("b.rs");

        std::fs::write(&a_path, "pub fn a() { println!(\"A\"); }\n").expect("write a.rs");
        std::fs::write(&b_path, "pub fn b() { println!(\"B\"); }\n").expect("write b.rs");

        let first = BM25Index::build_from_directory(root);
        assert!(first.files.contains_key("a.rs"));
        assert!(first.files.contains_key("b.rs"));

        // Make a.rs unreadable. Incremental rebuild must keep it indexed by reusing prior chunks.
        let mut locked = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        locked.set_mode(0o000);
        std::fs::set_permissions(&a_path, locked).expect("chmod a.rs");

        // Change b.rs (size changes) to force a re-read for that file.
        std::fs::write(&b_path, "pub fn b() { println!(\"B2\"); }\n").expect("rewrite b.rs");

        let second = BM25Index::rebuild_incremental(root, &first);
        assert!(
            second.files.contains_key("a.rs"),
            "a.rs should be kept via reuse"
        );
        assert!(second.files.contains_key("b.rs"));

        let b_has_b2 = second
            .chunks
            .iter()
            .filter(|chunk| chunk.file_path == "b.rs")
            .any(|chunk| chunk.content.contains("B2"));
        assert!(b_has_b2, "b.rs should be re-read and re-chunked");

        // Restore permissions to avoid cleanup surprises.
        let mut restored = std::fs::metadata(&a_path).expect("meta a.rs").permissions();
        restored.set_mode(0o644);
        let _ = std::fs::set_permissions(&a_path, restored);
    }
}
817}