Skip to main content

lean_ctx/core/
artifact_index.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use crate::core::bm25_index::{BM25Index, ChunkKind, CodeChunk, IndexedFileState};
6
7const MAX_ARTIFACT_BYTES: u64 = 2_000_000;
8const MAX_CHUNKS_PER_FILE: usize = 50;
9
10pub fn index_file_path(project_root: &Path) -> PathBuf {
11    let code_idx = BM25Index::index_file_path(project_root);
12    let dir = code_idx.parent().unwrap_or_else(|| Path::new("."));
13    dir.join("bm25_artifacts_index.json")
14}
15
16pub fn load(project_root: &Path) -> Option<BM25Index> {
17    let path = index_file_path(project_root);
18    let data = std::fs::read_to_string(path).ok()?;
19    serde_json::from_str(&data).ok()
20}
21
22pub fn save(project_root: &Path, idx: &BM25Index) -> std::io::Result<()> {
23    let path = index_file_path(project_root);
24    if let Some(parent) = path.parent() {
25        std::fs::create_dir_all(parent)?;
26    }
27    let data = serde_json::to_string(idx).map_err(std::io::Error::other)?;
28    let tmp = path.with_extension("json.tmp");
29    std::fs::write(&tmp, data)?;
30    std::fs::rename(&tmp, &path)?;
31    Ok(())
32}
33
34pub fn load_or_build(project_root: &Path) -> (BM25Index, Vec<String>) {
35    let (files_now, mut warnings) = list_artifact_files(project_root);
36    if files_now.is_empty() {
37        return (load(project_root).unwrap_or_default(), warnings);
38    }
39
40    if let Some(prev) = load(project_root) {
41        if !index_looks_stale(&prev, project_root, &files_now) {
42            return (prev, warnings);
43        }
44        let rebuilt = if prev.files.is_empty() {
45            build_full(project_root, &files_now, &mut warnings)
46        } else {
47            rebuild_incremental(project_root, &prev, &files_now, &mut warnings)
48        };
49        let _ = save(project_root, &rebuilt);
50        return (rebuilt, warnings);
51    }
52
53    let built = build_full(project_root, &files_now, &mut warnings);
54    let _ = save(project_root, &built);
55    (built, warnings)
56}
57
58pub fn rebuild_from_scratch(project_root: &Path) -> (BM25Index, Vec<String>) {
59    let (files_now, mut warnings) = list_artifact_files(project_root);
60    let idx = build_full(project_root, &files_now, &mut warnings);
61    let _ = save(project_root, &idx);
62    (idx, warnings)
63}
64
65fn index_looks_stale(idx: &BM25Index, project_root: &Path, files_now: &[String]) -> bool {
66    if files_now.is_empty() {
67        return false;
68    }
69    if idx.files.is_empty() {
70        return true;
71    }
72
73    let now_set: HashSet<&str> = files_now.iter().map(String::as_str).collect();
74
75    for (rel, old_state) in &idx.files {
76        let abs = project_root.join(rel);
77        if !abs.exists() {
78            return true;
79        }
80        let Some(cur) = file_state(&abs) else {
81            return true;
82        };
83        if &cur != old_state {
84            return true;
85        }
86        if !now_set.contains(rel.as_str()) {
87            return true;
88        }
89    }
90
91    for rel in files_now {
92        if !idx.files.contains_key(rel) {
93            return true;
94        }
95    }
96
97    false
98}
99
100fn build_full(project_root: &Path, files: &[String], warnings: &mut Vec<String>) -> BM25Index {
101    let mut idx = BM25Index::new();
102
103    for rel in files {
104        let abs = project_root.join(rel);
105        let Some(state) = file_state(&abs) else {
106            continue;
107        };
108        let content = match std::fs::read_to_string(&abs) {
109            Ok(s) => s,
110            Err(e) => {
111                warnings.push(format!("artifact read failed: {rel} ({e})"));
112                continue;
113            }
114        };
115
116        let mut chunks = extract_artifact_chunks(rel, &content);
117        chunks.sort_by(|a, b| {
118            a.start_line
119                .cmp(&b.start_line)
120                .then_with(|| a.end_line.cmp(&b.end_line))
121                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
122        });
123        for chunk in chunks {
124            add_chunk(&mut idx, chunk);
125        }
126        idx.files.insert(rel.clone(), state);
127    }
128
129    finalize(&mut idx);
130    idx
131}
132
133fn rebuild_incremental(
134    project_root: &Path,
135    prev: &BM25Index,
136    files: &[String],
137    warnings: &mut Vec<String>,
138) -> BM25Index {
139    let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
140    for c in &prev.chunks {
141        old_by_file
142            .entry(c.file_path.clone())
143            .or_default()
144            .push(c.clone());
145    }
146    for v in old_by_file.values_mut() {
147        v.sort_by(|a, b| {
148            a.start_line
149                .cmp(&b.start_line)
150                .then_with(|| a.end_line.cmp(&b.end_line))
151                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
152        });
153    }
154
155    let mut idx = BM25Index::new();
156
157    for rel in files {
158        let abs = project_root.join(rel);
159        let Some(state) = file_state(&abs) else {
160            continue;
161        };
162
163        let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
164        if unchanged {
165            if let Some(chunks) = old_by_file.get(rel) {
166                for chunk in chunks {
167                    add_chunk(&mut idx, chunk.clone());
168                }
169                idx.files.insert(rel.clone(), state);
170                continue;
171            }
172        }
173
174        let content = match std::fs::read_to_string(&abs) {
175            Ok(s) => s,
176            Err(e) => {
177                warnings.push(format!("artifact read failed: {rel} ({e})"));
178                continue;
179            }
180        };
181
182        let mut chunks = extract_artifact_chunks(rel, &content);
183        chunks.sort_by(|a, b| {
184            a.start_line
185                .cmp(&b.start_line)
186                .then_with(|| a.end_line.cmp(&b.end_line))
187                .then_with(|| a.symbol_name.cmp(&b.symbol_name))
188        });
189        for chunk in chunks {
190            add_chunk(&mut idx, chunk);
191        }
192        idx.files.insert(rel.clone(), state);
193    }
194
195    finalize(&mut idx);
196    idx
197}
198
199fn add_chunk(idx: &mut BM25Index, chunk: CodeChunk) {
200    let chunk_idx = idx.chunks.len();
201    let tokens = crate::core::bm25_index::tokenize_for_index(&chunk.content);
202    for token in &tokens {
203        let lower = token.to_lowercase();
204        idx.inverted
205            .entry(lower)
206            .or_default()
207            .push((chunk_idx, 1.0));
208    }
209    idx.chunks.push(CodeChunk {
210        token_count: tokens.len(),
211        tokens: Vec::new(),
212        ..chunk
213    });
214}
215
216fn finalize(idx: &mut BM25Index) {
217    idx.doc_count = idx.chunks.len();
218    if idx.doc_count == 0 {
219        idx.avg_doc_len = 0.0;
220        idx.doc_freqs.clear();
221        return;
222    }
223
224    let total_len: usize = idx.chunks.iter().map(|c| c.token_count).sum();
225    idx.avg_doc_len = total_len as f64 / idx.doc_count as f64;
226
227    idx.doc_freqs.clear();
228    for (term, postings) in &idx.inverted {
229        let unique_docs: HashSet<usize> = postings.iter().map(|(i, _)| *i).collect();
230        idx.doc_freqs.insert(term.clone(), unique_docs.len());
231    }
232}
233
234fn list_artifact_files(project_root: &Path) -> (Vec<String>, Vec<String>) {
235    let resolved = crate::core::artifacts::load_resolved(project_root);
236    let mut warnings = resolved.warnings;
237
238    let cfg = crate::core::config::Config::load();
239    let extra_ignores: Vec<glob::Pattern> = cfg
240        .extra_ignore_patterns
241        .iter()
242        .filter_map(|p| glob::Pattern::new(p).ok())
243        .collect();
244
245    let mut files: Vec<String> = Vec::new();
246    for a in resolved.artifacts {
247        if !a.exists {
248            warnings.push(format!("artifact missing: {} ({})", a.name, a.path));
249            continue;
250        }
251
252        let abs = project_root.join(&a.path);
253        if a.is_dir {
254            let walker = ignore::WalkBuilder::new(&abs)
255                .hidden(true)
256                .git_ignore(true)
257                .git_global(true)
258                .git_exclude(true)
259                .build();
260            for entry in walker.flatten() {
261                let path = entry.path();
262                if !path.is_file() {
263                    continue;
264                }
265                if path.components().any(|c| c.as_os_str() == ".git") {
266                    continue;
267                }
268                if !is_artifact_text_file(path) {
269                    continue;
270                }
271                if let Ok(meta) = path.metadata() {
272                    if meta.len() > MAX_ARTIFACT_BYTES {
273                        continue;
274                    }
275                }
276                let rel = path
277                    .strip_prefix(project_root)
278                    .unwrap_or(path)
279                    .to_string_lossy()
280                    .to_string();
281                if rel.is_empty() {
282                    continue;
283                }
284                if extra_ignores.iter().any(|p| p.matches(&rel)) {
285                    continue;
286                }
287                files.push(rel);
288            }
289        } else {
290            if !abs.is_file() {
291                continue;
292            }
293            if !is_artifact_text_file(&abs) {
294                continue;
295            }
296            if let Ok(meta) = abs.metadata() {
297                if meta.len() > MAX_ARTIFACT_BYTES {
298                    continue;
299                }
300            }
301            if extra_ignores.iter().any(|p| p.matches(&a.path)) {
302                continue;
303            }
304            files.push(a.path);
305        }
306    }
307
308    files.sort();
309    files.dedup();
310    (files, warnings)
311}
312
/// Reports whether `path` names a file that should be indexed as a text
/// artifact.
///
/// A file named `Dockerfile` (ASCII case-insensitive) always qualifies and a
/// file named `.env` never does. Any other file qualifies when its lowercased
/// extension is one of the known documentation, configuration, schema, or
/// shell-script extensions listed below.
fn is_artifact_text_file(path: &Path) -> bool {
    const TEXT_EXTENSIONS: &[&str] = &[
        "md", "mdx", "txt", "json", "yaml", "yml", "toml", "sql", "proto", "tf", "tfvars", "hcl",
        "rego", "graphql", "gql", "sh", "bash", "zsh",
    ];

    let file_name = path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or_default();
    if file_name.eq_ignore_ascii_case("Dockerfile") {
        return true;
    }
    if file_name.eq_ignore_ascii_case(".env") {
        return false;
    }

    path.extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase)
        .is_some_and(|ext| TEXT_EXTENSIONS.contains(&ext.as_str()))
}
348
349fn file_state(path: &Path) -> Option<IndexedFileState> {
350    let meta = path.metadata().ok()?;
351    let size_bytes = meta.len();
352    let mtime_ms = meta
353        .modified()
354        .ok()
355        .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
356        .map(|d| d.as_millis() as u64)?;
357    Some(IndexedFileState {
358        mtime_ms,
359        size_bytes,
360    })
361}
362
363fn extract_artifact_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
364    let lines: Vec<&str> = content.lines().collect();
365    if lines.is_empty() {
366        return Vec::new();
367    }
368
369    let bytes = content.as_bytes();
370    let rk_chunks = crate::core::rabin_karp::chunk(content);
371    if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
372        let mut out: Vec<CodeChunk> = Vec::new();
373        for (idx, c) in rk_chunks.into_iter().take(MAX_CHUNKS_PER_FILE).enumerate() {
374            let end = (c.offset + c.length).min(bytes.len());
375            let slice = &bytes[c.offset..end];
376            let chunk_text = String::from_utf8_lossy(slice).into_owned();
377            let token_count = crate::core::bm25_index::tokenize_for_index(&chunk_text).len();
378            let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
379            let end_line = start_line + bytecount::count(slice, b'\n');
380            out.push(CodeChunk {
381                file_path: file_path.to_string(),
382                symbol_name: format!("{file_path}#chunk-{idx}"),
383                kind: ChunkKind::Other,
384                start_line,
385                end_line: end_line.max(start_line),
386                content: chunk_text,
387                tokens: Vec::new(),
388                token_count,
389            });
390        }
391        return out;
392    }
393
394    let token_count = crate::core::bm25_index::tokenize_for_index(content).len();
395    let snippet = lines
396        .iter()
397        .take(50)
398        .copied()
399        .collect::<Vec<_>>()
400        .join("\n");
401    vec![CodeChunk {
402        file_path: file_path.to_string(),
403        symbol_name: file_path.to_string(),
404        kind: ChunkKind::Other,
405        start_line: 1,
406        end_line: lines.len(),
407        content: snippet,
408        tokens: Vec::new(),
409        token_count,
410    }]
411}