1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3use std::time::UNIX_EPOCH;
4
5use crate::core::bm25_index::{BM25Index, ChunkKind, CodeChunk, IndexedFileState};
6
/// Size limit in bytes for an artifact file: files whose metadata reports a
/// larger size are skipped when the artifact file list is collected.
const MAX_ARTIFACT_BYTES: u64 = 2 * 1_000_000;
/// Upper bound on how many chunks of a single file are turned into index entries.
const MAX_CHUNKS_PER_FILE: usize = 5 * 10;
9
10pub fn index_file_path(project_root: &Path) -> PathBuf {
11 let code_idx = BM25Index::index_file_path(project_root);
12 let dir = code_idx.parent().unwrap_or_else(|| Path::new("."));
13 dir.join("bm25_artifacts_index.json")
14}
15
16pub fn load(project_root: &Path) -> Option<BM25Index> {
17 let path = index_file_path(project_root);
18 let data = std::fs::read_to_string(path).ok()?;
19 serde_json::from_str(&data).ok()
20}
21
22pub fn save(project_root: &Path, idx: &BM25Index) -> std::io::Result<()> {
23 let path = index_file_path(project_root);
24 if let Some(parent) = path.parent() {
25 std::fs::create_dir_all(parent)?;
26 }
27 let data = serde_json::to_string(idx).map_err(std::io::Error::other)?;
28 let tmp = path.with_extension("json.tmp");
29 std::fs::write(&tmp, data)?;
30 std::fs::rename(&tmp, &path)?;
31 Ok(())
32}
33
34pub fn load_or_build(project_root: &Path) -> (BM25Index, Vec<String>) {
35 let (files_now, mut warnings) = list_artifact_files(project_root);
36 if files_now.is_empty() {
37 return (load(project_root).unwrap_or_default(), warnings);
38 }
39
40 if let Some(prev) = load(project_root) {
41 if !index_looks_stale(&prev, project_root, &files_now) {
42 return (prev, warnings);
43 }
44 let rebuilt = if prev.files.is_empty() {
45 build_full(project_root, &files_now, &mut warnings)
46 } else {
47 rebuild_incremental(project_root, &prev, &files_now, &mut warnings)
48 };
49 let _ = save(project_root, &rebuilt);
50 return (rebuilt, warnings);
51 }
52
53 let built = build_full(project_root, &files_now, &mut warnings);
54 let _ = save(project_root, &built);
55 (built, warnings)
56}
57
58pub fn rebuild_from_scratch(project_root: &Path) -> (BM25Index, Vec<String>) {
59 let (files_now, mut warnings) = list_artifact_files(project_root);
60 let idx = build_full(project_root, &files_now, &mut warnings);
61 let _ = save(project_root, &idx);
62 (idx, warnings)
63}
64
65fn index_looks_stale(idx: &BM25Index, project_root: &Path, files_now: &[String]) -> bool {
66 if files_now.is_empty() {
67 return false;
68 }
69 if idx.files.is_empty() {
70 return true;
71 }
72
73 let now_set: HashSet<&str> = files_now.iter().map(String::as_str).collect();
74
75 for (rel, old_state) in &idx.files {
76 let abs = project_root.join(rel);
77 if !abs.exists() {
78 return true;
79 }
80 let Some(cur) = file_state(&abs) else {
81 return true;
82 };
83 if &cur != old_state {
84 return true;
85 }
86 if !now_set.contains(rel.as_str()) {
87 return true;
88 }
89 }
90
91 for rel in files_now {
92 if !idx.files.contains_key(rel) {
93 return true;
94 }
95 }
96
97 false
98}
99
100fn build_full(project_root: &Path, files: &[String], warnings: &mut Vec<String>) -> BM25Index {
101 let mut idx = BM25Index::new();
102
103 for rel in files {
104 let abs = project_root.join(rel);
105 let Some(state) = file_state(&abs) else {
106 continue;
107 };
108 let content = match std::fs::read_to_string(&abs) {
109 Ok(s) => s,
110 Err(e) => {
111 warnings.push(format!("artifact read failed: {rel} ({e})"));
112 continue;
113 }
114 };
115
116 let mut chunks = extract_artifact_chunks(rel, &content);
117 chunks.sort_by(|a, b| {
118 a.start_line
119 .cmp(&b.start_line)
120 .then_with(|| a.end_line.cmp(&b.end_line))
121 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
122 });
123 for chunk in chunks {
124 add_chunk(&mut idx, chunk);
125 }
126 idx.files.insert(rel.clone(), state);
127 }
128
129 finalize(&mut idx);
130 idx
131}
132
133fn rebuild_incremental(
134 project_root: &Path,
135 prev: &BM25Index,
136 files: &[String],
137 warnings: &mut Vec<String>,
138) -> BM25Index {
139 let mut old_by_file: HashMap<String, Vec<CodeChunk>> = HashMap::new();
140 for c in &prev.chunks {
141 old_by_file
142 .entry(c.file_path.clone())
143 .or_default()
144 .push(c.clone());
145 }
146 for v in old_by_file.values_mut() {
147 v.sort_by(|a, b| {
148 a.start_line
149 .cmp(&b.start_line)
150 .then_with(|| a.end_line.cmp(&b.end_line))
151 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
152 });
153 }
154
155 let mut idx = BM25Index::new();
156
157 for rel in files {
158 let abs = project_root.join(rel);
159 let Some(state) = file_state(&abs) else {
160 continue;
161 };
162
163 let unchanged = prev.files.get(rel).is_some_and(|old| *old == state);
164 if unchanged {
165 if let Some(chunks) = old_by_file.get(rel) {
166 for chunk in chunks {
167 add_chunk(&mut idx, chunk.clone());
168 }
169 idx.files.insert(rel.clone(), state);
170 continue;
171 }
172 }
173
174 let content = match std::fs::read_to_string(&abs) {
175 Ok(s) => s,
176 Err(e) => {
177 warnings.push(format!("artifact read failed: {rel} ({e})"));
178 continue;
179 }
180 };
181
182 let mut chunks = extract_artifact_chunks(rel, &content);
183 chunks.sort_by(|a, b| {
184 a.start_line
185 .cmp(&b.start_line)
186 .then_with(|| a.end_line.cmp(&b.end_line))
187 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
188 });
189 for chunk in chunks {
190 add_chunk(&mut idx, chunk);
191 }
192 idx.files.insert(rel.clone(), state);
193 }
194
195 finalize(&mut idx);
196 idx
197}
198
199fn add_chunk(idx: &mut BM25Index, chunk: CodeChunk) {
200 let chunk_idx = idx.chunks.len();
201 let tokens = crate::core::bm25_index::tokenize_for_index(&chunk.content);
202 for token in &tokens {
203 let lower = token.to_lowercase();
204 idx.inverted
205 .entry(lower)
206 .or_default()
207 .push((chunk_idx, 1.0));
208 }
209 idx.chunks.push(CodeChunk {
210 token_count: tokens.len(),
211 tokens: Vec::new(),
212 ..chunk
213 });
214}
215
216fn finalize(idx: &mut BM25Index) {
217 idx.doc_count = idx.chunks.len();
218 if idx.doc_count == 0 {
219 idx.avg_doc_len = 0.0;
220 idx.doc_freqs.clear();
221 return;
222 }
223
224 let total_len: usize = idx.chunks.iter().map(|c| c.token_count).sum();
225 idx.avg_doc_len = total_len as f64 / idx.doc_count as f64;
226
227 idx.doc_freqs.clear();
228 for (term, postings) in &idx.inverted {
229 let unique_docs: HashSet<usize> = postings.iter().map(|(i, _)| *i).collect();
230 idx.doc_freqs.insert(term.clone(), unique_docs.len());
231 }
232}
233
234fn list_artifact_files(project_root: &Path) -> (Vec<String>, Vec<String>) {
235 let resolved = crate::core::artifacts::load_resolved(project_root);
236 let mut warnings = resolved.warnings;
237
238 let cfg = crate::core::config::Config::load();
239 let extra_ignores: Vec<glob::Pattern> = cfg
240 .extra_ignore_patterns
241 .iter()
242 .filter_map(|p| glob::Pattern::new(p).ok())
243 .collect();
244
245 let mut files: Vec<String> = Vec::new();
246 for a in resolved.artifacts {
247 if !a.exists {
248 warnings.push(format!("artifact missing: {} ({})", a.name, a.path));
249 continue;
250 }
251
252 let abs = project_root.join(&a.path);
253 if a.is_dir {
254 let walker = ignore::WalkBuilder::new(&abs)
255 .hidden(true)
256 .git_ignore(true)
257 .git_global(true)
258 .git_exclude(true)
259 .build();
260 for entry in walker.flatten() {
261 let path = entry.path();
262 if !path.is_file() {
263 continue;
264 }
265 if path.components().any(|c| c.as_os_str() == ".git") {
266 continue;
267 }
268 if !is_artifact_text_file(path) {
269 continue;
270 }
271 if let Ok(meta) = path.metadata() {
272 if meta.len() > MAX_ARTIFACT_BYTES {
273 continue;
274 }
275 }
276 let rel = path
277 .strip_prefix(project_root)
278 .unwrap_or(path)
279 .to_string_lossy()
280 .to_string();
281 if rel.is_empty() {
282 continue;
283 }
284 if extra_ignores.iter().any(|p| p.matches(&rel)) {
285 continue;
286 }
287 files.push(rel);
288 }
289 } else {
290 if !abs.is_file() {
291 continue;
292 }
293 if !is_artifact_text_file(&abs) {
294 continue;
295 }
296 if let Ok(meta) = abs.metadata() {
297 if meta.len() > MAX_ARTIFACT_BYTES {
298 continue;
299 }
300 }
301 if extra_ignores.iter().any(|p| p.matches(&a.path)) {
302 continue;
303 }
304 files.push(a.path);
305 }
306 }
307
308 files.sort();
309 files.dedup();
310 (files, warnings)
311}
312
/// Tells whether `path` names a file that should be indexed as a text artifact.
///
/// A file named `Dockerfile` (ASCII case-insensitive) is always accepted and a
/// file named `.env` is always rejected. Anything else is accepted only when
/// its lower-cased extension is one of the known text/config extensions below.
fn is_artifact_text_file(path: &Path) -> bool {
    const TEXT_EXTENSIONS: &[&str] = &[
        "md", "mdx", "txt", "json", "yaml", "yml", "toml", "sql", "proto", "tf", "tfvars", "hcl",
        "rego", "graphql", "gql", "sh", "bash", "zsh",
    ];

    let file_name = path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or_default();
    if file_name.eq_ignore_ascii_case("Dockerfile") {
        return true;
    }
    if file_name.eq_ignore_ascii_case(".env") {
        return false;
    }

    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => {
            let lowered = ext.to_lowercase();
            TEXT_EXTENSIONS.contains(&lowered.as_str())
        }
        // No (UTF-8) extension: not a recognized artifact type.
        None => false,
    }
}
348
349fn file_state(path: &Path) -> Option<IndexedFileState> {
350 let meta = path.metadata().ok()?;
351 let size_bytes = meta.len();
352 let mtime_ms = meta
353 .modified()
354 .ok()
355 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
356 .map(|d| d.as_millis() as u64)?;
357 Some(IndexedFileState {
358 mtime_ms,
359 size_bytes,
360 })
361}
362
363fn extract_artifact_chunks(file_path: &str, content: &str) -> Vec<CodeChunk> {
364 let lines: Vec<&str> = content.lines().collect();
365 if lines.is_empty() {
366 return Vec::new();
367 }
368
369 let bytes = content.as_bytes();
370 let rk_chunks = crate::core::rabin_karp::chunk(content);
371 if !rk_chunks.is_empty() && rk_chunks.len() <= 200 {
372 let mut out: Vec<CodeChunk> = Vec::new();
373 for (idx, c) in rk_chunks.into_iter().take(MAX_CHUNKS_PER_FILE).enumerate() {
374 let end = (c.offset + c.length).min(bytes.len());
375 let slice = &bytes[c.offset..end];
376 let chunk_text = String::from_utf8_lossy(slice).into_owned();
377 let token_count = crate::core::bm25_index::tokenize_for_index(&chunk_text).len();
378 let start_line = 1 + bytecount::count(&bytes[..c.offset], b'\n');
379 let end_line = start_line + bytecount::count(slice, b'\n');
380 out.push(CodeChunk {
381 file_path: file_path.to_string(),
382 symbol_name: format!("{file_path}#chunk-{idx}"),
383 kind: ChunkKind::Other,
384 start_line,
385 end_line: end_line.max(start_line),
386 content: chunk_text,
387 tokens: Vec::new(),
388 token_count,
389 });
390 }
391 return out;
392 }
393
394 let token_count = crate::core::bm25_index::tokenize_for_index(content).len();
395 let snippet = lines
396 .iter()
397 .take(50)
398 .copied()
399 .collect::<Vec<_>>()
400 .join("\n");
401 vec![CodeChunk {
402 file_path: file_path.to_string(),
403 symbol_name: file_path.to_string(),
404 kind: ChunkKind::Other,
405 start_line: 1,
406 end_line: lines.len(),
407 content: snippet,
408 tokens: Vec::new(),
409 token_count,
410 }]
411}