1pub mod chunk;
13pub mod embed;
14pub mod index;
15
16use std::path::{Path, PathBuf};
17
18use serde::Serialize;
19
20use crate::error::RepographError;
21use crate::search::index::{Embedder, Store, fuse};
22
/// File name of the index database inside the data directory
/// (joined onto the directory by [`index_db_path`]).
pub const INDEX_DB_NAME: &str = "index.db";

/// Subdirectory of the data directory handed to the embedder as its model
/// cache (joined onto the directory by [`model_cache_dir`]).
pub const MODEL_SUBDIR: &str = "models";

/// Schema version for find/search output.
// NOTE(review): not referenced in this file; presumably stamped onto serialized
// results by callers — confirm before bumping.
pub const FIND_SCHEMA_VERSION: u32 = 2;

/// Multiplier applied to the caller's `limit` to size the candidate pool that
/// each ranking (lexical, vector) contributes before fusion in [`search`].
const POOL_FACTOR: usize = 5;

/// Lower bound on the candidate pool size used by [`search`], regardless of
/// how small `limit` is.
const MIN_POOL: usize = 50;

/// Maximum number of characters of chunk content kept in a [`Hit`] snippet
/// before it is truncated and suffixed with `…`.
const SNIPPET_MAX_CHARS: usize = 400;
45
#[derive(Debug, Clone, Serialize)]
/// One search result returned by [`search`], built from a stored chunk row
/// and its fused score.
pub struct Hit {
    /// Name of the repository the chunk was indexed under.
    pub repo: String,
    /// Path of the file containing the chunk, as stored in the index.
    pub path: String,
    /// Start line of the matching chunk (the row's `start_line`).
    pub line: u32,
    /// Score assigned by `fuse` when combining the lexical and vector rankings.
    pub score: f64,
    /// Chunk content, cut to at most `SNIPPET_MAX_CHARS` characters with a
    /// trailing `…` when it was truncated.
    pub snippet: String,
}
59
#[derive(Debug, Clone)]
/// Result of a [`search`] call: the hits plus how the semantic path fared.
pub struct SearchOutcome {
    /// At most `limit` hits, in the order produced by `fuse`.
    pub hits: Vec<Hit>,
    /// `true` only when the query was embedded and a vector search actually
    /// contributed candidates.
    pub semantic_used: bool,
    /// Why semantic search was requested but not used (embedder unavailable,
    /// index without embeddings, embedding failure); `None` otherwise.
    pub degraded: Option<String>,
}
72
#[derive(Debug, Clone, Default)]
/// Summary of a [`build_index`] run.
pub struct IndexOutcome {
    /// Number of repositories that were reconciled into the store.
    pub repos_indexed: usize,
    /// Number of repositories skipped because they could not be opened, were
    /// bare, or their tracked files could not be listed.
    pub repos_skipped: usize,
    /// Sum of the `files_indexed` counts reported by each repo reconcile.
    pub files_indexed: usize,
    /// Sum of the `files_unchanged` counts reported by each repo reconcile.
    pub files_unchanged: usize,
    /// Sum of the `files_purged` counts reported by each repo reconcile.
    pub files_purged: usize,
    /// `true` when at least one file was indexed or purged during the run.
    pub changed: bool,
    /// Whether an embedder was available for this build.
    pub semantic: bool,
    /// Reason the embedder could not be created when semantic indexing was
    /// requested; `None` otherwise.
    pub degraded: Option<String>,
}
88
#[derive(Debug, Clone, Default)]
/// Health report produced by [`index_health`]. The `Default` value (all
/// `false`, empty list) is what a missing index reports.
pub struct IndexStatus {
    /// `false` when the database file does not exist or the store reports
    /// `IndexMissing`; `true` otherwise, even if the index is unreadable.
    pub present: bool,
    /// `true` when the store opened and its indexed commits could be read.
    pub readable: bool,
    /// Sorted names of repositories with no indexed commit recorded, or whose
    /// recorded commit differs from the repository's current HEAD.
    pub stale: Vec<String>,
}
99
100#[must_use]
102pub fn index_db_path(data_dir: &Path) -> PathBuf {
103 data_dir.join(INDEX_DB_NAME)
104}
105
106#[must_use]
108pub fn model_cache_dir(data_dir: &Path) -> PathBuf {
109 data_dir.join(MODEL_SUBDIR)
110}
111
112pub fn build_index(
123 data_dir: &Path,
124 repos: &[(String, PathBuf)],
125 semantic: bool,
126) -> Result<IndexOutcome, RepographError> {
127 let mut store = Store::open_for_build(&index_db_path(data_dir))?;
128 let (mut embedder, degraded) = make_embedder(semantic, &model_cache_dir(data_dir));
129 if let Some(e) = embedder.as_ref() {
130 store.ensure_model(e.model_id())?;
131 }
132
133 let mut outcome = IndexOutcome {
134 semantic: embedder.is_some(),
135 degraded,
136 ..IndexOutcome::default()
137 };
138
139 for (name, path) in repos {
140 let repo = match git2::Repository::open(path) {
141 Ok(r) => r,
142 Err(e) => {
143 tracing::warn!(repo = %name, error = %e, "skipping repo: cannot open");
144 outcome.repos_skipped += 1;
145 continue;
146 }
147 };
148 if repo.is_bare() {
149 tracing::warn!(repo = %name, "skipping bare repo");
150 outcome.repos_skipped += 1;
151 continue;
152 }
153 let files = match chunk::tracked_files(&repo, path) {
154 Ok(f) => f,
155 Err(e) => {
156 tracing::warn!(repo = %name, error = %e, "skipping repo: cannot read index");
157 outcome.repos_skipped += 1;
158 continue;
159 }
160 };
161 let head = head_commit(&repo);
162 #[allow(clippy::option_if_let_else)]
163 let emb: Option<&mut dyn Embedder> = match &mut embedder {
164 Some(e) => Some(e.as_mut()),
165 None => None,
166 };
167 let stats = store.reconcile_repo(name, &files, head.as_deref(), emb)?;
168 outcome.repos_indexed += 1;
169 outcome.files_indexed += stats.files_indexed;
170 outcome.files_unchanged += stats.files_unchanged;
171 outcome.files_purged += stats.files_purged;
172 }
173 outcome.changed = outcome.files_indexed > 0 || outcome.files_purged > 0;
174 Ok(outcome)
175}
176
177pub fn search(
188 data_dir: &Path,
189 query: &str,
190 repos_filter: &[String],
191 limit: usize,
192 semantic: bool,
193) -> Result<SearchOutcome, RepographError> {
194 let store = Store::open_existing(&index_db_path(data_dir))?;
195 let pool = limit.max(1).saturating_mul(POOL_FACTOR).max(MIN_POOL);
196
197 let lexical = store.search_lexical(query, repos_filter, pool)?;
198
199 let mut vector = Vec::new();
200 let mut semantic_used = false;
201 let mut degraded = None;
202
203 if semantic {
204 let (embedder, deg) = make_embedder(true, &model_cache_dir(data_dir));
205 degraded = deg;
206 if let Some(mut e) = embedder {
207 if store.has_vectors()? {
208 match e.embed(&[query.to_string()]) {
209 Ok(v) if !v.is_empty() => {
210 vector = store.search_vectors(&v[0], repos_filter, pool)?;
211 semantic_used = true;
212 }
213 Ok(_) => degraded = Some("query produced no embedding".to_string()),
214 Err(msg) => degraded = Some(msg),
215 }
216 } else {
217 degraded =
218 Some("index has no embeddings — run `repograph index --semantic`".to_string());
219 }
220 }
221 }
222
223 let fused = fuse(&[lexical.as_slice(), vector.as_slice()]);
224 let top: Vec<i64> = fused.iter().take(limit).map(|(id, _)| *id).collect();
225 let rows = store.fetch_chunks(&top)?;
226 let hits = fused
227 .iter()
228 .take(limit)
229 .filter_map(|(id, score)| {
230 rows.get(id).map(|row| Hit {
231 repo: row.repo.clone(),
232 path: row.path.clone(),
233 line: row.start_line,
234 score: *score,
235 snippet: snippet(&row.content),
236 })
237 })
238 .collect();
239
240 Ok(SearchOutcome {
241 hits,
242 semantic_used,
243 degraded,
244 })
245}
246
247pub fn index_health(
258 data_dir: &Path,
259 repos: &[(String, PathBuf)],
260) -> Result<IndexStatus, RepographError> {
261 let db = index_db_path(data_dir);
262 if !db.is_file() {
263 return Ok(IndexStatus::default());
264 }
265 let store = match Store::open_existing(&db) {
266 Ok(s) => s,
267 Err(RepographError::IndexMissing) => return Ok(IndexStatus::default()),
268 Err(_) => {
269 return Ok(IndexStatus {
270 present: true,
271 readable: false,
272 stale: Vec::new(),
273 });
274 }
275 };
276 let Ok(commits) = store.indexed_commits() else {
279 return Ok(IndexStatus {
280 present: true,
281 readable: false,
282 stale: Vec::new(),
283 });
284 };
285 let mut stale = Vec::new();
286 for (name, path) in repos {
287 let current = git2::Repository::open(path)
288 .ok()
289 .and_then(|r| head_commit(&r));
290 match commits.get(name) {
291 Some(indexed) if *indexed == current => {}
292 _ => stale.push(name.clone()),
293 }
294 }
295 stale.sort();
296 Ok(IndexStatus {
297 present: true,
298 readable: true,
299 stale,
300 })
301}
302
303fn make_embedder(
307 semantic: bool,
308 model_cache_dir: &Path,
309) -> (Option<Box<dyn Embedder>>, Option<String>) {
310 if !semantic {
311 return (None, None);
312 }
313 match embed::create(model_cache_dir) {
314 Ok(e) => (Some(e), None),
315 Err(reason) => (None, Some(reason)),
316 }
317}
318
319fn head_commit(repo: &git2::Repository) -> Option<String> {
320 repo.head().ok()?.target().map(|oid| oid.to_string())
321}
322
323fn snippet(content: &str) -> String {
325 if content.chars().count() <= SNIPPET_MAX_CHARS {
326 return content.to_string();
327 }
328 let truncated: String = content.chars().take(SNIPPET_MAX_CHARS).collect();
329 format!("{truncated}…")
330}
331
// End-to-end tests: each test creates real git repositories in a temporary
// directory, builds an index into a sibling `data` directory, and exercises
// the public functions of this module.
#[cfg(test)]
mod tests {
    // `unwrap` keeps fixtures terse; `format_collect` covers the
    // `format!` + `collect` used to generate the file body in
    // `limit_bounds_hits`.
    #![allow(clippy::unwrap_used, clippy::format_collect)]
    use super::*;
    use tempfile::TempDir;

    // Creates a git repository at `parent/name`, writes `files` as
    // `(relative path, contents)` pairs, stages everything and makes a single
    // "init" commit on HEAD. Returns the repository directory.
    fn init_repo(parent: &Path, name: &str, files: &[(&str, &str)]) -> PathBuf {
        let dir = parent.join(name);
        std::fs::create_dir_all(&dir).unwrap();
        let repo = git2::Repository::init(&dir).unwrap();
        for (rel, body) in files {
            std::fs::write(dir.join(rel), body).unwrap();
        }
        let sig = git2::Signature::now("T", "t@e").unwrap();
        let mut index = repo.index().unwrap();
        index
            .add_all(["*"], git2::IndexAddOption::DEFAULT, None)
            .unwrap();
        index.write().unwrap();
        let tree_id = index.write_tree().unwrap();
        let tree = repo.find_tree(tree_id).unwrap();
        // Root commit: no parents.
        repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[])
            .unwrap();
        dir
    }

    // Indexing two repos and searching for a symbol that exists in only one of
    // them ranks that repo's file first, without using the semantic path.
    #[test]
    fn build_then_search_across_repos() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(
            tmp.path(),
            "api",
            &[("auth.rs", "fn rotate_refresh_token() {}\n")],
        );
        let ui = init_repo(
            tmp.path(),
            "ui",
            &[("button.rs", "fn render_button() {}\n")],
        );
        let repos = vec![("api".to_string(), api), ("ui".to_string(), ui)];

        let outcome = build_index(&data, &repos, false).unwrap();
        assert_eq!(outcome.repos_indexed, 2);
        assert!(outcome.files_indexed >= 2);

        let result = search(&data, "rotate_refresh_token", &[], 5, false).unwrap();
        assert!(!result.hits.is_empty());
        assert_eq!(result.hits[0].repo, "api");
        assert_eq!(result.hits[0].path, "auth.rs");
        assert!(!result.semantic_used);
    }

    // Searching before any index was built surfaces `IndexMissing`.
    #[test]
    fn search_without_index_is_index_missing() {
        let tmp = TempDir::new().unwrap();
        let err = search(&tmp.path().join("data"), "anything", &[], 5, false).unwrap_err();
        assert!(matches!(err, RepographError::IndexMissing));
    }

    // The same symbol exists in both repos; the repo filter must keep only
    // hits from the requested one.
    #[test]
    fn workspace_filter_scopes_results() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn shared_widget() {}\n")]);
        let ui = init_repo(tmp.path(), "ui", &[("b.rs", "fn shared_widget() {}\n")]);
        let repos = vec![("api".to_string(), api), ("ui".to_string(), ui)];
        build_index(&data, &repos, false).unwrap();

        let scoped = search(&data, "shared_widget", &["api".to_string()], 5, false).unwrap();
        assert!(!scoped.hits.is_empty());
        assert!(scoped.hits.iter().all(|h| h.repo == "api"));
    }

    // A query that matches nothing yields an empty hit list, not an error.
    #[test]
    fn no_match_is_empty_not_error() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn alpha() {}\n")]);
        build_index(&data, &[("api".to_string(), api)], false).unwrap();
        let result = search(&data, "zzz_nonexistent_symbol_qqq", &[], 5, false).unwrap();
        assert!(result.hits.is_empty());
    }

    // `limit` is an upper bound on the number of hits returned.
    #[test]
    fn limit_bounds_hits() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let body: String = (0..50).map(|n| format!("fn widget_{n}() {{}}\n")).collect();
        let api = init_repo(tmp.path(), "api", &[("w.rs", &body)]);
        build_index(&data, &[("api".to_string(), api)], false).unwrap();
        let result = search(&data, "widget", &[], 3, false).unwrap();
        assert!(result.hits.len() <= 3);
    }

    // Requesting semantic mode must never break lexical search.
    #[test]
    fn semantic_requested_without_feature_degrades_to_lexical() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn parse_csv() {}\n")]);
        build_index(&data, &[("api".to_string(), api)], true).unwrap();
        let result = search(&data, "parse_csv", &[], 5, true).unwrap();
        assert!(!result.hits.is_empty());
        // The degradation assertions only hold when the `semantic` feature is
        // compiled out; with it enabled the outcome is not pinned here.
        if cfg!(not(feature = "semantic")) {
            assert!(!result.semantic_used);
            assert!(result.degraded.is_some());
        }
    }

    // No database file: the status is the default (absent), not an error.
    #[test]
    fn health_missing_index_is_absent_not_error() {
        let tmp = TempDir::new().unwrap();
        let status = index_health(&tmp.path().join("data"), &[]).unwrap();
        assert!(!status.present);
        assert!(status.stale.is_empty());
    }

    // A freshly indexed repo is current; a name the index has never seen is
    // reported stale.
    #[test]
    fn health_reports_current_and_stale() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn a() {}\n")]);
        let repos = vec![("api".to_string(), api.clone())];
        build_index(&data, &repos, false).unwrap();

        let status = index_health(&data, &repos).unwrap();
        assert!(status.present && status.readable);
        assert!(status.stale.is_empty(), "freshly indexed repo is current");

        let ghost = vec![("ghost".to_string(), api)];
        // "ghost" points at a valid repo but has no indexed commit recorded
        // under that name, so it must be listed as stale.
        let mixed = index_health(&data, &ghost).unwrap();
        assert_eq!(mixed.stale, vec!["ghost".to_string()]);
    }
}