pub mod chunk;
pub mod embed;
pub mod index;
use std::path::{Path, PathBuf};
use serde::Serialize;
use crate::error::RepographError;
use crate::search::index::{Embedder, Store, fuse};
/// File name of the index database inside the data directory (joined by `index_db_path`).
pub const INDEX_DB_NAME: &str = "index.db";
/// Subdirectory of the data directory used as the model cache (joined by `model_cache_dir`).
pub const MODEL_SUBDIR: &str = "models";
/// Schema version number.
///
/// NOTE(review): not referenced in the visible part of this module; presumably
/// versions the output of the find command — confirm against callers.
pub const FIND_SCHEMA_VERSION: u32 = 2;
/// Multiplier applied to the requested `limit` when `search` sizes its candidate pool.
const POOL_FACTOR: usize = 5;
/// Lower bound on the candidate pool size used by `search`.
const MIN_POOL: usize = 50;
/// Maximum number of `char`s `snippet` keeps before truncating with an ellipsis.
const SNIPPET_MAX_CHARS: usize = 400;
/// A single result produced by `search`, built from one stored chunk row.
#[derive(Debug, Clone, Serialize)]
pub struct Hit {
/// Repository the chunk belongs to, copied from the stored row.
pub repo: String,
/// Path of the file containing the chunk, copied from the stored row.
pub path: String,
/// Start line of the chunk (the row's `start_line`).
pub line: u32,
/// Score assigned to the chunk by `fuse`.
pub score: f64,
/// Chunk content, truncated by `snippet` to at most `SNIPPET_MAX_CHARS` characters plus `…`.
pub snippet: String,
}
/// Result of a `search` call: the hits plus information about how the query was served.
#[derive(Debug, Clone)]
pub struct SearchOutcome {
/// Hits in fused-rank order, at most `limit` entries.
pub hits: Vec<Hit>,
/// `true` only when a query embedding was produced and the vector search ran.
pub semantic_used: bool,
/// Reason semantic search was requested but could not be used, if any.
pub degraded: Option<String>,
}
/// Summary of a `build_index` run.
#[derive(Debug, Clone, Default)]
pub struct IndexOutcome {
/// Number of repositories that were reconciled into the store.
pub repos_indexed: usize,
/// Number of repositories skipped because they could not be opened, were bare,
/// or their tracked files could not be listed.
pub repos_skipped: usize,
/// Sum of `files_indexed` over all reconciled repositories.
pub files_indexed: usize,
/// Sum of `files_unchanged` over all reconciled repositories.
pub files_unchanged: usize,
/// Sum of `files_purged` over all reconciled repositories.
pub files_purged: usize,
/// `true` when at least one file was indexed or purged during the run.
pub changed: bool,
/// `true` when an embedder was available for the run.
pub semantic: bool,
/// Reason the embedder could not be created when semantic indexing was requested.
pub degraded: Option<String>,
}
/// Health report produced by `index_health`.
#[derive(Debug, Clone, Default)]
pub struct IndexStatus {
/// `false` when the index database file does not exist or the store reports it missing.
pub present: bool,
/// `false` when the index exists but could not be opened or its indexed commits could not be read.
pub readable: bool,
/// Sorted names of repositories with no indexed commit entry, or whose entry
/// differs from the repository's current `HEAD` commit.
pub stale: Vec<String>,
}
/// Returns the location of the index database: `data_dir` extended with
/// [`INDEX_DB_NAME`].
#[must_use]
pub fn index_db_path(data_dir: &Path) -> PathBuf {
    let mut db = data_dir.to_path_buf();
    db.push(INDEX_DB_NAME);
    db
}
/// Returns the model cache directory: `data_dir` extended with
/// [`MODEL_SUBDIR`].
#[must_use]
pub fn model_cache_dir(data_dir: &Path) -> PathBuf {
    let mut cache = data_dir.to_path_buf();
    cache.push(MODEL_SUBDIR);
    cache
}
pub fn build_index(
data_dir: &Path,
repos: &[(String, PathBuf)],
semantic: bool,
) -> Result<IndexOutcome, RepographError> {
let mut store = Store::open_for_build(&index_db_path(data_dir))?;
let (mut embedder, degraded) = make_embedder(semantic, &model_cache_dir(data_dir));
if let Some(e) = embedder.as_ref() {
store.ensure_model(e.model_id())?;
}
let mut outcome = IndexOutcome {
semantic: embedder.is_some(),
degraded,
..IndexOutcome::default()
};
for (name, path) in repos {
let repo = match git2::Repository::open(path) {
Ok(r) => r,
Err(e) => {
tracing::warn!(repo = %name, error = %e, "skipping repo: cannot open");
outcome.repos_skipped += 1;
continue;
}
};
if repo.is_bare() {
tracing::warn!(repo = %name, "skipping bare repo");
outcome.repos_skipped += 1;
continue;
}
let files = match chunk::tracked_files(&repo, path) {
Ok(f) => f,
Err(e) => {
tracing::warn!(repo = %name, error = %e, "skipping repo: cannot read index");
outcome.repos_skipped += 1;
continue;
}
};
let head = head_commit(&repo);
#[allow(clippy::option_if_let_else)]
let emb: Option<&mut dyn Embedder> = match &mut embedder {
Some(e) => Some(e.as_mut()),
None => None,
};
let stats = store.reconcile_repo(name, &files, head.as_deref(), emb)?;
outcome.repos_indexed += 1;
outcome.files_indexed += stats.files_indexed;
outcome.files_unchanged += stats.files_unchanged;
outcome.files_purged += stats.files_purged;
}
outcome.changed = outcome.files_indexed > 0 || outcome.files_purged > 0;
Ok(outcome)
}
/// Searches the index under `data_dir` for `query`.
///
/// A lexical search always runs. When `semantic` is set and an embedder, stored
/// vectors and a query embedding are all available, a vector search runs as
/// well and both rankings are combined with `fuse`. Otherwise the reason is
/// reported in [`SearchOutcome::degraded`] and only the lexical ranking is used.
///
/// `repos_filter` is forwarded to both searches, and at most `limit` hits are
/// returned. Each search is asked for a larger candidate pool
/// (`limit * POOL_FACTOR`, but at least `MIN_POOL`) so that fusion has more
/// than `limit` candidates to rank.
///
/// # Errors
///
/// Returns an error when the index cannot be opened or when any store query
/// (lexical search, vector presence check, vector search, chunk fetch) fails.
pub fn search(
    data_dir: &Path,
    query: &str,
    repos_filter: &[String],
    limit: usize,
    semantic: bool,
) -> Result<SearchOutcome, RepographError> {
    let store = Store::open_existing(&index_db_path(data_dir))?;
    let pool = MIN_POOL.max(limit.max(1).saturating_mul(POOL_FACTOR));
    let lexical = store.search_lexical(query, repos_filter, pool)?;

    let mut vector = Vec::new();
    let mut semantic_used = false;
    let mut degraded = None;
    if semantic {
        let (embedder, unavailable) = make_embedder(true, &model_cache_dir(data_dir));
        degraded = unavailable;
        if let Some(mut model) = embedder {
            if store.has_vectors()? {
                match model.embed(&[query.to_string()]) {
                    Err(msg) => degraded = Some(msg),
                    Ok(embedded) if embedded.is_empty() => {
                        degraded = Some("query produced no embedding".to_string());
                    }
                    Ok(embedded) => {
                        vector = store.search_vectors(&embedded[0], repos_filter, pool)?;
                        semantic_used = true;
                    }
                }
            } else {
                degraded =
                    Some("index has no embeddings — run `repograph index --semantic`".to_string());
            }
        }
    }

    let ranked = fuse(&[lexical.as_slice(), vector.as_slice()]);
    let wanted: Vec<i64> = ranked.iter().take(limit).map(|(id, _)| *id).collect();
    let rows = store.fetch_chunks(&wanted)?;

    // Walk the ranking again so hits keep fused order; ids without a fetched row are dropped.
    let mut hits = Vec::with_capacity(wanted.len());
    for (id, score) in ranked.iter().take(limit) {
        if let Some(row) = rows.get(id) {
            hits.push(Hit {
                repo: row.repo.clone(),
                path: row.path.clone(),
                line: row.start_line,
                score: *score,
                snippet: snippet(&row.content),
            });
        }
    }

    Ok(SearchOutcome {
        hits,
        semantic_used,
        degraded,
    })
}
/// Reports whether the index under `data_dir` exists, can be read, and which
/// of `repos` are out of date.
///
/// A repository is listed as stale when the store has no indexed commit for
/// its name, or when the recorded value differs from the repository's current
/// `HEAD` commit (`None` if the repository cannot be opened or has no
/// resolvable `HEAD`). The stale list is sorted.
///
/// # Errors
///
/// The visible code paths never produce an `Err`: a missing index yields the
/// default status, and an index that cannot be opened or queried yields
/// `present: true, readable: false`.
pub fn index_health(
    data_dir: &Path,
    repos: &[(String, PathBuf)],
) -> Result<IndexStatus, RepographError> {
    // Status for an index that exists on disk but cannot be used.
    let unreadable = || IndexStatus {
        present: true,
        readable: false,
        stale: Vec::new(),
    };

    let db = index_db_path(data_dir);
    if !db.is_file() {
        return Ok(IndexStatus::default());
    }

    let store = match Store::open_existing(&db) {
        Ok(opened) => opened,
        Err(RepographError::IndexMissing) => return Ok(IndexStatus::default()),
        Err(_) => return Ok(unreadable()),
    };
    let Ok(commits) = store.indexed_commits() else {
        return Ok(unreadable());
    };

    let mut stale: Vec<String> = repos
        .iter()
        .filter(|(name, path)| {
            let current = git2::Repository::open(path)
                .ok()
                .and_then(|r| head_commit(&r));
            !matches!(commits.get(name), Some(indexed) if *indexed == current)
        })
        .map(|(name, _)| name.clone())
        .collect();
    stale.sort();

    Ok(IndexStatus {
        present: true,
        readable: true,
        stale,
    })
}
/// Creates an embedder when `semantic` is requested.
///
/// Returns `(embedder, degraded_reason)`:
/// - `(None, None)` when semantic mode was not requested,
/// - `(Some(_), None)` when `embed::create` succeeded,
/// - `(None, Some(reason))` when `embed::create` failed.
fn make_embedder(
    semantic: bool,
    model_cache_dir: &Path,
) -> (Option<Box<dyn Embedder>>, Option<String>) {
    if semantic {
        match embed::create(model_cache_dir) {
            Err(reason) => (None, Some(reason)),
            Ok(created) => (Some(created), None),
        }
    } else {
        (None, None)
    }
}
/// Returns the object id that `HEAD` points at, rendered as a string.
///
/// Yields `None` when `HEAD` cannot be read or the reference has no direct
/// target.
fn head_commit(repo: &git2::Repository) -> Option<String> {
    let head = repo.head().ok()?;
    let oid = head.target()?;
    Some(oid.to_string())
}
fn snippet(content: &str) -> String {
if content.chars().count() <= SNIPPET_MAX_CHARS {
return content.to_string();
}
let truncated: String = content.chars().take(SNIPPET_MAX_CHARS).collect();
format!("{truncated}…")
}
// End-to-end tests for `build_index`, `search` and `index_health`, run against
// throwaway git repositories created in a temporary directory.
#[cfg(test)]
mod tests {
#![allow(clippy::unwrap_used, clippy::format_collect)]
use super::*;
use tempfile::TempDir;
// Creates a git repository at `parent/name`, writes each `(relative path, body)`
// pair from `files` into it, stages everything and makes a single parentless
// commit on `HEAD`. Returns the repository directory.
fn init_repo(parent: &Path, name: &str, files: &[(&str, &str)]) -> PathBuf {
let dir = parent.join(name);
std::fs::create_dir_all(&dir).unwrap();
let repo = git2::Repository::init(&dir).unwrap();
for (rel, body) in files {
std::fs::write(dir.join(rel), body).unwrap();
}
let sig = git2::Signature::now("T", "t@e").unwrap();
let mut index = repo.index().unwrap();
index
.add_all(["*"], git2::IndexAddOption::DEFAULT, None)
.unwrap();
index.write().unwrap();
let tree_id = index.write_tree().unwrap();
let tree = repo.find_tree(tree_id).unwrap();
// An empty parent list makes this the initial commit.
repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[])
.unwrap();
dir
}
// Indexing two repositories and searching for a symbol that exists in only one
// of them returns that repository's file as the top hit, without semantic search.
#[test]
fn build_then_search_across_repos() {
let tmp = TempDir::new().unwrap();
let data = tmp.path().join("data");
let api = init_repo(
tmp.path(),
"api",
&[("auth.rs", "fn rotate_refresh_token() {}\n")],
);
let ui = init_repo(
tmp.path(),
"ui",
&[("button.rs", "fn render_button() {}\n")],
);
let repos = vec![("api".to_string(), api), ("ui".to_string(), ui)];
let outcome = build_index(&data, &repos, false).unwrap();
assert_eq!(outcome.repos_indexed, 2);
assert!(outcome.files_indexed >= 2);
let result = search(&data, "rotate_refresh_token", &[], 5, false).unwrap();
assert!(!result.hits.is_empty());
assert_eq!(result.hits[0].repo, "api");
assert_eq!(result.hits[0].path, "auth.rs");
assert!(!result.semantic_used);
}
// Searching a data directory that was never indexed fails with `IndexMissing`.
#[test]
fn search_without_index_is_index_missing() {
let tmp = TempDir::new().unwrap();
let err = search(&tmp.path().join("data"), "anything", &[], 5, false).unwrap_err();
assert!(matches!(err, RepographError::IndexMissing));
}
// With the same symbol present in both repositories, a repo filter of `api`
// yields hits from `api` only.
#[test]
fn workspace_filter_scopes_results() {
let tmp = TempDir::new().unwrap();
let data = tmp.path().join("data");
let api = init_repo(tmp.path(), "api", &[("a.rs", "fn shared_widget() {}\n")]);
let ui = init_repo(tmp.path(), "ui", &[("b.rs", "fn shared_widget() {}\n")]);
let repos = vec![("api".to_string(), api), ("ui".to_string(), ui)];
build_index(&data, &repos, false).unwrap();
let scoped = search(&data, "shared_widget", &["api".to_string()], 5, false).unwrap();
assert!(!scoped.hits.is_empty());
assert!(scoped.hits.iter().all(|h| h.repo == "api"));
}
// A query that matches nothing succeeds with an empty hit list.
#[test]
fn no_match_is_empty_not_error() {
let tmp = TempDir::new().unwrap();
let data = tmp.path().join("data");
let api = init_repo(tmp.path(), "api", &[("a.rs", "fn alpha() {}\n")]);
build_index(&data, &[("api".to_string(), api)], false).unwrap();
let result = search(&data, "zzz_nonexistent_symbol_qqq", &[], 5, false).unwrap();
assert!(result.hits.is_empty());
}
// The number of hits never exceeds the requested `limit`.
#[test]
fn limit_bounds_hits() {
let tmp = TempDir::new().unwrap();
let data = tmp.path().join("data");
// One file containing fifty `widget_N` functions.
let body: String = (0..50).map(|n| format!("fn widget_{n}() {{}}\n")).collect();
let api = init_repo(tmp.path(), "api", &[("w.rs", &body)]);
build_index(&data, &[("api".to_string(), api)], false).unwrap();
let result = search(&data, "widget", &[], 3, false).unwrap();
assert!(result.hits.len() <= 3);
}
// Requesting semantic indexing and search still returns lexical hits; when the
// `semantic` feature is compiled out, the outcome must report the degradation.
#[test]
fn semantic_requested_without_feature_degrades_to_lexical() {
let tmp = TempDir::new().unwrap();
let data = tmp.path().join("data");
let api = init_repo(tmp.path(), "api", &[("a.rs", "fn parse_csv() {}\n")]);
build_index(&data, &[("api".to_string(), api)], true).unwrap();
let result = search(&data, "parse_csv", &[], 5, true).unwrap();
assert!(!result.hits.is_empty());
// Only checked without the feature; with it enabled the result depends on the embedder.
if cfg!(not(feature = "semantic")) {
assert!(!result.semantic_used);
assert!(result.degraded.is_some());
}
}
// A data directory without an index reports `present == false` instead of failing.
#[test]
fn health_missing_index_is_absent_not_error() {
let tmp = TempDir::new().unwrap();
let status = index_health(&tmp.path().join("data"), &[]).unwrap();
assert!(!status.present);
assert!(status.stale.is_empty());
}
// A freshly indexed repository is not stale, while a repository name the index
// has never seen is reported as stale.
#[test]
fn health_reports_current_and_stale() {
let tmp = TempDir::new().unwrap();
let data = tmp.path().join("data");
let api = init_repo(tmp.path(), "api", &[("a.rs", "fn a() {}\n")]);
let repos = vec![("api".to_string(), api.clone())];
build_index(&data, &repos, false).unwrap();
let status = index_health(&data, &repos).unwrap();
assert!(status.present && status.readable);
assert!(status.stale.is_empty(), "freshly indexed repo is current");
// Same path, but under a name that has no entry in the index.
let ghost = vec![("ghost".to_string(), api)];
let mixed = index_health(&data, &ghost).unwrap();
assert_eq!(mixed.stale, vec!["ghost".to_string()]);
}
}