// repograph_core/search/mod.rs

1//! Cross-repo precedent search: index the git-tracked content of every
2//! registered repo, then retrieve it by meaning or by keyword.
3//!
4//! The store ([`index`]) is a single `SQLite` database spanning all repos so one
5//! query reaches everything — the answer to "I solved this somewhere, I just
6//! don't remember which repo." Retrieval is hybrid: BM25 lexical (FTS5) fused
7//! with semantic cosine over local embeddings ([`embed`], feature `semantic`),
8//! merged by reciprocal-rank fusion. The binary resolves the data directory and
9//! passes it in; this module performs the `dirs`-free path joins, mirroring how
10//! `config.rs` takes a `dir: &Path`.
11
12pub mod chunk;
13pub mod embed;
14pub mod index;
15
16use std::path::{Path, PathBuf};
17
18use serde::Serialize;
19
20use crate::error::RepographError;
21use crate::search::index::{Embedder, Store, fuse};
22
/// Name of the single index database file, resolved relative to the data
/// directory.
pub const INDEX_DB_NAME: &str = "index.db";

/// Name of the data-directory subfolder in which the embedding model is cached.
pub const MODEL_SUBDIR: &str = "models";

/// Version of the JSON envelope emitted by `repograph find`.
///
/// Version `2` introduced the `semantic_used` and `degraded` fields, which let
/// an agent tell from the stdout payload alone whether semantic retrieval ran
/// or the query silently fell back to lexical search.
pub const FIND_SCHEMA_VERSION: u32 = 2;

/// Multiplier applied to `limit` to size the candidate pool pulled from each
/// retrieval arm, so fusion has spare candidates to reorder.
const POOL_FACTOR: usize = 5;

/// Lower bound on the candidate pool; keeps fusion useful when `--limit` is
/// small.
const MIN_POOL: usize = 50;

/// Longest snippet, in characters, that is returned without truncation.
const SNIPPET_MAX_CHARS: usize = 400;
45
46/// One ranked search result. Field order is the JSON serialization order and
47/// part of the stable output contract.
48#[derive(Debug, Clone, Serialize)]
49pub struct Hit {
50    pub repo: String,
51    /// Repo-relative path, forward-slashed.
52    pub path: String,
53    /// 1-based start line of the matched chunk.
54    pub line: u32,
55    /// Fused relevance score (higher is better).
56    pub score: f64,
57    pub snippet: String,
58}
59
60/// Outcome of [`search`]: the ranked hits plus retrieval diagnostics for the
61/// binary's stderr (never part of the stdout data contract).
62#[derive(Debug, Clone)]
63pub struct SearchOutcome {
64    pub hits: Vec<Hit>,
65    /// True when semantic retrieval actually contributed (embedder available
66    /// and vectors present).
67    pub semantic_used: bool,
68    /// Set when semantic was requested but unavailable — the reason, for a
69    /// stderr notice. `None` when not requested or fully satisfied.
70    pub degraded: Option<String>,
71}
72
/// Summary of one [`build_index`] run.
#[derive(Clone, Debug, Default)]
pub struct IndexOutcome {
    /// Repos that were reconciled against the store.
    pub repos_indexed: usize,
    /// Repos that were skipped with a warning.
    pub repos_skipped: usize,
    /// Files (re)indexed, summed over all reconciled repos.
    pub files_indexed: usize,
    /// Files found unchanged, summed over all reconciled repos.
    pub files_unchanged: usize,
    /// Files purged from the index, summed over all reconciled repos.
    pub files_purged: usize,
    /// `true` if this run (re)indexed or purged at least one file.
    pub changed: bool,
    /// `true` if semantic embeddings were written.
    pub semantic: bool,
    /// Reason semantic indexing was unavailable although it was requested.
    pub degraded: Option<String>,
}
88
/// Search-index health report, consumed by `repograph doctor`.
#[derive(Clone, Debug, Default)]
pub struct IndexStatus {
    /// `true` when the index database file exists.
    pub present: bool,
    /// `true` when the index could be opened and matched this build's schema.
    pub readable: bool,
    /// Names of repos that are absent from the index or whose indexed commit
    /// differs from their current HEAD.
    pub stale: Vec<String>,
}
99
100/// Path to the index database within `data_dir`.
101#[must_use]
102pub fn index_db_path(data_dir: &Path) -> PathBuf {
103    data_dir.join(INDEX_DB_NAME)
104}
105
106/// Path to the embedding-model cache within `data_dir`.
107#[must_use]
108pub fn model_cache_dir(data_dir: &Path) -> PathBuf {
109    data_dir.join(MODEL_SUBDIR)
110}
111
112/// Build or refresh the index over `repos` (each `(name, absolute_path)`).
113///
114/// Indexing is incremental and git-aware: only changed files are re-chunked,
115/// removed files are purged. Repos that cannot be opened, are bare, or have no
116/// commits are skipped with a warning rather than aborting the run.
117///
118/// # Errors
119///
120/// Returns [`RepographError::Index`] on a store failure, or
121/// [`RepographError::Io`] if the data directory cannot be created.
122pub fn build_index(
123    data_dir: &Path,
124    repos: &[(String, PathBuf)],
125    semantic: bool,
126) -> Result<IndexOutcome, RepographError> {
127    let mut store = Store::open_for_build(&index_db_path(data_dir))?;
128    let (mut embedder, degraded) = make_embedder(semantic, &model_cache_dir(data_dir));
129    if let Some(e) = embedder.as_ref() {
130        store.ensure_model(e.model_id())?;
131    }
132
133    let mut outcome = IndexOutcome {
134        semantic: embedder.is_some(),
135        degraded,
136        ..IndexOutcome::default()
137    };
138
139    for (name, path) in repos {
140        let repo = match git2::Repository::open(path) {
141            Ok(r) => r,
142            Err(e) => {
143                tracing::warn!(repo = %name, error = %e, "skipping repo: cannot open");
144                outcome.repos_skipped += 1;
145                continue;
146            }
147        };
148        if repo.is_bare() {
149            tracing::warn!(repo = %name, "skipping bare repo");
150            outcome.repos_skipped += 1;
151            continue;
152        }
153        let files = match chunk::tracked_files(&repo, path) {
154            Ok(f) => f,
155            Err(e) => {
156                tracing::warn!(repo = %name, error = %e, "skipping repo: cannot read index");
157                outcome.repos_skipped += 1;
158                continue;
159            }
160        };
161        let head = head_commit(&repo);
162        #[allow(clippy::option_if_let_else)]
163        let emb: Option<&mut dyn Embedder> = match &mut embedder {
164            Some(e) => Some(e.as_mut()),
165            None => None,
166        };
167        let stats = store.reconcile_repo(name, &files, head.as_deref(), emb)?;
168        outcome.repos_indexed += 1;
169        outcome.files_indexed += stats.files_indexed;
170        outcome.files_unchanged += stats.files_unchanged;
171        outcome.files_purged += stats.files_purged;
172    }
173    outcome.changed = outcome.files_indexed > 0 || outcome.files_purged > 0;
174    Ok(outcome)
175}
176
177/// Search the index, returning ranked hits across all repos or one workspace.
178///
179/// `repos_filter` (when non-empty) scopes results to those repo names.
180/// `semantic` requests the hybrid path; it degrades to lexical with a populated
181/// `degraded` reason when embeddings or the model are unavailable.
182///
183/// # Errors
184///
185/// Returns [`RepographError::IndexMissing`] (exit 3) when no index has been
186/// built, or [`RepographError::Index`] (exit 1) when the index is unreadable.
187pub fn search(
188    data_dir: &Path,
189    query: &str,
190    repos_filter: &[String],
191    limit: usize,
192    semantic: bool,
193) -> Result<SearchOutcome, RepographError> {
194    let store = Store::open_existing(&index_db_path(data_dir))?;
195    let pool = limit.max(1).saturating_mul(POOL_FACTOR).max(MIN_POOL);
196
197    let lexical = store.search_lexical(query, repos_filter, pool)?;
198
199    let mut vector = Vec::new();
200    let mut semantic_used = false;
201    let mut degraded = None;
202
203    if semantic {
204        let (embedder, deg) = make_embedder(true, &model_cache_dir(data_dir));
205        degraded = deg;
206        if let Some(mut e) = embedder {
207            if store.has_vectors()? {
208                match e.embed(&[query.to_string()]) {
209                    Ok(v) if !v.is_empty() => {
210                        vector = store.search_vectors(&v[0], repos_filter, pool)?;
211                        semantic_used = true;
212                    }
213                    Ok(_) => degraded = Some("query produced no embedding".to_string()),
214                    Err(msg) => degraded = Some(msg),
215                }
216            } else {
217                degraded =
218                    Some("index has no embeddings — run `repograph index --semantic`".to_string());
219            }
220        }
221    }
222
223    let fused = fuse(&[lexical.as_slice(), vector.as_slice()]);
224    let top: Vec<i64> = fused.iter().take(limit).map(|(id, _)| *id).collect();
225    let rows = store.fetch_chunks(&top)?;
226    let hits = fused
227        .iter()
228        .take(limit)
229        .filter_map(|(id, score)| {
230            rows.get(id).map(|row| Hit {
231                repo: row.repo.clone(),
232                path: row.path.clone(),
233                line: row.start_line,
234                score: *score,
235                snippet: snippet(&row.content),
236            })
237        })
238        .collect();
239
240    Ok(SearchOutcome {
241        hits,
242        semantic_used,
243        degraded,
244    })
245}
246
247/// Compute the [`IndexStatus`] for `repos`.
248///
249/// Never errors: a missing, unreadable, or mid-inspection-failing index is
250/// reported via the `present`/`readable` flags so `doctor` surfaces it as a
251/// warning rather than aborting the whole report.
252///
253/// # Errors
254///
255/// Returns `Ok` in all cases; the `Result` is retained for signature stability
256/// with the other core entry points the binary calls with `?`.
257pub fn index_health(
258    data_dir: &Path,
259    repos: &[(String, PathBuf)],
260) -> Result<IndexStatus, RepographError> {
261    let db = index_db_path(data_dir);
262    if !db.is_file() {
263        return Ok(IndexStatus::default());
264    }
265    let store = match Store::open_existing(&db) {
266        Ok(s) => s,
267        Err(RepographError::IndexMissing) => return Ok(IndexStatus::default()),
268        Err(_) => {
269            return Ok(IndexStatus {
270                present: true,
271                readable: false,
272                stale: Vec::new(),
273            });
274        }
275    };
276    // A present index that opens but fails a query mid-inspection (corruption,
277    // a partial write) is "present but not readable", not a hard error.
278    let Ok(commits) = store.indexed_commits() else {
279        return Ok(IndexStatus {
280            present: true,
281            readable: false,
282            stale: Vec::new(),
283        });
284    };
285    let mut stale = Vec::new();
286    for (name, path) in repos {
287        let current = git2::Repository::open(path)
288            .ok()
289            .and_then(|r| head_commit(&r));
290        match commits.get(name) {
291            Some(indexed) if *indexed == current => {}
292            _ => stale.push(name.clone()),
293        }
294    }
295    stale.sort();
296    Ok(IndexStatus {
297        present: true,
298        readable: true,
299        stale,
300    })
301}
302
303/// Construct an embedder when `semantic` is requested. Returns `(None,
304/// Some(reason))` when semantic is requested but unavailable, `(Some, None)` on
305/// success, and `(None, None)` when semantic was not requested.
306fn make_embedder(
307    semantic: bool,
308    model_cache_dir: &Path,
309) -> (Option<Box<dyn Embedder>>, Option<String>) {
310    if !semantic {
311        return (None, None);
312    }
313    match embed::create(model_cache_dir) {
314        Ok(e) => (Some(e), None),
315        Err(reason) => (None, Some(reason)),
316    }
317}
318
319fn head_commit(repo: &git2::Repository) -> Option<String> {
320    repo.head().ok()?.target().map(|oid| oid.to_string())
321}
322
323/// Trim a chunk's content to a bounded snippet, appending an ellipsis when cut.
324fn snippet(content: &str) -> String {
325    if content.chars().count() <= SNIPPET_MAX_CHARS {
326        return content.to_string();
327    }
328    let truncated: String = content.chars().take(SNIPPET_MAX_CHARS).collect();
329    format!("{truncated}…")
330}
331
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used, clippy::format_collect)]
    use super::*;
    use tempfile::TempDir;

    /// Create a git repo called `name` under `parent`, write each
    /// `(relative_path, contents)` pair from `files` into it, and commit the
    /// lot as a single root commit. Returns the repo's working directory.
    fn init_repo(parent: &Path, name: &str, files: &[(&str, &str)]) -> PathBuf {
        let workdir = parent.join(name);
        std::fs::create_dir_all(&workdir).unwrap();
        let repo = git2::Repository::init(&workdir).unwrap();
        for &(rel, body) in files {
            std::fs::write(workdir.join(rel), body).unwrap();
        }

        // Stage everything and turn the staged index into a tree.
        let mut index = repo.index().unwrap();
        index
            .add_all(["*"], git2::IndexAddOption::DEFAULT, None)
            .unwrap();
        index.write().unwrap();
        let tree = repo.find_tree(index.write_tree().unwrap()).unwrap();

        // Root commit: no parents.
        let author = git2::Signature::now("T", "t@e").unwrap();
        repo.commit(Some("HEAD"), &author, &author, "init", &tree, &[])
            .unwrap();
        workdir
    }

    /// Indexing two repos lets one query find a symbol that lives in only one
    /// of them.
    #[test]
    fn build_then_search_across_repos() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        let data = root.join("data");
        let api_dir = init_repo(
            root,
            "api",
            &[("auth.rs", "fn rotate_refresh_token() {}\n")],
        );
        let ui_dir = init_repo(root, "ui", &[("button.rs", "fn render_button() {}\n")]);
        let repos = vec![("api".to_string(), api_dir), ("ui".to_string(), ui_dir)];

        let built = build_index(&data, &repos, false).unwrap();
        assert_eq!(built.repos_indexed, 2);
        assert!(built.files_indexed >= 2);

        let found = search(&data, "rotate_refresh_token", &[], 5, false).unwrap();
        assert!(!found.hits.is_empty());
        assert_eq!(found.hits[0].repo, "api");
        assert_eq!(found.hits[0].path, "auth.rs");
        assert!(!found.semantic_used);
    }

    /// Searching before any index exists reports `IndexMissing`.
    #[test]
    fn search_without_index_is_index_missing() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let err = search(&data, "anything", &[], 5, false).unwrap_err();
        assert!(matches!(err, RepographError::IndexMissing));
    }

    /// A repo filter keeps hits from other repos out of the results.
    #[test]
    fn workspace_filter_scopes_results() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        let data = root.join("data");
        let api_dir = init_repo(root, "api", &[("a.rs", "fn shared_widget() {}\n")]);
        let ui_dir = init_repo(root, "ui", &[("b.rs", "fn shared_widget() {}\n")]);
        let repos = vec![("api".to_string(), api_dir), ("ui".to_string(), ui_dir)];
        build_index(&data, &repos, false).unwrap();

        let only_api = ["api".to_string()];
        let scoped = search(&data, "shared_widget", &only_api, 5, false).unwrap();
        assert!(!scoped.hits.is_empty());
        assert!(scoped.hits.iter().all(|h| h.repo == "api"));
    }

    /// A query that matches nothing yields an empty hit list, not an error.
    #[test]
    fn no_match_is_empty_not_error() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        let data = root.join("data");
        let api_dir = init_repo(root, "api", &[("a.rs", "fn alpha() {}\n")]);
        build_index(&data, &[("api".to_string(), api_dir)], false).unwrap();

        let found = search(&data, "zzz_nonexistent_symbol_qqq", &[], 5, false).unwrap();
        assert!(found.hits.is_empty());
    }

    /// `limit` caps the number of hits returned.
    #[test]
    fn limit_bounds_hits() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        let data = root.join("data");
        let body: String = (0..50).map(|n| format!("fn widget_{n}() {{}}\n")).collect();
        let api_dir = init_repo(root, "api", &[("w.rs", body.as_str())]);
        build_index(&data, &[("api".to_string(), api_dir)], false).unwrap();

        let found = search(&data, "widget", &[], 3, false).unwrap();
        assert!(found.hits.len() <= 3);
    }

    /// Requesting semantic search still returns lexical hits; without the
    /// `semantic` feature the outcome must say it degraded.
    #[test]
    fn semantic_requested_without_feature_degrades_to_lexical() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        let data = root.join("data");
        let api_dir = init_repo(root, "api", &[("a.rs", "fn parse_csv() {}\n")]);
        build_index(&data, &[("api".to_string(), api_dir)], true).unwrap();

        let found = search(&data, "parse_csv", &[], 5, true).unwrap();
        // Lexical still returns the hit; semantic did not contribute.
        assert!(!found.hits.is_empty());
        if cfg!(not(feature = "semantic")) {
            assert!(!found.semantic_used);
            assert!(found.degraded.is_some());
        }
    }

    /// With no index file, health reports "absent" rather than failing.
    #[test]
    fn health_missing_index_is_absent_not_error() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let status = index_health(&data, &[]).unwrap();
        assert!(!status.present);
        assert!(status.stale.is_empty());
    }

    /// A freshly indexed repo is current; a name the index has never seen is
    /// stale.
    #[test]
    fn health_reports_current_and_stale() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        let data = root.join("data");
        let api_dir = init_repo(root, "api", &[("a.rs", "fn a() {}\n")]);
        let repos = vec![("api".to_string(), api_dir.clone())];
        build_index(&data, &repos, false).unwrap();

        let status = index_health(&data, &repos).unwrap();
        assert!(status.present && status.readable);
        assert!(status.stale.is_empty(), "freshly indexed repo is current");

        // A repo never indexed is stale.
        let ghost = vec![("ghost".to_string(), api_dir)];
        let mixed = index_health(&data, &ghost).unwrap();
        assert_eq!(mixed.stale, vec!["ghost".to_string()]);
    }
}