repograph-core 0.4.0

Core library for repograph: registering, grouping, and exposing local git repositories as structured context for AI agents.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
//! Cross-repo precedent search: index the git-tracked content of every
//! registered repo, then retrieve it by meaning or by keyword.
//!
//! The store ([`index`]) is a single `SQLite` database spanning all repos so one
//! query reaches everything — the answer to "I solved this somewhere, I just
//! don't remember which repo." Retrieval is hybrid: BM25 lexical (FTS5) fused
//! with semantic cosine over local embeddings ([`embed`], feature `semantic`),
//! merged by reciprocal-rank fusion. The binary resolves the data directory and
//! passes it in; this module performs the `dirs`-free path joins, mirroring how
//! `config.rs` takes a `dir: &Path`.

pub mod chunk;
pub mod embed;
pub mod index;

use std::path::{Path, PathBuf};

use serde::Serialize;

use crate::error::RepographError;
use crate::search::index::{Embedder, Store, fuse};

/// File name of the central index database under the data directory.
///
/// Joined onto the caller-supplied data directory by [`index_db_path`].
pub const INDEX_DB_NAME: &str = "index.db";

/// Subdirectory of the data directory that caches the embedding model.
///
/// Joined onto the caller-supplied data directory by [`model_cache_dir`].
pub const MODEL_SUBDIR: &str = "models";

/// Schema version of the `repograph find` JSON envelope.
///
/// Bumped to `2` when `semantic_used` and `degraded` were added so an agent can
/// detect, from the stdout payload alone, whether semantic retrieval ran or
/// silently fell back to lexical.
pub const FIND_SCHEMA_VERSION: u32 = 2;

/// Candidate-pool multiplier: we pull `limit * POOL_FACTOR` candidates from each
/// retrieval arm before fusing, so fusion has room to reorder.
///
/// `search` treats a `limit` of zero as one for this computation, uses a
/// saturating multiply, and then raises the result to at least `MIN_POOL`.
const POOL_FACTOR: usize = 5;

/// Floor on the candidate pool, so small `--limit` values still fuse usefully.
const MIN_POOL: usize = 50;

/// Maximum characters in a result snippet before truncation.
///
/// Measured in `char`s (Unicode scalar values), not bytes.
const SNIPPET_MAX_CHARS: usize = 400;

/// One ranked search result. Field order is the JSON serialization order and
/// part of the stable output contract.
///
/// Built by [`search`] from a fused `(chunk id, score)` pair and the stored
/// chunk row it refers to.
#[derive(Debug, Clone, Serialize)]
pub struct Hit {
    /// Name of the repo the chunk belongs to, as stored in the index.
    pub repo: String,
    /// Repo-relative path, forward-slashed.
    pub path: String,
    /// 1-based start line of the matched chunk.
    pub line: u32,
    /// Fused relevance score (higher is better).
    pub score: f64,
    /// The chunk's content, bounded to `SNIPPET_MAX_CHARS` characters.
    pub snippet: String,
}

/// Outcome of [`search`]: the ranked hits plus retrieval diagnostics for the
/// binary's stderr (never part of the stdout data contract).
///
/// NOTE(review): the docs on [`FIND_SCHEMA_VERSION`] say `semantic_used` and
/// `degraded` were added to the stdout JSON envelope in schema version 2, which
/// conflicts with "never part of the stdout data contract" above — confirm
/// which statement is current. This struct itself does not derive `Serialize`.
#[derive(Debug, Clone)]
pub struct SearchOutcome {
    /// Ranked hits, best first, at most `limit` entries.
    pub hits: Vec<Hit>,
    /// True when semantic retrieval actually contributed (embedder available
    /// and vectors present).
    pub semantic_used: bool,
    /// Set when semantic was requested but unavailable — the reason, for a
    /// stderr notice. `None` when not requested or fully satisfied.
    pub degraded: Option<String>,
}

/// Outcome of [`build_index`].
///
/// The `files_*` counters are sums of the per-repo statistics reported by the
/// store for every repo that was reconciled during the run.
#[derive(Debug, Clone, Default)]
pub struct IndexOutcome {
    /// Number of repos that were opened and reconciled against the store.
    pub repos_indexed: usize,
    /// Number of repos skipped with a warning (could not be opened, bare, or
    /// tracked files could not be listed).
    pub repos_skipped: usize,
    /// Files (re)indexed this run, summed over all reconciled repos.
    pub files_indexed: usize,
    /// Files found unchanged this run, summed over all reconciled repos.
    pub files_unchanged: usize,
    /// Files purged from the index this run, summed over all reconciled repos.
    pub files_purged: usize,
    /// True when at least one file was (re)indexed or purged this run.
    pub changed: bool,
    /// True when semantic embeddings were written.
    pub semantic: bool,
    /// Set when semantic was requested but unavailable.
    pub degraded: Option<String>,
}

/// Health of the search index, consumed by `repograph doctor`.
///
/// The `Default` value (all flags false, no stale repos) is what
/// [`index_health`] returns when no index database exists.
#[derive(Debug, Clone, Default)]
pub struct IndexStatus {
    /// The index database file exists.
    pub present: bool,
    /// The index opened and matched this build's schema.
    pub readable: bool,
    /// Repos that are missing from the index or stale relative to their HEAD.
    /// Sorted by name; left empty when the index is absent or unreadable.
    pub stale: Vec<String>,
}

/// Location of the index database: [`INDEX_DB_NAME`] directly under `data_dir`.
///
/// Pure path arithmetic — nothing is created or checked on disk.
#[must_use]
pub fn index_db_path(data_dir: &Path) -> PathBuf {
    let mut db = data_dir.to_path_buf();
    db.push(INDEX_DB_NAME);
    db
}

/// Location of the embedding-model cache: [`MODEL_SUBDIR`] directly under
/// `data_dir`.
///
/// Pure path arithmetic — nothing is created or checked on disk.
#[must_use]
pub fn model_cache_dir(data_dir: &Path) -> PathBuf {
    let mut cache = data_dir.to_path_buf();
    cache.push(MODEL_SUBDIR);
    cache
}

/// Create or update the central index from `repos`, a list of
/// `(name, absolute_path)` pairs.
///
/// Each repo is reconciled against the store together with its current HEAD
/// commit; the store reports how many files were indexed, left unchanged, or
/// purged, and those counts are summed into the returned [`IndexOutcome`].
/// A repo that cannot be opened, is bare, or whose tracked files cannot be
/// listed is logged with a warning and counted as skipped instead of failing
/// the whole run.
///
/// When `semantic` is set and an embedder can be constructed, the store is
/// told which model is in use and the embedder is handed to every
/// reconciliation. If the embedder is unavailable, the reason is carried in
/// `IndexOutcome::degraded` and indexing proceeds without embeddings.
///
/// # Errors
///
/// Returns [`RepographError::Index`] on a store failure, or
/// [`RepographError::Io`] if the data directory cannot be created.
pub fn build_index(
    data_dir: &Path,
    repos: &[(String, PathBuf)],
    semantic: bool,
) -> Result<IndexOutcome, RepographError> {
    let db_path = index_db_path(data_dir);
    let mut store = Store::open_for_build(&db_path)?;

    let model_dir = model_cache_dir(data_dir);
    let (mut embedder, degraded) = make_embedder(semantic, &model_dir);
    if let Some(active) = &embedder {
        store.ensure_model(active.model_id())?;
    }

    let mut outcome = IndexOutcome {
        degraded,
        semantic: embedder.is_some(),
        ..IndexOutcome::default()
    };

    for (name, path) in repos {
        // Open the repo and reject bare ones in a single decision point.
        let repo = match git2::Repository::open(path) {
            Err(e) => {
                tracing::warn!(repo = %name, error = %e, "skipping repo: cannot open");
                outcome.repos_skipped += 1;
                continue;
            }
            Ok(r) if r.is_bare() => {
                tracing::warn!(repo = %name, "skipping bare repo");
                outcome.repos_skipped += 1;
                continue;
            }
            Ok(r) => r,
        };

        let files = match chunk::tracked_files(&repo, path) {
            Err(e) => {
                tracing::warn!(repo = %name, error = %e, "skipping repo: cannot read index");
                outcome.repos_skipped += 1;
                continue;
            }
            Ok(listed) => listed,
        };

        let head = head_commit(&repo);

        // Reborrow the boxed embedder for this iteration only. The explicit
        // `match` (rather than a combinator) is presumably needed so the
        // trait-object borrow can be shortened per iteration — hence the
        // lint allowance.
        #[allow(clippy::option_if_let_else)]
        let emb: Option<&mut dyn Embedder> = match &mut embedder {
            Some(e) => Some(e.as_mut()),
            None => None,
        };

        let stats = store.reconcile_repo(name, &files, head.as_deref(), emb)?;
        outcome.repos_indexed += 1;
        outcome.files_indexed += stats.files_indexed;
        outcome.files_unchanged += stats.files_unchanged;
        outcome.files_purged += stats.files_purged;
    }

    // "Changed" means anything was written or removed during this run.
    outcome.changed = !(outcome.files_indexed == 0 && outcome.files_purged == 0);
    Ok(outcome)
}

/// Query the index and return up to `limit` ranked hits.
///
/// The lexical arm always runs. When `semantic` is set, the query is also
/// embedded and matched against stored vectors; both candidate lists are then
/// fused into one ranking. If the embedder cannot be constructed, the index
/// holds no vectors, or the query yields no embedding, the search falls back
/// to lexical-only and records why in `SearchOutcome::degraded`.
///
/// `repos_filter` (when non-empty) scopes results to those repo names.
///
/// # Errors
///
/// Returns [`RepographError::IndexMissing`] (exit 3) when no index has been
/// built, or [`RepographError::Index`] (exit 1) when the index is unreadable.
pub fn search(
    data_dir: &Path,
    query: &str,
    repos_filter: &[String],
    limit: usize,
    semantic: bool,
) -> Result<SearchOutcome, RepographError> {
    let store = Store::open_existing(&index_db_path(data_dir))?;

    // Over-fetch from each arm so fusion has candidates to reorder.
    let pool = limit.max(1).saturating_mul(POOL_FACTOR).max(MIN_POOL);
    let lexical = store.search_lexical(query, repos_filter, pool)?;

    // Semantic arm: (vector candidates, whether it contributed, why it did not).
    let (vector, semantic_used, degraded) = if semantic {
        match make_embedder(true, &model_cache_dir(data_dir)) {
            (None, reason) => (Vec::new(), false, reason),
            (Some(mut embedder), reason) => {
                if store.has_vectors()? {
                    match embedder.embed(&[query.to_string()]) {
                        Ok(v) if !v.is_empty() => {
                            let found = store.search_vectors(&v[0], repos_filter, pool)?;
                            (found, true, reason)
                        }
                        Ok(_) => (
                            Vec::new(),
                            false,
                            Some("query produced no embedding".to_string()),
                        ),
                        Err(msg) => (Vec::new(), false, Some(msg)),
                    }
                } else {
                    (
                        Vec::new(),
                        false,
                        Some(
                            "index has no embeddings — run `repograph index --semantic`"
                                .to_string(),
                        ),
                    )
                }
            }
        }
    } else {
        (Vec::new(), false, None)
    };

    let fused = fuse(&[lexical.as_slice(), vector.as_slice()]);
    let top: Vec<i64> = fused.iter().take(limit).map(|(id, _)| *id).collect();
    let rows = store.fetch_chunks(&top)?;

    // Walk the fused ranking in order; ids with no fetched row are dropped.
    let mut hits = Vec::new();
    for (id, score) in fused.iter().take(limit) {
        if let Some(row) = rows.get(id) {
            hits.push(Hit {
                repo: row.repo.clone(),
                path: row.path.clone(),
                line: row.start_line,
                score: *score,
                snippet: snippet(&row.content),
            });
        }
    }

    Ok(SearchOutcome {
        hits,
        semantic_used,
        degraded,
    })
}

/// Report the health of the index with respect to `repos`.
///
/// A repo is listed as stale when the index has no entry for it, or when the
/// commit recorded for it differs from the repo's current HEAD (a repo that
/// cannot be opened counts as having no HEAD). The stale list is sorted.
///
/// Never errors: a missing, unreadable, or mid-inspection-failing index is
/// reported via the `present`/`readable` flags so `doctor` surfaces it as a
/// warning rather than aborting the whole report.
///
/// # Errors
///
/// Returns `Ok` in all cases; the `Result` is retained for signature stability
/// with the other core entry points the binary calls with `?`.
pub fn index_health(
    data_dir: &Path,
    repos: &[(String, PathBuf)],
) -> Result<IndexStatus, RepographError> {
    let db = index_db_path(data_dir);
    if !db.is_file() {
        return Ok(IndexStatus::default());
    }

    // Status for an index file that exists but cannot be used.
    let unreadable = || IndexStatus {
        present: true,
        readable: false,
        stale: Vec::new(),
    };

    let store = match Store::open_existing(&db) {
        Ok(opened) => opened,
        Err(RepographError::IndexMissing) => return Ok(IndexStatus::default()),
        Err(_) => return Ok(unreadable()),
    };

    // An index that opens but then fails a query (corruption, a partial
    // write) is reported as present-but-unreadable rather than as an error.
    let commits = match store.indexed_commits() {
        Ok(recorded) => recorded,
        Err(_) => return Ok(unreadable()),
    };

    let mut stale: Vec<String> = repos
        .iter()
        .filter(|(name, path)| {
            let current = git2::Repository::open(path)
                .ok()
                .and_then(|r| head_commit(&r));
            !matches!(commits.get(name), Some(indexed) if *indexed == current)
        })
        .map(|(name, _)| name.clone())
        .collect();
    stale.sort();

    Ok(IndexStatus {
        present: true,
        readable: true,
        stale,
    })
}

/// Build the embedder for a run, if one was asked for.
///
/// The returned pair is `(embedder, degraded_reason)`:
/// - `(None, None)` — `semantic` was false, nothing was attempted;
/// - `(Some(_), None)` — the embedder was created from `model_cache_dir`;
/// - `(None, Some(reason))` — creation failed; `reason` is the error text.
fn make_embedder(
    semantic: bool,
    model_cache_dir: &Path,
) -> (Option<Box<dyn Embedder>>, Option<String>) {
    if semantic {
        match embed::create(model_cache_dir) {
            Err(reason) => (None, Some(reason)),
            Ok(embedder) => (Some(embedder), None),
        }
    } else {
        (None, None)
    }
}

/// Object id that `repo`'s HEAD points at, rendered as a string.
///
/// Returns `None` when HEAD cannot be resolved or the resolved reference has
/// no direct target.
fn head_commit(repo: &git2::Repository) -> Option<String> {
    let head = repo.head().ok()?;
    let oid = head.target()?;
    Some(oid.to_string())
}

/// Trim a chunk's content to a bounded snippet, appending an ellipsis when cut.
fn snippet(content: &str) -> String {
    if content.chars().count() <= SNIPPET_MAX_CHARS {
        return content.to_string();
    }
    let truncated: String = content.chars().take(SNIPPET_MAX_CHARS).collect();
    format!("{truncated}")
}

// End-to-end tests for the public entry points (`build_index`, `search`,
// `index_health`) against throwaway git repositories in a temp directory.
#[cfg(test)]
mod tests {
    // Tests unwrap freely, and `limit_bounds_hits` collects `format!` output
    // into a `String`; both are acceptable in test code.
    #![allow(clippy::unwrap_used, clippy::format_collect)]
    use super::*;
    use tempfile::TempDir;

    /// Create a git repo at `parent/name` containing `files` (each a
    /// `(relative_path, body)` pair), stage everything, and record a single
    /// parentless commit on `HEAD`. Returns the repo directory.
    fn init_repo(parent: &Path, name: &str, files: &[(&str, &str)]) -> PathBuf {
        let dir = parent.join(name);
        std::fs::create_dir_all(&dir).unwrap();
        let repo = git2::Repository::init(&dir).unwrap();
        for (rel, body) in files {
            std::fs::write(dir.join(rel), body).unwrap();
        }
        let sig = git2::Signature::now("T", "t@e").unwrap();
        let mut index = repo.index().unwrap();
        index
            .add_all(["*"], git2::IndexAddOption::DEFAULT, None)
            .unwrap();
        index.write().unwrap();
        let tree_id = index.write_tree().unwrap();
        let tree = repo.find_tree(tree_id).unwrap();
        // Empty parent list: this is the repository's first commit.
        repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[])
            .unwrap();
        dir
    }

    // Indexing two repos and searching for a symbol unique to one of them
    // returns that repo's file as the top hit, via the lexical path only.
    #[test]
    fn build_then_search_across_repos() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(
            tmp.path(),
            "api",
            &[("auth.rs", "fn rotate_refresh_token() {}\n")],
        );
        let ui = init_repo(
            tmp.path(),
            "ui",
            &[("button.rs", "fn render_button() {}\n")],
        );
        let repos = vec![("api".to_string(), api), ("ui".to_string(), ui)];

        let outcome = build_index(&data, &repos, false).unwrap();
        assert_eq!(outcome.repos_indexed, 2);
        assert!(outcome.files_indexed >= 2);

        let result = search(&data, "rotate_refresh_token", &[], 5, false).unwrap();
        assert!(!result.hits.is_empty());
        assert_eq!(result.hits[0].repo, "api");
        assert_eq!(result.hits[0].path, "auth.rs");
        assert!(!result.semantic_used);
    }

    // Searching a data directory that was never indexed yields the dedicated
    // `IndexMissing` error rather than a generic failure.
    #[test]
    fn search_without_index_is_index_missing() {
        let tmp = TempDir::new().unwrap();
        let err = search(&tmp.path().join("data"), "anything", &[], 5, false).unwrap_err();
        assert!(matches!(err, RepographError::IndexMissing));
    }

    // The same symbol exists in both repos; a repo filter must restrict the
    // hits to the named repo.
    #[test]
    fn workspace_filter_scopes_results() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn shared_widget() {}\n")]);
        let ui = init_repo(tmp.path(), "ui", &[("b.rs", "fn shared_widget() {}\n")]);
        let repos = vec![("api".to_string(), api), ("ui".to_string(), ui)];
        build_index(&data, &repos, false).unwrap();

        let scoped = search(&data, "shared_widget", &["api".to_string()], 5, false).unwrap();
        assert!(!scoped.hits.is_empty());
        assert!(scoped.hits.iter().all(|h| h.repo == "api"));
    }

    // A query that matches nothing succeeds with an empty hit list.
    #[test]
    fn no_match_is_empty_not_error() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn alpha() {}\n")]);
        build_index(&data, &[("api".to_string(), api)], false).unwrap();
        let result = search(&data, "zzz_nonexistent_symbol_qqq", &[], 5, false).unwrap();
        assert!(result.hits.is_empty());
    }

    // `limit` is an upper bound on the number of hits returned.
    #[test]
    fn limit_bounds_hits() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let body: String = (0..50).map(|n| format!("fn widget_{n}() {{}}\n")).collect();
        let api = init_repo(tmp.path(), "api", &[("w.rs", &body)]);
        build_index(&data, &[("api".to_string(), api)], false).unwrap();
        let result = search(&data, "widget", &[], 3, false).unwrap();
        assert!(result.hits.len() <= 3);
    }

    // Requesting semantic search must never lose the lexical hit. The
    // degradation assertions only apply when the `semantic` feature is off.
    #[test]
    fn semantic_requested_without_feature_degrades_to_lexical() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn parse_csv() {}\n")]);
        build_index(&data, &[("api".to_string(), api)], true).unwrap();
        let result = search(&data, "parse_csv", &[], 5, true).unwrap();
        // Lexical still returns the hit; semantic did not contribute.
        assert!(!result.hits.is_empty());
        if cfg!(not(feature = "semantic")) {
            assert!(!result.semantic_used);
            assert!(result.degraded.is_some());
        }
    }

    // With no index database on disk, health reports "not present" instead
    // of returning an error.
    #[test]
    fn health_missing_index_is_absent_not_error() {
        let tmp = TempDir::new().unwrap();
        let status = index_health(&tmp.path().join("data"), &[]).unwrap();
        assert!(!status.present);
        assert!(status.stale.is_empty());
    }

    // A freshly indexed repo is current; a repo name the index has never
    // seen is reported as stale even though its path is a valid repo.
    #[test]
    fn health_reports_current_and_stale() {
        let tmp = TempDir::new().unwrap();
        let data = tmp.path().join("data");
        let api = init_repo(tmp.path(), "api", &[("a.rs", "fn a() {}\n")]);
        let repos = vec![("api".to_string(), api.clone())];
        build_index(&data, &repos, false).unwrap();

        let status = index_health(&data, &repos).unwrap();
        assert!(status.present && status.readable);
        assert!(status.stale.is_empty(), "freshly indexed repo is current");

        // A repo never indexed is stale.
        let ghost = vec![("ghost".to_string(), api)];
        let mixed = index_health(&data, &ghost).unwrap();
        assert_eq!(mixed.stale, vec!["ghost".to_string()]);
    }
}