Skip to main content

mkit_git_bridge/
gitsrc.rs

1//! Reading objects out of a local git repository
2//! (SPEC-GIT-IMPORT §2): one long-lived `git cat-file --batch` child
3//! for object bytes, plus `rev-list` / `for-each-ref` / config plumbing.
4//!
5//! git owns wire protocol, auth, and pack storage; this module owns
6//! only the subprocess conversation. All inputs to the child are
7//! 40-hex object ids (never user strings), so the argv/stdin surface
8//! is injection-free by construction.
9
10use crate::error::BridgeError;
11use crate::gitobj::{Sha1Id, sha1_from_hex, sha1_hex};
12use std::io::{BufRead, BufReader, Read, Write};
13use std::path::{Path, PathBuf};
14use std::process::{Child, ChildStdin, ChildStdout, Command, Stdio};
15
/// Hard ceiling (1 GiB) on the size of a single object the bridge is
/// willing to buffer. This is the mkit per-file cap
/// (SPEC-GIT-IMPORT §3.1); checking it up front keeps a hostile
/// upstream from forcing an arbitrarily large allocation.
pub const MAX_OBJECT_BYTES: u64 = 1 << 30;
20
21/// Base `git` command against `repo` with the bridge's subprocess
22/// hygiene applied: `GIT_TERMINAL_PROMPT=0` (a credential prompt must
23/// fail cleanly, never hang a CI run) and user hooks neutralized via
24/// `core.hooksPath` pointed at the platform null device (a
25/// `core.hooksPath` in user config must not run arbitrary hooks
26/// against the private staging mirror). User/system gitconfig is
27/// otherwise INHERITED on purpose: credential helpers, `core.sshCommand`,
28/// and proxy settings live there and remote fetch/push need them —
29/// none of it can affect translation output (mkit generates every
30/// object byte itself).
31#[must_use]
32pub fn git_command(repo: &Path) -> Command {
33    let mut c = Command::new("git");
34    c.arg("-C").arg(repo);
35    apply_hygiene(&mut c);
36    c
37}
38
/// Apply the bridge's subprocess hygiene to a caller-built `git`
/// command that has no `-C <repo>` (e.g. a bare `git --version` probe).
///
/// Sets `GIT_TERMINAL_PROMPT=0` in the child's environment and appends
/// `-c core.hooksPath=<null device>` to its arguments. Because the `-c`
/// pair is appended as arguments, call this before adding the git
/// subcommand.
pub fn apply_hygiene(c: &mut Command) {
    // Platform null device, selected at compile time.
    #[cfg(windows)]
    const NULL_DEVICE: &str = "NUL";
    #[cfg(not(windows))]
    const NULL_DEVICE: &str = "/dev/null";

    c.env("GIT_TERMINAL_PROMPT", "0")
        .arg("-c")
        .arg(format!("core.hooksPath={NULL_DEVICE}"));
}
47
/// The object types that `git cat-file --batch` can report in a
/// response header.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GitObjKind {
    Blob,
    Tree,
    Commit,
    Tag,
}

impl GitObjKind {
    /// Map a type name from a batch header to its kind. The match is
    /// exact and case-sensitive; any other string yields `None`.
    fn from_name(name: &str) -> Option<Self> {
        match name {
            "blob" => Some(Self::Blob),
            "tree" => Some(Self::Tree),
            "commit" => Some(Self::Commit),
            "tag" => Some(Self::Tag),
            _ => None,
        }
    }
}
68
/// Handle on one long-lived `git cat-file --batch` child process,
/// bound to a single repository.
///
/// Batch protocol (verified against git ≥ 2.30):
/// - request: write `<oid>\n` to the child's stdin;
/// - response: `<oid> <type> <size>\n`, then exactly `<size>` body
///   bytes, then one trailing `\n`;
/// - unknown ids answer `<oid> missing\n` with no body, so the stream
///   stays clean.
///
/// Any OTHER read error leaves the stream desynchronized: treat it as
/// fatal for this batch.
#[derive(Debug)]
pub struct CatFileBatch {
    /// The spawned child process.
    child: Child,
    /// Write end of the child's stdin (requests go here).
    stdin: ChildStdin,
    /// Buffered read end of the child's stdout (responses come from here).
    stdout: BufReader<ChildStdout>,
    /// The repository path the child was opened against; used in error
    /// messages.
    repo: PathBuf,
}
83
84impl CatFileBatch {
85    /// Spawn the child against `repo` (a `.git`/bare directory).
86    pub fn open(repo: &Path) -> Result<Self, BridgeError> {
87        let mut child = git_command(repo)
88            .args(["cat-file", "--batch"])
89            .stdin(Stdio::piped())
90            .stdout(Stdio::piped())
91            .stderr(Stdio::null())
92            .spawn()
93            .map_err(|e| BridgeError::Source(format!("spawn git cat-file: {e}")))?;
94        let stdin = child
95            .stdin
96            .take()
97            .ok_or_else(|| BridgeError::Source("cat-file stdin unavailable".into()))?;
98        let stdout = child
99            .stdout
100            .take()
101            .map(BufReader::new)
102            .ok_or_else(|| BridgeError::Source("cat-file stdout unavailable".into()))?;
103        Ok(Self {
104            child,
105            stdin,
106            stdout,
107            repo: repo.to_path_buf(),
108        })
109    }
110
111    /// Read one object's kind + body bytes.
112    pub fn read(&mut self, id: &Sha1Id) -> Result<(GitObjKind, Vec<u8>), BridgeError> {
113        let hex = sha1_hex(id);
114        self.stdin
115            .write_all(format!("{hex}\n").as_bytes())
116            .and_then(|()| self.stdin.flush())
117            .map_err(|e| BridgeError::Source(format!("cat-file write: {e}")))?;
118
119        let mut header = String::new();
120        self.stdout
121            .read_line(&mut header)
122            .map_err(|e| BridgeError::Source(format!("cat-file read: {e}")))?;
123        let header = header.trim_end();
124        let mut parts = header.split(' ');
125        let (Some(echo), Some(kind_or_missing)) = (parts.next(), parts.next()) else {
126            return Err(BridgeError::Source(format!(
127                "cat-file: malformed header {header:?} (repo {})",
128                self.repo.display()
129            )));
130        };
131        if kind_or_missing == "missing" {
132            return Err(BridgeError::Source(format!("object {echo} missing")));
133        }
134        let kind = GitObjKind::from_name(kind_or_missing)
135            .ok_or_else(|| BridgeError::Source(format!("cat-file: unknown type {header:?}")))?;
136        let size: u64 = parts
137            .next()
138            .and_then(|s| s.parse().ok())
139            .ok_or_else(|| BridgeError::Source(format!("cat-file: bad size {header:?}")))?;
140        if size > MAX_OBJECT_BYTES {
141            // Drain body + trailing newline so the batch stream stays
142            // synchronized (callers keep using this child), and refuse
143            // PER-REF: one oversized object must not abort the whole
144            // import (SPEC-GIT-IMPORT §3.1).
145            // checked: a u64::MAX size would wrap `size + 1` to 0 in
146            // release, skip the drain entirely, and silently desync
147            // the batch stream — every later read returns wrong bytes.
148            let Some(mut remaining) = size.checked_add(1) else {
149                return Err(BridgeError::Source(format!(
150                    "object {echo} reports an absurd size ({size}); cat-file \
151                     stream untrustworthy"
152                )));
153            };
154            let mut sink_buf = vec![0u8; 64 * 1024];
155            while remaining > 0 {
156                let take = remaining.min(sink_buf.len() as u64);
157                #[allow(clippy::cast_possible_truncation)] // take <= 64 KiB
158                let take = take as usize;
159                self.stdout
160                    .read_exact(&mut sink_buf[..take])
161                    .map_err(|e| BridgeError::Source(format!("cat-file drain: {e}")))?;
162                remaining -= take as u64;
163            }
164            let mut obj = crate::gitobj::Sha1Id::default();
165            if let Some(parsed) = sha1_from_hex(echo) {
166                obj = parsed;
167            }
168            return Err(crate::error::Refusal::BlobTooLarge {
169                object: {
170                    let mut h = [0u8; 32];
171                    h[..20].copy_from_slice(&obj);
172                    h
173                },
174                size,
175            }
176            .into());
177        }
178        #[allow(clippy::cast_possible_truncation)] // size checked against the cap above
179        let mut body = vec![0u8; size as usize];
180        self.stdout
181            .read_exact(&mut body)
182            .map_err(|e| BridgeError::Source(format!("cat-file body: {e}")))?;
183        let mut nl = [0u8; 1];
184        self.stdout
185            .read_exact(&mut nl)
186            .map_err(|e| BridgeError::Source(format!("cat-file trailer: {e}")))?;
187        Ok((kind, body))
188    }
189}
190
191impl Drop for CatFileBatch {
192    fn drop(&mut self) {
193        // Kill + reap to avoid a zombie (stdin close alone would
194        // also end the batch loop, but kill is prompt and unconditional).
195        let _ = self.child.kill();
196        let _ = self.child.wait();
197    }
198}
199
200fn git_stdout(repo: &Path, args: &[&str]) -> Result<String, BridgeError> {
201    let out = git_command(repo)
202        .args(args)
203        .output()
204        .map_err(|e| BridgeError::Source(format!("spawn git: {e}")))?;
205    if !out.status.success() {
206        return Err(BridgeError::Source(format!(
207            "git {} failed: {}",
208            args.first().copied().unwrap_or(""),
209            String::from_utf8_lossy(&out.stderr).trim()
210        )));
211    }
212    String::from_utf8(out.stdout).map_err(|_| BridgeError::Source("git output not UTF-8".into()))
213}
214
215/// Commit ids reachable from `tips` minus `exclude`, parents-first
216/// (`--reverse --topo-order`), i.e. translation order.
217pub fn rev_list(
218    repo: &Path,
219    tips: &[Sha1Id],
220    exclude: &[Sha1Id],
221) -> Result<Vec<Sha1Id>, BridgeError> {
222    let mut args: Vec<String> = vec!["rev-list".into(), "--reverse".into(), "--topo-order".into()];
223    for t in tips {
224        args.push(sha1_hex(t));
225    }
226    for e in exclude {
227        args.push(format!("^{}", sha1_hex(e)));
228    }
229    let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect();
230    let out = git_stdout(repo, &arg_refs)?;
231    out.lines()
232        .map(|l| {
233            sha1_from_hex(l.trim())
234                .ok_or_else(|| BridgeError::Source(format!("rev-list: bad id {l:?}")))
235        })
236        .collect()
237}
238
239/// One upstream ref as listed in the staging mirror.
240#[derive(Debug, Clone, PartialEq, Eq)]
241pub struct UpstreamRef {
242    /// Full ref name (`refs/heads/main`, `refs/tags/v1`).
243    pub name: String,
244    /// The ref's own id (a tag OBJECT id for annotated tags).
245    pub id: Sha1Id,
246    /// The peeled commit id for annotated tags (`<ref>^{}` rows).
247    pub peeled: Option<Sha1Id>,
248}
249
250/// List `refs/heads/*` and `refs/tags/*` in the mirror, with peels.
251pub fn list_refs(repo: &Path) -> Result<Vec<UpstreamRef>, BridgeError> {
252    let out = git_stdout(
253        repo,
254        &[
255            "for-each-ref",
256            "--format=%(refname) %(objectname) %(*objectname)",
257            "refs/heads",
258            "refs/tags",
259        ],
260    )?;
261    let mut refs = Vec::new();
262    for line in out.lines() {
263        let mut parts = line.split(' ');
264        let (Some(name), Some(id_hex)) = (parts.next(), parts.next()) else {
265            continue;
266        };
267        let Some(id) = sha1_from_hex(id_hex) else {
268            continue;
269        };
270        let peeled = parts
271            .next()
272            .filter(|s| !s.is_empty())
273            .and_then(sha1_from_hex);
274        refs.push(UpstreamRef {
275            name: name.to_owned(),
276            id,
277            peeled,
278        });
279    }
280    Ok(refs)
281}
282
283/// The mirror's default-branch ref (`HEAD` symref target), if any.
284pub fn default_branch(repo: &Path) -> Result<Option<String>, BridgeError> {
285    let out = git_command(repo)
286        .args(["symbolic-ref", "--quiet", "HEAD"])
287        .output()
288        .map_err(|e| BridgeError::Source(format!("spawn git: {e}")))?;
289    if !out.status.success() {
290        return Ok(None); // detached/unborn HEAD
291    }
292    Ok(Some(String::from_utf8_lossy(&out.stdout).trim().to_owned()))
293}
294
295/// Is `old` an ancestor of `new` in this repo? (`git merge-base
296/// --is-ancestor`: exit 0 = yes, 1 = no.)
297pub fn is_ancestor(repo: &Path, old: &Sha1Id, new: &Sha1Id) -> Result<bool, BridgeError> {
298    let st = git_command(repo)
299        .args([
300            "merge-base",
301            "--is-ancestor",
302            &sha1_hex(old),
303            &sha1_hex(new),
304        ])
305        .stdout(Stdio::null())
306        .stderr(Stdio::null())
307        .status()
308        .map_err(|e| BridgeError::Source(format!("spawn git: {e}")))?;
309    match st.code() {
310        Some(0) => Ok(true),
311        Some(1) => Ok(false),
312        _ => Err(BridgeError::Source(
313            "merge-base --is-ancestor failed".into(),
314        )),
315    }
316}
317
318/// Whether the repo still has `id` (`git cat-file -e`). Exit 0 =
319/// present; any nonzero = absent (gc'd, never fetched, garbage).
320pub fn object_exists(repo: &Path, id: &Sha1Id) -> Result<bool, BridgeError> {
321    let st = git_command(repo)
322        .args(["cat-file", "-e", &sha1_hex(id)])
323        .stdout(Stdio::null())
324        .stderr(Stdio::null())
325        .status()
326        .map_err(|e| BridgeError::Source(format!("spawn git: {e}")))?;
327    Ok(st.code() == Some(0))
328}
329
330/// SPEC-GIT-IMPORT §2: SHA-256 upstreams refuse whole-import.
331pub fn is_sha256_repo(repo: &Path) -> Result<bool, BridgeError> {
332    let out = git_command(repo)
333        .args(["config", "extensions.objectformat"])
334        .output()
335        .map_err(|e| BridgeError::Source(format!("spawn git: {e}")))?;
336    // Unset config exits non-zero — that's the sha1 default.
337    Ok(out.status.success()
338        && String::from_utf8_lossy(&out.stdout)
339            .trim()
340            .eq_ignore_ascii_case("sha256"))
341}
342
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `git --version` can be run and exits successfully.
    fn git_available() -> bool {
        let probe = Command::new("git")
            .arg("--version")
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .status();
        matches!(probe, Ok(status) if status.success())
    }

    /// Build a tiny real repo (two commits + an annotated tag) and
    /// return it with the id of HEAD. Returns `None` when git is not
    /// available, so tests skip rather than fail.
    fn fixture() -> Option<(tempfile::TempDir, Sha1Id)> {
        // Fixed identity and dates for every git invocation.
        const IDENTITY: [(&str, &str); 6] = [
            ("GIT_AUTHOR_NAME", "A"),
            ("GIT_AUTHOR_EMAIL", "a@x"),
            ("GIT_COMMITTER_NAME", "C"),
            ("GIT_COMMITTER_EMAIL", "c@x"),
            ("GIT_AUTHOR_DATE", "1700000000 +0000"),
            ("GIT_COMMITTER_DATE", "1700000000 +0000"),
        ];

        if !git_available() {
            return None;
        }
        let td = tempfile::tempdir().unwrap();
        let run = |args: &[&str]| {
            let out = Command::new("git")
                .arg("-C")
                .arg(td.path())
                .args(args)
                .envs(IDENTITY)
                .output()
                .unwrap();
            assert!(out.status.success(), "git {args:?}: {out:?}");
            String::from_utf8_lossy(&out.stdout).trim().to_owned()
        };
        run(&["init", "--quiet", "--initial-branch=main", "."]);
        for (file, contents, message) in [
            ("a.txt", "hello\n", "first"),
            ("b.txt", "world\n", "second"),
        ] {
            std::fs::write(td.path().join(file), contents).unwrap();
            run(&["add", file]);
            run(&["commit", "--quiet", "-m", message]);
        }
        run(&["tag", "-a", "v1", "-m", "tag msg"]);
        let head = sha1_from_hex(&run(&["rev-parse", "HEAD"])).unwrap();
        Some((td, head))
    }

    #[test]
    fn batch_reads_kinds_and_missing() {
        let Some((td, head)) = fixture() else { return };
        let git_dir = td.path().join(".git");
        let mut batch = CatFileBatch::open(&git_dir).unwrap();

        // HEAD is the second commit.
        let (commit_kind, commit_body) = batch.read(&head).unwrap();
        assert_eq!(commit_kind, GitObjKind::Commit);
        let c = crate::gitparse::parse_commit(&commit_body).unwrap();
        assert_eq!(c.message, b"second\n");
        assert_eq!(c.committer.timestamp, 1_700_000_000);

        // The tree, then a blob through the tree.
        let (tree_kind, tree_body) = batch.read(&c.tree).unwrap();
        assert_eq!(tree_kind, GitObjKind::Tree);
        let entries = crate::gitparse::parse_tree(&tree_body).unwrap();
        assert_eq!(entries.len(), 2);
        let (blob_kind, blob) = batch.read(&entries[0].id).unwrap();
        assert_eq!(blob_kind, GitObjKind::Blob);
        assert_eq!(blob, b"hello\n");

        // Missing object errors without poisoning the stream.
        assert!(batch.read(&[0xEEu8; 20]).is_err());
        assert!(batch.read(&head).is_ok(), "stream survives a miss");
    }

    #[test]
    fn rev_list_orders_parents_first_and_excludes() {
        let Some((td, head)) = fixture() else { return };
        let git_dir = td.path().join(".git");

        let all = rev_list(&git_dir, &[head], &[]).unwrap();
        assert_eq!(all.len(), 2);
        assert_eq!(*all.last().unwrap(), head, "tip last (parents first)");

        let inc = rev_list(&git_dir, &[head], &[all[0]]).unwrap();
        assert_eq!(inc, vec![head], "exclusion yields the delta only");
    }

    #[test]
    fn list_refs_peels_tags_and_default_branch() {
        let Some((td, head)) = fixture() else { return };
        let git_dir = td.path().join(".git");
        let refs = list_refs(&git_dir).unwrap();
        let by_name = |wanted: &str| refs.iter().find(|r| r.name == wanted).unwrap();

        let tag = by_name("refs/tags/v1");
        assert_ne!(tag.id, head, "annotated tag has its own object id");
        assert_eq!(tag.peeled, Some(head));

        let main = by_name("refs/heads/main");
        assert_eq!(main.id, head);
        assert_eq!(main.peeled, None);

        assert_eq!(
            default_branch(&git_dir).unwrap().as_deref(),
            Some("refs/heads/main")
        );
        assert!(!is_sha256_repo(&git_dir).unwrap());
    }
}