Skip to main content

git_lfs_git/
cat_file.rs

1//! `git cat-file --batch[-check]` long-running subprocess wrappers.
2//!
3//! Both flavors keep one git subprocess alive across many queries, which
4//! is critical for scanners that need to inspect thousands of OIDs (one
5//! fork per object would dominate runtime). Send `<oid>\n` on stdin,
6//! parse `<oid> <type> <size>\n` (or `<oid> missing\n`) from stdout.
7//! `--batch` additionally streams `<size>` bytes of content + a trailing
8//! newline after the header.
9//!
10//! See `git-cat-file(1)` § "BATCH OUTPUT".
11
12use std::io::{BufRead, BufReader, Read, Write};
13use std::path::Path;
14use std::process::{Child, ChildStdin, ChildStdout, Command, Stdio};
15
16use crate::Error;
17
/// The header line git prints in reply to one `cat-file --batch[-check]`
/// request, in parsed form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CatFileHeader {
    /// The object exists in the repository.
    Found {
        /// Object id as printed by git in the first header field.
        oid: String,
        /// Object type as printed by git in the second header field
        /// (for example `blob`).
        kind: String,
        /// In-repo content length in bytes (third header field).
        size: u64,
    },
    /// Git answered `<requested-oid> missing`: nothing by that name exists.
    Missing {
        /// The name git echoed back in front of `missing`.
        oid: String,
    },
}
30
/// Full response from `cat-file --batch`: the header fields plus exactly
/// `size` bytes of content. Only produced when the header was
/// [`CatFileHeader::Found`].
///
/// Derives `PartialEq`/`Eq` (like [`CatFileHeader`]) so whole responses can
/// be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlobContent {
    /// Object id from the response header.
    pub oid: String,
    /// Object type from the response header (for example `blob`).
    pub kind: String,
    /// Content length in bytes, as announced by the response header.
    pub size: u64,
    /// The raw object bytes that followed the header.
    pub content: Vec<u8>,
}
40
/// `git cat-file --batch-check` — header-only mode. Use this to decide
/// whether to spend the I/O on reading a blob's content (e.g. filter to
/// blobs ≤ MAX_POINTER_SIZE before paying the read cost).
///
/// One git subprocess is kept alive for the lifetime of the value and is
/// shut down when the value is dropped.
#[derive(Debug)]
pub struct CatFileBatchCheck {
    /// Write end of the child's stdin. Wrapped in `Option` so it can be
    /// taken and closed while the rest of the struct is still alive.
    stdin: Option<ChildStdin>,
    /// Buffered read end of the child's stdout; responses are read from here.
    stdout: BufReader<ChildStdout>,
    /// Handle to the running `git cat-file` process.
    child: Child,
}
49
50impl CatFileBatchCheck {
51    pub fn spawn(cwd: &Path) -> Result<Self, Error> {
52        let mut child = Command::new("git")
53            .arg("-C")
54            .arg(cwd)
55            .args(["cat-file", "--batch-check"])
56            .stdin(Stdio::piped())
57            .stdout(Stdio::piped())
58            .stderr(Stdio::piped())
59            .spawn()?;
60        let stdin = child.stdin.take().expect("piped");
61        let stdout = BufReader::new(child.stdout.take().expect("piped"));
62        Ok(Self {
63            stdin: Some(stdin),
64            stdout,
65            child,
66        })
67    }
68
69    /// Look up one OID. Returns the parsed header.
70    pub fn check(&mut self, oid: &str) -> Result<CatFileHeader, Error> {
71        let stdin = self
72            .stdin
73            .as_mut()
74            .ok_or_else(|| Error::Failed("cat-file --batch-check stdin closed".into()))?;
75        writeln!(stdin, "{oid}")?;
76        stdin.flush()?;
77        let mut line = String::new();
78        self.stdout.read_line(&mut line)?;
79        if line.is_empty() {
80            return Err(Error::Failed(
81                "cat-file --batch-check exited unexpectedly".into(),
82            ));
83        }
84        parse_header(line.trim_end_matches('\n'))
85    }
86}
87
88impl Drop for CatFileBatchCheck {
89    fn drop(&mut self) {
90        // Closing stdin signals cat-file to exit cleanly.
91        drop(self.stdin.take());
92        let _ = self.child.wait();
93    }
94}
95
/// `git cat-file --batch` — header + content mode. Use this once you've
/// narrowed candidates with [`CatFileBatchCheck`] (typically by size).
///
/// One git subprocess is kept alive for the lifetime of the value and is
/// shut down when the value is dropped.
#[derive(Debug)]
pub struct CatFileBatch {
    /// Write end of the child's stdin. Wrapped in `Option` so it can be
    /// taken and closed while the rest of the struct is still alive.
    stdin: Option<ChildStdin>,
    /// Buffered read end of the child's stdout; headers and object bytes
    /// are read from here.
    stdout: BufReader<ChildStdout>,
    /// Handle to the running `git cat-file` process.
    child: Child,
}
103
104impl CatFileBatch {
105    pub fn spawn(cwd: &Path) -> Result<Self, Error> {
106        let mut child = Command::new("git")
107            .arg("-C")
108            .arg(cwd)
109            .args(["cat-file", "--batch"])
110            .stdin(Stdio::piped())
111            .stdout(Stdio::piped())
112            .stderr(Stdio::piped())
113            .spawn()?;
114        let stdin = child.stdin.take().expect("piped");
115        let stdout = BufReader::new(child.stdout.take().expect("piped"));
116        Ok(Self {
117            stdin: Some(stdin),
118            stdout,
119            child,
120        })
121    }
122
123    /// Read one OID. Returns `Ok(None)` if git replied "missing"; otherwise
124    /// the full blob content. Reads exactly `size` bytes after the header,
125    /// then consumes the trailing newline git emits between objects.
126    pub fn read(&mut self, oid: &str) -> Result<Option<BlobContent>, Error> {
127        let stdin = self
128            .stdin
129            .as_mut()
130            .ok_or_else(|| Error::Failed("cat-file --batch stdin closed".into()))?;
131        writeln!(stdin, "{oid}")?;
132        stdin.flush()?;
133        let mut line = String::new();
134        self.stdout.read_line(&mut line)?;
135        if line.is_empty() {
136            return Err(Error::Failed("cat-file --batch exited unexpectedly".into()));
137        }
138        match parse_header(line.trim_end_matches('\n'))? {
139            CatFileHeader::Missing { .. } => Ok(None),
140            CatFileHeader::Found { oid, kind, size } => {
141                let mut content = vec![0u8; size as usize];
142                self.stdout.read_exact(&mut content)?;
143                let mut nl = [0u8; 1];
144                self.stdout.read_exact(&mut nl)?;
145                if nl[0] != b'\n' {
146                    return Err(Error::Failed(format!(
147                        "cat-file --batch: expected trailing newline, got byte 0x{:02x}",
148                        nl[0]
149                    )));
150                }
151                Ok(Some(BlobContent {
152                    oid,
153                    kind,
154                    size,
155                    content,
156                }))
157            }
158        }
159    }
160}
161
162impl Drop for CatFileBatch {
163    fn drop(&mut self) {
164        drop(self.stdin.take());
165        let _ = self.child.wait();
166    }
167}
168
169/// Parse a `cat-file --batch[-check]` header line.
170///
171/// Lines come in two flavors:
172/// - `<oid> <type> <size>` — object found
173/// - `<oid> missing` — object not in the repo
174fn parse_header(line: &str) -> Result<CatFileHeader, Error> {
175    let mut parts = line.splitn(3, ' ');
176    let oid = parts
177        .next()
178        .ok_or_else(|| Error::Failed(format!("cat-file: empty header line {line:?}")))?
179        .to_owned();
180    let second = parts
181        .next()
182        .ok_or_else(|| Error::Failed(format!("cat-file: malformed header {line:?}")))?;
183    if second == "missing" {
184        return Ok(CatFileHeader::Missing { oid });
185    }
186    let size_str = parts
187        .next()
188        .ok_or_else(|| Error::Failed(format!("cat-file: missing size in {line:?}")))?;
189    let size = size_str
190        .parse::<u64>()
191        .map_err(|e| Error::Failed(format!("cat-file: bad size {size_str:?}: {e}")))?;
192    Ok(CatFileHeader::Found {
193        oid,
194        kind: second.to_owned(),
195        size,
196    })
197}
198
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// An object name the tests expect git to report as missing.
    const ABSENT_OID: &str = "0000000000000000000000000000000000000001";

    /// Run `git ls-tree -r HEAD` in `repo_dir` and return the third
    /// whitespace-separated field (the object id) of every output line.
    fn head_blob_oids(repo_dir: &std::path::Path) -> Vec<String> {
        let listing = std::process::Command::new("git")
            .arg("-C")
            .arg(repo_dir)
            .args(["ls-tree", "-r", "HEAD"])
            .output()
            .unwrap();
        String::from_utf8_lossy(&listing.stdout)
            .lines()
            .map(|entry| entry.split_whitespace().nth(2).unwrap().to_owned())
            .collect()
    }

    /// Object id on the first line of the `ls-tree` listing.
    fn first_blob_oid(repo_dir: &std::path::Path) -> String {
        head_blob_oids(repo_dir).into_iter().next().unwrap()
    }

    #[test]
    fn parse_header_found() {
        let parsed = parse_header("abc123 blob 42").unwrap();
        if let CatFileHeader::Found { oid, kind, size } = parsed {
            assert_eq!(oid, "abc123");
            assert_eq!(kind, "blob");
            assert_eq!(size, 42);
        } else {
            panic!("expected Found, got {parsed:?}");
        }
    }

    #[test]
    fn parse_header_missing() {
        let parsed = parse_header("abc123 missing").unwrap();
        assert!(matches!(parsed, CatFileHeader::Missing { oid } if oid == "abc123"));
    }

    #[test]
    fn parse_header_malformed() {
        // Empty line, a single token, and a non-numeric size must all fail.
        for bad in ["", "only-one-token", "oid blob not-a-size"] {
            assert!(parse_header(bad).is_err());
        }
    }

    #[test]
    fn batch_check_known_blob() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"hello");
        let blob_oid = first_blob_oid(repo.path());

        let mut checker = CatFileBatchCheck::spawn(repo.path()).unwrap();
        let header = checker.check(&blob_oid).unwrap();
        if let CatFileHeader::Found { kind, size, .. } = header {
            assert_eq!(kind, "blob");
            assert_eq!(size, 5); // "hello"
        } else {
            panic!("expected Found, got {header:?}");
        }
    }

    #[test]
    fn batch_check_missing_oid() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");
        let mut checker = CatFileBatchCheck::spawn(repo.path()).unwrap();
        let header = checker.check(ABSENT_OID).unwrap();
        if let CatFileHeader::Missing { oid } = header {
            assert_eq!(oid, ABSENT_OID);
        } else {
            panic!("expected Missing, got {header:?}");
        }
    }

    #[test]
    fn batch_reads_content_and_trailing_newline() {
        let repo = init_repo();
        // The payload has a newline in the middle, so a line-based reader
        // would stop early; only the length-based read returns all of it.
        let payload = b"line one\nline two\n";
        commit_file(&repo, "multi.txt", payload);
        let blob_oid = first_blob_oid(repo.path());

        let mut reader = CatFileBatch::spawn(repo.path()).unwrap();
        let blob = reader.read(&blob_oid).unwrap().unwrap();
        assert_eq!(blob.kind, "blob");
        assert_eq!(blob.size, payload.len() as u64);
        assert_eq!(blob.content, payload);
    }

    #[test]
    fn batch_returns_none_for_missing() {
        let repo = init_repo();
        commit_file(&repo, "x.txt", b"x");
        let mut reader = CatFileBatch::spawn(repo.path()).unwrap();
        let reply = reader.read(ABSENT_OID).unwrap();
        assert!(reply.is_none());
    }

    #[test]
    fn batch_handles_many_queries_in_one_session() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"AAA");
        commit_file(&repo, "b.txt", b"BBBB");
        commit_file(&repo, "c.txt", b"CCCCC");

        let oids = head_blob_oids(repo.path());
        assert_eq!(oids.len(), 3);

        // All three lookups go through the same long-lived subprocess.
        let mut reader = CatFileBatch::spawn(repo.path()).unwrap();
        let mut sizes: Vec<u64> = oids
            .iter()
            .map(|oid| reader.read(oid).unwrap().unwrap().size)
            .collect();
        sizes.sort_unstable();
        assert_eq!(sizes, vec![3, 4, 5]);
    }
}