Skip to main content

git_lfs_git/
cat_file.rs

1//! `git cat-file --batch[-check]` long-running subprocess wrappers.
2//!
3//! Both flavors keep one git subprocess alive across many queries, which
4//! is critical for scanners that need to inspect thousands of OIDs (one
5//! fork per object would dominate runtime). Send `<oid>\n` on stdin,
6//! parse `<oid> <type> <size>\n` (or `<oid> missing\n`) from stdout.
7//! `--batch` additionally streams `<size>` bytes of content + a trailing
8//! newline after the header.
9//!
10//! See `git-cat-file(1)` § "BATCH OUTPUT".
11
12use std::io::{BufRead, BufReader, Read, Write};
13use std::path::Path;
14use std::process::{Child, ChildStdin, ChildStdout, Command, Stdio};
15
16use crate::Error;
17
/// A single parsed header line as emitted by `cat-file --batch[-check]`.
///
/// Git answers every request with exactly one such line; in `--batch`
/// mode a [`CatFileHeader::Found`] header is then followed by the object
/// content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CatFileHeader {
    /// The object exists in the repository.
    Found {
        /// Object id as printed by git in the header.
        oid: String,
        /// Object type token from the header (the `<type>` column).
        kind: String,
        /// In-repo content length in bytes.
        size: u64,
    },
    /// Git replied with `<requested-oid> missing`.
    Missing {
        /// The name git echoed back in front of `missing`.
        oid: String,
    },
}
26
/// Everything `cat-file --batch` returns for one existing object: the
/// header fields plus the object's bytes.
///
/// Only produced when the header was [`CatFileHeader::Found`]; `content`
/// holds exactly `size` bytes.
#[derive(Debug, Clone)]
pub struct BlobContent {
    /// Object id from the response header.
    pub oid: String,
    /// Object type token from the response header.
    pub kind: String,
    /// Content length in bytes, as announced by the header.
    pub size: u64,
    /// The raw object bytes that followed the header.
    pub content: Vec<u8>,
}
36
37/// `git cat-file --batch-check` — header-only mode. Use this to decide
38/// whether to spend the I/O on reading a blob's content (e.g. filter to
39/// blobs ≤ MAX_POINTER_SIZE before paying the read cost).
40pub struct CatFileBatchCheck {
41    stdin: Option<ChildStdin>,
42    stdout: BufReader<ChildStdout>,
43    child: Child,
44}
45
46impl CatFileBatchCheck {
47    pub fn spawn(cwd: &Path) -> Result<Self, Error> {
48        let mut child = Command::new("git")
49            .arg("-C")
50            .arg(cwd)
51            .args(["cat-file", "--batch-check"])
52            .stdin(Stdio::piped())
53            .stdout(Stdio::piped())
54            .stderr(Stdio::piped())
55            .spawn()?;
56        let stdin = child.stdin.take().expect("piped");
57        let stdout = BufReader::new(child.stdout.take().expect("piped"));
58        Ok(Self { stdin: Some(stdin), stdout, child })
59    }
60
61    /// Look up one OID. Returns the parsed header.
62    pub fn check(&mut self, oid: &str) -> Result<CatFileHeader, Error> {
63        let stdin = self
64            .stdin
65            .as_mut()
66            .ok_or_else(|| Error::Failed("cat-file --batch-check stdin closed".into()))?;
67        writeln!(stdin, "{oid}")?;
68        stdin.flush()?;
69        let mut line = String::new();
70        self.stdout.read_line(&mut line)?;
71        if line.is_empty() {
72            return Err(Error::Failed(
73                "cat-file --batch-check exited unexpectedly".into(),
74            ));
75        }
76        parse_header(line.trim_end_matches('\n'))
77    }
78}
79
80impl Drop for CatFileBatchCheck {
81    fn drop(&mut self) {
82        // Closing stdin signals cat-file to exit cleanly.
83        drop(self.stdin.take());
84        let _ = self.child.wait();
85    }
86}
87
88/// `git cat-file --batch` — header + content mode. Use this once you've
89/// narrowed candidates with [`CatFileBatchCheck`] (typically by size).
90pub struct CatFileBatch {
91    stdin: Option<ChildStdin>,
92    stdout: BufReader<ChildStdout>,
93    child: Child,
94}
95
96impl CatFileBatch {
97    pub fn spawn(cwd: &Path) -> Result<Self, Error> {
98        let mut child = Command::new("git")
99            .arg("-C")
100            .arg(cwd)
101            .args(["cat-file", "--batch"])
102            .stdin(Stdio::piped())
103            .stdout(Stdio::piped())
104            .stderr(Stdio::piped())
105            .spawn()?;
106        let stdin = child.stdin.take().expect("piped");
107        let stdout = BufReader::new(child.stdout.take().expect("piped"));
108        Ok(Self { stdin: Some(stdin), stdout, child })
109    }
110
111    /// Read one OID. Returns `Ok(None)` if git replied "missing"; otherwise
112    /// the full blob content. Reads exactly `size` bytes after the header,
113    /// then consumes the trailing newline git emits between objects.
114    pub fn read(&mut self, oid: &str) -> Result<Option<BlobContent>, Error> {
115        let stdin = self
116            .stdin
117            .as_mut()
118            .ok_or_else(|| Error::Failed("cat-file --batch stdin closed".into()))?;
119        writeln!(stdin, "{oid}")?;
120        stdin.flush()?;
121        let mut line = String::new();
122        self.stdout.read_line(&mut line)?;
123        if line.is_empty() {
124            return Err(Error::Failed("cat-file --batch exited unexpectedly".into()));
125        }
126        match parse_header(line.trim_end_matches('\n'))? {
127            CatFileHeader::Missing { .. } => Ok(None),
128            CatFileHeader::Found { oid, kind, size } => {
129                let mut content = vec![0u8; size as usize];
130                self.stdout.read_exact(&mut content)?;
131                let mut nl = [0u8; 1];
132                self.stdout.read_exact(&mut nl)?;
133                if nl[0] != b'\n' {
134                    return Err(Error::Failed(format!(
135                        "cat-file --batch: expected trailing newline, got byte 0x{:02x}",
136                        nl[0]
137                    )));
138                }
139                Ok(Some(BlobContent { oid, kind, size, content }))
140            }
141        }
142    }
143}
144
145impl Drop for CatFileBatch {
146    fn drop(&mut self) {
147        drop(self.stdin.take());
148        let _ = self.child.wait();
149    }
150}
151
152/// Parse a `cat-file --batch[-check]` header line.
153///
154/// Lines come in two flavors:
155/// - `<oid> <type> <size>` — object found
156/// - `<oid> missing` — object not in the repo
157fn parse_header(line: &str) -> Result<CatFileHeader, Error> {
158    let mut parts = line.splitn(3, ' ');
159    let oid = parts
160        .next()
161        .ok_or_else(|| Error::Failed(format!("cat-file: empty header line {line:?}")))?
162        .to_owned();
163    let second = parts
164        .next()
165        .ok_or_else(|| Error::Failed(format!("cat-file: malformed header {line:?}")))?;
166    if second == "missing" {
167        return Ok(CatFileHeader::Missing { oid });
168    }
169    let size_str = parts
170        .next()
171        .ok_or_else(|| Error::Failed(format!("cat-file: missing size in {line:?}")))?;
172    let size = size_str
173        .parse::<u64>()
174        .map_err(|e| Error::Failed(format!("cat-file: bad size {size_str:?}: {e}")))?;
175    Ok(CatFileHeader::Found {
176        oid,
177        kind: second.to_owned(),
178        size,
179    })
180}
181
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// OIDs of every entry `git ls-tree -r HEAD` lists for the repo at
    /// `repo_dir`, in listing order. The OID is the third
    /// whitespace-separated token of each output line.
    fn head_blob_oids(repo_dir: &std::path::Path) -> Vec<String> {
        let listing = std::process::Command::new("git")
            .arg("-C")
            .arg(repo_dir)
            .args(["ls-tree", "-r", "HEAD"])
            .output()
            .unwrap();
        String::from_utf8_lossy(&listing.stdout)
            .lines()
            .map(|entry| entry.split_whitespace().nth(2).unwrap().to_owned())
            .collect()
    }

    #[test]
    fn parse_header_found() {
        match parse_header("abc123 blob 42").unwrap() {
            CatFileHeader::Found { oid, kind, size } => {
                assert_eq!(oid, "abc123");
                assert_eq!(kind, "blob");
                assert_eq!(size, 42);
            }
            other => panic!("expected Found, got {other:?}"),
        }
    }

    #[test]
    fn parse_header_missing() {
        let header = parse_header("abc123 missing").unwrap();
        assert_eq!(header, CatFileHeader::Missing { oid: "abc123".to_owned() });
    }

    #[test]
    fn parse_header_malformed() {
        for bad in ["", "only-one-token", "oid blob not-a-size"] {
            assert!(parse_header(bad).is_err());
        }
    }

    #[test]
    fn batch_check_known_blob() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"hello");
        // The repo holds a single file, so the first listed OID is its blob.
        let oids = head_blob_oids(repo.path());
        let blob_oid = oids.first().unwrap();

        let mut checker = CatFileBatchCheck::spawn(repo.path()).unwrap();
        match checker.check(blob_oid).unwrap() {
            CatFileHeader::Found { kind, size, .. } => {
                assert_eq!(kind, "blob");
                assert_eq!(size, 5); // "hello"
            }
            other => panic!("expected Found, got {other:?}"),
        }
    }

    #[test]
    fn batch_check_missing_oid() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");
        let absent = "0000000000000000000000000000000000000001";

        let mut checker = CatFileBatchCheck::spawn(repo.path()).unwrap();
        let header = checker.check(absent).unwrap();
        match header {
            CatFileHeader::Missing { oid } => assert_eq!(oid, absent),
            other => panic!("expected Missing, got {other:?}"),
        }
    }

    #[test]
    fn batch_reads_content_and_trailing_newline() {
        let repo = init_repo();
        // The payload deliberately contains newlines, so a line-based
        // reader would return the wrong bytes; only a length-based read
        // gets this right.
        let content = b"line one\nline two\n";
        commit_file(&repo, "multi.txt", content);
        let blob_oid = head_blob_oids(repo.path()).into_iter().next().unwrap();

        let mut reader = CatFileBatch::spawn(repo.path()).unwrap();
        let blob = reader.read(&blob_oid).unwrap().unwrap();
        assert_eq!(blob.kind, "blob");
        assert_eq!(blob.size, content.len() as u64);
        assert_eq!(blob.content, content);
    }

    #[test]
    fn batch_returns_none_for_missing() {
        let repo = init_repo();
        commit_file(&repo, "x.txt", b"x");
        let mut reader = CatFileBatch::spawn(repo.path()).unwrap();
        let reply = reader
            .read("0000000000000000000000000000000000000001")
            .unwrap();
        assert!(reply.is_none());
    }

    #[test]
    fn batch_handles_many_queries_in_one_session() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"AAA");
        commit_file(&repo, "b.txt", b"BBBB");
        commit_file(&repo, "c.txt", b"CCCCC");

        // Collect all blob OIDs.
        let oids = head_blob_oids(repo.path());
        assert_eq!(oids.len(), 3);

        // Every read goes through the same subprocess.
        let mut reader = CatFileBatch::spawn(repo.path()).unwrap();
        let mut sizes: Vec<u64> = oids
            .iter()
            .map(|oid| reader.read(oid).unwrap().unwrap().size)
            .collect();
        sizes.sort_unstable();
        assert_eq!(sizes, vec![3, 4, 5]);
    }
}
314}