Skip to main content

git_lfs_git/
scanner.rs

//! Scanner: walk git history, find LFS pointer blobs.
//!
//! This is the entry point used by `git lfs fetch`/`pull`/`push` to
//! enumerate the LFS pointers reachable from a set of refs. The pipeline
//! mirrors upstream:
//!
//! 1. [`rev_list()`] emits every reachable object (commits,
//!    trees, blobs).
//! 2. [`CatFileBatchCheck`] filters those to blobs whose size could fit in
//!    a pointer file (strictly smaller than [`MAX_POINTER_SIZE`]). Only
//!    the object header is inspected at this stage: a cheap header-only
//!    check, no content I/O.
//! 3. [`CatFileBatch`] reads the surviving candidates' content. Each is
//!    parsed as a [`Pointer`]; non-pointers are silently skipped.
//! 4. The output is deduplicated by LFS OID (the pointer's content OID,
//!    not the git blob OID): the same LFS object can appear in many
//!    blobs/paths, but we only need to fetch it once.
17
18use std::collections::HashSet;
19use std::path::{Path, PathBuf};
20use std::process::Command;
21
22use git_lfs_pointer::{MAX_POINTER_SIZE, Oid, Pointer};
23
24use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
25use crate::{Error, rev_list};
26
27/// One LFS pointer discovered by the scanner.
28#[derive(Debug, Clone)]
29pub struct PointerEntry {
30    /// LFS object OID (the `oid sha256:...` field of the pointer file).
31    pub oid: Oid,
32    /// Object size in bytes (per the pointer's `size` field).
33    pub size: u64,
34    /// First working-tree path the pointer was found at. A single LFS
35    /// object can appear under many paths in history; we keep the first.
36    /// Useful for progress display ("downloading foo/bar.bin"); not the
37    /// authoritative source — caller should not rely on it for routing.
38    pub path: Option<PathBuf>,
39    /// `true` if the pointer's source bytes were byte-canonical. Used by
40    /// `git lfs fsck --pointers` to flag pointers that parse but don't
41    /// match the canonical encoding.
42    pub canonical: bool,
43}
44
45/// Walk history reachable from `include` minus `exclude`, return unique
46/// LFS pointers.
47///
48/// Order is undefined and should not be relied on. Callers that want a
49/// stable order should sort the result.
50///
51/// **History semantics**: matches upstream's `ScanRefs` — every blob in
52/// every commit's tree is examined, including blobs that have since been
53/// deleted or modified. This catches LFS objects from the full history
54/// of the named refs, which is what `git lfs fetch <ref>` is documented
55/// to do.
56pub fn scan_pointers(
57    cwd: &Path,
58    include: &[&str],
59    exclude: &[&str],
60) -> Result<Vec<PointerEntry>, Error> {
61    let entries = rev_list(cwd, include, exclude)?;
62
63    // Phase 1: header-only check. Filter to blobs whose size could plausibly
64    // be a pointer file. Tracking name alongside so we can report it.
65    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
66    let mut candidates: Vec<(String, Option<String>)> = Vec::new();
67    for entry in entries {
68        match bcheck.check(&entry.oid)? {
69            CatFileHeader::Found { kind, size, .. }
70                if kind == "blob" && (size as usize) < MAX_POINTER_SIZE =>
71            {
72                candidates.push((entry.oid, entry.name));
73            }
74            // Trees, commits, oversized blobs, missing — all skipped.
75            _ => {}
76        }
77    }
78    drop(bcheck);
79
80    // Phase 2: read content of each candidate, parse as pointer, dedup
81    // by LFS OID. Same LFS object referenced from multiple paths/commits
82    // collapses to one entry (its `path` is the first git emitted).
83    let mut batch = CatFileBatch::spawn(cwd)?;
84    let mut seen: HashSet<Oid> = HashSet::new();
85    let mut out = Vec::new();
86    for (oid, name) in candidates {
87        let Some(blob) = batch.read(&oid)? else { continue };
88        let Ok(pointer) = Pointer::parse(&blob.content) else { continue };
89        if seen.insert(pointer.oid) {
90            out.push(PointerEntry {
91                oid: pointer.oid,
92                size: pointer.size,
93                path: name.map(PathBuf::from),
94                canonical: pointer.canonical,
95            });
96        }
97    }
98    Ok(out)
99}
100
/// One blob found while walking a tree, before any pointer-parsing or
/// size-based filtering. Paths and OIDs are reported verbatim from
/// `git ls-tree`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TreeBlob {
    /// Working-tree path of the blob.
    pub path: PathBuf,
    /// Git blob OID: the object ID string exactly as printed by
    /// `git ls-tree` (not the LFS object OID).
    pub blob_oid: String,
    /// Size of the blob in bytes, per `cat-file --batch-check`.
    pub size: u64,
}
113
114/// Walk the tree at `reference` and return *every* blob — no size filter,
115/// no pointer parsing. Used by `fsck --pointers` for its full-tree sweep
116/// when classifying paths against `.gitattributes`.
117pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
118    let out = Command::new("git")
119        .arg("-C")
120        .arg(cwd)
121        .args(["ls-tree", "--full-tree", "-r", "-z", reference])
122        .output()?;
123    if !out.status.success() {
124        return Err(Error::Failed(format!(
125            "git ls-tree failed: {}",
126            String::from_utf8_lossy(&out.stderr).trim()
127        )));
128    }
129    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
130    let mut blobs = Vec::new();
131    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
132        let s = std::str::from_utf8(record).map_err(|e| {
133            Error::Failed(format!("ls-tree: non-utf8 record: {e}"))
134        })?;
135        let (header, path) = s
136            .split_once('\t')
137            .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
138        let mut parts = header.split_whitespace();
139        let _mode = parts.next();
140        let kind = parts.next();
141        let oid = parts
142            .next()
143            .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
144        if kind != Some("blob") {
145            continue;
146        }
147        if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
148            && kind == "blob"
149        {
150            blobs.push(TreeBlob {
151                path: PathBuf::from(path),
152                blob_oid: oid.to_owned(),
153                size,
154            });
155        }
156    }
157    Ok(blobs)
158}
159
160/// Walk the tree at `reference`, returning one entry per LFS pointer blob.
161///
162/// Unlike [`scan_pointers`], this does *not* walk history and does *not*
163/// dedupe by LFS OID — each path in the tree that points at an LFS
164/// pointer becomes its own entry. Multiple paths pointing at the same
165/// LFS object yield multiple entries, with their working-tree paths
166/// preserved. This matches upstream's `ScanTree` semantics, used by
167/// `ls-files` and `status`.
168///
169/// Paths are read from `git ls-tree -r -z` so embedded newlines or
170/// quoting metacharacters round-trip cleanly.
171pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
172    let out = Command::new("git")
173        .arg("-C")
174        .arg(cwd)
175        .args(["ls-tree", "--full-tree", "-r", "-z", reference])
176        .output()?;
177    if !out.status.success() {
178        return Err(Error::Failed(format!(
179            "git ls-tree failed: {}",
180            String::from_utf8_lossy(&out.stderr).trim()
181        )));
182    }
183
184    // Phase 1: parse `<mode> <type> <oid>\t<path>` records, keep blobs
185    // small enough to be a pointer.
186    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
187    let mut candidates: Vec<(String, String)> = Vec::new();
188    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
189        let s = std::str::from_utf8(record).map_err(|e| {
190            Error::Failed(format!("ls-tree: non-utf8 record: {e}"))
191        })?;
192        let (header, path) = s
193            .split_once('\t')
194            .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
195        let mut parts = header.split_whitespace();
196        let _mode = parts.next();
197        let kind = parts.next();
198        let oid = parts
199            .next()
200            .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
201        if kind != Some("blob") {
202            continue;
203        }
204        if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
205            && kind == "blob"
206            && (size as usize) < MAX_POINTER_SIZE
207        {
208            candidates.push((oid.to_owned(), path.to_owned()));
209        }
210    }
211    drop(bcheck);
212
213    // Phase 2: read each candidate blob, parse as pointer, emit one
214    // entry per path. No OID dedup — that's intentional, callers may
215    // want to know every path an object lives at in this tree.
216    let mut batch = CatFileBatch::spawn(cwd)?;
217    let mut entries = Vec::new();
218    for (oid, path) in candidates {
219        let Some(blob) = batch.read(&oid)? else { continue };
220        let Ok(pointer) = Pointer::parse(&blob.content) else {
221            continue;
222        };
223        entries.push(PointerEntry {
224            oid: pointer.oid,
225            size: pointer.size,
226            path: Some(PathBuf::from(path)),
227            canonical: pointer.canonical,
228        });
229    }
230    Ok(entries)
231}
232
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// Render the canonical pointer file for `content`: spec version line,
    /// SHA-256 of the content as the OID, and the content length as size.
    /// Stands in for `git lfs clean`, so these tests don't need the filter
    /// crate.
    fn pointer_text(content: &[u8]) -> Vec<u8> {
        use sha2::{Digest, Sha256};
        use std::fmt::Write;

        let digest: [u8; 32] = Sha256::digest(content).into();
        // Two lowercase hex digits per digest byte.
        let mut oid_hex = String::with_capacity(2 * digest.len());
        for b in digest {
            let _ = write!(oid_hex, "{b:02x}");
        }
        format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
            content.len()
        )
        .into_bytes()
    }

    #[test]
    fn empty_repo_returns_no_pointers() {
        // The repo holds a single plain file, i.e. no LFS pointers at all.
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"plain content");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert!(result.is_empty());
    }

    #[test]
    fn finds_pointer_blobs_skips_plain_blobs() {
        let payload: &[u8] = b"this would be the actual binary content";
        let repo = init_repo();
        // One plain file and one LFS pointer, committed in that order.
        commit_file(&repo, "plain.txt", b"just text");
        commit_file(&repo, "big.bin", &pointer_text(payload));

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        let entry = &result[0];
        assert_eq!(entry.size, payload.len() as u64);
        assert_eq!(entry.path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn dedups_same_lfs_oid_in_multiple_paths() {
        let repo = init_repo();
        let shared = pointer_text(b"shared payload");
        for path in ["first.bin", "second.bin"] {
            commit_file(&repo, path, &shared);
        }

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        // Identical content gives identical pointer text and thus the same
        // git blob OID; the scan must still report the LFS object once.
        assert_eq!(result.len(), 1, "{result:?}");
    }

    #[test]
    fn finds_pointers_in_history_not_just_tip() {
        let payload: &[u8] = b"deleted later";
        let repo = init_repo();
        // The pointer is replaced by plain content in a later commit.
        // ScanRefs semantics still require it to be found, because the
        // older commit is reachable from HEAD.
        commit_file(&repo, "x.bin", &pointer_text(payload));
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].size, payload.len() as u64);
    }

    #[test]
    fn excludes_filter_history_walk() {
        let repo = init_repo();
        commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
        let base_commit = head_oid(&repo);
        commit_file(&repo, "new.bin", &pointer_text(b"new payload"));

        // HEAD minus the first commit leaves only new.bin's pointer.
        let result = scan_pointers(repo.path(), &["HEAD"], &[&base_commit]).unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].size, b"new payload".len() as u64);
    }

    #[test]
    fn skips_blobs_that_look_like_pointers_but_dont_parse() {
        let repo = init_repo();
        // Small enough to be a candidate, but not a well-formed pointer.
        commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert!(result.is_empty(), "{result:?}");
    }

    #[test]
    fn scan_tree_returns_only_tree_entries_not_history() {
        let repo = init_repo();
        // A pointer that existed in an earlier commit but is gone at HEAD
        // must not be reported: scan_tree looks at the named tree only,
        // which is what ls-files relies on.
        commit_file(&repo, "x.bin", &pointer_text(b"deleted later"));
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_tree(repo.path(), "HEAD").unwrap();

        assert!(result.is_empty(), "{result:?}");
    }

    #[test]
    fn scan_tree_emits_one_entry_per_path_not_per_oid() {
        let repo = init_repo();
        // The same pointer lives at two paths in the current tree, so two
        // entries are expected (scan_pointers would collapse them to one).
        let shared = pointer_text(b"shared payload");
        for path in ["first.bin", "second.bin"] {
            commit_file(&repo, path, &shared);
        }

        let mut result = scan_tree(repo.path(), "HEAD").unwrap();
        result.sort_by(|a, b| a.path.cmp(&b.path));

        assert_eq!(result.len(), 2, "{result:?}");
        let (first, second) = (&result[0], &result[1]);
        assert_eq!(first.path.as_deref(), Some(Path::new("first.bin")));
        assert_eq!(second.path.as_deref(), Some(Path::new("second.bin")));
        // Both paths resolve to the same LFS object.
        assert_eq!(first.oid, second.oid);
    }

    #[test]
    fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        commit_file(&repo, "big.bin", &pointer_text(b"binary content"));

        let result = scan_tree(repo.path(), "HEAD").unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn scan_tree_unknown_ref_errors() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");

        let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();

        // git's stderr should mention the bad ref one way or another.
        if let Error::Failed(msg) = &err {
            assert!(
                msg.contains("does-not-exist") || msg.contains("Not a valid"),
                "unexpected message: {msg}"
            );
        } else {
            panic!("expected Failed, got {err:?}");
        }
    }
}
389}