Skip to main content

git_lfs_git/
scanner.rs

//! Scanner: walk git history, find LFS pointer blobs.
//!
//! This is the entry point used by `git lfs fetch`/`pull`/`push` to
//! enumerate the LFS pointers reachable from a set of refs. The pipeline
//! mirrors upstream:
//!
//! 1. [`rev_list`](crate::rev_list::rev_list) emits every reachable object
//!    (commits, trees, blobs).
//! 2. [`CatFileBatchCheck`] filters those to blobs whose size could fit in
//!    a pointer file (strictly smaller than [`MAX_POINTER_SIZE`]). This is
//!    a cheap header-only check; no blob content is read.
//! 3. [`CatFileBatch`] reads the surviving candidates' content. Each is
//!    parsed as a [`Pointer`]; non-pointers are silently skipped.
//! 4. The output is deduplicated by LFS OID (the pointer's content OID,
//!    not the git blob OID): the same LFS object can appear in many
//!    blobs/paths, but we only need to fetch it once.
17
18use std::path::{Path, PathBuf};
19use std::process::Command;
20
21use git_lfs_pointer::{MAX_POINTER_SIZE, Oid, Pointer};
22
23use crate::Error;
24use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
25
26/// One LFS pointer discovered by the scanner.
27#[derive(Debug, Clone)]
28pub struct PointerEntry {
29    /// LFS object OID (the `oid sha256:...` field of the pointer file).
30    pub oid: Oid,
31    /// Object size in bytes (per the pointer's `size` field).
32    pub size: u64,
33    /// First working-tree path the pointer was found at. A single LFS
34    /// object can appear under many paths in history; we keep the first.
35    /// Useful for progress display ("downloading foo/bar.bin"); not the
36    /// authoritative source — caller should not rely on it for routing.
37    pub path: Option<PathBuf>,
38    /// Every working-tree path the pointer was seen at (across history
39    /// and refs). Callers that filter by path (`--include`/`--exclude`)
40    /// must check this set rather than just `path`, otherwise an LFS
41    /// OID shared between two paths gets filtered out whenever the
42    /// scanner happens to dedup down to the wrong one. Always
43    /// non-empty when `path` is `Some`.
44    pub paths: Vec<PathBuf>,
45    /// `true` if the pointer's source bytes were byte-canonical. Used by
46    /// `git lfs fsck --pointers` to flag pointers that parse but don't
47    /// match the canonical encoding.
48    pub canonical: bool,
49}
50
51/// Walk history reachable from `include` minus `exclude`, return unique
52/// LFS pointers.
53///
54/// Order is undefined and should not be relied on. Callers that want a
55/// stable order should sort the result.
56///
57/// **History semantics**: matches upstream's `ScanRefs` — every blob in
58/// every commit's tree is examined, including blobs that have since been
59/// deleted or modified. This catches LFS objects from the full history
60/// of the named refs, which is what `git lfs fetch <ref>` is documented
61/// to do.
62pub fn scan_pointers(
63    cwd: &Path,
64    include: &[&str],
65    exclude: &[&str],
66) -> Result<Vec<PointerEntry>, Error> {
67    scan_pointers_with_args(cwd, include, exclude, &[])
68}
69
70/// [`scan_pointers`] with extra rev-list cmdline args. See
71/// [`rev_list_with_args`](crate::rev_list_with_args).
72pub fn scan_pointers_with_args(
73    cwd: &Path,
74    include: &[&str],
75    exclude: &[&str],
76    extra_cmdline_args: &[&str],
77) -> Result<Vec<PointerEntry>, Error> {
78    let entries = crate::rev_list::rev_list_with_args(cwd, include, exclude, extra_cmdline_args)?;
79
80    // Phase 1: header-only check. Filter to blobs whose size could plausibly
81    // be a pointer file. Tracking name alongside so we can report it.
82    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
83    let mut candidates: Vec<(String, Option<String>)> = Vec::new();
84    for entry in entries {
85        match bcheck.check(&entry.oid)? {
86            CatFileHeader::Found { kind, size, .. }
87                if kind == "blob" && (size as usize) < MAX_POINTER_SIZE =>
88            {
89                candidates.push((entry.oid, entry.name));
90            }
91            // Trees, commits, oversized blobs, missing — all skipped.
92            _ => {}
93        }
94    }
95    drop(bcheck);
96
97    // Phase 2: read content of each candidate, parse as pointer, dedup
98    // by LFS OID. Same LFS object referenced from multiple paths/commits
99    // collapses to one entry — but we accumulate every path it appeared
100    // at so include/exclude filters can match any of them.
101    let mut batch = CatFileBatch::spawn(cwd)?;
102    let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
103    let mut out: Vec<PointerEntry> = Vec::new();
104    for (oid, name) in candidates {
105        let Some(blob) = batch.read(&oid)? else {
106            continue;
107        };
108        let Ok(pointer) = Pointer::parse(&blob.content) else {
109            continue;
110        };
111        let path_buf = name.map(PathBuf::from);
112        if let Some(&idx) = by_oid.get(&pointer.oid) {
113            if let Some(p) = path_buf
114                && !out[idx].paths.contains(&p)
115            {
116                out[idx].paths.push(p);
117            }
118            continue;
119        }
120        let paths: Vec<PathBuf> = path_buf.iter().cloned().collect();
121        by_oid.insert(pointer.oid, out.len());
122        out.push(PointerEntry {
123            oid: pointer.oid,
124            size: pointer.size,
125            path: path_buf,
126            paths,
127            canonical: pointer.canonical,
128        });
129    }
130    Ok(out)
131}
132
/// A blob encountered while walking a tree, reported before any
/// size-based filtering or pointer parsing. Path and OID are passed
/// through exactly as `git ls-tree` printed them.
#[derive(Clone, Debug)]
pub struct TreeBlob {
    /// Path of the blob within the tree.
    pub path: PathBuf,
    /// OID of the git blob object (not an LFS OID).
    pub blob_oid: String,
    /// Blob size in bytes, as reported by `cat-file --batch-check`.
    pub size: u64,
    /// Tree-entry mode in octal, e.g. `100644`, `100755`, or `120000`
    /// for symlinks. Lets callers classify entries by mode (for example
    /// `fsck --pointers` skipping symlinks).
    pub mode: String,
}
149
150/// Walk the tree at `reference` and return *every* blob — no size filter,
151/// no pointer parsing. Used by `fsck --pointers` for its full-tree sweep
152/// when classifying paths against `.gitattributes`.
153pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
154    let out = Command::new("git")
155        .arg("-C")
156        .arg(cwd)
157        .args(["ls-tree", "--full-tree", "-r", "-z", reference])
158        .output()?;
159    if !out.status.success() {
160        return Err(Error::Failed(format!(
161            "git ls-tree failed: {}",
162            String::from_utf8_lossy(&out.stderr).trim()
163        )));
164    }
165    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
166    let mut blobs = Vec::new();
167    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
168        let s = std::str::from_utf8(record)
169            .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
170        let (header, path) = s
171            .split_once('\t')
172            .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
173        let mut parts = header.split_whitespace();
174        let mode = parts
175            .next()
176            .ok_or_else(|| Error::Failed(format!("ls-tree: missing mode in {s:?}")))?;
177        let kind = parts.next();
178        let oid = parts
179            .next()
180            .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
181        if kind != Some("blob") {
182            continue;
183        }
184        if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
185            && kind == "blob"
186        {
187            blobs.push(TreeBlob {
188                path: PathBuf::from(path),
189                blob_oid: oid.to_owned(),
190                size,
191                mode: mode.to_owned(),
192            });
193        }
194    }
195    Ok(blobs)
196}
197
198/// Walk the tree at `reference`, returning one entry per LFS pointer blob.
199///
200/// Unlike [`scan_pointers`], this does *not* walk history and does *not*
201/// dedupe by LFS OID — each path in the tree that points at an LFS
202/// pointer becomes its own entry. Multiple paths pointing at the same
203/// LFS object yield multiple entries, with their working-tree paths
204/// preserved. This matches upstream's `ScanTree` semantics, used by
205/// `ls-files` and `status`.
206///
207/// Paths are read from `git ls-tree -r -z` so embedded newlines or
208/// quoting metacharacters round-trip cleanly.
209pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
210    let out = Command::new("git")
211        .arg("-C")
212        .arg(cwd)
213        .args(["ls-tree", "--full-tree", "-r", "-z", reference])
214        .output()?;
215    if !out.status.success() {
216        return Err(Error::Failed(format!(
217            "git ls-tree failed: {}",
218            String::from_utf8_lossy(&out.stderr).trim()
219        )));
220    }
221
222    // Phase 1: parse `<mode> <type> <oid>\t<path>` records, keep blobs
223    // small enough to be a pointer.
224    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
225    let mut candidates: Vec<(String, String)> = Vec::new();
226    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
227        let s = std::str::from_utf8(record)
228            .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
229        let (header, path) = s
230            .split_once('\t')
231            .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
232        let mut parts = header.split_whitespace();
233        let _mode = parts.next();
234        let kind = parts.next();
235        let oid = parts
236            .next()
237            .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
238        if kind != Some("blob") {
239            continue;
240        }
241        if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
242            && kind == "blob"
243            && (size as usize) < MAX_POINTER_SIZE
244        {
245            candidates.push((oid.to_owned(), path.to_owned()));
246        }
247    }
248    drop(bcheck);
249
250    // Phase 2: read each candidate blob, parse as pointer, emit one
251    // entry per path. No OID dedup — that's intentional, callers may
252    // want to know every path an object lives at in this tree.
253    let mut batch = CatFileBatch::spawn(cwd)?;
254    let mut entries = Vec::new();
255    for (oid, path) in candidates {
256        let Some(blob) = batch.read(&oid)? else {
257            continue;
258        };
259        let Ok(pointer) = Pointer::parse(&blob.content) else {
260            continue;
261        };
262        let path_buf = PathBuf::from(path);
263        entries.push(PointerEntry {
264            oid: pointer.oid,
265            size: pointer.size,
266            path: Some(path_buf.clone()),
267            paths: vec![path_buf],
268            canonical: pointer.canonical,
269        });
270    }
271    Ok(entries)
272}
273
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// Render the canonical pointer text for `content`: the spec version
    /// line, the SHA-256 of the content as `oid`, and its length as
    /// `size`. Stands in for `git lfs clean` so these tests do not need
    /// the filter crate.
    fn pointer_text(content: &[u8]) -> Vec<u8> {
        use sha2::{Digest, Sha256};
        use std::fmt::Write;

        let digest: [u8; 32] = Sha256::digest(content).into();
        let mut oid_hex = String::with_capacity(digest.len() * 2);
        for b in digest {
            // Writing into a `String` cannot fail.
            let _ = write!(oid_hex, "{b:02x}");
        }
        format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
            content.len()
        )
        .into_bytes()
    }

    /// A repo holding only plain content yields no pointers.
    #[test]
    fn empty_repo_returns_no_pointers() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"plain content");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert!(result.is_empty());
    }

    /// A pointer blob is reported; a plain blob next to it is not.
    #[test]
    fn finds_pointer_blobs_skips_plain_blobs() {
        let payload = b"this would be the actual binary content";
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        let pointer = pointer_text(payload);
        commit_file(&repo, "big.bin", &pointer);

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        let only = &result[0];
        assert_eq!(only.size, payload.len() as u64);
        assert_eq!(only.path.as_deref(), Some(Path::new("big.bin")));
    }

    /// The same pointer committed under two paths is reported once.
    #[test]
    fn dedups_same_lfs_oid_in_multiple_paths() {
        let repo = init_repo();
        let pointer = pointer_text(b"shared payload");
        for name in ["first.bin", "second.bin"] {
            commit_file(&repo, name, &pointer);
        }

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        // Identical pointer text means both paths share one git blob OID,
        // so this only requires that a single entry comes back.
        assert_eq!(result.len(), 1, "{result:?}");
    }

    /// ScanRefs semantics: a pointer overwritten by plain content in a
    /// later commit is still reachable from HEAD's history and must be
    /// found.
    #[test]
    fn finds_pointers_in_history_not_just_tip() {
        let payload = b"deleted later";
        let repo = init_repo();
        let pointer = pointer_text(payload);
        commit_file(&repo, "x.bin", &pointer);
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].size, payload.len() as u64);
    }

    /// Excluding the first commit leaves only the pointer added after it.
    #[test]
    fn excludes_filter_history_walk() {
        let repo = init_repo();
        commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
        let first = head_oid(&repo);
        commit_file(&repo, "new.bin", &pointer_text(b"new payload"));

        let result = scan_pointers(repo.path(), &["HEAD"], &[&first]).unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].size, b"new payload".len() as u64);
    }

    /// Small content that merely resembles a pointer but does not parse
    /// as one is ignored.
    #[test]
    fn skips_blobs_that_look_like_pointers_but_dont_parse() {
        let repo = init_repo();
        commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert!(result.is_empty(), "{result:?}");
    }

    /// `scan_tree` looks only at the named tree: a pointer that exists in
    /// history but is gone at HEAD must not be reported (this is what
    /// ls-files relies on).
    #[test]
    fn scan_tree_returns_only_tree_entries_not_history() {
        let repo = init_repo();
        let pointer = pointer_text(b"deleted later");
        commit_file(&repo, "x.bin", &pointer);
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_tree(repo.path(), "HEAD").unwrap();

        assert!(result.is_empty(), "{result:?}");
    }

    /// The same pointer at two paths in the current tree gives two
    /// entries (`scan_pointers` would collapse them; `scan_tree` must
    /// not).
    #[test]
    fn scan_tree_emits_one_entry_per_path_not_per_oid() {
        let repo = init_repo();
        let pointer = pointer_text(b"shared payload");
        for name in ["first.bin", "second.bin"] {
            commit_file(&repo, name, &pointer);
        }

        let mut result = scan_tree(repo.path(), "HEAD").unwrap();
        result.sort_by(|lhs, rhs| lhs.path.cmp(&rhs.path));

        assert_eq!(result.len(), 2, "{result:?}");
        let (first, second) = (&result[0], &result[1]);
        assert_eq!(first.path.as_deref(), Some(Path::new("first.bin")));
        assert_eq!(second.path.as_deref(), Some(Path::new("second.bin")));
        // Both paths refer to the same LFS object.
        assert_eq!(first.oid, second.oid);
    }

    /// Plain blobs are dropped and pointer blobs are kept, with their
    /// path.
    #[test]
    fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        let pointer = pointer_text(b"binary content");
        commit_file(&repo, "big.bin", &pointer);

        let result = scan_tree(repo.path(), "HEAD").unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    /// An unresolvable ref surfaces as `Error::Failed` carrying git's
    /// message.
    #[test]
    fn scan_tree_unknown_ref_errors() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");

        let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();

        match &err {
            Error::Failed(msg) => {
                let recognised = msg.contains("does-not-exist") || msg.contains("Not a valid");
                assert!(recognised, "unexpected message: {msg}");
            }
            _ => panic!("expected Failed, got {err:?}"),
        }
    }
}