Skip to main content

git_lfs_git/
scanner.rs

1//! Scanner: walk git history, find LFS pointer blobs.
2//!
3//! This is the entry point used by `git lfs fetch`/`pull`/`push` to
4//! enumerate the LFS pointers reachable from a set of refs. The pipeline
5//! mirrors upstream:
6//!
7//! 1. [`rev_list`](crate::rev_list::rev_list) emits every reachable object
8//!    (commits, trees, blobs).
9//! 2. [`CatFileBatchCheck`] filters those to blobs whose size could fit in
10//!    a pointer file (< [`MAX_POINTER_SIZE`]). Only object headers are
11//!    consulted, so this is a cheap check with no content I/O.
12//! 3. [`CatFileBatch`] reads the surviving candidates' content. Each is
13//!    parsed as a [`Pointer`]; non-pointers are silently skipped.
14//! 4. The output is deduplicated by LFS OID (the pointer's content OID,
15//!    not the git blob OID): the same LFS object can appear in many
16//!    blobs/paths, but we only need to fetch it once.
17
18use std::io::{BufRead, BufReader};
19use std::path::{Path, PathBuf};
20use std::process::{Command, Stdio};
21use std::time::{SystemTime, UNIX_EPOCH};
22
23use git_lfs_pointer::{Extension, MAX_POINTER_SIZE, Oid, Pointer};
24
25use crate::Error;
26use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
27
/// One LFS pointer discovered by the scanner.
///
/// The blob-reading scanners in this module ([`scan_pointers`],
/// [`scan_index_lfs`], [`scan_tree`], [`scan_index_pointers`]) copy
/// `oid`, `size`, `canonical` and `extensions` straight from the parsed
/// [`Pointer`]; `path`/`paths` come from the git listing that named the
/// blob.
#[derive(Debug, Clone)]
pub struct PointerEntry {
    /// LFS object OID (the `oid sha256:...` field of the pointer file).
    /// This is the content OID, not the OID of the git blob that holds
    /// the pointer text.
    pub oid: Oid,
    /// Object size in bytes (per the pointer's `size` field).
    pub size: u64,
    /// First working-tree path the pointer was found at. A single LFS
    /// object can appear under many paths in history; we keep the first.
    /// Useful for progress display ("downloading foo/bar.bin"); not the
    /// authoritative source — caller should not rely on it for routing.
    /// `None` when the scanner had no path for the blob (e.g. a rev-list
    /// entry that carried no name).
    pub path: Option<PathBuf>,
    /// Every working-tree path the pointer was seen at (across history
    /// and refs). Callers that filter by path (`--include`/`--exclude`)
    /// must check this set rather than just `path`, otherwise an LFS
    /// OID shared between two paths gets filtered out whenever the
    /// scanner happens to dedup down to the wrong one. Always
    /// non-empty when `path` is `Some`.
    pub paths: Vec<PathBuf>,
    /// `true` if the pointer's source bytes were byte-canonical. Used by
    /// `git lfs fsck --pointers` to flag pointers that parse but don't
    /// match the canonical encoding.
    pub canonical: bool,
    /// Pointer extensions in priority-ascending order, mirroring
    /// `Pointer::extensions`. Empty for plain pointers; non-empty when
    /// the file was committed through a configured `lfs.extension.<n>`
    /// chain. The materialize/checkout paths replay these in reverse to
    /// reconstruct the working-tree content.
    pub extensions: Vec<Extension>,
}
58
59/// Walk history reachable from `include` minus `exclude`, return unique
60/// LFS pointers.
61///
62/// Order is undefined and should not be relied on. Callers that want a
63/// stable order should sort the result.
64///
65/// **History semantics**: matches upstream's `ScanRefs` — every blob in
66/// every commit's tree is examined, including blobs that have since been
67/// deleted or modified. This catches LFS objects from the full history
68/// of the named refs, which is what `git lfs fetch <ref>` is documented
69/// to do.
70pub fn scan_pointers(
71    cwd: &Path,
72    include: &[&str],
73    exclude: &[&str],
74) -> Result<Vec<PointerEntry>, Error> {
75    scan_pointers_with_args(cwd, include, exclude, &[])
76}
77
78/// [`scan_pointers`] with extra rev-list cmdline args. See
79/// [`rev_list_with_args`](crate::rev_list::rev_list_with_args).
80pub fn scan_pointers_with_args(
81    cwd: &Path,
82    include: &[&str],
83    exclude: &[&str],
84    extra_cmdline_args: &[&str],
85) -> Result<Vec<PointerEntry>, Error> {
86    let entries = crate::rev_list::rev_list_with_args(cwd, include, exclude, extra_cmdline_args)?;
87
88    // Phase 1: header-only check. Filter to blobs whose size could plausibly
89    // be a pointer file. Tracking name alongside so we can report it.
90    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
91    let mut candidates: Vec<(String, Option<String>)> = Vec::new();
92    for entry in entries {
93        match bcheck.check(&entry.oid)? {
94            CatFileHeader::Found { kind, size, .. }
95                if kind == "blob" && (size as usize) < MAX_POINTER_SIZE =>
96            {
97                candidates.push((entry.oid, entry.name));
98            }
99            // Trees, commits, oversized blobs, missing — all skipped.
100            _ => {}
101        }
102    }
103    drop(bcheck);
104
105    // Phase 2: read content of each candidate, parse as pointer, dedup
106    // by LFS OID. Same LFS object referenced from multiple paths/commits
107    // collapses to one entry — but we accumulate every path it appeared
108    // at so include/exclude filters can match any of them.
109    let mut batch = CatFileBatch::spawn(cwd)?;
110    let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
111    let mut out: Vec<PointerEntry> = Vec::new();
112    for (oid, name) in candidates {
113        let Some(blob) = batch.read(&oid)? else {
114            continue;
115        };
116        let Ok(pointer) = Pointer::parse(&blob.content) else {
117            continue;
118        };
119        let path_buf = name.map(PathBuf::from);
120        if let Some(&idx) = by_oid.get(&pointer.oid) {
121            if let Some(p) = path_buf
122                && !out[idx].paths.contains(&p)
123            {
124                out[idx].paths.push(p);
125            }
126            continue;
127        }
128        let paths: Vec<PathBuf> = path_buf.iter().cloned().collect();
129        by_oid.insert(pointer.oid, out.len());
130        out.push(PointerEntry {
131            oid: pointer.oid,
132            size: pointer.size,
133            path: path_buf,
134            paths,
135            canonical: pointer.canonical,
136            extensions: pointer.extensions.clone(),
137        });
138    }
139    Ok(out)
140}
141
142/// Scan the index for LFS pointers via
143/// `git ls-files --stage -z -- :(attr:filter=lfs)`.
144///
145/// Honors sparse-checkout (only entries in the sparse cone are listed)
146/// and works in bare repos against whatever's been written into the
147/// index. Empty result when the index is empty or no path matches the
148/// `filter=lfs` attribute. Symlinks (mode 120000) are skipped — they
149/// can never be LFS pointers.
150///
151/// This is the discovery path upstream's pull / fetch use on Git 2.42+;
152/// it sidesteps the rev-list traversal that's expensive on partial
153/// clones with `--filter=tree:0` and over-broad in bare repos with no
154/// committed `.gitattributes` reachable via the index.
155pub fn scan_index_lfs(cwd: &Path) -> Result<Vec<PointerEntry>, Error> {
156    // Run from the work-tree top (or git-dir for bare): `git ls-files`
157    // from a subdir restricts output to that subdir's entries, so
158    // running from `repo/dir1/` would miss `repo/a.dat`. Resolve via
159    // `--show-toplevel` first; fall back to the git-dir for bare repos
160    // (which legitimately have no work tree).
161    let scan_cwd = match crate::run_git(cwd, &["rev-parse", "--show-toplevel"]) {
162        Ok(s) if !s.is_empty() => PathBuf::from(s),
163        _ => crate::run_git(cwd, &["rev-parse", "--absolute-git-dir"])
164            .map(PathBuf::from)
165            .unwrap_or_else(|_| cwd.to_path_buf()),
166    };
167    // Apply a parent-dir-existence filter only when there's a reason
168    // to: cone-mode sparse-checkout marks out-of-cone entries by
169    // omitting their working-tree parents, and bare repos have no
170    // working-tree subdirs at all. For ordinary checkouts where the
171    // user just `rm`'d a file, we want to fetch and restore — not
172    // skip — so the filter stays off.
173    let filter_by_parent_dir = is_bare_repo(&scan_cwd) || is_sparse_checkout(&scan_cwd);
174
175    let out = Command::new("git")
176        .arg("-C")
177        .arg(&scan_cwd)
178        .args(["ls-files", "--stage", "-z", "--", ":(attr:filter=lfs)"])
179        .output()?;
180    if !out.status.success() {
181        return Err(Error::Failed(
182            String::from_utf8_lossy(&out.stderr).trim().to_owned(),
183        ));
184    }
185
186    let mut candidates: Vec<(String, PathBuf)> = Vec::new();
187    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
188        let s = match std::str::from_utf8(record) {
189            Ok(s) => s,
190            Err(_) => continue,
191        };
192        // `<mode> SP <oid> SP <stage>\t<path>`
193        let Some((meta, path)) = s.split_once('\t') else {
194            continue;
195        };
196        let parts: Vec<&str> = meta.split_whitespace().collect();
197        if parts.len() < 3 {
198            continue;
199        }
200        let mode = parts[0];
201        let oid = parts[1];
202        if mode == "120000" {
203            continue;
204        }
205        let path = PathBuf::from(path);
206        // Skip paths whose parent dir isn't materialized in the work
207        // tree: that's how cone-mode sparse-checkout marks out-of-cone
208        // entries when ls-files emits the *expanded* index (the trees
209        // are local but the working-tree dirs were never created).
210        // The same check naturally drops non-root entries in bare
211        // repos, where only the top-level scan_cwd exists as a
212        // directory. Skipped on plain checkouts so a user `rm`'d
213        // file still gets restored by `git lfs pull`.
214        if filter_by_parent_dir
215            && let Some(parent) = path.parent()
216            && !parent.as_os_str().is_empty()
217            && !scan_cwd.join(parent).is_dir()
218        {
219            continue;
220        }
221        candidates.push((oid.to_string(), path));
222    }
223    if candidates.is_empty() {
224        return Ok(Vec::new());
225    }
226
227    let mut batch = CatFileBatch::spawn(cwd)?;
228    let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
229    let mut out: Vec<PointerEntry> = Vec::new();
230    for (oid, path) in candidates {
231        let Some(blob) = batch.read(&oid)? else {
232            continue;
233        };
234        let Ok(pointer) = Pointer::parse(&blob.content) else {
235            continue;
236        };
237        if let Some(&idx) = by_oid.get(&pointer.oid) {
238            if !out[idx].paths.contains(&path) {
239                out[idx].paths.push(path);
240            }
241            continue;
242        }
243        by_oid.insert(pointer.oid, out.len());
244        out.push(PointerEntry {
245            oid: pointer.oid,
246            size: pointer.size,
247            path: Some(path.clone()),
248            paths: vec![path],
249            canonical: pointer.canonical,
250            extensions: pointer.extensions.clone(),
251        });
252    }
253    Ok(out)
254}
255
256fn is_bare_repo(cwd: &Path) -> bool {
257    crate::run_git(cwd, &["rev-parse", "--is-bare-repository"])
258        .map(|s| s.trim() == "true")
259        .unwrap_or(false)
260}
261
262fn is_sparse_checkout(cwd: &Path) -> bool {
263    crate::run_git(cwd, &["config", "--get", "core.sparseCheckout"])
264        .map(|s| s.trim().eq_ignore_ascii_case("true"))
265        .unwrap_or(false)
266}
267
/// One blob found while walking a tree, before any pointer-parsing or
/// size-based filtering. Paths and OIDs are reported verbatim from
/// `git ls-tree`.
///
/// Produced by [`scan_tree_blobs`]; unlike [`PointerEntry`], nothing
/// here says whether the blob is an LFS pointer.
#[derive(Debug, Clone)]
pub struct TreeBlob {
    /// Working-tree path of the blob.
    pub path: PathBuf,
    /// Git blob OID (the object ID of the blob in the object database,
    /// exactly as `git ls-tree` printed it).
    pub blob_oid: String,
    /// Size of the blob in bytes, per `cat-file --batch-check`.
    pub size: u64,
    /// Git tree-entry mode in octal (e.g. `100644`, `100755`,
    /// `120000` for symlinks). Callers that classify entries by
    /// mode (e.g. `fsck --pointers` skipping symlinks) read this.
    pub mode: String,
}
284
285/// Walk the tree at `reference` and return *every* blob — no size filter,
286/// no pointer parsing. Used by `fsck --pointers` for its full-tree sweep
287/// when classifying paths against `.gitattributes`.
288pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
289    // `git ls-tree` only takes a tree-ish, not a range. For a `<a>..<b>`
290    // reference (used by `git lfs fsck HEAD^..HEAD`), walk every commit
291    // in the range and union their tree blobs (deduped by path+oid). A
292    // bare ref still takes the cheap one-shot path.
293    if reference.contains("..") {
294        return scan_blobs_in_range(cwd, reference);
295    }
296    scan_tree_blobs_for_ref(cwd, reference)
297}
298
299fn scan_tree_blobs_for_ref(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
300    let out = Command::new("git")
301        .arg("-C")
302        .arg(cwd)
303        .args(["ls-tree", "--full-tree", "-r", "-z", reference])
304        .output()?;
305    if !out.status.success() {
306        return Err(Error::Failed(format!(
307            "git ls-tree failed: {}",
308            String::from_utf8_lossy(&out.stderr).trim()
309        )));
310    }
311    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
312    let mut blobs = Vec::new();
313    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
314        let s = std::str::from_utf8(record)
315            .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
316        let (header, path) = s
317            .split_once('\t')
318            .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
319        let mut parts = header.split_whitespace();
320        let mode = parts
321            .next()
322            .ok_or_else(|| Error::Failed(format!("ls-tree: missing mode in {s:?}")))?;
323        let kind = parts.next();
324        let oid = parts
325            .next()
326            .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
327        if kind != Some("blob") {
328            continue;
329        }
330        if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
331            && kind == "blob"
332        {
333            blobs.push(TreeBlob {
334                path: PathBuf::from(path),
335                blob_oid: oid.to_owned(),
336                size,
337                mode: mode.to_owned(),
338            });
339        }
340    }
341    Ok(blobs)
342}
343
344/// Expand a `<a>..<b>` rev-range into the concrete commits it names
345/// and union their tree blobs (deduped by path + blob OID). Mirrors
346/// upstream's behavior for `git lfs fsck HEAD^..HEAD`: every blob
347/// reachable from any commit in the range is checked once.
348fn scan_blobs_in_range(cwd: &Path, range: &str) -> Result<Vec<TreeBlob>, Error> {
349    let out = Command::new("git")
350        .arg("-C")
351        .arg(cwd)
352        .args(["rev-list", range])
353        .output()?;
354    if !out.status.success() {
355        return Err(Error::Failed(format!(
356            "git rev-list failed: {}",
357            String::from_utf8_lossy(&out.stderr).trim()
358        )));
359    }
360    let mut seen: std::collections::HashSet<(PathBuf, String)> = std::collections::HashSet::new();
361    let mut all = Vec::new();
362    for line in String::from_utf8_lossy(&out.stdout).lines() {
363        let commit = line.trim();
364        if commit.is_empty() {
365            continue;
366        }
367        for blob in scan_tree_blobs_for_ref(cwd, commit)? {
368            if seen.insert((blob.path.clone(), blob.blob_oid.clone())) {
369                all.push(blob);
370            }
371        }
372    }
373    Ok(all)
374}
375
376/// Walk the tree at `reference`, returning one entry per LFS pointer blob.
377///
378/// Unlike [`scan_pointers`], this does *not* walk history and does *not*
379/// dedupe by LFS OID — each path in the tree that points at an LFS
380/// pointer becomes its own entry. Multiple paths pointing at the same
381/// LFS object yield multiple entries, with their working-tree paths
382/// preserved. This matches upstream's `ScanTree` semantics, used by
383/// `ls-files` and `status`.
384///
385/// Paths are read from `git ls-tree -r -z` so embedded newlines or
386/// quoting metacharacters round-trip cleanly.
387pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
388    let out = Command::new("git")
389        .arg("-C")
390        .arg(cwd)
391        .args(["ls-tree", "--full-tree", "-r", "-z", reference])
392        .output()?;
393    if !out.status.success() {
394        return Err(Error::Failed(format!(
395            "git ls-tree failed: {}",
396            String::from_utf8_lossy(&out.stderr).trim()
397        )));
398    }
399
400    // Phase 1: parse `<mode> <type> <oid>\t<path>` records, keep blobs
401    // small enough to be a pointer.
402    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
403    let mut candidates: Vec<(String, String)> = Vec::new();
404    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
405        let s = std::str::from_utf8(record)
406            .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
407        let (header, path) = s
408            .split_once('\t')
409            .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
410        let mut parts = header.split_whitespace();
411        let _mode = parts.next();
412        let kind = parts.next();
413        let oid = parts
414            .next()
415            .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
416        if kind != Some("blob") {
417            continue;
418        }
419        if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
420            && kind == "blob"
421            && (size as usize) < MAX_POINTER_SIZE
422        {
423            candidates.push((oid.to_owned(), path.to_owned()));
424        }
425    }
426    drop(bcheck);
427
428    // Phase 2: read each candidate blob, parse as pointer, emit one
429    // entry per path. No OID dedup — that's intentional, callers may
430    // want to know every path an object lives at in this tree.
431    let mut batch = CatFileBatch::spawn(cwd)?;
432    let mut entries = Vec::new();
433    for (oid, path) in candidates {
434        let Some(blob) = batch.read(&oid)? else {
435            continue;
436        };
437        let Ok(pointer) = Pointer::parse(&blob.content) else {
438            continue;
439        };
440        let path_buf = PathBuf::from(path);
441        entries.push(PointerEntry {
442            oid: pointer.oid,
443            size: pointer.size,
444            path: Some(path_buf.clone()),
445            paths: vec![path_buf],
446            canonical: pointer.canonical,
447            extensions: pointer.extensions.clone(),
448        });
449    }
450    Ok(entries)
451}
452
453/// LFS pointers in the index or working tree that *differ* from `ref`
454/// (typically `HEAD`). Mirrors upstream's `lfs/gitscanner_index.go::
455/// scanIndex`: runs `git diff-index <ref>` and `git diff-index --cached
456/// <ref>` to surface staged + working-tree changes, then dedupes by
457/// (sha, path).
458///
459/// Returns only pointers — small blobs that parse as LFS pointer text.
460/// Symlinks and gitlinks (their dst-mode in diff-index output) are
461/// skipped. Used by prune retention so a staged-but-uncommitted
462/// pointer doesn't get pruned out from under the user.
463pub fn scan_index_pointers(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
464    let scan_cwd = match crate::run_git(cwd, &["rev-parse", "--show-toplevel"]) {
465        Ok(s) if !s.is_empty() => PathBuf::from(s),
466        _ => crate::run_git(cwd, &["rev-parse", "--absolute-git-dir"])
467            .map(PathBuf::from)
468            .unwrap_or_else(|_| cwd.to_path_buf()),
469    };
470
471    let mut candidates: Vec<(String, PathBuf)> = Vec::new();
472    let mut seen: std::collections::HashSet<(String, PathBuf)> = std::collections::HashSet::new();
473    for cached_arg in [&[][..], &["--cached"][..]] {
474        let mut args = vec!["diff-index", "-z"];
475        args.extend_from_slice(cached_arg);
476        args.push(reference);
477        let out = Command::new("git")
478            .arg("-C")
479            .arg(&scan_cwd)
480            .args(&args)
481            .output()?;
482        if !out.status.success() {
483            // diff-index against a missing ref is fine — empty repo,
484            // detached HEAD before first commit, etc. Treat like
485            // "no entries" rather than erroring.
486            continue;
487        }
488        // Format with `-z`:
489        //   `:<src_mode> <dst_mode> <src_sha> <dst_sha> <status>\0<path>\0`
490        // For renames (R*) and copies (C*) two paths follow, but the
491        // dst-sha reflects the new content either way.
492        let bytes = &out.stdout;
493        let mut i = 0;
494        while i < bytes.len() {
495            // Read until next NUL — the meta record.
496            let meta_end = bytes[i..]
497                .iter()
498                .position(|&b| b == 0)
499                .map(|p| i + p)
500                .unwrap_or(bytes.len());
501            let Ok(meta) = std::str::from_utf8(&bytes[i..meta_end]) else {
502                i = meta_end + 1;
503                continue;
504            };
505            i = meta_end + 1;
506            // Then 1-2 NUL-terminated paths depending on status.
507            let parts: Vec<&str> = meta.trim_start_matches(':').split_whitespace().collect();
508            if parts.len() < 5 {
509                continue;
510            }
511            let dst_mode = parts[1];
512            let dst_sha = parts[3];
513            let status = parts[4];
514            // Skip symlinks (120000), gitlinks/submodules (160000),
515            // deletions (D), and entries with all-zero dst sha
516            // (deleted in working tree).
517            if dst_mode == "120000"
518                || dst_mode == "160000"
519                || status.starts_with('D')
520                || dst_sha.bytes().all(|b| b == b'0')
521            {
522                // Still need to consume the path(s).
523                let path_count = if status.starts_with('R') || status.starts_with('C') {
524                    2
525                } else {
526                    1
527                };
528                for _ in 0..path_count {
529                    let end = bytes[i..]
530                        .iter()
531                        .position(|&b| b == 0)
532                        .map(|p| i + p)
533                        .unwrap_or(bytes.len());
534                    i = end + 1;
535                }
536                continue;
537            }
538            // Read the destination path (last NUL-terminated entry).
539            let path_count = if status.starts_with('R') || status.starts_with('C') {
540                2
541            } else {
542                1
543            };
544            let mut path: PathBuf = PathBuf::new();
545            for n in 0..path_count {
546                let end = bytes[i..]
547                    .iter()
548                    .position(|&b| b == 0)
549                    .map(|p| i + p)
550                    .unwrap_or(bytes.len());
551                if n + 1 == path_count {
552                    path = PathBuf::from(String::from_utf8_lossy(&bytes[i..end]).into_owned());
553                }
554                i = end + 1;
555            }
556            let key = (dst_sha.to_owned(), path.clone());
557            if seen.insert(key) {
558                candidates.push((dst_sha.to_owned(), path));
559            }
560        }
561    }
562    if candidates.is_empty() {
563        return Ok(Vec::new());
564    }
565
566    // Header check first to avoid reading non-pointer-sized blobs.
567    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
568    let mut sized: Vec<(String, PathBuf)> = Vec::new();
569    for (oid, path) in candidates {
570        match bcheck.check(&oid)? {
571            CatFileHeader::Found { kind, size, .. }
572                if kind == "blob" && (size as usize) < MAX_POINTER_SIZE =>
573            {
574                sized.push((oid, path));
575            }
576            _ => {}
577        }
578    }
579    drop(bcheck);
580
581    let mut batch = CatFileBatch::spawn(cwd)?;
582    let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
583    let mut out: Vec<PointerEntry> = Vec::new();
584    for (oid, path) in sized {
585        let Some(blob) = batch.read(&oid)? else {
586            continue;
587        };
588        let Ok(pointer) = Pointer::parse(&blob.content) else {
589            continue;
590        };
591        if let Some(&idx) = by_oid.get(&pointer.oid) {
592            if !out[idx].paths.contains(&path) {
593                out[idx].paths.push(path);
594            }
595            continue;
596        }
597        by_oid.insert(pointer.oid, out.len());
598        out.push(PointerEntry {
599            oid: pointer.oid,
600            size: pointer.size,
601            path: Some(path.clone()),
602            paths: vec![path],
603            canonical: pointer.canonical,
604            extensions: pointer.extensions.clone(),
605        });
606    }
607    Ok(out)
608}
609
610/// LFS pointers reachable from `refs/stash` and its associated WIP /
611/// index / untracked merge parents. Mirrors upstream's
612/// `lfs/gitscanner_log.go::scanStashed`.
613///
614/// Stashes are stored as merge commits whose first parent is the HEAD
615/// at stash time; the other parent(s) are the index commit and (when
616/// `git stash -u` is used) the untracked-files commit. Walking the
617/// reflog for `refs/stash` and reading both sides of each merge diff
618/// is the way to surface every LFS pointer those commits reference.
619///
620/// Returns an empty vec when the repo has no stash entries (the
621/// `git log -g refs/stash` invocation exits non-zero in that case;
622/// upstream silently swallows that error and we do the same).
623pub fn scan_stashed(cwd: &Path) -> Result<Vec<PointerEntry>, Error> {
624    let stash_shas: Vec<String> = match Command::new("git")
625        .arg("-C")
626        .arg(cwd)
627        .args(["log", "-g", "--format=%h", "refs/stash", "--"])
628        .output()
629    {
630        Ok(out) if out.status.success() => String::from_utf8_lossy(&out.stdout)
631            .lines()
632            .map(|l| l.trim().to_owned())
633            .filter(|s| !s.is_empty())
634            .collect(),
635        _ => return Ok(Vec::new()),
636    };
637    if stash_shas.is_empty() {
638        return Ok(Vec::new());
639    }
640    // Each stash entry is walked twice: first with `-m --first-parent`
641    // to surface the WIP merge's diff against HEAD; then with no
642    // extra args to surface the index merge (and the untracked merge
643    // when present). Both runs hit the same parser; pointers are
644    // deduped at the call site.
645    let mut entries: Vec<PointerEntry> = Vec::new();
646    for extra in [&["-m", "--first-parent"][..], &[][..]] {
647        let mut args: Vec<String> = vec!["log".into()];
648        for a in extra {
649            args.push((*a).to_owned());
650        }
651        for a in [
652            "--no-ext-diff",
653            "--no-textconv",
654            "--color=never",
655            "-G",
656            "oid sha256:",
657            "-p",
658            "-U12",
659            "--format=lfs-commit-sha: %H %P",
660        ] {
661            args.push(a.to_owned());
662        }
663        for sha in &stash_shas {
664            args.push(format!("{sha}^..{sha}"));
665        }
666        let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect();
667        let mut child = Command::new("git")
668            .arg("-C")
669            .arg(cwd)
670            .args(&arg_refs)
671            .stdout(Stdio::piped())
672            .stderr(Stdio::piped())
673            .spawn()?;
674        let stdout = child.stdout.take().expect("piped");
675        let mut parser = LogScanner::new(LogDiffDirection::Additions);
676        for line in BufReader::new(stdout).lines() {
677            let line = line?;
678            if let Some(entry) = parser.feed(&line) {
679                entries.push(entry);
680            }
681        }
682        if let Some(entry) = parser.flush() {
683            entries.push(entry);
684        }
685        // Swallow the exit status — `git log -g refs/stash^..refs/stash`
686        // can fail when `refs/stash` doesn't exist; upstream's
687        // `scanStashed` ignores it for the same reason.
688        let _ = child.wait();
689    }
690    Ok(entries)
691}
692
693/// Walk `git log -G "oid sha256:" -p <ref>` since `since`, returning
694/// every LFS pointer that appears as the **previous** state of a
695/// modified file (i.e. lives on the `-` side of a unified diff).
696///
697/// Mirrors upstream's `lfs/gitscanner_log.go::logPreviousSHAs`. Used by
698/// fetch-recent (to download pre-images of recently-modified files) and
699/// by prune retention (to keep them on disk).
700///
701/// `-U12` is requested so a small in-place edit still surfaces enough
702/// surrounding context to capture the full pointer body (version, oid,
703/// size, optional ext-N lines).
704pub fn scan_previous_versions(
705    cwd: &Path,
706    reference: &str,
707    since: SystemTime,
708) -> Result<Vec<PointerEntry>, Error> {
709    let since_unix = since
710        .duration_since(UNIX_EPOCH)
711        .map(|d| d.as_secs() as i64)
712        .unwrap_or(0);
713    let since_arg = format!("--since=@{since_unix}");
714    let mut child = Command::new("git")
715        .arg("-C")
716        .arg(cwd)
717        .args([
718            "log",
719            "--no-ext-diff",
720            "--no-textconv",
721            "--color=never",
722            "-G",
723            "oid sha256:",
724            "-p",
725            "-U12",
726            "--format=lfs-commit-sha: %H %P",
727            &since_arg,
728            reference,
729        ])
730        .stdout(Stdio::piped())
731        .stderr(Stdio::piped())
732        .spawn()?;
733    let stdout = child.stdout.take().expect("piped");
734    let mut parser = LogScanner::new(LogDiffDirection::Deletions);
735    let mut entries = Vec::new();
736    for line in BufReader::new(stdout).lines() {
737        let line = line?;
738        if let Some(entry) = parser.feed(&line) {
739            entries.push(entry);
740        }
741    }
742    if let Some(entry) = parser.flush() {
743        entries.push(entry);
744    }
745    let status = child.wait()?;
746    if !status.success() {
747        return Err(Error::Failed(format!(
748            "git log failed: exit {:?}",
749            status.code()
750        )));
751    }
752    Ok(entries)
753}
754
/// Which side of a unified diff to capture pointer bodies from.
///
/// Passed to [`LogScanner::new`]; it decides which diff lines count as
/// pointer data and how `diff --git` headers are turned into a filename.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LogDiffDirection {
    /// `+` lines (new state of the file in this commit). Used by the
    /// stash walker — the WIP merge's "added" pointer is the one we
    /// need to keep around if the user later does `git stash pop`.
    Additions,
    /// `-` lines (previous state, before this commit modified it).
    /// Used by [`scan_previous_versions`].
    Deletions,
}
765
/// Line-by-line state machine for `git log -p` output formatted with
/// `--format=lfs-commit-sha: %H %P`. Mirrors upstream's `logScanner`
/// (`lfs/gitscanner_log.go`).
///
/// Callers push each output line through `feed` and call `flush` once
/// the stream ends to collect a pointer that is still buffered.
struct LogScanner {
    /// Side of the diff whose lines are collected as pointer data.
    direction: LogDiffDirection,
    /// Path of the file currently being diffed, or `None` between
    /// commits / before the first `diff --git` header.
    current_filename: Option<String>,
    /// Buffered pointer-body lines (with the diff marker stripped).
    /// Flushed into a [`PointerEntry`] when the next file/commit
    /// boundary arrives or the stream ends.
    pointer_data: Vec<u8>,
}
779
780impl LogScanner {
781    fn new(direction: LogDiffDirection) -> Self {
782        Self {
783            direction,
784            current_filename: None,
785            pointer_data: Vec::new(),
786        }
787    }
788
789    /// Feed one log line. Returns `Some(entry)` when a complete pointer
790    /// body just got flushed (i.e. a file or commit boundary just
791    /// arrived after we'd buffered some pointer data).
792    fn feed(&mut self, line: &str) -> Option<PointerEntry> {
793        if line.starts_with("lfs-commit-sha: ") {
794            return self.flush();
795        }
796        if let Some(rest) = line.strip_prefix("diff --git ") {
797            let entry = self.flush();
798            self.current_filename = parse_diff_git_header(rest, self.direction);
799            return entry;
800        }
801        if let Some(rest) = line.strip_prefix("diff --cc ") {
802            let entry = self.flush();
803            self.current_filename = Some(rest.to_owned());
804            return entry;
805        }
806        if self.current_filename.is_some() && is_pointer_data_line(line, self.direction) {
807            // Strip the leading diff marker (`+`, `-`, or ` `).
808            self.pointer_data.extend_from_slice(&line.as_bytes()[1..]);
809            self.pointer_data.push(b'\n');
810        }
811        None
812    }
813
814    /// Drain the pending pointer buffer into a [`PointerEntry`] if it
815    /// parses as a valid LFS pointer. Resets the buffer either way.
816    fn flush(&mut self) -> Option<PointerEntry> {
817        if self.pointer_data.is_empty() {
818            return None;
819        }
820        let parsed = Pointer::parse(&self.pointer_data);
821        let path = self.current_filename.as_ref().map(PathBuf::from);
822        self.pointer_data.clear();
823        let pointer = parsed.ok()?;
824        Some(PointerEntry {
825            oid: pointer.oid,
826            size: pointer.size,
827            paths: path.iter().cloned().collect(),
828            path,
829            canonical: pointer.canonical,
830            extensions: pointer.extensions,
831        })
832    }
833}
834
835/// Pointer-body lines start with one of the diff markers (`+`, `-`,
836/// ` `) followed by one of the four pointer-keyword prefixes. We
837/// always include unchanged context lines (` `) so the version/size
838/// lines bracket the changed `oid` line — `-U12` makes that reliable
839/// for typical extension chains too.
840fn is_pointer_data_line(line: &str, dir: LogDiffDirection) -> bool {
841    let mut chars = line.chars();
842    let Some(marker) = chars.next() else {
843        return false;
844    };
845    let dir_match = matches!(
846        (marker, dir),
847        ('+', LogDiffDirection::Additions) | ('-', LogDiffDirection::Deletions) | (' ', _)
848    );
849    if !dir_match {
850        return false;
851    }
852    let body = chars.as_str();
853    body.starts_with("version https://git-lfs")
854        || body.starts_with("oid sha256")
855        || body.starts_with("size")
856        || body.starts_with("ext-")
857}
858
859/// Parse a `diff --git a/<path1> b/<path2>` header to the relevant
860/// path. We pick the `b/` path for additions (the "new" name) and
861/// the `a/` path for deletions (the "old" name). Renames are rare in
862/// LFS-tracked binaries; on a rename, additions tracks the new path
863/// and deletions tracks the old, which matches upstream.
864///
865/// Quoted / octal-escaped paths (those with spaces or non-ASCII)
866/// aren't unescaped here — yet. The fetch-recent and prune tests
867/// don't exercise them, so deferring keeps Slice 3 compact.
868fn parse_diff_git_header(rest: &str, dir: LogDiffDirection) -> Option<String> {
869    let trimmed = rest.trim();
870    let a_idx = trimmed.find("a/")?;
871    let after_a = &trimmed[a_idx + 2..];
872    // Find the boundary between path1 and " b/path2". Upstream's regex
873    // uses `\s+`, so any whitespace run terminates path1.
874    let space_idx = after_a.find(|c: char| c.is_whitespace())?;
875    let path_a = &after_a[..space_idx];
876    let after_space = after_a[space_idx..].trim_start();
877    let after_b = after_space.strip_prefix("b/")?;
878    match dir {
879        LogDiffDirection::Additions => Some(after_b.to_owned()),
880        LogDiffDirection::Deletions => Some(path_a.to_owned()),
881    }
882}
883
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// Build a canonical pointer text for a known content. Mirrors what
    /// `git lfs clean` would emit, so we don't need to wire the filter
    /// crate into git's tests.
    fn pointer_text(content: &[u8]) -> Vec<u8> {
        use sha2::{Digest, Sha256};
        let oid_bytes: [u8; 32] = Sha256::digest(content).into();
        let oid_hex = oid_bytes.iter().fold(String::new(), |mut s, b| {
            use std::fmt::Write;
            let _ = write!(s, "{b:02x}");
            s
        });
        format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
            content.len()
        )
        .into_bytes()
    }

    #[test]
    fn empty_repo_returns_no_pointers() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"plain content");
        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn finds_pointer_blobs_skips_plain_blobs() {
        let repo = init_repo();
        // Plain content + LFS pointer side-by-side.
        commit_file(&repo, "plain.txt", b"just text");
        let pointer = pointer_text(b"this would be the actual binary content");
        commit_file(&repo, "big.bin", &pointer);

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(
            result[0].size,
            b"this would be the actual binary content".len() as u64,
        );
        assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn dedups_same_lfs_oid_in_multiple_paths() {
        let repo = init_repo();
        let pointer = pointer_text(b"shared payload");
        commit_file(&repo, "first.bin", &pointer);
        commit_file(&repo, "second.bin", &pointer);

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        // Same content → same pointer text → same git blob OID, but we
        // also want to verify dedup at the LFS-OID layer.
        assert_eq!(result.len(), 1, "{result:?}");
    }

    #[test]
    fn finds_pointers_in_history_not_just_tip() {
        let repo = init_repo();
        // A pointer that is later overwritten by plain content. ScanRefs
        // semantics require we still find it — older commits are part of
        // history reachable from HEAD.
        let pointer = pointer_text(b"deleted later");
        commit_file(&repo, "x.bin", &pointer);
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].size, b"deleted later".len() as u64);
    }

    #[test]
    fn excludes_filter_history_walk() {
        let repo = init_repo();
        commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
        let first = head_oid(&repo);
        commit_file(&repo, "new.bin", &pointer_text(b"new payload"));

        // Include HEAD, exclude the first commit → only new.bin's pointer.
        let result = scan_pointers(repo.path(), &["HEAD"], &[&first]).unwrap();
        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].size, b"new payload".len() as u64);
    }

    #[test]
    fn skips_blobs_that_look_like_pointers_but_dont_parse() {
        let repo = init_repo();
        // Small, but malformed pointer-shaped content.
        commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert!(result.is_empty(), "{result:?}");
    }

    #[test]
    fn scan_tree_returns_only_tree_entries_not_history() {
        let repo = init_repo();
        // A pointer that exists historically but is gone at HEAD must
        // NOT show up in scan_tree (this is the point of the helper —
        // ls-files should only see what's in the named tree).
        let pointer = pointer_text(b"deleted later");
        commit_file(&repo, "x.bin", &pointer);
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_tree(repo.path(), "HEAD").unwrap();
        assert!(result.is_empty(), "{result:?}");
    }

    #[test]
    fn scan_tree_emits_one_entry_per_path_not_per_oid() {
        let repo = init_repo();
        // Same pointer at two paths in the current tree → two entries.
        // (scan_pointers would dedupe to one; scan_tree must not.)
        let pointer = pointer_text(b"shared payload");
        commit_file(&repo, "first.bin", &pointer);
        commit_file(&repo, "second.bin", &pointer);

        let mut result = scan_tree(repo.path(), "HEAD").unwrap();
        result.sort_by(|a, b| a.path.cmp(&b.path));
        assert_eq!(result.len(), 2, "{result:?}");
        assert_eq!(result[0].path.as_deref(), Some(Path::new("first.bin")));
        assert_eq!(result[1].path.as_deref(), Some(Path::new("second.bin")));
        // Same OID under both paths.
        assert_eq!(result[0].oid, result[1].oid);
    }

    #[test]
    fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        let pointer = pointer_text(b"binary content");
        commit_file(&repo, "big.bin", &pointer);

        let result = scan_tree(repo.path(), "HEAD").unwrap();
        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn scan_tree_unknown_ref_errors() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");
        let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();
        match err {
            Error::Failed(msg) => assert!(
                msg.contains("does-not-exist") || msg.contains("Not a valid"),
                "unexpected message: {msg}"
            ),
            _ => panic!("expected Failed, got {err:?}"),
        }
    }

    /// Run `lines` through a fresh [`LogScanner`] and collect every
    /// entry it emits, including the one produced by the final flush.
    fn feed_log<'a, I: IntoIterator<Item = &'a str>>(
        dir: LogDiffDirection,
        lines: I,
    ) -> Vec<PointerEntry> {
        let mut s = LogScanner::new(dir);
        let mut out = Vec::new();
        for line in lines {
            if let Some(e) = s.feed(line) {
                out.push(e);
            }
        }
        if let Some(e) = s.flush() {
            out.push(e);
        }
        out
    }

    #[test]
    fn log_scanner_extracts_deleted_pointer_body() {
        // Two commits: first adds the pointer (its old state at HEAD~1
        // is empty), second replaces it with a different OID. The
        // Deletions side captures the second diff's `-` lines = the
        // pointer at HEAD~1.
        let lines = [
            "lfs-commit-sha: cccccccccccccccccccccccccccccccccccccccc bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
            "diff --git a/foo.bin b/foo.bin",
            "@@ -1,3 +1,3 @@",
            " version https://git-lfs.github.com/spec/v1",
            "-oid sha256:1111111111111111111111111111111111111111111111111111111111111111",
            "-size 100",
            "+oid sha256:2222222222222222222222222222222222222222222222222222222222222222",
            "+size 200",
        ];
        let out = feed_log(LogDiffDirection::Deletions, lines);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].size, 100);
        assert_eq!(
            out[0]
                .path
                .as_deref()
                .map(|p| p.to_string_lossy().into_owned()),
            Some("foo.bin".to_owned())
        );
    }

    #[test]
    fn log_scanner_extracts_added_pointer_body() {
        // Same diff as above, read from the other side: the Additions
        // direction keeps the context line plus the `+` lines.
        let lines = [
            "lfs-commit-sha: cccccccccccccccccccccccccccccccccccccccc bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
            "diff --git a/foo.bin b/foo.bin",
            "@@ -1,3 +1,3 @@",
            " version https://git-lfs.github.com/spec/v1",
            "-oid sha256:1111111111111111111111111111111111111111111111111111111111111111",
            "-size 100",
            "+oid sha256:2222222222222222222222222222222222222222222222222222222222222222",
            "+size 200",
        ];
        let out = feed_log(LogDiffDirection::Additions, lines);
        assert_eq!(out.len(), 1, "{out:?}");
        assert_eq!(out[0].size, 200);
        assert_eq!(out[0].path.as_deref(), Some(Path::new("foo.bin")));
    }

    #[test]
    fn log_scanner_handles_multi_file_commit() {
        let lines = [
            "lfs-commit-sha: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            "diff --git a/a.bin b/a.bin",
            " version https://git-lfs.github.com/spec/v1",
            "-oid sha256:1111111111111111111111111111111111111111111111111111111111111111",
            "-size 1",
            "+oid sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            "+size 2",
            "diff --git a/b.bin b/b.bin",
            " version https://git-lfs.github.com/spec/v1",
            "-oid sha256:3333333333333333333333333333333333333333333333333333333333333333",
            "-size 3",
            "+oid sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
            "+size 4",
        ];
        let out = feed_log(LogDiffDirection::Deletions, lines);
        assert_eq!(out.len(), 2);
        assert_eq!(out[0].size, 1);
        assert_eq!(out[1].size, 3);
    }

    #[test]
    fn log_scanner_flushes_at_commit_boundary() {
        // One file per commit: the first pointer must be emitted when
        // the second commit header arrives, still carrying its own path.
        let lines = [
            "lfs-commit-sha: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            "diff --git a/a.bin b/a.bin",
            " version https://git-lfs.github.com/spec/v1",
            "-oid sha256:1111111111111111111111111111111111111111111111111111111111111111",
            "-size 1",
            "+oid sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            "+size 2",
            "lfs-commit-sha: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            "diff --git a/b.bin b/b.bin",
            " version https://git-lfs.github.com/spec/v1",
            "-oid sha256:3333333333333333333333333333333333333333333333333333333333333333",
            "-size 3",
            "+oid sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
            "+size 4",
        ];
        let out = feed_log(LogDiffDirection::Deletions, lines);
        assert_eq!(out.len(), 2, "{out:?}");
        assert_eq!(out[0].size, 1);
        assert_eq!(out[0].path.as_deref(), Some(Path::new("a.bin")));
        assert_eq!(out[1].size, 3);
        assert_eq!(out[1].path.as_deref(), Some(Path::new("b.bin")));
    }

    #[test]
    fn log_scanner_ignores_pointer_lines_outside_a_diff() {
        // No file header has been seen, so nothing may be buffered.
        let lines = [
            "lfs-commit-sha: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            " version https://git-lfs.github.com/spec/v1",
            "-oid sha256:1111111111111111111111111111111111111111111111111111111111111111",
            "-size 1",
        ];
        let out = feed_log(LogDiffDirection::Deletions, lines);
        assert!(out.is_empty(), "got {out:?}");
    }

    #[test]
    fn log_scanner_skips_non_pointer_diffs() {
        // The pointer-line filter only matches lines starting with one
        // of the four LFS keywords — random source-file edits don't
        // accumulate.
        let lines = [
            "lfs-commit-sha: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            "diff --git a/main.c b/main.c",
            "-int old() { return 1; }",
            "+int new() { return 2; }",
        ];
        let out = feed_log(LogDiffDirection::Deletions, lines);
        assert!(out.is_empty(), "got {out:?}");
    }

    #[test]
    fn is_pointer_data_line_respects_direction_and_keywords() {
        use LogDiffDirection::{Additions, Deletions};
        // Changed lines only count for their own direction.
        assert!(is_pointer_data_line("+oid sha256:abc", Additions));
        assert!(!is_pointer_data_line("+oid sha256:abc", Deletions));
        assert!(is_pointer_data_line("-size 12", Deletions));
        assert!(!is_pointer_data_line("-size 12", Additions));
        // Context lines count for both.
        let context = " version https://git-lfs.github.com/spec/v1";
        assert!(is_pointer_data_line(context, Additions));
        assert!(is_pointer_data_line(context, Deletions));
        // Non-pointer content, file markers and empty lines never count.
        assert!(!is_pointer_data_line("", Deletions));
        assert!(!is_pointer_data_line("--- a/size.bin", Deletions));
        assert!(!is_pointer_data_line("+++ b/oid sha256", Additions));
        assert!(!is_pointer_data_line("-int old() { return 1; }", Deletions));
    }

    #[test]
    fn parse_diff_git_header_picks_correct_side() {
        let h = "a/foo.bin b/foo.bin";
        assert_eq!(
            parse_diff_git_header(h, LogDiffDirection::Additions).as_deref(),
            Some("foo.bin")
        );
        assert_eq!(
            parse_diff_git_header(h, LogDiffDirection::Deletions).as_deref(),
            Some("foo.bin")
        );
        // Rename — paths differ; deletions sees the old name.
        let renamed = "a/old.bin b/new.bin";
        assert_eq!(
            parse_diff_git_header(renamed, LogDiffDirection::Additions).as_deref(),
            Some("new.bin")
        );
        assert_eq!(
            parse_diff_git_header(renamed, LogDiffDirection::Deletions).as_deref(),
            Some("old.bin")
        );
    }
}