use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::process::Command;
use git_lfs_pointer::{MAX_POINTER_SIZE, Oid, Pointer};
use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
use crate::{Error, rev_list};
/// One Git LFS pointer discovered by [`scan_pointers`] or [`scan_tree`].
#[derive(Debug, Clone)]
pub struct PointerEntry {
/// Object id taken from the parsed pointer (`Pointer::oid`); this is not the id of the
/// git blob that holds the pointer text.
pub oid: Oid,
/// Size taken from the parsed pointer (`Pointer::size`).
pub size: u64,
/// Path the pointer blob was found under. [`scan_tree`] always sets it; [`scan_pointers`]
/// sets it only when the `rev_list` entry carried a name.
pub path: Option<PathBuf>,
/// Copied verbatim from `Pointer::canonical`.
/// NOTE(review): presumably tells whether the pointer text was in canonical form —
/// confirm against `git_lfs_pointer`.
pub canonical: bool,
}
/// Scans the objects yielded by `rev_list(cwd, include, exclude)` and returns every Git LFS
/// pointer found among them, one entry per distinct LFS object id.
///
/// The scan runs in two passes:
///
/// 1. every object header is checked, and only blobs smaller than [`MAX_POINTER_SIZE`]
///    are kept as candidates, so large blobs are never read;
/// 2. each candidate's content is read and parsed as a pointer; blobs that cannot be read
///    or do not parse are skipped silently.
///
/// When the same LFS object id is referenced by several blobs or paths, only the first one
/// encountered is reported.
///
/// # Errors
///
/// Propagates any error from `rev_list`, from spawning the `cat-file` helpers, or from
/// querying them.
pub fn scan_pointers(
    cwd: &Path,
    include: &[&str],
    exclude: &[&str],
) -> Result<Vec<PointerEntry>, Error> {
    let entries = rev_list(cwd, include, exclude)?;

    // Pass 1: header-only filter.
    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
    let mut candidates: Vec<(String, Option<String>)> = Vec::new();
    for entry in entries {
        match bcheck.check(&entry.oid)? {
            // Checked conversion instead of `size as usize`: on targets where `usize` is
            // narrower than the reported size, a plain cast would wrap a multi-GiB blob
            // around to a small value and wrongly admit it as a pointer candidate.
            CatFileHeader::Found { kind, size, .. }
                if kind == "blob"
                    && usize::try_from(size).is_ok_and(|len| len < MAX_POINTER_SIZE) =>
            {
                candidates.push((entry.oid, entry.name));
            }
            _ => {}
        }
    }
    // Release the header checker before starting the content reader.
    drop(bcheck);

    // Pass 2: read the small blobs and keep the ones that parse as pointers.
    let mut batch = CatFileBatch::spawn(cwd)?;
    let mut seen: HashSet<Oid> = HashSet::new();
    let mut out = Vec::new();
    for (oid, name) in candidates {
        let Some(blob) = batch.read(&oid)? else {
            continue;
        };
        let Ok(pointer) = Pointer::parse(&blob.content) else {
            continue;
        };
        // `insert` returns false for an LFS oid that was already reported.
        if seen.insert(pointer.oid) {
            out.push(PointerEntry {
                oid: pointer.oid,
                size: pointer.size,
                path: name.map(PathBuf::from),
                canonical: pointer.canonical,
            });
        }
    }
    Ok(out)
}
/// One blob entry of a tree, as produced by [`scan_tree_blobs`].
#[derive(Debug, Clone)]
pub struct TreeBlob {
/// Path of the entry as printed by `git ls-tree --full-tree -r`.
pub path: PathBuf,
/// Git object id of the blob, as printed by `git ls-tree`.
pub blob_oid: String,
/// Size of the blob as reported by the `cat-file` header check.
pub size: u64,
}
pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
let out = Command::new("git")
.arg("-C")
.arg(cwd)
.args(["ls-tree", "--full-tree", "-r", "-z", reference])
.output()?;
if !out.status.success() {
return Err(Error::Failed(format!(
"git ls-tree failed: {}",
String::from_utf8_lossy(&out.stderr).trim()
)));
}
let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
let mut blobs = Vec::new();
for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
let s = std::str::from_utf8(record).map_err(|e| {
Error::Failed(format!("ls-tree: non-utf8 record: {e}"))
})?;
let (header, path) = s
.split_once('\t')
.ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
let mut parts = header.split_whitespace();
let _mode = parts.next();
let kind = parts.next();
let oid = parts
.next()
.ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
if kind != Some("blob") {
continue;
}
if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
&& kind == "blob"
{
blobs.push(TreeBlob {
path: PathBuf::from(path),
blob_oid: oid.to_owned(),
size,
});
}
}
Ok(blobs)
}
pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
let out = Command::new("git")
.arg("-C")
.arg(cwd)
.args(["ls-tree", "--full-tree", "-r", "-z", reference])
.output()?;
if !out.status.success() {
return Err(Error::Failed(format!(
"git ls-tree failed: {}",
String::from_utf8_lossy(&out.stderr).trim()
)));
}
let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
let mut candidates: Vec<(String, String)> = Vec::new();
for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
let s = std::str::from_utf8(record).map_err(|e| {
Error::Failed(format!("ls-tree: non-utf8 record: {e}"))
})?;
let (header, path) = s
.split_once('\t')
.ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
let mut parts = header.split_whitespace();
let _mode = parts.next();
let kind = parts.next();
let oid = parts
.next()
.ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
if kind != Some("blob") {
continue;
}
if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
&& kind == "blob"
&& (size as usize) < MAX_POINTER_SIZE
{
candidates.push((oid.to_owned(), path.to_owned()));
}
}
drop(bcheck);
let mut batch = CatFileBatch::spawn(cwd)?;
let mut entries = Vec::new();
for (oid, path) in candidates {
let Some(blob) = batch.read(&oid)? else { continue };
let Ok(pointer) = Pointer::parse(&blob.content) else {
continue;
};
entries.push(PointerEntry {
oid: pointer.oid,
size: pointer.size,
path: Some(PathBuf::from(path)),
canonical: pointer.canonical,
});
}
Ok(entries)
}
// Tests for `scan_pointers` and `scan_tree`. Each test builds a repository with the
// `commit_helper` utilities and runs a scanner against it.
// NOTE(review): `init_repo`, `commit_file` and `head_oid` live in
// `crate::tests::commit_helper`, which is not visible here. The comments below assume
// `commit_file` writes the file and commits it on top of the previous commit, and that
// `head_oid` returns the current HEAD commit id — confirm against the helper.
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::commit_helper::*;
/// Builds the text of a v1 LFS pointer for `content`: the `oid` line carries the
/// SHA-256 of `content` as lowercase hex, the `size` line carries `content.len()`.
fn pointer_text(content: &[u8]) -> Vec<u8> {
use sha2::{Digest, Sha256};
let oid_bytes: [u8; 32] = Sha256::digest(content).into();
// Hex-encode the digest two characters per byte; writing to a `String` cannot fail,
// so the `write!` result is discarded.
let oid_hex = oid_bytes.iter().fold(String::new(), |mut s, b| {
use std::fmt::Write;
let _ = write!(s, "{b:02x}");
s
});
format!(
"version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
content.len()
)
.into_bytes()
}
// Despite the name, the repository is not empty: it holds one plain-text file, and the
// scan must report no pointers for it.
#[test]
fn empty_repo_returns_no_pointers() {
let repo = init_repo();
commit_file(&repo, "a.txt", b"plain content");
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert!(result.is_empty());
}
// A plain file and a pointer file: only the pointer is reported, with the size recorded
// in the pointer text and the path it was committed under.
#[test]
fn finds_pointer_blobs_skips_plain_blobs() {
let repo = init_repo();
commit_file(&repo, "plain.txt", b"just text");
let pointer = pointer_text(b"this would be the actual binary content");
commit_file(&repo, "big.bin", &pointer);
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert_eq!(result.len(), 1, "{result:?}");
assert_eq!(
result[0].size,
b"this would be the actual binary content".len() as u64,
);
assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
}
// The same pointer text under two paths yields a single entry from `scan_pointers`.
#[test]
fn dedups_same_lfs_oid_in_multiple_paths() {
let repo = init_repo();
let pointer = pointer_text(b"shared payload");
commit_file(&repo, "first.bin", &pointer);
commit_file(&repo, "second.bin", &pointer);
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert_eq!(result.len(), 1, "{result:?}");
}
// A pointer that was later overwritten with plain text is still found, because
// `scan_pointers` looks at history rather than only the tip.
#[test]
fn finds_pointers_in_history_not_just_tip() {
let repo = init_repo();
let pointer = pointer_text(b"deleted later");
commit_file(&repo, "x.bin", &pointer);
commit_file(&repo, "x.bin", b"plain text now");
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert_eq!(result.len(), 1);
assert_eq!(result[0].size, b"deleted later".len() as u64);
}
// Excluding the first commit leaves only the pointer introduced by the second one.
#[test]
fn excludes_filter_history_walk() {
let repo = init_repo();
commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
let first = head_oid(&repo);
commit_file(&repo, "new.bin", &pointer_text(b"new payload"));
let result = scan_pointers(repo.path(), &["HEAD"], &[&first]).unwrap();
assert_eq!(result.len(), 1, "{result:?}");
assert_eq!(result[0].size, b"new payload".len() as u64);
}
// A small blob that starts with "version" but is not a valid pointer is ignored rather
// than reported or turned into an error.
#[test]
fn skips_blobs_that_look_like_pointers_but_dont_parse() {
let repo = init_repo();
commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert!(result.is_empty(), "{result:?}");
}
// Counterpart of `finds_pointers_in_history_not_just_tip`: `scan_tree` sees only the
// tree at HEAD, where the file is plain text, so nothing is reported.
#[test]
fn scan_tree_returns_only_tree_entries_not_history() {
let repo = init_repo();
let pointer = pointer_text(b"deleted later");
commit_file(&repo, "x.bin", &pointer);
commit_file(&repo, "x.bin", b"plain text now");
let result = scan_tree(repo.path(), "HEAD").unwrap();
assert!(result.is_empty(), "{result:?}");
}
// Counterpart of `dedups_same_lfs_oid_in_multiple_paths`: `scan_tree` does not
// de-duplicate, so both paths are reported with the same LFS oid.
#[test]
fn scan_tree_emits_one_entry_per_path_not_per_oid() {
let repo = init_repo();
let pointer = pointer_text(b"shared payload");
commit_file(&repo, "first.bin", &pointer);
commit_file(&repo, "second.bin", &pointer);
let mut result = scan_tree(repo.path(), "HEAD").unwrap();
// Sort by path so the assertions do not depend on listing order.
result.sort_by(|a, b| a.path.cmp(&b.path));
assert_eq!(result.len(), 2, "{result:?}");
assert_eq!(result[0].path.as_deref(), Some(Path::new("first.bin")));
assert_eq!(result[1].path.as_deref(), Some(Path::new("second.bin")));
assert_eq!(result[0].oid, result[1].oid);
}
// A plain file and a pointer file in the same tree: only the pointer is reported.
#[test]
fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
let repo = init_repo();
commit_file(&repo, "plain.txt", b"just text");
let pointer = pointer_text(b"binary content");
commit_file(&repo, "big.bin", &pointer);
let result = scan_tree(repo.path(), "HEAD").unwrap();
assert_eq!(result.len(), 1, "{result:?}");
assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
}
// An unresolvable reference surfaces as `Error::Failed` carrying git's stderr; the
// message check accepts either the ref name or git's "Not a valid" wording.
#[test]
fn scan_tree_unknown_ref_errors() {
let repo = init_repo();
commit_file(&repo, "a.txt", b"x");
let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();
match err {
Error::Failed(msg) => assert!(
msg.contains("does-not-exist") || msg.contains("Not a valid"),
"unexpected message: {msg}"
),
_ => panic!("expected Failed, got {err:?}"),
}
}
}