use std::path::{Path, PathBuf};
use std::process::Command;
use git_lfs_pointer::{Extension, MAX_POINTER_SIZE, Oid, Pointer};
use crate::Error;
use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
/// One Git LFS pointer discovered by a scan in this module.
///
/// Every field except `path`/`paths` is copied from the parsed pointer; the
/// path fields record where in the repository the pointer blob was seen.
#[derive(Debug, Clone)]
pub struct PointerEntry {
/// LFS object id taken from the parsed pointer (not the git blob id).
pub oid: Oid,
/// Value of the pointer's `size` field.
pub size: u64,
/// Path at which this pointer was first seen. `None` when the scan had no
/// name for the blob (the history scan passes the rev-list name through
/// as-is, and that name is optional).
pub path: Option<PathBuf>,
/// Every distinct path recorded for this pointer, in first-seen order.
/// Scans that de-duplicate by `oid` append to this list instead of
/// emitting a second entry.
pub paths: Vec<PathBuf>,
/// Copied from the parsed pointer's `canonical` flag.
/// NOTE(review): presumably "the blob text was already in canonical pointer
/// form" — confirm against `git_lfs_pointer::Pointer`.
pub canonical: bool,
/// Copied from the parsed pointer's `extensions` list.
pub extensions: Vec<Extension>,
}
/// Scans for Git LFS pointers using `include` / `exclude` as the revision
/// arguments of the underlying rev-list walk.
///
/// This is [`scan_pointers_with_args`] with an empty list of extra
/// command-line arguments; see that function for the details of the scan and
/// of the returned entries.
///
/// # Errors
///
/// Returns whatever error [`scan_pointers_with_args`] returns.
pub fn scan_pointers(
    cwd: &Path,
    include: &[&str],
    exclude: &[&str],
) -> Result<Vec<PointerEntry>, Error> {
    let no_extra_args: &[&str] = &[];
    scan_pointers_with_args(cwd, include, exclude, no_extra_args)
}
/// Finds every Git LFS pointer blob among the objects listed by
/// `rev_list_with_args(cwd, include, exclude, extra_cmdline_args)`.
///
/// The scan runs in two passes so that large blobs are never read:
///
/// 1. a [`CatFileBatchCheck`] pass looks only at each object's type and size
///    and keeps blobs strictly smaller than [`MAX_POINTER_SIZE`];
/// 2. a [`CatFileBatch`] pass reads the surviving blobs and keeps the ones
///    [`Pointer::parse`] accepts.
///
/// Results are de-duplicated by LFS OID. The first occurrence creates the
/// entry (its name, if any, becomes `path`); later occurrences only append
/// their name to `paths` when it is not already present.
///
/// # Errors
///
/// Propagates errors from the rev-list walk, from spawning either cat-file
/// helper, and from individual `check` / `read` calls. Objects for which
/// `read` returns `None` and blobs that fail to parse are skipped silently.
pub fn scan_pointers_with_args(
    cwd: &Path,
    include: &[&str],
    exclude: &[&str],
    extra_cmdline_args: &[&str],
) -> Result<Vec<PointerEntry>, Error> {
    let entries = crate::rev_list::rev_list_with_args(cwd, include, exclude, extra_cmdline_args)?;

    // Pass 1: header-only filter.
    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
    let mut candidates: Vec<(String, Option<String>)> = Vec::new();
    for entry in entries {
        match bcheck.check(&entry.oid)? {
            // Use a checked conversion rather than `size as usize`: on a
            // 32-bit target the cast wraps, so a multi-GiB blob whose size is
            // small modulo 2^32 would slip through and be read in full below.
            CatFileHeader::Found { kind, size, .. }
                if kind == "blob"
                    && usize::try_from(size).is_ok_and(|len| len < MAX_POINTER_SIZE) =>
            {
                candidates.push((entry.oid, entry.name));
            }
            _ => {}
        }
    }
    // Done with the header-only helper before the content reader is spawned.
    drop(bcheck);

    // Pass 2: read the small blobs and keep the ones that parse as pointers.
    let mut batch = CatFileBatch::spawn(cwd)?;
    // LFS OID -> index into `out`, used to fold duplicates into one entry.
    let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
    let mut out: Vec<PointerEntry> = Vec::new();
    for (oid, name) in candidates {
        let Some(blob) = batch.read(&oid)? else {
            continue;
        };
        let Ok(pointer) = Pointer::parse(&blob.content) else {
            continue;
        };
        let path_buf = name.map(PathBuf::from);

        if let Some(&idx) = by_oid.get(&pointer.oid) {
            // Already known: just remember the additional path, if new.
            if let Some(p) = path_buf
                && !out[idx].paths.contains(&p)
            {
                out[idx].paths.push(p);
            }
            continue;
        }

        let paths: Vec<PathBuf> = path_buf.iter().cloned().collect();
        by_oid.insert(pointer.oid, out.len());
        out.push(PointerEntry {
            oid: pointer.oid,
            size: pointer.size,
            path: path_buf,
            paths,
            canonical: pointer.canonical,
            extensions: pointer.extensions.clone(),
        });
    }
    Ok(out)
}
pub fn scan_index_lfs(cwd: &Path) -> Result<Vec<PointerEntry>, Error> {
let scan_cwd = match crate::run_git(cwd, &["rev-parse", "--show-toplevel"]) {
Ok(s) if !s.is_empty() => PathBuf::from(s),
_ => crate::run_git(cwd, &["rev-parse", "--absolute-git-dir"])
.map(PathBuf::from)
.unwrap_or_else(|_| cwd.to_path_buf()),
};
let filter_by_parent_dir = is_bare_repo(&scan_cwd) || is_sparse_checkout(&scan_cwd);
let out = Command::new("git")
.arg("-C")
.arg(&scan_cwd)
.args(["ls-files", "--stage", "-z", "--", ":(attr:filter=lfs)"])
.output()?;
if !out.status.success() {
return Err(Error::Failed(
String::from_utf8_lossy(&out.stderr).trim().to_owned(),
));
}
let mut candidates: Vec<(String, PathBuf)> = Vec::new();
for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
let s = match std::str::from_utf8(record) {
Ok(s) => s,
Err(_) => continue,
};
let Some((meta, path)) = s.split_once('\t') else {
continue;
};
let parts: Vec<&str> = meta.split_whitespace().collect();
if parts.len() < 3 {
continue;
}
let mode = parts[0];
let oid = parts[1];
if mode == "120000" {
continue;
}
let path = PathBuf::from(path);
if filter_by_parent_dir
&& let Some(parent) = path.parent()
&& !parent.as_os_str().is_empty()
&& !scan_cwd.join(parent).is_dir()
{
continue;
}
candidates.push((oid.to_string(), path));
}
if candidates.is_empty() {
return Ok(Vec::new());
}
let mut batch = CatFileBatch::spawn(cwd)?;
let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
let mut out: Vec<PointerEntry> = Vec::new();
for (oid, path) in candidates {
let Some(blob) = batch.read(&oid)? else {
continue;
};
let Ok(pointer) = Pointer::parse(&blob.content) else {
continue;
};
if let Some(&idx) = by_oid.get(&pointer.oid) {
if !out[idx].paths.contains(&path) {
out[idx].paths.push(path);
}
continue;
}
by_oid.insert(pointer.oid, out.len());
out.push(PointerEntry {
oid: pointer.oid,
size: pointer.size,
path: Some(path.clone()),
paths: vec![path],
canonical: pointer.canonical,
extensions: pointer.extensions.clone(),
});
}
Ok(out)
}
/// Reports whether `git rev-parse --is-bare-repository`, run for `cwd`,
/// answers `true`.
///
/// Any error from running git is treated as "not bare".
fn is_bare_repo(cwd: &Path) -> bool {
    match crate::run_git(cwd, &["rev-parse", "--is-bare-repository"]) {
        Ok(answer) => answer.trim() == "true",
        Err(_) => false,
    }
}
/// Reports whether `core.sparseCheckout` is enabled for the repository at
/// `cwd`.
///
/// The value is read with `git config --bool`, so git canonicalises every
/// spelling it accepts for a boolean (`true`, `yes`, `on`, `1`, ...) to
/// `true` / `false` before the comparison. Without `--bool`, a repository
/// configured with e.g. `core.sparseCheckout = 1` would be reported as not
/// sparse.
///
/// Any error from running git (unset key, non-boolean value, ...) is treated
/// as "not sparse".
fn is_sparse_checkout(cwd: &Path) -> bool {
    crate::run_git(cwd, &["config", "--bool", "--get", "core.sparseCheckout"])
        .map(|s| s.trim().eq_ignore_ascii_case("true"))
        .unwrap_or(false)
}
/// One blob entry from a `git ls-tree -r` listing, together with its size.
#[derive(Debug, Clone)]
pub struct TreeBlob {
/// Path of the entry as printed by `ls-tree` (the text after the tab).
pub path: PathBuf,
/// Object id of the blob, as printed by `ls-tree`.
pub blob_oid: String,
/// Blob size as reported by the cat-file batch-check header.
pub size: u64,
/// First whitespace-separated field of the `ls-tree` record header
/// (the entry's mode string).
pub mode: String,
}
/// Lists the blobs reachable from `reference`.
///
/// A `reference` containing `..` is treated as a revision range and handled
/// by walking every commit in it; anything else is treated as a single
/// tree-ish and listed directly.
///
/// # Errors
///
/// Returns whatever error the selected scan returns.
pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
    let is_range = reference.contains("..");
    if is_range {
        scan_blobs_in_range(cwd, reference)
    } else {
        scan_tree_blobs_for_ref(cwd, reference)
    }
}
fn scan_tree_blobs_for_ref(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
let out = Command::new("git")
.arg("-C")
.arg(cwd)
.args(["ls-tree", "--full-tree", "-r", "-z", reference])
.output()?;
if !out.status.success() {
return Err(Error::Failed(format!(
"git ls-tree failed: {}",
String::from_utf8_lossy(&out.stderr).trim()
)));
}
let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
let mut blobs = Vec::new();
for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
let s = std::str::from_utf8(record)
.map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
let (header, path) = s
.split_once('\t')
.ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
let mut parts = header.split_whitespace();
let mode = parts
.next()
.ok_or_else(|| Error::Failed(format!("ls-tree: missing mode in {s:?}")))?;
let kind = parts.next();
let oid = parts
.next()
.ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
if kind != Some("blob") {
continue;
}
if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
&& kind == "blob"
{
blobs.push(TreeBlob {
path: PathBuf::from(path),
blob_oid: oid.to_owned(),
size,
mode: mode.to_owned(),
});
}
}
Ok(blobs)
}
/// Lists the blobs found in the tree of every commit that
/// `git rev-list <range>` prints.
///
/// Each commit's tree is listed in full; a blob is kept the first time its
/// `(path, blob_oid)` pair is seen, so the result preserves first-seen order
/// and contains no duplicate pairs.
///
/// # Errors
///
/// Returns `Error::Failed` when `rev-list` exits unsuccessfully, and
/// propagates errors from spawning git and from listing any commit's tree.
fn scan_blobs_in_range(cwd: &Path, range: &str) -> Result<Vec<TreeBlob>, Error> {
    let walk = Command::new("git")
        .arg("-C")
        .arg(cwd)
        .args(["rev-list", range])
        .output()?;
    if !walk.status.success() {
        let stderr = String::from_utf8_lossy(&walk.stderr);
        return Err(Error::Failed(format!(
            "git rev-list failed: {}",
            stderr.trim()
        )));
    }

    let mut seen: std::collections::HashSet<(PathBuf, String)> = std::collections::HashSet::new();
    let mut all = Vec::new();
    let commits = String::from_utf8_lossy(&walk.stdout);
    for commit in commits.lines().map(str::trim).filter(|id| !id.is_empty()) {
        let blobs = scan_tree_blobs_for_ref(cwd, commit)?;
        all.extend(
            blobs
                .into_iter()
                .filter(|blob| seen.insert((blob.path.clone(), blob.blob_oid.clone()))),
        );
    }
    Ok(all)
}
/// Returns the Git LFS pointers present in the tree at `reference`.
///
/// Runs `git ls-tree --full-tree -r -z <reference>`, keeps blob entries that
/// a batch-check reports as strictly smaller than [`MAX_POINTER_SIZE`], then
/// reads those blobs and keeps the ones [`Pointer::parse`] accepts.
///
/// Unlike [`scan_pointers`], nothing is de-duplicated: one entry is emitted
/// per path, so the same LFS OID may appear several times, and each entry's
/// `paths` holds exactly its own path.
///
/// # Errors
///
/// Returns `Error::Failed` when `ls-tree` exits unsuccessfully or prints a
/// record that is not UTF-8, has no tab, or lacks an oid. Propagates errors
/// from spawning git or the cat-file helpers and from `check` / `read`.
/// Blobs for which `read` returns `None` or that fail to parse are skipped.
pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
    let out = Command::new("git")
        .arg("-C")
        .arg(cwd)
        .args(["ls-tree", "--full-tree", "-r", "-z", reference])
        .output()?;
    if !out.status.success() {
        return Err(Error::Failed(format!(
            "git ls-tree failed: {}",
            String::from_utf8_lossy(&out.stderr).trim()
        )));
    }

    // Pass 1: header-only filter so large blobs are never read.
    let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
    let mut candidates: Vec<(String, String)> = Vec::new();
    for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
        // Record layout: "<mode> <type> <oid>\t<path>".
        let s = std::str::from_utf8(record)
            .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
        let (header, path) = s
            .split_once('\t')
            .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
        let mut parts = header.split_whitespace();
        let _mode = parts.next();
        let kind = parts.next();
        let oid = parts
            .next()
            .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
        if kind != Some("blob") {
            continue;
        }
        // Use a checked conversion rather than `size as usize`: on a 32-bit
        // target the cast wraps, so a multi-GiB blob whose size is small
        // modulo 2^32 would be treated as a pointer candidate and read.
        if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
            && kind == "blob"
            && usize::try_from(size).is_ok_and(|len| len < MAX_POINTER_SIZE)
        {
            candidates.push((oid.to_owned(), path.to_owned()));
        }
    }
    // Done with the header-only helper before the content reader is spawned.
    drop(bcheck);

    // Pass 2: read the small blobs and keep the ones that parse as pointers.
    let mut batch = CatFileBatch::spawn(cwd)?;
    let mut entries = Vec::new();
    for (oid, path) in candidates {
        let Some(blob) = batch.read(&oid)? else {
            continue;
        };
        let Ok(pointer) = Pointer::parse(&blob.content) else {
            continue;
        };
        let path_buf = PathBuf::from(path);
        entries.push(PointerEntry {
            oid: pointer.oid,
            size: pointer.size,
            path: Some(path_buf.clone()),
            paths: vec![path_buf],
            canonical: pointer.canonical,
            extensions: pointer.extensions.clone(),
        });
    }
    Ok(entries)
}
// Integration-style tests for the scanners. They rely on the repository
// helpers from `crate::tests::commit_helper` (`init_repo`, `commit_file`,
// `head_oid`).
// NOTE(review): the comments below assume `commit_file` writes the file and
// creates one commit per call — confirm against the helper.
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::commit_helper::*;
// Builds the text of an LFS pointer for `content`: the spec version line,
// `oid sha256:<hex digest of content>`, and `size <content length>`.
fn pointer_text(content: &[u8]) -> Vec<u8> {
use sha2::{Digest, Sha256};
let oid_bytes: [u8; 32] = Sha256::digest(content).into();
// Lower-case hex encoding of the digest, two characters per byte.
let oid_hex = oid_bytes.iter().fold(String::new(), |mut s, b| {
use std::fmt::Write;
let _ = write!(s, "{b:02x}");
s
});
format!(
"version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
content.len()
)
.into_bytes()
}
// A repository holding only a plain (non-pointer) file yields no entries.
// Despite the name, the repository is not empty: one file is committed.
#[test]
fn empty_repo_returns_no_pointers() {
let repo = init_repo();
commit_file(&repo, "a.txt", b"plain content");
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert!(result.is_empty());
}
// Only the pointer blob is reported; its size and path come through.
#[test]
fn finds_pointer_blobs_skips_plain_blobs() {
let repo = init_repo();
commit_file(&repo, "plain.txt", b"just text");
let pointer = pointer_text(b"this would be the actual binary content");
commit_file(&repo, "big.bin", &pointer);
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert_eq!(result.len(), 1, "{result:?}");
assert_eq!(
result[0].size,
b"this would be the actual binary content".len() as u64,
);
assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
}
// The same pointer committed under two names produces a single entry.
#[test]
fn dedups_same_lfs_oid_in_multiple_paths() {
let repo = init_repo();
let pointer = pointer_text(b"shared payload");
commit_file(&repo, "first.bin", &pointer);
commit_file(&repo, "second.bin", &pointer);
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert_eq!(result.len(), 1, "{result:?}");
}
// A pointer that was later overwritten with plain text is still found,
// because `scan_pointers` looks at history rather than only the tip tree.
#[test]
fn finds_pointers_in_history_not_just_tip() {
let repo = init_repo();
let pointer = pointer_text(b"deleted later");
commit_file(&repo, "x.bin", &pointer);
commit_file(&repo, "x.bin", b"plain text now");
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert_eq!(result.len(), 1);
assert_eq!(result[0].size, b"deleted later".len() as u64);
}
// Excluding the first commit leaves only the pointer added afterwards.
#[test]
fn excludes_filter_history_walk() {
let repo = init_repo();
commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
let first = head_oid(&repo);
commit_file(&repo, "new.bin", &pointer_text(b"new payload"));
let result = scan_pointers(repo.path(), &["HEAD"], &[&first]).unwrap();
assert_eq!(result.len(), 1, "{result:?}");
assert_eq!(result[0].size, b"new payload".len() as u64);
}
// A small blob that starts with "version" but is not a valid pointer is
// dropped by the parse step.
#[test]
fn skips_blobs_that_look_like_pointers_but_dont_parse() {
let repo = init_repo();
commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");
let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
assert!(result.is_empty(), "{result:?}");
}
// `scan_tree` looks only at the tree of the given ref: a pointer replaced
// by plain text in a later commit is not reported for `HEAD`.
#[test]
fn scan_tree_returns_only_tree_entries_not_history() {
let repo = init_repo();
let pointer = pointer_text(b"deleted later");
commit_file(&repo, "x.bin", &pointer);
commit_file(&repo, "x.bin", b"plain text now");
let result = scan_tree(repo.path(), "HEAD").unwrap();
assert!(result.is_empty(), "{result:?}");
}
// `scan_tree` does not de-duplicate: two paths with the same pointer give
// two entries sharing one OID.
#[test]
fn scan_tree_emits_one_entry_per_path_not_per_oid() {
let repo = init_repo();
let pointer = pointer_text(b"shared payload");
commit_file(&repo, "first.bin", &pointer);
commit_file(&repo, "second.bin", &pointer);
let mut result = scan_tree(repo.path(), "HEAD").unwrap();
// Sort so the assertions do not depend on listing order.
result.sort_by(|a, b| a.path.cmp(&b.path));
assert_eq!(result.len(), 2, "{result:?}");
assert_eq!(result[0].path.as_deref(), Some(Path::new("first.bin")));
assert_eq!(result[1].path.as_deref(), Some(Path::new("second.bin")));
assert_eq!(result[0].oid, result[1].oid);
}
// Plain blobs in the tree are ignored; the pointer blob is returned.
#[test]
fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
let repo = init_repo();
commit_file(&repo, "plain.txt", b"just text");
let pointer = pointer_text(b"binary content");
commit_file(&repo, "big.bin", &pointer);
let result = scan_tree(repo.path(), "HEAD").unwrap();
assert_eq!(result.len(), 1, "{result:?}");
assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
}
// An unknown ref surfaces as `Error::Failed` whose message mentions the
// ref name or git's "Not a valid" wording.
#[test]
fn scan_tree_unknown_ref_errors() {
let repo = init_repo();
commit_file(&repo, "a.txt", b"x");
let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();
match err {
Error::Failed(msg) => assert!(
msg.contains("does-not-exist") || msg.contains("Not a valid"),
"unexpected message: {msg}"
),
_ => panic!("expected Failed, got {err:?}"),
}
}
}