1use std::path::{Path, PathBuf};
19use std::process::Command;
20
21use git_lfs_pointer::{Extension, MAX_POINTER_SIZE, Oid, Pointer};
22
23use crate::Error;
24use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
25
/// One Git LFS pointer found by the scanners in this module.
///
/// `oid`, `size`, `canonical` and `extensions` are copied from the parsed
/// [`Pointer`]; `path` and `paths` record where the pointer blob was seen.
#[derive(Debug, Clone)]
pub struct PointerEntry {
    /// Object id stored inside the pointer text. This is the pointer's own
    /// `oid` field, not the id of the git blob that holds the pointer.
    pub oid: Oid,
    /// Size stored inside the pointer text.
    pub size: u64,
    /// Path recorded when this entry was created. It is `None` when the
    /// scanner had no name for the blob at that point.
    pub path: Option<PathBuf>,
    /// Every distinct path collected for this entry. [`scan_pointers`] and
    /// [`scan_index_lfs`] merge sightings that share an `oid` into one entry;
    /// [`scan_tree`] emits one entry per path, so it always holds one element
    /// there.
    pub paths: Vec<PathBuf>,
    /// Copied from the parsed pointer's `canonical` field.
    // NOTE(review): presumably reports whether the pointer text was already in
    // canonical form — confirm against `git_lfs_pointer::Pointer`.
    pub canonical: bool,
    /// Copied from the parsed pointer's `extensions` field.
    pub extensions: Vec<Extension>,
}
56
57pub fn scan_pointers(
69 cwd: &Path,
70 include: &[&str],
71 exclude: &[&str],
72) -> Result<Vec<PointerEntry>, Error> {
73 scan_pointers_with_args(cwd, include, exclude, &[])
74}
75
76pub fn scan_pointers_with_args(
79 cwd: &Path,
80 include: &[&str],
81 exclude: &[&str],
82 extra_cmdline_args: &[&str],
83) -> Result<Vec<PointerEntry>, Error> {
84 let entries = crate::rev_list::rev_list_with_args(cwd, include, exclude, extra_cmdline_args)?;
85
86 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
89 let mut candidates: Vec<(String, Option<String>)> = Vec::new();
90 for entry in entries {
91 match bcheck.check(&entry.oid)? {
92 CatFileHeader::Found { kind, size, .. }
93 if kind == "blob" && (size as usize) < MAX_POINTER_SIZE =>
94 {
95 candidates.push((entry.oid, entry.name));
96 }
97 _ => {}
99 }
100 }
101 drop(bcheck);
102
103 let mut batch = CatFileBatch::spawn(cwd)?;
108 let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
109 let mut out: Vec<PointerEntry> = Vec::new();
110 for (oid, name) in candidates {
111 let Some(blob) = batch.read(&oid)? else {
112 continue;
113 };
114 let Ok(pointer) = Pointer::parse(&blob.content) else {
115 continue;
116 };
117 let path_buf = name.map(PathBuf::from);
118 if let Some(&idx) = by_oid.get(&pointer.oid) {
119 if let Some(p) = path_buf
120 && !out[idx].paths.contains(&p)
121 {
122 out[idx].paths.push(p);
123 }
124 continue;
125 }
126 let paths: Vec<PathBuf> = path_buf.iter().cloned().collect();
127 by_oid.insert(pointer.oid, out.len());
128 out.push(PointerEntry {
129 oid: pointer.oid,
130 size: pointer.size,
131 path: path_buf,
132 paths,
133 canonical: pointer.canonical,
134 extensions: pointer.extensions.clone(),
135 });
136 }
137 Ok(out)
138}
139
140pub fn scan_index_lfs(cwd: &Path) -> Result<Vec<PointerEntry>, Error> {
154 let scan_cwd = match crate::run_git(cwd, &["rev-parse", "--show-toplevel"]) {
160 Ok(s) if !s.is_empty() => PathBuf::from(s),
161 _ => crate::run_git(cwd, &["rev-parse", "--absolute-git-dir"])
162 .map(PathBuf::from)
163 .unwrap_or_else(|_| cwd.to_path_buf()),
164 };
165 let filter_by_parent_dir = is_bare_repo(&scan_cwd) || is_sparse_checkout(&scan_cwd);
172
173 let out = Command::new("git")
174 .arg("-C")
175 .arg(&scan_cwd)
176 .args(["ls-files", "--stage", "-z", "--", ":(attr:filter=lfs)"])
177 .output()?;
178 if !out.status.success() {
179 return Err(Error::Failed(
180 String::from_utf8_lossy(&out.stderr).trim().to_owned(),
181 ));
182 }
183
184 let mut candidates: Vec<(String, PathBuf)> = Vec::new();
185 for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
186 let s = match std::str::from_utf8(record) {
187 Ok(s) => s,
188 Err(_) => continue,
189 };
190 let Some((meta, path)) = s.split_once('\t') else {
192 continue;
193 };
194 let parts: Vec<&str> = meta.split_whitespace().collect();
195 if parts.len() < 3 {
196 continue;
197 }
198 let mode = parts[0];
199 let oid = parts[1];
200 if mode == "120000" {
201 continue;
202 }
203 let path = PathBuf::from(path);
204 if filter_by_parent_dir
213 && let Some(parent) = path.parent()
214 && !parent.as_os_str().is_empty()
215 && !scan_cwd.join(parent).is_dir()
216 {
217 continue;
218 }
219 candidates.push((oid.to_string(), path));
220 }
221 if candidates.is_empty() {
222 return Ok(Vec::new());
223 }
224
225 let mut batch = CatFileBatch::spawn(cwd)?;
226 let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
227 let mut out: Vec<PointerEntry> = Vec::new();
228 for (oid, path) in candidates {
229 let Some(blob) = batch.read(&oid)? else {
230 continue;
231 };
232 let Ok(pointer) = Pointer::parse(&blob.content) else {
233 continue;
234 };
235 if let Some(&idx) = by_oid.get(&pointer.oid) {
236 if !out[idx].paths.contains(&path) {
237 out[idx].paths.push(path);
238 }
239 continue;
240 }
241 by_oid.insert(pointer.oid, out.len());
242 out.push(PointerEntry {
243 oid: pointer.oid,
244 size: pointer.size,
245 path: Some(path.clone()),
246 paths: vec![path],
247 canonical: pointer.canonical,
248 extensions: pointer.extensions.clone(),
249 });
250 }
251 Ok(out)
252}
253
254fn is_bare_repo(cwd: &Path) -> bool {
255 crate::run_git(cwd, &["rev-parse", "--is-bare-repository"])
256 .map(|s| s.trim() == "true")
257 .unwrap_or(false)
258}
259
260fn is_sparse_checkout(cwd: &Path) -> bool {
261 crate::run_git(cwd, &["config", "--get", "core.sparseCheckout"])
262 .map(|s| s.trim().eq_ignore_ascii_case("true"))
263 .unwrap_or(false)
264}
265
/// A blob entry listed by `git ls-tree`, together with the size reported by
/// the cat-file batch check.
#[derive(Debug, Clone)]
pub struct TreeBlob {
    /// Path of the entry as printed by `git ls-tree --full-tree`.
    pub path: PathBuf,
    /// Git object id of the blob, kept as the text printed by `git ls-tree`.
    pub blob_oid: String,
    /// Object size reported by the cat-file batch check for `blob_oid`.
    pub size: u64,
    /// Mode field of the `ls-tree` record, kept as text.
    pub mode: String,
}
282
283pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
287 if reference.contains("..") {
292 return scan_blobs_in_range(cwd, reference);
293 }
294 scan_tree_blobs_for_ref(cwd, reference)
295}
296
297fn scan_tree_blobs_for_ref(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
298 let out = Command::new("git")
299 .arg("-C")
300 .arg(cwd)
301 .args(["ls-tree", "--full-tree", "-r", "-z", reference])
302 .output()?;
303 if !out.status.success() {
304 return Err(Error::Failed(format!(
305 "git ls-tree failed: {}",
306 String::from_utf8_lossy(&out.stderr).trim()
307 )));
308 }
309 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
310 let mut blobs = Vec::new();
311 for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
312 let s = std::str::from_utf8(record)
313 .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
314 let (header, path) = s
315 .split_once('\t')
316 .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
317 let mut parts = header.split_whitespace();
318 let mode = parts
319 .next()
320 .ok_or_else(|| Error::Failed(format!("ls-tree: missing mode in {s:?}")))?;
321 let kind = parts.next();
322 let oid = parts
323 .next()
324 .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
325 if kind != Some("blob") {
326 continue;
327 }
328 if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
329 && kind == "blob"
330 {
331 blobs.push(TreeBlob {
332 path: PathBuf::from(path),
333 blob_oid: oid.to_owned(),
334 size,
335 mode: mode.to_owned(),
336 });
337 }
338 }
339 Ok(blobs)
340}
341
342fn scan_blobs_in_range(cwd: &Path, range: &str) -> Result<Vec<TreeBlob>, Error> {
347 let out = Command::new("git")
348 .arg("-C")
349 .arg(cwd)
350 .args(["rev-list", range])
351 .output()?;
352 if !out.status.success() {
353 return Err(Error::Failed(format!(
354 "git rev-list failed: {}",
355 String::from_utf8_lossy(&out.stderr).trim()
356 )));
357 }
358 let mut seen: std::collections::HashSet<(PathBuf, String)> = std::collections::HashSet::new();
359 let mut all = Vec::new();
360 for line in String::from_utf8_lossy(&out.stdout).lines() {
361 let commit = line.trim();
362 if commit.is_empty() {
363 continue;
364 }
365 for blob in scan_tree_blobs_for_ref(cwd, commit)? {
366 if seen.insert((blob.path.clone(), blob.blob_oid.clone())) {
367 all.push(blob);
368 }
369 }
370 }
371 Ok(all)
372}
373
374pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
386 let out = Command::new("git")
387 .arg("-C")
388 .arg(cwd)
389 .args(["ls-tree", "--full-tree", "-r", "-z", reference])
390 .output()?;
391 if !out.status.success() {
392 return Err(Error::Failed(format!(
393 "git ls-tree failed: {}",
394 String::from_utf8_lossy(&out.stderr).trim()
395 )));
396 }
397
398 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
401 let mut candidates: Vec<(String, String)> = Vec::new();
402 for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
403 let s = std::str::from_utf8(record)
404 .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
405 let (header, path) = s
406 .split_once('\t')
407 .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
408 let mut parts = header.split_whitespace();
409 let _mode = parts.next();
410 let kind = parts.next();
411 let oid = parts
412 .next()
413 .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
414 if kind != Some("blob") {
415 continue;
416 }
417 if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
418 && kind == "blob"
419 && (size as usize) < MAX_POINTER_SIZE
420 {
421 candidates.push((oid.to_owned(), path.to_owned()));
422 }
423 }
424 drop(bcheck);
425
426 let mut batch = CatFileBatch::spawn(cwd)?;
430 let mut entries = Vec::new();
431 for (oid, path) in candidates {
432 let Some(blob) = batch.read(&oid)? else {
433 continue;
434 };
435 let Ok(pointer) = Pointer::parse(&blob.content) else {
436 continue;
437 };
438 let path_buf = PathBuf::from(path);
439 entries.push(PointerEntry {
440 oid: pointer.oid,
441 size: pointer.size,
442 path: Some(path_buf.clone()),
443 paths: vec![path_buf],
444 canonical: pointer.canonical,
445 extensions: pointer.extensions.clone(),
446 });
447 }
448 Ok(entries)
449}
450
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// Builds the text of a v1 LFS pointer describing `content`: the `oid` is
    /// the SHA-256 of `content` and the `size` is its byte length.
    fn pointer_text(content: &[u8]) -> Vec<u8> {
        use sha2::{Digest, Sha256};
        use std::fmt::Write;

        let digest: [u8; 32] = Sha256::digest(content).into();
        let mut oid_hex = String::with_capacity(digest.len() * 2);
        for byte in &digest {
            let _ = write!(oid_hex, "{byte:02x}");
        }
        let text = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
            content.len()
        );
        text.into_bytes()
    }

    #[test]
    fn empty_repo_returns_no_pointers() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"plain content");

        let found = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert!(found.is_empty());
    }

    #[test]
    fn finds_pointer_blobs_skips_plain_blobs() {
        let payload = b"this would be the actual binary content";
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        // Only the pointer text is committed, never the payload itself.
        let pointer = pointer_text(payload);
        commit_file(&repo, "big.bin", &pointer);

        let found = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(found.len(), 1, "{found:?}");
        assert_eq!(found[0].size, payload.len() as u64);
        assert_eq!(found[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn dedups_same_lfs_oid_in_multiple_paths() {
        let repo = init_repo();
        let pointer = pointer_text(b"shared payload");
        for name in ["first.bin", "second.bin"] {
            commit_file(&repo, name, &pointer);
        }

        // Two paths hold the same pointer, so one entry is expected.
        let found = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(found.len(), 1, "{found:?}");
    }

    #[test]
    fn finds_pointers_in_history_not_just_tip() {
        let payload = b"deleted later";
        let repo = init_repo();
        // The pointer exists only in the first commit; the tip overwrites it
        // with plain text.
        commit_file(&repo, "x.bin", &pointer_text(payload));
        commit_file(&repo, "x.bin", b"plain text now");

        let found = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].size, payload.len() as u64);
    }

    #[test]
    fn excludes_filter_history_walk() {
        let repo = init_repo();
        commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
        let base = head_oid(&repo);
        commit_file(&repo, "new.bin", &pointer_text(b"new payload"));

        // Excluding the first commit leaves only what the second one added.
        let found = scan_pointers(repo.path(), &["HEAD"], &[&base]).unwrap();
        assert_eq!(found.len(), 1, "{found:?}");
        assert_eq!(found[0].size, b"new payload".len() as u64);
    }

    #[test]
    fn skips_blobs_that_look_like_pointers_but_dont_parse() {
        let repo = init_repo();
        // Small enough to pass the size gate, but not a valid pointer.
        commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");

        let found = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert!(found.is_empty(), "{found:?}");
    }

    #[test]
    fn scan_tree_returns_only_tree_entries_not_history() {
        let repo = init_repo();
        // The pointer lives only in history; the tip tree holds plain text.
        commit_file(&repo, "x.bin", &pointer_text(b"deleted later"));
        commit_file(&repo, "x.bin", b"plain text now");

        let found = scan_tree(repo.path(), "HEAD").unwrap();
        assert!(found.is_empty(), "{found:?}");
    }

    #[test]
    fn scan_tree_emits_one_entry_per_path_not_per_oid() {
        let repo = init_repo();
        let pointer = pointer_text(b"shared payload");
        // The same pointer is committed at two paths; `scan_tree` must report both.
        commit_file(&repo, "first.bin", &pointer);
        commit_file(&repo, "second.bin", &pointer);

        let mut found = scan_tree(repo.path(), "HEAD").unwrap();
        found.sort_by(|left, right| left.path.cmp(&right.path));
        assert_eq!(found.len(), 2, "{found:?}");
        assert_eq!(found[0].path.as_deref(), Some(Path::new("first.bin")));
        assert_eq!(found[1].path.as_deref(), Some(Path::new("second.bin")));
        assert_eq!(found[0].oid, found[1].oid);
    }

    #[test]
    fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        commit_file(&repo, "big.bin", &pointer_text(b"binary content"));

        let found = scan_tree(repo.path(), "HEAD").unwrap();
        assert_eq!(found.len(), 1, "{found:?}");
        assert_eq!(found[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn scan_tree_unknown_ref_errors() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");

        let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();
        match err {
            Error::Failed(msg) => {
                let mentions_ref = msg.contains("does-not-exist");
                assert!(
                    mentions_ref || msg.contains("Not a valid"),
                    "unexpected message: {msg}"
                );
            }
            _ => panic!("expected Failed, got {err:?}"),
        }
    }
}