1use std::path::{Path, PathBuf};
19use std::process::Command;
20
21use git_lfs_pointer::{MAX_POINTER_SIZE, Oid, Pointer};
22
23use crate::Error;
24use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
25
26#[derive(Debug, Clone)]
28pub struct PointerEntry {
29 pub oid: Oid,
31 pub size: u64,
33 pub path: Option<PathBuf>,
38 pub paths: Vec<PathBuf>,
45 pub canonical: bool,
49}
50
51pub fn scan_pointers(
63 cwd: &Path,
64 include: &[&str],
65 exclude: &[&str],
66) -> Result<Vec<PointerEntry>, Error> {
67 scan_pointers_with_args(cwd, include, exclude, &[])
68}
69
70pub fn scan_pointers_with_args(
73 cwd: &Path,
74 include: &[&str],
75 exclude: &[&str],
76 extra_cmdline_args: &[&str],
77) -> Result<Vec<PointerEntry>, Error> {
78 let entries = crate::rev_list::rev_list_with_args(cwd, include, exclude, extra_cmdline_args)?;
79
80 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
83 let mut candidates: Vec<(String, Option<String>)> = Vec::new();
84 for entry in entries {
85 match bcheck.check(&entry.oid)? {
86 CatFileHeader::Found { kind, size, .. }
87 if kind == "blob" && (size as usize) < MAX_POINTER_SIZE =>
88 {
89 candidates.push((entry.oid, entry.name));
90 }
91 _ => {}
93 }
94 }
95 drop(bcheck);
96
97 let mut batch = CatFileBatch::spawn(cwd)?;
102 let mut by_oid: std::collections::HashMap<Oid, usize> = std::collections::HashMap::new();
103 let mut out: Vec<PointerEntry> = Vec::new();
104 for (oid, name) in candidates {
105 let Some(blob) = batch.read(&oid)? else {
106 continue;
107 };
108 let Ok(pointer) = Pointer::parse(&blob.content) else {
109 continue;
110 };
111 let path_buf = name.map(PathBuf::from);
112 if let Some(&idx) = by_oid.get(&pointer.oid) {
113 if let Some(p) = path_buf
114 && !out[idx].paths.contains(&p)
115 {
116 out[idx].paths.push(p);
117 }
118 continue;
119 }
120 let paths: Vec<PathBuf> = path_buf.iter().cloned().collect();
121 by_oid.insert(pointer.oid, out.len());
122 out.push(PointerEntry {
123 oid: pointer.oid,
124 size: pointer.size,
125 path: path_buf,
126 paths,
127 canonical: pointer.canonical,
128 });
129 }
130 Ok(out)
131}
132
/// A blob entry of a tree, as reported by [`scan_tree_blobs`].
#[derive(Clone, Debug)]
pub struct TreeBlob {
    /// Path field of the `ls-tree` record.
    pub path: PathBuf,
    /// Git object id of the blob, kept as the text `ls-tree` printed.
    pub blob_oid: String,
    /// Blob size as reported by the `cat-file` header check.
    pub size: u64,
    /// Mode field of the `ls-tree` record, kept as text.
    pub mode: String,
}
149
150pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
154 let out = Command::new("git")
155 .arg("-C")
156 .arg(cwd)
157 .args(["ls-tree", "--full-tree", "-r", "-z", reference])
158 .output()?;
159 if !out.status.success() {
160 return Err(Error::Failed(format!(
161 "git ls-tree failed: {}",
162 String::from_utf8_lossy(&out.stderr).trim()
163 )));
164 }
165 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
166 let mut blobs = Vec::new();
167 for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
168 let s = std::str::from_utf8(record)
169 .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
170 let (header, path) = s
171 .split_once('\t')
172 .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
173 let mut parts = header.split_whitespace();
174 let mode = parts
175 .next()
176 .ok_or_else(|| Error::Failed(format!("ls-tree: missing mode in {s:?}")))?;
177 let kind = parts.next();
178 let oid = parts
179 .next()
180 .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
181 if kind != Some("blob") {
182 continue;
183 }
184 if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
185 && kind == "blob"
186 {
187 blobs.push(TreeBlob {
188 path: PathBuf::from(path),
189 blob_oid: oid.to_owned(),
190 size,
191 mode: mode.to_owned(),
192 });
193 }
194 }
195 Ok(blobs)
196}
197
198pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
210 let out = Command::new("git")
211 .arg("-C")
212 .arg(cwd)
213 .args(["ls-tree", "--full-tree", "-r", "-z", reference])
214 .output()?;
215 if !out.status.success() {
216 return Err(Error::Failed(format!(
217 "git ls-tree failed: {}",
218 String::from_utf8_lossy(&out.stderr).trim()
219 )));
220 }
221
222 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
225 let mut candidates: Vec<(String, String)> = Vec::new();
226 for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
227 let s = std::str::from_utf8(record)
228 .map_err(|e| Error::Failed(format!("ls-tree: non-utf8 record: {e}")))?;
229 let (header, path) = s
230 .split_once('\t')
231 .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
232 let mut parts = header.split_whitespace();
233 let _mode = parts.next();
234 let kind = parts.next();
235 let oid = parts
236 .next()
237 .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
238 if kind != Some("blob") {
239 continue;
240 }
241 if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
242 && kind == "blob"
243 && (size as usize) < MAX_POINTER_SIZE
244 {
245 candidates.push((oid.to_owned(), path.to_owned()));
246 }
247 }
248 drop(bcheck);
249
250 let mut batch = CatFileBatch::spawn(cwd)?;
254 let mut entries = Vec::new();
255 for (oid, path) in candidates {
256 let Some(blob) = batch.read(&oid)? else {
257 continue;
258 };
259 let Ok(pointer) = Pointer::parse(&blob.content) else {
260 continue;
261 };
262 let path_buf = PathBuf::from(path);
263 entries.push(PointerEntry {
264 oid: pointer.oid,
265 size: pointer.size,
266 path: Some(path_buf.clone()),
267 paths: vec![path_buf],
268 canonical: pointer.canonical,
269 });
270 }
271 Ok(entries)
272}
273
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// Builds LFS pointer text for `content`: the spec-v1 version line, the
    /// SHA-256 of `content` as lowercase hex, and its length in bytes.
    fn pointer_text(content: &[u8]) -> Vec<u8> {
        use sha2::{Digest, Sha256};
        use std::fmt::Write;

        let oid_bytes: [u8; 32] = Sha256::digest(content).into();
        let mut oid_hex = String::new();
        for b in &oid_bytes {
            // Writing into a `String` cannot fail; the result is ignored.
            let _ = write!(oid_hex, "{b:02x}");
        }
        let text = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
            content.len()
        );
        text.into_bytes()
    }

    // --- scan_pointers: history walk -------------------------------------

    /// A repository holding only plain content yields no pointers.
    #[test]
    fn empty_repo_returns_no_pointers() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"plain content");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert!(result.is_empty());
    }

    /// Only the pointer blob is reported; the plain blob is ignored.
    #[test]
    fn finds_pointer_blobs_skips_plain_blobs() {
        let payload: &[u8] = b"this would be the actual binary content";
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        commit_file(&repo, "big.bin", &pointer_text(payload));

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        let entry = &result[0];
        assert_eq!(entry.size, payload.len() as u64);
        assert_eq!(entry.path.as_deref(), Some(Path::new("big.bin")));
    }

    /// The same pointer committed under two names is reported once.
    #[test]
    fn dedups_same_lfs_oid_in_multiple_paths() {
        let repo = init_repo();
        let shared = pointer_text(b"shared payload");
        for name in ["first.bin", "second.bin"] {
            commit_file(&repo, name, &shared);
        }

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
    }

    /// A pointer overwritten by a later commit is still found via history.
    #[test]
    fn finds_pointers_in_history_not_just_tip() {
        let payload: &[u8] = b"deleted later";
        let repo = init_repo();
        commit_file(&repo, "x.bin", &pointer_text(payload));
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].size, payload.len() as u64);
    }

    /// Excluding the first commit leaves only the pointer added after it.
    #[test]
    fn excludes_filter_history_walk() {
        let newer: &[u8] = b"new payload";
        let repo = init_repo();
        commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
        let first = head_oid(&repo);
        commit_file(&repo, "new.bin", &pointer_text(newer));

        let result = scan_pointers(repo.path(), &["HEAD"], &[&first]).unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].size, newer.len() as u64);
    }

    /// A small blob that starts like a pointer but fails to parse is skipped.
    #[test]
    fn skips_blobs_that_look_like_pointers_but_dont_parse() {
        let repo = init_repo();
        commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();

        assert!(result.is_empty(), "{result:?}");
    }

    // --- scan_tree: single tree, no history ------------------------------

    /// A pointer that only exists in an older commit is not reported.
    #[test]
    fn scan_tree_returns_only_tree_entries_not_history() {
        let repo = init_repo();
        commit_file(&repo, "x.bin", &pointer_text(b"deleted later"));
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_tree(repo.path(), "HEAD").unwrap();

        assert!(result.is_empty(), "{result:?}");
    }

    /// Two paths with the same pointer give two entries sharing one OID.
    #[test]
    fn scan_tree_emits_one_entry_per_path_not_per_oid() {
        let repo = init_repo();
        let shared = pointer_text(b"shared payload");
        for name in ["first.bin", "second.bin"] {
            commit_file(&repo, name, &shared);
        }

        let mut result = scan_tree(repo.path(), "HEAD").unwrap();
        result.sort_by(|lhs, rhs| lhs.path.cmp(&rhs.path));

        assert_eq!(result.len(), 2, "{result:?}");
        let (first, second) = (&result[0], &result[1]);
        assert_eq!(first.path.as_deref(), Some(Path::new("first.bin")));
        assert_eq!(second.path.as_deref(), Some(Path::new("second.bin")));
        assert_eq!(first.oid, second.oid);
    }

    /// Plain blobs in the tree are ignored; the pointer is kept.
    #[test]
    fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        commit_file(&repo, "big.bin", &pointer_text(b"binary content"));

        let result = scan_tree(repo.path(), "HEAD").unwrap();

        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    /// An unresolvable reference surfaces as `Error::Failed` carrying git's
    /// complaint.
    #[test]
    fn scan_tree_unknown_ref_errors() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");

        let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();

        match err {
            Error::Failed(msg) => {
                let mentions_ref = msg.contains("does-not-exist");
                assert!(
                    mentions_ref || msg.contains("Not a valid"),
                    "unexpected message: {msg}"
                );
            }
            _ => panic!("expected Failed, got {err:?}"),
        }
    }
}