1use std::collections::HashSet;
19use std::path::{Path, PathBuf};
20use std::process::Command;
21
22use git_lfs_pointer::{MAX_POINTER_SIZE, Oid, Pointer};
23
24use crate::cat_file::{CatFileBatch, CatFileBatchCheck, CatFileHeader};
25use crate::{Error, rev_list};
26
27#[derive(Debug, Clone)]
29pub struct PointerEntry {
30 pub oid: Oid,
32 pub size: u64,
34 pub path: Option<PathBuf>,
39 pub canonical: bool,
43}
44
45pub fn scan_pointers(
57 cwd: &Path,
58 include: &[&str],
59 exclude: &[&str],
60) -> Result<Vec<PointerEntry>, Error> {
61 let entries = rev_list(cwd, include, exclude)?;
62
63 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
66 let mut candidates: Vec<(String, Option<String>)> = Vec::new();
67 for entry in entries {
68 match bcheck.check(&entry.oid)? {
69 CatFileHeader::Found { kind, size, .. }
70 if kind == "blob" && (size as usize) < MAX_POINTER_SIZE =>
71 {
72 candidates.push((entry.oid, entry.name));
73 }
74 _ => {}
76 }
77 }
78 drop(bcheck);
79
80 let mut batch = CatFileBatch::spawn(cwd)?;
84 let mut seen: HashSet<Oid> = HashSet::new();
85 let mut out = Vec::new();
86 for (oid, name) in candidates {
87 let Some(blob) = batch.read(&oid)? else { continue };
88 let Ok(pointer) = Pointer::parse(&blob.content) else { continue };
89 if seen.insert(pointer.oid) {
90 out.push(PointerEntry {
91 oid: pointer.oid,
92 size: pointer.size,
93 path: name.map(PathBuf::from),
94 canonical: pointer.canonical,
95 });
96 }
97 }
98 Ok(out)
99}
100
/// One blob entry of a tree, as listed by [`scan_tree_blobs`].
///
/// Equality and hashing are structural over all three fields.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TreeBlob {
    /// Path of the entry as printed by `git ls-tree --full-tree`.
    pub path: PathBuf,
    /// Git object id of the blob, in the textual form `git ls-tree` printed.
    pub blob_oid: String,
    /// Blob size as reported by the cat-file header check.
    pub size: u64,
}
113
114pub fn scan_tree_blobs(cwd: &Path, reference: &str) -> Result<Vec<TreeBlob>, Error> {
118 let out = Command::new("git")
119 .arg("-C")
120 .arg(cwd)
121 .args(["ls-tree", "--full-tree", "-r", "-z", reference])
122 .output()?;
123 if !out.status.success() {
124 return Err(Error::Failed(format!(
125 "git ls-tree failed: {}",
126 String::from_utf8_lossy(&out.stderr).trim()
127 )));
128 }
129 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
130 let mut blobs = Vec::new();
131 for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
132 let s = std::str::from_utf8(record).map_err(|e| {
133 Error::Failed(format!("ls-tree: non-utf8 record: {e}"))
134 })?;
135 let (header, path) = s
136 .split_once('\t')
137 .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
138 let mut parts = header.split_whitespace();
139 let _mode = parts.next();
140 let kind = parts.next();
141 let oid = parts
142 .next()
143 .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
144 if kind != Some("blob") {
145 continue;
146 }
147 if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
148 && kind == "blob"
149 {
150 blobs.push(TreeBlob {
151 path: PathBuf::from(path),
152 blob_oid: oid.to_owned(),
153 size,
154 });
155 }
156 }
157 Ok(blobs)
158}
159
160pub fn scan_tree(cwd: &Path, reference: &str) -> Result<Vec<PointerEntry>, Error> {
172 let out = Command::new("git")
173 .arg("-C")
174 .arg(cwd)
175 .args(["ls-tree", "--full-tree", "-r", "-z", reference])
176 .output()?;
177 if !out.status.success() {
178 return Err(Error::Failed(format!(
179 "git ls-tree failed: {}",
180 String::from_utf8_lossy(&out.stderr).trim()
181 )));
182 }
183
184 let mut bcheck = CatFileBatchCheck::spawn(cwd)?;
187 let mut candidates: Vec<(String, String)> = Vec::new();
188 for record in out.stdout.split(|&b| b == 0).filter(|s| !s.is_empty()) {
189 let s = std::str::from_utf8(record).map_err(|e| {
190 Error::Failed(format!("ls-tree: non-utf8 record: {e}"))
191 })?;
192 let (header, path) = s
193 .split_once('\t')
194 .ok_or_else(|| Error::Failed(format!("ls-tree: malformed record {s:?}")))?;
195 let mut parts = header.split_whitespace();
196 let _mode = parts.next();
197 let kind = parts.next();
198 let oid = parts
199 .next()
200 .ok_or_else(|| Error::Failed(format!("ls-tree: missing oid in {s:?}")))?;
201 if kind != Some("blob") {
202 continue;
203 }
204 if let CatFileHeader::Found { kind, size, .. } = bcheck.check(oid)?
205 && kind == "blob"
206 && (size as usize) < MAX_POINTER_SIZE
207 {
208 candidates.push((oid.to_owned(), path.to_owned()));
209 }
210 }
211 drop(bcheck);
212
213 let mut batch = CatFileBatch::spawn(cwd)?;
217 let mut entries = Vec::new();
218 for (oid, path) in candidates {
219 let Some(blob) = batch.read(&oid)? else { continue };
220 let Ok(pointer) = Pointer::parse(&blob.content) else {
221 continue;
222 };
223 entries.push(PointerEntry {
224 oid: pointer.oid,
225 size: pointer.size,
226 path: Some(PathBuf::from(path)),
227 canonical: pointer.canonical,
228 });
229 }
230 Ok(entries)
231}
232
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::commit_helper::*;

    /// Builds LFS pointer text for `content`: the v1 version line, the
    /// SHA-256 of `content` in lowercase hex, and its length in bytes.
    fn pointer_text(content: &[u8]) -> Vec<u8> {
        use sha2::{Digest, Sha256};
        let oid_bytes: [u8; 32] = Sha256::digest(content).into();
        let oid_hex = oid_bytes.iter().fold(String::new(), |mut s, b| {
            use std::fmt::Write;
            // Writing into a `String` cannot fail.
            let _ = write!(s, "{b:02x}");
            s
        });
        format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{oid_hex}\nsize {}\n",
            content.len()
        )
        .into_bytes()
    }

    // Despite the name, the repo has one commit; it just holds no pointer.
    #[test]
    fn empty_repo_returns_no_pointers() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"plain content");
        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn finds_pointer_blobs_skips_plain_blobs() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        let pointer = pointer_text(b"this would be the actual binary content");
        commit_file(&repo, "big.bin", &pointer);

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(result.len(), 1, "{result:?}");
        // The reported size is the one written in the pointer, not the
        // size of the pointer blob itself.
        assert_eq!(
            result[0].size,
            b"this would be the actual binary content".len() as u64,
        );
        assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn dedups_same_lfs_oid_in_multiple_paths() {
        let repo = init_repo();
        let pointer = pointer_text(b"shared payload");
        commit_file(&repo, "first.bin", &pointer);
        commit_file(&repo, "second.bin", &pointer);

        // Two paths, one LFS object: the history scan reports it once.
        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(result.len(), 1, "{result:?}");
    }

    #[test]
    fn finds_pointers_in_history_not_just_tip() {
        let repo = init_repo();
        let pointer = pointer_text(b"deleted later");
        // The pointer is overwritten by the second commit but is still
        // reachable through history.
        commit_file(&repo, "x.bin", &pointer);
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].size, b"deleted later".len() as u64);
    }

    #[test]
    fn excludes_filter_history_walk() {
        let repo = init_repo();
        commit_file(&repo, "old.bin", &pointer_text(b"old payload"));
        let first = head_oid(&repo);
        commit_file(&repo, "new.bin", &pointer_text(b"new payload"));

        // Excluding the first commit leaves only the pointer added after it.
        let result = scan_pointers(repo.path(), &["HEAD"], &[&first]).unwrap();
        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].size, b"new payload".len() as u64);
    }

    #[test]
    fn skips_blobs_that_look_like_pointers_but_dont_parse() {
        let repo = init_repo();
        // Small enough to be a candidate, but not valid pointer text.
        commit_file(&repo, "fake.bin", b"version foo\nbut not really a pointer");

        let result = scan_pointers(repo.path(), &["HEAD"], &[]).unwrap();
        assert!(result.is_empty(), "{result:?}");
    }

    #[test]
    fn scan_tree_returns_only_tree_entries_not_history() {
        let repo = init_repo();
        let pointer = pointer_text(b"deleted later");
        // Same setup as the history test above; the tree at HEAD only holds
        // the plain-text replacement, so nothing should be reported.
        commit_file(&repo, "x.bin", &pointer);
        commit_file(&repo, "x.bin", b"plain text now");

        let result = scan_tree(repo.path(), "HEAD").unwrap();
        assert!(result.is_empty(), "{result:?}");
    }

    #[test]
    fn scan_tree_emits_one_entry_per_path_not_per_oid() {
        let repo = init_repo();
        let pointer = pointer_text(b"shared payload");
        // Contrast with `dedups_same_lfs_oid_in_multiple_paths`: the tree
        // scan keeps both paths.
        commit_file(&repo, "first.bin", &pointer);
        commit_file(&repo, "second.bin", &pointer);

        let mut result = scan_tree(repo.path(), "HEAD").unwrap();
        result.sort_by(|a, b| a.path.cmp(&b.path));
        assert_eq!(result.len(), 2, "{result:?}");
        assert_eq!(result[0].path.as_deref(), Some(Path::new("first.bin")));
        assert_eq!(result[1].path.as_deref(), Some(Path::new("second.bin")));
        assert_eq!(result[0].oid, result[1].oid);
    }

    #[test]
    fn scan_tree_skips_plain_blobs_and_keeps_pointers() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        let pointer = pointer_text(b"binary content");
        commit_file(&repo, "big.bin", &pointer);

        let result = scan_tree(repo.path(), "HEAD").unwrap();
        assert_eq!(result.len(), 1, "{result:?}");
        assert_eq!(result[0].path.as_deref(), Some(Path::new("big.bin")));
    }

    #[test]
    fn scan_tree_unknown_ref_errors() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");
        let err = scan_tree(repo.path(), "does-not-exist").unwrap_err();
        match err {
            Error::Failed(msg) => assert!(
                msg.contains("does-not-exist") || msg.contains("Not a valid"),
                "unexpected message: {msg}"
            ),
            _ => panic!("expected Failed, got {err:?}"),
        }
    }

    #[test]
    fn scan_tree_blobs_reports_plain_and_pointer_blobs_with_sizes() {
        let repo = init_repo();
        commit_file(&repo, "plain.txt", b"just text");
        let pointer = pointer_text(b"binary content");
        commit_file(&repo, "big.bin", &pointer);

        let blobs = scan_tree_blobs(repo.path(), "HEAD").unwrap();
        // Look entries up by path rather than asserting a total count, so
        // the test does not depend on what `init_repo` itself commits.
        let size_of = |name: &str| {
            blobs
                .iter()
                .find(|b| b.path.as_path() == Path::new(name))
                .map(|b| b.size)
        };
        assert_eq!(
            size_of("plain.txt"),
            Some(b"just text".len() as u64),
            "{blobs:?}"
        );
        // For a pointer file the size is that of the pointer blob itself.
        assert_eq!(size_of("big.bin"), Some(pointer.len() as u64), "{blobs:?}");
    }

    #[test]
    fn scan_tree_blobs_unknown_ref_errors() {
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"x");
        let err = scan_tree_blobs(repo.path(), "does-not-exist").unwrap_err();
        assert!(
            matches!(err, Error::Failed(_)),
            "expected Failed, got {err:?}"
        );
    }
}