1use std::io::{self, Read, Write};
4use std::path::Path;
5use std::process::{Command, Stdio};
6
7use git_lfs_pointer::{Extension, Oid, Pointer};
8use git_lfs_store::{Store, StoreError};
9use sha2::{Digest, Sha256};
10use tempfile::NamedTempFile;
11
12use crate::detect_pointer;
13
/// Size in bytes (64 KiB) of the scratch buffer used when streaming content
/// through a hasher.
const COPY_BUFFER: usize = 1 << 16;
15
16#[derive(Debug)]
18pub enum CleanOutcome {
19 Passthrough(Pointer),
23 Stored(Pointer),
27}
28
29impl CleanOutcome {
30 pub fn pointer(&self) -> &Pointer {
33 match self {
34 Self::Passthrough(p) | Self::Stored(p) => p,
35 }
36 }
37
38 pub fn was_passthrough(&self) -> bool {
40 matches!(self, Self::Passthrough(_))
41 }
42}
43
/// One configured extension stage of the clean pipeline.
///
/// Stages run in slice order; each one reads the previous stage's output on
/// stdin and writes its own result to stdout.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CleanExtension {
    /// Name recorded in the pointer's extension entry and used in error
    /// messages.
    pub name: String,
    /// Priority value copied verbatim into the pointer's extension entry.
    pub priority: u8,
    /// Command line to execute. It is split on whitespace (no shell is
    /// involved) and every `%f` is replaced by the path of the file being
    /// cleaned. Must not be empty or whitespace-only.
    pub command: String,
}
55
56#[derive(Debug, thiserror::Error)]
57pub enum CleanError {
58 #[error(transparent)]
59 Io(#[from] io::Error),
60 #[error(transparent)]
61 Store(#[from] StoreError),
62 #[error("extension {name:?} has no clean command configured")]
63 ExtensionMissingCommand { name: String },
64 #[error("failed to spawn extension {name:?}: {source}")]
65 ExtensionSpawnFailed {
66 name: String,
67 #[source]
68 source: io::Error,
69 },
70 #[error("extension {name:?} exited with status {status:?}")]
71 ExtensionFailed { name: String, status: Option<i32> },
72}
73
74pub fn clean<R: Read, W: Write>(
91 store: &Store,
92 input: &mut R,
93 output: &mut W,
94 path: &str,
95 extensions: &[CleanExtension],
96) -> Result<CleanOutcome, CleanError> {
97 let (head, maybe_pointer) = detect_pointer(input)?;
98
99 if let Some(pointer) = maybe_pointer {
100 output.write_all(&head)?;
101 return Ok(CleanOutcome::Passthrough(pointer));
102 }
103
104 if extensions.is_empty() {
105 let mut combined = head.as_slice().chain(input);
106 let (oid, size) = store.insert(&mut combined)?;
107 let pointer = Pointer::new(oid, size);
108 output.write_all(pointer.encode().as_bytes())?;
109 return Ok(CleanOutcome::Stored(pointer));
110 }
111
112 for ext in extensions {
113 if ext.command.trim().is_empty() {
114 return Err(CleanError::ExtensionMissingCommand {
115 name: ext.name.clone(),
116 });
117 }
118 }
119
120 let tmp_dir = store.tmp_dir();
121 std::fs::create_dir_all(&tmp_dir)?;
122
123 let mut combined = head.as_slice().chain(input);
125 let mut current_tmp = NamedTempFile::new_in(&tmp_dir)?;
126 let orig_oid = hash_and_write(&mut combined, current_tmp.as_file_mut())?;
127 let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
128 input_oids.push(orig_oid);
129
130 for (i, ext) in extensions.iter().enumerate() {
134 let cmd_str = ext.command.replace("%f", path);
135 let mut parts = cmd_str.split_whitespace();
136 let prog = parts
137 .next()
138 .ok_or_else(|| CleanError::ExtensionMissingCommand {
139 name: ext.name.clone(),
140 })?;
141 let args: Vec<&str> = parts.collect();
142
143 let stdin_file = std::fs::File::open(current_tmp.path())?;
144 let mut child = Command::new(prog)
145 .args(&args)
146 .stdin(stdin_file)
147 .stdout(Stdio::piped())
148 .stderr(Stdio::inherit())
149 .spawn()
150 .map_err(|e| CleanError::ExtensionSpawnFailed {
151 name: ext.name.clone(),
152 source: e,
153 })?;
154 let mut stdout = child.stdout.take().expect("piped stdout");
155
156 let is_last = i + 1 == extensions.len();
157 if is_last {
158 let (oid, size) = store.insert(&mut stdout)?;
159 let status = child.wait()?;
160 if !status.success() {
161 return Err(CleanError::ExtensionFailed {
162 name: ext.name.clone(),
163 status: status.code(),
164 });
165 }
166
167 let pointer_extensions = build_pointer_extensions(extensions, &input_oids);
168 let pointer = Pointer {
169 oid,
170 size,
171 extensions: pointer_extensions,
172 canonical: true,
173 };
174 output.write_all(pointer.encode().as_bytes())?;
175 return Ok(CleanOutcome::Stored(pointer));
176 }
177
178 let mut next_tmp = NamedTempFile::new_in(&tmp_dir)?;
179 let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
180 let status = child.wait()?;
181 if !status.success() {
182 return Err(CleanError::ExtensionFailed {
183 name: ext.name.clone(),
184 status: status.code(),
185 });
186 }
187
188 current_tmp = next_tmp;
189 input_oids.push(next_oid);
190 }
191
192 unreachable!("clean loop exited without storing")
195}
196
197pub fn build_pointer_with_extensions<R: Read>(
208 input: &mut R,
209 path: &str,
210 extensions: &[CleanExtension],
211 tmp_dir: &Path,
212) -> Result<Pointer, CleanError> {
213 if extensions.is_empty() {
214 let mut hasher = Sha256::new();
215 let mut buf = vec![0u8; COPY_BUFFER];
216 let mut size: u64 = 0;
217 loop {
218 let n = input.read(&mut buf)?;
219 if n == 0 {
220 break;
221 }
222 hasher.update(&buf[..n]);
223 size += n as u64;
224 }
225 let bytes: [u8; 32] = hasher.finalize().into();
226 return Ok(Pointer::new(Oid::from_bytes(bytes), size));
227 }
228
229 for ext in extensions {
230 if ext.command.trim().is_empty() {
231 return Err(CleanError::ExtensionMissingCommand {
232 name: ext.name.clone(),
233 });
234 }
235 }
236
237 std::fs::create_dir_all(tmp_dir)?;
238
239 let mut current_tmp = NamedTempFile::new_in(tmp_dir)?;
240 let orig_oid = hash_and_write(input, current_tmp.as_file_mut())?;
241 let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
242 input_oids.push(orig_oid);
243
244 for (i, ext) in extensions.iter().enumerate() {
245 let cmd_str = ext.command.replace("%f", path);
246 let mut parts = cmd_str.split_whitespace();
247 let prog = parts
248 .next()
249 .ok_or_else(|| CleanError::ExtensionMissingCommand {
250 name: ext.name.clone(),
251 })?;
252 let args: Vec<&str> = parts.collect();
253
254 let stdin_file = std::fs::File::open(current_tmp.path())?;
255 let mut child = Command::new(prog)
256 .args(&args)
257 .stdin(stdin_file)
258 .stdout(Stdio::piped())
259 .stderr(Stdio::inherit())
260 .spawn()
261 .map_err(|e| CleanError::ExtensionSpawnFailed {
262 name: ext.name.clone(),
263 source: e,
264 })?;
265 let mut stdout = child.stdout.take().expect("piped stdout");
266
267 let is_last = i + 1 == extensions.len();
268 if is_last {
269 let mut hasher = Sha256::new();
270 let mut buf = vec![0u8; COPY_BUFFER];
271 let mut size: u64 = 0;
272 loop {
273 let n = stdout.read(&mut buf)?;
274 if n == 0 {
275 break;
276 }
277 hasher.update(&buf[..n]);
278 size += n as u64;
279 }
280 let status = child.wait()?;
281 if !status.success() {
282 return Err(CleanError::ExtensionFailed {
283 name: ext.name.clone(),
284 status: status.code(),
285 });
286 }
287 let bytes: [u8; 32] = hasher.finalize().into();
288 return Ok(Pointer {
289 oid: Oid::from_bytes(bytes),
290 size,
291 extensions: build_pointer_extensions(extensions, &input_oids),
292 canonical: true,
293 });
294 }
295
296 let mut next_tmp = NamedTempFile::new_in(tmp_dir)?;
297 let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
298 let status = child.wait()?;
299 if !status.success() {
300 return Err(CleanError::ExtensionFailed {
301 name: ext.name.clone(),
302 status: status.code(),
303 });
304 }
305 current_tmp = next_tmp;
306 input_oids.push(next_oid);
307 }
308
309 unreachable!("extension chain exited without producing a pointer")
310}
311
312fn build_pointer_extensions(extensions: &[CleanExtension], input_oids: &[Oid]) -> Vec<Extension> {
313 extensions
314 .iter()
315 .enumerate()
316 .map(|(i, ext)| Extension {
317 name: ext.name.clone(),
318 priority: ext.priority,
319 oid: input_oids[i],
320 })
321 .collect()
322}
323
324fn hash_and_write<R: Read>(src: &mut R, dst: &mut std::fs::File) -> io::Result<Oid> {
325 let mut hasher = Sha256::new();
326 let mut buf = vec![0u8; COPY_BUFFER];
327 loop {
328 let n = src.read(&mut buf)?;
329 if n == 0 {
330 break;
331 }
332 hasher.update(&buf[..n]);
333 dst.write_all(&buf[..n])?;
334 }
335 dst.flush()?;
336 let bytes: [u8; 32] = hasher.finalize().into();
337 Ok(Oid::from_bytes(bytes))
338}
339
#[cfg(test)]
mod tests {
    use super::*;
    use git_lfs_pointer::VERSION_LATEST;
    use tempfile::TempDir;

    /// Creates a store rooted inside a fresh temporary directory. The
    /// returned `TempDir` guard must stay alive for as long as the store is
    /// used.
    fn fixture() -> (TempDir, Store) {
        let tmp = TempDir::new().unwrap();
        let store = Store::new(tmp.path().join("lfs"));
        (tmp, store)
    }

    /// Cleans `input` with no extensions and an empty path, returning the
    /// outcome together with the bytes written to the output.
    fn run(store: &Store, input: &[u8]) -> (CleanOutcome, Vec<u8>) {
        let mut out = Vec::new();
        let outcome = clean(store, &mut { input }, &mut out, "", &[]).unwrap();
        (outcome, out)
    }

    #[test]
    fn small_content_is_hashed_and_stored() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"hello world!");
        let p = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };
        assert_eq!(p.size, 12);
        assert!(store.contains(p.oid));
        assert_eq!(out, p.encode().as_bytes());
    }

    #[test]
    fn known_sha256_for_abc() {
        let (_t, store) = fixture();
        let (outcome, _) = run(&store, b"abc");
        let expected: Oid = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
            .parse()
            .unwrap();
        assert_eq!(outcome.pointer().oid, expected);
    }

    /// Pointer-looking text followed by extra content is ordinary content.
    #[test]
    fn pseudo_pointer_with_extra_text_is_hashed() {
        let input = b"version https://git-lfs.github.com/spec/v1\n\
            oid sha256:7cd8be1d2cd0dd22cd9d229bb6b5785009a05e8b39d405615d882caac56562b5\n\
            size 1024\n\
            \n\
            This is my test pointer.\n";
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, input);
        let p = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };
        assert_eq!(p.size, input.len() as u64);
        assert!(store.contains(p.oid));
        assert_eq!(out, p.encode().as_bytes());
    }

    /// A pointer header followed by a large amount of trailing data is
    /// ordinary content.
    #[test]
    fn oversized_pointer_shaped_input_is_hashed() {
        let mut input = Vec::from(
            &b"version https://git-lfs.github.com/spec/v1\n\
            oid sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc\n\
            size 5\n"[..],
        );
        input.extend(std::iter::repeat_n(b'x', 2000));
        let (_t, store) = fixture();
        let (outcome, _) = run(&store, &input);
        let p = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };
        assert_eq!(p.size, input.len() as u64);
        assert!(store.contains(p.oid));
    }

    #[test]
    fn streaming_megabyte_input_works() {
        let (_t, store) = fixture();
        let content: Vec<u8> = (0..1_048_576u32).map(|i| (i ^ (i >> 5)) as u8).collect();
        let (outcome, _) = run(&store, &content);
        assert_eq!(outcome.pointer().size, content.len() as u64);
        assert!(store.contains(outcome.pointer().oid));
    }

    #[test]
    fn canonical_pointer_passes_through_verbatim() {
        let (_t, store) = fixture();
        let oid_hex = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{oid_hex}\nsize 12345\n");
        let (outcome, out) = run(&store, pointer_text.as_bytes());
        match &outcome {
            CleanOutcome::Passthrough(p) => assert!(p.canonical),
            o => panic!("expected Passthrough, got {o:?}"),
        }
        assert_eq!(
            out,
            pointer_text.as_bytes(),
            "output must be input verbatim"
        );
        // Passthrough must not have created any object storage.
        assert!(!store.root().join("objects").exists());
    }

    #[test]
    fn non_canonical_pointer_passes_through_verbatim() {
        let (_t, store) = fixture();
        // CRLF line endings still parse as a pointer, just not a canonical one.
        let oid_hex = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
        let crlf = format!("version {VERSION_LATEST}\r\noid sha256:{oid_hex}\r\nsize 12345\r\n");
        let (outcome, out) = run(&store, crlf.as_bytes());
        match &outcome {
            CleanOutcome::Passthrough(p) => assert!(!p.canonical),
            o => panic!("expected Passthrough, got {o:?}"),
        }
        assert_eq!(out, crlf.as_bytes());
    }

    #[test]
    fn empty_input_is_passthrough_empty_pointer() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"");
        match &outcome {
            CleanOutcome::Passthrough(p) => {
                assert_eq!(p, &Pointer::empty());
            }
            o => panic!("expected Passthrough, got {o:?}"),
        }
        assert!(out.is_empty(), "empty pointer encodes to empty bytes");
    }

    /// Cleaning the output of a previous clean must leave it unchanged.
    #[test]
    fn passthrough_is_idempotent() {
        let (_t, store) = fixture();
        let (_, first) = run(&store, b"some content here");
        let (outcome2, second) = run(&store, &first);
        assert!(matches!(outcome2, CleanOutcome::Passthrough(_)));
        assert_eq!(first, second);
    }

    // Spawns `tr`, which is only guaranteed to exist on Unix-like systems.
    #[cfg(unix)]
    #[test]
    fn single_extension_records_input_oid() {
        let (_t, store) = fixture();
        let exts = vec![CleanExtension {
            name: "upper".into(),
            priority: 0,
            command: "tr a-z A-Z".into(),
        }];

        let mut out = Vec::new();
        let outcome = clean(&store, &mut &b"abc"[..], &mut out, "foo.txt", &exts).unwrap();

        let pointer = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };

        // SHA-256 of "abc": the extension's input.
        let abc_oid: Oid = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
            .parse()
            .unwrap();
        // SHA-256 of "ABC": the extension's output, i.e. the stored object.
        let upper_oid: Oid = "b5d4045c3f466fa91fe2cc6abe79232a1a57cdf104f7a26e716e0a1e2789df78"
            .parse()
            .unwrap();

        assert_eq!(pointer.extensions.len(), 1);
        assert_eq!(pointer.extensions[0].name, "upper");
        assert_eq!(pointer.extensions[0].priority, 0);
        assert_eq!(pointer.extensions[0].oid, abc_oid);
        assert_eq!(pointer.oid, upper_oid);
        assert_eq!(pointer.size, 3);
        assert!(store.contains(upper_oid));
        let mut f = store.open(upper_oid).unwrap();
        let mut bytes = Vec::new();
        std::io::Read::read_to_end(&mut f, &mut bytes).unwrap();
        assert_eq!(bytes, b"ABC");
    }

    #[test]
    fn extensions_skipped_for_passthrough_pointer() {
        let (_t, store) = fixture();
        // The extension would fail if it ran, so a successful passthrough
        // shows it was never executed.
        let oid_hex = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{oid_hex}\nsize 12345\n");
        let exts = vec![CleanExtension {
            name: "fail".into(),
            priority: 0,
            command: "false".into(),
        }];
        let mut out = Vec::new();
        let outcome = clean(&store, &mut pointer_text.as_bytes(), &mut out, "x", &exts).unwrap();
        assert!(matches!(outcome, CleanOutcome::Passthrough(_)));
        assert_eq!(out, pointer_text.as_bytes());
    }

    // Spawns `false`, which is only guaranteed to exist on Unix-like systems;
    // elsewhere the spawn itself would fail with a different error variant.
    #[cfg(unix)]
    #[test]
    fn extension_failure_is_propagated() {
        let (_t, store) = fixture();
        let exts = vec![CleanExtension {
            name: "fail".into(),
            priority: 0,
            command: "false".into(),
        }];
        let mut out = Vec::new();
        let err = clean(&store, &mut &b"hello"[..], &mut out, "x", &exts).unwrap_err();
        assert!(
            matches!(err, CleanError::ExtensionFailed { .. }),
            "got {err:?}"
        );
    }
}