1use std::io::{self, Read, Write};
4use std::path::Path;
5use std::process::{Command, Stdio};
6
7use git_lfs_pointer::{Extension, Oid, Pointer};
8use git_lfs_store::{Store, StoreError};
9use sha2::{Digest, Sha256};
10use tempfile::NamedTempFile;
11
12use crate::detect_pointer;
13
/// Size, in bytes, of the scratch buffer allocated by the streaming hash
/// loops in this module (64 KiB).
const COPY_BUFFER: usize = 64 << 10;
15
16#[derive(Debug)]
18pub enum CleanOutcome {
19 Passthrough(Pointer),
23 Stored(Pointer),
27}
28
29impl CleanOutcome {
30 pub fn pointer(&self) -> &Pointer {
33 match self {
34 Self::Passthrough(p) | Self::Stored(p) => p,
35 }
36 }
37
38 pub fn was_passthrough(&self) -> bool {
40 matches!(self, Self::Passthrough(_))
41 }
42}
43
/// One configured extension to run over content while cleaning.
///
/// Extensions are executed in the order they appear in the slice handed to
/// [`clean`] or [`build_pointer_with_extensions`]; this module does not sort
/// them by `priority`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CleanExtension {
    /// Extension name; recorded in the resulting pointer and quoted in error
    /// messages.
    pub name: String,
    /// Priority value copied verbatim into the pointer's extension entry.
    pub priority: u8,
    /// Command line to execute. It is tokenized on whitespace (no shell, no
    /// quoting) and `%f` is replaced with the path of the file being cleaned.
    /// An empty or whitespace-only command is rejected with
    /// [`CleanError::ExtensionMissingCommand`].
    pub command: String,
}
60
61#[derive(Debug, thiserror::Error)]
63pub enum CleanError {
64 #[error(transparent)]
67 Io(#[from] io::Error),
68 #[error(transparent)]
70 Store(#[from] StoreError),
71 #[error("extension {name:?} has no clean command configured")]
74 ExtensionMissingCommand { name: String },
75 #[error("failed to spawn extension {name:?}: {source}")]
78 ExtensionSpawnFailed {
79 name: String,
80 #[source]
81 source: io::Error,
82 },
83 #[error("extension {name:?} exited with status {status:?}")]
85 ExtensionFailed { name: String, status: Option<i32> },
86}
87
88pub fn clean<R: Read, W: Write>(
105 store: &Store,
106 input: &mut R,
107 output: &mut W,
108 path: &str,
109 extensions: &[CleanExtension],
110) -> Result<CleanOutcome, CleanError> {
111 let (head, maybe_pointer) = detect_pointer(input)?;
112
113 if let Some(pointer) = maybe_pointer {
114 output.write_all(&head)?;
115 return Ok(CleanOutcome::Passthrough(pointer));
116 }
117
118 if extensions.is_empty() {
119 let mut combined = head.as_slice().chain(input);
120 let (oid, size) = store.insert(&mut combined)?;
121 let pointer = Pointer::new(oid, size);
122 output.write_all(pointer.encode().as_bytes())?;
123 return Ok(CleanOutcome::Stored(pointer));
124 }
125
126 for ext in extensions {
127 if ext.command.trim().is_empty() {
128 return Err(CleanError::ExtensionMissingCommand {
129 name: ext.name.clone(),
130 });
131 }
132 }
133
134 let tmp_dir = store.tmp_dir();
135 std::fs::create_dir_all(&tmp_dir)?;
136
137 let mut combined = head.as_slice().chain(input);
139 let mut current_tmp = NamedTempFile::new_in(&tmp_dir)?;
140 let orig_oid = hash_and_write(&mut combined, current_tmp.as_file_mut())?;
141 let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
142 input_oids.push(orig_oid);
143
144 for (i, ext) in extensions.iter().enumerate() {
148 let cmd_str = ext.command.replace("%f", path);
149 let mut parts = cmd_str.split_whitespace();
150 let prog = parts
151 .next()
152 .ok_or_else(|| CleanError::ExtensionMissingCommand {
153 name: ext.name.clone(),
154 })?;
155 let args: Vec<&str> = parts.collect();
156
157 let stdin_file = std::fs::File::open(current_tmp.path())?;
158 let mut child = Command::new(prog)
159 .args(&args)
160 .stdin(stdin_file)
161 .stdout(Stdio::piped())
162 .stderr(Stdio::inherit())
163 .spawn()
164 .map_err(|e| CleanError::ExtensionSpawnFailed {
165 name: ext.name.clone(),
166 source: e,
167 })?;
168 let mut stdout = child.stdout.take().expect("piped stdout");
169
170 let is_last = i + 1 == extensions.len();
171 if is_last {
172 let (oid, size) = store.insert(&mut stdout)?;
173 let status = child.wait()?;
174 if !status.success() {
175 return Err(CleanError::ExtensionFailed {
176 name: ext.name.clone(),
177 status: status.code(),
178 });
179 }
180
181 let pointer_extensions = build_pointer_extensions(extensions, &input_oids);
182 let pointer = Pointer {
183 oid,
184 size,
185 extensions: pointer_extensions,
186 canonical: true,
187 };
188 output.write_all(pointer.encode().as_bytes())?;
189 return Ok(CleanOutcome::Stored(pointer));
190 }
191
192 let mut next_tmp = NamedTempFile::new_in(&tmp_dir)?;
193 let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
194 let status = child.wait()?;
195 if !status.success() {
196 return Err(CleanError::ExtensionFailed {
197 name: ext.name.clone(),
198 status: status.code(),
199 });
200 }
201
202 current_tmp = next_tmp;
203 input_oids.push(next_oid);
204 }
205
206 unreachable!("clean loop exited without storing")
209}
210
211pub fn build_pointer_with_extensions<R: Read>(
224 input: &mut R,
225 path: &str,
226 extensions: &[CleanExtension],
227 tmp_dir: &Path,
228) -> Result<Pointer, CleanError> {
229 if extensions.is_empty() {
230 let mut hasher = Sha256::new();
231 let mut buf = vec![0u8; COPY_BUFFER];
232 let mut size: u64 = 0;
233 loop {
234 let n = input.read(&mut buf)?;
235 if n == 0 {
236 break;
237 }
238 hasher.update(&buf[..n]);
239 size += n as u64;
240 }
241 let bytes: [u8; 32] = hasher.finalize().into();
242 return Ok(Pointer::new(Oid::from_bytes(bytes), size));
243 }
244
245 for ext in extensions {
246 if ext.command.trim().is_empty() {
247 return Err(CleanError::ExtensionMissingCommand {
248 name: ext.name.clone(),
249 });
250 }
251 }
252
253 std::fs::create_dir_all(tmp_dir)?;
254
255 let mut current_tmp = NamedTempFile::new_in(tmp_dir)?;
256 let orig_oid = hash_and_write(input, current_tmp.as_file_mut())?;
257 let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
258 input_oids.push(orig_oid);
259
260 for (i, ext) in extensions.iter().enumerate() {
261 let cmd_str = ext.command.replace("%f", path);
262 let mut parts = cmd_str.split_whitespace();
263 let prog = parts
264 .next()
265 .ok_or_else(|| CleanError::ExtensionMissingCommand {
266 name: ext.name.clone(),
267 })?;
268 let args: Vec<&str> = parts.collect();
269
270 let stdin_file = std::fs::File::open(current_tmp.path())?;
271 let mut child = Command::new(prog)
272 .args(&args)
273 .stdin(stdin_file)
274 .stdout(Stdio::piped())
275 .stderr(Stdio::inherit())
276 .spawn()
277 .map_err(|e| CleanError::ExtensionSpawnFailed {
278 name: ext.name.clone(),
279 source: e,
280 })?;
281 let mut stdout = child.stdout.take().expect("piped stdout");
282
283 let is_last = i + 1 == extensions.len();
284 if is_last {
285 let mut hasher = Sha256::new();
286 let mut buf = vec![0u8; COPY_BUFFER];
287 let mut size: u64 = 0;
288 loop {
289 let n = stdout.read(&mut buf)?;
290 if n == 0 {
291 break;
292 }
293 hasher.update(&buf[..n]);
294 size += n as u64;
295 }
296 let status = child.wait()?;
297 if !status.success() {
298 return Err(CleanError::ExtensionFailed {
299 name: ext.name.clone(),
300 status: status.code(),
301 });
302 }
303 let bytes: [u8; 32] = hasher.finalize().into();
304 return Ok(Pointer {
305 oid: Oid::from_bytes(bytes),
306 size,
307 extensions: build_pointer_extensions(extensions, &input_oids),
308 canonical: true,
309 });
310 }
311
312 let mut next_tmp = NamedTempFile::new_in(tmp_dir)?;
313 let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
314 let status = child.wait()?;
315 if !status.success() {
316 return Err(CleanError::ExtensionFailed {
317 name: ext.name.clone(),
318 status: status.code(),
319 });
320 }
321 current_tmp = next_tmp;
322 input_oids.push(next_oid);
323 }
324
325 unreachable!("extension chain exited without producing a pointer")
326}
327
328fn build_pointer_extensions(extensions: &[CleanExtension], input_oids: &[Oid]) -> Vec<Extension> {
329 extensions
330 .iter()
331 .enumerate()
332 .map(|(i, ext)| Extension {
333 name: ext.name.clone(),
334 priority: ext.priority,
335 oid: input_oids[i],
336 })
337 .collect()
338}
339
340fn hash_and_write<R: Read>(src: &mut R, dst: &mut std::fs::File) -> io::Result<Oid> {
341 let mut hasher = Sha256::new();
342 let mut buf = vec![0u8; COPY_BUFFER];
343 loop {
344 let n = src.read(&mut buf)?;
345 if n == 0 {
346 break;
347 }
348 hasher.update(&buf[..n]);
349 dst.write_all(&buf[..n])?;
350 }
351 dst.flush()?;
352 let bytes: [u8; 32] = hasher.finalize().into();
353 Ok(Oid::from_bytes(bytes))
354}
355
#[cfg(test)]
mod tests {
    use super::*;
    use git_lfs_pointer::VERSION_LATEST;
    use tempfile::TempDir;

    /// Well-formed OID used by the pointer-text fixtures.
    const SAMPLE_OID_HEX: &str =
        "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";

    /// SHA-256 of the three bytes `abc`.
    const ABC_OID_HEX: &str = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad";

    /// Creates a store rooted in a fresh temp directory. The `TempDir` must be
    /// kept alive for as long as the store is used.
    fn fixture() -> (TempDir, Store) {
        let dir = TempDir::new().unwrap();
        let lfs_root = dir.path().join("lfs");
        (dir, Store::new(lfs_root))
    }

    /// Cleans `input` with no extensions and returns the outcome plus the
    /// bytes written to the output.
    fn run(store: &Store, input: &[u8]) -> (CleanOutcome, Vec<u8>) {
        let mut reader = input;
        let mut written = Vec::new();
        let outcome = clean(store, &mut reader, &mut written, "", &[]).unwrap();
        (outcome, written)
    }

    /// Unwraps a `Stored` outcome, panicking on anything else.
    #[track_caller]
    fn expect_stored(outcome: CleanOutcome) -> Pointer {
        match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        }
    }

    /// Borrows the pointer of a `Passthrough` outcome, panicking on anything else.
    #[track_caller]
    fn expect_passthrough(outcome: &CleanOutcome) -> &Pointer {
        match outcome {
            CleanOutcome::Passthrough(p) => p,
            o => panic!("expected Passthrough, got {o:?}"),
        }
    }

    /// Pointer text in canonical (LF-terminated) form.
    fn canonical_pointer_text() -> String {
        format!("version {VERSION_LATEST}\noid sha256:{SAMPLE_OID_HEX}\nsize 12345\n")
    }

    /// A one-element extension list whose command always exits unsuccessfully.
    fn failing_extension() -> Vec<CleanExtension> {
        vec![CleanExtension {
            name: "fail".into(),
            priority: 0,
            command: "false".into(),
        }]
    }

    #[test]
    fn small_content_is_hashed_and_stored() {
        let (_guard, store) = fixture();
        let (outcome, encoded) = run(&store, b"hello world!");
        let pointer = expect_stored(outcome);
        assert_eq!(pointer.size, 12);
        assert!(store.contains(pointer.oid));
        assert_eq!(encoded, pointer.encode().as_bytes());
    }

    #[test]
    fn known_sha256_for_abc() {
        let (_guard, store) = fixture();
        let (outcome, _) = run(&store, b"abc");
        let expected: Oid = ABC_OID_HEX.parse().unwrap();
        assert_eq!(outcome.pointer().oid, expected);
    }

    #[test]
    fn pseudo_pointer_with_extra_text_is_hashed() {
        let input = b"version https://git-lfs.github.com/spec/v1\n\
            oid sha256:7cd8be1d2cd0dd22cd9d229bb6b5785009a05e8b39d405615d882caac56562b5\n\
            size 1024\n\
            \n\
            This is my test pointer.\n";
        let (_guard, store) = fixture();
        let (outcome, encoded) = run(&store, input);
        let pointer = expect_stored(outcome);
        assert_eq!(pointer.size, input.len() as u64);
        assert!(store.contains(pointer.oid));
        assert_eq!(encoded, pointer.encode().as_bytes());
    }

    #[test]
    fn oversized_pointer_shaped_input_is_hashed() {
        let mut input = b"version https://git-lfs.github.com/spec/v1\n\
            oid sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc\n\
            size 5\n"
            .to_vec();
        let padded_len = input.len() + 2000;
        input.resize(padded_len, b'x');

        let (_guard, store) = fixture();
        let (outcome, _) = run(&store, &input);
        let pointer = expect_stored(outcome);
        assert_eq!(pointer.size, input.len() as u64);
        assert!(store.contains(pointer.oid));
    }

    #[test]
    fn streaming_megabyte_input_works() {
        let (_guard, store) = fixture();
        let content: Vec<u8> = (0..1_048_576u32).map(|i| (i ^ (i >> 5)) as u8).collect();
        let (outcome, _) = run(&store, &content);
        let pointer = outcome.pointer();
        assert_eq!(pointer.size, content.len() as u64);
        assert!(store.contains(pointer.oid));
    }

    #[test]
    fn canonical_pointer_passes_through_verbatim() {
        let (_guard, store) = fixture();
        let pointer_text = canonical_pointer_text();
        let (outcome, out) = run(&store, pointer_text.as_bytes());
        assert!(expect_passthrough(&outcome).canonical);
        assert_eq!(
            out,
            pointer_text.as_bytes(),
            "output must be input verbatim"
        );
        // Nothing may have been written to the object store.
        assert!(!store.root().join("objects").exists());
    }

    #[test]
    fn non_canonical_pointer_passes_through_verbatim() {
        let (_guard, store) = fixture();
        let crlf =
            format!("version {VERSION_LATEST}\r\noid sha256:{SAMPLE_OID_HEX}\r\nsize 12345\r\n");
        let (outcome, out) = run(&store, crlf.as_bytes());
        assert!(!expect_passthrough(&outcome).canonical);
        assert_eq!(out, crlf.as_bytes());
    }

    #[test]
    fn empty_input_is_passthrough_empty_pointer() {
        let (_guard, store) = fixture();
        let (outcome, out) = run(&store, b"");
        assert_eq!(expect_passthrough(&outcome), &Pointer::empty());
        assert!(out.is_empty(), "empty pointer encodes to empty bytes");
    }

    #[test]
    fn passthrough_is_idempotent() {
        let (_guard, store) = fixture();
        let (_, first) = run(&store, b"some content here");
        let (second_outcome, second) = run(&store, &first);
        assert!(matches!(second_outcome, CleanOutcome::Passthrough(_)));
        assert_eq!(first, second);
    }

    #[test]
    fn single_extension_records_input_oid() {
        let (_guard, store) = fixture();
        let exts = vec![CleanExtension {
            name: "upper".into(),
            priority: 0,
            command: "tr a-z A-Z".into(),
        }];

        let mut out = Vec::new();
        let outcome = clean(&store, &mut &b"abc"[..], &mut out, "foo.txt", &exts).unwrap();
        let pointer = expect_stored(outcome);

        // SHA-256 of the original input and of the extension's output.
        let abc_oid: Oid = ABC_OID_HEX.parse().unwrap();
        let upper_oid: Oid = "b5d4045c3f466fa91fe2cc6abe79232a1a57cdf104f7a26e716e0a1e2789df78"
            .parse()
            .unwrap();

        assert_eq!(pointer.extensions.len(), 1);
        assert_eq!(pointer.extensions[0].name, "upper");
        assert_eq!(pointer.extensions[0].priority, 0);
        assert_eq!(pointer.extensions[0].oid, abc_oid);
        assert_eq!(pointer.oid, upper_oid);
        assert_eq!(pointer.size, 3);
        assert!(store.contains(upper_oid));

        let mut stored = store.open(upper_oid).unwrap();
        let mut bytes = Vec::new();
        std::io::Read::read_to_end(&mut stored, &mut bytes).unwrap();
        assert_eq!(bytes, b"ABC");
    }

    #[test]
    fn extensions_skipped_for_passthrough_pointer() {
        let (_guard, store) = fixture();
        let pointer_text = canonical_pointer_text();
        // The extension would fail if it were run, so success proves it was skipped.
        let exts = failing_extension();
        let mut out = Vec::new();
        let outcome = clean(&store, &mut pointer_text.as_bytes(), &mut out, "x", &exts).unwrap();
        assert!(matches!(outcome, CleanOutcome::Passthrough(_)));
        assert_eq!(out, pointer_text.as_bytes());
    }

    #[test]
    fn extension_failure_is_propagated() {
        let (_guard, store) = fixture();
        let exts = failing_extension();
        let mut out = Vec::new();
        let err = clean(&store, &mut &b"hello"[..], &mut out, "x", &exts).unwrap_err();
        assert!(
            matches!(err, CleanError::ExtensionFailed { .. }),
            "got {err:?}"
        );
    }
}