Skip to main content

git_lfs_filter/
clean.rs

1//! The clean filter: stdin → store + pointer-on-stdout.
2
3use std::io::{self, Read, Write};
4use std::path::Path;
5use std::process::{Command, Stdio};
6
7use git_lfs_pointer::{Extension, Oid, Pointer};
8use git_lfs_store::{Store, StoreError};
9use sha2::{Digest, Sha256};
10use tempfile::NamedTempFile;
11
12use crate::detect_pointer;
13
/// Size in bytes (64 KiB) of the scratch buffer used by this module's
/// read-hash loops.
const COPY_BUFFER: usize = 1 << 16;
15
16/// Result of running the [`clean`] filter on a piece of input.
17#[derive(Debug)]
18pub enum CleanOutcome {
19    /// Input was already a valid pointer; the original bytes were emitted
20    /// to the output stream verbatim and nothing was inserted into the store.
21    /// This is what makes `git lfs clean` idempotent on already-cleaned blobs.
22    Passthrough(Pointer),
23    /// Input was content; it was hashed and inserted into the store, and the
24    /// canonical encoding of the resulting [`Pointer`] was written to the
25    /// output stream.
26    Stored(Pointer),
27}
28
29impl CleanOutcome {
30    /// The pointer associated with this outcome (the parsed pass-through one,
31    /// or the freshly-stored one).
32    pub fn pointer(&self) -> &Pointer {
33        match self {
34            Self::Passthrough(p) | Self::Stored(p) => p,
35        }
36    }
37
38    /// `true` if the input was recognized as an existing pointer.
39    pub fn was_passthrough(&self) -> bool {
40        matches!(self, Self::Passthrough(_))
41    }
42}
43
/// The clean half of a single pointer extension.
#[derive(Debug, Clone)]
pub struct CleanExtension {
    /// Extension name; it is copied into the pointer's `ext-{N}-{name}` line
    /// and into error messages.
    pub name: String,
    /// Order index baked into the extension's pointer line (the `N` in
    /// `ext-{N}-{name}`).
    pub priority: u8,
    /// Raw command string from `lfs.extension.<name>.clean`, with `%f`
    /// placeholders standing in for the working-tree path.
    pub command: String,
}
55
56#[derive(Debug, thiserror::Error)]
57pub enum CleanError {
58    #[error(transparent)]
59    Io(#[from] io::Error),
60    #[error(transparent)]
61    Store(#[from] StoreError),
62    #[error("extension {name:?} has no clean command configured")]
63    ExtensionMissingCommand { name: String },
64    #[error("failed to spawn extension {name:?}: {source}")]
65    ExtensionSpawnFailed {
66        name: String,
67        #[source]
68        source: io::Error,
69    },
70    #[error("extension {name:?} exited with status {status:?}")]
71    ExtensionFailed { name: String, status: Option<i32> },
72}
73
74/// Apply the clean filter to `input`, writing the resulting pointer (or the
75/// pass-through bytes) to `output`.
76///
77/// Algorithm:
78/// 1. Read up to `MAX_POINTER_SIZE` bytes.
79/// 2. If those bytes parse as a valid pointer, emit them verbatim
80///    ([`CleanOutcome::Passthrough`]).
81/// 3. Otherwise stream the buffered head + the rest of `input` through
82///    each configured extension in priority order, hashing the input to
83///    each phase to record `ext-N-<name> sha256:<hash>` lines, and
84///    [`Store::insert`] the final phase's output.
85///
86/// `path` is the working-tree path (as passed by git on the command
87/// line / filter-process header). It substitutes for `%f` in each
88/// extension's `clean` command. May be empty when no path is known
89/// (e.g. piped invocation `git lfs clean` with no `--` arg).
90pub fn clean<R: Read, W: Write>(
91    store: &Store,
92    input: &mut R,
93    output: &mut W,
94    path: &str,
95    extensions: &[CleanExtension],
96) -> Result<CleanOutcome, CleanError> {
97    let (head, maybe_pointer) = detect_pointer(input)?;
98
99    if let Some(pointer) = maybe_pointer {
100        output.write_all(&head)?;
101        return Ok(CleanOutcome::Passthrough(pointer));
102    }
103
104    if extensions.is_empty() {
105        let mut combined = head.as_slice().chain(input);
106        let (oid, size) = store.insert(&mut combined)?;
107        let pointer = Pointer::new(oid, size);
108        output.write_all(pointer.encode().as_bytes())?;
109        return Ok(CleanOutcome::Stored(pointer));
110    }
111
112    for ext in extensions {
113        if ext.command.trim().is_empty() {
114            return Err(CleanError::ExtensionMissingCommand {
115                name: ext.name.clone(),
116            });
117        }
118    }
119
120    let tmp_dir = store.tmp_dir();
121    std::fs::create_dir_all(&tmp_dir)?;
122
123    // Stage 0: hash the original input while buffering it to a tmp file.
124    let mut combined = head.as_slice().chain(input);
125    let mut current_tmp = NamedTempFile::new_in(&tmp_dir)?;
126    let orig_oid = hash_and_write(&mut combined, current_tmp.as_file_mut())?;
127    let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
128    input_oids.push(orig_oid);
129
130    // Stages 1..=N: for each extension, feed the previous stage's tmp file
131    // as stdin and capture stdout. Final stage streams directly into the
132    // store.
133    for (i, ext) in extensions.iter().enumerate() {
134        let cmd_str = ext.command.replace("%f", path);
135        let mut parts = cmd_str.split_whitespace();
136        let prog = parts
137            .next()
138            .ok_or_else(|| CleanError::ExtensionMissingCommand {
139                name: ext.name.clone(),
140            })?;
141        let args: Vec<&str> = parts.collect();
142
143        let stdin_file = std::fs::File::open(current_tmp.path())?;
144        let mut child = Command::new(prog)
145            .args(&args)
146            .stdin(stdin_file)
147            .stdout(Stdio::piped())
148            .stderr(Stdio::inherit())
149            .spawn()
150            .map_err(|e| CleanError::ExtensionSpawnFailed {
151                name: ext.name.clone(),
152                source: e,
153            })?;
154        let mut stdout = child.stdout.take().expect("piped stdout");
155
156        let is_last = i + 1 == extensions.len();
157        if is_last {
158            let (oid, size) = store.insert(&mut stdout)?;
159            let status = child.wait()?;
160            if !status.success() {
161                return Err(CleanError::ExtensionFailed {
162                    name: ext.name.clone(),
163                    status: status.code(),
164                });
165            }
166
167            let pointer_extensions = build_pointer_extensions(extensions, &input_oids);
168            let pointer = Pointer {
169                oid,
170                size,
171                extensions: pointer_extensions,
172                canonical: true,
173            };
174            output.write_all(pointer.encode().as_bytes())?;
175            return Ok(CleanOutcome::Stored(pointer));
176        }
177
178        let mut next_tmp = NamedTempFile::new_in(&tmp_dir)?;
179        let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
180        let status = child.wait()?;
181        if !status.success() {
182            return Err(CleanError::ExtensionFailed {
183                name: ext.name.clone(),
184                status: status.code(),
185            });
186        }
187
188        current_tmp = next_tmp;
189        input_oids.push(next_oid);
190    }
191
192    // The loop returns on the last extension; if `extensions` is empty
193    // we took the early return above. So this is unreachable.
194    unreachable!("clean loop exited without storing")
195}
196
197/// Run `input` through the configured `extensions` chain in priority
198/// order and return the resulting [`Pointer`] **without** inserting the
199/// final stage's output anywhere. Used by `git lfs pointer --file=X` to
200/// preview what `clean` would emit, including the `ext-N-<name>` lines,
201/// without polluting the on-disk store.
202///
203/// Mirrors [`clean`]'s extension chain except for the final stage:
204/// instead of `Store::insert`, the post-extension stream is hashed and
205/// counted in-memory and discarded. `tmp_dir` holds the per-stage
206/// scratch files (use `std::env::temp_dir()` if no store is in scope).
207pub fn build_pointer_with_extensions<R: Read>(
208    input: &mut R,
209    path: &str,
210    extensions: &[CleanExtension],
211    tmp_dir: &Path,
212) -> Result<Pointer, CleanError> {
213    if extensions.is_empty() {
214        let mut hasher = Sha256::new();
215        let mut buf = vec![0u8; COPY_BUFFER];
216        let mut size: u64 = 0;
217        loop {
218            let n = input.read(&mut buf)?;
219            if n == 0 {
220                break;
221            }
222            hasher.update(&buf[..n]);
223            size += n as u64;
224        }
225        let bytes: [u8; 32] = hasher.finalize().into();
226        return Ok(Pointer::new(Oid::from_bytes(bytes), size));
227    }
228
229    for ext in extensions {
230        if ext.command.trim().is_empty() {
231            return Err(CleanError::ExtensionMissingCommand {
232                name: ext.name.clone(),
233            });
234        }
235    }
236
237    std::fs::create_dir_all(tmp_dir)?;
238
239    let mut current_tmp = NamedTempFile::new_in(tmp_dir)?;
240    let orig_oid = hash_and_write(input, current_tmp.as_file_mut())?;
241    let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
242    input_oids.push(orig_oid);
243
244    for (i, ext) in extensions.iter().enumerate() {
245        let cmd_str = ext.command.replace("%f", path);
246        let mut parts = cmd_str.split_whitespace();
247        let prog = parts
248            .next()
249            .ok_or_else(|| CleanError::ExtensionMissingCommand {
250                name: ext.name.clone(),
251            })?;
252        let args: Vec<&str> = parts.collect();
253
254        let stdin_file = std::fs::File::open(current_tmp.path())?;
255        let mut child = Command::new(prog)
256            .args(&args)
257            .stdin(stdin_file)
258            .stdout(Stdio::piped())
259            .stderr(Stdio::inherit())
260            .spawn()
261            .map_err(|e| CleanError::ExtensionSpawnFailed {
262                name: ext.name.clone(),
263                source: e,
264            })?;
265        let mut stdout = child.stdout.take().expect("piped stdout");
266
267        let is_last = i + 1 == extensions.len();
268        if is_last {
269            let mut hasher = Sha256::new();
270            let mut buf = vec![0u8; COPY_BUFFER];
271            let mut size: u64 = 0;
272            loop {
273                let n = stdout.read(&mut buf)?;
274                if n == 0 {
275                    break;
276                }
277                hasher.update(&buf[..n]);
278                size += n as u64;
279            }
280            let status = child.wait()?;
281            if !status.success() {
282                return Err(CleanError::ExtensionFailed {
283                    name: ext.name.clone(),
284                    status: status.code(),
285                });
286            }
287            let bytes: [u8; 32] = hasher.finalize().into();
288            return Ok(Pointer {
289                oid: Oid::from_bytes(bytes),
290                size,
291                extensions: build_pointer_extensions(extensions, &input_oids),
292                canonical: true,
293            });
294        }
295
296        let mut next_tmp = NamedTempFile::new_in(tmp_dir)?;
297        let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
298        let status = child.wait()?;
299        if !status.success() {
300            return Err(CleanError::ExtensionFailed {
301                name: ext.name.clone(),
302                status: status.code(),
303            });
304        }
305        current_tmp = next_tmp;
306        input_oids.push(next_oid);
307    }
308
309    unreachable!("extension chain exited without producing a pointer")
310}
311
312fn build_pointer_extensions(extensions: &[CleanExtension], input_oids: &[Oid]) -> Vec<Extension> {
313    extensions
314        .iter()
315        .enumerate()
316        .map(|(i, ext)| Extension {
317            name: ext.name.clone(),
318            priority: ext.priority,
319            oid: input_oids[i],
320        })
321        .collect()
322}
323
324fn hash_and_write<R: Read>(src: &mut R, dst: &mut std::fs::File) -> io::Result<Oid> {
325    let mut hasher = Sha256::new();
326    let mut buf = vec![0u8; COPY_BUFFER];
327    loop {
328        let n = src.read(&mut buf)?;
329        if n == 0 {
330            break;
331        }
332        hasher.update(&buf[..n]);
333        dst.write_all(&buf[..n])?;
334    }
335    dst.flush()?;
336    let bytes: [u8; 32] = hasher.finalize().into();
337    Ok(Oid::from_bytes(bytes))
338}
339
#[cfg(test)]
mod tests {
    use super::*;
    use git_lfs_pointer::VERSION_LATEST;
    use tempfile::TempDir;

    /// OID used by the tests that hand `clean` a ready-made pointer.
    const POINTER_OID_HEX: &str =
        "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
    /// SHA-256 of the three bytes `abc`.
    const ABC_OID_HEX: &str = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad";
    /// SHA-256 of the three bytes `ABC`.
    const UPPER_ABC_OID_HEX: &str =
        "b5d4045c3f466fa91fe2cc6abe79232a1a57cdf104f7a26e716e0a1e2789df78";

    /// A store rooted in a fresh temporary directory. The `TempDir` is
    /// returned as well so the directory outlives the test body.
    fn fixture() -> (TempDir, Store) {
        let dir = TempDir::new().unwrap();
        let store = Store::new(dir.path().join("lfs"));
        (dir, store)
    }

    /// Clean `input` with no path and no extensions, returning the outcome
    /// together with everything written to the output stream.
    fn run(store: &Store, input: &[u8]) -> (CleanOutcome, Vec<u8>) {
        let mut reader = input;
        let mut emitted = Vec::new();
        let outcome = clean(store, &mut reader, &mut emitted, "", &[]).unwrap();
        (outcome, emitted)
    }

    /// Unwrap a `Stored` outcome, failing the test on anything else.
    fn expect_stored(outcome: CleanOutcome) -> Pointer {
        match outcome {
            CleanOutcome::Stored(pointer) => pointer,
            other => panic!("expected Stored, got {other:?}"),
        }
    }

    /// Borrow the pointer of a `Passthrough` outcome, failing the test on
    /// anything else.
    fn expect_passthrough(outcome: &CleanOutcome) -> &Pointer {
        match outcome {
            CleanOutcome::Passthrough(pointer) => pointer,
            other => panic!("expected Passthrough, got {other:?}"),
        }
    }

    /// A canonical (LF-terminated) pointer text.
    fn lf_pointer_text() -> String {
        format!("version {VERSION_LATEST}\noid sha256:{POINTER_OID_HEX}\nsize 12345\n")
    }

    /// A one-element extension list with priority 0.
    fn single_extension(name: &str, command: &str) -> Vec<CleanExtension> {
        vec![CleanExtension {
            name: name.into(),
            priority: 0,
            command: command.into(),
        }]
    }

    // ---------- Stored path ----------

    #[test]
    fn small_content_is_hashed_and_stored() {
        let (_dir, store) = fixture();
        let (outcome, emitted) = run(&store, b"hello world!");
        let pointer = expect_stored(outcome);
        assert_eq!(pointer.size, 12);
        assert!(store.contains(pointer.oid));
        assert_eq!(emitted, pointer.encode().as_bytes());
    }

    #[test]
    fn known_sha256_for_abc() {
        let (_dir, store) = fixture();
        let (outcome, _) = run(&store, b"abc");
        let expected: Oid = ABC_OID_HEX.parse().unwrap();
        assert_eq!(outcome.pointer().oid, expected);
    }

    #[test]
    fn pseudo_pointer_with_extra_text_is_hashed() {
        // Starts like a pointer but carries trailing free text, so it has to
        // be treated as content.
        let input = concat!(
            "version https://git-lfs.github.com/spec/v1\n",
            "oid sha256:7cd8be1d2cd0dd22cd9d229bb6b5785009a05e8b39d405615d882caac56562b5\n",
            "size 1024\n",
            "\n",
            "This is my test pointer.\n",
        )
        .as_bytes();
        let (_dir, store) = fixture();
        let (outcome, emitted) = run(&store, input);
        let pointer = expect_stored(outcome);
        assert_eq!(pointer.size, input.len() as u64);
        assert!(store.contains(pointer.oid));
        assert_eq!(emitted, pointer.encode().as_bytes());
    }

    #[test]
    fn oversized_pointer_shaped_input_is_hashed() {
        // A pointer-looking header followed by 2000 filler bytes.
        let mut input = concat!(
            "version https://git-lfs.github.com/spec/v1\n",
            "oid sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc\n",
            "size 5\n",
        )
        .as_bytes()
        .to_vec();
        input.resize(input.len() + 2000, b'x');
        let (_dir, store) = fixture();
        let (outcome, _) = run(&store, &input);
        let pointer = expect_stored(outcome);
        assert_eq!(pointer.size, input.len() as u64);
        assert!(store.contains(pointer.oid));
    }

    #[test]
    fn streaming_megabyte_input_works() {
        let (_dir, store) = fixture();
        let content: Vec<u8> = (0..1_048_576u32).map(|i| (i ^ (i >> 5)) as u8).collect();
        let (outcome, _) = run(&store, &content);
        let pointer = outcome.pointer();
        assert_eq!(pointer.size, content.len() as u64);
        assert!(store.contains(pointer.oid));
    }

    // ---------- Passthrough path ----------

    #[test]
    fn canonical_pointer_passes_through_verbatim() {
        let (_dir, store) = fixture();
        let pointer_text = lf_pointer_text();
        let (outcome, emitted) = run(&store, pointer_text.as_bytes());
        assert!(expect_passthrough(&outcome).canonical);
        assert_eq!(
            emitted,
            pointer_text.as_bytes(),
            "output must be input verbatim"
        );
        assert!(!store.root().join("objects").exists());
    }

    #[test]
    fn non_canonical_pointer_passes_through_verbatim() {
        // A CRLF pointer parses but is flagged non-canonical. Pass-through
        // has to preserve the CRLFs instead of re-emitting the canonical
        // (LF) encoding, or the git blob hash would change underneath the
        // user.
        let (_dir, store) = fixture();
        let crlf =
            format!("version {VERSION_LATEST}\r\noid sha256:{POINTER_OID_HEX}\r\nsize 12345\r\n");
        let (outcome, emitted) = run(&store, crlf.as_bytes());
        assert!(!expect_passthrough(&outcome).canonical);
        assert_eq!(emitted, crlf.as_bytes());
    }

    #[test]
    fn empty_input_is_passthrough_empty_pointer() {
        let (_dir, store) = fixture();
        let (outcome, emitted) = run(&store, b"");
        assert_eq!(expect_passthrough(&outcome), &Pointer::empty());
        assert!(emitted.is_empty(), "empty pointer encodes to empty bytes");
    }

    #[test]
    fn passthrough_is_idempotent() {
        let (_dir, store) = fixture();
        let (_, first) = run(&store, b"some content here");
        let (second_outcome, second) = run(&store, &first);
        assert!(matches!(second_outcome, CleanOutcome::Passthrough(_)));
        assert_eq!(first, second);
    }

    // ---------- Extensions ----------

    /// `tr a-z A-Z` (a POSIX utility) stands in for the case-inverter test
    /// extension. This checks the chained subprocess and the OID
    /// bookkeeping; the upstream Go test driver covers the more elaborate
    /// case-inverter semantics end-to-end.
    #[test]
    fn single_extension_records_input_oid() {
        let (_dir, store) = fixture();
        let exts = single_extension("upper", "tr a-z A-Z");

        let mut reader: &[u8] = b"abc";
        let mut emitted = Vec::new();
        let outcome = clean(&store, &mut reader, &mut emitted, "foo.txt", &exts).unwrap();
        let pointer = expect_stored(outcome);

        // The extension consumed "abc" and produced "ABC".
        let abc_oid: Oid = ABC_OID_HEX.parse().unwrap();
        let upper_oid: Oid = UPPER_ABC_OID_HEX.parse().unwrap();

        assert_eq!(pointer.extensions.len(), 1);
        let recorded = &pointer.extensions[0];
        assert_eq!(recorded.name, "upper");
        assert_eq!(recorded.priority, 0);
        assert_eq!(recorded.oid, abc_oid);
        assert_eq!(pointer.oid, upper_oid);
        assert_eq!(pointer.size, 3);
        assert!(store.contains(upper_oid));

        // What landed in the store is the extension's output.
        let mut object = store.open(upper_oid).unwrap();
        let mut bytes = Vec::new();
        std::io::Read::read_to_end(&mut object, &mut bytes).unwrap();
        assert_eq!(bytes, b"ABC");
    }

    #[test]
    fn extensions_skipped_for_passthrough_pointer() {
        // Input that is already a pointer never reaches the extensions —
        // upstream's `clean` short-circuits before doing anything. `false`
        // would run (and fail) if the passthrough check were bypassed.
        let (_dir, store) = fixture();
        let pointer_text = lf_pointer_text();
        let exts = single_extension("fail", "false");

        let mut reader = pointer_text.as_bytes();
        let mut emitted = Vec::new();
        let outcome = clean(&store, &mut reader, &mut emitted, "x", &exts).unwrap();
        assert!(matches!(outcome, CleanOutcome::Passthrough(_)));
        assert_eq!(emitted, pointer_text.as_bytes());
    }

    #[test]
    fn extension_failure_is_propagated() {
        let (_dir, store) = fixture();
        let exts = single_extension("fail", "false");

        let mut reader: &[u8] = b"hello";
        let mut emitted = Vec::new();
        let err = clean(&store, &mut reader, &mut emitted, "x", &exts).unwrap_err();
        assert!(
            matches!(err, CleanError::ExtensionFailed { .. }),
            "got {err:?}"
        );
    }
}
566}