Skip to main content

git_lfs_filter/
clean.rs

//! The clean filter: reads content from stdin, inserts it into the local
//! LFS store, and writes the resulting pointer to stdout.
2
3use std::io::{self, Read, Write};
4use std::path::Path;
5use std::process::{Command, Stdio};
6
7use git_lfs_pointer::{Extension, Oid, Pointer};
8use git_lfs_store::{Store, StoreError};
9use sha2::{Digest, Sha256};
10use tempfile::NamedTempFile;
11
12use crate::detect_pointer;
13
/// Size in bytes (64 KiB) of the scratch buffer used by the streaming
/// hash/copy loops in this module.
const COPY_BUFFER: usize = 1 << 16;
15
16/// Result of running the [`clean`] filter on a piece of input.
17#[derive(Debug)]
18pub enum CleanOutcome {
19    /// Input was already a valid pointer; the original bytes were emitted
20    /// to the output stream verbatim and nothing was inserted into the store.
21    /// This is what makes `git lfs clean` idempotent on already-cleaned blobs.
22    Passthrough(Pointer),
23    /// Input was content; it was hashed and inserted into the store, and the
24    /// canonical encoding of the resulting [`Pointer`] was written to the
25    /// output stream.
26    Stored(Pointer),
27}
28
29impl CleanOutcome {
30    /// The pointer associated with this outcome (the parsed pass-through one,
31    /// or the freshly-stored one).
32    pub fn pointer(&self) -> &Pointer {
33        match self {
34            Self::Passthrough(p) | Self::Stored(p) => p,
35        }
36    }
37
38    /// `true` if the input was recognized as an existing pointer.
39    pub fn was_passthrough(&self) -> bool {
40        matches!(self, Self::Passthrough(_))
41    }
42}
43
/// The clean side of one configured pointer extension.
///
/// `command` is the raw command string from `lfs.extension.<name>.clean`,
/// with `%f` placeholders standing in for the working-tree path. `priority`
/// is the order index that ends up in the extension's pointer line
/// (`ext-{N}-{name}`).
///
/// The type is plain data, so it derives `PartialEq`/`Eq` in addition to
/// `Debug`/`Clone`; this lets callers and tests compare configurations
/// directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CleanExtension {
    /// Extension name, as configured under `lfs.extension.<name>`.
    pub name: String,
    /// Single decimal digit (0-9) determining position in the chain;
    /// lower priorities run earlier.
    pub priority: u8,
    /// Raw command from `lfs.extension.<name>.clean`. `%f` placeholders
    /// are substituted with the working-tree path.
    pub command: String,
}
60
61/// Things that can go wrong while running [`clean`].
62#[derive(Debug, thiserror::Error)]
63pub enum CleanError {
64    /// Filesystem-level failure: reading the input stream, writing
65    /// the tempfile, etc.
66    #[error(transparent)]
67    Io(#[from] io::Error),
68    /// The local LFS store rejected the bytes.
69    #[error(transparent)]
70    Store(#[from] StoreError),
71    /// A configured extension was declared with an empty `clean`
72    /// command, so there's nothing to spawn.
73    #[error("extension {name:?} has no clean command configured")]
74    ExtensionMissingCommand { name: String },
75    /// The extension subprocess couldn't be started (typically
76    /// because the binary isn't on `PATH`).
77    #[error("failed to spawn extension {name:?}: {source}")]
78    ExtensionSpawnFailed {
79        name: String,
80        #[source]
81        source: io::Error,
82    },
83    /// The extension subprocess started but exited non-zero.
84    #[error("extension {name:?} exited with status {status:?}")]
85    ExtensionFailed { name: String, status: Option<i32> },
86}
87
88/// Apply the clean filter to `input`, writing the resulting pointer (or the
89/// pass-through bytes) to `output`.
90///
91/// Algorithm:
92/// 1. Read up to `MAX_POINTER_SIZE` bytes.
93/// 2. If those bytes parse as a valid pointer, emit them verbatim
94///    ([`CleanOutcome::Passthrough`]).
95/// 3. Otherwise stream the buffered head + the rest of `input` through
96///    each configured extension in priority order, hashing the input to
97///    each phase to record `ext-N-<name> sha256:<hash>` lines, and
98///    [`Store::insert`] the final phase's output.
99///
100/// `path` is the working-tree path (as passed by git on the command
101/// line / filter-process header). It substitutes for `%f` in each
102/// extension's `clean` command. May be empty when no path is known
103/// (e.g. piped invocation `git lfs clean` with no `--` arg).
104pub fn clean<R: Read, W: Write>(
105    store: &Store,
106    input: &mut R,
107    output: &mut W,
108    path: &str,
109    extensions: &[CleanExtension],
110) -> Result<CleanOutcome, CleanError> {
111    let (head, maybe_pointer) = detect_pointer(input)?;
112
113    if let Some(pointer) = maybe_pointer {
114        output.write_all(&head)?;
115        return Ok(CleanOutcome::Passthrough(pointer));
116    }
117
118    if extensions.is_empty() {
119        let mut combined = head.as_slice().chain(input);
120        let (oid, size) = store.insert(&mut combined)?;
121        let pointer = Pointer::new(oid, size);
122        output.write_all(pointer.encode().as_bytes())?;
123        return Ok(CleanOutcome::Stored(pointer));
124    }
125
126    for ext in extensions {
127        if ext.command.trim().is_empty() {
128            return Err(CleanError::ExtensionMissingCommand {
129                name: ext.name.clone(),
130            });
131        }
132    }
133
134    let tmp_dir = store.tmp_dir();
135    std::fs::create_dir_all(&tmp_dir)?;
136
137    // Stage 0: hash the original input while buffering it to a tmp file.
138    let mut combined = head.as_slice().chain(input);
139    let mut current_tmp = NamedTempFile::new_in(&tmp_dir)?;
140    let orig_oid = hash_and_write(&mut combined, current_tmp.as_file_mut())?;
141    let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
142    input_oids.push(orig_oid);
143
144    // Stages 1..=N: for each extension, feed the previous stage's tmp file
145    // as stdin and capture stdout. Final stage streams directly into the
146    // store.
147    for (i, ext) in extensions.iter().enumerate() {
148        let cmd_str = ext.command.replace("%f", path);
149        let mut parts = cmd_str.split_whitespace();
150        let prog = parts
151            .next()
152            .ok_or_else(|| CleanError::ExtensionMissingCommand {
153                name: ext.name.clone(),
154            })?;
155        let args: Vec<&str> = parts.collect();
156
157        let stdin_file = std::fs::File::open(current_tmp.path())?;
158        let mut child = Command::new(prog)
159            .args(&args)
160            .stdin(stdin_file)
161            .stdout(Stdio::piped())
162            .stderr(Stdio::inherit())
163            .spawn()
164            .map_err(|e| CleanError::ExtensionSpawnFailed {
165                name: ext.name.clone(),
166                source: e,
167            })?;
168        let mut stdout = child.stdout.take().expect("piped stdout");
169
170        let is_last = i + 1 == extensions.len();
171        if is_last {
172            let (oid, size) = store.insert(&mut stdout)?;
173            let status = child.wait()?;
174            if !status.success() {
175                return Err(CleanError::ExtensionFailed {
176                    name: ext.name.clone(),
177                    status: status.code(),
178                });
179            }
180
181            let pointer_extensions = build_pointer_extensions(extensions, &input_oids);
182            let pointer = Pointer {
183                oid,
184                size,
185                extensions: pointer_extensions,
186                canonical: true,
187            };
188            output.write_all(pointer.encode().as_bytes())?;
189            return Ok(CleanOutcome::Stored(pointer));
190        }
191
192        let mut next_tmp = NamedTempFile::new_in(&tmp_dir)?;
193        let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
194        let status = child.wait()?;
195        if !status.success() {
196            return Err(CleanError::ExtensionFailed {
197                name: ext.name.clone(),
198                status: status.code(),
199            });
200        }
201
202        current_tmp = next_tmp;
203        input_oids.push(next_oid);
204    }
205
206    // The loop returns on the last extension; if `extensions` is empty
207    // we took the early return above. So this is unreachable.
208    unreachable!("clean loop exited without storing")
209}
210
211/// Run `input` through the configured `extensions` chain in priority
212/// order and return the resulting [`Pointer`] without inserting the
213/// final stage's output anywhere.
214///
215/// Used by `git lfs pointer --file=X` to preview what `clean` would
216/// emit, including the `ext-N-<name>` lines, without polluting the
217/// on-disk store.
218///
219/// Mirrors [`clean`]'s extension chain except for the final stage:
220/// instead of `Store::insert`, the post-extension stream is hashed and
221/// counted in-memory and discarded. `tmp_dir` holds the per-stage
222/// scratch files (use `std::env::temp_dir()` if no store is in scope).
223pub fn build_pointer_with_extensions<R: Read>(
224    input: &mut R,
225    path: &str,
226    extensions: &[CleanExtension],
227    tmp_dir: &Path,
228) -> Result<Pointer, CleanError> {
229    if extensions.is_empty() {
230        let mut hasher = Sha256::new();
231        let mut buf = vec![0u8; COPY_BUFFER];
232        let mut size: u64 = 0;
233        loop {
234            let n = input.read(&mut buf)?;
235            if n == 0 {
236                break;
237            }
238            hasher.update(&buf[..n]);
239            size += n as u64;
240        }
241        let bytes: [u8; 32] = hasher.finalize().into();
242        return Ok(Pointer::new(Oid::from_bytes(bytes), size));
243    }
244
245    for ext in extensions {
246        if ext.command.trim().is_empty() {
247            return Err(CleanError::ExtensionMissingCommand {
248                name: ext.name.clone(),
249            });
250        }
251    }
252
253    std::fs::create_dir_all(tmp_dir)?;
254
255    let mut current_tmp = NamedTempFile::new_in(tmp_dir)?;
256    let orig_oid = hash_and_write(input, current_tmp.as_file_mut())?;
257    let mut input_oids: Vec<Oid> = Vec::with_capacity(extensions.len());
258    input_oids.push(orig_oid);
259
260    for (i, ext) in extensions.iter().enumerate() {
261        let cmd_str = ext.command.replace("%f", path);
262        let mut parts = cmd_str.split_whitespace();
263        let prog = parts
264            .next()
265            .ok_or_else(|| CleanError::ExtensionMissingCommand {
266                name: ext.name.clone(),
267            })?;
268        let args: Vec<&str> = parts.collect();
269
270        let stdin_file = std::fs::File::open(current_tmp.path())?;
271        let mut child = Command::new(prog)
272            .args(&args)
273            .stdin(stdin_file)
274            .stdout(Stdio::piped())
275            .stderr(Stdio::inherit())
276            .spawn()
277            .map_err(|e| CleanError::ExtensionSpawnFailed {
278                name: ext.name.clone(),
279                source: e,
280            })?;
281        let mut stdout = child.stdout.take().expect("piped stdout");
282
283        let is_last = i + 1 == extensions.len();
284        if is_last {
285            let mut hasher = Sha256::new();
286            let mut buf = vec![0u8; COPY_BUFFER];
287            let mut size: u64 = 0;
288            loop {
289                let n = stdout.read(&mut buf)?;
290                if n == 0 {
291                    break;
292                }
293                hasher.update(&buf[..n]);
294                size += n as u64;
295            }
296            let status = child.wait()?;
297            if !status.success() {
298                return Err(CleanError::ExtensionFailed {
299                    name: ext.name.clone(),
300                    status: status.code(),
301                });
302            }
303            let bytes: [u8; 32] = hasher.finalize().into();
304            return Ok(Pointer {
305                oid: Oid::from_bytes(bytes),
306                size,
307                extensions: build_pointer_extensions(extensions, &input_oids),
308                canonical: true,
309            });
310        }
311
312        let mut next_tmp = NamedTempFile::new_in(tmp_dir)?;
313        let next_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
314        let status = child.wait()?;
315        if !status.success() {
316            return Err(CleanError::ExtensionFailed {
317                name: ext.name.clone(),
318                status: status.code(),
319            });
320        }
321        current_tmp = next_tmp;
322        input_oids.push(next_oid);
323    }
324
325    unreachable!("extension chain exited without producing a pointer")
326}
327
328fn build_pointer_extensions(extensions: &[CleanExtension], input_oids: &[Oid]) -> Vec<Extension> {
329    extensions
330        .iter()
331        .enumerate()
332        .map(|(i, ext)| Extension {
333            name: ext.name.clone(),
334            priority: ext.priority,
335            oid: input_oids[i],
336        })
337        .collect()
338}
339
340fn hash_and_write<R: Read>(src: &mut R, dst: &mut std::fs::File) -> io::Result<Oid> {
341    let mut hasher = Sha256::new();
342    let mut buf = vec![0u8; COPY_BUFFER];
343    loop {
344        let n = src.read(&mut buf)?;
345        if n == 0 {
346            break;
347        }
348        hasher.update(&buf[..n]);
349        dst.write_all(&buf[..n])?;
350    }
351    dst.flush()?;
352    let bytes: [u8; 32] = hasher.finalize().into();
353    Ok(Oid::from_bytes(bytes))
354}
355
#[cfg(test)]
mod tests {
    use super::*;
    use git_lfs_pointer::VERSION_LATEST;
    use tempfile::TempDir;

    /// Create a store rooted in a fresh temporary directory. The `TempDir`
    /// guard is returned too so the directory outlives the test body.
    fn fixture() -> (TempDir, Store) {
        let tmp = TempDir::new().unwrap();
        let store = Store::new(tmp.path().join("lfs"));
        (tmp, store)
    }

    /// Run [`clean`] with no path and no extensions, returning the outcome
    /// together with everything written to the output stream.
    fn run(store: &Store, input: &[u8]) -> (CleanOutcome, Vec<u8>) {
        let mut out = Vec::new();
        let outcome = clean(store, &mut { input }, &mut out, "", &[]).unwrap();
        (outcome, out)
    }

    // ---------- Stored path ----------

    #[test]
    fn small_content_is_hashed_and_stored() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"hello world!");
        let p = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };
        assert_eq!(p.size, 12);
        assert!(store.contains(p.oid));
        assert_eq!(out, p.encode().as_bytes());
    }

    #[test]
    fn known_sha256_for_abc() {
        let (_t, store) = fixture();
        let (outcome, _) = run(&store, b"abc");
        let expected: Oid = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
            .parse()
            .unwrap();
        assert_eq!(outcome.pointer().oid, expected);
    }

    #[test]
    fn pseudo_pointer_with_extra_text_is_hashed() {
        // Starts like a pointer but carries trailing text, so it must be
        // treated as ordinary content.
        let input = b"version https://git-lfs.github.com/spec/v1\n\
                      oid sha256:7cd8be1d2cd0dd22cd9d229bb6b5785009a05e8b39d405615d882caac56562b5\n\
                      size 1024\n\
                      \n\
                      This is my test pointer.\n";
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, input);
        let p = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };
        assert_eq!(p.size, input.len() as u64);
        assert!(store.contains(p.oid));
        assert_eq!(out, p.encode().as_bytes());
    }

    #[test]
    fn oversized_pointer_shaped_input_is_hashed() {
        // A valid-looking pointer header followed by 2000 filler bytes.
        let mut input = Vec::from(
            &b"version https://git-lfs.github.com/spec/v1\n\
               oid sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc\n\
               size 5\n"[..],
        );
        input.extend(std::iter::repeat_n(b'x', 2000));
        let (_t, store) = fixture();
        let (outcome, _) = run(&store, &input);
        let p = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };
        assert_eq!(p.size, input.len() as u64);
        assert!(store.contains(p.oid));
    }

    #[test]
    fn streaming_megabyte_input_works() {
        let (_t, store) = fixture();
        let content: Vec<u8> = (0..1_048_576u32).map(|i| (i ^ (i >> 5)) as u8).collect();
        let (outcome, _) = run(&store, &content);
        assert_eq!(outcome.pointer().size, content.len() as u64);
        assert!(store.contains(outcome.pointer().oid));
    }

    // ---------- Passthrough path ----------

    #[test]
    fn canonical_pointer_passes_through_verbatim() {
        let (_t, store) = fixture();
        let oid_hex = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{oid_hex}\nsize 12345\n");
        let (outcome, out) = run(&store, pointer_text.as_bytes());
        match &outcome {
            CleanOutcome::Passthrough(p) => assert!(p.canonical),
            o => panic!("expected Passthrough, got {o:?}"),
        }
        assert_eq!(
            out,
            pointer_text.as_bytes(),
            "output must be input verbatim"
        );
        assert!(!store.root().join("objects").exists());
    }

    #[test]
    fn non_canonical_pointer_passes_through_verbatim() {
        // CRLF pointer: parses, marked non-canonical. Pass-through must keep
        // the CRLFs, *not* re-emit the canonical (LF) encoding — otherwise the
        // git blob hash would change underneath the user.
        let (_t, store) = fixture();
        let oid_hex = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
        let crlf = format!("version {VERSION_LATEST}\r\noid sha256:{oid_hex}\r\nsize 12345\r\n");
        let (outcome, out) = run(&store, crlf.as_bytes());
        match &outcome {
            CleanOutcome::Passthrough(p) => assert!(!p.canonical),
            o => panic!("expected Passthrough, got {o:?}"),
        }
        assert_eq!(out, crlf.as_bytes());
    }

    #[test]
    fn empty_input_is_passthrough_empty_pointer() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"");
        match &outcome {
            CleanOutcome::Passthrough(p) => {
                assert_eq!(p, &Pointer::empty());
            }
            o => panic!("expected Passthrough, got {o:?}"),
        }
        assert!(out.is_empty(), "empty pointer encodes to empty bytes");
    }

    #[test]
    fn passthrough_is_idempotent() {
        let (_t, store) = fixture();
        let (_, first) = run(&store, b"some content here");
        let (outcome2, second) = run(&store, &first);
        assert!(matches!(outcome2, CleanOutcome::Passthrough(_)));
        assert_eq!(first, second);
    }

    // ---------- Extensions ----------

    /// Use `tr a-z A-Z` as a stand-in for the case-inverter test
    /// extension. Verifies the chained subprocess + OID bookkeeping; the
    /// upstream Go test driver covers the more elaborate case-inverter
    /// semantics end-to-end.
    ///
    /// `tr` is a POSIX utility, so the test is only built on Unix targets.
    #[cfg(unix)]
    #[test]
    fn single_extension_records_input_oid() {
        let (_t, store) = fixture();
        let exts = vec![CleanExtension {
            name: "upper".into(),
            priority: 0,
            command: "tr a-z A-Z".into(),
        }];

        let mut out = Vec::new();
        let outcome = clean(&store, &mut &b"abc"[..], &mut out, "foo.txt", &exts).unwrap();

        let pointer = match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        };

        // Input was "abc" → SHA-256 well-known.
        let abc_oid: Oid = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
            .parse()
            .unwrap();
        // Output was "ABC" → distinct OID.
        let upper_oid: Oid = "b5d4045c3f466fa91fe2cc6abe79232a1a57cdf104f7a26e716e0a1e2789df78"
            .parse()
            .unwrap();

        assert_eq!(pointer.extensions.len(), 1);
        assert_eq!(pointer.extensions[0].name, "upper");
        assert_eq!(pointer.extensions[0].priority, 0);
        assert_eq!(pointer.extensions[0].oid, abc_oid);
        assert_eq!(pointer.oid, upper_oid);
        assert_eq!(pointer.size, 3);
        assert!(store.contains(upper_oid));
        // Stored bytes are "ABC".
        let mut f = store.open(upper_oid).unwrap();
        let mut bytes = Vec::new();
        std::io::Read::read_to_end(&mut f, &mut bytes).unwrap();
        assert_eq!(bytes, b"ABC");
    }

    #[test]
    fn extensions_skipped_for_passthrough_pointer() {
        // If the input is already a pointer, extensions are never invoked —
        // upstream's `clean` short-circuits before doing anything. Nothing
        // is spawned here, so this test needs no platform gate.
        let (_t, store) = fixture();
        let oid_hex = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{oid_hex}\nsize 12345\n");
        let exts = vec![CleanExtension {
            name: "fail".into(),
            priority: 0,
            // `false` would be invoked if we got past the passthrough check.
            command: "false".into(),
        }];
        let mut out = Vec::new();
        let outcome = clean(&store, &mut pointer_text.as_bytes(), &mut out, "x", &exts).unwrap();
        assert!(matches!(outcome, CleanOutcome::Passthrough(_)));
        assert_eq!(out, pointer_text.as_bytes());
    }

    /// Relies on the `false` utility exiting non-zero; where it does not
    /// exist the spawn itself would fail with a different error variant,
    /// so the test is only built on Unix targets.
    #[cfg(unix)]
    #[test]
    fn extension_failure_is_propagated() {
        let (_t, store) = fixture();
        let exts = vec![CleanExtension {
            name: "fail".into(),
            priority: 0,
            command: "false".into(),
        }];
        let mut out = Vec::new();
        let err = clean(&store, &mut &b"hello"[..], &mut out, "x", &exts).unwrap_err();
        assert!(
            matches!(err, CleanError::ExtensionFailed { .. }),
            "got {err:?}"
        );
    }
}