Skip to main content

git_lfs_filter/
clean.rs

1//! The clean filter: stdin → store + pointer-on-stdout.
2
3use std::io::{Read, Write};
4
5use git_lfs_pointer::Pointer;
6use git_lfs_store::{Store, StoreError};
7
8use crate::detect_pointer;
9
10/// Result of running the [`clean`] filter on a piece of input.
11#[derive(Debug)]
12pub enum CleanOutcome {
13    /// Input was already a valid pointer; the original bytes were emitted
14    /// to the output stream verbatim and nothing was inserted into the store.
15    /// This is what makes `git lfs clean` idempotent on already-cleaned blobs.
16    Passthrough(Pointer),
17    /// Input was content; it was hashed and inserted into the store, and the
18    /// canonical encoding of the resulting [`Pointer`] was written to the
19    /// output stream.
20    Stored(Pointer),
21}
22
23impl CleanOutcome {
24    /// The pointer associated with this outcome (the parsed pass-through one,
25    /// or the freshly-stored one).
26    pub fn pointer(&self) -> &Pointer {
27        match self {
28            Self::Passthrough(p) | Self::Stored(p) => p,
29        }
30    }
31
32    /// `true` if the input was recognized as an existing pointer.
33    pub fn was_passthrough(&self) -> bool {
34        matches!(self, Self::Passthrough(_))
35    }
36}
37
38/// Apply the clean filter to `input`, writing the resulting pointer (or the
39/// pass-through bytes) to `output`.
40///
41/// Algorithm (matches upstream `gitfilter_clean.go`):
42/// 1. Read up to `MAX_POINTER_SIZE` bytes.
43/// 2. If those bytes parse as a valid pointer, emit them verbatim
44///    ([`CleanOutcome::Passthrough`]).
45/// 3. Otherwise stream the buffered head + the rest of `input` into the
46///    [`Store`], computing SHA-256 as we go, and emit the canonical encoding
47///    of the resulting pointer ([`CleanOutcome::Stored`]).
48pub fn clean<R: Read, W: Write>(
49    store: &Store,
50    input: &mut R,
51    output: &mut W,
52) -> Result<CleanOutcome, StoreError> {
53    let (head, maybe_pointer) = detect_pointer(input)?;
54
55    if let Some(pointer) = maybe_pointer {
56        output.write_all(&head)?;
57        return Ok(CleanOutcome::Passthrough(pointer));
58    }
59
60    let mut combined = head.as_slice().chain(input);
61    let (oid, size) = store.insert(&mut combined)?;
62    let pointer = Pointer::new(oid, size);
63    output.write_all(pointer.encode().as_bytes())?;
64    Ok(CleanOutcome::Stored(pointer))
65}
66
#[cfg(test)]
mod tests {
    use super::*;
    use git_lfs_pointer::{Oid, VERSION_LATEST};
    use tempfile::TempDir;

    /// SHA-256 hex digest used to build pointer text for the pass-through tests.
    const OID_HEX: &str = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";

    /// Creates a scratch directory and a [`Store`] rooted at `<tmp>/lfs`.
    /// The `TempDir` is handed back so the caller can keep it bound for the
    /// duration of the test.
    fn fixture() -> (TempDir, Store) {
        let scratch = TempDir::new().unwrap();
        let lfs_root = scratch.path().join("lfs");
        (scratch, Store::new(lfs_root))
    }

    /// Runs [`clean`] over `input` and returns the outcome together with every
    /// byte written to the output stream. Panics if `clean` returns an error.
    fn run(store: &Store, input: &[u8]) -> (CleanOutcome, Vec<u8>) {
        let mut reader = input;
        let mut emitted = Vec::new();
        let outcome = clean(store, &mut reader, &mut emitted).unwrap();
        (outcome, emitted)
    }

    /// Unwraps a [`CleanOutcome::Stored`], panicking on any other variant.
    fn expect_stored(outcome: CleanOutcome) -> Pointer {
        match outcome {
            CleanOutcome::Stored(p) => p,
            o => panic!("expected Stored, got {o:?}"),
        }
    }

    /// Borrows the pointer of a [`CleanOutcome::Passthrough`], panicking on any
    /// other variant.
    fn expect_passthrough(outcome: &CleanOutcome) -> &Pointer {
        match outcome {
            CleanOutcome::Passthrough(p) => p,
            o => panic!("expected Passthrough, got {o:?}"),
        }
    }

    /// Pointer text for [`OID_HEX`] with size 12345, using `eol` as the line
    /// terminator on every line.
    fn pointer_text(eol: &str) -> String {
        format!("version {VERSION_LATEST}{eol}oid sha256:{OID_HEX}{eol}size 12345{eol}")
    }

    // ---------- Stored path ----------

    #[test]
    fn small_content_is_hashed_and_stored() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"hello world!");
        let stored = expect_stored(outcome);
        assert_eq!(stored.size, 12);
        assert!(store.contains(stored.oid));
        assert_eq!(out, stored.encode().as_bytes());
    }

    #[test]
    fn known_sha256_for_abc() {
        let (_t, store) = fixture();
        let (outcome, _) = run(&store, b"abc");
        let expected = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
            .parse::<Oid>()
            .unwrap();
        assert_eq!(outcome.pointer().oid, expected);
    }

    #[test]
    fn pseudo_pointer_with_extra_text_is_hashed() {
        // Starts out like a pointer but carries trailing free text, so it must
        // be treated as ordinary content.
        let input = b"version https://git-lfs.github.com/spec/v1\n\
            oid sha256:7cd8be1d2cd0dd22cd9d229bb6b5785009a05e8b39d405615d882caac56562b5\n\
            size 1024\n\
            \n\
            This is my test pointer.\n";
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, input);
        let stored = expect_stored(outcome);
        assert_eq!(stored.size, input.len() as u64);
        assert!(store.contains(stored.oid));
        assert_eq!(out, stored.encode().as_bytes());
    }

    #[test]
    fn oversized_pointer_shaped_input_is_hashed() {
        // Pointer-shaped header followed by 2000 filler bytes.
        let header = b"version https://git-lfs.github.com/spec/v1\n\
            oid sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc\n\
            size 5\n";
        let mut input = header.to_vec();
        input.resize(header.len() + 2000, b'x');
        let (_t, store) = fixture();
        let (outcome, _) = run(&store, &input);
        let stored = expect_stored(outcome);
        assert_eq!(stored.size, input.len() as u64);
        assert!(store.contains(stored.oid));
    }

    #[test]
    fn streaming_megabyte_input_works() {
        let (_t, store) = fixture();
        // 1 MiB of deterministic, non-constant bytes (the `as u8` truncation
        // is intentional).
        let payload: Vec<u8> = (0..1_048_576u32).map(|i| (i ^ (i >> 5)) as u8).collect();
        let (outcome, _) = run(&store, &payload);
        let pointer = outcome.pointer();
        assert_eq!(pointer.size, payload.len() as u64);
        assert!(store.contains(pointer.oid));
    }

    // ---------- Passthrough path ----------

    #[test]
    fn canonical_pointer_passes_through_verbatim() {
        let (_t, store) = fixture();
        let text = pointer_text("\n");
        let (outcome, out) = run(&store, text.as_bytes());
        assert!(expect_passthrough(&outcome).canonical);
        assert_eq!(out, text.as_bytes(), "output must be input verbatim");
        // Pass-through must not have created anything under the store root.
        assert!(!store.root().join("objects").exists());
    }

    #[test]
    fn non_canonical_pointer_passes_through_verbatim() {
        // A CRLF pointer still parses but is flagged non-canonical. The filter
        // has to echo the CRLF bytes untouched rather than re-emit the
        // canonical LF encoding; otherwise the git blob hash would silently
        // change underneath the user.
        let (_t, store) = fixture();
        let crlf = pointer_text("\r\n");
        let (outcome, out) = run(&store, crlf.as_bytes());
        assert!(!expect_passthrough(&outcome).canonical);
        assert_eq!(out, crlf.as_bytes());
    }

    #[test]
    fn empty_input_is_passthrough_empty_pointer() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"");
        assert_eq!(expect_passthrough(&outcome), &Pointer::empty());
        assert!(out.is_empty(), "empty pointer encodes to empty bytes");
    }

    #[test]
    fn passthrough_is_idempotent() {
        // Cleaning the output of a previous clean must reproduce it unchanged.
        let (_t, store) = fixture();
        let (_, first) = run(&store, b"some content here");
        let (second_outcome, second) = run(&store, &first);
        assert!(second_outcome.was_passthrough());
        assert_eq!(first, second);
    }
}