Skip to main content

git_lfs_filter/
smudge.rs

1//! The smudge filter: pointer-on-stdin → content-on-stdout.
2
3use std::fs;
4use std::io::{self, Read, Write};
5use std::path::Path;
6use std::process::{Command, Stdio};
7
8use git_lfs_pointer::{Oid, Pointer};
9use git_lfs_store::Store;
10use sha2::{Digest, Sha256};
11use tempfile::NamedTempFile;
12
13use crate::FetchError;
14use crate::detect_pointer;
15
/// Size in bytes (64 KiB) of the scratch buffer each hashing copy loop
/// in this module allocates.
const COPY_BUFFER: usize = 1 << 16;
17
/// Smudge-side configuration for a single pointer extension.
///
/// The clean-side counterpart is [`crate::CleanExtension`]. They are
/// kept as two types because the commands are read from different
/// config keys (`lfs.extension.<name>.clean` and
/// `lfs.extension.<name>.smudge`) and feed different code paths.
#[derive(Debug, Clone)]
pub struct SmudgeExtension {
    /// The `<name>` part of `lfs.extension.<name>`. Matched against the
    /// extension names recorded in a pointer when the chain is built.
    pub name: String,
    /// Chain position, a single decimal digit (0-9). Smudge runs
    /// extensions in reverse priority order. Note that this module
    /// never reads the field: the order is derived from the pointer's
    /// own extension list.
    pub priority: u8,
    /// Shell command exactly as configured in
    /// `lfs.extension.<name>.smudge`. Each `%f` placeholder is replaced
    /// by the working-tree path before the command is run.
    pub command: String,
}
35
36/// Result of running the [`smudge`] filter on a piece of input.
37#[derive(Debug)]
38pub enum SmudgeOutcome {
39    /// Input wasn't a pointer (or was malformed) and was emitted to
40    /// the output stream verbatim.
41    ///
42    /// Matches upstream's "smudge with invalid pointer" behavior:
43    /// git wraps everything through the filter, and non-LFS content
44    /// has to come out unchanged.
45    Passthrough,
46    /// Input was a pointer; its content was streamed from the store
47    /// to the output (or it was the empty pointer, which writes
48    /// nothing).
49    Resolved(Pointer),
50}
51
52/// Things that can go wrong while running [`smudge`].
53#[derive(Debug, thiserror::Error)]
54pub enum SmudgeError {
55    /// Filesystem-level failure: reading the input, writing the
56    /// output, opening the stored object, etc.
57    #[error(transparent)]
58    Io(#[from] io::Error),
59    /// The pointer references an object that isn't in the local store.
60    /// [`smudge_with_fetch`] handles this by invoking the caller's fetch
61    /// closure; bare [`smudge`] surfaces it for the caller to react to.
62    #[error("object {} (size {}) is not present in the local store", .0.oid, .0.size)]
63    ObjectMissing(Pointer),
64    /// The fetch closure passed to [`smudge_with_fetch`] failed to produce
65    /// the missing object.
66    #[error("fetch failed: {0}")]
67    FetchFailed(FetchError),
68    /// Pointer references an extension by name that isn't configured in
69    /// `lfs.extension.<name>.smudge`. Mirrors upstream's
70    /// `extension '%s' is not configured`.
71    #[error("extension {name:?} is not configured")]
72    ExtensionNotConfigured { name: String },
73    /// Configured extension has an empty `smudge` command.
74    #[error("extension {name:?} has no smudge command configured")]
75    ExtensionMissingCommand { name: String },
76    /// Failed to spawn the extension subprocess.
77    #[error("failed to spawn extension {name:?}: {source}")]
78    ExtensionSpawnFailed {
79        name: String,
80        #[source]
81        source: io::Error,
82    },
83    /// Extension subprocess exited non-zero.
84    #[error("extension {name:?} exited with status {status:?}")]
85    ExtensionFailed { name: String, status: Option<i32> },
86    /// An extension's output (or the stored object's content) didn't
87    /// hash to the OID recorded in the pointer. Either the extension
88    /// is non-deterministic, the on-disk object is corrupt, or the
89    /// extension is the wrong implementation for what cleaned the file.
90    #[error("OID mismatch for {stage}: expected {expected}, got {actual}")]
91    OidMismatch {
92        stage: String,
93        expected: Oid,
94        actual: Oid,
95    },
96}
97
98/// Apply the smudge filter to `input`, writing the working-tree content
99/// (or pass-through bytes) to `output`.
100///
101/// 1. If `input` parses as a pointer, look the OID up in the store and
102///    stream the bytes out (running configured pointer extensions in
103///    reverse priority order when the pointer carries any).
104/// 2. If `input` doesn't parse as a pointer, pass it through verbatim.
105///
106/// `path` is the working-tree path passed to git's filter; substituted
107/// for `%f` in each extension's smudge command. `extensions` is the
108/// configured `lfs.extension.<name>` set; its order doesn't matter
109/// (the chain is built from `pointer.extensions` in priority order).
110pub fn smudge<R: Read, W: Write>(
111    store: &Store,
112    input: &mut R,
113    output: &mut W,
114    path: &str,
115    extensions: &[SmudgeExtension],
116) -> Result<SmudgeOutcome, SmudgeError> {
117    let (head, maybe_pointer) = detect_pointer(input)?;
118
119    let Some(pointer) = maybe_pointer else {
120        // Not a pointer: pass bytes through unchanged.
121        output.write_all(&head)?;
122        io::copy(input, output)?;
123        return Ok(SmudgeOutcome::Passthrough);
124    };
125
126    if pointer.is_empty() {
127        return Ok(SmudgeOutcome::Resolved(pointer));
128    }
129
130    // Treat any size mismatch as "missing": same OID + different size means
131    // a corrupt or partial local copy, and the recovery path is the same
132    // as a real miss — re-download.
133    if !store.contains_with_size(pointer.oid, pointer.size) {
134        return Err(SmudgeError::ObjectMissing(pointer));
135    }
136
137    smudge_object_to(store, &pointer, output, path, extensions, None)?;
138    Ok(SmudgeOutcome::Resolved(pointer))
139}
140
141/// Like [`smudge`], but on a missing-object miss invokes `fetch` to populate
142/// the store, then streams the freshly-fetched bytes to `output`.
143///
144/// `fetch` receives the [`Pointer`] of the missing object; the
145/// caller is expected to download exactly that OID into the local
146/// store. After a successful return, this function re-checks the
147/// store and streams the content; if the store *still* doesn't have
148/// the object, an [`SmudgeError::ObjectMissing`] is surfaced (i.e.
149/// the fetch lied).
150pub fn smudge_with_fetch<R, W, F>(
151    store: &Store,
152    input: &mut R,
153    output: &mut W,
154    path: &str,
155    extensions: &[SmudgeExtension],
156    mut fetch: F,
157) -> Result<SmudgeOutcome, SmudgeError>
158where
159    R: Read,
160    W: Write,
161    F: FnMut(&Pointer) -> Result<(), FetchError>,
162{
163    match smudge(store, input, output, path, extensions) {
164        Err(SmudgeError::ObjectMissing(pointer)) => {
165            fetch(&pointer).map_err(SmudgeError::FetchFailed)?;
166            if !store.contains_with_size(pointer.oid, pointer.size) {
167                return Err(SmudgeError::ObjectMissing(pointer));
168            }
169            smudge_object_to(store, &pointer, output, path, extensions, None)?;
170            Ok(SmudgeOutcome::Resolved(pointer))
171        }
172        other => other,
173    }
174}
175
176/// Stream the working-tree content for an already-parsed `pointer` to
177/// `output`.
178///
179/// Used by `pull` and `checkout`, which have the pointer in hand
180/// from the index walk. `spawn_cwd` is the working directory each
181/// extension subprocess runs from: pass `Some(work_tree_root)` from
182/// pull or checkout (so a `git lfs pull` invoked from a subdirectory
183/// still finds `.git/`); the smudge filter (called by git from the
184/// work-tree root) can pass `None` to inherit the parent's cwd.
185///
186/// The caller must have already verified `store.contains_with_size`;
187/// this function won't fetch.
188pub fn smudge_object_to<W: Write>(
189    store: &Store,
190    pointer: &Pointer,
191    output: &mut W,
192    path: &str,
193    extensions: &[SmudgeExtension],
194    spawn_cwd: Option<&Path>,
195) -> Result<(), SmudgeError> {
196    if pointer.extensions.is_empty() {
197        let mut file = store.open(pointer.oid)?;
198        io::copy(&mut file, output)?;
199        return Ok(());
200    }
201    apply_smudge_chain(store, pointer, output, path, extensions, spawn_cwd)
202}
203
204fn apply_smudge_chain<W: Write>(
205    store: &Store,
206    pointer: &Pointer,
207    output: &mut W,
208    path: &str,
209    extensions: &[SmudgeExtension],
210    spawn_cwd: Option<&Path>,
211) -> Result<(), SmudgeError> {
212    // Match each pointer extension to its registered config by name.
213    // Walk in *reverse* priority order — clean ran ext0 → ext1 → store;
214    // smudge undoes that with ext1 → ext0 → working tree.
215    let mut chain: Vec<(&SmudgeExtension, Oid)> = Vec::with_capacity(pointer.extensions.len());
216    for ptr_ext in &pointer.extensions {
217        let registered = extensions
218            .iter()
219            .find(|e| e.name == ptr_ext.name)
220            .ok_or_else(|| SmudgeError::ExtensionNotConfigured {
221                name: ptr_ext.name.clone(),
222            })?;
223        if registered.command.trim().is_empty() {
224            return Err(SmudgeError::ExtensionMissingCommand {
225                name: registered.name.clone(),
226            });
227        }
228        chain.push((registered, ptr_ext.oid));
229    }
230    chain.reverse();
231
232    let tmp_dir = store.tmp_dir();
233    fs::create_dir_all(&tmp_dir)?;
234
235    // Stage 0: copy the stored object into a tmp file. Verify the
236    // input hash equals `pointer.oid` — should always hold (the store
237    // is content-addressed) but a corrupt object would otherwise
238    // surface as a confusing extension-output mismatch later on.
239    let mut current_tmp = NamedTempFile::new_in(&tmp_dir)?;
240    let mut store_file = store.open(pointer.oid)?;
241    let initial_oid = hash_and_write(&mut store_file, current_tmp.as_file_mut())?;
242    if initial_oid != pointer.oid {
243        return Err(SmudgeError::OidMismatch {
244            stage: format!("stored object {}", pointer.oid),
245            expected: pointer.oid,
246            actual: initial_oid,
247        });
248    }
249
250    for (i, (ext, expected_out_oid)) in chain.iter().enumerate() {
251        let cmd_str = ext.command.replace("%f", path);
252        let mut parts = cmd_str.split_whitespace();
253        let prog = parts
254            .next()
255            .ok_or_else(|| SmudgeError::ExtensionMissingCommand {
256                name: ext.name.clone(),
257            })?;
258        let args: Vec<&str> = parts.collect();
259
260        let stdin_file = std::fs::File::open(current_tmp.path())?;
261        let mut command = Command::new(prog);
262        command
263            .args(&args)
264            .stdin(stdin_file)
265            .stdout(Stdio::piped())
266            .stderr(Stdio::inherit());
267        if let Some(dir) = spawn_cwd {
268            command.current_dir(dir);
269        }
270        let mut child = command
271            .spawn()
272            .map_err(|e| SmudgeError::ExtensionSpawnFailed {
273                name: ext.name.clone(),
274                source: e,
275            })?;
276        let mut stdout = child.stdout.take().expect("piped stdout");
277
278        let is_last = i + 1 == chain.len();
279        if is_last {
280            let actual_oid = hash_and_copy(&mut stdout, output)?;
281            let status = child.wait()?;
282            if !status.success() {
283                return Err(SmudgeError::ExtensionFailed {
284                    name: ext.name.clone(),
285                    status: status.code(),
286                });
287            }
288            if actual_oid != *expected_out_oid {
289                return Err(SmudgeError::OidMismatch {
290                    stage: format!("smudge output of extension {:?}", ext.name),
291                    expected: *expected_out_oid,
292                    actual: actual_oid,
293                });
294            }
295            return Ok(());
296        }
297
298        let mut next_tmp = NamedTempFile::new_in(&tmp_dir)?;
299        let actual_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
300        let status = child.wait()?;
301        if !status.success() {
302            return Err(SmudgeError::ExtensionFailed {
303                name: ext.name.clone(),
304                status: status.code(),
305            });
306        }
307        if actual_oid != *expected_out_oid {
308            return Err(SmudgeError::OidMismatch {
309                stage: format!("smudge output of extension {:?}", ext.name),
310                expected: *expected_out_oid,
311                actual: actual_oid,
312            });
313        }
314        current_tmp = next_tmp;
315    }
316    unreachable!("smudge chain exited without writing output")
317}
318
319fn hash_and_write<R: Read>(src: &mut R, dst: &mut std::fs::File) -> io::Result<Oid> {
320    let mut hasher = Sha256::new();
321    let mut buf = vec![0u8; COPY_BUFFER];
322    loop {
323        let n = src.read(&mut buf)?;
324        if n == 0 {
325            break;
326        }
327        hasher.update(&buf[..n]);
328        dst.write_all(&buf[..n])?;
329    }
330    dst.flush()?;
331    let bytes: [u8; 32] = hasher.finalize().into();
332    Ok(Oid::from_bytes(bytes))
333}
334
335fn hash_and_copy<R: Read, W: Write>(src: &mut R, dst: &mut W) -> io::Result<Oid> {
336    let mut hasher = Sha256::new();
337    let mut buf = vec![0u8; COPY_BUFFER];
338    loop {
339        let n = src.read(&mut buf)?;
340        if n == 0 {
341            break;
342        }
343        hasher.update(&buf[..n]);
344        dst.write_all(&buf[..n])?;
345    }
346    let bytes: [u8; 32] = hasher.finalize().into();
347    Ok(Oid::from_bytes(bytes))
348}
349
/// Unit tests for the smudge filter: resolution from the store,
/// pass-through of non-pointer input, error reporting, the fetch
/// fallback, and the extension chain.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::clean;
    use git_lfs_pointer::VERSION_LATEST;
    use tempfile::TempDir;

    /// A fresh store rooted in a temp dir. The `TempDir` is returned
    /// too so it outlives the test body.
    fn fixture() -> (TempDir, Store) {
        let tmp = TempDir::new().unwrap();
        let store = Store::new(tmp.path().join("lfs"));
        (tmp, store)
    }

    /// Smudge `input` with an empty path and no extensions, returning
    /// the outcome together with everything written to the output.
    fn run(store: &Store, input: &[u8]) -> (Result<SmudgeOutcome, SmudgeError>, Vec<u8>) {
        let mut out = Vec::new();
        let outcome = smudge(store, &mut { input }, &mut out, "", &[]);
        (outcome, out)
    }

    /// Insert content via the clean filter and return the resulting pointer text.
    fn clean_into(store: &Store, content: &[u8]) -> Vec<u8> {
        let mut out = Vec::new();
        clean(store, &mut { content }, &mut out, "", &[]).unwrap();
        out
    }

    // ---------- Resolved ----------

    #[test]
    fn pointer_resolves_from_store() {
        let (_t, store) = fixture();
        let content = b"smudge a\n";
        let pointer_text = clean_into(&store, content);

        let (outcome, out) = run(&store, &pointer_text);
        let p = match outcome.unwrap() {
            SmudgeOutcome::Resolved(p) => p,
            o => panic!("expected Resolved, got {o:?}"),
        };
        assert_eq!(p.size, content.len() as u64);
        assert_eq!(out, content);
    }

    #[test]
    fn empty_pointer_writes_nothing() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"");
        match outcome.unwrap() {
            SmudgeOutcome::Resolved(p) => assert!(p.is_empty()),
            o => panic!("expected Resolved(empty), got {o:?}"),
        }
        assert!(out.is_empty());
    }

    #[test]
    fn clean_smudge_round_trip_preserves_bytes() {
        let (_t, store) = fixture();
        for content in [
            &b""[..],
            &b"hello"[..],
            &b"binary \x00\x01\xff data"[..],
            &(0..256u16).map(|i| i as u8).collect::<Vec<_>>(),
        ] {
            let pointer_text = clean_into(&store, content);
            let mut out = Vec::new();
            smudge(&store, &mut { &pointer_text[..] }, &mut out, "", &[]).unwrap();
            assert_eq!(out, content, "round-trip failed for {content:?}");
        }
    }

    // ---------- Passthrough ----------

    #[test]
    fn invalid_pointer_passes_through_short() {
        let (_t, store) = fixture();
        for input in [&b"wat"[..], b"not a git-lfs file", b"version "] {
            let (outcome, out) = run(&store, input);
            assert!(matches!(outcome.unwrap(), SmudgeOutcome::Passthrough));
            assert_eq!(out, input);
        }
    }

    #[test]
    fn long_non_pointer_passes_through() {
        // > MAX_POINTER_SIZE bytes — exercises the head buffer + io::copy path.
        let (_t, store) = fixture();
        let content: Vec<u8> = (0..2048u32).map(|i| (i ^ (i >> 3)) as u8).collect();
        let (outcome, out) = run(&store, &content);
        assert!(matches!(outcome.unwrap(), SmudgeOutcome::Passthrough));
        assert_eq!(out, content);
    }

    // ---------- Errors ----------

    #[test]
    fn missing_object_errors() {
        let (_t, store) = fixture();
        let unknown_oid = "0000000000000000000000000000000000000000000000000000000000000001";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{unknown_oid}\nsize 5\n");
        let (outcome, out) = run(&store, pointer_text.as_bytes());
        match outcome.unwrap_err() {
            SmudgeError::ObjectMissing(pointer) => {
                assert_eq!(pointer.oid.to_string(), unknown_oid);
                assert_eq!(pointer.size, 5);
            }
            e => panic!("expected ObjectMissing, got {e:?}"),
        }
        assert!(out.is_empty(), "no partial output on miss");
    }

    #[test]
    fn size_mismatch_treated_as_missing() {
        let (_t, store) = fixture();
        let pointer_text = clean_into(&store, b"abc"); // size = 3
        // Replace "size 3" with "size 99" — parses fine, but won't match the
        // 3-byte object on disk.
        let tampered = String::from_utf8(pointer_text)
            .unwrap()
            .replace("size 3", "size 99");
        let (outcome, _) = run(&store, tampered.as_bytes());
        match outcome.unwrap_err() {
            SmudgeError::ObjectMissing(p) => assert_eq!(p.size, 99),
            e => panic!("expected ObjectMissing, got {e:?}"),
        }
    }

    // ---------- smudge_with_fetch ----------

    #[test]
    fn fetch_populates_store_then_streams() {
        let (_t, store) = fixture();
        let content = b"to be fetched\n";
        // Clean the content to obtain its pointer text. This also
        // inserts the object, so it is deleted again right below: from
        // smudge's perspective the store is then "empty", and the fetch
        // closure is what actually repopulates it.
        let pointer_text = clean_into(&store, content);
        // Wipe the just-inserted object to simulate a true miss.
        let parsed = git_lfs_pointer::Pointer::parse(&pointer_text).unwrap();
        std::fs::remove_file(store.object_path(parsed.oid)).unwrap();
        assert!(!store.contains(parsed.oid));

        let mut out = Vec::new();
        let store_ref = &store;
        let outcome = smudge_with_fetch(
            &store,
            &mut { &pointer_text[..] },
            &mut out,
            "",
            &[],
            |p: &Pointer| {
                // "Download" by inserting the bytes synchronously.
                store_ref.insert(&mut { &content[..] }).unwrap();
                assert_eq!(p.size, content.len() as u64);
                Ok(())
            },
        );
        assert!(matches!(outcome.unwrap(), SmudgeOutcome::Resolved(_)));
        assert_eq!(out, content);
    }

    #[test]
    fn fetch_failure_surfaces_as_fetch_failed() {
        let (_t, store) = fixture();
        let unknown = "0000000000000000000000000000000000000000000000000000000000000001";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{unknown}\nsize 5\n");
        let mut out = Vec::new();
        let outcome = smudge_with_fetch(
            &store,
            &mut { pointer_text.as_bytes() },
            &mut out,
            "",
            &[],
            |_p: &Pointer| Err("server is on fire".into()),
        );
        match outcome.unwrap_err() {
            SmudgeError::FetchFailed(e) => {
                assert!(e.to_string().contains("server is on fire"));
            }
            other => panic!("expected FetchFailed, got {other:?}"),
        }
        assert!(out.is_empty());
    }

    #[test]
    fn fetch_returning_ok_but_not_inserting_still_errors() {
        // Closure lies — claims success but didn't populate the store.
        let (_t, store) = fixture();
        let unknown = "0000000000000000000000000000000000000000000000000000000000000001";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{unknown}\nsize 5\n");
        let mut out = Vec::new();
        let outcome = smudge_with_fetch(
            &store,
            &mut { pointer_text.as_bytes() },
            &mut out,
            "",
            &[],
            |_p: &Pointer| Ok(()),
        );
        assert!(matches!(
            outcome.unwrap_err(),
            SmudgeError::ObjectMissing(_)
        ));
    }

    #[test]
    fn fetch_not_invoked_when_object_already_present() {
        let (_t, store) = fixture();
        let content = b"already here";
        let pointer_text = clean_into(&store, content);
        let mut out = Vec::new();
        let mut calls = 0;
        smudge_with_fetch(
            &store,
            &mut { &pointer_text[..] },
            &mut out,
            "",
            &[],
            |_p: &Pointer| {
                calls += 1;
                Ok(())
            },
        )
        .unwrap();
        assert_eq!(
            calls, 0,
            "fetch must not be called when store has the object"
        );
        assert_eq!(out, content);
    }

    // ---------- Extensions ----------

    /// Round-trip clean → smudge through `tr a-z A-Z` (the lower-case-
    /// inverter stand-in we use for cli tests too). Verifies the chained
    /// subprocess + OID bookkeeping. The upstream Go tests exercise the
    /// case-inverter end-to-end — this is the unit-level analog.
    #[test]
    fn single_extension_round_trips() {
        let (_t, store) = fixture();
        let clean_exts = vec![crate::CleanExtension {
            name: "upper".into(),
            priority: 0,
            command: "tr a-z A-Z".into(),
        }];
        let smudge_exts = vec![SmudgeExtension {
            name: "upper".into(),
            priority: 0,
            command: "tr A-Z a-z".into(),
        }];

        // Clean "abc" → store "ABC", pointer with ext-0-upper.
        let mut pointer_buf = Vec::new();
        crate::clean(
            &store,
            &mut &b"abc"[..],
            &mut pointer_buf,
            "foo.txt",
            &clean_exts,
        )
        .unwrap();

        // Smudge that pointer back through the extension chain → "abc".
        let mut out = Vec::new();
        let outcome = smudge(
            &store,
            &mut pointer_buf.as_slice(),
            &mut out,
            "foo.txt",
            &smudge_exts,
        )
        .unwrap();
        assert!(matches!(outcome, SmudgeOutcome::Resolved(_)));
        assert_eq!(out, b"abc");
    }

    #[test]
    fn extension_not_configured_errors() {
        let (_t, store) = fixture();
        let oid_hex = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
        let ext_oid = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff";
        let pointer_text = format!(
            "version {VERSION_LATEST}\n\
             ext-0-foo sha256:{ext_oid}\n\
             oid sha256:{oid_hex}\n\
             size 12345\n",
        );
        let mut out = Vec::new();
        let err = smudge(&store, &mut pointer_text.as_bytes(), &mut out, "x", &[]).unwrap_err();
        // We hit ObjectMissing first because the store doesn't have the
        // referenced OID; ExtensionNotConfigured would surface only
        // after the object is present. Fine for this test — the goal
        // is just to confirm we no longer error with an "unsupported"
        // shaped variant.
        assert!(matches!(err, SmudgeError::ObjectMissing(_)));
    }

    /// Companion to `extension_not_configured_errors`: with the object
    /// actually present in the store, a pointer naming an unconfigured
    /// extension must fail with `ExtensionNotConfigured`, and nothing
    /// may be written to the output.
    #[test]
    fn extension_not_configured_errors_when_object_present() {
        let (_t, store) = fixture();
        let ext_oid = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff";
        // Clean real content so the store holds the object, then splice
        // an extension line in front of the pointer's `oid` line.
        let pointer_text = String::from_utf8(clean_into(&store, b"abc"))
            .unwrap()
            .replacen(
                "oid sha256:",
                &format!("ext-0-foo sha256:{ext_oid}\noid sha256:"),
                1,
            );
        let mut out = Vec::new();
        let err = smudge(&store, &mut pointer_text.as_bytes(), &mut out, "x", &[]).unwrap_err();
        match err {
            SmudgeError::ExtensionNotConfigured { name } => assert_eq!(name, "foo"),
            e => panic!("expected ExtensionNotConfigured, got {e:?}"),
        }
        assert!(out.is_empty(), "no output when the chain can't be built");
    }
}