Skip to main content

git_lfs_filter/
smudge.rs

1//! The smudge filter: pointer-on-stdin → content-on-stdout.
2
3use std::fs;
4use std::io::{self, Read, Write};
5use std::path::Path;
6use std::process::{Command, Stdio};
7
8use git_lfs_pointer::{Oid, Pointer};
9use git_lfs_store::Store;
10use sha2::{Digest, Sha256};
11use tempfile::NamedTempFile;
12
13use crate::FetchError;
14use crate::detect_pointer;
15
/// Size in bytes (64 KiB) of the scratch buffer that [`hash_and_write`] and
/// [`hash_and_copy`] allocate for their read → hash → write loops.
const COPY_BUFFER: usize = 64 * 1024;
17
/// One pointer extension's smudge side. Mirrors [`crate::CleanExtension`]
/// — separate types because clean/smudge commands are distinct config
/// keys (`lfs.extension.<name>.clean` vs `.smudge`) and different code
/// paths consume them.
///
/// Derives `PartialEq`/`Eq` so configured extension sets can be compared
/// directly (assertions, change detection) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SmudgeExtension {
    /// Extension name; matched against the extension names recorded in a
    /// pointer when the smudge chain is built.
    pub name: String,
    /// Configured priority. The chain builder in this module follows the
    /// order of the pointer's own extension list and does not read this
    /// field.
    pub priority: u8,
    /// Smudge command line. It is split on whitespace into a program and
    /// its arguments, with `%f` standing for the working-tree path.
    pub command: String,
}
28
/// Result of running the [`smudge`] filter on a piece of input.
///
/// Both variants are success cases; failures are reported through
/// [`SmudgeError`] instead.
#[derive(Debug)]
pub enum SmudgeOutcome {
    /// Input wasn't a pointer (or was malformed) and was emitted to the
    /// output stream verbatim. This matches upstream's "smudge with invalid
    /// pointer" behavior — git wraps everything through the filter, and
    /// non-LFS content has to come out unchanged.
    Passthrough,
    /// Input was a pointer; its content was streamed from the store to the
    /// output (or it was the empty pointer, which writes nothing). Carries
    /// the parsed pointer so the caller can inspect its OID and size.
    Resolved(Pointer),
}
41
/// Everything that can go wrong while smudging: plain I/O failures, a
/// missing or unfetchable object, and the failure modes of the pointer
/// extension chain (configuration, spawning, exit status, OID checks).
#[derive(Debug, thiserror::Error)]
pub enum SmudgeError {
    /// An underlying I/O operation failed (reading the input, opening the
    /// stored object, creating tmp files, writing the output, or waiting
    /// on an extension subprocess).
    #[error(transparent)]
    Io(#[from] io::Error),
    /// The pointer references an object that isn't in the local store.
    /// [`smudge_with_fetch`] handles this by invoking the caller's fetch
    /// closure; bare [`smudge`] surfaces it for the caller to react to.
    #[error("object {} (size {}) is not present in the local store", .0.oid, .0.size)]
    ObjectMissing(Pointer),
    /// The fetch closure passed to [`smudge_with_fetch`] failed to produce
    /// the missing object.
    #[error("fetch failed: {0}")]
    FetchFailed(FetchError),
    /// Pointer references an extension by name that isn't configured in
    /// `lfs.extension.<name>.smudge`. Mirrors upstream's
    /// `extension '%s' is not configured`.
    #[error("extension {name:?} is not configured")]
    ExtensionNotConfigured { name: String },
    /// Configured extension has an empty `smudge` command.
    #[error("extension {name:?} has no smudge command configured")]
    ExtensionMissingCommand { name: String },
    /// Failed to spawn the extension subprocess.
    #[error("failed to spawn extension {name:?}: {source}")]
    ExtensionSpawnFailed {
        name: String,
        #[source]
        source: io::Error,
    },
    /// Extension subprocess exited non-zero. `status` is the value of
    /// `ExitStatus::code()`, so it is `None` when the process produced no
    /// exit code (e.g. it was terminated by a signal on Unix).
    #[error("extension {name:?} exited with status {status:?}")]
    ExtensionFailed { name: String, status: Option<i32> },
    /// An extension's output (or the stored object's content) didn't
    /// hash to the OID recorded in the pointer. Either the extension
    /// is non-deterministic, the on-disk object is corrupt, or the
    /// extension is the wrong implementation for what cleaned the file.
    /// `stage` is a human-readable label for the step that failed.
    #[error("OID mismatch for {stage}: expected {expected}, got {actual}")]
    OidMismatch {
        stage: String,
        expected: Oid,
        actual: Oid,
    },
}
84
85/// Apply the smudge filter to `input`, writing the working-tree content
86/// (or pass-through bytes) to `output`.
87///
88/// 1. If `input` parses as a pointer, look the OID up in the store and
89///    stream the bytes out (running configured pointer extensions in
90///    reverse priority order when the pointer carries any).
91/// 2. If `input` doesn't parse as a pointer, pass it through verbatim.
92///
93/// `path` is the working-tree path passed to git's filter; substituted
94/// for `%f` in each extension's smudge command. `extensions` is the
95/// configured `lfs.extension.<name>` set; its order doesn't matter
96/// (the chain is built from `pointer.extensions` in priority order).
97pub fn smudge<R: Read, W: Write>(
98    store: &Store,
99    input: &mut R,
100    output: &mut W,
101    path: &str,
102    extensions: &[SmudgeExtension],
103) -> Result<SmudgeOutcome, SmudgeError> {
104    let (head, maybe_pointer) = detect_pointer(input)?;
105
106    let Some(pointer) = maybe_pointer else {
107        // Not a pointer: pass bytes through unchanged.
108        output.write_all(&head)?;
109        io::copy(input, output)?;
110        return Ok(SmudgeOutcome::Passthrough);
111    };
112
113    if pointer.is_empty() {
114        return Ok(SmudgeOutcome::Resolved(pointer));
115    }
116
117    // Treat any size mismatch as "missing": same OID + different size means
118    // a corrupt or partial local copy, and the recovery path is the same
119    // as a real miss — re-download.
120    if !store.contains_with_size(pointer.oid, pointer.size) {
121        return Err(SmudgeError::ObjectMissing(pointer));
122    }
123
124    smudge_object_to(store, &pointer, output, path, extensions, None)?;
125    Ok(SmudgeOutcome::Resolved(pointer))
126}
127
128/// Like [`smudge`], but on a missing-object miss invokes `fetch` to populate
129/// the store, then streams the freshly-fetched bytes to `output`.
130///
131/// `fetch` receives the [`Pointer`] of the missing object — the caller is
132/// expected to download exactly that OID into the local store. After a
133/// successful return, this function re-checks the store and streams the
134/// content; if the store *still* doesn't have the object, an
135/// [`SmudgeError::ObjectMissing`] is surfaced (i.e. the fetch lied).
136pub fn smudge_with_fetch<R, W, F>(
137    store: &Store,
138    input: &mut R,
139    output: &mut W,
140    path: &str,
141    extensions: &[SmudgeExtension],
142    mut fetch: F,
143) -> Result<SmudgeOutcome, SmudgeError>
144where
145    R: Read,
146    W: Write,
147    F: FnMut(&Pointer) -> Result<(), FetchError>,
148{
149    match smudge(store, input, output, path, extensions) {
150        Err(SmudgeError::ObjectMissing(pointer)) => {
151            fetch(&pointer).map_err(SmudgeError::FetchFailed)?;
152            if !store.contains_with_size(pointer.oid, pointer.size) {
153                return Err(SmudgeError::ObjectMissing(pointer));
154            }
155            smudge_object_to(store, &pointer, output, path, extensions, None)?;
156            Ok(SmudgeOutcome::Resolved(pointer))
157        }
158        other => other,
159    }
160}
161
162/// Stream the working-tree content for an already-parsed `pointer` to
163/// `output`. Used by `pull` and `checkout`, which have the pointer in
164/// hand from the index walk. `spawn_cwd` is the working directory each
165/// extension subprocess runs from — pass `Some(work_tree_root)` from
166/// pull/checkout (so a `git lfs pull` invoked from a subdirectory still
167/// finds `.git/`); the smudge filter (called by git from the work-tree
168/// root) can pass `None` to inherit the parent's cwd.
169///
170/// The caller must have already verified `store.contains_with_size`;
171/// this function won't fetch.
172pub fn smudge_object_to<W: Write>(
173    store: &Store,
174    pointer: &Pointer,
175    output: &mut W,
176    path: &str,
177    extensions: &[SmudgeExtension],
178    spawn_cwd: Option<&Path>,
179) -> Result<(), SmudgeError> {
180    if pointer.extensions.is_empty() {
181        let mut file = store.open(pointer.oid)?;
182        io::copy(&mut file, output)?;
183        return Ok(());
184    }
185    apply_smudge_chain(store, pointer, output, path, extensions, spawn_cwd)
186}
187
188fn apply_smudge_chain<W: Write>(
189    store: &Store,
190    pointer: &Pointer,
191    output: &mut W,
192    path: &str,
193    extensions: &[SmudgeExtension],
194    spawn_cwd: Option<&Path>,
195) -> Result<(), SmudgeError> {
196    // Match each pointer extension to its registered config by name.
197    // Walk in *reverse* priority order — clean ran ext0 → ext1 → store;
198    // smudge undoes that with ext1 → ext0 → working tree.
199    let mut chain: Vec<(&SmudgeExtension, Oid)> = Vec::with_capacity(pointer.extensions.len());
200    for ptr_ext in &pointer.extensions {
201        let registered = extensions
202            .iter()
203            .find(|e| e.name == ptr_ext.name)
204            .ok_or_else(|| SmudgeError::ExtensionNotConfigured {
205                name: ptr_ext.name.clone(),
206            })?;
207        if registered.command.trim().is_empty() {
208            return Err(SmudgeError::ExtensionMissingCommand {
209                name: registered.name.clone(),
210            });
211        }
212        chain.push((registered, ptr_ext.oid));
213    }
214    chain.reverse();
215
216    let tmp_dir = store.tmp_dir();
217    fs::create_dir_all(&tmp_dir)?;
218
219    // Stage 0: copy the stored object into a tmp file. Verify the
220    // input hash equals `pointer.oid` — should always hold (the store
221    // is content-addressed) but a corrupt object would otherwise
222    // surface as a confusing extension-output mismatch later on.
223    let mut current_tmp = NamedTempFile::new_in(&tmp_dir)?;
224    let mut store_file = store.open(pointer.oid)?;
225    let initial_oid = hash_and_write(&mut store_file, current_tmp.as_file_mut())?;
226    if initial_oid != pointer.oid {
227        return Err(SmudgeError::OidMismatch {
228            stage: format!("stored object {}", pointer.oid),
229            expected: pointer.oid,
230            actual: initial_oid,
231        });
232    }
233
234    for (i, (ext, expected_out_oid)) in chain.iter().enumerate() {
235        let cmd_str = ext.command.replace("%f", path);
236        let mut parts = cmd_str.split_whitespace();
237        let prog = parts
238            .next()
239            .ok_or_else(|| SmudgeError::ExtensionMissingCommand {
240                name: ext.name.clone(),
241            })?;
242        let args: Vec<&str> = parts.collect();
243
244        let stdin_file = std::fs::File::open(current_tmp.path())?;
245        let mut command = Command::new(prog);
246        command
247            .args(&args)
248            .stdin(stdin_file)
249            .stdout(Stdio::piped())
250            .stderr(Stdio::inherit());
251        if let Some(dir) = spawn_cwd {
252            command.current_dir(dir);
253        }
254        let mut child = command
255            .spawn()
256            .map_err(|e| SmudgeError::ExtensionSpawnFailed {
257                name: ext.name.clone(),
258                source: e,
259            })?;
260        let mut stdout = child.stdout.take().expect("piped stdout");
261
262        let is_last = i + 1 == chain.len();
263        if is_last {
264            let actual_oid = hash_and_copy(&mut stdout, output)?;
265            let status = child.wait()?;
266            if !status.success() {
267                return Err(SmudgeError::ExtensionFailed {
268                    name: ext.name.clone(),
269                    status: status.code(),
270                });
271            }
272            if actual_oid != *expected_out_oid {
273                return Err(SmudgeError::OidMismatch {
274                    stage: format!("smudge output of extension {:?}", ext.name),
275                    expected: *expected_out_oid,
276                    actual: actual_oid,
277                });
278            }
279            return Ok(());
280        }
281
282        let mut next_tmp = NamedTempFile::new_in(&tmp_dir)?;
283        let actual_oid = hash_and_write(&mut stdout, next_tmp.as_file_mut())?;
284        let status = child.wait()?;
285        if !status.success() {
286            return Err(SmudgeError::ExtensionFailed {
287                name: ext.name.clone(),
288                status: status.code(),
289            });
290        }
291        if actual_oid != *expected_out_oid {
292            return Err(SmudgeError::OidMismatch {
293                stage: format!("smudge output of extension {:?}", ext.name),
294                expected: *expected_out_oid,
295                actual: actual_oid,
296            });
297        }
298        current_tmp = next_tmp;
299    }
300    unreachable!("smudge chain exited without writing output")
301}
302
303fn hash_and_write<R: Read>(src: &mut R, dst: &mut std::fs::File) -> io::Result<Oid> {
304    let mut hasher = Sha256::new();
305    let mut buf = vec![0u8; COPY_BUFFER];
306    loop {
307        let n = src.read(&mut buf)?;
308        if n == 0 {
309            break;
310        }
311        hasher.update(&buf[..n]);
312        dst.write_all(&buf[..n])?;
313    }
314    dst.flush()?;
315    let bytes: [u8; 32] = hasher.finalize().into();
316    Ok(Oid::from_bytes(bytes))
317}
318
319fn hash_and_copy<R: Read, W: Write>(src: &mut R, dst: &mut W) -> io::Result<Oid> {
320    let mut hasher = Sha256::new();
321    let mut buf = vec![0u8; COPY_BUFFER];
322    loop {
323        let n = src.read(&mut buf)?;
324        if n == 0 {
325            break;
326        }
327        hasher.update(&buf[..n]);
328        dst.write_all(&buf[..n])?;
329    }
330    let bytes: [u8; 32] = hasher.finalize().into();
331    Ok(Oid::from_bytes(bytes))
332}
333
#[cfg(test)]
mod tests {
    use super::*;
    use crate::clean;
    use git_lfs_pointer::VERSION_LATEST;
    use tempfile::TempDir;

    /// Fresh store rooted in a temp dir. The `TempDir` must be kept alive
    /// for as long as the store is used.
    fn fixture() -> (TempDir, Store) {
        let tmp = TempDir::new().unwrap();
        let store = Store::new(tmp.path().join("lfs"));
        (tmp, store)
    }

    /// Smudge `input` with no path and no extensions; returns the outcome
    /// together with everything written to the output.
    fn run(store: &Store, input: &[u8]) -> (Result<SmudgeOutcome, SmudgeError>, Vec<u8>) {
        let mut out = Vec::new();
        let outcome = smudge(store, &mut { input }, &mut out, "", &[]);
        (outcome, out)
    }

    /// Insert content via the clean filter and return the resulting pointer text.
    fn clean_into(store: &Store, content: &[u8]) -> Vec<u8> {
        let mut out = Vec::new();
        clean(store, &mut { content }, &mut out, "", &[]).unwrap();
        out
    }

    // ---------- Resolved ----------

    #[test]
    fn pointer_resolves_from_store() {
        let (_t, store) = fixture();
        let content = b"smudge a\n";
        let pointer_text = clean_into(&store, content);

        let (outcome, out) = run(&store, &pointer_text);
        let p = match outcome.unwrap() {
            SmudgeOutcome::Resolved(p) => p,
            o => panic!("expected Resolved, got {o:?}"),
        };
        assert_eq!(p.size, content.len() as u64);
        assert_eq!(out, content);
    }

    #[test]
    fn empty_pointer_writes_nothing() {
        let (_t, store) = fixture();
        let (outcome, out) = run(&store, b"");
        match outcome.unwrap() {
            SmudgeOutcome::Resolved(p) => assert!(p.is_empty()),
            o => panic!("expected Resolved(empty), got {o:?}"),
        }
        assert!(out.is_empty());
    }

    #[test]
    fn clean_smudge_round_trip_preserves_bytes() {
        let (_t, store) = fixture();
        for content in [
            &b""[..],
            &b"hello"[..],
            &b"binary \x00\x01\xff data"[..],
            &(0..256u16).map(|i| i as u8).collect::<Vec<_>>(),
        ] {
            let pointer_text = clean_into(&store, content);
            let mut out = Vec::new();
            smudge(&store, &mut { &pointer_text[..] }, &mut out, "", &[]).unwrap();
            assert_eq!(out, content, "round-trip failed for {content:?}");
        }
    }

    // ---------- Passthrough ----------

    #[test]
    fn invalid_pointer_passes_through_short() {
        let (_t, store) = fixture();
        for input in [&b"wat"[..], b"not a git-lfs file", b"version "] {
            let (outcome, out) = run(&store, input);
            assert!(matches!(outcome.unwrap(), SmudgeOutcome::Passthrough));
            assert_eq!(out, input);
        }
    }

    #[test]
    fn long_non_pointer_passes_through() {
        // > MAX_POINTER_SIZE bytes — exercises the head buffer + io::copy path.
        let (_t, store) = fixture();
        let content: Vec<u8> = (0..2048u32).map(|i| (i ^ (i >> 3)) as u8).collect();
        let (outcome, out) = run(&store, &content);
        assert!(matches!(outcome.unwrap(), SmudgeOutcome::Passthrough));
        assert_eq!(out, content);
    }

    // ---------- Errors ----------

    #[test]
    fn missing_object_errors() {
        let (_t, store) = fixture();
        let unknown_oid = "0000000000000000000000000000000000000000000000000000000000000001";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{unknown_oid}\nsize 5\n");
        let (outcome, out) = run(&store, pointer_text.as_bytes());
        match outcome.unwrap_err() {
            SmudgeError::ObjectMissing(pointer) => {
                assert_eq!(pointer.oid.to_string(), unknown_oid);
                assert_eq!(pointer.size, 5);
            }
            e => panic!("expected ObjectMissing, got {e:?}"),
        }
        assert!(out.is_empty(), "no partial output on miss");
    }

    #[test]
    fn size_mismatch_treated_as_missing() {
        let (_t, store) = fixture();
        let pointer_text = clean_into(&store, b"abc"); // size = 3
        // Replace "size 3" with "size 99" — parses fine, but won't match the
        // 3-byte object on disk.
        let tampered = String::from_utf8(pointer_text)
            .unwrap()
            .replace("size 3", "size 99");
        let (outcome, _) = run(&store, tampered.as_bytes());
        match outcome.unwrap_err() {
            SmudgeError::ObjectMissing(p) => assert_eq!(p.size, 99),
            e => panic!("expected ObjectMissing, got {e:?}"),
        }
    }

    // ---------- smudge_with_fetch ----------

    #[test]
    fn fetch_populates_store_then_streams() {
        let (_t, store) = fixture();
        let content = b"to be fetched\n";
        // Clean is the easiest way to obtain valid pointer text, but it
        // also inserts the object — so delete it again right after. From
        // the smudge's perspective the store is then "empty", and the
        // fetch closure is the one that actually populates it.
        let pointer_text = clean_into(&store, content);
        // Wipe the just-inserted object to simulate a true miss.
        let parsed = git_lfs_pointer::Pointer::parse(&pointer_text).unwrap();
        std::fs::remove_file(store.object_path(parsed.oid)).unwrap();
        assert!(!store.contains(parsed.oid));

        let mut out = Vec::new();
        let store_ref = &store;
        let outcome = smudge_with_fetch(
            &store,
            &mut { &pointer_text[..] },
            &mut out,
            "",
            &[],
            |p: &Pointer| {
                // "Download" by inserting the bytes synchronously.
                store_ref.insert(&mut { &content[..] }).unwrap();
                assert_eq!(p.size, content.len() as u64);
                Ok(())
            },
        );
        assert!(matches!(outcome.unwrap(), SmudgeOutcome::Resolved(_)));
        assert_eq!(out, content);
    }

    #[test]
    fn fetch_failure_surfaces_as_fetch_failed() {
        let (_t, store) = fixture();
        let unknown = "0000000000000000000000000000000000000000000000000000000000000001";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{unknown}\nsize 5\n");
        let mut out = Vec::new();
        let outcome = smudge_with_fetch(
            &store,
            &mut { pointer_text.as_bytes() },
            &mut out,
            "",
            &[],
            |_p: &Pointer| Err("server is on fire".into()),
        );
        match outcome.unwrap_err() {
            SmudgeError::FetchFailed(e) => {
                assert!(e.to_string().contains("server is on fire"));
            }
            other => panic!("expected FetchFailed, got {other:?}"),
        }
        assert!(out.is_empty());
    }

    #[test]
    fn fetch_returning_ok_but_not_inserting_still_errors() {
        // Closure lies — claims success but didn't populate the store.
        let (_t, store) = fixture();
        let unknown = "0000000000000000000000000000000000000000000000000000000000000001";
        let pointer_text = format!("version {VERSION_LATEST}\noid sha256:{unknown}\nsize 5\n");
        let mut out = Vec::new();
        let outcome = smudge_with_fetch(
            &store,
            &mut { pointer_text.as_bytes() },
            &mut out,
            "",
            &[],
            |_p: &Pointer| Ok(()),
        );
        assert!(matches!(
            outcome.unwrap_err(),
            SmudgeError::ObjectMissing(_)
        ));
    }

    #[test]
    fn fetch_not_invoked_when_object_already_present() {
        let (_t, store) = fixture();
        let content = b"already here";
        let pointer_text = clean_into(&store, content);
        let mut out = Vec::new();
        let mut calls = 0;
        smudge_with_fetch(
            &store,
            &mut { &pointer_text[..] },
            &mut out,
            "",
            &[],
            |_p: &Pointer| {
                calls += 1;
                Ok(())
            },
        )
        .unwrap();
        assert_eq!(
            calls, 0,
            "fetch must not be called when store has the object"
        );
        assert_eq!(out, content);
    }

    // ---------- Extensions ----------

    /// Round-trip clean → smudge through `tr a-z A-Z` (the lower-case-
    /// inverter stand-in we use for cli tests too). Verifies the chained
    /// subprocess + OID bookkeeping. The upstream Go tests exercise the
    /// case-inverter end-to-end — this is the unit-level analog.
    #[test]
    fn single_extension_round_trips() {
        let (_t, store) = fixture();
        let clean_exts = vec![crate::CleanExtension {
            name: "upper".into(),
            priority: 0,
            command: "tr a-z A-Z".into(),
        }];
        let smudge_exts = vec![SmudgeExtension {
            name: "upper".into(),
            priority: 0,
            command: "tr A-Z a-z".into(),
        }];

        // Clean "abc" → store "ABC", pointer with ext-0-upper.
        let mut pointer_buf = Vec::new();
        crate::clean(
            &store,
            &mut &b"abc"[..],
            &mut pointer_buf,
            "foo.txt",
            &clean_exts,
        )
        .unwrap();

        // Smudge that pointer back through the extension chain → "abc".
        let mut out = Vec::new();
        let outcome = smudge(
            &store,
            &mut pointer_buf.as_slice(),
            &mut out,
            "foo.txt",
            &smudge_exts,
        )
        .unwrap();
        assert!(matches!(outcome, SmudgeOutcome::Resolved(_)));
        assert_eq!(out, b"abc");
    }

    /// A pointer naming an extension that has no configured smudge command
    /// must fail with `ExtensionNotConfigured` before any output is written.
    #[test]
    fn extension_not_configured_errors() {
        let (_t, store) = fixture();
        // Put a real object into the store first: otherwise the lookup
        // fails with ObjectMissing and the extension chain is never built.
        let plain_pointer_text = clean_into(&store, b"payload");
        let stored = Pointer::parse(&plain_pointer_text).unwrap();

        let ext_oid = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff";
        let pointer_text = format!(
            "version {VERSION_LATEST}\n\
             ext-0-foo sha256:{ext_oid}\n\
             oid sha256:{}\n\
             size {}\n",
            stored.oid, stored.size,
        );
        let mut out = Vec::new();
        let err = smudge(&store, &mut pointer_text.as_bytes(), &mut out, "x", &[]).unwrap_err();
        match err {
            SmudgeError::ExtensionNotConfigured { name } => assert_eq!(name, "foo"),
            other => panic!("expected ExtensionNotConfigured, got {other:?}"),
        }
        assert!(out.is_empty(), "no output before the chain is validated");
    }
}