Skip to main content

maw_lfs/
pointer.rs

1//! LFS pointer format v1 codec.
2//!
3//! Spec: <https://github.com/git-lfs/git-lfs/blob/main/docs/spec.md>
4//!
5//! Canonical form:
6//!
7//! ```text
8//! version https://git-lfs.github.com/spec/v1
9//! oid sha256:<64-char-lowercase-hex>
10//! size <decimal-bytes>
11//! ```
12//!
13//! Rules enforced:
14//! - `version` line is always first.
//! - All other keys are sorted alphabetically on write; `parse` accepts them in any order.
16//! - Each line ends with LF (0x0A), including the final line.
17//! - ASCII only; CRLF rejected.
18//! - Max pointer size: 1024 bytes (spec recommends rejecting larger inputs).
19
20use thiserror::Error;
21
/// Version URL of the only pointer format this module accepts (the value of
/// the `version` key).
const VERSION_URL: &str = "https://git-lfs.github.com/spec/v1";
/// Largest input, in bytes, that `Pointer::parse` and `looks_like_pointer`
/// will consider; anything longer is rejected without further inspection.
const MAX_POINTER_BYTES: usize = 1024;
/// The complete first line of a v1 pointer, including its trailing LF.
/// This spells out `VERSION_URL` again and must be kept in sync with it.
const VERSION_PREFIX: &[u8] = b"version https://git-lfs.github.com/spec/v1\n";
25
/// A parsed LFS pointer. Represents the content of a git blob that stands in
/// for a real binary file.
///
/// Built by `Pointer::parse` and turned back into bytes by `Pointer::write`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Pointer {
    /// sha256 of the real file content, as the raw 32-byte digest (the pointer
    /// text carries it as 64 hex characters).
    pub oid: [u8; 32],
    /// Size of the real file, in bytes.
    pub size: u64,
    /// Unknown keys preserved for forward compatibility. Round-tripped on write.
    /// Each entry is a `(key, value)` pair stored exactly as parsed, in input
    /// order; no case normalization is applied to keys or values.
    pub extensions: Vec<(String, String)>,
}
38
/// Reasons `Pointer::parse` can reject its input.
#[derive(Debug, Error, PartialEq, Eq)]
pub enum ParseError {
    /// The input was zero bytes long.
    #[error("pointer is empty")]
    Empty,
    /// The input exceeded `MAX_POINTER_BYTES`; carries the actual length.
    #[error("pointer too large: {0} bytes (max {MAX_POINTER_BYTES})")]
    TooLarge(usize),
    /// The first line does not start with `version `. Also returned when the
    /// input lacks a final LF or a later line has no space separator.
    #[error("missing or invalid version line")]
    BadVersion,
    /// The `version` line is well-formed but names a URL other than the
    /// supported one; `found` is the URL that was read.
    #[error("unsupported pointer version: {found}")]
    UnsupportedVersion { found: String },
    /// No `oid` line was present, or its value is not `sha256:` followed by
    /// exactly 64 lowercase hex characters.
    #[error("missing or invalid oid line")]
    BadOid,
    /// No `size` line was present, or its value could not be parsed as a `u64`.
    #[error("missing or invalid size line")]
    BadSize,
    /// The input contains at least one non-ASCII byte.
    #[error("non-ASCII bytes in pointer")]
    NonAscii,
    /// A key appeared on more than one line; carries the repeated key.
    #[error("duplicate key: {0}")]
    DuplicateKey(String),
    /// The input contains a CR (`\r`) byte anywhere, not only as part of CRLF.
    #[error("CRLF line endings not allowed")]
    CrlfLineEndings,
}
60
61impl Pointer {
62    /// Parse an LFS pointer from canonical pointer bytes.
63    ///
64    /// # Errors
65    /// Returns a [`ParseError`] if the bytes are too large, non-ASCII, use
66    /// unsupported line endings, or do not contain a valid version, oid, and
67    /// size.
68    pub fn parse(bytes: &[u8]) -> Result<Self, ParseError> {
69        if bytes.is_empty() {
70            return Err(ParseError::Empty);
71        }
72        if bytes.len() > MAX_POINTER_BYTES {
73            return Err(ParseError::TooLarge(bytes.len()));
74        }
75        if !bytes.is_ascii() {
76            return Err(ParseError::NonAscii);
77        }
78        if bytes.contains(&b'\r') {
79            return Err(ParseError::CrlfLineEndings);
80        }
81        // Every line, including the last, must terminate in LF.
82        if !bytes.ends_with(b"\n") {
83            return Err(ParseError::BadVersion);
84        }
85
86        // SAFETY: we verified is_ascii() above.
87        let text = std::str::from_utf8(bytes).map_err(|_| ParseError::NonAscii)?;
88
89        let mut lines = text.split('\n');
90        // split on '\n' with trailing '\n' yields a trailing empty element.
91        let version_line = lines.next().ok_or(ParseError::BadVersion)?;
92
93        // Version line is exactly "version <URL>".
94        let version_value = version_line
95            .strip_prefix("version ")
96            .ok_or(ParseError::BadVersion)?;
97        if version_value != VERSION_URL {
98            return Err(ParseError::UnsupportedVersion {
99                found: version_value.to_owned(),
100            });
101        }
102
103        let mut oid: Option<[u8; 32]> = None;
104        let mut size: Option<u64> = None;
105        let mut extensions: Vec<(String, String)> = Vec::new();
106        let mut seen_keys: Vec<String> = Vec::new();
107
108        for line in lines {
109            if line.is_empty() {
110                continue; // trailing empty from split
111            }
112            // "key value" — exactly one space separator.
113            let (key, value) = line.split_once(' ').ok_or(ParseError::BadVersion)?;
114            if seen_keys.iter().any(|k| k == key) {
115                return Err(ParseError::DuplicateKey(key.to_owned()));
116            }
117            seen_keys.push(key.to_owned());
118
119            match key {
120                "oid" => {
121                    let hex = value.strip_prefix("sha256:").ok_or(ParseError::BadOid)?;
122                    if hex.len() != 64 {
123                        return Err(ParseError::BadOid);
124                    }
125                    let mut bytes = [0u8; 32];
126                    for (i, byte) in bytes.iter_mut().enumerate() {
127                        let hi = hex_digit(hex.as_bytes()[i * 2]).ok_or(ParseError::BadOid)?;
128                        let lo = hex_digit(hex.as_bytes()[i * 2 + 1]).ok_or(ParseError::BadOid)?;
129                        // Reject uppercase hex (spec says lowercase).
130                        if hex.as_bytes()[i * 2].is_ascii_uppercase()
131                            || hex.as_bytes()[i * 2 + 1].is_ascii_uppercase()
132                        {
133                            return Err(ParseError::BadOid);
134                        }
135                        *byte = (hi << 4) | lo;
136                    }
137                    oid = Some(bytes);
138                }
139                "size" => {
140                    let n: u64 = value.parse().map_err(|_| ParseError::BadSize)?;
141                    size = Some(n);
142                }
143                _ => {
144                    extensions.push((key.to_owned(), value.to_owned()));
145                }
146            }
147        }
148
149        let oid = oid.ok_or(ParseError::BadOid)?;
150        let size = size.ok_or(ParseError::BadSize)?;
151
152        Ok(Self {
153            oid,
154            size,
155            extensions,
156        })
157    }
158
159    /// Serialize this pointer in canonical Git LFS pointer format.
160    #[must_use]
161    pub fn write(&self) -> Vec<u8> {
162        // Version always first; all other keys sorted alphabetically.
163        // Known keys: oid, size. Unknown: extensions. Merge-sort them all.
164        let mut keyed: Vec<(String, String)> = Vec::with_capacity(2 + self.extensions.len());
165        keyed.push(("oid".to_owned(), format!("sha256:{}", self.oid_hex())));
166        keyed.push(("size".to_owned(), self.size.to_string()));
167        for (k, v) in &self.extensions {
168            keyed.push((k.clone(), v.clone()));
169        }
170        keyed.sort_by(|a, b| a.0.cmp(&b.0));
171
172        let mut out = String::with_capacity(
173            VERSION_PREFIX.len()
174                + keyed
175                    .iter()
176                    .map(|(k, v)| k.len() + v.len() + 2)
177                    .sum::<usize>(),
178        );
179        out.push_str("version ");
180        out.push_str(VERSION_URL);
181        out.push('\n');
182        for (k, v) in &keyed {
183            out.push_str(k);
184            out.push(' ');
185            out.push_str(v);
186            out.push('\n');
187        }
188        out.into_bytes()
189    }
190
191    #[must_use]
192    pub fn oid_hex(&self) -> String {
193        let mut s = String::with_capacity(64);
194        for byte in &self.oid {
195            s.push(hex_char(byte >> 4));
196            s.push(hex_char(byte & 0x0f));
197        }
198        s
199    }
200}
201
/// Numeric value (0-15) of one ASCII hexadecimal digit, in either letter
/// case; `None` for every other byte.
const fn hex_digit(b: u8) -> Option<u8> {
    // Pick the byte that maps onto zero for the range `b` falls in, then
    // subtract it once at the end.
    let zero_point = match b {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(b - zero_point)
}
210
/// Lowercase hexadecimal character for a nibble in `0..=15`.
///
/// Any larger value is a caller bug and hits `unreachable!()`.
fn hex_char(n: u8) -> char {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    match DIGITS.get(usize::from(n)) {
        Some(&digit) => char::from(digit),
        None => unreachable!(),
    }
}
218
219/// Fast check: does this byte slice look like an LFS pointer?
220///
221/// Used to short-circuit blob inspection before a full parse.
222#[must_use]
223pub fn looks_like_pointer(bytes: &[u8]) -> bool {
224    bytes.len() <= MAX_POINTER_BYTES && bytes.starts_with(VERSION_PREFIX)
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    const SAMPLE_OID_HEX: &str = "4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393";
    const SAMPLE_SIZE: u64 = 12345;

    /// Raw digest bytes corresponding to `SAMPLE_OID_HEX`.
    fn sample_oid() -> [u8; 32] {
        let digits = SAMPLE_OID_HEX.as_bytes();
        let mut raw = [0u8; 32];
        for (slot, pair) in raw.iter_mut().zip(digits.chunks_exact(2)) {
            let hi = hex_digit(pair[0]).expect("operation should succeed");
            let lo = hex_digit(pair[1]).expect("operation should succeed");
            *slot = (hi << 4) | lo;
        }
        raw
    }

    /// The canonical pointer text for the sample oid and size.
    fn sample_pointer_bytes() -> Vec<u8> {
        let text = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{SAMPLE_OID_HEX}\nsize {SAMPLE_SIZE}\n"
        );
        text.into_bytes()
    }

    #[test]
    fn roundtrip_canonical_pointer() {
        let canonical = sample_pointer_bytes();
        let parsed = Pointer::parse(&canonical).expect("operation should succeed");

        assert_eq!(parsed.oid, sample_oid());
        assert_eq!(parsed.size, SAMPLE_SIZE);
        assert!(parsed.extensions.is_empty());
        assert_eq!(parsed.oid_hex(), SAMPLE_OID_HEX);
        assert_eq!(parsed.write(), canonical);
    }

    #[test]
    fn parse_keys_in_any_order_after_version() {
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\nsize {SAMPLE_SIZE}\noid sha256:{SAMPLE_OID_HEX}\n"
        );
        let parsed = Pointer::parse(input.as_bytes()).expect("operation should succeed");
        assert_eq!(parsed.size, SAMPLE_SIZE);

        // Serialization re-orders the keys alphabetically: oid, then size.
        let written = parsed.write();
        let mut lines = std::str::from_utf8(&written)
            .expect("operation should succeed")
            .lines();
        assert_eq!(
            lines.next(),
            Some("version https://git-lfs.github.com/spec/v1")
        );
        assert!(matches!(lines.next(), Some(line) if line.starts_with("oid ")));
        assert!(matches!(lines.next(), Some(line) if line.starts_with("size ")));
    }

    #[test]
    fn empty_input_rejected() {
        let outcome = Pointer::parse(b"");
        assert_eq!(outcome, Err(ParseError::Empty));
    }

    #[test]
    fn too_large_rejected() {
        let oversized = [b'a'; MAX_POINTER_BYTES + 1];
        let outcome = Pointer::parse(&oversized);
        assert!(matches!(outcome, Err(ParseError::TooLarge(_))));
    }

    #[test]
    fn non_ascii_rejected() {
        let input = b"version https://git-lfs.github.com/spec/v1\nsize 1\noid sha256:\xff\n";
        let outcome = Pointer::parse(input);
        assert_eq!(outcome, Err(ParseError::NonAscii));
    }

    #[test]
    fn crlf_rejected() {
        let input = b"version https://git-lfs.github.com/spec/v1\r\nsize 1\r\n";
        let outcome = Pointer::parse(input);
        assert_eq!(outcome, Err(ParseError::CrlfLineEndings));
    }

    #[test]
    fn missing_trailing_newline_rejected() {
        // Otherwise valid, but the last line is not LF-terminated.
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{SAMPLE_OID_HEX}\nsize {SAMPLE_SIZE}"
        );
        let outcome = Pointer::parse(input.as_bytes());
        assert!(outcome.is_err());
    }

    #[test]
    fn bad_version_url_rejected() {
        let input = b"version https://example.com/v99\noid sha256:0\nsize 1\n";
        let outcome = Pointer::parse(input);
        assert!(matches!(
            outcome,
            Err(ParseError::UnsupportedVersion { .. })
        ));
    }

    #[test]
    fn missing_version_rejected() {
        let input = format!("oid sha256:{SAMPLE_OID_HEX}\nsize {SAMPLE_SIZE}\n");
        let outcome = Pointer::parse(input.as_bytes());
        assert_eq!(outcome, Err(ParseError::BadVersion));
    }

    #[test]
    fn uppercase_hex_rejected() {
        let upper: String = SAMPLE_OID_HEX.to_ascii_uppercase();
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{upper}\nsize {SAMPLE_SIZE}\n"
        );
        let outcome = Pointer::parse(input.as_bytes());
        assert_eq!(outcome, Err(ParseError::BadOid));
    }

    #[test]
    fn short_oid_rejected() {
        let input = b"version https://git-lfs.github.com/spec/v1\noid sha256:abc\nsize 1\n";
        let outcome = Pointer::parse(input);
        assert_eq!(outcome, Err(ParseError::BadOid));
    }

    #[test]
    fn non_numeric_size_rejected() {
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{SAMPLE_OID_HEX}\nsize notanumber\n"
        );
        let outcome = Pointer::parse(input.as_bytes());
        assert_eq!(outcome, Err(ParseError::BadSize));
    }

    #[test]
    fn missing_oid_rejected() {
        let input = b"version https://git-lfs.github.com/spec/v1\nsize 1\n";
        let outcome = Pointer::parse(input);
        assert_eq!(outcome, Err(ParseError::BadOid));
    }

    #[test]
    fn missing_size_rejected() {
        let input =
            format!("version https://git-lfs.github.com/spec/v1\noid sha256:{SAMPLE_OID_HEX}\n");
        let outcome = Pointer::parse(input.as_bytes());
        assert_eq!(outcome, Err(ParseError::BadSize));
    }

    #[test]
    fn duplicate_key_rejected() {
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{SAMPLE_OID_HEX}\noid sha256:{SAMPLE_OID_HEX}\nsize 1\n"
        );
        let outcome = Pointer::parse(input.as_bytes());
        assert!(matches!(outcome, Err(ParseError::DuplicateKey(_))));
    }

    #[test]
    fn extensions_preserved_roundtrip() {
        // An unknown key has to survive parsing and come back out in sorted
        // position among the known keys.
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\nextra value-x\noid sha256:{SAMPLE_OID_HEX}\nsize {SAMPLE_SIZE}\n"
        );
        let parsed = Pointer::parse(input.as_bytes()).expect("operation should succeed");

        let expected_extensions = vec![("extra".to_owned(), "value-x".to_owned())];
        assert_eq!(parsed.extensions, expected_extensions);

        // The input is already canonical, so writing must reproduce it.
        let written = parsed.write();
        assert_eq!(written, input.as_bytes());
    }

    #[test]
    fn looks_like_pointer_positive() {
        let canonical = sample_pointer_bytes();
        assert!(looks_like_pointer(&canonical));
    }

    #[test]
    fn looks_like_pointer_rejects_binary() {
        let to_byte = |i: u16| u8::try_from(i % 256).expect("value reduced below byte range");
        let binary: Vec<u8> = (0..2048u16).map(to_byte).collect();
        assert!(!looks_like_pointer(&binary));
    }

    #[test]
    fn looks_like_pointer_rejects_text_starting_with_version() {
        let input = b"version 2.0 something else\n";
        assert!(!looks_like_pointer(input));
    }

    #[test]
    fn looks_like_pointer_rejects_too_large_even_with_prefix() {
        let padding = MAX_POINTER_BYTES + 1 - VERSION_PREFIX.len();
        let oversized: Vec<u8> = VERSION_PREFIX
            .iter()
            .copied()
            .chain(std::iter::repeat(b'x').take(padding))
            .collect();
        assert!(!looks_like_pointer(&oversized));
    }

    #[test]
    fn size_zero_accepted() {
        // A zero-length file is still valid LFS content.
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{SAMPLE_OID_HEX}\nsize 0\n"
        );
        let parsed = Pointer::parse(input.as_bytes()).expect("operation should succeed");
        assert_eq!(parsed.size, 0);
    }

    #[test]
    fn large_size_accepted() {
        let big = u64::MAX;
        let input = format!(
            "version https://git-lfs.github.com/spec/v1\noid sha256:{SAMPLE_OID_HEX}\nsize {big}\n"
        );
        let parsed = Pointer::parse(input.as_bytes()).expect("operation should succeed");
        assert_eq!(parsed.size, big);
    }
}
442
#[cfg(test)]
mod interop_tests {
    use super::*;

    #[test]
    fn matches_git_lfs_output() {
        // Pointer for the 12-byte file "hello world\n"; the expected text is
        // what git-lfs 3.7.1 emits for it.
        let hex = "a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447";
        let mut digest = [0u8; 32];
        for (slot, pair) in digest.iter_mut().zip(hex.as_bytes().chunks_exact(2)) {
            let hi = hex_digit(pair[0]).expect("operation should succeed");
            let lo = hex_digit(pair[1]).expect("operation should succeed");
            *slot = (hi << 4) | lo;
        }

        let pointer = Pointer {
            oid: digest,
            size: 12,
            extensions: Vec::new(),
        };
        let written = pointer.write();

        let expected = b"version https://git-lfs.github.com/spec/v1\noid sha256:a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447\nsize 12\n";
        assert_eq!(written, expected);
    }
}
466}