Skip to main content

mkit_core/
delta.rs

1//! Delta instruction stream — implements the versioned format required
2//! by `docs/SPEC-DELTA.md`.
3//!
4//! Stream layout (SPEC-DELTA §2):
5//!
6//! ```text
7//! [u8  stream_version == 0x01]
8//! [u32 LE base_len]            length of the base object
9//! [u32 LE result_len]          expected length of the reconstruction
10//! [instructions ...]           sequence of opcodes
11//! ```
12//!
13//! Two opcodes (SPEC-DELTA §3):
14//!
15//! * `0x80` (COPY): `[opcode][u32 LE offset][u16 LE length]` = 7 bytes.
16//!   `length` MUST be `>= 1` and `offset + length <= base_len`. The
17//!   remaining 7 low bits of the opcode are reserved and MUST be 0 in v1.
18//! * `0x01..=0x7F` (INSERT): the opcode byte is the literal length (1..127),
19//!   followed by that many literal bytes. Long literals are split into
20//!   multiple INSERTs; there is no extended-length form in v1.
21//!
22//! `0x00` is **reserved** and MUST be rejected.
23
24use crate::object::MkitError;
25
26/// Version byte at offset 0 of every v1 delta stream.
27pub const STREAM_VERSION: u8 = 0x01;
28/// `COPY` opcode — top bit set, low seven bits reserved (must be zero).
29pub const OP_COPY: u8 = 0x80;
30/// Maximum bytes encodable in a single `INSERT` opcode.
31pub const MAX_INSERT_LEN: usize = 127;
32/// Fixed prefix size: 1 byte version + 4 byte `base_len` + 4 byte `result_len`.
33pub const HEADER_LEN: usize = 1 + 4 + 4;
34
35/// Block size for the writer's hash index. Power of two for cheap
36/// alignment math. Not part of the wire format — readers don't care.
37const BLOCK_SIZE: usize = 16;
38
39/// Multiplier applied to `stream.len()` when bounding the decoder's
40/// initial `Vec::with_capacity`. The worst-case expansion of a COPY op
41/// is 7 bytes of stream → `u16::MAX` output bytes (≈ 9363×), but in
42/// practice stream-sized * 256 dwarfs real deltas and still keeps the
43/// attacker's reach tiny: a 9-byte stream can only request ≤ 2304
44/// bytes of pre-allocation, independent of `base.len()` or the
45/// declared `result_len`. The final `result_len` self-consistency
46/// check still catches inflated payloads.
47pub(crate) const CAP_MULTIPLIER: usize = 256;
48
49/// Compute the decoder's initial capacity hint.
50///
51/// This is a small, explicitly attacker-resistant helper, exposed to
52/// the crate so that the regression test can pin down the bound
53/// without reaching into `decode`. The hint is the smaller of:
54///
55/// * the declared `result_len` (can't exceed declared output), and
56/// * `stream.len() * CAP_MULTIPLIER` (attacker bounded by on-wire size).
57///
58/// Crucially, `base.len()` does NOT appear here — see SEC finding G5.
59#[inline]
60pub(crate) fn compute_cap_hint(result_len: usize, _base_len: usize, stream_len: usize) -> usize {
61    result_len.min(stream_len.saturating_mul(CAP_MULTIPLIER))
62}
63
64/// Build a v1 delta stream that reconstructs `result` from `base`.
65///
66/// The writer is an FNV-1a-on-16-byte-blocks scan. Any conformant
67/// writer is acceptable; this one is greedy. Output is always at least
68/// [`HEADER_LEN`] bytes.
69///
70/// # Errors
71///
72/// Returns [`MkitError::DeltaLengthOverflow`] if either `base.len()`
73/// or `result.len()` exceeds `u32::MAX`. SPEC-PACKFILE caps individual
74/// payloads under this bound, so this is a programmer error rather
75/// than a normal runtime condition — but silently saturating (the
76/// old behaviour) produced a stream that `decode()` rejected with a
77/// confusing "length mismatch" far from the actual source.
78///
79/// # Panics
80///
81/// Panics only on invariant violations in the writer's bookkeeping
82/// (insert-buffer length > 127, match length > `u16::MAX`); both are
83/// guarded above and unreachable for any valid input.
84pub fn encode(base: &[u8], result: &[u8]) -> Result<Vec<u8>, MkitError> {
85    use std::collections::HashMap;
86
87    check_length_bounds(base.len(), result.len())?;
88
89    let mut out = Vec::with_capacity(HEADER_LEN + result.len());
90    write_header(&mut out, base.len(), result.len());
91
92    // Build hash table: hash(block) -> first-seen position. We cap
93    // `base.len()` at `u32::MAX` for COPY offsets — bases over 4 GiB
94    // are out of scope for v1 (SPEC-PACKFILE caps individual payloads).
95    let num_blocks = base.len() / BLOCK_SIZE;
96    let mut index: HashMap<u64, u32> = HashMap::with_capacity(num_blocks);
97    for i in 0..num_blocks {
98        let pos = i * BLOCK_SIZE;
99        if let Ok(pos_u32) = u32::try_from(pos) {
100            let block = &base[pos..pos + BLOCK_SIZE];
101            let h = block_hash(block);
102            index.entry(h).or_insert(pos_u32);
103        } else {
104            break; // base too large for u32 offsets; fall back to all-INSERT
105        }
106    }
107
108    let mut insert_buf: Vec<u8> = Vec::with_capacity(MAX_INSERT_LEN);
109    let mut ti = 0usize;
110    while ti < result.len() {
111        let mut matched = false;
112        if ti + BLOCK_SIZE <= result.len() {
113            let target_block = &result[ti..ti + BLOCK_SIZE];
114            let h = block_hash(target_block);
115            if let Some(&base_pos) = index.get(&h) {
116                let base_pos_usize = base_pos as usize;
117                if &base[base_pos_usize..base_pos_usize + BLOCK_SIZE] == target_block {
118                    flush_insert(&mut out, &mut insert_buf);
119
120                    // Greedy forward extension, capped at u16::MAX.
121                    let mut match_len = BLOCK_SIZE;
122                    while base_pos_usize + match_len < base.len()
123                        && ti + match_len < result.len()
124                        && base[base_pos_usize + match_len] == result[ti + match_len]
125                        && match_len < u16::MAX as usize
126                    {
127                        match_len += 1;
128                    }
129                    // match_len is bounded by u16::MAX above.
130                    emit_copy(
131                        &mut out,
132                        base_pos,
133                        u16::try_from(match_len).expect("<= u16::MAX"),
134                    );
135                    ti += match_len;
136                    matched = true;
137                }
138            }
139        }
140        if !matched {
141            insert_buf.push(result[ti]);
142            ti += 1;
143            if insert_buf.len() == MAX_INSERT_LEN {
144                flush_insert(&mut out, &mut insert_buf);
145            }
146        }
147    }
148    flush_insert(&mut out, &mut insert_buf);
149    Ok(out)
150}
151
152/// Validate that `base_len` and `result_len` both fit in the v1 wire
153/// format's `u32` cap. Extracted as a helper so tests can exercise
154/// the bound without actually allocating multi-gigabyte buffers.
155pub(crate) fn check_length_bounds(base_len: usize, result_len: usize) -> Result<(), MkitError> {
156    if u32::try_from(base_len).is_err() {
157        return Err(MkitError::DeltaLengthOverflow {
158            field: "base_len",
159            len: base_len,
160        });
161    }
162    if u32::try_from(result_len).is_err() {
163        return Err(MkitError::DeltaLengthOverflow {
164            field: "result_len",
165            len: result_len,
166        });
167    }
168    Ok(())
169}
170
171/// Apply a v1 delta stream to `base`, returning the reconstructed bytes.
172/// Verifies header version, base length, COPY bounds, and the final
173/// `result_len`.
174///
175/// # Errors
176///
177/// Returns [`MkitError::UnsupportedObjectVersion`] for stream version
178/// other than `0x01`, [`MkitError::UnexpectedEof`] for truncated input,
179/// and [`MkitError::TrailingData`] for any other corruption (zero
180/// opcode, COPY past base, length mismatch at end-of-stream, reserved
181/// bits set, etc.).
182///
183/// # Panics
184///
185/// Slice-to-fixed-array conversions in this function are guarded by
186/// the preceding bounds checks; the `expect` calls trip only if the
187/// compiler's slice-bounds elision is wrong.
188pub fn decode(base: &[u8], stream: &[u8]) -> Result<Vec<u8>, MkitError> {
189    if stream.len() < HEADER_LEN {
190        return Err(MkitError::UnexpectedEof);
191    }
192    if stream[0] != STREAM_VERSION {
193        return Err(MkitError::UnsupportedObjectVersion);
194    }
195    let base_len = u32::from_le_bytes(stream[1..5].try_into().expect("4 bytes")) as usize;
196    let result_len = u32::from_le_bytes(stream[5..9].try_into().expect("4 bytes")) as usize;
197    if base_len != base.len() {
198        return Err(MkitError::TrailingData);
199    }
200
201    // Bound the pre-allocation against attacker-controlled length fields.
202    // See [`compute_cap_hint`]: the hint is strictly a function of
203    // `stream.len()` (with `result_len` as an upper bound) — `base.len()`
204    // MUST NOT appear, because a 1 GiB base + 9-byte crafted stream
205    // otherwise triggers a ≈ 1 GiB allocation (SEC finding G5). The
206    // final `result_len` equality check below still enforces wire-level
207    // self-consistency.
208    let cap_hint = compute_cap_hint(result_len, base.len(), stream.len());
209    let mut out: Vec<u8> = Vec::with_capacity(cap_hint);
210    let mut pos = HEADER_LEN;
211    while pos < stream.len() {
212        let op = stream[pos];
213        pos += 1;
214        if op & 0x80 != 0 {
215            // COPY. Reserved low seven bits MUST be zero in v1.
216            if op & 0x7F != 0 {
217                return Err(MkitError::TrailingData);
218            }
219            if pos + 6 > stream.len() {
220                return Err(MkitError::UnexpectedEof);
221            }
222            let offset =
223                u32::from_le_bytes(stream[pos..pos + 4].try_into().expect("4 bytes")) as usize;
224            pos += 4;
225            let length =
226                u16::from_le_bytes(stream[pos..pos + 2].try_into().expect("2 bytes")) as usize;
227            pos += 2;
228            if length == 0 {
229                return Err(MkitError::TrailingData);
230            }
231            // Use saturating math: an attacker-controlled offset could
232            // overflow `usize` on 32-bit targets when added to length.
233            let end = offset.checked_add(length).ok_or(MkitError::TrailingData)?;
234            if end > base.len() {
235                return Err(MkitError::TrailingData);
236            }
237            // Don't overshoot the declared result_len.
238            if out.len().checked_add(length).is_none_or(|v| v > result_len) {
239                return Err(MkitError::TrailingData);
240            }
241            out.extend_from_slice(&base[offset..end]);
242        } else if op > 0 {
243            // INSERT. opcode IS the literal length (1..=127).
244            let length = op as usize;
245            if pos + length > stream.len() {
246                return Err(MkitError::UnexpectedEof);
247            }
248            if out.len().checked_add(length).is_none_or(|v| v > result_len) {
249                return Err(MkitError::TrailingData);
250            }
251            out.extend_from_slice(&stream[pos..pos + length]);
252            pos += length;
253        } else {
254            // 0x00 reserved.
255            return Err(MkitError::TrailingData);
256        }
257    }
258    if out.len() != result_len {
259        return Err(MkitError::TrailingData);
260    }
261    Ok(out)
262}
263
264// --- helpers ---
265
266fn write_header(out: &mut Vec<u8>, base_len: usize, result_len: usize) {
267    // `check_length_bounds` has already been called by `encode`, so
268    // both fit in u32. The `expect()`s are invariant-preserving
269    // rather than user-facing: reaching them means the caller
270    // bypassed `encode`.
271    let bl: u32 = u32::try_from(base_len).expect("base_len <= u32::MAX (checked)");
272    let rl: u32 = u32::try_from(result_len).expect("result_len <= u32::MAX (checked)");
273    out.push(STREAM_VERSION);
274    out.extend_from_slice(&bl.to_le_bytes());
275    out.extend_from_slice(&rl.to_le_bytes());
276}
277
278fn emit_copy(out: &mut Vec<u8>, offset: u32, length: u16) {
279    out.push(OP_COPY);
280    out.extend_from_slice(&offset.to_le_bytes());
281    out.extend_from_slice(&length.to_le_bytes());
282}
283
284fn flush_insert(out: &mut Vec<u8>, buf: &mut Vec<u8>) {
285    if buf.is_empty() {
286        return;
287    }
288    debug_assert!(buf.len() <= MAX_INSERT_LEN);
289    out.push(u8::try_from(buf.len()).expect("<= 127"));
290    out.extend_from_slice(buf);
291    buf.clear();
292}
293
294fn block_hash(block: &[u8]) -> u64 {
295    // FNV-1a 64-bit.
296    let mut h: u64 = 0xcbf2_9ce4_8422_2325;
297    for &b in block {
298        h ^= u64::from(b);
299        h = h.wrapping_mul(0x0000_0001_0000_01b3);
300    }
301    h
302}
303
304// =========================================================================
305// Tests
306// =========================================================================
307
308#[cfg(test)]
309mod tests {
310    use super::*;
311
312    fn header(base_len: u32, result_len: u32) -> [u8; HEADER_LEN] {
313        let mut h = [0u8; HEADER_LEN];
314        h[0] = STREAM_VERSION;
315        h[1..5].copy_from_slice(&base_len.to_le_bytes());
316        h[5..9].copy_from_slice(&result_len.to_le_bytes());
317        h
318    }
319
320    #[test]
321    fn identity_roundtrip() {
322        let data = b"0123456789abcdef".repeat(4); // 64 bytes
323        let stream = encode(&data, &data).unwrap();
324        let restored = decode(&data, &stream).unwrap();
325        assert_eq!(restored, data);
326    }
327
328    #[test]
329    fn pure_insert_roundtrip() {
330        let base = b"aaa";
331        let target = b"zzz";
332        let stream = encode(base, target).unwrap();
333        // After the 9-byte header, the very next byte is the INSERT length.
334        assert_eq!(stream[HEADER_LEN] & 0x80, 0);
335        assert_eq!(stream[HEADER_LEN], 3);
336        let restored = decode(base, &stream).unwrap();
337        assert_eq!(restored, target);
338    }
339
340    #[test]
341    fn pure_copy_full_base() {
342        let base: Vec<u8> = (0..16u8).cycle().take(128).collect();
343        let target = &base[..64];
344        // Hand-build a stream that is exactly one COPY(0, 64).
345        let mut stream = header(
346            u32::try_from(base.len()).unwrap(),
347            u32::try_from(target.len()).unwrap(),
348        )
349        .to_vec();
350        stream.push(OP_COPY);
351        stream.extend_from_slice(&0u32.to_le_bytes());
352        stream.extend_from_slice(&64u16.to_le_bytes());
353        assert_eq!(stream.len(), HEADER_LEN + 7);
354        let restored = decode(&base, &stream).unwrap();
355        assert_eq!(restored, target);
356    }
357
358    #[test]
359    fn near_duplicate_yields_smaller_delta() {
360        let v1 = include_str!("delta.rs"); // any sizable text
361        let mut v2 = String::from(v1);
362        v2.push_str("\n// trailing edit\n");
363        let stream = encode(v1.as_bytes(), v2.as_bytes()).unwrap();
364        let restored = decode(v1.as_bytes(), &stream).unwrap();
365        assert_eq!(restored, v2.as_bytes());
366        assert!(stream.len() < v2.len(), "delta should be smaller than v2");
367    }
368
369    #[test]
370    fn rejects_zero_opcode() {
371        let mut stream = header(0, 0).to_vec();
372        stream.push(0x00);
373        let err = decode(&[], &stream).unwrap_err();
374        assert!(matches!(err, MkitError::TrailingData));
375    }
376
377    #[test]
378    fn rejects_unknown_version() {
379        let mut bytes = header(0, 0);
380        bytes[0] = 0x02;
381        let err = decode(&[], &bytes).unwrap_err();
382        assert!(matches!(err, MkitError::UnsupportedObjectVersion));
383    }
384
385    #[test]
386    fn rejects_truncated_header() {
387        let bytes = [0x01u8, 0x00, 0x00];
388        let err = decode(&[], &bytes).unwrap_err();
389        assert!(matches!(err, MkitError::UnexpectedEof));
390    }
391
392    #[test]
393    fn rejects_truncated_copy() {
394        let mut stream = header(16, 16).to_vec();
395        stream.push(OP_COPY);
396        stream.extend_from_slice(&0u32.to_le_bytes()); // missing 2-byte length
397        let err = decode(&[0u8; 16], &stream).unwrap_err();
398        assert!(matches!(err, MkitError::UnexpectedEof));
399    }
400
401    #[test]
402    fn rejects_truncated_insert() {
403        let mut stream = header(0, 10).to_vec();
404        stream.push(10); // claim 10 literal bytes
405        stream.extend_from_slice(b"abc"); // only 3 supplied
406        let err = decode(&[], &stream).unwrap_err();
407        assert!(matches!(err, MkitError::UnexpectedEof));
408    }
409
410    #[test]
411    fn rejects_copy_past_base_end() {
412        let base = b"short"; // 5 bytes
413        let mut stream = header(u32::try_from(base.len()).unwrap(), 100).to_vec();
414        stream.push(OP_COPY);
415        stream.extend_from_slice(&0u32.to_le_bytes());
416        stream.extend_from_slice(&100u16.to_le_bytes());
417        let err = decode(base, &stream).unwrap_err();
418        assert!(matches!(err, MkitError::TrailingData));
419    }
420
421    #[test]
422    fn rejects_copy_with_zero_length() {
423        let base = [0u8; 16];
424        let mut stream = header(16, 16).to_vec();
425        stream.push(OP_COPY);
426        stream.extend_from_slice(&0u32.to_le_bytes());
427        stream.extend_from_slice(&0u16.to_le_bytes());
428        let err = decode(&base, &stream).unwrap_err();
429        assert!(matches!(err, MkitError::TrailingData));
430    }
431
432    #[test]
433    fn rejects_base_len_mismatch() {
434        let stream = header(16, 0).to_vec();
435        let err = decode(&[0u8; 8], &stream).unwrap_err();
436        assert!(matches!(err, MkitError::TrailingData));
437    }
438
439    #[test]
440    fn rejects_result_len_mismatch_at_end() {
441        // INSERTs sum to 5 but header says result_len = 3.
442        let mut stream = header(0, 3).to_vec();
443        stream.push(5);
444        stream.extend_from_slice(b"hello");
445        let err = decode(&[], &stream).unwrap_err();
446        assert!(matches!(err, MkitError::TrailingData));
447    }
448
449    #[test]
450    fn rejects_huge_result_len_without_preallocating() {
451        // Regression: a 9-byte header claiming result_len = u32::MAX MUST NOT
452        // trigger a 4 GiB `Vec::with_capacity`. The pre-allocation is now
453        // capped against the stream+base size. The decoder still returns an
454        // error (TrailingData) because no ops follow — but the point is that
455        // it does so without first reserving 4 GiB of virtual memory.
456        let stream = header(0, u32::MAX);
457        let err = decode(&[], &stream).unwrap_err();
458        assert!(matches!(err, MkitError::TrailingData));
459    }
460
461    #[test]
462    fn rejects_copy_with_reserved_low_bits() {
463        let base = [0u8; 16];
464        let mut stream = header(16, 4).to_vec();
465        stream.push(OP_COPY | 0x01); // reserved bit set
466        stream.extend_from_slice(&0u32.to_le_bytes());
467        stream.extend_from_slice(&4u16.to_le_bytes());
468        let err = decode(&base, &stream).unwrap_err();
469        assert!(matches!(err, MkitError::TrailingData));
470    }
471
472    #[test]
473    fn empty_base_pure_insert() {
474        let target = b"all new content here!";
475        let stream = encode(b"", target).unwrap();
476        let restored = decode(b"", &stream).unwrap();
477        assert_eq!(restored, target);
478    }
479
480    #[test]
481    fn cap_hint_does_not_scale_with_base_len() {
482        // G5 regression: the decoder's pre-allocation must be bounded by
483        // `stream.len()`, not by `base.len()`. Previously a 9-byte stream
484        // with a 1 GiB base could drive `Vec::with_capacity` to ~1 GiB.
485        //
486        // Assert: for a huge base + tiny stream, cap_hint is tiny.
487        let huge_base = 1usize << 30; // 1 GiB
488        let tiny_stream = 9usize; // just the header
489        let declared_result = u32::MAX as usize;
490        let cap = super::compute_cap_hint(declared_result, huge_base, tiny_stream);
491        assert!(
492            cap <= tiny_stream.saturating_mul(CAP_MULTIPLIER),
493            "cap_hint {cap} must be bounded by stream.len() * CAP_MULTIPLIER, \
494             not by base.len()",
495        );
496        assert!(
497            cap < 1024 * 1024,
498            "cap_hint {cap} must stay well below 1 MiB for a 9-byte stream",
499        );
500    }
501
502    /// Finding H8: `encode()` used to saturate `base_len`/`result_len` to
503    /// `u32::MAX` for inputs over 4 GiB, silently producing a stream
504    /// that `decode()` would reject with a confusing "length mismatch".
505    /// Now `check_length_bounds` errors out explicitly with
506    /// `DeltaLengthOverflow` so misuse surfaces at the call site.
507    #[test]
508    fn check_length_bounds_rejects_over_u32() {
509        // base_len above u32::MAX.
510        let over = (u32::MAX as usize).saturating_add(1);
511        assert!(matches!(
512            check_length_bounds(over, 0),
513            Err(MkitError::DeltaLengthOverflow { .. })
514        ));
515        // result_len above u32::MAX.
516        assert!(matches!(
517            check_length_bounds(0, over),
518            Err(MkitError::DeltaLengthOverflow { .. })
519        ));
520        // Exactly at u32::MAX is fine.
521        assert!(check_length_bounds(u32::MAX as usize, u32::MAX as usize).is_ok());
522        // Small is fine.
523        assert!(check_length_bounds(1, 1).is_ok());
524    }
525}