Skip to main content

mkit_core/
chunker.rs

1//! `FastCDC` content-defined chunker.
2//!
3//! Spec reference: `docs/SPEC-FASTCDC.md`. Frozen v1 parameters:
4//!
5//! * Gear seed: ASCII `"MKITFCDC"` interpreted as a big-endian `u64`
6//!   (`0x4D4B_4954_4643_4443`). Splitmix64-derived 256-entry table.
7//! * `MIN_SIZE = 16 KiB`, `AVG_SIZE = 64 KiB`, `MAX_SIZE = 256 KiB`.
8//! * `MASK_S = 0x0001_FFFF` (strict, used while `i < AVG_SIZE`).
9//! * `MASK   = 0x0000_FFFF` (informative; not used directly — we only
10//!   need the strict and loose masks at runtime).
11//! * `MASK_L = 0x0000_7FFF` (loose, used while `AVG_SIZE <= i < MAX_SIZE`).
12//! * Rolling hash: `h = (h << 1) +% gear[byte]`. Cut when `(h & mask) == 0`,
13//!   else force a cut at `MAX_SIZE`. Never cut before `MIN_SIZE`.
14//!
15//! The spec hard-codes the masks; we pin them as constants here so any
16//! accidental change lights up in the golden tests immediately.
17//!
18//! All chunk boundaries are fully determined by the input bytes plus the
19//! constants above. Any change breaks `chunked_blob` reproducibility
20//! (see `SPEC-FASTCDC.md` §2 determinism contract).
21
22use crate::object::MkitError;
23
24/// Frozen splitmix64 seed for v1 — ASCII `"MKITFCDC"` as big-endian u64.
25pub const SEED: u64 = 0x4D4B_4954_4643_4443;
26
27/// Minimum chunk size (`16 KiB`). `cut` will not return a value below
28/// this for non-final chunks.
29pub const MIN_SIZE: usize = 0x4000;
30/// Average target chunk size (`64 KiB`). The mask transition from strict
31/// to loose happens here.
32pub const AVG_SIZE: usize = 0x10000;
33/// Maximum chunk size (`256 KiB`). `cut` always returns at most this.
34pub const MAX_SIZE: usize = 0x40000;
35
36/// Strict mask (used while `i < AVG_SIZE`). Fewer cuts → bias the chunker
37/// toward `AVG_SIZE`.
38pub const MASK_S: u64 = 0x0001_FFFF;
39/// Loose mask (used while `AVG_SIZE <= i < MAX_SIZE`). More cuts → avoid
40/// running into the `MAX_SIZE` forced boundary.
41pub const MASK_L: u64 = 0x0000_7FFF;
42
43/// 256-entry gear table, derived once at first use from [`SEED`] via
44/// splitmix64. Wrapping arithmetic throughout — see SPEC-FASTCDC §3 for
45/// the exact derivation.
46fn gear_table() -> &'static [u64; 256] {
47    use std::sync::OnceLock;
48    static TABLE: OnceLock<[u64; 256]> = OnceLock::new();
49    TABLE.get_or_init(build_gear_table)
50}
51
52fn build_gear_table() -> [u64; 256] {
53    let mut state: u64 = SEED;
54    let mut table = [0u64; 256];
55    for entry in &mut table {
56        // splitmix64 with the standard gamma. All ops are wrapping.
57        state = state.wrapping_add(0x9e37_79b9_7f4a_7c15);
58        let mut z = state;
59        z = (z ^ (z >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
60        z = (z ^ (z >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
61        z ^= z >> 31;
62        *entry = z;
63    }
64    table
65}
66
67/// One chunk's position in the source buffer.
68#[derive(Debug, Clone, Copy, PartialEq, Eq)]
69pub struct ChunkBoundary {
70    pub offset: usize,
71    pub length: usize,
72}
73
74/// `FastCDC` chunker parameters. Construct with [`FastCdc::v1`] for the
75/// frozen v1 constants; other constructors exist mainly for tests that
76/// exercise the algorithm at smaller sizes.
77#[derive(Debug, Clone, Copy)]
78pub struct FastCdc {
79    min_size: usize,
80    avg_size: usize,
81    max_size: usize,
82    mask_s: u64,
83    mask_l: u64,
84}
85
86impl FastCdc {
87    /// Construct the v1 chunker (`16 KiB / 64 KiB / 256 KiB`,
88    /// strict/loose `0x0001_FFFF` / `0x0000_7FFF`).
89    #[must_use]
90    pub const fn v1() -> Self {
91        Self {
92            min_size: MIN_SIZE,
93            avg_size: AVG_SIZE,
94            max_size: MAX_SIZE,
95            mask_s: MASK_S,
96            mask_l: MASK_L,
97        }
98    }
99
100    /// Construct a chunker with custom parameters. `avg_size` MUST be a
101    /// power of two; the strict/loose masks are derived as
102    /// `mask = (1 << log2(avg)) - 1; mask_s = mask | (mask << 1); mask_l = mask >> 1`.
103    /// Returns [`MkitError::InvalidIdentity`] re-purposed as a generic
104    /// "bad parameter" error if the constraints are violated — but in
105    /// practice this constructor is only used by tests, where the inputs
106    /// are constants, so we panic instead for a clearer failure mode.
107    ///
108    /// # Panics
109    ///
110    /// Panics if `min < avg < max` is not strictly satisfied or `avg` is
111    /// not a power of two.
112    #[must_use]
113    pub fn custom(min: usize, avg: usize, max: usize) -> Self {
114        assert!(min < avg && avg < max, "FastCdc: require min<avg<max");
115        assert!(avg.is_power_of_two(), "FastCdc: avg must be a power of 2");
116        let bits = avg.trailing_zeros();
117        let mask: u64 = (1u64 << bits) - 1;
118        Self {
119            min_size: min,
120            avg_size: avg,
121            max_size: max,
122            mask_s: mask | (mask << 1),
123            mask_l: mask >> 1,
124        }
125    }
126
127    #[must_use]
128    pub const fn min_size(&self) -> usize {
129        self.min_size
130    }
131    #[must_use]
132    pub const fn avg_size(&self) -> usize {
133        self.avg_size
134    }
135    #[must_use]
136    pub const fn max_size(&self) -> usize {
137        self.max_size
138    }
139
140    /// Find the cut point in `data`. Returns the length of the first
141    /// chunk. Semantics:
142    ///
143    /// * `data.len() <= min_size` → returns `data.len()` (no early cut).
144    /// * `min_size < i <= avg_size` → strict mask, fewer boundaries.
145    /// * `avg_size < i <= max_size` → loose mask, more boundaries.
146    /// * Otherwise → forced cut at `min(max_size, data.len())`.
147    #[must_use]
148    pub fn cut(&self, data: &[u8]) -> usize {
149        if data.len() <= self.min_size {
150            return data.len();
151        }
152        let table = gear_table();
153        let mut hash: u64 = 0;
154        let avg_end = self.avg_size.min(data.len());
155        let mut i = self.min_size;
156        while i < avg_end {
157            hash = (hash << 1).wrapping_add(table[data[i] as usize]);
158            if (hash & self.mask_s) == 0 {
159                return i;
160            }
161            i += 1;
162        }
163        let max_end = self.max_size.min(data.len());
164        while i < max_end {
165            hash = (hash << 1).wrapping_add(table[data[i] as usize]);
166            if (hash & self.mask_l) == 0 {
167                return i;
168            }
169            i += 1;
170        }
171        max_end
172    }
173}
174
175/// Iterator over chunk boundaries in a contiguous byte slice.
176#[derive(Debug)]
177pub struct ChunkIterator<'a> {
178    cdc: FastCdc,
179    data: &'a [u8],
180    offset: usize,
181}
182
183impl<'a> ChunkIterator<'a> {
184    #[must_use]
185    pub fn new(cdc: FastCdc, data: &'a [u8]) -> Self {
186        Self {
187            cdc,
188            data,
189            offset: 0,
190        }
191    }
192}
193
194impl Iterator for ChunkIterator<'_> {
195    type Item = ChunkBoundary;
196    fn next(&mut self) -> Option<Self::Item> {
197        if self.offset >= self.data.len() {
198            return None;
199        }
200        let remaining = &self.data[self.offset..];
201        let length = self.cdc.cut(remaining);
202        let boundary = ChunkBoundary {
203            offset: self.offset,
204            length,
205        };
206        self.offset += length;
207        Some(boundary)
208    }
209}
210
211/// Convenience: collect all chunk *end* offsets from `data` using the
212/// frozen v1 chunker. The returned vector starts with the length of the
213/// first chunk and ends with `data.len()`. An empty input yields an
214/// empty vector. Used by goldens to pin boundaries deterministically
215/// without exposing the `ChunkBoundary` struct in serialised form.
216#[must_use]
217pub fn chunk_boundaries(data: &[u8]) -> Vec<usize> {
218    let cdc = FastCdc::v1();
219    let mut out = Vec::new();
220    let mut offset = 0usize;
221    while offset < data.len() {
222        let len = cdc.cut(&data[offset..]);
223        offset += len;
224        out.push(offset);
225    }
226    out
227}
228
229/// Hash the entire 256-entry gear table as little-endian `u64` bytes
230/// (2 048 bytes total) with BLAKE3. Useful as a cheap "did we get the
231/// seed right" check — see SPEC-FASTCDC §8 vector 1.
232#[must_use]
233pub fn gear_table_digest() -> [u8; 32] {
234    let table = gear_table();
235    let mut bytes = [0u8; 256 * 8];
236    for (i, v) in table.iter().enumerate() {
237        bytes[i * 8..i * 8 + 8].copy_from_slice(&v.to_le_bytes());
238    }
239    crate::hash::hash(&bytes)
240}
241
242// MkitError is unused in this module today, but keep the import
243// reachable so future error returns don't drift the public surface.
244#[allow(dead_code)]
245fn _link_error_types(_: MkitError) {}
246
247// =========================================================================
248// Tests
249// =========================================================================
250
251#[cfg(test)]
252mod tests {
253    use super::*;
254
255    /// Deterministic xorshift64* PRNG for test-data generation. We avoid
256    /// `rand` here to keep test inputs explicit and reproducible
257    /// without committing to a specific `rand` version.
258    struct Prng(u64);
259    impl Prng {
260        fn new(seed: u64) -> Self {
261            Self(seed.max(1))
262        }
263        fn next_u64(&mut self) -> u64 {
264            // splitmix64 (same primitive used to derive the gear table —
265            // a different seed gives an unrelated stream).
266            self.0 = self.0.wrapping_add(0x9e37_79b9_7f4a_7c15);
267            let mut z = self.0;
268            z = (z ^ (z >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
269            z = (z ^ (z >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
270            z ^ (z >> 31)
271        }
272        fn fill(&mut self, dst: &mut [u8]) {
273            for chunk in dst.chunks_mut(8) {
274                let bytes = self.next_u64().to_le_bytes();
275                let n = chunk.len();
276                chunk.copy_from_slice(&bytes[..n]);
277            }
278        }
279    }
280
281    #[test]
282    fn gear_table_is_unique_and_nonzero() {
283        let table = gear_table();
284        let mut seen = std::collections::HashSet::new();
285        for &v in table {
286            assert_ne!(v, 0, "gear table entry is zero");
287            assert!(seen.insert(v), "duplicate gear table entry");
288        }
289        assert_eq!(seen.len(), 256);
290    }
291
292    #[test]
293    fn determinism_same_input_same_boundaries() {
294        let cdc = FastCdc::v1();
295        let mut data = vec![0u8; 200 * 1024];
296        Prng::new(0xDEAD_BEEF).fill(&mut data);
297
298        let pass1: Vec<_> = ChunkIterator::new(cdc, &data).collect();
299        let pass2: Vec<_> = ChunkIterator::new(cdc, &data).collect();
300        assert_eq!(pass1, pass2);
301    }
302
303    #[test]
304    fn min_max_size_constraints() {
305        let cdc = FastCdc::v1();
306        let mut data = vec![0u8; 512 * 1024];
307        Prng::new(0xCAFE_BABE).fill(&mut data);
308
309        let mut total = 0usize;
310        let mut count = 0usize;
311        for b in ChunkIterator::new(cdc, &data) {
312            assert!(b.length <= MAX_SIZE);
313            // Only the final chunk is allowed to be smaller than min.
314            if b.offset + b.length < data.len() {
315                assert!(b.length >= MIN_SIZE, "chunk under MIN_SIZE: {}", b.length);
316            }
317            total += b.length;
318            count += 1;
319        }
320        assert_eq!(total, data.len());
321        assert!(count > 1);
322    }
323
324    #[test]
325    fn small_input_is_single_chunk() {
326        let cdc = FastCdc::v1();
327        let small = b"hello, this is a tiny file";
328        assert_eq!(cdc.cut(small), small.len());
329        let boundaries: Vec<_> = ChunkIterator::new(cdc, small).collect();
330        assert_eq!(boundaries.len(), 1);
331        assert_eq!(boundaries[0].offset, 0);
332        assert_eq!(boundaries[0].length, small.len());
333    }
334
335    #[test]
336    fn empty_input_iterator_is_empty() {
337        let cdc = FastCdc::v1();
338        assert_eq!(cdc.cut(b""), 0);
339        let none: Vec<_> = ChunkIterator::new(cdc, b"").collect();
340        assert!(none.is_empty());
341    }
342
343    #[test]
344    fn cut_at_exactly_min_size_returns_full() {
345        let cdc = FastCdc::custom(1024, 4096, 16384);
346        let mut data = vec![0u8; 1024];
347        Prng::new(99).fill(&mut data);
348        assert_eq!(cdc.cut(&data), data.len());
349    }
350
351    #[test]
352    fn cut_forces_boundary_at_max_size() {
353        // All-zero buffer, gear hash never trips a natural cut → forced
354        // cut at max_size.
355        let cdc = FastCdc::custom(4, 8, 16);
356        let data = [0u8; 64];
357        let len = cdc.cut(&data);
358        assert!(len <= 16, "cut returned {len} > max=16");
359    }
360
361    #[test]
362    fn boundary_stability_single_byte_insert() {
363        // Insert one byte at offset 32 KiB; expect <=3 chunks differ.
364        let cdc = FastCdc::v1();
365        let mut original = vec![0u8; 64 * 1024];
366        Prng::new(0xBEEF).fill(&mut original);
367        let insert_point = 32 * 1024;
368        let mut modified = Vec::with_capacity(original.len() + 1);
369        modified.extend_from_slice(&original[..insert_point]);
370        modified.push(0xFF);
371        modified.extend_from_slice(&original[insert_point..]);
372
373        let orig_chunks: Vec<_> = ChunkIterator::new(cdc, &original).collect();
374        let mod_chunks: Vec<_> = ChunkIterator::new(cdc, &modified).collect();
375
376        let max_chunks = orig_chunks.len().max(mod_chunks.len());
377        let mut differing = 0usize;
378        for i in 0..max_chunks {
379            match (orig_chunks.get(i), mod_chunks.get(i)) {
380                (Some(o), Some(m)) => {
381                    let os = &original[o.offset..o.offset + o.length];
382                    let ms = &modified[m.offset..m.offset + m.length];
383                    if os != ms {
384                        differing += 1;
385                    }
386                }
387                _ => differing += 1,
388            }
389        }
390        assert!(
391            differing <= 3,
392            "expected <=3 differing chunks, got {differing}"
393        );
394    }
395
396    #[test]
397    fn iterator_total_bytes_matches_input() {
398        let cdc = FastCdc::v1();
399        let mut data = vec![0u8; 300 * 1024];
400        Prng::new(42).fill(&mut data);
401        let total: usize = ChunkIterator::new(cdc, &data).map(|b| b.length).sum();
402        assert_eq!(total, data.len());
403    }
404
405    #[test]
406    fn different_avg_size_yields_different_boundaries() {
407        let mut data = vec![0u8; 100 * 1024];
408        Prng::new(0xABCD).fill(&mut data);
409        let small = FastCdc::custom(8 * 1024, 32 * 1024, 128 * 1024);
410        let large = FastCdc::v1();
411
412        let s: Vec<_> = ChunkIterator::new(small, &data)
413            .map(|b| b.offset + b.length)
414            .collect();
415        let l: Vec<_> = ChunkIterator::new(large, &data)
416            .map(|b| b.offset + b.length)
417            .collect();
418        assert!(
419            s != l,
420            "expected different boundaries for different avg_size"
421        );
422    }
423
424    #[test]
425    fn chunk_boundaries_helper_matches_iterator() {
426        let mut data = vec![0u8; 200 * 1024];
427        Prng::new(0xFEED_FACE).fill(&mut data);
428        let from_helper = chunk_boundaries(&data);
429        let from_iter: Vec<usize> = ChunkIterator::new(FastCdc::v1(), &data)
430            .map(|b| b.offset + b.length)
431            .collect();
432        assert_eq!(from_helper, from_iter);
433    }
434
435    /// Pinned v1 gear-table digest, harvested once from the splitmix64
436    /// derivation seeded with "MKITFCDC". Drift = a v2 break.
437    const EXPECTED_GEAR_DIGEST_HEX: &str =
438        "7b238963a8bb10c4dea1bf678aa07d8c3ce94284209c440ca971ff3a97ee5ad4";
439
440    #[test]
441    fn gear_table_digest_is_stable() {
442        // SPEC-FASTCDC §8 vector 1: any change to the seed or splitmix
443        // derivation moves this digest. CI flags drift loud and early.
444        let hex = crate::hash::to_hex(&gear_table_digest());
445        assert_eq!(
446            hex, EXPECTED_GEAR_DIGEST_HEX,
447            "gear table digest changed; refuse to drift silently"
448        );
449    }
450
451    // -- Property tests -------------------------------------------------
452    //
453    // Determinism + cap invariants exercised against arbitrary inputs
454    // via `proptest`. The example tests above cover specific PRNG seeds;
455    // the properties below catch the boundary cases the examples miss
456    // (very-short data, repeating patterns, etc.).
457    proptest::proptest! {
458        /// FastCDC is deterministic: two passes over the same bytes
459        /// produce the same chunk boundaries. This is the core SPEC-
460        /// FASTCDC §2 contract that makes `chunked_blob` content-
461        /// addressable.
462        #[test]
463        fn proptest_determinism(data in proptest::collection::vec(proptest::num::u8::ANY, 0..256 * 1024)) {
464            let cdc = FastCdc::v1();
465            let pass1: Vec<_> = ChunkIterator::new(cdc, &data).collect();
466            let pass2: Vec<_> = ChunkIterator::new(cdc, &data).collect();
467            proptest::prop_assert_eq!(pass1, pass2);
468        }
469
470        /// Boundaries cover the input exactly: sum of lengths equals
471        /// input length; offsets are non-overlapping and monotonic.
472        #[test]
473        fn proptest_boundaries_partition_input(
474            data in proptest::collection::vec(proptest::num::u8::ANY, 0..256 * 1024),
475        ) {
476            let cdc = FastCdc::v1();
477            let boundaries: Vec<_> = ChunkIterator::new(cdc, &data).collect();
478            let mut expected_offset = 0usize;
479            for b in &boundaries {
480                proptest::prop_assert_eq!(b.offset, expected_offset);
481                expected_offset += b.length;
482            }
483            proptest::prop_assert_eq!(expected_offset, data.len());
484        }
485    }
486}