Skip to main content

iscc_lib/
cdc.rs

1//! Content-Defined Chunking (CDC) for similarity-preserving data splitting.
2//!
3//! Implements the FastCDC-inspired gear rolling hash algorithm from iscc-core.
4//! Splits byte data into variable-size chunks with content-dependent boundaries,
5//! enabling similarity detection across different versions of binary data.
6
7use crate::{IsccError, IsccResult};
8
9/// Gear rolling hash lookup table (256 entries).
10///
11/// Fixed constant from iscc-core `cdc_gear` option. Each byte value maps to
12/// a pseudo-random u32 used in the rolling hash for chunk boundary detection.
13const CDC_GEAR: [u32; 256] = [
14    1553318008, 574654857, 759734804, 310648967, 1393527547, 1195718329, 694400241, 1154184075,
15    1319583805, 1298164590, 122602963, 989043992, 1918895050, 933636724, 1369634190, 1963341198,
16    1565176104, 1296753019, 1105746212, 1191982839, 1195494369, 29065008, 1635524067, 722221599,
17    1355059059, 564669751, 1620421856, 1100048288, 1018120624, 1087284781, 1723604070, 1415454125,
18    737834957, 1854265892, 1605418437, 1697446953, 973791659, 674750707, 1669838606, 320299026,
19    1130545851, 1725494449, 939321396, 748475270, 554975894, 1651665064, 1695413559, 671470969,
20    992078781, 1935142196, 1062778243, 1901125066, 1935811166, 1644847216, 744420649, 2068980838,
21    1988851904, 1263854878, 1979320293, 111370182, 817303588, 478553825, 694867320, 685227566,
22    345022554, 2095989693, 1770739427, 165413158, 1322704750, 46251975, 710520147, 700507188,
23    2104251000, 1350123687, 1593227923, 1756802846, 1179873910, 1629210470, 358373501, 807118919,
24    751426983, 172199468, 174707988, 1951167187, 1328704411, 2129871494, 1242495143, 1793093310,
25    1721521010, 306195915, 1609230749, 1992815783, 1790818204, 234528824, 551692332, 1930351755,
26    110996527, 378457918, 638641695, 743517326, 368806918, 1583529078, 1767199029, 182158924,
27    1114175764, 882553770, 552467890, 1366456705, 934589400, 1574008098, 1798094820, 1548210079,
28    821697741, 601807702, 332526858, 1693310695, 136360183, 1189114632, 506273277, 397438002,
29    620771032, 676183860, 1747529440, 909035644, 142389739, 1991534368, 272707803, 1905681287,
30    1210958911, 596176677, 1380009185, 1153270606, 1150188963, 1067903737, 1020928348, 978324723,
31    962376754, 1368724127, 1133797255, 1367747748, 1458212849, 537933020, 1295159285, 2104731913,
32    1647629177, 1691336604, 922114202, 170715530, 1608833393, 62657989, 1140989235, 381784875,
33    928003604, 449509021, 1057208185, 1239816707, 525522922, 476962140, 102897870, 132620570,
34    419788154, 2095057491, 1240747817, 1271689397, 973007445, 1380110056, 1021668229, 12064370,
35    1186917580, 1017163094, 597085928, 2018803520, 1795688603, 1722115921, 2015264326, 506263638,
36    1002517905, 1229603330, 1376031959, 763839898, 1970623926, 1109937345, 524780807, 1976131071,
37    905940439, 1313298413, 772929676, 1578848328, 1108240025, 577439381, 1293318580, 1512203375,
38    371003697, 308046041, 320070446, 1252546340, 568098497, 1341794814, 1922466690, 480833267,
39    1060838440, 969079660, 1836468543, 2049091118, 2023431210, 383830867, 2112679659, 231203270,
40    1551220541, 1377927987, 275637462, 2110145570, 1700335604, 738389040, 1688841319, 1506456297,
41    1243730675, 258043479, 599084776, 41093802, 792486733, 1897397356, 28077829, 1520357900,
42    361516586, 1119263216, 209458355, 45979201, 363681532, 477245280, 2107748241, 601938891,
43    244572459, 1689418013, 1141711990, 1485744349, 1181066840, 1950794776, 410494836, 1445347454,
44    2137242950, 852679640, 1014566730, 1999335993, 1871390758, 1736439305, 231222289, 603972436,
45    783045542, 370384393, 184356284, 709706295, 1453549767, 591603172, 768512391, 854125182,
46];
47
48/// Default average chunk size in bytes for Data-Code generation.
49pub(crate) const DATA_AVG_CHUNK_SIZE: u32 = 1024;
50
51/// Calculate CDC parameters from target average chunk size.
52///
53/// Returns `(min_size, max_size, center_size, mask_s, mask_l)` where:
54/// - `min_size`: minimum chunk size (avg/4)
55/// - `max_size`: maximum chunk size (avg*8)
56/// - `center_size`: threshold between strict and relaxed mask phases
57/// - `mask_s`: strict mask for early boundary detection (harder to match)
58/// - `mask_l`: relaxed mask for late boundary detection (easier to match)
59pub(crate) fn alg_cdc_params(avg_size: u32) -> (usize, usize, usize, u32, u32) {
60    let min_size = (avg_size / 4) as usize;
61    let max_size = (avg_size * 8) as usize;
62    let offset = min_size + min_size.div_ceil(2);
63    let center_size = avg_size as usize - offset;
64    let bits = (avg_size as f64).log2().round() as u32;
65    let mask_s = (1u32 << (bits + 1)) - 1;
66    let mask_l = (1u32 << (bits - 1)) - 1;
67    (min_size, max_size, center_size, mask_s, mask_l)
68}
69
70/// Scan buffer for a gear-hash cut point, returning the offset if found.
71///
72/// Uses unsafe unchecked indexing and 4x loop unrolling for performance.
73///
74/// # Safety
75/// Caller must ensure `i <= barrier <= buffer.len()`.
76#[inline(always)]
77fn gear_scan(
78    buffer: &[u8],
79    pattern: &mut u32,
80    mut i: usize,
81    barrier: usize,
82    mask: u32,
83) -> (usize, Option<usize>) {
84    // SAFETY: `i < barrier <= buffer.len()`, so `buffer[i]` is in bounds.
85    // `buffer[i]` is u8 (0..=255) and CDC_GEAR has 256 entries, always in bounds.
86
87    // Unrolled 4x to reduce loop overhead and branch frequency
88    while i + 3 < barrier {
89        unsafe {
90            *pattern = (*pattern >> 1)
91                .wrapping_add(*CDC_GEAR.get_unchecked(*buffer.get_unchecked(i) as usize));
92        }
93        if *pattern & mask == 0 {
94            return (i + 1, Some(i + 1));
95        }
96        unsafe {
97            *pattern = (*pattern >> 1)
98                .wrapping_add(*CDC_GEAR.get_unchecked(*buffer.get_unchecked(i + 1) as usize));
99        }
100        if *pattern & mask == 0 {
101            return (i + 2, Some(i + 2));
102        }
103        unsafe {
104            *pattern = (*pattern >> 1)
105                .wrapping_add(*CDC_GEAR.get_unchecked(*buffer.get_unchecked(i + 2) as usize));
106        }
107        if *pattern & mask == 0 {
108            return (i + 3, Some(i + 3));
109        }
110        unsafe {
111            *pattern = (*pattern >> 1)
112                .wrapping_add(*CDC_GEAR.get_unchecked(*buffer.get_unchecked(i + 3) as usize));
113        }
114        if *pattern & mask == 0 {
115            return (i + 4, Some(i + 4));
116        }
117        i += 4;
118    }
119    // Handle remaining bytes
120    while i < barrier {
121        unsafe {
122            *pattern = (*pattern >> 1)
123                .wrapping_add(*CDC_GEAR.get_unchecked(*buffer.get_unchecked(i) as usize));
124        }
125        if *pattern & mask == 0 {
126            return (i + 1, Some(i + 1));
127        }
128        i += 1;
129    }
130    (i, None)
131}
132
133/// Find the CDC cut point offset within a buffer.
134///
135/// Uses a gear rolling hash to scan the buffer in two phases:
136/// 1. From `mi` to `cs`: strict phase using `mask_s` (harder to match)
137/// 2. From `cs` to `ma`: relaxed phase using `mask_l` (easier to match)
138///
139/// Returns the byte offset of the cut point. For buffers smaller than
140/// `mi`, returns the buffer length (entire buffer is one chunk).
141pub(crate) fn alg_cdc_offset(
142    buffer: &[u8],
143    mi: usize,
144    ma: usize,
145    cs: usize,
146    mask_s: u32,
147    mask_l: u32,
148) -> usize {
149    let mut pattern: u32 = 0;
150    let size = buffer.len();
151    let i = mi.min(size);
152
153    // Phase 1: strict mask (harder to match, produces larger chunks)
154    let (i, found) = gear_scan(buffer, &mut pattern, i, cs.min(size), mask_s);
155    if let Some(offset) = found {
156        return offset;
157    }
158
159    // Phase 2: relaxed mask (easier to match, prevents overly large chunks)
160    let (i, found) = gear_scan(buffer, &mut pattern, i, ma.min(size), mask_l);
161    if let Some(offset) = found {
162        return offset;
163    }
164
165    i
166}
167
168/// Split data into content-defined chunks.
169///
170/// Uses the gear rolling hash CDC algorithm to find content-dependent
171/// boundaries. Returns at least one chunk (empty slice for empty input).
172/// When `utf32` is true, aligns cut points to 4-byte boundaries.
173///
174/// # Errors
175///
176/// Returns `IsccError::InvalidInput` if `avg_chunk_size < 2`. The mask
177/// calculation requires `log2(avg_chunk_size) >= 1`.
178pub fn alg_cdc_chunks(data: &[u8], utf32: bool, avg_chunk_size: u32) -> IsccResult<Vec<&[u8]>> {
179    if avg_chunk_size < 2 {
180        return Err(IsccError::InvalidInput(format!(
181            "avg_chunk_size must be >= 2, got {avg_chunk_size}"
182        )));
183    }
184    Ok(alg_cdc_chunks_unchecked(data, utf32, avg_chunk_size))
185}
186
187/// Split data into content-defined chunks without validating `avg_chunk_size`.
188///
189/// Internal helper used by `DataHasher` and `gen_data_code_v0` where the
190/// chunk size is always the validated constant `DATA_AVG_CHUNK_SIZE`.
191pub(crate) fn alg_cdc_chunks_unchecked(
192    data: &[u8],
193    utf32: bool,
194    avg_chunk_size: u32,
195) -> Vec<&[u8]> {
196    if data.is_empty() {
197        return vec![&data[0..0]];
198    }
199
200    let (mi, ma, cs, mask_s, mask_l) = alg_cdc_params(avg_chunk_size);
201    let mut chunks = Vec::new();
202    let mut pos = 0;
203
204    while pos < data.len() {
205        let remaining = &data[pos..];
206        let mut cut_point = alg_cdc_offset(remaining, mi, ma, cs, mask_s, mask_l);
207
208        // Align cut points to 4-byte boundaries for UTF-32 encoded text
209        if utf32 {
210            cut_point -= cut_point % 4;
211            if cut_point == 0 {
212                cut_point = remaining.len().min(4);
213            }
214        }
215
216        chunks.push(&data[pos..pos + cut_point]);
217        pos += cut_point;
218    }
219
220    chunks
221}
222
223#[cfg(test)]
224mod tests {
225    use super::*;
226
227    #[test]
228    fn test_gear_table_length() {
229        assert_eq!(CDC_GEAR.len(), 256);
230    }
231
232    #[test]
233    fn test_gear_table_first_last() {
234        assert_eq!(CDC_GEAR[0], 1553318008);
235        assert_eq!(CDC_GEAR[255], 854125182);
236    }
237
238    #[test]
239    fn test_alg_cdc_params_default() {
240        let (mi, ma, cs, mask_s, mask_l) = alg_cdc_params(1024);
241        assert_eq!(mi, 256, "min_size");
242        assert_eq!(ma, 8192, "max_size");
243        assert_eq!(cs, 640, "center_size");
244        assert_eq!(mask_s, 2047, "mask_s = (1 << 11) - 1");
245        assert_eq!(mask_l, 511, "mask_l = (1 << 9) - 1");
246    }
247
248    #[test]
249    fn test_alg_cdc_offset_small_buffer() {
250        // Buffer smaller than min_size → returns buffer length
251        let buf = vec![0u8; 100];
252        let (mi, ma, cs, mask_s, mask_l) = alg_cdc_params(1024);
253        let offset = alg_cdc_offset(&buf, mi, ma, cs, mask_s, mask_l);
254        assert_eq!(offset, 100);
255    }
256
257    #[test]
258    fn test_alg_cdc_offset_returns_at_most_max() {
259        // Buffer larger than max_size → returns at most max_size
260        let buf = vec![0xAA; 10000];
261        let (mi, ma, cs, mask_s, mask_l) = alg_cdc_params(1024);
262        let offset = alg_cdc_offset(&buf, mi, ma, cs, mask_s, mask_l);
263        assert!(offset <= ma, "offset {offset} exceeds max_size {ma}");
264        assert!(offset >= mi, "offset {offset} below min_size {mi}");
265    }
266
267    #[test]
268    fn test_alg_cdc_chunks_empty() {
269        let chunks = alg_cdc_chunks(b"", false, 1024).unwrap();
270        assert_eq!(chunks.len(), 1);
271        assert_eq!(chunks[0].len(), 0);
272    }
273
274    #[test]
275    fn test_alg_cdc_chunks_small_data() {
276        // Data smaller than min_size → one chunk containing all data
277        let data = vec![42u8; 100];
278        let chunks = alg_cdc_chunks(&data, false, 1024).unwrap();
279        assert_eq!(chunks.len(), 1);
280        assert_eq!(chunks[0].len(), 100);
281    }
282
283    #[test]
284    fn test_alg_cdc_chunks_reassembly() {
285        // Chunks must reassemble to original data
286        let data: Vec<u8> = (0..=255).cycle().take(4096).collect();
287        let chunks = alg_cdc_chunks(&data, false, 1024).unwrap();
288        let reassembled: Vec<u8> = chunks.iter().flat_map(|c| c.iter().copied()).collect();
289        assert_eq!(reassembled, data);
290    }
291
292    #[test]
293    fn test_alg_cdc_chunks_deterministic() {
294        let data: Vec<u8> = (0..=255).cycle().take(4096).collect();
295        let chunks1 = alg_cdc_chunks(&data, false, 1024).unwrap();
296        let chunks2 = alg_cdc_chunks(&data, false, 1024).unwrap();
297        assert_eq!(chunks1.len(), chunks2.len());
298        for (a, b) in chunks1.iter().zip(chunks2.iter()) {
299            assert_eq!(a, b);
300        }
301    }
302
303    #[test]
304    fn test_alg_cdc_chunks_multiple_chunks() {
305        // Large data produces multiple chunks
306        let data: Vec<u8> = (0..=255).cycle().take(8192).collect();
307        let chunks = alg_cdc_chunks(&data, false, 1024).unwrap();
308        assert!(
309            chunks.len() > 1,
310            "expected multiple chunks, got {}",
311            chunks.len()
312        );
313    }
314
315    #[test]
316    fn test_alg_cdc_chunks_utf32_small_buffer() {
317        // 3 bytes with utf32=true must terminate and reassemble to original.
318        // Primary regression test for the infinite loop bug where
319        // cut_point % 4 == cut_point reduced cut_point to 0.
320        let data = [0xAA, 0xBB, 0xCC];
321        let chunks = alg_cdc_chunks(&data, true, 1024).unwrap();
322        assert!(!chunks.is_empty(), "must return at least one chunk");
323        let reassembled: Vec<u8> = chunks.iter().flat_map(|c| c.iter().copied()).collect();
324        assert_eq!(reassembled, data);
325    }
326
327    #[test]
328    fn test_alg_cdc_chunks_utf32_exact_4_bytes() {
329        // Exactly 4 bytes with utf32=true must return one 4-byte chunk.
330        let data = [0x01, 0x02, 0x03, 0x04];
331        let chunks = alg_cdc_chunks(&data, true, 1024).unwrap();
332        assert_eq!(chunks.len(), 1);
333        assert_eq!(chunks[0], &data[..]);
334    }
335
336    #[test]
337    fn test_alg_cdc_chunks_utf32_7_bytes() {
338        // 7 bytes (4+3) with utf32=true verifies non-aligned tail handling.
339        let data = [0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70];
340        let chunks = alg_cdc_chunks(&data, true, 1024).unwrap();
341        assert!(!chunks.is_empty(), "must return at least one chunk");
342        let reassembled: Vec<u8> = chunks.iter().flat_map(|c| c.iter().copied()).collect();
343        assert_eq!(reassembled, data);
344    }
345
346    #[test]
347    fn test_alg_cdc_chunks_utf32_reassembly() {
348        // Larger 4-byte-aligned input with utf32=true must reassemble correctly,
349        // and all chunks except possibly the last must be 4-byte aligned.
350        let data: Vec<u8> = (0..=255).cycle().take(4096).collect();
351        assert_eq!(data.len() % 4, 0, "test data must be 4-byte aligned");
352        let chunks = alg_cdc_chunks(&data, true, 1024).unwrap();
353        let reassembled: Vec<u8> = chunks.iter().flat_map(|c| c.iter().copied()).collect();
354        assert_eq!(reassembled, data);
355        // All chunks except the last must be 4-byte aligned
356        if chunks.len() > 1 {
357            for (i, chunk) in chunks[..chunks.len() - 1].iter().enumerate() {
358                assert_eq!(
359                    chunk.len() % 4,
360                    0,
361                    "chunk {i} has length {} which is not 4-byte aligned",
362                    chunk.len()
363                );
364            }
365        }
366    }
367
368    #[test]
369    fn test_alg_cdc_chunks_utf32_empty() {
370        // Empty input with utf32=true must not loop and must return one empty chunk.
371        let chunks = alg_cdc_chunks(b"", true, 1024).unwrap();
372        assert_eq!(chunks.len(), 1);
373        assert_eq!(chunks[0].len(), 0);
374    }
375
376    #[test]
377    fn test_alg_cdc_chunks_zero_avg_chunk_size() {
378        let err = alg_cdc_chunks(b"hello", false, 0).unwrap_err();
379        assert!(err.to_string().contains("avg_chunk_size must be >= 2"));
380    }
381
382    #[test]
383    fn test_alg_cdc_chunks_one_avg_chunk_size() {
384        let err = alg_cdc_chunks(b"hello", false, 1).unwrap_err();
385        assert!(err.to_string().contains("avg_chunk_size must be >= 2"));
386    }
387
388    #[test]
389    fn test_alg_cdc_chunks_min_valid_avg_chunk_size() {
390        let chunks = alg_cdc_chunks(b"hello world", false, 2).unwrap();
391        let reassembled: Vec<u8> = chunks.iter().flat_map(|c| c.iter().copied()).collect();
392        assert_eq!(reassembled, b"hello world");
393    }
394}