1use crate::object::MkitError;
23
24pub const SEED: u64 = 0x4D4B_4954_4643_4443;
26
27pub const MIN_SIZE: usize = 0x4000;
30pub const AVG_SIZE: usize = 0x10000;
33pub const MAX_SIZE: usize = 0x40000;
35
36pub const MASK_S: u64 = 0x0001_FFFF;
39pub const MASK_L: u64 = 0x0000_7FFF;
42
43fn gear_table() -> &'static [u64; 256] {
47 use std::sync::OnceLock;
48 static TABLE: OnceLock<[u64; 256]> = OnceLock::new();
49 TABLE.get_or_init(build_gear_table)
50}
51
52fn build_gear_table() -> [u64; 256] {
53 let mut state: u64 = SEED;
54 let mut table = [0u64; 256];
55 for entry in &mut table {
56 state = state.wrapping_add(0x9e37_79b9_7f4a_7c15);
58 let mut z = state;
59 z = (z ^ (z >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
60 z = (z ^ (z >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
61 z ^= z >> 31;
62 *entry = z;
63 }
64 table
65}
66
67#[derive(Debug, Clone, Copy, PartialEq, Eq)]
69pub struct ChunkBoundary {
70 pub offset: usize,
71 pub length: usize,
72}
73
74#[derive(Debug, Clone, Copy)]
78pub struct FastCdc {
79 min_size: usize,
80 avg_size: usize,
81 max_size: usize,
82 mask_s: u64,
83 mask_l: u64,
84}
85
86impl FastCdc {
87 #[must_use]
90 pub const fn v1() -> Self {
91 Self {
92 min_size: MIN_SIZE,
93 avg_size: AVG_SIZE,
94 max_size: MAX_SIZE,
95 mask_s: MASK_S,
96 mask_l: MASK_L,
97 }
98 }
99
100 #[must_use]
113 pub fn custom(min: usize, avg: usize, max: usize) -> Self {
114 assert!(min < avg && avg < max, "FastCdc: require min<avg<max");
115 assert!(avg.is_power_of_two(), "FastCdc: avg must be a power of 2");
116 let bits = avg.trailing_zeros();
117 let mask: u64 = (1u64 << bits) - 1;
118 Self {
119 min_size: min,
120 avg_size: avg,
121 max_size: max,
122 mask_s: mask | (mask << 1),
123 mask_l: mask >> 1,
124 }
125 }
126
127 #[must_use]
128 pub const fn min_size(&self) -> usize {
129 self.min_size
130 }
131 #[must_use]
132 pub const fn avg_size(&self) -> usize {
133 self.avg_size
134 }
135 #[must_use]
136 pub const fn max_size(&self) -> usize {
137 self.max_size
138 }
139
140 #[must_use]
148 pub fn cut(&self, data: &[u8]) -> usize {
149 if data.len() <= self.min_size {
150 return data.len();
151 }
152 let table = gear_table();
153 let mut hash: u64 = 0;
154 let avg_end = self.avg_size.min(data.len());
155 let mut i = self.min_size;
156 while i < avg_end {
157 hash = (hash << 1).wrapping_add(table[data[i] as usize]);
158 if (hash & self.mask_s) == 0 {
159 return i;
160 }
161 i += 1;
162 }
163 let max_end = self.max_size.min(data.len());
164 while i < max_end {
165 hash = (hash << 1).wrapping_add(table[data[i] as usize]);
166 if (hash & self.mask_l) == 0 {
167 return i;
168 }
169 i += 1;
170 }
171 max_end
172 }
173}
174
175#[derive(Debug)]
177pub struct ChunkIterator<'a> {
178 cdc: FastCdc,
179 data: &'a [u8],
180 offset: usize,
181}
182
183impl<'a> ChunkIterator<'a> {
184 #[must_use]
185 pub fn new(cdc: FastCdc, data: &'a [u8]) -> Self {
186 Self {
187 cdc,
188 data,
189 offset: 0,
190 }
191 }
192}
193
194impl Iterator for ChunkIterator<'_> {
195 type Item = ChunkBoundary;
196 fn next(&mut self) -> Option<Self::Item> {
197 if self.offset >= self.data.len() {
198 return None;
199 }
200 let remaining = &self.data[self.offset..];
201 let length = self.cdc.cut(remaining);
202 let boundary = ChunkBoundary {
203 offset: self.offset,
204 length,
205 };
206 self.offset += length;
207 Some(boundary)
208 }
209}
210
211#[must_use]
217pub fn chunk_boundaries(data: &[u8]) -> Vec<usize> {
218 let cdc = FastCdc::v1();
219 let mut out = Vec::new();
220 let mut offset = 0usize;
221 while offset < data.len() {
222 let len = cdc.cut(&data[offset..]);
223 offset += len;
224 out.push(offset);
225 }
226 out
227}
228
229#[must_use]
233pub fn gear_table_digest() -> [u8; 32] {
234 let table = gear_table();
235 let mut bytes = [0u8; 256 * 8];
236 for (i, v) in table.iter().enumerate() {
237 bytes[i * 8..i * 8 + 8].copy_from_slice(&v.to_le_bytes());
238 }
239 crate::hash::hash(&bytes)
240}
241
242#[allow(dead_code)]
245fn _link_error_types(_: MkitError) {}
246
247#[cfg(test)]
252mod tests {
253 use super::*;
254
255 struct Prng(u64);
259 impl Prng {
260 fn new(seed: u64) -> Self {
261 Self(seed.max(1))
262 }
263 fn next_u64(&mut self) -> u64 {
264 self.0 = self.0.wrapping_add(0x9e37_79b9_7f4a_7c15);
267 let mut z = self.0;
268 z = (z ^ (z >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
269 z = (z ^ (z >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
270 z ^ (z >> 31)
271 }
272 fn fill(&mut self, dst: &mut [u8]) {
273 for chunk in dst.chunks_mut(8) {
274 let bytes = self.next_u64().to_le_bytes();
275 let n = chunk.len();
276 chunk.copy_from_slice(&bytes[..n]);
277 }
278 }
279 }
280
281 #[test]
282 fn gear_table_is_unique_and_nonzero() {
283 let table = gear_table();
284 let mut seen = std::collections::HashSet::new();
285 for &v in table {
286 assert_ne!(v, 0, "gear table entry is zero");
287 assert!(seen.insert(v), "duplicate gear table entry");
288 }
289 assert_eq!(seen.len(), 256);
290 }
291
292 #[test]
293 fn determinism_same_input_same_boundaries() {
294 let cdc = FastCdc::v1();
295 let mut data = vec![0u8; 200 * 1024];
296 Prng::new(0xDEAD_BEEF).fill(&mut data);
297
298 let pass1: Vec<_> = ChunkIterator::new(cdc, &data).collect();
299 let pass2: Vec<_> = ChunkIterator::new(cdc, &data).collect();
300 assert_eq!(pass1, pass2);
301 }
302
303 #[test]
304 fn min_max_size_constraints() {
305 let cdc = FastCdc::v1();
306 let mut data = vec![0u8; 512 * 1024];
307 Prng::new(0xCAFE_BABE).fill(&mut data);
308
309 let mut total = 0usize;
310 let mut count = 0usize;
311 for b in ChunkIterator::new(cdc, &data) {
312 assert!(b.length <= MAX_SIZE);
313 if b.offset + b.length < data.len() {
315 assert!(b.length >= MIN_SIZE, "chunk under MIN_SIZE: {}", b.length);
316 }
317 total += b.length;
318 count += 1;
319 }
320 assert_eq!(total, data.len());
321 assert!(count > 1);
322 }
323
324 #[test]
325 fn small_input_is_single_chunk() {
326 let cdc = FastCdc::v1();
327 let small = b"hello, this is a tiny file";
328 assert_eq!(cdc.cut(small), small.len());
329 let boundaries: Vec<_> = ChunkIterator::new(cdc, small).collect();
330 assert_eq!(boundaries.len(), 1);
331 assert_eq!(boundaries[0].offset, 0);
332 assert_eq!(boundaries[0].length, small.len());
333 }
334
335 #[test]
336 fn empty_input_iterator_is_empty() {
337 let cdc = FastCdc::v1();
338 assert_eq!(cdc.cut(b""), 0);
339 let none: Vec<_> = ChunkIterator::new(cdc, b"").collect();
340 assert!(none.is_empty());
341 }
342
343 #[test]
344 fn cut_at_exactly_min_size_returns_full() {
345 let cdc = FastCdc::custom(1024, 4096, 16384);
346 let mut data = vec![0u8; 1024];
347 Prng::new(99).fill(&mut data);
348 assert_eq!(cdc.cut(&data), data.len());
349 }
350
351 #[test]
352 fn cut_forces_boundary_at_max_size() {
353 let cdc = FastCdc::custom(4, 8, 16);
356 let data = [0u8; 64];
357 let len = cdc.cut(&data);
358 assert!(len <= 16, "cut returned {len} > max=16");
359 }
360
361 #[test]
362 fn boundary_stability_single_byte_insert() {
363 let cdc = FastCdc::v1();
365 let mut original = vec![0u8; 64 * 1024];
366 Prng::new(0xBEEF).fill(&mut original);
367 let insert_point = 32 * 1024;
368 let mut modified = Vec::with_capacity(original.len() + 1);
369 modified.extend_from_slice(&original[..insert_point]);
370 modified.push(0xFF);
371 modified.extend_from_slice(&original[insert_point..]);
372
373 let orig_chunks: Vec<_> = ChunkIterator::new(cdc, &original).collect();
374 let mod_chunks: Vec<_> = ChunkIterator::new(cdc, &modified).collect();
375
376 let max_chunks = orig_chunks.len().max(mod_chunks.len());
377 let mut differing = 0usize;
378 for i in 0..max_chunks {
379 match (orig_chunks.get(i), mod_chunks.get(i)) {
380 (Some(o), Some(m)) => {
381 let os = &original[o.offset..o.offset + o.length];
382 let ms = &modified[m.offset..m.offset + m.length];
383 if os != ms {
384 differing += 1;
385 }
386 }
387 _ => differing += 1,
388 }
389 }
390 assert!(
391 differing <= 3,
392 "expected <=3 differing chunks, got {differing}"
393 );
394 }
395
396 #[test]
397 fn iterator_total_bytes_matches_input() {
398 let cdc = FastCdc::v1();
399 let mut data = vec![0u8; 300 * 1024];
400 Prng::new(42).fill(&mut data);
401 let total: usize = ChunkIterator::new(cdc, &data).map(|b| b.length).sum();
402 assert_eq!(total, data.len());
403 }
404
405 #[test]
406 fn different_avg_size_yields_different_boundaries() {
407 let mut data = vec![0u8; 100 * 1024];
408 Prng::new(0xABCD).fill(&mut data);
409 let small = FastCdc::custom(8 * 1024, 32 * 1024, 128 * 1024);
410 let large = FastCdc::v1();
411
412 let s: Vec<_> = ChunkIterator::new(small, &data)
413 .map(|b| b.offset + b.length)
414 .collect();
415 let l: Vec<_> = ChunkIterator::new(large, &data)
416 .map(|b| b.offset + b.length)
417 .collect();
418 assert!(
419 s != l,
420 "expected different boundaries for different avg_size"
421 );
422 }
423
424 #[test]
425 fn chunk_boundaries_helper_matches_iterator() {
426 let mut data = vec![0u8; 200 * 1024];
427 Prng::new(0xFEED_FACE).fill(&mut data);
428 let from_helper = chunk_boundaries(&data);
429 let from_iter: Vec<usize> = ChunkIterator::new(FastCdc::v1(), &data)
430 .map(|b| b.offset + b.length)
431 .collect();
432 assert_eq!(from_helper, from_iter);
433 }
434
435 const EXPECTED_GEAR_DIGEST_HEX: &str =
438 "7b238963a8bb10c4dea1bf678aa07d8c3ce94284209c440ca971ff3a97ee5ad4";
439
440 #[test]
441 fn gear_table_digest_is_stable() {
442 let hex = crate::hash::to_hex(&gear_table_digest());
445 assert_eq!(
446 hex, EXPECTED_GEAR_DIGEST_HEX,
447 "gear table digest changed; refuse to drift silently"
448 );
449 }
450
451 proptest::proptest! {
458 #[test]
463 fn proptest_determinism(data in proptest::collection::vec(proptest::num::u8::ANY, 0..256 * 1024)) {
464 let cdc = FastCdc::v1();
465 let pass1: Vec<_> = ChunkIterator::new(cdc, &data).collect();
466 let pass2: Vec<_> = ChunkIterator::new(cdc, &data).collect();
467 proptest::prop_assert_eq!(pass1, pass2);
468 }
469
470 #[test]
473 fn proptest_boundaries_partition_input(
474 data in proptest::collection::vec(proptest::num::u8::ANY, 0..256 * 1024),
475 ) {
476 let cdc = FastCdc::v1();
477 let boundaries: Vec<_> = ChunkIterator::new(cdc, &data).collect();
478 let mut expected_offset = 0usize;
479 for b in &boundaries {
480 proptest::prop_assert_eq!(b.offset, expected_offset);
481 expected_offset += b.length;
482 }
483 proptest::prop_assert_eq!(expected_offset, data.len());
484 }
485 }
486}