fastcdc/v2020/
mod.rs

1//
2// Copyright (c) 2025 Nathan Fiedler
3//
4
5//! This module implements the canonical FastCDC algorithm as described in the
6//! [paper](https://ieeexplore.ieee.org/document/9055082) by Wen Xia, et al., in
7//! 2020.
8//!
9//! The algorithm incorporates a simplified hash judgement using the fast Gear
10//! hash, sub-minimum chunk cut-point skipping, normalized chunking to produce
11//! chunks of a more consistent length, and "rolling two bytes each time".
12//! According to the authors, this should be 30-40% faster than the 2016 version
13//! while producing the same cut points. Benchmarks on several large files on an
14//! Apple M1 show about a 20% improvement, but results may vary depending on CPU
15//! architecture, file size, chunk size, etc.
16//!
17//! There are two ways in which to use the [`FastCDC`] struct defined in this
18//! module. One is to simply invoke [`cut()`](FastCDC::cut) while managing your
19//! own `start` and `remaining` values. The other is to use the struct as an
20//! [`Iterator`] that yields [`Chunk`] structs which represent the offset and
21//! size of the chunks. Note that attempting to use both [`cut()`](FastCDC::cut)
22//! and [`Iterator`] on the same [`FastCDC`] instance will yield incorrect
23//! results.
24//!
//! Note that the [`cut()`](FastCDC::cut) function returns the 64-bit hash of the chunk, which
26//! may be useful in scenarios involving chunk size prediction using historical
27//! data, such as in RapidCDC or SuperCDC. This hash value is also given in the
28//! `hash` field of the [`Chunk`] struct. While this value has rather low
29//! entropy, it is computationally cost-free and can be put to some use with
30//! additional record keeping.
31//!
32//! The [`StreamCDC`] implementation is similar to [`FastCDC`] except that it
33//! will read data from a [`Read`] into an internal buffer of `max_size` and
34//! produce [`ChunkData`] values from the [`Iterator`].
35//!
36//! ## Altering the chunking
37//!
38//! The [`FastCDC::with_level_and_seed`] and [`StreamCDC::with_level_and_seed`]
39//! functions allow for changing the GEAR table values in order to alter the
40//! chunk boundaries. If the seed is non-zero, then it will be used to perform a
41//! bitwise exclusive OR on the values in both the GEAR and left-shifted GEAR
42//! tables. Modifying the GEAR hash is useful for preventing attacks based on
43//! monitoring the chunking behavior and using that information to infer other
44//! attributes of the data that would otherwise be unknown.
45use std::fmt;
46use std::io::Read;
47
48#[cfg(any(feature = "tokio", feature = "futures"))]
49mod async_stream_cdc;
50#[cfg(any(feature = "tokio", feature = "futures"))]
51pub use async_stream_cdc::*;
52
/// Smallest acceptable value for the minimum chunk size (64 bytes).
pub const MINIMUM_MIN: u32 = 64;
/// Largest acceptable value for the minimum chunk size (1 MB).
pub const MINIMUM_MAX: u32 = 1_048_576;
/// Smallest acceptable value for the average chunk size (256 bytes).
pub const AVERAGE_MIN: u32 = 256;
/// Largest acceptable value for the average chunk size (4 MB).
pub const AVERAGE_MAX: u32 = 4_194_304;
/// Smallest acceptable value for the maximum chunk size (1 KB).
pub const MAXIMUM_MIN: u32 = 1024;
/// Largest acceptable value for the maximum chunk size (16 MB).
pub const MAXIMUM_MAX: u32 = 16_777_216;
65
//
// Masks for each of the desired number of bits, where 0 through 5 are unused.
// The values for sizes 64 bytes through 128 kilo-bytes comes from the C
// reference implementation (found in the destor repository) while the extra
// values come from the restic-FastCDC repository. The FastCDC paper claims that
// the deduplication ratio is slightly improved when the mask bits are spread
// relatively evenly, hence these seemingly "magic" values.
//
/// Hash-judgment masks indexed by the base-2 logarithm of the desired chunk
/// size; entry N has N one-bits spread across the word. Indices 0 through 4
/// are padding and never used.
pub const MASKS: [u64; 26] = [
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0x0000000001804110, // unused except for NC 3
    0x0000000001803110, // 64B
    0x0000000018035100, // 128B
    0x0000001800035300, // 256B
    0x0000019000353000, // 512B
    0x0000590003530000, // 1KB
    0x0000d90003530000, // 2KB
    0x0000d90103530000, // 4KB
    0x0000d90303530000, // 8KB
    0x0000d90313530000, // 16KB
    0x0000d90f03530000, // 32KB
    0x0000d90303537000, // 64KB
    0x0000d90703537000, // 128KB
    0x0000d90707537000, // 256KB
    0x0000d91707537000, // 512KB
    0x0000d91747537000, // 1MB
    0x0000d91767537000, // 2MB
    0x0000d93767537000, // 4MB
    0x0000d93777537000, // 8MB
    0x0000d93777577000, // 16MB
    0x0000db3777577000, // unused except for NC 3
];
102
//
// GEAR contains seemingly random numbers which are created by computing the
// MD5 digest of values from 0 to 255, using only the high 8 bytes of the 16
// byte digest. This is the "gear hash" referred to in the FastCDC paper.
//
// The program to produce this table is named table64.rs in examples.
//
// Applied to odd byte offsets by `cut_gear`; the left-shifted twin below is
// applied to even offsets.
//
#[rustfmt::skip]
const GEAR: [u64; 256] = [
    0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7,
    0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649,
    0xaa329b29e08eb499, 0xb67fcbd21e577d58, 0x0027baaada2acf6b, 0xe3ef2d5ac73c2226,
    0x0890f24d6ed312b7, 0xa809e036851d7c7e, 0xf0a6fe5e0013d81b, 0x1d026304452cec14,
    0x03864632648e248f, 0xcdaacf3dcd92b9b4, 0xf5e012e63c187856, 0x8862f9d3821c00b6,
    0xa82f7338750f6f8a, 0x1e583dc6c1cb0b6f, 0x7a3145b69743a7f1, 0xabb20fee404807eb,
    0xb14b3cfe07b83a5d, 0xb9dc27898adb9a0f, 0x3703f5e91baa62be, 0xcf0bb866815f7d98,
    0x3d9867c41ea9dcd3, 0x1be1fa65442bf22c, 0x14300da4c55631d9, 0xe698e9cbc6545c99,
    0x4763107ec64e92a5, 0xc65821fc65696a24, 0x76196c064822f0b7, 0x485be841f3525e01,
    0xf652bc9c85974ff5, 0xcad8352face9e3e9, 0x2a6ed1dceb35e98e, 0xc6f483badc11680f,
    0x3cfd8c17e9cf12f1, 0x89b83c5e2ea56471, 0xae665cfd24e392a9, 0xec33c4e504cb8915,
    0x3fb9b15fc9fe7451, 0xd7fd1fd1945f2195, 0x31ade0853443efd8, 0x255efc9863e1e2d2,
    0x10eab6008d5642cf, 0x46f04863257ac804, 0xa52dc42a789a27d3, 0xdaaadf9ce77af565,
    0x6b479cd53d87febb, 0x6309e2d3f93db72f, 0xc5738ffbaa1ff9d6, 0x6bd57f3f25af7968,
    0x67605486d90d0a4a, 0xe14d0b9663bfbdae, 0xb7bbd8d816eb0414, 0xdef8a4f16b35a116,
    0xe7932d85aaaffed6, 0x08161cbae90cfd48, 0x855507beb294f08b, 0x91234ea6ffd399b2,
    0xad70cf4b2435f302, 0xd289a97565bc2d27, 0x8e558437ffca99de, 0x96d2704b7115c040,
    0x0889bbcdfc660e41, 0x5e0d4e67dc92128d, 0x72a9f8917063ed97, 0x438b69d409e016e3,
    0xdf4fed8a5d8a4397, 0x00f41dcf41d403f7, 0x4814eb038e52603f, 0x9dafbacc58e2d651,
    0xfe2f458e4be170af, 0x4457ec414df6a940, 0x06e62f1451123314, 0xbd1014d173ba92cc,
    0xdef318e25ed57760, 0x9fea0de9dfca8525, 0x459de1e76c20624b, 0xaeec189617e2d666,
    0x126a2c06ab5a83cb, 0xb1321532360f6132, 0x65421503dbb40123, 0x2d67c287ea089ab3,
    0x6c93bff5a56bd6b6, 0x4ffb2036cab6d98d, 0xce7b785b1be7ad4f, 0xedb42ef6189fd163,
    0xdc905288703988f6, 0x365f9c1d2c691884, 0xc640583680d99bfe, 0x3cd4624c07593ec6,
    0x7f1ea8d85d7c5805, 0x014842d480b57149, 0x0b649bcb5a828688, 0xbcd5708ed79b18f0,
    0xe987c862fbd2f2f0, 0x982731671f0cd82c, 0xbaf13e8b16d8c063, 0x8ea3109cbd951bba,
    0xd141045bfb385cad, 0x2acbc1a0af1f7d30, 0xe6444d89df03bfdf, 0xa18cc771b8188ff9,
    0x9834429db01c39bb, 0x214add07fe086a1f, 0x8f07c19b1f6b3ff9, 0x56a297b1bf4ffe55,
    0x94d558e493c54fc7, 0x40bfc24c764552cb, 0x931a706f8a8520cb, 0x32229d322935bd52,
    0x2560d0f5dc4fefaf, 0x9dbcc48355969bb6, 0x0fd81c3985c0b56a, 0xe03817e1560f2bda,
    0xc1bb4f81d892b2d5, 0xb0c4864f4e28d2d7, 0x3ecc49f9d9d6c263, 0x51307e99b52ba65e,
    0x8af2b688da84a752, 0xf5d72523b91b20b6, 0x6d95ff1ff4634806, 0x562f21555458339a,
    0xc0ce47f889336346, 0x487823e5089b40d8, 0xe4727c7ebc6d9592, 0x5a8f7277e94970ba,
    0xfca2f406b1c8bb50, 0x5b1f8a95f1791070, 0xd304af9fc9028605, 0x5440ab7fc930e748,
    0x312d25fbca2ab5a1, 0x10f4a4b234a4d575, 0x90301d55047e7473, 0x3b6372886c61591e,
    0x293402b77c444e06, 0x451f34a4d3e97dd7, 0x3158d814d81bc57b, 0x034942425b9bda69,
    0xe2032ff9e532d9bb, 0x62ae066b8b2179e5, 0x9545e10c2f8d71d8, 0x7ff7483eb2d23fc0,
    0x00945fcebdc98d86, 0x8764bbbe99b26ca2, 0x1b1ec62284c0bfc3, 0x58e0fcc4f0aa362b,
    0x5f4abefa878d458d, 0xfd74ac2f9607c519, 0xa4e3fb37df8cbfa9, 0xbf697e43cac574e5,
    0x86f14a3f68f4cd53, 0x24a23d076f1ce522, 0xe725cd8048868cc8, 0xbf3c729eb2464362,
    0xd8f6cd57b3cc1ed8, 0x6329e52425541577, 0x62aa688ad5ae1ac0, 0x0a242566269bf845,
    0x168b1a4753aca74b, 0xf789afefff2e7e3c, 0x6c3362093b6fccdb, 0x4ce8f50bd28c09b2,
    0x006a2db95ae8aa93, 0x975b0d623c3d1a8c, 0x18605d3935338c5b, 0x5bb6f6136cad3c71,
    0x0f53a20701f8d8a6, 0xab8c5ad2e7e93c67, 0x40b5ac5127acaa29, 0x8c7bf63c2075895f,
    0x78bd9f7e014a805c, 0xb2c9e9f4f9c8c032, 0xefd6049827eb91f3, 0x2be459f482c16fbd,
    0xd92ce0c5745aaa8c, 0x0aaa8fb298d965b9, 0x2b37f92c6c803b15, 0x8c54a5e94e0f0e78,
    0x95f9b6e90c0a3032, 0xe7939faa436c7874, 0xd16bfe8f6a8a40c9, 0x44982b86263fd2fa,
    0xe285fb39f984e583, 0x779a8df72d7619d3, 0xf2d79a8de8d5dd1e, 0xd1037354d66684e2,
    0x004c82a4e668a8e5, 0x31d40a7668b044e6, 0xd70578538bd02c11, 0xdb45431078c5f482,
    0x977121bb7f6a51ad, 0x73d5ccbd34eff8dd, 0xe437a07d356e17cd, 0x47b2782043c95627,
    0x9fb251413e41d49a, 0xccd70b60652513d3, 0x1c95b31e8a1b49b2, 0xcae73dfd1bcb4c1b,
    0x34d98331b1f5b70f, 0x784e39f22338d92f, 0x18613d4a064df420, 0xf1d8dae25f0bcebe,
    0x33f77c15ae855efc, 0x3c88b3b912eb109c, 0x956a2ec96bafeea5, 0x1aa005b5e0ad0e87,
    0x5500d70527c4bb8e, 0xe36c57196421cc44, 0x13c4d286cc36ee39, 0x5654a23d818b2a81,
    0x77b1dc13d161abdc, 0x734f44de5f8d5eb5, 0x60717e174a6c89a2, 0xd47d9649266a211e,
    0x5b13a4322bb69e90, 0xf7669609f8b5fc3c, 0x21e6ac55bedcdac9, 0x9b56b62b61166dea,
    0xf48f66b939797e9c, 0x35f332f9c0e6ae9a, 0xcc733f6a9a878db0, 0x3da161e41cc108c2,
    0xb7d74ae535914d51, 0x4d493b0b11d36469, 0xce264d1dfba9741a, 0xa9d1f2dc7436dc06,
    0x70738016604c2a27, 0x231d36e96e93f3d5, 0x7666881197838d19, 0x4a2a83090aaad40c,
    0xf1e761591668b35d, 0x7363236497f730a7, 0x301080e37379dd4d, 0x502dea2971827042,
    0xc2c5eb858f32625f, 0x786afb9edfafbdff, 0xdaee0d868490b2a4, 0x617366b3268609f6,
    0xae0e35a0fe46173e, 0xd1a07de93e824f11, 0x079b8b115ea4cca8, 0x93a99274558faebb,
    0xfb1e6e22e08a03b3, 0xea635fdba3698dd0, 0xcf53659328503a5c, 0xcde3b31e6fd5d780,
    0x8e3e4221d3614413, 0xef14d0d86bf1a22c, 0xe1d830d3f16c5ddb, 0xaabd2b2a451504e1
];
177
//
// GEAR table in which all values have been shifted left 1 bit, as per the
// FastCDC 2020 paper, section 3.7. Pre-computing the shifted table is what
// allows the "rolling two bytes each time" optimization.
//
// The program to produce this table is named table64ls.rs in examples.
//
#[rustfmt::skip]
const GEAR_LS: [u64; 256] = [
    0x76ba78fa40fc6fb8, 0xf09ad1752224610c, 0x9aa5101f105ce530, 0xd59f1c9c33fb994e,
    0x863e70bbf7a2c656, 0x3abe4e003c4b57cc, 0x062617bc7935b322, 0x89644acedd36ec92,
    0x54653653c11d6932, 0x6cff97a43caefab0, 0x004f7555b4559ed6, 0xc7de5ab58e78444c,
    0x1121e49adda6256e, 0x5013c06d0a3af8fc, 0xe14dfcbc0027b036, 0x3a04c6088a59d828,
    0x070c8c64c91c491e, 0x9b559e7b9b257368, 0xebc025cc7830f0ac, 0x10c5f3a70438016c,
    0x505ee670ea1edf14, 0x3cb07b8d839616de, 0xf4628b6d2e874fe2, 0x57641fdc80900fd6,
    0x629679fc0f7074ba, 0x73b84f1315b7341e, 0x6e07ebd23754c57c, 0x9e1770cd02befb30,
    0x7b30cf883d53b9a6, 0x37c3f4ca8857e458, 0x28601b498aac63b2, 0xcd31d3978ca8b932,
    0x8ec620fd8c9d254a, 0x8cb043f8cad2d448, 0xec32d80c9045e16e, 0x90b7d083e6a4bc02,
    0xeca579390b2e9fea, 0x95b06a5f59d3c7d2, 0x54dda3b9d66bd31c, 0x8de90775b822d01e,
    0x79fb182fd39e25e2, 0x137078bc5d4ac8e2, 0x5cccb9fa49c72552, 0xd86789ca0997122a,
    0x7f7362bf93fce8a2, 0xaffa3fa328be432a, 0x635bc10a6887dfb0, 0x4abdf930c7c3c5a4,
    0x21d56c011aac859e, 0x8de090c64af59008, 0x4a5b8854f1344fa6, 0xb555bf39cef5eaca,
    0xd68f39aa7b0ffd76, 0xc613c5a7f27b6e5e, 0x8ae71ff7543ff3ac, 0xd7aafe7e4b5ef2d0,
    0xcec0a90db21a1494, 0xc29a172cc77f7b5c, 0x6f77b1b02dd60828, 0xbdf149e2d66b422c,
    0xcf265b0b555ffdac, 0x102c3975d219fa90, 0x0aaa0f7d6529e116, 0x22469d4dffa73364,
    0x5ae19e96486be604, 0xa51352eacb785a4e, 0x1cab086fff9533bc, 0x2da4e096e22b8080,
    0x1113779bf8cc1c82, 0xbc1a9ccfb924251a, 0xe553f122e0c7db2e, 0x8716d3a813c02dc6,
    0xbe9fdb14bb14872e, 0x01e83b9e83a807ee, 0x9029d6071ca4c07e, 0x3b5f7598b1c5aca2,
    0xfc5e8b1c97c2e15e, 0x88afd8829bed5280, 0x0dcc5e28a2246628, 0x7a2029a2e7752598,
    0xbde631c4bdaaeec0, 0x3fd41bd3bf950a4a, 0x8b3bc3ced840c496, 0x5dd8312c2fc5accc,
    0x24d4580d56b50796, 0x62642a646c1ec264, 0xca842a07b7680246, 0x5acf850fd4113566,
    0xd9277feb4ad7ad6c, 0x9ff6406d956db31a, 0x9cf6f0b637cf5a9e, 0xdb685dec313fa2c6,
    0xb920a510e07311ec, 0x6cbf383a58d23108, 0x8c80b06d01b337fc, 0x79a8c4980eb27d8c,
    0xfe3d51b0baf8b00a, 0x029085a9016ae292, 0x16c93796b5050d10, 0x79aae11daf3631e0,
    0xd30f90c5f7a5e5e0, 0x304e62ce3e19b058, 0x75e27d162db180c6, 0x1d4621397b2a3774,
    0xa28208b7f670b95a, 0x559783415e3efa60, 0xcc889b13be077fbe, 0x43198ee370311ff2,
    0x3068853b60387376, 0x4295ba0ffc10d43e, 0x1e0f83363ed67ff2, 0xad452f637e9ffcaa,
    0x29aab1c9278a9f8e, 0x817f8498ec8aa596, 0x2634e0df150a4196, 0x64453a64526b7aa4,
    0x4ac1a1ebb89fdf5e, 0x3b798906ab2d376c, 0x1fb038730b816ad4, 0xc0702fc2ac1e57b4,
    0x83769f03b12565aa, 0x61890c9e9c51a5ae, 0x7d9893f3b3ad84c6, 0xa260fd336a574cbc,
    0x15e56d11b5094ea4, 0xebae4a477236416c, 0xdb2bfe3fe8c6900c, 0xac5e42aaa8b06734,
    0x819c8ff11266c68c, 0x90f047ca113681b0, 0xc8e4f8fd78db2b24, 0xb51ee4efd292e174,
    0xf945e80d639176a0, 0xb63f152be2f220e0, 0xa6095f3f92050c0a, 0xa88156ff9261ce90,
    0x625a4bf794556b42, 0x21e949646949aaea, 0x20603aaa08fce8e6, 0x76c6e510d8c2b23c,
    0x5268056ef8889c0c, 0x8a3e6949a7d2fbae, 0x62b1b029b0378af6, 0x06928484b737b4d2,
    0xc4065ff3ca65b376, 0xc55c0cd71642f3ca, 0x2a8bc2185f1ae3b0, 0xffee907d65a47f80,
    0x0128bf9d7b931b0c, 0x0ec9777d3364d944, 0x363d8c4509817f86, 0xb1c1f989e1546c56,
    0xbe957df50f1a8b1a, 0xfae9585f2c0f8a32, 0x49c7f66fbf197f52, 0x7ed2fc87958ae9ca,
    0x0de2947ed1e99aa6, 0x49447a0ede39ca44, 0xce4b9b00910d1990, 0x7e78e53d648c86c4,
    0xb1ed9aaf67983db0, 0xc653ca484aa82aee, 0xc554d115ab5c3580, 0x14484acc4d37f08a,
    0x2d16348ea7594e96, 0xef135fdffe5cfc78, 0xd866c41276df99b6, 0x99d1ea17a5181364,
    0x00d45b72b5d15526, 0x2eb61ac4787a3518, 0x30c0ba726a6718b6, 0xb76dec26d95a78e2,
    0x1ea7440e03f1b14c, 0x5718b5a5cfd278ce, 0x816b58a24f595452, 0x18f7ec7840eb12be,
    0xf17b3efc029500b8, 0x6593d3e9f3918064, 0xdfac09304fd723e6, 0x57c8b3e90582df7a,
    0xb259c18ae8b55518, 0x15551f6531b2cb72, 0x566ff258d900762a, 0x18a94bd29c1e1cf0,
    0x2bf36dd218146064, 0xcf273f5486d8f0e8, 0xa2d7fd1ed5148192, 0x8930570c4c7fa5f4,
    0xc50bf673f309cb06, 0xef351bee5aec33a6, 0xe5af351bd1abba3c, 0xa206e6a9accd09c4,
    0x00990549ccd151ca, 0x63a814ecd16089cc, 0xae0af0a717a05822, 0xb68a8620f18be904,
    0x2ee24376fed4a35a, 0xe7ab997a69dff1ba, 0xc86f40fa6adc2f9a, 0x8f64f0408792ac4e,
    0x3f64a2827c83a934, 0x99ae16c0ca4a27a6, 0x392b663d14369364, 0x95ce7bfa37969836,
    0x69b3066363eb6e1e, 0xf09c73e44671b25e, 0x30c27a940c9be840, 0xe3b1b5c4be179d7c,
    0x67eef82b5d0abdf8, 0x7911677225d62138, 0x2ad45d92d75fdd4a, 0x35400b6bc15a1d0e,
    0xaa01ae0a4f89771c, 0xc6d8ae32c8439888, 0x2789a50d986ddc72, 0xaca9447b03165502,
    0xef63b827a2c357b8, 0xe69e89bcbf1abd6a, 0xc0e2fc2e94d91344, 0xa8fb2c924cd4423c,
    0xb6274864576d3d20, 0xeecd2c13f16bf878, 0x43cd58ab7db9b592, 0x36ad6c56c22cdbd4,
    0xe91ecd7272f2fd38, 0x6be665f381cd5d34, 0x98e67ed5350f1b60, 0x7b42c3c839821184,
    0x6fae95ca6b229aa2, 0x9a92761623a6c8d2, 0x9c4c9a3bf752e834, 0x53a3e5b8e86db80c,
    0xe0e7002cc098544e, 0x463a6dd2dd27e7aa, 0xeccd10232f071a32, 0x945506121555a818,
    0xe3cec2b22cd166ba, 0xe6c646c92fee614e, 0x602101c6e6f3ba9a, 0xa05bd452e304e084,
    0x858bd70b1e64c4be, 0xf0d5f73dbf5f7bfe, 0xb5dc1b0d09216548, 0xc2e6cd664d0c13ec,
    0x5c1c6b41fc8c2e7c, 0xa340fbd27d049e22, 0x0f371622bd499950, 0x275324e8ab1f5d76,
    0xf63cdc45c1140766, 0xd4c6bfb746d31ba0, 0x9ea6cb2650a074b8, 0x9bc7663cdfabaf00,
    0x1c7c8443a6c28826, 0xde29a1b0d7e34458, 0xc3b061a7e2d8bbb6, 0x557a56548a2a09c2
];
251
252///
253/// Produce the GEAR table, and the left-shifted version, as heap-owned values.
254///
255/// This will copy the original GEAR table, and its left-shifted twin, and
256/// peform a bitwise exclusive OR on the values using the given seed. If the
257/// seed is zero, no computation is performed.
258///
259pub fn get_gear_with_seed(seed: u64) -> (Box<[u64; 256]>, Box<[u64; 256]>) {
260    let mut gear = Box::new(GEAR);
261    let mut gear_ls = Box::new(GEAR_LS);
262    if seed > 0 {
263        for v in &mut *gear {
264            *v ^= seed
265        }
266        let seed_ls = seed << 1;
267        for v in &mut *gear_ls {
268            *v ^= seed_ls
269        }
270    }
271    (gear, gear_ls)
272}
273
274///
275/// Find the next chunk cut point in the source using the original GEAR tables.
276/// 
277/// See the `v2020_cut` example for a lengthy example of using this function.
278///
279pub fn cut(
280    source: &[u8],
281    min_size: usize,
282    avg_size: usize,
283    max_size: usize,
284    mask_s: u64,
285    mask_l: u64,
286    mask_s_ls: u64,
287    mask_l_ls: u64,
288) -> (u64, usize) {
289    cut_gear(
290        source, min_size, avg_size, max_size, mask_s, mask_l, mask_s_ls, mask_l_ls, &GEAR, &GEAR_LS,
291    )
292}
293
///
/// Find the next chunk cut point in the source using the given GEAR tables.
///
/// Returns the gear hash as of the cut point and the length of the chunk. As
/// a special case, when the source is no longer than `min_size` no hashing is
/// performed and the result is `(0, source.len())`.
///
#[allow(clippy::too_many_arguments)]
pub fn cut_gear(
    source: &[u8],
    min_size: usize,
    avg_size: usize,
    max_size: usize,
    mask_s: u64,
    mask_l: u64,
    mask_s_ls: u64,
    mask_l_ls: u64,
    gear: &[u64; 256],
    gear_ls: &[u64; 256],
) -> (u64, usize) {
    let mut limit = source.len();
    // Sub-minimum chunks are never hashed; take whatever is left as-is.
    if limit <= min_size {
        return (0, limit);
    }
    // The "normal" point divides the strict-mask phase from the lenient-mask
    // phase (normalized chunking); clamp the scan limit to the maximum chunk
    // size and the normal point to the data actually available.
    let mut normal = avg_size;
    if limit > max_size {
        limit = max_size;
    } else if limit < normal {
        normal = limit;
    }
    let mut fingerprint: u64 = 0;
    // Cut-point skipping: start scanning at the minimum size. Each iteration
    // consumes two bytes ("rolling two bytes each time", paper section 3.7),
    // so positions are tracked in units of byte pairs.
    let mut pair = min_size / 2;
    // Phase 1 uses the stricter small-size masks up to the normal point;
    // phase 2 continues with the easier large-size masks.
    for (end, even_mask, odd_mask) in [
        (normal / 2, mask_s_ls, mask_s),
        (limit / 2, mask_l_ls, mask_l),
    ] {
        while pair < end {
            let pos = pair * 2;
            // Even offset: fold in via the left-shifted table.
            fingerprint = (fingerprint << 2).wrapping_add(gear_ls[source[pos] as usize]);
            if fingerprint & even_mask == 0 {
                return (fingerprint, pos);
            }
            // Odd offset: fold in via the plain table.
            fingerprint = fingerprint.wrapping_add(gear[source[pos + 1] as usize]);
            if fingerprint & odd_mask == 0 {
                return (fingerprint, pos + 1);
            }
            pair += 1;
        }
    }
    // If all else fails, return the largest chunk. This will happen with
    // pathological data, such as all zeroes.
    (fingerprint, limit)
}
350
///
/// The level for the normalized chunking used by FastCDC.
///
/// Normalized chunking "generates chunks whose sizes are normalized to a
/// specified region centered at the expected chunk size," as described in
/// section 4.4 of the FastCDC 2016 paper.
///
/// Note that lower levels of normalization will result in a larger range of
/// generated chunk sizes. It may be beneficial to widen the minimum/maximum
/// chunk size values given to the [`FastCDC`] constructor in that case.
///
/// Note that higher levels of normalization may result in the final chunk of
/// data being smaller than the minimum chunk size, which results in a hash
/// value of zero since no calculations are performed for sub-minimum chunks.
///
#[derive(Copy, Clone, Debug)]
pub enum Normalization {
    /// No chunk size normalization, produces a wide range of chunk sizes.
    Level0,
    /// Level 1 normalization, in which fewer chunks are outside of the desired range.
    Level1,
    /// Level 2 normalization, where most chunks are of the desired size.
    Level2,
    /// Level 3 normalization, nearly all chunks are the desired size.
    Level3,
}

impl Normalization {
    /// The number of mask bits added to (or removed from) the base mask index
    /// for this normalization level.
    pub fn bits(&self) -> u32 {
        match *self {
            Normalization::Level0 => 0,
            Normalization::Level1 => 1,
            Normalization::Level2 => 2,
            Normalization::Level3 => 3,
        }
    }
}

impl fmt::Display for Normalization {
    /// Formats the level as its numeric value (0 through 3).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.bits(), f)
    }
}
394
///
/// Represents a chunk returned from the [`FastCDC`] iterator.
///
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct Chunk {
    /// The gear hash value as of the end of the chunk.
    ///
    /// This is zero for a final chunk shorter than the minimum size, since no
    /// hashing is performed for sub-minimum chunks.
    pub hash: u64,
    /// Starting byte position within the source.
    pub offset: usize,
    /// Length of the chunk in bytes.
    pub length: usize,
}
407
///
/// The FastCDC chunker implementation from 2020.
///
/// Use `new` to construct an instance, and then iterate over the [`Chunk`]s via
/// the [`Iterator`] trait.
///
/// This example reads a file into memory and splits it into chunks that are
/// roughly 16 KB in size. The minimum and maximum sizes are the absolute limit
/// on the returned chunk sizes. With this algorithm, it is helpful to be more
/// lenient on the maximum chunk size as the results are highly dependent on the
/// input data. Changing the minimum chunk size will affect the results as the
/// algorithm may find different cut points given it uses the minimum as a
/// starting point (cut-point skipping).
///
/// ```no_run
/// use std::fs;
/// use fastcdc::v2020;
/// let contents = fs::read("test/fixtures/SekienAkashita.jpg").unwrap();
/// let chunker = v2020::FastCDC::new(&contents, 8192, 16384, 65535);
/// for entry in chunker {
///     println!("offset={} size={}", entry.offset, entry.length);
/// }
/// ```
///
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FastCDC<'a> {
    /// The source data being chunked.
    source: &'a [u8],
    /// Number of bytes already emitted as chunks (iterator cursor).
    processed: usize,
    /// Number of bytes of `source` not yet emitted.
    remaining: usize,
    /// Minimum chunk size, in bytes.
    min_size: usize,
    /// Desired average chunk size, in bytes.
    avg_size: usize,
    /// Maximum chunk size, in bytes.
    max_size: usize,
    /// Mask used before the normal point (more one-bits, harder to match).
    mask_s: u64,
    /// Mask used after the normal point (fewer one-bits, easier to match).
    mask_l: u64,
    /// `mask_s << 1`, applied at even byte offsets with the shifted table.
    mask_s_ls: u64,
    /// `mask_l << 1`, applied at even byte offsets with the shifted table.
    mask_l_ls: u64,
    /// GEAR table, possibly XOR'd with a seed.
    gear: Box<[u64; 256]>,
    /// Left-shifted GEAR table, possibly XOR'd with a shifted seed.
    gear_ls: Box<[u64; 256]>,
}
447
448impl<'a> FastCDC<'a> {
449    ///
450    /// Construct a [`FastCDC`] that will process the given slice of bytes.
451    ///
452    /// Uses chunk size normalization level 1 by default.
453    ///
454    pub fn new(source: &'a [u8], min_size: u32, avg_size: u32, max_size: u32) -> Self {
455        FastCDC::with_level(source, min_size, avg_size, max_size, Normalization::Level1)
456    }
457
458    ///
459    /// Create a new [`FastCDC`] with the given normalization level.
460    ///
461    pub fn with_level(
462        source: &'a [u8],
463        min_size: u32,
464        avg_size: u32,
465        max_size: u32,
466        level: Normalization,
467    ) -> Self {
468        FastCDC::with_level_and_seed(source, min_size, avg_size, max_size, level, 0)
469    }
470
471    ///
472    /// Create a new [`FastCDC`] with the given normalization level and seed to
473    /// be XOR'd with the values in the gear tables.
474    ///
475    pub fn with_level_and_seed(
476        source: &'a [u8],
477        min_size: u32,
478        avg_size: u32,
479        max_size: u32,
480        level: Normalization,
481        seed: u64,
482    ) -> Self {
483        assert!(min_size >= MINIMUM_MIN);
484        assert!(min_size <= MINIMUM_MAX);
485        assert!(avg_size >= AVERAGE_MIN);
486        assert!(avg_size <= AVERAGE_MAX);
487        assert!(max_size >= MAXIMUM_MIN);
488        assert!(max_size <= MAXIMUM_MAX);
489        let bits = logarithm2(avg_size);
490        let normalization = level.bits();
491        let mask_s = MASKS[(bits + normalization) as usize];
492        let mask_l = MASKS[(bits - normalization) as usize];
493        let (gear, gear_ls) = get_gear_with_seed(seed);
494        Self {
495            source,
496            processed: 0,
497            remaining: source.len(),
498            min_size: min_size as usize,
499            avg_size: avg_size as usize,
500            max_size: max_size as usize,
501            mask_s,
502            mask_l,
503            mask_s_ls: mask_s << 1,
504            mask_l_ls: mask_l << 1,
505            gear,
506            gear_ls,
507        }
508    }
509
510    ///
511    /// Find the next cut point in the data, where `start` is the position from
512    /// which to start processing the source data, and `remaining` are the
513    /// number of bytes left to be processed.
514    ///
515    /// The returned 2-tuple consists of the 64-bit hash (fingerprint) and the
516    /// byte offset of the end of the chunk. Note that the hash values may
517    /// differ from those produced by the v2016 chunker.
518    ///
519    /// There is a special case in which the remaining bytes are less than the
520    /// minimum chunk size, at which point this function returns a hash of 0 and
521    /// the cut point is the end of the source data.
522    ///
523    pub fn cut(&self, start: usize, remaining: usize) -> (u64, usize) {
524        let end = start + remaining;
525        let (hash, count) = cut_gear(
526            &self.source[start..end],
527            self.min_size,
528            self.avg_size,
529            self.max_size,
530            self.mask_s,
531            self.mask_l,
532            self.mask_s_ls,
533            self.mask_l_ls,
534            &self.gear,
535            &self.gear_ls,
536        );
537        (hash, start + count)
538    }
539}
540
541impl Iterator for FastCDC<'_> {
542    type Item = Chunk;
543
544    fn next(&mut self) -> Option<Chunk> {
545        if self.remaining == 0 {
546            None
547        } else {
548            let (hash, cutpoint) = self.cut(self.processed, self.remaining);
549            if cutpoint == 0 {
550                None
551            } else {
552                let offset = self.processed;
553                let length = cutpoint - offset;
554                self.processed += length;
555                self.remaining -= length;
556                Some(Chunk {
557                    hash,
558                    offset,
559                    length,
560                })
561            }
562        }
563    }
564
565    fn size_hint(&self) -> (usize, Option<usize>) {
566        // NOTE: This intentionally returns the upper bound for both `size_hint`
567        // values, as the upper bound doesn't actually seem to get used by `std`
568        // and using the actual lower bound is practically guaranteed to require
569        // a second capacity growth.
570        let upper_bound = self.remaining / self.min_size;
571        (upper_bound, Some(upper_bound))
572    }
573}
574
///
/// The error type returned from the [`StreamCDC`] iterator.
///
#[derive(Debug)]
pub enum Error {
    /// End of source data reached.
    Empty,
    /// An I/O error occurred.
    IoError(std::io::Error),
    /// Something unexpected happened.
    Other(String),
}

impl fmt::Display for Error {
    /// Formats as "chunker error: " followed by the debug representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "chunker error: {:?}", self)
    }
}

impl std::error::Error for Error {}

impl From<std::io::Error> for Error {
    /// Wraps an I/O error in the [`Error::IoError`] variant.
    fn from(source: std::io::Error) -> Self {
        Error::IoError(source)
    }
}

impl From<Error> for std::io::Error {
    /// Converts back to an I/O error, unwrapping [`Error::IoError`] directly.
    fn from(error: Error) -> Self {
        match error {
            Error::Empty => std::io::Error::from(std::io::ErrorKind::UnexpectedEof),
            Error::IoError(inner) => inner,
            Error::Other(message) => std::io::Error::other(message),
        }
    }
}
611
///
/// Represents a chunk returned from the [`StreamCDC`] iterator.
///
/// Unlike [`Chunk`], this owns a copy of the source bytes, since the streaming
/// chunker's internal buffer is reused between reads.
///
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct ChunkData {
    /// The gear hash value as of the end of the chunk.
    pub hash: u64,
    /// Starting byte position within the source.
    pub offset: u64,
    /// Length of the chunk in bytes.
    pub length: usize,
    /// Source bytes contained in this chunk.
    pub data: Vec<u8>,
}
626
///
/// The FastCDC chunker implementation from 2020 with streaming support.
///
/// Use `new` to construct an instance, and then iterate over the [`ChunkData`]s
/// via the [`Iterator`] trait.
///
/// Note that this struct allocates a [`Vec<u8>`] of `max_size` bytes to act as a
/// buffer when reading from the source and finding chunk boundaries.
///
/// ```no_run
/// # use std::fs::File;
/// # use fastcdc::v2020::StreamCDC;
/// let source = File::open("test/fixtures/SekienAkashita.jpg").unwrap();
/// let chunker = StreamCDC::new(source, 4096, 16384, 65535);
/// for result in chunker {
///     let chunk = result.unwrap();
///     println!("offset={} length={}", chunk.offset, chunk.length);
/// }
/// ```
///
pub struct StreamCDC<R: Read> {
    /// Buffer of data from source for finding cut points.
    buffer: Vec<u8>,
    /// Maximum capacity of the buffer (always `max_size`).
    capacity: usize,
    /// Number of relevant bytes in the `buffer`.
    length: usize,
    /// Source from which data is read into `buffer`.
    source: R,
    /// Number of bytes read from the source so far.
    processed: u64,
    /// True when the source produces no more data.
    eof: bool,
    /// Minimum chunk size, in bytes.
    min_size: usize,
    /// Desired average chunk size, in bytes.
    avg_size: usize,
    /// Maximum chunk size, in bytes.
    max_size: usize,
    /// Mask used before the normal point (more one-bits, harder to match).
    mask_s: u64,
    /// Mask used after the normal point (fewer one-bits, easier to match).
    mask_l: u64,
    /// `mask_s << 1`, applied at even byte offsets with the shifted table.
    mask_s_ls: u64,
    /// `mask_l << 1`, applied at even byte offsets with the shifted table.
    mask_l_ls: u64,
    /// GEAR table, possibly XOR'd with a seed.
    gear: Box<[u64; 256]>,
    /// Left-shifted GEAR table, possibly XOR'd with a shifted seed.
    gear_ls: Box<[u64; 256]>,
}
670
671impl<R: Read> StreamCDC<R> {
672    ///
673    /// Construct a [`StreamCDC`] that will process bytes from the given source.
674    ///
675    /// Uses chunk size normalization level 1 by default.
676    ///
677    pub fn new(source: R, min_size: u32, avg_size: u32, max_size: u32) -> Self {
678        StreamCDC::with_level(source, min_size, avg_size, max_size, Normalization::Level1)
679    }
680
681    ///
682    /// Create a new [`StreamCDC`] with the given normalization level.
683    ///
684    pub fn with_level(
685        source: R,
686        min_size: u32,
687        avg_size: u32,
688        max_size: u32,
689        level: Normalization,
690    ) -> Self {
691        StreamCDC::with_level_and_seed(source, min_size, avg_size, max_size, level, 0)
692    }
693
694    ///
695    /// Create a new [`StreamCDC`] with the given normalization level and hash seed.
696    ///
697    pub fn with_level_and_seed(
698        source: R,
699        min_size: u32,
700        avg_size: u32,
701        max_size: u32,
702        level: Normalization,
703        seed: u64,
704    ) -> Self {
705        assert!(min_size >= MINIMUM_MIN);
706        assert!(min_size <= MINIMUM_MAX);
707        assert!(avg_size >= AVERAGE_MIN);
708        assert!(avg_size <= AVERAGE_MAX);
709        assert!(max_size >= MAXIMUM_MIN);
710        assert!(max_size <= MAXIMUM_MAX);
711        let bits = logarithm2(avg_size);
712        let normalization = level.bits();
713        let mask_s = MASKS[(bits + normalization) as usize];
714        let mask_l = MASKS[(bits - normalization) as usize];
715        let (gear, gear_ls) = get_gear_with_seed(seed);
716        Self {
717            buffer: vec![0_u8; max_size as usize],
718            capacity: max_size as usize,
719            length: 0,
720            source,
721            eof: false,
722            processed: 0,
723            min_size: min_size as usize,
724            avg_size: avg_size as usize,
725            max_size: max_size as usize,
726            mask_s,
727            mask_l,
728            mask_s_ls: mask_s << 1,
729            mask_l_ls: mask_l << 1,
730            gear,
731            gear_ls,
732        }
733    }
734
735    /// Fill the buffer with data from the source, returning the number of bytes
736    /// read (zero if end of source has been reached).
737    fn fill_buffer(&mut self) -> Result<usize, Error> {
738        // this code originally copied from asuran crate
739        if self.eof {
740            Ok(0)
741        } else {
742            let mut all_bytes_read = 0;
743            while !self.eof && self.length < self.capacity {
744                let bytes_read = self.source.read(&mut self.buffer[self.length..])?;
745                if bytes_read == 0 {
746                    self.eof = true;
747                } else {
748                    self.length += bytes_read;
749                    all_bytes_read += bytes_read;
750                }
751            }
752            Ok(all_bytes_read)
753        }
754    }
755
756    /// Drains a specified number of bytes from the buffer, then resizes the
757    /// buffer back to `capacity` size in preparation for further reads.
758    fn drain_bytes(&mut self, count: usize) -> Result<Vec<u8>, Error> {
759        // this code originally copied from asuran crate
760        if count > self.length {
761            Err(Error::Other(format!(
762                "drain_bytes() called with count larger than length: {} > {}",
763                count, self.length
764            )))
765        } else {
766            let data = self.buffer.drain(..count).collect::<Vec<u8>>();
767            self.length -= count;
768            self.buffer.resize(self.capacity, 0_u8);
769            Ok(data)
770        }
771    }
772
773    /// Find the next chunk in the source. If the end of the source has been
774    /// reached, returns `Error::Empty` as the error.
775    fn read_chunk(&mut self) -> Result<ChunkData, Error> {
776        self.fill_buffer()?;
777        if self.length == 0 {
778            Err(Error::Empty)
779        } else {
780            let (hash, count) = cut_gear(
781                &self.buffer[..self.length],
782                self.min_size,
783                self.avg_size,
784                self.max_size,
785                self.mask_s,
786                self.mask_l,
787                self.mask_s_ls,
788                self.mask_l_ls,
789                &self.gear,
790                &self.gear_ls,
791            );
792            if count == 0 {
793                Err(Error::Empty)
794            } else {
795                let offset = self.processed;
796                self.processed += count as u64;
797                let data = self.drain_bytes(count)?;
798                Ok(ChunkData {
799                    hash,
800                    offset,
801                    length: count,
802                    data,
803                })
804            }
805        }
806    }
807}
808
809impl<R: Read> Iterator for StreamCDC<R> {
810    type Item = Result<ChunkData, Error>;
811
812    fn next(&mut self) -> Option<Result<ChunkData, Error>> {
813        let slice = self.read_chunk();
814        if let Err(Error::Empty) = slice {
815            None
816        } else {
817            Some(slice)
818        }
819    }
820}
821
822///
823/// Base-2 logarithm function for unsigned 32-bit integers.
824///
825pub fn logarithm2(value: u32) -> u32 {
826    f64::from(value).log2().round() as u32
827}
828
#[cfg(test)]
mod tests {
    use super::*;
    use md5::{Digest, Md5};
    use std::fs::{self, File};

    // Verify nearest-integer rounding of the base-2 logarithm, including the
    // zero input (saturating float-to-int cast) and values just below, at,
    // and just above several powers of two.
    #[test]
    fn test_logarithm2() {
        assert_eq!(logarithm2(0), 0);
        assert_eq!(logarithm2(1), 0);
        assert_eq!(logarithm2(2), 1);
        assert_eq!(logarithm2(3), 2);
        assert_eq!(logarithm2(5), 2);
        assert_eq!(logarithm2(6), 3);
        assert_eq!(logarithm2(11), 3);
        assert_eq!(logarithm2(12), 4);
        assert_eq!(logarithm2(19), 4);
        assert_eq!(logarithm2(64), 6);
        assert_eq!(logarithm2(128), 7);
        assert_eq!(logarithm2(256), 8);
        assert_eq!(logarithm2(512), 9);
        assert_eq!(logarithm2(1024), 10);
        assert_eq!(logarithm2(16383), 14);
        assert_eq!(logarithm2(16384), 14);
        assert_eq!(logarithm2(16385), 14);
        assert_eq!(logarithm2(32767), 15);
        assert_eq!(logarithm2(32768), 15);
        assert_eq!(logarithm2(32769), 15);
        assert_eq!(logarithm2(65535), 16);
        assert_eq!(logarithm2(65536), 16);
        assert_eq!(logarithm2(65537), 16);
        assert_eq!(logarithm2(1_048_575), 20);
        assert_eq!(logarithm2(1_048_576), 20);
        assert_eq!(logarithm2(1_048_577), 20);
        assert_eq!(logarithm2(4_194_303), 22);
        assert_eq!(logarithm2(4_194_304), 22);
        assert_eq!(logarithm2(4_194_305), 22);
        assert_eq!(logarithm2(16_777_215), 24);
        assert_eq!(logarithm2(16_777_216), 24);
        assert_eq!(logarithm2(16_777_217), 24);
    }

    // The next six tests confirm that the constructor asserts reject chunk
    // size parameters just outside the documented min/avg/max bounds.
    #[test]
    #[should_panic]
    fn test_minimum_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 63, 256, 1024);
    }

    #[test]
    #[should_panic]
    fn test_minimum_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 67_108_867, 256, 1024);
    }

    #[test]
    #[should_panic]
    fn test_average_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 255, 1024);
    }

    #[test]
    #[should_panic]
    fn test_average_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 268_435_457, 1024);
    }

    #[test]
    #[should_panic]
    fn test_maximum_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 256, 1023);
    }

    #[test]
    #[should_panic]
    fn test_maximum_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 256, 1_073_741_825);
    }

    // The small/large masks should be the MASKS entries at
    // log2(avg_size) +/- the normalization level (level 1 by default).
    #[test]
    fn test_masks() {
        let source = [0u8; 1024];
        let chunker = FastCDC::new(&source, 64, 256, 1024);
        assert_eq!(chunker.mask_l, MASKS[7]);
        assert_eq!(chunker.mask_s, MASKS[9]);
        let chunker = FastCDC::new(&source, 8192, 16384, 32768);
        assert_eq!(chunker.mask_l, MASKS[13]);
        assert_eq!(chunker.mask_s, MASKS[15]);
        let chunker = FastCDC::new(&source, 1_048_576, 4_194_304, 16_777_216);
        assert_eq!(chunker.mask_l, MASKS[21]);
        assert_eq!(chunker.mask_s, MASKS[23]);
    }

    #[test]
    fn test_cut_all_zeros() {
        // for all zeros, always returns chunks of maximum size
        let array = [0u8; 10240];
        let chunker = FastCDC::new(&array, 64, 256, 1024);
        let mut cursor: usize = 0;
        for _ in 0..10 {
            let (hash, pos) = chunker.cut(cursor, 10240 - cursor);
            assert_eq!(hash, 14169102344523991076);
            assert_eq!(pos, cursor + 1024);
            cursor = pos;
        }
        // assert that nothing more should be returned
        let (_, pos) = chunker.cut(cursor, 10240 - cursor);
        assert_eq!(pos, 10240);
    }

    // Golden-value test: cut() over the fixture image with 16k average must
    // reproduce these exact (hash, length) pairs and consume every byte.
    #[test]
    fn test_cut_sekien_16k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 4096, 16384, 65535);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (17968276318003433923, 21325),
            (8197189939299398838, 17140),
            (13019990849178155730, 28084),
            (4509236223063678303, 18217),
            (2504464741100432583, 24700),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // A non-zero gear seed must produce a different (but still exact and
    // deterministic) set of cut points than the default seed of zero.
    #[test]
    fn test_cut_sekien_16k_chunks_seed_666() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker =
            FastCDC::with_level_and_seed(&contents, 4096, 16384, 65535, Normalization::Level1, 666);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (9312357714466240148, 10605),
            (226910853333574584, 55745),
            (12271755243986371352, 11346),
            (14153975939352546047, 5883),
            (5890158701071314778, 11586),
            (8981594897574481255, 14301),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // Same fixture with a 32k average yields fewer, larger chunks.
    #[test]
    fn test_cut_sekien_32k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 8192, 32768, 131072);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> =
            vec![(15733367461443853673, 66549), (6321136627705800457, 42917)];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // With a 64k average the entire fixture fits in a single chunk.
    #[test]
    fn test_cut_sekien_64k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 16384, 65536, 262144);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![(2504464741100432583, 109466)];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // Expected chunk record shared by the iterator and streaming tests:
    // gear hash, byte offset, byte length, and MD5 digest of the chunk data.
    struct ExpectedChunk {
        hash: u64,
        offset: u64,
        length: usize,
        digest: String,
    }

    // Exercise the Iterator implementation on FastCDC and verify each yielded
    // chunk's hash, offset, length, and content digest.
    #[test]
    fn test_iter_sekien_16k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        // The digest values are not needed here, but they serve to validate
        // that the streaming version tested below is returning the correct
        // chunk data on each iteration.
        let expected_chunks = vec![
            ExpectedChunk {
                hash: 17968276318003433923,
                offset: 0,
                length: 21325,
                digest: "2bb52734718194617c957f5e07ee6054".into(),
            },
            ExpectedChunk {
                hash: 8197189939299398838,
                offset: 21325,
                length: 17140,
                digest: "badfb0757fe081c20336902e7131f768".into(),
            },
            ExpectedChunk {
                hash: 13019990849178155730,
                offset: 38465,
                length: 28084,
                digest: "18412d7414de6eb42f638351711f729d".into(),
            },
            ExpectedChunk {
                hash: 4509236223063678303,
                offset: 66549,
                length: 18217,
                digest: "04fe1405fc5f960363bfcd834c056407".into(),
            },
            ExpectedChunk {
                hash: 2504464741100432583,
                offset: 84766,
                length: 24700,
                digest: "1aa7ad95f274d6ba34a983946ebc5af3".into(),
            },
        ];
        let chunker = FastCDC::new(&contents, 4096, 16384, 65535);
        let mut index = 0;
        for chunk in chunker {
            assert_eq!(chunk.hash, expected_chunks[index].hash);
            assert_eq!(chunk.offset, expected_chunks[index].offset as usize);
            assert_eq!(chunk.length, expected_chunks[index].length);
            let mut hasher = Md5::new();
            hasher.update(&contents[chunk.offset..chunk.offset + chunk.length]);
            let table = hasher.finalize();
            let digest = format!("{:x}", table);
            assert_eq!(digest, expected_chunks[index].digest);
            index += 1;
        }
        assert_eq!(index, 5);
    }

    // Normalization level 0 disables normalized chunking, so chunk sizes
    // vary more widely than at level 1.
    #[test]
    fn test_cut_sekien_16k_nc_0() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::with_level(&contents, 4096, 16384, 65535, Normalization::Level0);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (443122261039895162, 6634),
            (15733367461443853673, 59915),
            (10460176299449652894, 25597),
            (6197802202431009942, 5237),
            (6321136627705800457, 12083),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // Normalization level 3 clusters chunk sizes tightly around the average.
    #[test]
    fn test_cut_sekien_16k_nc_3() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::with_level(&contents, 8192, 16384, 32768, Normalization::Level3);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (10718006254707412376, 17350),
            (13104072099671895560, 19911),
            (12322483109039221194, 17426),
            (16009206469796846404, 17519),
            (2473608525189754172, 19940),
            (2504464741100432583, 17320),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // The Display implementation for Error should include the variant name.
    #[test]
    fn test_error_fmt() {
        let err = Error::Empty;
        assert_eq!(format!("{err}"), "chunker error: Empty");
    }

    // StreamCDC over a File must produce exactly the same chunks (hashes,
    // offsets, lengths, and content digests) as the in-memory iterator test.
    #[test]
    fn test_stream_sekien_16k_chunks() {
        let file_result = File::open("test/fixtures/SekienAkashita.jpg");
        assert!(file_result.is_ok());
        let file = file_result.unwrap();
        // The set of expected results should match the non-streaming version.
        let expected_chunks = vec![
            ExpectedChunk {
                hash: 17968276318003433923,
                offset: 0,
                length: 21325,
                digest: "2bb52734718194617c957f5e07ee6054".into(),
            },
            ExpectedChunk {
                hash: 8197189939299398838,
                offset: 21325,
                length: 17140,
                digest: "badfb0757fe081c20336902e7131f768".into(),
            },
            ExpectedChunk {
                hash: 13019990849178155730,
                offset: 38465,
                length: 28084,
                digest: "18412d7414de6eb42f638351711f729d".into(),
            },
            ExpectedChunk {
                hash: 4509236223063678303,
                offset: 66549,
                length: 18217,
                digest: "04fe1405fc5f960363bfcd834c056407".into(),
            },
            ExpectedChunk {
                hash: 2504464741100432583,
                offset: 84766,
                length: 24700,
                digest: "1aa7ad95f274d6ba34a983946ebc5af3".into(),
            },
        ];
        let chunker = StreamCDC::new(file, 4096, 16384, 65535);
        let mut index = 0;
        for result in chunker {
            assert!(result.is_ok());
            let chunk = result.unwrap();
            assert_eq!(chunk.hash, expected_chunks[index].hash);
            assert_eq!(chunk.offset, expected_chunks[index].offset);
            assert_eq!(chunk.length, expected_chunks[index].length);
            let mut hasher = Md5::new();
            hasher.update(&chunk.data);
            let table = hasher.finalize();
            let digest = format!("{:x}", table);
            assert_eq!(digest, expected_chunks[index].digest);
            index += 1;
        }
        assert_eq!(index, 5);
    }

    // Streaming with a non-zero seed must match the seeded cut() test above.
    #[test]
    fn test_stream_sekien_16k_chunks_seed_666() {
        let file_result = File::open("test/fixtures/SekienAkashita.jpg");
        assert!(file_result.is_ok());
        let file = file_result.unwrap();
        // The set of expected results should match the non-streaming version.
        let expected_chunks = vec![
            ExpectedChunk {
                hash: 9312357714466240148,
                offset: 0,
                length: 10605,
                digest: "09734863c8022bb0314f3de8c87a6d00".into(),
            },
            ExpectedChunk {
                hash: 226910853333574584,
                offset: 10605,
                length: 55745,
                digest: "4e49a6aa921341881f47ec0bbcdba6a9".into(),
            },
            ExpectedChunk {
                hash: 12271755243986371352,
                offset: 66350,
                length: 11346,
                digest: "f4b0cf33b1521be2a6af69df9bf90455".into(),
            },
            ExpectedChunk {
                hash: 14153975939352546047,
                offset: 77696,
                length: 5883,
                digest: "f392b5df328ce6c43de7e19464a455b4".into(),
            },
            ExpectedChunk {
                hash: 5890158701071314778,
                offset: 83579,
                length: 11586,
                digest: "a6053ce5cd98e6b717e7f97e71d743ba".into(),
            },
            ExpectedChunk {
                hash: 8981594897574481255,
                offset: 95165,
                length: 14301,
                digest: "0db10fbf2685498e16e5440f5e697e2f".into(),
            },
        ];
        let chunker =
            StreamCDC::with_level_and_seed(file, 4096, 16384, 65535, Normalization::Level1, 666);
        let mut index = 0;
        for result in chunker {
            assert!(result.is_ok());
            let chunk = result.unwrap();
            assert_eq!(chunk.hash, expected_chunks[index].hash);
            assert_eq!(chunk.offset, expected_chunks[index].offset);
            assert_eq!(chunk.length, expected_chunks[index].length);
            let mut hasher = Md5::new();
            hasher.update(&chunk.data);
            let table = hasher.finalize();
            let digest = format!("{:x}", table);
            assert_eq!(digest, expected_chunks[index].digest);
            index += 1;
        }
        assert_eq!(index, 6);
    }
}