//! xet_data/deduplication/chunking.rs
//!
//! Content-defined chunking using a gear-hash rolling boundary detector.
1use std::cmp::min;
2use std::io::{Read, Seek, SeekFrom};
3
4use bytes::Bytes;
5use more_asserts::{debug_assert_ge, debug_assert_le};
6
7use super::Chunk;
8use super::constants::{MAXIMUM_CHUNK_MULTIPLIER, MINIMUM_CHUNK_DIVISOR, TARGET_CHUNK_SIZE};
9
lazy_static::lazy_static! {
    /// The maximum chunk size, derived from the configurable constants imported
    /// from `super::constants` (target chunk size times the maximum multiplier).
    pub static ref MAX_CHUNK_SIZE: usize = (*TARGET_CHUNK_SIZE) * (*MAXIMUM_CHUNK_MULTIPLIER);
}
14
/// Chunk Generator given an input stream. Do not use directly.
/// Use `chunk_target_default`.
pub struct Chunker {
    // configs
    // Rolling gear hash used to detect content-defined boundaries.
    hash: gearhash::Hasher<'static>,
    // Smallest chunk size that may be emitted (except for the final chunk of a stream).
    minimum_chunk: usize,
    // Largest chunk size; a boundary is forced once a chunk reaches this length.
    maximum_chunk: usize,
    // Bitmask handed to gearhash's `next_match` to decide where boundaries trigger;
    // shifted to the high bits in `new` (see comment there).
    mask: u64,

    // generator state
    // Buffered bytes of the in-progress chunk that have not yet been emitted.
    chunkbuf: Vec<u8>,
}
27
28impl Default for Chunker {
29    fn default() -> Self {
30        Self::new(*TARGET_CHUNK_SIZE)
31    }
32}
33
34impl Chunker {
35    pub fn new(target_chunk_size: usize) -> Self {
36        assert_eq!(target_chunk_size.count_ones(), 1);
37
38        // Some of the logic only works if the target_chunk_size is greater than the
39        // window size of the hash.
40        assert!(target_chunk_size > 64);
41
42        // note the strict lesser than. Combined with count_ones() == 1,
43        // this limits to 2^31
44        assert!(target_chunk_size < u32::MAX as usize);
45
46        let mask = (target_chunk_size - 1) as u64;
47
48        // we will like to shift the mask left by a bunch since the right
49        // bits of the gear hash are affected by only a small number of bytes
50        // really. we just shift it all the way left.
51        let mask = mask << mask.leading_zeros();
52        let minimum_chunk = target_chunk_size / *MINIMUM_CHUNK_DIVISOR;
53        let maximum_chunk = target_chunk_size * *MAXIMUM_CHUNK_MULTIPLIER;
54
55        assert!(maximum_chunk > minimum_chunk);
56
57        let hash = gearhash::Hasher::default();
58
59        Chunker {
60            hash,
61            minimum_chunk,
62            maximum_chunk,
63            mask,
64            // generator state init
65            chunkbuf: Vec::with_capacity(maximum_chunk),
66        }
67    }
68
    /// Create a chunker with custom min chunk sizes.
    /// Only used by the partitioner which has special requirements.
    ///
    /// NOTE: this overrides `minimum_chunk` after `new` runs, bypassing the
    /// invariants `new` establishes; the caller is responsible for picking a
    /// sensible value (the partitioner deliberately passes 0).
    fn new_with_min(target_chunk_size: usize, min_chunk_size: usize) -> Self {
        let mut chunker = Self::new(target_chunk_size);
        chunker.minimum_chunk = min_chunk_size;
        chunker
    }
76
    /// Looks for the next chunk boundary in the data.  Assumes that whatever is in the current
    /// state has been prepended to the current data.  If a boundary cannot be found based on the
    /// current amount of data, then None is returned.
    ///
    /// The returned index is relative to `data`; it does not include the bytes
    /// already buffered in `chunkbuf`.
    #[inline]
    pub fn next_boundary(&mut self, data: &[u8]) -> Option<usize> {
        // The gear hash rolls over a fixed 64-byte window.
        const HASH_WINDOW_SIZE: usize = 64;
        let n_bytes = data.len();

        if n_bytes == 0 {
            return None;
        }

        let previous_len = self.chunkbuf.len();
        let mut cur_index = 0;
        let mut create_chunk = false;

        // skip the minimum chunk size
        // and noting that the hash has a window size of 64
        // so we should be careful to skip only minimum_chunk - 64 - 1
        if previous_len + HASH_WINDOW_SIZE < self.minimum_chunk {
            let skip = min(self.minimum_chunk - previous_len - HASH_WINDOW_SIZE - 1, n_bytes);
            cur_index += skip;
        }

        // If we have a lot of data, don't read all the way to the end when we'll stop reading
        // at the maximum chunk boundary.
        let read_end = n_bytes.min(cur_index + self.maximum_chunk - previous_len);

        loop {
            if let Some(next_match) = self.hash.next_match(&data[cur_index..read_end], self.mask) {
                cur_index += next_match;

                // If we trigger a boundary before the end, create a chunk.

                // We must enforce that the next boundary is actually past the minimum chunk size.
                // Because of how the rolling hash is computed, bytes before HASH_WINDOW_SIZE don't affect the hash,
                // so with the above skip we depend on it running for at least HASH_WINDOW_SIZE bytes before triggering
                // a boundary.   However, in rare occurrences, there can be a boundary triggered before HASH_WINDOW_SIZE
                // bytes have been processed, which means the boundary is triggered based on the previous state of the
                // hasher rather than on the current chunk content.  Thus we ensure this can't happen by ensuring that
                // we have processed at least HASH_WINDOW_SIZE bytes.
                if cur_index + previous_len < self.minimum_chunk {
                    // Too early; keep scanning from just past this match.
                    continue;
                }

                create_chunk = true;
            } else {
                // No boundary found; everything scanned so far is consumed.
                cur_index = read_end;
            }

            break;
        }

        // if we hit maximum chunk we must create a chunk
        if cur_index + previous_len >= self.maximum_chunk {
            cur_index = self.maximum_chunk - previous_len;
            create_chunk = true;
        }

        if create_chunk {
            self.hash.set_hash(0); // Reset for the next time.
            debug_assert_ge!(cur_index + previous_len, self.minimum_chunk);
            debug_assert_le!(cur_index + previous_len, self.maximum_chunk);
            Some(cur_index)
        } else {
            None
        }
    }
145
    /// Reset the hasher to its initial state after a stream is finalized.
    fn reset_state(&mut self) {
        // Strictly speaking, this is unnecessary, as we should always hash 64 bytes out making the previous state
        // of the hasher irrelevant.  However, this explicitly declares we're resetting things to the
        // initial state.
        self.hash.set_hash(0);
        debug_assert!(self.chunkbuf.is_empty());
    }
153
154    /// Process more data; this is a continuation of any data from before when calls were
155    ///
156    /// Returns the next chunk, if available, and the amount of data that was digested.
157    ///
158    /// If is_final is true, then it is assumed that no more data after this block will come,
159    /// and any data currently present and at the end will be put into a final chunk.
160    pub fn next(&mut self, data: &[u8], is_final: bool) -> (Option<Chunk>, usize) {
161        let (chunk_data, consume): (Bytes, usize) = {
162            if let Some(next_boundary) = self.next_boundary(data) {
163                if self.chunkbuf.is_empty() {
164                    (Bytes::copy_from_slice(&data[..next_boundary]), next_boundary)
165                } else {
166                    self.chunkbuf.extend_from_slice(&data[..next_boundary]);
167                    (std::mem::take(&mut self.chunkbuf).into(), next_boundary)
168                }
169            } else if is_final {
170                // Put the rest of the data in the chunkbuf.
171                let r = if self.chunkbuf.is_empty() {
172                    (Bytes::copy_from_slice(data), data.len())
173                } else {
174                    self.chunkbuf.extend_from_slice(data);
175                    (std::mem::take(&mut self.chunkbuf).into(), data.len())
176                };
177
178                if is_final {
179                    self.reset_state();
180                }
181
182                r
183            } else {
184                self.chunkbuf.extend_from_slice(data);
185                return (None, data.len());
186            }
187        };
188
189        // Special case this specific case.
190        if chunk_data.is_empty() {
191            return (None, 0);
192        }
193
194        (Some(Chunk::new(chunk_data)), consume)
195    }
196
197    /// Keeps chunking until no more chunks can be reliably produced, returning a
198    /// vector of the resulting chunks.  
199    pub fn next_block(&mut self, data: &[u8], is_final: bool) -> Vec<Chunk> {
200        let mut ret = Vec::new();
201
202        let mut pos = 0;
203        loop {
204            debug_assert!(pos <= data.len());
205            if pos == data.len() {
206                if is_final {
207                    self.reset_state();
208                }
209
210                return ret;
211            }
212
213            let (maybe_chunk, bytes_consumed) = self.next(&data[pos..], is_final);
214
215            if let Some(chunk) = maybe_chunk {
216                ret.push(chunk);
217            }
218
219            pos += bytes_consumed;
220        }
221    }
222
    /// Keeps chunking until no more chunks can be reliably produced, returning a
    /// vector of the resulting chunks.
    ///
    /// The data is inserted here as a Bytes object, which means that no copying of the data
    /// is performed except at the boundaries.  The resulting chunks then end up each holding
    /// a reference to the original data object, which will not be deallocated until all the
    /// original bytes are gone.
    pub fn next_block_bytes(&mut self, data: &Bytes, is_final: bool) -> Vec<Chunk> {
        let mut ret = Vec::new();

        let mut pos = 0;

        // In this case, we have to perform a single cut using the old method,
        // which would copy the data.  This handles bytes buffered from a
        // previous call: if a boundary is found, a copied chunk spanning the
        // buffered and new data is emitted; otherwise the new data is buffered too.
        if !self.chunkbuf.is_empty() {
            let (maybe_chunk, skip_idx) = self.next(data, is_final);
            if let Some(chunk) = maybe_chunk {
                ret.push(chunk);
            }
            pos = skip_idx;
        }

        // Zero-copy path: each chunk is a sub-slice sharing `data`'s backing buffer.
        while pos < data.len() {
            let maybe_next_boundary = self.next_boundary(&data[pos..]);

            if let Some(chunk_size) = maybe_next_boundary {
                let next_pos = pos + chunk_size;
                ret.push(Chunk::new(data.slice(pos..next_pos)));
                pos = next_pos;
            } else {
                // No more chunks in this block.
                if is_final {
                    // Last block of the stream: the remainder becomes the final chunk.
                    ret.push(Chunk::new(data.slice(pos..)));
                } else {
                    // Save the tail (copied) for the next call.
                    self.chunkbuf.extend_from_slice(&data[pos..]);
                }
                break;
            }
        }

        if is_final {
            self.reset_state();
        }

        ret
    }
269
    // Finishes, returning the final chunk if one exists, and resets the hasher
    // state so the chunker can be reused for a new stream.
    pub fn finish(&mut self) -> Option<Chunk> {
        self.next(&[], true).0
    }
274}
275
276/// Find valid partition points in a file where we can
277/// chunk in parallel. Returns the start points of each partition
278/// (i.e. file offset 0 is always the first entry, and `file_size`
279/// is never in the result).
280/// Note that reader position is modified and not restored.
281///
282/// partition_scan_bytes is the number of bytes to scan at each
283/// proposed partition boundary in search of a valid chunk.
284///
285/// Due to a known issue in how we do chunking, note that these
286/// partitions are not 100% guaranteed to align. See the
287/// parallel_chunking.pdf for details.
288pub fn find_partitions<R: Read + Seek>(
289    reader: &mut R,
290    file_size: usize,
291    target_chunk_size: usize,
292    min_partition_size: usize,
293    partition_scan_bytes: usize,
294) -> std::io::Result<Vec<usize>> {
295    assert!(min_partition_size > 0);
296    let mut partitions: Vec<usize> = Vec::new();
297    partitions.push(0);
298    // minimum chunk must be at least the hash window size.
299    // the way the chunker works, the minimum may be up to
300    // target_min_chunk_size - 64
301    let minimum_chunk = target_chunk_size / *MINIMUM_CHUNK_DIVISOR;
302    let maximum_chunk = target_chunk_size * *MAXIMUM_CHUNK_MULTIPLIER;
303
304    assert!(minimum_chunk > 64);
305
306    if maximum_chunk >= min_partition_size {
307        return Ok(partitions);
308    }
309    let mut buf = vec![0u8; partition_scan_bytes];
310    let mut curpos: usize = 0;
311    // we jump curpos forward by min_partition_size
312    // and read *PARALLEL_CHUNKING_PARTITION_SCAN_BYTES bytes
313    // and try to find a partition boundary condition.
314    //
315    // We should also make sure There should also be at least
316    // min_partition_size bytes remaining at curpos so that
317    // we do not make a teeny tiny partition.
318    while curpos < file_size {
319        curpos += min_partition_size;
320        // there are not enough bytes to make a full partition
321        // or not enough bytes to scan for a partition
322        if curpos + min_partition_size >= file_size || curpos + partition_scan_bytes >= file_size {
323            break;
324        }
325        // read and chunk the scan bytes
326        reader.seek(SeekFrom::Start(curpos as u64))?;
327        reader.read_exact(&mut buf)?;
328        let mut chunker = Chunker::new_with_min(target_chunk_size, 0);
329        // TODO: there is a definite optimization here
330        // as we really only need the chunk lengths and not the data
331        let chunks = chunker.next_block(&buf, false);
332        if chunks.is_empty() {
333            continue;
334        }
335        // skip the first chunk
336        let mut offset = chunks[0].data.len();
337        offset += chunks[1].data.len();
338        for i in 2..chunks.len() {
339            let cprev = chunks[i - 1].data.len();
340            let c = chunks[i].data.len();
341            offset += chunks[i].data.len();
342            if cprev > minimum_chunk
343                && cprev < maximum_chunk - minimum_chunk
344                && c > minimum_chunk
345                && c < maximum_chunk - minimum_chunk
346            {
347                // we have a valid partition at this position
348                partitions.push(curpos + offset);
349                break;
350            }
351        }
352    }
353    Ok(partitions)
354}
355
356#[cfg(test)]
357mod tests {
358    use std::collections::HashSet;
359    use std::io::Cursor;
360
361    use rand::rngs::StdRng;
362    use rand::{Rng, SeedableRng};
363
364    use super::*;
365
366    /// A helper to create random test data using a specified `seed` and `len`.
367    /// Using a fixed seed ensures tests are reproducible.
368    fn make_test_data(seed: u64, len: usize) -> Vec<u8> {
369        let mut rng = StdRng::seed_from_u64(seed);
370        let mut data = vec![0; len];
371        rng.fill(&mut data[..]);
372        data
373    }
374
375    fn check_chunks_equal(chunks: &[Chunk], data: &[u8]) {
376        // Validate all the chunks are exact.
377        let mut new_vec = Vec::with_capacity(10000);
378        for c in chunks.iter() {
379            new_vec.extend_from_slice(&c.data[..]);
380        }
381
382        assert!(new_vec == data);
383    }
384
    // A chunker that wraps two versions of the Chunker class,
    // exposing next_block but then internally testing next_block_bytes and next_block and
    // verifying the output is identical.
    #[derive(Default)]
    struct ChunkerTestWrapper {
        // Fed with &[u8] via Chunker::next_block.
        chunker_chunks: Chunker,
        // Fed with Bytes via Chunker::next_block_bytes.
        chunker_bytes: Chunker,
    }
393
    impl ChunkerTestWrapper {
        /// Build both wrapped chunkers with the same target chunk size.
        fn new(target_chunk_size: usize) -> Self {
            ChunkerTestWrapper {
                chunker_chunks: Chunker::new(target_chunk_size),
                chunker_bytes: Chunker::new(target_chunk_size),
            }
        }

        /// Run `next_block` on the slice-based chunker and `next_block_bytes` on
        /// the Bytes-based one, assert their outputs agree, and return the chunks.
        fn next_block(&mut self, data: &[u8], is_final: bool) -> Vec<Chunk> {
            let chunks = self.chunker_chunks.next_block(data, is_final);
            let bytes_chunks = self.chunker_bytes.next_block_bytes(&Bytes::copy_from_slice(data), is_final);

            // Check that the two match.
            assert_eq!(chunks.len(), bytes_chunks.len());
            for (c1, c2) in chunks.iter().zip(bytes_chunks.iter()) {
                assert_eq!(c1.data, c2.data);
            }

            chunks
        }

        /// Feed one block through `Chunker::next` on both chunkers (the Bytes
        /// copy derefs to &[u8]) and assert the resulting chunks agree.
        fn next_chunk(&mut self, data: &[u8], is_final: bool) -> (Option<Chunk>, usize) {
            let (chunk, consumed) = self.chunker_chunks.next(data, is_final);
            let (bytes_chunk, bytes_consumed) = self.chunker_bytes.next(&Bytes::copy_from_slice(data), is_final);

            // Check that the two match.
            if let Some(c) = &chunk {
                assert_eq!(c.data, bytes_chunk.unwrap().data);
            } else {
                assert!(bytes_chunk.is_none());
            }

            // Both paths should consume the same amount; take the max defensively.
            (chunk, consumed.max(bytes_consumed))
        }
    }
429
    #[test]
    fn test_empty_data_no_chunk_until_final() {
        let mut chunker = ChunkerTestWrapper::new(128);

        // Passing empty slice without final => no chunk
        let (chunk, consumed) = chunker.next_chunk(&[], false);
        assert!(chunk.is_none());
        assert_eq!(consumed, 0);

        // Passing empty slice again with is_final = true => no leftover data, so no chunk
        let (chunk, consumed) = chunker.next_chunk(&[], true);
        assert!(chunk.is_none());
        assert_eq!(consumed, 0);
    }
444
    #[test]
    fn test_data_smaller_than_minimum_no_boundary() {
        let mut chunker = ChunkerTestWrapper::new(128);

        // Create a small random data buffer of 63 bytes -- smaller than the
        // 64-byte hash window, so no boundary can trigger.
        let data = make_test_data(0, 63);

        // We expect no chunk until we finalize, because there's not enough data
        // to trigger a boundary, nor to reach the maximum chunk size.
        let (chunk, consumed) = chunker.next_chunk(&data, false);
        assert!(chunk.is_none());
        assert_eq!(consumed, data.len());

        // Now finalize: we expect a chunk with the leftover data
        let (chunk, consumed) = chunker.next_chunk(&[], true);
        assert!(chunk.is_some());
        assert_eq!(consumed, 0);

        let chunk = chunk.unwrap();
        assert_eq!(chunk.data.len(), 63);
        assert_eq!(&chunk.data[..], &data[..], "Chunk should contain exactly what was passed in");
    }
467
    #[test]
    fn test_multiple_chunks_produced() {
        let mut chunker = ChunkerTestWrapper::new(128);

        // Produce 10000 bytes of random data -- enough to span multiple chunks.
        let data = make_test_data(42, 10000);

        // Pass everything at once, final = true
        let chunks = chunker.next_block(&data, true);
        assert!(!chunks.is_empty());

        // The concatenation of all chunks must reproduce the input exactly.
        check_chunks_equal(&chunks, &data);
    }
481
    #[test]
    fn test_repeated_calls_partial_consumption() {
        // We'll feed in the data in many small pieces of varying size to ensure
        // partial consumption across calls produces the same chunks as one pass.

        let data = make_test_data(42, 10000);

        let mut chunks_1 = Vec::new();

        let mut pos = 0;
        let mut chunker = ChunkerTestWrapper::new(128);

        // Feed slices of length 0..=15 in rotation until the data is exhausted.
        while pos < data.len() {
            for i in 0..16 {
                let next_pos = (pos + i).min(data.len());
                chunks_1.append(&mut chunker.next_block(&data[pos..next_pos], next_pos == data.len()));
                pos = next_pos;
            }
        }

        check_chunks_equal(&chunks_1, &data);

        // Now, rechunk with all at once and make sure it's equal.
        let chunks_2 = ChunkerTestWrapper::new(128).next_block(&data, true);

        assert_eq!(chunks_1, chunks_2);
    }
508
    #[test]
    fn test_exact_maximum_chunk() {
        // If the data hits the maximum chunk size exactly, we should force a boundary.
        // For target_chunk_size = 512, the maximum chunk size is
        // MAXIMUM_CHUNK_MULTIPLIER * 512, so the constant data below is expected
        // to be cut into chunks of exactly that size.
        let mut chunker = ChunkerTestWrapper::new(512);

        // Use constant data; every cut should come from the maximum-chunk limit.
        let data = vec![0; 8 * *MAXIMUM_CHUNK_MULTIPLIER * 512];

        let chunks = chunker.next_block(&data, true);

        assert_eq!(chunks.len(), 8);

        for c in chunks.iter() {
            assert_eq!(c.data.len(), *MAXIMUM_CHUNK_MULTIPLIER * 512);
        }
    }
527
    #[test]
    fn test_partition() {
        for _i in 1..5 {
            // Chunk 1MB of random data sequentially and record every chunk start offset.
            let data = make_test_data(42, 1000000);
            let mut chunker = Chunker::new(1024);
            let chunks = chunker.next_block(&data, true);
            let mut chunk_offsets = HashSet::new();
            let mut offset = 0;
            eprintln!("{:?}", chunker.minimum_chunk);
            for i in 0..chunks.len() {
                chunk_offsets.insert(offset);
                offset += chunks[i].data.len();
            }

            // Every partition point found must land exactly on one of the
            // sequential chunk boundaries recorded above.
            let partitions =
                find_partitions(&mut Cursor::new(&mut data.as_slice()), data.len(), 1024, 100000, 10000).unwrap();
            assert!(partitions.len() > 1);
            for i in 0..partitions.len() {
                assert!(chunk_offsets.contains(&partitions[i]));
            }
        }
    }
550
551    /// Simple SplitMix64-based deterministic random number generator.
552    /// Portable to C, Python, etc. (see https://prng.di.unimi.it/splitmix64.c)
553    fn splitmix64_next(state: &mut u64) -> u64 {
554        *state = state.wrapping_add(0x9E3779B97F4A7C15);
555        let mut z = *state;
556        z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
557        z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB);
558        z ^ (z >> 31)
559    }
560
561    fn create_random_data(n: usize, seed: u64) -> Vec<u8> {
562        // This test will actually need to be run in different environments, so to generate
563        // the table below, create random data using a simple SplitMix rng that can be ported here
564        // as is without dependening on other po
565        let mut ret = Vec::with_capacity(n + 7);
566
567        let mut state = seed;
568
569        while ret.len() < n {
570            let next_u64 = splitmix64_next(&mut state);
571            ret.extend_from_slice(&next_u64.to_le_bytes());
572        }
573
574        // Has extra bits on there since we're adding in blocks of 8.
575        ret.resize(n, 0);
576
577        ret
578    }
579
580    fn get_chunk_boundaries(chunks: &[Chunk]) -> Vec<usize> {
581        chunks
582            .iter()
583            .scan(0, |state, chunk| {
584                *state += chunk.data.len();
585                Some(*state)
586            })
587            .collect()
588    }
589
    #[test]
    fn test_chunk_boundaries() {
        let data = create_random_data(256000, 1);

        // Now, run the chunks through the default chunker.
        let chunks = ChunkerTestWrapper::default().next_block(&data, true);

        // Get the boundaries indices as determined by the size of the chunks above.
        let ref_chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);

        // Test that it's correct across different chunk varieties: feeding the
        // same data in blocks of different sizes must yield identical boundaries.
        for add_size in [1, 37, 255] {
            let mut chunker = Chunker::default();

            // Add repeatedly in blocks of add_size, appending to alt_chunks
            let mut alt_chunks = Vec::with_capacity(chunks.len());

            let mut pos = 0;
            while pos < data.len() {
                let next_pos = (pos + add_size).min(data.len());
                let next_chunk = chunker.next_block(&data[pos..next_pos], next_pos == data.len());
                alt_chunks.extend(next_chunk);
                pos = next_pos;
            }

            let alt_boundaries = get_chunk_boundaries(&alt_chunks);

            assert_eq!(alt_boundaries, ref_chunk_boundaries);
        }
    }
620
    #[test]
    fn test_correctness_1mb_random_data() {
        // Regression test: chunk 1MB of deterministic pseudo-random data and
        // compare against reference boundaries generated by this same code.
        let data = create_random_data(1000000, 0);

        // Uncomment these to create the lines below:
        // eprintln!("(data[0], {});", data[0] as usize);
        // eprintln!("(data[127], {});", data[127] as usize);
        // eprintln!("(data[111111], {});", data[111111] as usize);

        // Sanity-check the generator output before checking boundaries.
        assert_eq!(data[0], 175);
        assert_eq!(data[127], 132);
        assert_eq!(data[111111], 118);

        // Now, run the chunks through the default chunker.
        let chunks = ChunkerTestWrapper::default().next_block(&data, true);

        // Get the boundaries indices as determined by the size of the chunks above.
        let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);

        // Uncomment this to create the line below.
        // eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
        assert_eq!(
            chunk_boundaries,
            vec![
                84493, 134421, 144853, 243318, 271793, 336457, 467529, 494581, 582000, 596735, 616815, 653164, 678202,
                724510, 815591, 827760, 958832, 991092, 1000000
            ]
        );
    }
651
    #[test]
    fn test_correctness_1mb_const_data() {
        // Regression test: constant data produces evenly spaced boundaries
        // (each cut forced by the maximum chunk size) plus the final remainder.
        let data = vec![59u8; 1000000];

        // Now, run the chunks through the default chunker.
        let chunks = ChunkerTestWrapper::default().next_block(&data, true);

        // Get the boundaries indices as determined by the size of the chunks above.
        let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);

        // Uncomment this to create the line below.
        // eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
        assert_eq!(chunk_boundaries, vec![131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000])
    }
667
668    fn get_triggering_base_data(n: usize, padding: usize) -> Vec<u8> {
669        // This pattern is known to trigger the boundary detection in the chunker, so repeat it to test the
670        // correctness of the minimum chunk size processing.
671        let mut data = vec![
672            154, 52, 42, 34, 159, 75, 126, 224, 70, 236, 12, 196, 79, 236, 178, 124, 127, 50, 99, 178, 44, 176, 174,
673            126, 250, 235, 205, 174, 252, 122, 35, 10, 20, 101, 214, 69, 193, 8, 115, 105, 158, 228, 120, 111, 136,
674            162, 198, 251, 211, 183, 253, 252, 164, 147, 63, 16, 186, 162, 117, 23, 170, 36, 205, 187, 174, 76, 210,
675            174, 211, 175, 12, 173, 145, 59, 2, 70, 222, 181, 159, 227, 182, 156, 189, 51, 226, 106, 24, 50, 183, 157,
676            140, 10, 8, 23, 212, 70, 10, 234, 23, 33, 219, 254, 39, 236, 70, 49, 191, 116, 9, 115, 15, 101, 26, 159,
677            165, 220, 15, 170, 56, 125, 92, 163, 94, 235, 38, 40, 49, 81,
678        ];
679
680        // Add padding so we can comprehensively test the nuances of boundaries.
681        data.resize(data.len() + padding, 0u8);
682
683        // Repeat the above pattern until we've filled out n bytes.
684        while data.len() < n {
685            let n_take = (n - data.len()).min(data.len());
686            data.extend_from_within(0..n_take);
687        }
688
689        data
690    }
691
692    #[test]
693    fn test_correctness_100kb_hitting_data() {
694        // To ensure we've checked all the nuances of dealing with minimum chunk boundaries,
695        // and with the correct chunks as well, run through all the different options with the padding,
696        // checking each one.  With this, then, we have a pattern that hits once per pattern with varying
697        // bits between the widths.
698
699        let mut data_sample_at_11111 = [0u8; 128];
700        let mut ref_cb = vec![Vec::new(); 128];
701
702        data_sample_at_11111[0] = 236;
703        ref_cb[0] = vec![8256, 16448, 24640, 32832, 41024, 49216, 57408, 65536];
704        data_sample_at_11111[1] = 50;
705        ref_cb[1] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
706        data_sample_at_11111[2] = 36;
707        ref_cb[2] = vec![8254, 16574, 24894, 33214, 41534, 49854, 58174, 65536];
708        data_sample_at_11111[3] = 116;
709        ref_cb[3] = vec![8317, 16570, 24823, 33076, 41329, 49582, 57835, 65536];
710        data_sample_at_11111[4] = 126;
711        ref_cb[4] = vec![8248, 16564, 24880, 33196, 41512, 49828, 58144, 65536];
712        data_sample_at_11111[5] = 145;
713        ref_cb[5] = vec![8310, 16556, 24802, 33048, 41294, 49540, 57786, 65536];
714        data_sample_at_11111[6] = 235;
715        ref_cb[6] = vec![8238, 16546, 24854, 33162, 41470, 49778, 58086, 65536];
716        data_sample_at_11111[7] = 228;
717        ref_cb[7] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536];
718        data_sample_at_11111[8] = 70;
719        ref_cb[8] = vec![8224, 16520, 24816, 33112, 41408, 49704, 58000, 65536];
720        data_sample_at_11111[9] = 178;
721        ref_cb[9] = vec![8284, 16504, 24724, 32944, 41164, 49384, 57604, 65536];
722        data_sample_at_11111[10] = 173;
723        ref_cb[10] = vec![8206, 16486, 24766, 33046, 41326, 49606, 57886, 65536];
724        data_sample_at_11111[11] = 0;
725        ref_cb[11] = vec![8265, 16466, 24667, 32868, 41069, 49270, 57471, 65536];
726        data_sample_at_11111[12] = 252;
727        ref_cb[12] = vec![8324, 16584, 24844, 33104, 41364, 49624, 57884, 65536];
728        data_sample_at_11111[13] = 159;
729        ref_cb[13] = vec![8242, 16561, 24880, 33199, 41518, 49837, 58156, 65536];
730        data_sample_at_11111[14] = 69;
731        ref_cb[14] = vec![8300, 16536, 24772, 33008, 41244, 49480, 57716, 65536];
732        data_sample_at_11111[15] = 219;
733        ref_cb[15] = vec![8215, 16509, 24803, 33097, 41391, 49685, 57979, 65536];
734        data_sample_at_11111[16] = 126;
735        ref_cb[16] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
736        data_sample_at_11111[17] = 10;
737        ref_cb[17] = vec![8329, 16594, 24859, 33124, 41389, 49654, 57919, 65536];
738        data_sample_at_11111[18] = 124;
739        ref_cb[18] = vec![8240, 16562, 24884, 33206, 41528, 49850, 58172, 65536];
740        data_sample_at_11111[19] = 24;
741        ref_cb[19] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
742        data_sample_at_11111[20] = 196;
743        ref_cb[20] = vec![8204, 16492, 24780, 33068, 41356, 49644, 57932, 65536];
744        data_sample_at_11111[21] = 106;
745        ref_cb[21] = vec![8259, 16454, 24649, 32844, 41039, 49234, 57429, 65536];
746        data_sample_at_11111[22] = 196;
747        ref_cb[22] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
748        data_sample_at_11111[23] = 183;
749        ref_cb[23] = vec![8218, 16523, 24828, 33133, 41438, 49743, 58048, 65536];
750        data_sample_at_11111[24] = 124;
751        ref_cb[24] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
752        data_sample_at_11111[25] = 70;
753        ref_cb[25] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
754        data_sample_at_11111[26] = 126;
755        ref_cb[26] = vec![8226, 16542, 24858, 33174, 41490, 49806, 58122, 65536];
756        data_sample_at_11111[27] = 191;
757        ref_cb[27] = vec![8279, 16494, 24709, 32924, 41139, 49354, 57569, 65536];
758        data_sample_at_11111[28] = 69;
759        ref_cb[28] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
760        data_sample_at_11111[29] = 163;
761        ref_cb[29] = vec![8228, 16549, 24870, 33191, 41512, 49833, 58154, 65536];
762        data_sample_at_11111[30] = 252;
763        ref_cb[30] = vec![8280, 16496, 24712, 32928, 41144, 49360, 57576, 65536];
764        data_sample_at_11111[31] = 0;
765        ref_cb[31] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
766        data_sample_at_11111[32] = 173;
767        ref_cb[32] = vec![8224, 16544, 24864, 33184, 41504, 49824, 58144, 65536];
768        data_sample_at_11111[33] = 42;
769        ref_cb[33] = vec![8275, 16486, 24697, 32908, 41119, 49330, 57541, 65536];
770        data_sample_at_11111[34] = 70;
771        ref_cb[34] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
772        data_sample_at_11111[35] = 174;
773        ref_cb[35] = vec![8214, 16527, 24840, 33153, 41466, 49779, 58092, 65536];
774        data_sample_at_11111[36] = 235;
775        ref_cb[36] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
776        data_sample_at_11111[37] = 186;
777        ref_cb[37] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
778        data_sample_at_11111[38] = 0;
779        ref_cb[38] = vec![8198, 16498, 24798, 33098, 41398, 49698, 57998, 65536];
780        data_sample_at_11111[39] = 157;
781        ref_cb[39] = vec![8247, 16597, 24947, 33297, 41647, 49997, 58347, 65536];
782        data_sample_at_11111[40] = 126;
783        ref_cb[40] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
784        data_sample_at_11111[41] = 49;
785        ref_cb[41] = vec![8345, 16626, 24907, 33188, 41469, 49750, 58031, 65536];
786        data_sample_at_11111[42] = 36;
787        ref_cb[42] = vec![8224, 16554, 24884, 33214, 41544, 49874, 58204, 65536];
788        data_sample_at_11111[43] = 0;
789        ref_cb[43] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
790        data_sample_at_11111[44] = 236;
791        ref_cb[44] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
792        data_sample_at_11111[45] = 105;
793        ref_cb[45] = vec![8195, 16499, 24803, 33107, 41411, 49715, 58019, 65536];
794        data_sample_at_11111[46] = 0;
795        ref_cb[46] = vec![8242, 16594, 24946, 33298, 41650, 50002, 58354, 65536];
796        data_sample_at_11111[47] = 24;
797        ref_cb[47] = vec![8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536];
798        data_sample_at_11111[48] = 126;
799        ref_cb[48] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536];
800        data_sample_at_11111[49] = 0;
801        ref_cb[49] = vec![8206, 16525, 24844, 33163, 41482, 49801, 58120, 65536];
802        data_sample_at_11111[50] = 70;
803        ref_cb[50] = vec![8252, 16618, 24984, 33350, 41716, 50082, 58448, 65536];
804        data_sample_at_11111[51] = 236;
805        ref_cb[51] = vec![8298, 16532, 24766, 33000, 41234, 49468, 57702, 65536];
806        data_sample_at_11111[52] = 0;
807        ref_cb[52] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
808        data_sample_at_11111[53] = 12;
809        ref_cb[53] = vec![8209, 16535, 24861, 33187, 41513, 49839, 58165, 65536];
810        data_sample_at_11111[54] = 236;
811        ref_cb[54] = vec![8254, 16626, 24998, 33370, 41742, 50114, 58486, 65536];
812        data_sample_at_11111[55] = 0;
813        ref_cb[55] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536];
814        data_sample_at_11111[56] = 173;
815        ref_cb[56] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
816        data_sample_at_11111[57] = 196;
817        ref_cb[57] = vec![8204, 16529, 24854, 33179, 41504, 49829, 58154, 65536];
818        data_sample_at_11111[58] = 0;
819        ref_cb[58] = vec![8248, 16618, 24988, 33358, 41728, 50098, 58468, 65536];
820        data_sample_at_11111[59] = 159;
821        ref_cb[59] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536];
822        data_sample_at_11111[60] = 178;
823        ref_cb[60] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536];
824        data_sample_at_11111[61] = 0;
825        ref_cb[61] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
826        data_sample_at_11111[62] = 10;
827        ref_cb[62] = vec![8234, 16594, 24954, 33314, 41674, 50034, 58394, 65536];
828        data_sample_at_11111[63] = 101;
829        ref_cb[63] = vec![8277, 16490, 24703, 32916, 41129, 49342, 57555, 65536];
830        data_sample_at_11111[64] = 0;
831        ref_cb[64] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
832        data_sample_at_11111[65] = 15;
833        ref_cb[65] = vec![8363, 16662, 24961, 33260, 41559, 49858, 58157, 65536];
834        data_sample_at_11111[66] = 147;
835        ref_cb[66] = vec![8212, 16554, 24896, 33238, 41580, 49922, 58264, 65536];
836        data_sample_at_11111[67] = 0;
837        ref_cb[67] = vec![8254, 16639, 25024, 33409, 41794, 50179, 58564, 65536];
838        data_sample_at_11111[68] = 0;
839        ref_cb[68] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
840        data_sample_at_11111[69] = 227;
841        ref_cb[69] = vec![8338, 16612, 24886, 33160, 41434, 49708, 57982, 65536];
842        data_sample_at_11111[70] = 126;
843        ref_cb[70] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
844        data_sample_at_11111[71] = 0;
845        ref_cb[71] = vec![8223, 16581, 24939, 33297, 41655, 50013, 58371, 65536];
846        data_sample_at_11111[72] = 101;
847        ref_cb[72] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
848        data_sample_at_11111[73] = 186;
849        ref_cb[73] = vec![8305, 16546, 24787, 33028, 41269, 49510, 57751, 65536];
850        data_sample_at_11111[74] = 52;
851        ref_cb[74] = vec![8346, 16628, 24910, 33192, 41474, 49756, 58038, 65536];
852        data_sample_at_11111[75] = 0;
853        ref_cb[75] = vec![8387, 16710, 25033, 33356, 41679, 50002, 58325, 65536];
854        data_sample_at_11111[76] = 70;
855        ref_cb[76] = vec![8224, 16588, 24952, 33316, 41680, 50044, 58408, 65536];
856        data_sample_at_11111[77] = 228;
857        ref_cb[77] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
858        data_sample_at_11111[78] = 0;
859        ref_cb[78] = vec![8304, 16544, 24784, 33024, 41264, 49504, 57744, 65536];
860        data_sample_at_11111[79] = 0;
861        ref_cb[79] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
862        data_sample_at_11111[80] = 50;
863        ref_cb[80] = vec![8384, 16704, 25024, 33344, 41664, 49984, 58304, 65536];
864        data_sample_at_11111[81] = 214;
865        ref_cb[81] = vec![8215, 16575, 24935, 33295, 41655, 50015, 58375, 65536];
866        data_sample_at_11111[82] = 0;
867        ref_cb[82] = vec![8254, 16654, 25054, 33454, 41854, 50254, 58654, 65536];
868        data_sample_at_11111[83] = 0;
869        ref_cb[83] = vec![8293, 16522, 24751, 32980, 41209, 49438, 57667, 65536];
870        data_sample_at_11111[84] = 50;
871        ref_cb[84] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
872        data_sample_at_11111[85] = 69;
873        ref_cb[85] = vec![8371, 16678, 24985, 33292, 41599, 49906, 58213, 65536];
874        data_sample_at_11111[86] = 0;
875        ref_cb[86] = vec![8196, 16542, 24888, 33234, 41580, 49926, 58272, 65536];
876        data_sample_at_11111[87] = 0;
877        ref_cb[87] = vec![8234, 16619, 25004, 33389, 41774, 50159, 58544, 65536];
878        data_sample_at_11111[88] = 70;
879        ref_cb[88] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
880        data_sample_at_11111[89] = 136;
881        ref_cb[89] = vec![8310, 16556, 24802, 33048, 41294, 49540, 57786, 65536];
882        data_sample_at_11111[90] = 0;
883        ref_cb[90] = vec![8348, 16632, 24916, 33200, 41484, 49768, 58052, 65536];
884        data_sample_at_11111[91] = 0;
885        ref_cb[91] = vec![8386, 16708, 25030, 33352, 41674, 49996, 58318, 65536];
886        data_sample_at_11111[92] = 101;
887        ref_cb[92] = vec![8204, 16564, 24924, 33284, 41644, 50004, 58364, 65536];
888        data_sample_at_11111[93] = 36;
889        ref_cb[93] = vec![8241, 16639, 25037, 33435, 41833, 50231, 58629, 65536];
890        data_sample_at_11111[94] = 196;
891        ref_cb[94] = vec![8278, 16492, 24706, 32920, 41134, 49348, 57562, 65536];
892        data_sample_at_11111[95] = 0;
893        ref_cb[95] = vec![8315, 16566, 24817, 33068, 41319, 49570, 57821, 65536];
894        data_sample_at_11111[96] = 0;
895        ref_cb[96] = vec![8352, 16640, 24928, 33216, 41504, 49792, 58080, 65536];
896        data_sample_at_11111[97] = 24;
897        ref_cb[97] = vec![8389, 16714, 25039, 33364, 41689, 50014, 58339, 65536];
898        data_sample_at_11111[98] = 8;
899        ref_cb[98] = vec![8200, 16562, 24924, 33286, 41648, 50010, 58372, 65536];
900        data_sample_at_11111[99] = 0;
901        ref_cb[99] = vec![8236, 16635, 25034, 33433, 41832, 50231, 58630, 65536];
902        data_sample_at_11111[100] = 0;
903        ref_cb[100] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
904        data_sample_at_11111[101] = 125;
905        ref_cb[101] = vec![8308, 16552, 24796, 33040, 41284, 49528, 57772, 65536];
906        data_sample_at_11111[102] = 173;
907        ref_cb[102] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
908        data_sample_at_11111[103] = 126;
909        ref_cb[103] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
910        data_sample_at_11111[104] = 0;
911        ref_cb[104] = vec![8416, 16768, 25120, 33472, 41824, 50176, 58528, 65536];
912        data_sample_at_11111[105] = 0;
913        ref_cb[105] = vec![8219, 16607, 24995, 33383, 41771, 50159, 58547, 65536];
914        data_sample_at_11111[106] = 159;
915        ref_cb[106] = vec![8254, 16678, 25102, 33526, 41950, 50374, 58798, 65536];
916        data_sample_at_11111[107] = 210;
917        ref_cb[107] = vec![8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536];
918        data_sample_at_11111[108] = 178;
919        ref_cb[108] = vec![8324, 16584, 24844, 33104, 41364, 49624, 57884, 65536];
920        data_sample_at_11111[109] = 0;
921        ref_cb[109] = vec![8359, 16654, 24949, 33244, 41539, 49834, 58129, 65536];
922        data_sample_at_11111[110] = 0;
923        ref_cb[110] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536];
924        data_sample_at_11111[111] = 170;
925        ref_cb[111] = vec![8429, 16794, 25159, 33524, 41889, 50254, 58619, 65536];
926        data_sample_at_11111[112] = 173;
927        ref_cb[112] = vec![8224, 16624, 25024, 33424, 41824, 50224, 58624, 65536];
928        data_sample_at_11111[113] = 235;
929        ref_cb[113] = vec![8258, 16452, 24646, 32840, 41034, 49228, 57422, 65536];
930        data_sample_at_11111[114] = 0;
931        ref_cb[114] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536];
932        data_sample_at_11111[115] = 0;
933        ref_cb[115] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
934        data_sample_at_11111[116] = 0;
935        ref_cb[116] = vec![8360, 16656, 24952, 33248, 41544, 49840, 58136, 65536];
936        data_sample_at_11111[117] = 24;
937        ref_cb[117] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536];
938        data_sample_at_11111[118] = 228;
939        ref_cb[118] = vec![8428, 16792, 25156, 33520, 41884, 50248, 58612, 65536];
940        data_sample_at_11111[119] = 0;
941        ref_cb[119] = vec![8215, 16613, 25011, 33409, 41807, 50205, 58603, 65536];
942        data_sample_at_11111[120] = 0;
943        ref_cb[120] = vec![8248, 16680, 25112, 33544, 41976, 50408, 58840, 65536];
944        data_sample_at_11111[121] = 0;
945        ref_cb[121] = vec![8281, 16498, 24715, 32932, 41149, 49366, 57583, 65536];
946        data_sample_at_11111[122] = 101;
947        ref_cb[122] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
948        data_sample_at_11111[123] = 174;
949        ref_cb[123] = vec![8347, 16630, 24913, 33196, 41479, 49762, 58045, 65536];
950        data_sample_at_11111[124] = 126;
951        ref_cb[124] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
952        data_sample_at_11111[125] = 0;
953        ref_cb[125] = vec![8413, 16762, 25111, 33460, 41809, 50158, 58507, 65536];
954        data_sample_at_11111[126] = 0;
955        ref_cb[126] = vec![8192, 16574, 24956, 33338, 41720, 50102, 58484, 65536];
956        data_sample_at_11111[127] = 0;
957        ref_cb[127] = vec![8224, 16639, 25054, 33469, 41884, 50299, 58714, 65536];
958
959        // Now run the loop with this reference data.
960        for i in 0..128 {
961            let data = get_triggering_base_data(65536, i);
962
963            // This check is here so that the tests written against this chunker
964            // can verify that the test data input is correct.
965            assert_eq!(data[11111], data_sample_at_11111[i]);
966
967            // Uncomment to create the line above.
968            // eprintln!("data_sample_at_11111[{i}]={};", data[11111]);
969
970            // Now, run the chunks through the default chunker.
971            let chunks = ChunkerTestWrapper::default().next_block(&data, true);
972
973            // Get the boundaries indices as determined by the size of the chunks above.
974            let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
975
976            // Uncomment this to generate the table above.
977            // eprintln!("ref_cb[{i}]=vec!{chunk_boundaries:?};");
978
979            assert_eq!(chunk_boundaries, ref_cb[i]);
980        }
981
982        // eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
983        // assert_eq!(chunk_boundaries, vec![131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000])
984    }
985}