quickcdc/lib.rs
1//! # quickcdc
2//! `quickcdc` is a fast content defined chunker.
3//! * For some background information, see [AE: An Asymmetric Extremum Content Defined Chunking Algorithm](https://ieeexplore.ieee.org/document/7218510) by Yucheng Zhang.
4//! * Modification(s):
5//! * User may provide salt, introducing entropy / cutpoint variation (i.e. files re-processed with different salt values will produce different cutpoints).
6//! * Warp forward (reduced window size), skipping some unnecessary processing that happens before minimum chunk size is reached.
7//!
8//! This should be faster than many CDC algorithms (anecdotal performance: 2GB/s on an amd1950x with an NVMe drive), but faster alternatives exist.
9//! * For more information, see [FastCDC](https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf)
10//!
11//! NOTE: This implementation performs much faster when built with `--release`.
12//!
13#![cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
14
15extern crate rand;
16
17use rand::prelude::*;
18use std::f64::consts;
19use std::mem;
20
21const SIZEOF_U64: usize = mem::size_of::<u64>();
22
23#[derive(Debug)]
24pub struct Chunker<'a> {
25 slice: &'a [u8],
26 window_size: usize,
27 max_chunksize: usize,
28 min_chunksize: usize,
29 salt: u64,
30 bytes_processed: usize,
31 bytes_remaining: usize,
32}
33
34#[derive(Debug)]
35pub enum ChunkerError {
36 InsufficientMaxSize,
37 InsufficientTargetSize,
38}
39
40impl<'a> Chunker<'a> {
41 /// Given a {slice, target size, max_size, salt}, supply an iterable struct that produces chunked slices.
42 ///
43 /// # Examples
44 /// ```
45 /// use quickcdc;
46 /// use rand::Rng;
47 ///
48 /// let mut rng = rand::thread_rng();
49 /// let mut sample = [0u8; 1024];
50 /// rng.fill(&mut sample[..]);
51 /// let target_size = 64;
52 /// let max_chunksize = 128;
53 /// let salt = 15222894464462204665;
54 ///
55 /// let chunker = quickcdc::Chunker::with_params(&sample[..], target_size, max_chunksize, salt).unwrap();
56 /// for x in chunker {
57 /// println!("{}", x.len());
58 /// }
59 ///
60 /// ```
61 pub fn with_params(
62 slice: &[u8],
63 target_chunksize_bytes: usize,
64 max_chunksize_bytes: usize,
65 salt: u64,
66 ) -> Result<Chunker, ChunkerError> {
67 if 2 * target_chunksize_bytes > max_chunksize_bytes {
68 return Err(ChunkerError::InsufficientMaxSize);
69 }
70
71 if target_chunksize_bytes < 64 {
72 return Err(ChunkerError::InsufficientTargetSize);
73 }
74
75 let target_window_size = (target_chunksize_bytes as f64 / (consts::E - 1.0)) as usize;
76 let my_window_size = (target_window_size as f64 * 0.56) as usize;
77 let min_chunksize = target_chunksize_bytes - target_window_size;
78 let chunker: Chunker = Chunker {
79 slice,
80 window_size: my_window_size,
81 salt,
82 max_chunksize: max_chunksize_bytes,
83 min_chunksize,
84 bytes_processed: 0,
85 bytes_remaining: slice.len(),
86 };
87 Ok(chunker)
88 }
89
90 /// Return a good (random) salt value.
91 pub fn get_random_salt() -> u64 {
92 let mut rng = rand::thread_rng();
93 rng.next_u64()
94 }
95}
96
97/// Returns the next content-defined chunk.
98impl<'a> Iterator for Chunker<'a> {
99 type Item = &'a [u8];
100
101 fn next(&mut self) -> Option<&'a [u8]> {
102 if self.bytes_remaining == 0 {
103 return None;
104 }
105
106 let next_slice = next_chunked_slice(
107 &self.slice[(self.bytes_processed)..],
108 self.window_size,
109 self.min_chunksize,
110 self.max_chunksize,
111 self.salt,
112 );
113 self.bytes_processed += next_slice.len();
114 self.bytes_remaining -= next_slice.len();
115
116 Some(next_slice)
117 }
118}
119
120/// Return the next content-defined slice.
121fn next_chunked_slice(
122 remaining: &[u8],
123 window_size: usize,
124 min_chunksize: usize,
125 max_chunksize: usize,
126 salt: u64,
127) -> &[u8] {
128 let remaining_bytes_length = remaining.len();
129
130 // under minimum chunk size remaining
131 if remaining_bytes_length <= min_chunksize + window_size {
132 return &remaining[..remaining_bytes_length];
133 }
134
135 let mut marker_position = 0;
136 let end_index = remaining_bytes_length - SIZEOF_U64;
137
138 // Warp forward to avoid unnecessary processing
139 for i in min_chunksize..end_index {
140 // Max chunksize reached, force a cutpoint.
141 // This generally happens when processing data that doesn't change (e.g. sparse files / all zeros).
142 if i == max_chunksize {
143 return &remaining[..i];
144 }
145
146 // Recast a pair of u64 pointers, to be used for comparison.
147 // Since 'i' never iterates beyond slice (i.e. remaining_bytes_length - SIZEOF_U64),
148 // we never dereference anything beyond the end of our slice.
149 let current_as_u64 = &remaining[i] as *const u8 as *const u64;
150 let marker_as_u64 = &remaining[marker_position] as *const u8 as *const u64;
151
152 // Update marker position, if necessary
153 if !swapped_salted_isgt(current_as_u64, marker_as_u64, salt) {
154 marker_position = i;
155 continue;
156 }
157
158 // End of window reached without a new marker position, force a cutpoint
159 if i == marker_position + window_size {
160 return &remaining[..i];
161 }
162 }
163
164 // force a cutpoint
165 let cutpoint = if max_chunksize < remaining_bytes_length {
166 max_chunksize
167 } else {
168 remaining_bytes_length
169 };
170 &remaining[..cutpoint]
171}
172
173/// Utility Function: Compare pointers to two 64-bit portions of data.
174///
175/// It does the following:
176/// * Dereferences each pointer into a u64 value.
177/// * Byte-swaps each value, and XOR the result with supplied salt.
178/// * Compare swapped+salted values, return comparison result.
179///
180/// De-referencing the pointers is an unsafe operation. As long as the pointers do not extend beyond
181/// the end of the slice being chunked, this function will not result in undefined behavior.
182#[inline]
183fn swapped_salted_isgt(first: *const u64, second: *const u64, salt: u64) -> bool {
184 let compare_first = unsafe { (*first).swap_bytes() } ^ salt;
185 let compare_second = unsafe { (*second).swap_bytes() } ^ salt;
186 if compare_first > compare_second {
187 return true;
188 }
189 false
190}