quickcdc/
lib.rs

1//! # quickcdc
2//! `quickcdc` is a fast content defined chunker.
3//! * For some background information, see [AE: An Asymmetric Extremum Content Defined Chunking Algorithm](https://ieeexplore.ieee.org/document/7218510) by Yucheng Zhang.
4//! * Modification(s):
5//!   * User may provide salt, introducing entropy / cutpoint variation (i.e. files re-processed with different salt values will produce different cutpoints).
6//!   * Warp forward (reduced window size), skipping some unnecessary processing that happens before minimum chunk size is reached.
7//!
8//! This should be faster than many CDC algorithms (anecdotal performance: 2GB/s on an amd1950x with an NVMe drive), but faster alternatives exist.
9//! * For more information, see [FastCDC](https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf)
10//!
11//! NOTE: This implementation performs much faster when built with `--release`.
12//!
13#![cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
14
15extern crate rand;
16
17use rand::prelude::*;
18use std::f64::consts;
19use std::mem;
20
21const SIZEOF_U64: usize = mem::size_of::<u64>();
22
23#[derive(Debug)]
24pub struct Chunker<'a> {
25    slice: &'a [u8],
26    window_size: usize,
27    max_chunksize: usize,
28    min_chunksize: usize,
29    salt: u64,
30    bytes_processed: usize,
31    bytes_remaining: usize,
32}
33
34#[derive(Debug)]
35pub enum ChunkerError {
36    InsufficientMaxSize,
37    InsufficientTargetSize,
38}
39
40impl<'a> Chunker<'a> {
41    /// Given a {slice, target size, max_size, salt}, supply an iterable struct that produces chunked slices.
42    ///
43    /// # Examples
44    /// ```
45    /// use quickcdc;
46    /// use rand::Rng;
47    ///
48    /// let mut rng = rand::thread_rng();
49    /// let mut sample = [0u8; 1024];
50    /// rng.fill(&mut sample[..]);
51    /// let target_size = 64;
52    /// let max_chunksize = 128;
53    /// let salt = 15222894464462204665;
54    ///
55    /// let chunker = quickcdc::Chunker::with_params(&sample[..], target_size, max_chunksize, salt).unwrap();
56    /// for x in chunker {
57    ///     println!("{}", x.len());
58    /// }
59    ///
60    /// ```
61    pub fn with_params(
62        slice: &[u8],
63        target_chunksize_bytes: usize,
64        max_chunksize_bytes: usize,
65        salt: u64,
66    ) -> Result<Chunker, ChunkerError> {
67        if 2 * target_chunksize_bytes > max_chunksize_bytes {
68            return Err(ChunkerError::InsufficientMaxSize);
69        }
70
71        if target_chunksize_bytes < 64 {
72            return Err(ChunkerError::InsufficientTargetSize);
73        }
74
75        let target_window_size = (target_chunksize_bytes as f64 / (consts::E - 1.0)) as usize;
76        let my_window_size = (target_window_size as f64 * 0.56) as usize;
77        let min_chunksize = target_chunksize_bytes - target_window_size;
78        let chunker: Chunker = Chunker {
79            slice,
80            window_size: my_window_size,
81            salt,
82            max_chunksize: max_chunksize_bytes,
83            min_chunksize,
84            bytes_processed: 0,
85            bytes_remaining: slice.len(),
86        };
87        Ok(chunker)
88    }
89
90    /// Return a good (random) salt value.
91    pub fn get_random_salt() -> u64 {
92        let mut rng = rand::thread_rng();
93        rng.next_u64()
94    }
95}
96
97/// Returns the next content-defined chunk.
98impl<'a> Iterator for Chunker<'a> {
99    type Item = &'a [u8];
100
101    fn next(&mut self) -> Option<&'a [u8]> {
102        if self.bytes_remaining == 0 {
103            return None;
104        }
105
106        let next_slice = next_chunked_slice(
107            &self.slice[(self.bytes_processed)..],
108            self.window_size,
109            self.min_chunksize,
110            self.max_chunksize,
111            self.salt,
112        );
113        self.bytes_processed += next_slice.len();
114        self.bytes_remaining -= next_slice.len();
115
116        Some(next_slice)
117    }
118}
119
120/// Return the next content-defined slice.
121fn next_chunked_slice(
122    remaining: &[u8],
123    window_size: usize,
124    min_chunksize: usize,
125    max_chunksize: usize,
126    salt: u64,
127) -> &[u8] {
128    let remaining_bytes_length = remaining.len();
129
130    // under minimum chunk size remaining
131    if remaining_bytes_length <= min_chunksize + window_size {
132        return &remaining[..remaining_bytes_length];
133    }
134
135    let mut marker_position = 0;
136    let end_index = remaining_bytes_length - SIZEOF_U64;
137
138    // Warp forward to avoid unnecessary processing
139    for i in min_chunksize..end_index {
140        // Max chunksize reached, force a cutpoint.
141        // This generally happens when processing data that doesn't change (e.g. sparse files / all zeros).
142        if i == max_chunksize {
143            return &remaining[..i];
144        }
145
146        // Recast a pair of u64 pointers, to be used for comparison.
147        // Since 'i' never iterates beyond slice (i.e. remaining_bytes_length - SIZEOF_U64),
148        // we never dereference anything beyond the end of our slice.
149        let current_as_u64 = &remaining[i] as *const u8 as *const u64;
150        let marker_as_u64 = &remaining[marker_position] as *const u8 as *const u64;
151
152        // Update marker position, if necessary
153        if !swapped_salted_isgt(current_as_u64, marker_as_u64, salt) {
154            marker_position = i;
155            continue;
156        }
157
158        // End of window reached without a new marker position, force a cutpoint
159        if i == marker_position + window_size {
160            return &remaining[..i];
161        }
162    }
163
164    // force a cutpoint
165    let cutpoint = if max_chunksize < remaining_bytes_length {
166        max_chunksize
167    } else {
168        remaining_bytes_length
169    };
170    &remaining[..cutpoint]
171}
172
173/// Utility Function: Compare pointers to two 64-bit portions of data.
174///
175/// It does the following:
176/// * Dereferences each pointer into a u64 value.
177/// * Byte-swaps each value, and XOR the result with supplied salt.
178/// * Compare swapped+salted values, return comparison result.
179///
180/// De-referencing the pointers is an unsafe operation.  As long as the pointers do not extend beyond
181/// the end of the slice being chunked, this function will not result in undefined behavior.
182#[inline]
183fn swapped_salted_isgt(first: *const u64, second: *const u64, salt: u64) -> bool {
184    let compare_first = unsafe { (*first).swap_bytes() } ^ salt;
185    let compare_second = unsafe { (*second).swap_bytes() } ^ salt;
186    if compare_first > compare_second {
187        return true;
188    }
189    false
190}