1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
use super::{RollingHash, CDC};
use std::default::Default;
use std::{cmp, mem};
use {gear, Gear};

fn get_masks(avg_size: usize, nc_level: usize, seed: u64) -> (u64, u64) {
    let bits = (avg_size.next_power_of_two() - 1).count_ones();
    if bits == 13 {
        // From the paper
        return (0x0003590703530000, 0x0000d90003530000);
    }
    let mut mask = 0u64;
    let mut v = seed;
    let a = 6364136223846793005;
    let c = 1442695040888963407;
    while mask.count_ones() < bits - nc_level as u32 {
        v = v.wrapping_mul(a).wrapping_add(c);
        mask = (mask | 1).rotate_left(v as u32 & 0x3f);
    }
    let mask_long = mask;
    while mask.count_ones() < bits + nc_level as u32 {
        v = v.wrapping_mul(a).wrapping_add(c);
        mask = (mask | 1).rotate_left(v as u32 & 0x3f);
    }
    let mask_short = mask;
    (mask_short, mask_long)
}

/// FastCDC chunking
///
/// * Paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication"
/// * Paper-URL: https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf
/// * Presentation: https://www.usenix.org/sites/default/files/conference/protected-files/atc16_slides_xia.pdf
pub struct FastCDC {
    current_chunk_size: u64,
    gear: Gear,
    mask_short: u64,
    mask_long: u64,
    ignore_size: u64,
    min_size: u64,
    avg_size: u64,
    max_size: u64,
}

impl Default for FastCDC {
    fn default() -> Self {
        FastCDC::new()
    }
}

impl FastCDC {
    /// Create new FastCDC engine with default chunking settings
    pub fn new() -> Self {
        FastCDC::new_with_chunk_bits(gear::CHUNK_BITS)
    }

    fn reset(&mut self) {
        self.gear.reset();
        self.current_chunk_size = 0;
    }

    /// Create new `FastCDC` engine with custom chunking settings
    ///
    /// `chunk_bits` is number of bits that need to match in
    /// the edge condition. `CHUNK_BITS` constant is the default.
    pub fn new_with_chunk_bits(chunk_bits: u32) -> Self {
        let (mask_short, mask_long) = get_masks(1 << chunk_bits, 2, 0);
        let gear = Gear::new_with_chunk_bits(chunk_bits);
        const DIGEST_SIZE: usize = 64;
        debug_assert_eq!(
            mem::size_of::<<Gear as RollingHash>::Digest>() * 8,
            DIGEST_SIZE
        );

        const SPREAD_BITS: u32 = 3;
        const WINDOW_SIZE: usize = 64;

        let min_size = (1 << (gear.chunk_bits - SPREAD_BITS + 1)) as u64;

        let ignore_size = min_size - WINDOW_SIZE as u64;
        let avg_size = (1 << gear.chunk_bits) as u64;
        let max_size = (1 << (gear.chunk_bits + SPREAD_BITS)) as u64;

        Self {
            current_chunk_size: 0,
            gear: gear,
            mask_short: mask_short,
            mask_long: mask_long,
            ignore_size: ignore_size,
            min_size: min_size,
            avg_size: avg_size,
            max_size: max_size,
        }
    }
}

impl CDC for FastCDC {
    /// Find chunk edge using `FastCDC` defaults.
    fn find_chunk<'a>(&mut self, whole_buf: &'a [u8]) -> Option<(&'a [u8], &'a [u8])> {
        let mut left = whole_buf;

        debug_assert!(self.current_chunk_size < self.max_size);

        // ignore bytes that are not going to influence the digest
        if self.current_chunk_size < self.ignore_size {
            let skip_bytes = cmp::min(self.ignore_size - self.current_chunk_size, left.len() as u64);
            self.current_chunk_size += skip_bytes;
            left= &left[skip_bytes as usize..];
        }

        // ignore edges in bytes that are smaller than min_size
        if self.current_chunk_size < self.min_size {
            let roll_bytes = cmp::min(self.min_size - self.current_chunk_size, left.len() as u64);
            self.gear.roll(&left[..roll_bytes as usize]);
            self.current_chunk_size += roll_bytes;
            left = &left [roll_bytes as usize..];
        }

        // roll through early bytes with smaller probability
        if self.current_chunk_size < self.avg_size {
            let roll_bytes = cmp::min(self.avg_size - self.current_chunk_size, left.len() as u64);
            let result = self.gear.find_chunk_mask(left, self.mask_short);

            if let Some((_last, rest)) = result {
                let result = (&whole_buf[..whole_buf.len() - rest.len()], rest);
                self.reset();
                return Some(result);
            }

            self.current_chunk_size += roll_bytes;
            left = &left[roll_bytes as usize..];
        }

        // roll through late bytes with higher probability
        if self.current_chunk_size < self.max_size {
            let roll_bytes = cmp::min(self.max_size - self.current_chunk_size, left.len() as u64);
            let result = self.gear.find_chunk_mask(left, self.mask_long);

            if let Some((_last, rest)) = result {
                let result = (&whole_buf[..whole_buf.len() - rest.len()], rest);
                self.reset();
                return Some(result);
            }

            self.current_chunk_size += roll_bytes;
            left = &left[roll_bytes as usize..];
        }

        if self.current_chunk_size >= self.max_size {
            debug_assert_eq!(self.current_chunk_size, self.max_size);
            let result = (&whole_buf[..whole_buf.len() - left.len()], left);
            self.reset();
            return Some(result);
        }

        if left.is_empty() {
            return None;
        }
        unreachable!();
    }
}

#[cfg(test)]
mod tests {

    #[cfg(feature = "bench")]
    mod bench {
        use test::Bencher;
        use super::*;

        use tests::test_data_1mb;

        use {CDC, FastCDC};

        #[bench]
        fn perf_1mb_004k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(12);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }

        #[bench]
        fn perf_1mb_008k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(13);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }

        #[bench]
        fn perf_1mb_064k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(16);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }

        #[bench]
        fn perf_1mb_128k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(17);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }
    }
}