rdedup_cdc/fastcdc.rs

use super::{RollingHash, CDC};
use std::default::Default;
use std::{cmp, mem};
use {gear, Gear};

/// Generate the `(mask_short, mask_long)` pair used for normalized chunking
///
/// The short mask has `nc_level` more set bits than the average edge
/// condition requires, the long mask `nc_level` fewer.
fn get_masks(avg_size: usize, nc_level: usize, seed: u64) -> (u64, u64) {
    let bits = (avg_size.next_power_of_two() - 1).count_ones();
    if bits == 13 {
        // Use the masks given in the FastCDC paper
        return (0x0003590703530000, 0x0000d90003530000);
    }
    // Otherwise spread the required number of set bits over the mask,
    // driving the rotations with an LCG (Knuth's MMIX constants)
    let mut mask = 0u64;
    let mut v = seed;
    let a = 6364136223846793005;
    let c = 1442695040888963407;
    while mask.count_ones() < bits - nc_level as u32 {
        v = v.wrapping_mul(a).wrapping_add(c);
        mask = (mask | 1).rotate_left(v as u32 & 0x3f);
    }
    let mask_long = mask;
    while mask.count_ones() < bits + nc_level as u32 {
        v = v.wrapping_mul(a).wrapping_add(c);
        mask = (mask | 1).rotate_left(v as u32 & 0x3f);
    }
    let mask_short = mask;
    (mask_short, mask_long)
}

/// FastCDC chunking
///
/// * Paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication"
/// * Paper-URL: https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf
/// * Presentation: https://www.usenix.org/sites/default/files/conference/protected-files/atc16_slides_xia.pdf
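///
/// # Example
///
/// A minimal usage sketch (assuming this crate is available as
/// `rdedup_cdc`, which re-exports `FastCDC` and the `CDC` trait):
///
/// ```no_run
/// use rdedup_cdc::{CDC, FastCDC};
///
/// let data = std::fs::read("input.bin").unwrap();
/// let mut cdc = FastCDC::new();
/// let mut rest = data.as_slice();
///
/// // Cut chunks until no more edges are found; whatever remains in
/// // `rest` afterwards is the final (possibly partial) chunk.
/// while let Some((chunk, tail)) = cdc.find_chunk(rest) {
///     println!("chunk of {} bytes", chunk.len());
///     rest = tail;
/// }
/// ```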
pub struct FastCDC {
    current_chunk_size: u64,
    gear: Gear,
    mask_short: u64,
    mask_long: u64,
    ignore_size: u64,
    min_size: u64,
    avg_size: u64,
    max_size: u64,
}

impl Default for FastCDC {
    fn default() -> Self {
        FastCDC::new()
    }
}

impl FastCDC {
    /// Create new FastCDC engine with default chunking settings
    pub fn new() -> Self {
        FastCDC::new_with_chunk_bits(gear::CHUNK_BITS)
    }

    fn reset(&mut self) {
        self.gear.reset();
        self.current_chunk_size = 0;
    }

    /// Create new `FastCDC` engine with custom chunking settings
    ///
    /// `chunk_bits` is the number of bits that need to match in
    /// the edge condition. The `CHUNK_BITS` constant is the default.
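    ///
    /// With the constants used below, the average chunk size is
    /// `1 << chunk_bits` bytes, `min_size` is a quarter of the
    /// average and `max_size` is eight times the average.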
    pub fn new_with_chunk_bits(chunk_bits: u32) -> Self {
        let (mask_short, mask_long) = get_masks(1 << chunk_bits, 2, 0);
        let gear = Gear::new_with_chunk_bits(chunk_bits);
        const DIGEST_SIZE: usize = 64;
        debug_assert_eq!(
            mem::size_of::<<Gear as RollingHash>::Digest>() * 8,
            DIGEST_SIZE
        );

        const SPREAD_BITS: u32 = 3;
        const WINDOW_SIZE: usize = 64;

        let min_size = (1 << (gear.chunk_bits - SPREAD_BITS + 1)) as u64;

        let ignore_size = min_size - WINDOW_SIZE as u64;
        let avg_size = (1 << gear.chunk_bits) as u64;
        let max_size = (1 << (gear.chunk_bits + SPREAD_BITS)) as u64;

        Self {
            current_chunk_size: 0,
            gear,
            mask_short,
            mask_long,
            ignore_size,
            min_size,
            avg_size,
            max_size,
        }
    }
}

impl CDC for FastCDC {
    /// Find chunk edge using `FastCDC` defaults.
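    ///
    /// Returns `Some((chunk, rest))` once an edge is found or the
    /// maximum chunk size is reached, and `None` when `whole_buf` is
    /// exhausted without an edge; in the latter case the internal
    /// state is kept, so the next call continues the same chunk.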
    fn find_chunk<'a>(&mut self, whole_buf: &'a [u8]) -> Option<(&'a [u8], &'a [u8])> {
        let mut left = whole_buf;

        debug_assert!(self.current_chunk_size < self.max_size);

        // ignore bytes that are not going to influence the digest
        if self.current_chunk_size < self.ignore_size {
            let skip_bytes = cmp::min(self.ignore_size - self.current_chunk_size, left.len() as u64);
            self.current_chunk_size += skip_bytes;
            left = &left[skip_bytes as usize..];
        }

        // ignore edges in bytes that are smaller than min_size
        if self.current_chunk_size < self.min_size {
            let roll_bytes = cmp::min(self.min_size - self.current_chunk_size, left.len() as u64);
            self.gear.roll(&left[..roll_bytes as usize]);
            self.current_chunk_size += roll_bytes;
            left = &left[roll_bytes as usize..];
        }

        // roll through early bytes with smaller probability; the search
        // is limited to `roll_bytes`, so the short mask is never applied
        // past the average-size boundary
        if self.current_chunk_size < self.avg_size {
            let roll_bytes = cmp::min(self.avg_size - self.current_chunk_size, left.len() as u64);
            let result = self.gear.find_chunk_mask(&left[..roll_bytes as usize], self.mask_short);

            if let Some((_last, rest)) = result {
                // translate the edge position back into `whole_buf`
                let edge = whole_buf.len() - left.len() + roll_bytes as usize - rest.len();
                self.reset();
                return Some(whole_buf.split_at(edge));
            }

            self.current_chunk_size += roll_bytes;
            left = &left[roll_bytes as usize..];
        }

        // roll through late bytes with higher probability, up to max_size
        if self.current_chunk_size < self.max_size {
            let roll_bytes = cmp::min(self.max_size - self.current_chunk_size, left.len() as u64);
            let result = self.gear.find_chunk_mask(&left[..roll_bytes as usize], self.mask_long);

            if let Some((_last, rest)) = result {
                let edge = whole_buf.len() - left.len() + roll_bytes as usize - rest.len();
                self.reset();
                return Some(whole_buf.split_at(edge));
            }

            self.current_chunk_size += roll_bytes;
            left = &left[roll_bytes as usize..];
        }

        // force an edge once the chunk has grown to max_size
        if self.current_chunk_size >= self.max_size {
            debug_assert_eq!(self.current_chunk_size, self.max_size);
            let result = (&whole_buf[..whole_buf.len() - left.len()], left);
            self.reset();
            return Some(result);
        }

        if left.is_empty() {
            return None;
        }
        // every byte was consumed by one of the stages above, so either
        // an edge was returned or `left` is now empty
        unreachable!();
    }
}

#[cfg(test)]
mod tests {

    #[cfg(feature = "bench")]
    mod bench {
        use test::Bencher;
        use super::*;

        use tests::test_data_1mb;

        use {CDC, FastCDC};

        #[bench]
        fn perf_1mb_004k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(12);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }

        #[bench]
        fn perf_1mb_008k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(13);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }

        #[bench]
        fn perf_1mb_064k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(16);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }

        #[bench]
        fn perf_1mb_128k_chunks(b: &mut Bencher) {
            let v = test_data_1mb();
            b.bytes = v.len() as u64;

            b.iter(|| {
                let mut cdc = FastCDC::new_with_chunk_bits(17);
                let mut buf = v.as_slice();

                while let Some((_last, rest)) = cdc.find_chunk(buf) {
                    buf = rest;
                }
            });
        }
    }
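
    // A sanity check added as a sketch: away from the hardcoded
    // 13-bit case, `get_masks` must produce masks with exactly
    // `bits + nc_level` and `bits - nc_level` set bits.
    #[test]
    fn get_masks_set_bit_counts() {
        let (mask_short, mask_long) = super::get_masks(1 << 16, 2, 0);
        assert_eq!(mask_short.count_ones(), 16 + 2);
        assert_eq!(mask_long.count_ones(), 16 - 2);
    }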
}