fast_cdc/
lib.rs

1#![allow(clippy::unused_io_amount)]
2pub use cdchunking;
3
4use cdchunking::ChunkerImpl;
5use std::cmp::min;
6use std::io::{Result as IoResult, Seek, Write};
7
8const TABLE: [u64; 256] = [
9    1553318008, 574654857, 759734804, 310648967, 1393527547, 1195718329, 694400241, 1154184075,
10    1319583805, 1298164590, 122602963, 989043992, 1918895050, 933636724, 1369634190, 1963341198,
11    1565176104, 1296753019, 1105746212, 1191982839, 1195494369, 29065008, 1635524067, 722221599,
12    1355059059, 564669751, 1620421856, 1100048288, 1018120624, 1087284781, 1723604070, 1415454125,
13    737834957, 1854265892, 1605418437, 1697446953, 973791659, 674750707, 1669838606, 320299026,
14    1130545851, 1725494449, 939321396, 748475270, 554975894, 1651665064, 1695413559, 671470969,
15    992078781, 1935142196, 1062778243, 1901125066, 1935811166, 1644847216, 744420649, 2068980838,
16    1988851904, 1263854878, 1979320293, 111370182, 817303588, 478553825, 694867320, 685227566,
17    345022554, 2095989693, 1770739427, 165413158, 1322704750, 46251975, 710520147, 700507188,
18    2104251000, 1350123687, 1593227923, 1756802846, 1179873910, 1629210470, 358373501, 807118919,
19    751426983, 172199468, 174707988, 1951167187, 1328704411, 2129871494, 1242495143, 1793093310,
20    1721521010, 306195915, 1609230749, 1992815783, 1790818204, 234528824, 551692332, 1930351755,
21    110996527, 378457918, 638641695, 743517326, 368806918, 1583529078, 1767199029, 182158924,
22    1114175764, 882553770, 552467890, 1366456705, 934589400, 1574008098, 1798094820, 1548210079,
23    821697741, 601807702, 332526858, 1693310695, 136360183, 1189114632, 506273277, 397438002,
24    620771032, 676183860, 1747529440, 909035644, 142389739, 1991534368, 272707803, 1905681287,
25    1210958911, 596176677, 1380009185, 1153270606, 1150188963, 1067903737, 1020928348, 978324723,
26    962376754, 1368724127, 1133797255, 1367747748, 1458212849, 537933020, 1295159285, 2104731913,
27    1647629177, 1691336604, 922114202, 170715530, 1608833393, 62657989, 1140989235, 381784875,
28    928003604, 449509021, 1057208185, 1239816707, 525522922, 476962140, 102897870, 132620570,
29    419788154, 2095057491, 1240747817, 1271689397, 973007445, 1380110056, 1021668229, 12064370,
30    1186917580, 1017163094, 597085928, 2018803520, 1795688603, 1722115921, 2015264326, 506263638,
31    1002517905, 1229603330, 1376031959, 763839898, 1970623926, 1109937345, 524780807, 1976131071,
32    905940439, 1313298413, 772929676, 1578848328, 1108240025, 577439381, 1293318580, 1512203375,
33    371003697, 308046041, 320070446, 1252546340, 568098497, 1341794814, 1922466690, 480833267,
34    1060838440, 969079660, 1836468543, 2049091118, 2023431210, 383830867, 2112679659, 231203270,
35    1551220541, 1377927987, 275637462, 2110145570, 1700335604, 738389040, 1688841319, 1506456297,
36    1243730675, 258043479, 599084776, 41093802, 792486733, 1897397356, 28077829, 1520357900,
37    361516586, 1119263216, 209458355, 45979201, 363681532, 477245280, 2107748241, 601938891,
38    244572459, 1689418013, 1141711990, 1485744349, 1181066840, 1950794776, 410494836, 1445347454,
39    2137242950, 852679640, 1014566730, 1999335993, 1871390758, 1736439305, 231222289, 603972436,
40    783045542, 370384393, 184356284, 709706295, 1453549767, 591603172, 768512391, 854125182,
41];
42
43const BLOCK_MIN_SIZE: usize = 1024 * 2; // 2 KB
44const BLOCK_AVG_SIZE: usize = 1024 * 8; // 8 KB
45const BLOCK_MAX_SIZE: usize = 1024 * 64; // 64 KB
46                                         // const BLOCK_MIN_SIZE: usize = 4;
47                                         // const BLOCK_AVG_SIZE: usize = 8;
48                                         // const BLOCK_MAX_SIZE: usize = 16;
49
50// Empirically derived values where the padded zero bits are almost evenly
51// distributed for slightly higher deduplication ratio according to our
52// large scale tests
53const MASK_SMALL: u64 = 0x0000d90303530000; // 15 '1' bits
54const _MASK_AVERAGE: u64 = 0x0000d90303530000; // 13 '1' bits
55const MASK_LARGE: u64 = 0x0000d90003530000; // 11 '1' bits
56
57pub fn cut(buffer: &[u8]) -> usize {
58    let mut fp = 0;
59    let mut i = BLOCK_MIN_SIZE;
60    let mut n = buffer.len();
61    let mut avg_size = BLOCK_AVG_SIZE;
62
63    if n <= BLOCK_MIN_SIZE {
64        return n;
65    }
66
67    if n >= BLOCK_MAX_SIZE {
68        n = BLOCK_MAX_SIZE;
69    } else if n <= BLOCK_AVG_SIZE {
70        avg_size = n;
71    }
72
73    while i < avg_size {
74        fp = (fp << 1) + TABLE[buffer[i] as usize];
75        if fp & MASK_SMALL == 0 {
76            return i;
77        }
78        i += 1;
79    }
80
81    while i < n {
82        fp = (fp << 1) + TABLE[buffer[i] as usize];
83        if fp & MASK_LARGE == 0 {
84            return i;
85        }
86        i += 1;
87    }
88
89    i
90}
91
92const MAX_BUFFER_SIZE: usize = BLOCK_MAX_SIZE * 2;
93
94#[derive(Debug)]
95pub struct Chunker<W: Write + Seek> {
96    dst: W,       // destination writer
97    buf: Vec<u8>, // chunker buffer ()
98    len: usize,
99}
100
101impl<W: Write + Seek> Chunker<W> {
102    /// Create a new Chunker
103    pub fn new(dst: W) -> Self {
104        Self {
105            dst,
106            buf: vec![0u8; MAX_BUFFER_SIZE],
107            len: 0,
108        }
109    }
110
111    pub fn into_inner(&self) -> &W {
112        &self.dst
113    }
114
115    pub fn into_owned(self) -> W {
116        self.dst
117    }
118}
119
120fn cut_without_limit(buffer: &[u8]) -> Option<usize> {
121    let mut fp = 0;
122    let mut i = BLOCK_MIN_SIZE;
123    let mut n = buffer.len();
124
125    if n < BLOCK_MIN_SIZE {
126        return None;
127    }
128
129    if n >= BLOCK_MAX_SIZE {
130        n = BLOCK_MAX_SIZE;
131    }
132    //  else {
133    //     return None;
134    // }
135
136    while i < n {
137        fp = (fp << 1) + TABLE[buffer[i] as usize];
138        if fp & MASK_LARGE == 0 {
139            return Some(i);
140        }
141        i += 1;
142    }
143
144    Some(i)
145}
146
147impl<W: Write + Seek> Write for Chunker<W> {
148    fn write(&mut self, buffer: &[u8]) -> IoResult<usize> {
149        if buffer.is_empty() {
150            return Ok(0);
151        }
152
153        let buf = &mut self.buf;
154        let len = self.len;
155        let buffer_len = buffer.len();
156        let mut data_written = 0;
157        let mut pos = 0;
158
159        // copy source data into chunker buffer
160        let mut in_len = min(MAX_BUFFER_SIZE - len, buffer_len);
161        assert!(in_len > 0);
162        buf[len..len + in_len].copy_from_slice(&buffer[..in_len]);
163        self.len += in_len;
164
165        // find chunks
166        while let Some(cut_pos) = cut_without_limit(&buf[pos..self.len]) {
167            data_written += self.dst.write(&buf[pos..pos + cut_pos])?;
168            pos += cut_pos;
169
170            let left_len = buf[pos..].len();
171            if in_len < buffer_len && left_len < BLOCK_MAX_SIZE {
172                // copy data that left in the beginning of the chunker buffer
173                buf.copy_within(pos..pos + left_len, 0);
174                self.len = left_len;
175
176                // if we have source data, copy them into chunker buffer
177                if in_len < buffer_len {
178                    let len_to_copy = min(pos, buffer_len - in_len);
179                    buf[left_len..left_len + len_to_copy]
180                        .copy_from_slice(&buffer[in_len..in_len + len_to_copy]);
181                    self.len += len_to_copy;
182                    in_len += len_to_copy;
183                }
184
185                //
186                pos = 0;
187            }
188        }
189
190        assert!(buffer_len >= data_written);
191        self.len = buffer_len - data_written;
192        if self.len > 0 {
193            buf.copy_within(pos..pos + self.len, 0);
194        }
195
196        Ok(buffer_len)
197    }
198
199    fn flush(&mut self) -> IoResult<()> {
200        // flush remaining data
201        if self.len > 0 {
202            self.dst.write(&self.buf[0..self.len])?;
203        }
204
205        // reset chunker
206        self.len = 0;
207
208        // flush destination
209        self.dst.flush()
210    }
211}
212
213pub struct FastCDC {}
214
215impl ChunkerImpl for FastCDC {
216    fn find_boundary(&mut self, data: &[u8]) -> Option<usize> {
217        Some(cut(data) - 1)
218    }
219
220    fn reset(&mut self) {}
221}
222
223#[cfg(test)]
224mod tests {
225    use super::*;
226    use fake::Fake;
227    use std::cell::RefCell;
228    use std::io::Cursor;
229
230    #[test]
231    fn cut_test() {
232        let buf: [u8; 3] = [0xb, 0xc, 0xff];
233        let i = cut(&buf);
234        assert_eq!(i, buf.len());
235    }
236
237    #[test]
238    fn cut_test2() {
239        // perpare test data
240        let mut data: Vec<u8> = Vec::with_capacity(1075);
241        for _ in 0..1024 {
242            data.push((0..255).fake::<u8>())
243        }
244
245        // Create buf with Write trait
246        let buf: Vec<u8> = Vec::with_capacity(1024);
247        let mut cursor = RefCell::new(Cursor::new(buf));
248
249        // chunk data
250        let mut chunker = Chunker::new(cursor.get_mut());
251        let size_written = chunker.write(&data).unwrap();
252        assert!(chunker.flush().is_ok()); // Chunker flush work
253        assert_eq!(cursor.get_mut().get_ref(), &data); // Data is the same
254        assert_eq!(data.len(), size_written); // data len is the same
255    }
256}