Skip to main content

asuran_chunker/
fastcdc.rs

1use super::{Chunker, ChunkerError};
2
3use std::io::Read;
4
/// Settings for a fastcdc `Chunker`
///
/// These are limited to `usize`, and not `u64`, because this implementation makes
/// extensive use of in memory buffers of size `max_size`
///
/// This chunker, unlike `BuzHash`, does not support any attempted mitigation of
/// chunk based fingerprinting attacks. Those who are concerned about such an
/// attack may wish to use the `BuzHash` chunker until such a time that a better
/// repository format that does not leak information about chunk sizes can be
/// developed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct FastCDC {
    /// Minimum chunk size handed to the underlying `fastcdc` slicer
    pub min_size: usize,
    /// Maximum chunk size handed to the underlying `fastcdc` slicer
    ///
    /// This is also the size, in bytes, of the in memory buffer each chunker allocates.
    pub max_size: usize,
    /// Average (target) chunk size handed to the underlying `fastcdc` slicer
    pub avg_size: usize,
}
21
22impl Chunker for FastCDC {
23    type Chunks = FastCDCChunker;
24    fn chunk_boxed(&self, read: Box<dyn Read + Send + 'static>) -> Self::Chunks {
25        FastCDCChunker {
26            settings: *self,
27            buffer: vec![0_u8; self.max_size],
28            length: 0,
29            read,
30            eof: false,
31        }
32    }
33}
34
35impl Default for FastCDC {
36    fn default() -> Self {
37        FastCDC {
38            min_size: 32_768,
39            avg_size: 65_536,
40            max_size: 131_072,
41        }
42    }
43}
44
45pub struct FastCDCChunker {
46    /// The settings used for this `Chunker`
47    settings: FastCDC,
48    /// The in memory buffer used to hack the chosen FastCDC implementation into working
49    ///
50    /// This must always be kept at a size of `max_size`
51    buffer: Vec<u8>,
52    /// The length of the data in the buffer
53    length: usize,
54    /// The reader this `Chunker` is slicing
55    read: Box<dyn Read + Send + 'static>,
56    /// Has the reader hit EoF?
57    eof: bool,
58}
59
60impl FastCDCChunker {
61    /// Drains a specified number of bytes from the reader, and refills it back up to `max_size` with
62    /// zeros.
63    ///
64    /// Additionally updates the pointer to the new end of the vec
65    ///
66    /// # Errors
67    ///
68    /// Returns `ChunkerError::InternalError` if the given count of bytes to drain is greater than the
69    /// current size of the used region of the buffer.
70    ///
71    /// # Panics
72    ///
73    /// Panics if the internal buffer's length is not `max_size`. This is an invariant, and the end
74    /// consumer of the struct should never be exposed to this error.
75    fn drain_bytes(&mut self, count: usize) -> Result<Vec<u8>, ChunkerError> {
76        assert!(self.buffer.len() == self.settings.max_size);
77        if count > self.length {
78            Err(ChunkerError::InternalError(format!(
79                "Invalid count given to FastCDCChunker::drain_bytes. Count: {}, Length: {}",
80                count, self.length
81            )))
82        } else {
83            // Drain the bytes from the vec
84            let output = self.buffer.drain(..count).collect::<Vec<_>>();
85            // Update the length
86            self.length -= count;
87            // Resize the buffer back to max_size
88            self.buffer.resize(self.settings.max_size, 0_u8);
89            // Return the drained bytes
90            Ok(output)
91        }
92    }
93
94    /// Returns true if the internal buffer is empty
95    fn is_empty(&self) -> bool {
96        self.length == 0
97    }
98
99    /// Attempts fill the buffer back up with bytes from the read
100    ///
101    /// Returns the number of bytes read. Will not attempt to read bytes if `EoF` has already been
102    /// encountered.
103    ///
104    /// # Errors
105    ///
106    /// Returns `ChunkerError::IOError` if the reader provides any error value during reading
107    ///
108    /// # Panics
109    ///
110    /// Panics if the internal buffer's length is not `max_size`. This is an invariant, and the end
111    /// consumer of the struct shuold never be exposed to this error.
112    fn read_bytes(&mut self) -> Result<usize, ChunkerError> {
113        assert!(self.buffer.len() == self.settings.max_size);
114        if self.eof {
115            Ok(0)
116        } else {
117            let mut total_bytes = 0;
118            // While we have not hit eof, and there is still room in the buffer, keep reading
119            while !self.eof && self.length < self.settings.max_size {
120                // read some bytes
121                let bytes_read = self.read.read(&mut self.buffer[self.length..])?;
122                // Update the length
123                self.length += bytes_read;
124                // If the number of bytes read was zero, set the eof flag
125                if bytes_read == 0 {
126                    self.eof = true;
127                }
128                // Update the total
129                total_bytes += bytes_read;
130            }
131            Ok(total_bytes)
132        }
133    }
134
135    /// Uses the `FastCDC` algorithm to produce the next chunk of data.
136    ///
137    /// # Errors
138    ///
139    /// Returns `ChunkerError::Empty` if `EoF` has been hit
140    ///
141    /// # Panics
142    ///
143    /// Panics if the internal buffer's length is not `max_size`. This is an invariant, and the end
144    /// consumer of the struct should never be exposed to this error.
145    fn next_chunk(&mut self) -> Result<Vec<u8>, ChunkerError> {
146        assert_eq!(self.buffer.len(), self.settings.max_size);
147        // First, perform a read, to make sure the buffer is as full as it can be
148        self.read_bytes()?;
149        // Check to see if we are empty, if so, return early
150        if self.is_empty() {
151            Err(ChunkerError::Empty)
152        } else {
153            // Attempt to produce our slice
154            let mut slicer = fastcdc::FastCDC::new(
155                &self.buffer[..self.length],
156                self.settings.min_size,
157                self.settings.avg_size,
158                self.settings.max_size,
159            );
160            if let Some(chunk) = slicer.next() {
161                let result = self.drain_bytes(chunk.length)?;
162                Ok(result)
163            } else {
164                // We really shouldn't be here, since we ruled out the empty case, earlier but we
165                // will error anyway
166                Err(ChunkerError::Empty)
167            }
168        }
169    }
170}
171
172impl Iterator for FastCDCChunker {
173    type Item = Result<Vec<u8>, ChunkerError>;
174
175    fn next(&mut self) -> Option<Result<Vec<u8>, ChunkerError>> {
176        let slice = self.next_chunk();
177        if let Err(ChunkerError::Empty) = slice {
178            None
179        } else {
180            Some(slice)
181        }
182    }
183}
184
#[cfg(test)]
mod tests {
    use super::*;
    use rand::prelude::*;
    use std::io::Cursor;

    // Builds a random buffer ten times as long as the default `max_size`
    fn get_test_data() -> Vec<u8> {
        let mut bytes = vec![0_u8; 10 * FastCDC::default().max_size];
        rand::thread_rng().fill_bytes(&mut bytes);
        bytes
    }

    // Runs a default-configured chunker over `data`, panicking on any chunker error
    fn chunk_with_defaults(data: Vec<u8>) -> Vec<Vec<u8>> {
        FastCDC::default()
            .chunk(Cursor::new(data))
            .map(Result::unwrap)
            .collect()
    }

    // Data should be split into one or more chunks.
    //
    // The input here is larger than `max_size`, so more than one chunk is expected
    #[test]
    fn one_or_more_chunks() {
        let chunk_count = chunk_with_defaults(get_test_data()).len();
        assert!(chunk_count > 1);
    }

    // Concatenating the chunks in order should reproduce the input exactly
    #[test]
    fn reassemble_data() {
        let original = get_test_data();
        let rebuilt: Vec<u8> = chunk_with_defaults(original.clone()).concat();
        assert_eq!(original, rebuilt);
    }

    // Chunking the same data twice should produce the same chunks both times
    #[test]
    fn identical_chunks() {
        let original = get_test_data();
        let first_pass = chunk_with_defaults(original.clone());
        let second_pass = chunk_with_defaults(original);
        assert_eq!(first_pass, second_pass);
    }

    // No chunk may be longer than the configured maximum size
    #[test]
    fn max_size() {
        let limit = FastCDC::default().max_size;
        let chunks = chunk_with_defaults(get_test_data());
        assert!(chunks.iter().all(|chunk| chunk.len() <= limit));
    }

    // At most one chunk may be shorter than the configured minimum size
    #[test]
    fn min_size() {
        let floor = FastCDC::default().min_size;
        let undersized_count = chunk_with_defaults(get_test_data())
            .iter()
            .filter(|chunk| chunk.len() < floor)
            .count();
        assert!(undersized_count <= 1);
    }
}