asuran_chunker/fastcdc.rs
1use super::{Chunker, ChunkerError};
2
3use std::io::Read;
4
/// Settings for a fastcdc `Chunker`
///
/// These are limited to `usize`, and not `u64`, because this implementation makes
/// extensive use of in-memory buffers of size `max_size`.
///
/// This chunker, unlike `BuzHash`, does not support any attempted mitigation of
/// chunk-based fingerprinting attacks. Those who are concerned about such an
/// attack may wish to use the `BuzHash` chunker until such a time that a better
/// repository format that does not leak information about chunk sizes can be
/// developed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FastCDC {
    /// Minimum chunk size, in bytes, handed to the FastCDC slicer.
    pub min_size: usize,
    /// Maximum chunk size, in bytes. Also the size of the chunker's in-memory buffer.
    pub max_size: usize,
    /// Target average chunk size, in bytes, handed to the FastCDC slicer.
    pub avg_size: usize,
}
21
22impl Chunker for FastCDC {
23 type Chunks = FastCDCChunker;
24 fn chunk_boxed(&self, read: Box<dyn Read + Send + 'static>) -> Self::Chunks {
25 FastCDCChunker {
26 settings: *self,
27 buffer: vec![0_u8; self.max_size],
28 length: 0,
29 read,
30 eof: false,
31 }
32 }
33}
34
35impl Default for FastCDC {
36 fn default() -> Self {
37 FastCDC {
38 min_size: 32_768,
39 avg_size: 65_536,
40 max_size: 131_072,
41 }
42 }
43}
44
45pub struct FastCDCChunker {
46 /// The settings used for this `Chunker`
47 settings: FastCDC,
48 /// The in memory buffer used to hack the chosen FastCDC implementation into working
49 ///
50 /// This must always be kept at a size of `max_size`
51 buffer: Vec<u8>,
52 /// The length of the data in the buffer
53 length: usize,
54 /// The reader this `Chunker` is slicing
55 read: Box<dyn Read + Send + 'static>,
56 /// Has the reader hit EoF?
57 eof: bool,
58}
59
60impl FastCDCChunker {
61 /// Drains a specified number of bytes from the reader, and refills it back up to `max_size` with
62 /// zeros.
63 ///
64 /// Additionally updates the pointer to the new end of the vec
65 ///
66 /// # Errors
67 ///
68 /// Returns `ChunkerError::InternalError` if the given count of bytes to drain is greater than the
69 /// current size of the used region of the buffer.
70 ///
71 /// # Panics
72 ///
73 /// Panics if the internal buffer's length is not `max_size`. This is an invariant, and the end
74 /// consumer of the struct should never be exposed to this error.
75 fn drain_bytes(&mut self, count: usize) -> Result<Vec<u8>, ChunkerError> {
76 assert!(self.buffer.len() == self.settings.max_size);
77 if count > self.length {
78 Err(ChunkerError::InternalError(format!(
79 "Invalid count given to FastCDCChunker::drain_bytes. Count: {}, Length: {}",
80 count, self.length
81 )))
82 } else {
83 // Drain the bytes from the vec
84 let output = self.buffer.drain(..count).collect::<Vec<_>>();
85 // Update the length
86 self.length -= count;
87 // Resize the buffer back to max_size
88 self.buffer.resize(self.settings.max_size, 0_u8);
89 // Return the drained bytes
90 Ok(output)
91 }
92 }
93
94 /// Returns true if the internal buffer is empty
95 fn is_empty(&self) -> bool {
96 self.length == 0
97 }
98
99 /// Attempts fill the buffer back up with bytes from the read
100 ///
101 /// Returns the number of bytes read. Will not attempt to read bytes if `EoF` has already been
102 /// encountered.
103 ///
104 /// # Errors
105 ///
106 /// Returns `ChunkerError::IOError` if the reader provides any error value during reading
107 ///
108 /// # Panics
109 ///
110 /// Panics if the internal buffer's length is not `max_size`. This is an invariant, and the end
111 /// consumer of the struct shuold never be exposed to this error.
112 fn read_bytes(&mut self) -> Result<usize, ChunkerError> {
113 assert!(self.buffer.len() == self.settings.max_size);
114 if self.eof {
115 Ok(0)
116 } else {
117 let mut total_bytes = 0;
118 // While we have not hit eof, and there is still room in the buffer, keep reading
119 while !self.eof && self.length < self.settings.max_size {
120 // read some bytes
121 let bytes_read = self.read.read(&mut self.buffer[self.length..])?;
122 // Update the length
123 self.length += bytes_read;
124 // If the number of bytes read was zero, set the eof flag
125 if bytes_read == 0 {
126 self.eof = true;
127 }
128 // Update the total
129 total_bytes += bytes_read;
130 }
131 Ok(total_bytes)
132 }
133 }
134
135 /// Uses the `FastCDC` algorithm to produce the next chunk of data.
136 ///
137 /// # Errors
138 ///
139 /// Returns `ChunkerError::Empty` if `EoF` has been hit
140 ///
141 /// # Panics
142 ///
143 /// Panics if the internal buffer's length is not `max_size`. This is an invariant, and the end
144 /// consumer of the struct should never be exposed to this error.
145 fn next_chunk(&mut self) -> Result<Vec<u8>, ChunkerError> {
146 assert_eq!(self.buffer.len(), self.settings.max_size);
147 // First, perform a read, to make sure the buffer is as full as it can be
148 self.read_bytes()?;
149 // Check to see if we are empty, if so, return early
150 if self.is_empty() {
151 Err(ChunkerError::Empty)
152 } else {
153 // Attempt to produce our slice
154 let mut slicer = fastcdc::FastCDC::new(
155 &self.buffer[..self.length],
156 self.settings.min_size,
157 self.settings.avg_size,
158 self.settings.max_size,
159 );
160 if let Some(chunk) = slicer.next() {
161 let result = self.drain_bytes(chunk.length)?;
162 Ok(result)
163 } else {
164 // We really shouldn't be here, since we ruled out the empty case, earlier but we
165 // will error anyway
166 Err(ChunkerError::Empty)
167 }
168 }
169 }
170}
171
172impl Iterator for FastCDCChunker {
173 type Item = Result<Vec<u8>, ChunkerError>;
174
175 fn next(&mut self) -> Option<Result<Vec<u8>, ChunkerError>> {
176 let slice = self.next_chunk();
177 if let Err(ChunkerError::Empty) = slice {
178 None
179 } else {
180 Some(slice)
181 }
182 }
183}
184
#[cfg(test)]
mod tests {
    use super::*;
    use rand::prelude::*;
    use std::io::Cursor;

    // Random test input, ten times the default max chunk size in length
    fn get_test_data() -> Vec<u8> {
        let mut data = vec![0_u8; FastCDC::default().max_size * 10];
        rand::thread_rng().fill_bytes(&mut data);
        data
    }

    // Chunks `data` with the default settings, panicking on any chunker error
    fn chunk_all(data: Vec<u8>) -> Vec<Vec<u8>> {
        FastCDC::default()
            .chunk(Cursor::new(data))
            .map(Result::unwrap)
            .collect()
    }

    // Data should be split into one or more chunks.
    //
    // The input here is larger than `max_size`, so there must be more than one chunk
    #[test]
    fn one_or_more_chunks() {
        let chunks = chunk_all(get_test_data());
        assert!(chunks.len() > 1);
    }

    // Concatenating the chunks in order must reproduce the input exactly
    #[test]
    fn reassemble_data() {
        let data = get_test_data();
        let rebuilt: Vec<u8> = chunk_all(data.clone()).concat();
        assert_eq!(data, rebuilt);
    }

    // Chunking the same data twice must give identical chunks
    #[test]
    fn identical_chunks() {
        let data = get_test_data();
        let first = chunk_all(data.clone());
        let second = chunk_all(data);
        assert_eq!(first, second);
    }

    // No chunk may be larger than the configured max size
    #[test]
    fn max_size() {
        let limit = FastCDC::default().max_size;
        let chunks = chunk_all(get_test_data());
        assert!(chunks.iter().all(|chunk| chunk.len() <= limit));
    }

    // At most one chunk may be smaller than the configured min size
    #[test]
    fn min_size() {
        let floor = FastCDC::default().min_size;
        let undersized_count = chunk_all(get_test_data())
            .iter()
            .filter(|chunk| chunk.len() < floor)
            .count();
        assert!(undersized_count <= 1);
    }
}
280}