1use blake2::{Blake2b, Digest};
2use sha2::Sha512;
3use std::collections::LinkedList;
4use std::default::Default;
5
6mod table;
7#[cfg(not(feature = "window_size"))]
8mod table_const;
9#[cfg(feature = "window_size")]
10mod table_gen;
11
12use crate::table::*;
13
14const DEGREE: usize = 64;
15const ZIP_MAGIC: [u8; 4] = [0x50, 0x4b, 0x03, 0x04];
16const ZIP_CRC_OFFSET: usize = 10; #[derive(Debug)]
19pub struct ChunkerOptions {
20 pub window_size: usize,
21 pub min_chunk: usize,
22 pub avg_chunk: usize,
23 pub max_chunk: usize,
24 pub detect_zip_boundary: bool,
25}
26
27impl Default for ChunkerOptions {
28 fn default() -> Self {
29 Self {
30 window_size: DEFAULT_WINDOW_SIZE,
31 min_chunk: 8 * 1024,
32 avg_chunk: 16 * 1024,
33 max_chunk: 32 * 1024,
34 detect_zip_boundary: false,
35 }
36 }
37}
38
39#[derive(Debug)]
40pub struct Chunk {
41 pub size: usize,
42 pub digest: Vec<u8>,
43}
44
45pub struct Stats {
46 pub size: usize,
47 pub sha512: Vec<u8>,
48}
49
50pub struct Chunker {
51 table: Table,
52 options: ChunkerOptions,
53 hash: u64,
54 hash_mask: u64,
55 window: Vec<u8>,
56 window_size: usize,
57 window_offset: usize,
58 chunk_size: usize,
59 chunk_digest: Blake2b<blake2::digest::consts::U18>,
60 digest: Sha512,
61 total_size: usize,
62 zip_magic_offset: usize,
63 zip_header_offset: usize,
64 chunks: LinkedList<Chunk>,
65}
66
67impl Chunker {
68 pub fn new(options: ChunkerOptions) -> Self {
69 let hash_mask = options.avg_chunk - 1;
70
71 Self {
72 table: Table::new(options.window_size),
73 hash: 0,
74 hash_mask: hash_mask as u64,
75 window: vec![0; options.window_size],
76 window_size: options.window_size,
77 window_offset: 0,
78 chunk_size: 0,
79 chunk_digest: Blake2b::new(),
80 digest: Sha512::new(),
81 total_size: 0,
82 zip_magic_offset: 0,
83 zip_header_offset: 0,
84 chunks: LinkedList::new(),
85
86 options,
87 }
88 }
89
90 pub fn update(&mut self, data: &[u8]) {
91 let window_size = self.options.window_size;
92
93 let mut chunk_start = 0;
94
95 self.digest.update(data);
96 self.total_size += data.len();
97
98 for i in 0..data.len() {
99 self.chunk_size += 1;
100
101 if self.options.detect_zip_boundary && self.zip_magic_offset < ZIP_MAGIC.len() {
102 let b = data[i];
103 if ZIP_MAGIC[self.zip_magic_offset] == b {
104 self.zip_magic_offset += 1;
105 } else {
106 self.zip_magic_offset = 0;
107 self.zip_header_offset = 0;
108 }
109 }
110
111 let seen_zip_magic = self.zip_magic_offset == ZIP_MAGIC.len();
112 if seen_zip_magic
113 || self.zip_header_offset > 0 && self.zip_header_offset <= ZIP_CRC_OFFSET
114 {
115 self.zip_header_offset += 1;
116 }
117
118 let seen_zip_header = self.zip_header_offset == ZIP_CRC_OFFSET + 1;
119
120 if self.chunk_size + self.window_size <= self.options.min_chunk && !seen_zip_header {
122 continue;
123 }
124
125 let b = data[i];
126 let dropped_byte = self.window[self.window_offset] as usize;
127 let shifted_byte = self.hash >> (DEGREE - 8 - 1);
128 self.window[self.window_offset] = b;
129 self.window_offset = (self.window_offset + 1) % window_size;
130
131 self.hash <<= 8;
132 self.hash ^= b as u64;
133 self.hash ^= self.table.drop[dropped_byte];
134 self.hash ^= self.table.shift[shifted_byte as usize];
135
136 if !(seen_zip_header
137 || (self.chunk_size >= self.options.min_chunk
138 && (self.hash & self.hash_mask) == self.hash_mask)
139 || self.chunk_size >= self.options.max_chunk)
140 {
141 continue;
142 }
143
144 self.hash = 0;
145 self.chunk_digest.update(&data[chunk_start..=i]);
146 self.chunks.push_back(Chunk {
147 size: self.chunk_size,
148 digest: self.chunk_digest.finalize_reset().to_vec(),
149 });
150 chunk_start = i + 1;
151 self.reset();
152 }
153
154 if chunk_start < data.len() {
155 self.chunk_digest.update(&data[chunk_start..]);
156 }
157 }
158
159 pub fn finalize_reset(&mut self) -> Stats {
160 let total_size = self.total_size;
161 let chunk_size = self.chunk_size;
162 let digest = self.chunk_digest.finalize_reset();
163
164 self.total_size = 0;
165 self.reset();
166
167 if chunk_size != 0 {
168 self.chunks.push_back(Chunk {
169 size: chunk_size,
170 digest: digest.to_vec(),
171 })
172 }
173
174 Stats {
175 size: total_size,
176 sha512: self.digest.finalize_reset().to_vec(),
177 }
178 }
179
180 fn reset(&mut self) {
181 self.hash = 0;
182 self.chunk_size = 0;
183 self.zip_magic_offset = 0;
184 self.zip_header_offset = 0;
185 self.window.fill(0);
186 }
187}
188
189impl Iterator for Chunker {
190 type Item = Chunk;
191
192 fn next(&mut self) -> Option<Self::Item> {
193 self.chunks.pop_front()
194 }
195}
196
197#[cfg(test)]
198mod tests {
199 extern crate base64;
200
201 use super::*;
202
203 #[test]
204 #[cfg(feature = "window_size")]
205 fn it_computes_rolling_hash() {
206 let mut chunker = Chunker::new(ChunkerOptions {
207 window_size: 16,
208 min_chunk: 0,
209 max_chunk: 1024 * 1024,
210
211 avg_chunk: 1024 * 1024,
213 detect_zip_boundary: false,
214 });
215
216 for i in 0..1024u64 {
217 chunker.update(&[(i & 0xff) as u8]);
218 }
219 let rolling_hash = chunker.hash;
220
221 chunker.reset();
222 for i in (1024 - 16)..1024u64 {
223 chunker.update(&[(i & 0xff) as u8]);
224 }
225 let non_rolling_hash = chunker.hash;
226 assert_eq!(rolling_hash, non_rolling_hash);
227
228 assert_eq!(rolling_hash, 1976718474515856107);
229 }
230
231 #[test]
232 fn it_computes_chunks() {
233 let mut chunker = Chunker::new(ChunkerOptions::default());
234
235 let size: usize = 256 * 1024;
236
237 for _i in 0..size {
238 chunker.update(&[0x33, 0x31, 0x85]);
239 }
240
241 let stats = chunker.finalize_reset();
242 assert_eq!(stats.size, size * 3);
243
244 assert_eq!(chunker.count(), 24);
245 }
246
247 #[test]
248 fn it_doesnt_chunk_early_after_skipping() {
249 let mut chunker = Chunker::new(ChunkerOptions::default());
250
251 chunker.update(&[0xff; 8 * 1024 + 8]);
252
253 let stats = chunker.finalize_reset();
254 assert_eq!(stats.size, 8 * 1024 + 8);
255
256 assert_eq!(chunker.count(), 1);
257 }
258}