1use blake2::{Blake2b, Digest};
2use sha2::Sha512;
3use std::collections::LinkedList;
4use std::default::Default;
5
6mod table;
7#[cfg(not(feature = "window_size"))]
8mod table_const;
9#[cfg(feature = "window_size")]
10mod table_gen;
11
12use crate::table::*;
13
14const DEGREE: usize = 64;
15const ZIP_HEADER: [u8; 4] = [0x50, 0x4b, 0x03, 0x04];
16
17#[derive(Debug)]
18pub struct ChunkerOptions {
19 pub window_size: usize,
20 pub min_chunk: usize,
21 pub avg_chunk: usize,
22 pub max_chunk: usize,
23 pub detect_zip_boundary: bool,
24}
25
26impl Default for ChunkerOptions {
27 fn default() -> Self {
28 Self {
29 window_size: DEFAULT_WINDOW_SIZE,
30 min_chunk: 8 * 1024,
31 avg_chunk: 16 * 1024,
32 max_chunk: 32 * 1024,
33 detect_zip_boundary: false,
34 }
35 }
36}
37
38#[derive(Debug)]
39pub struct Chunk {
40 pub size: usize,
41 pub digest: Vec<u8>,
42}
43
44pub struct Stats {
45 pub size: usize,
46 pub sha512: Vec<u8>,
47}
48
49pub struct Chunker {
50 table: Table,
51 options: ChunkerOptions,
52 hash: u64,
53 hash_mask: u64,
54 window: Vec<u8>,
55 window_size: usize,
56 window_offset: usize,
57 chunk_size: usize,
58 chunk_digest: Blake2b<blake2::digest::consts::U18>,
59 digest: Sha512,
60 total_size: usize,
61 zip_header_offset: usize,
62 chunks: LinkedList<Chunk>,
63}
64
65impl Chunker {
66 pub fn new(options: ChunkerOptions) -> Self {
67 let hash_mask = options.avg_chunk - 1;
68
69 Self {
70 table: Table::new(options.window_size),
71 hash: 0,
72 hash_mask: hash_mask as u64,
73 window: vec![0; options.window_size],
74 window_size: options.window_size,
75 window_offset: 0,
76 chunk_size: 0,
77 chunk_digest: Blake2b::new(),
78 digest: Sha512::new(),
79 total_size: 0,
80 zip_header_offset: 0,
81 chunks: LinkedList::new(),
82
83 options,
84 }
85 }
86
87 pub fn update(&mut self, data: &[u8]) {
88 let window_size = self.options.window_size;
89
90 let mut chunk_start = 0;
91
92 self.digest.update(data);
93 self.total_size += data.len();
94
95 for i in 0..data.len() {
96 self.chunk_size += 1;
97
98 if self.options.detect_zip_boundary && self.zip_header_offset < ZIP_HEADER.len() {
99 let b = data[i];
100 if ZIP_HEADER[self.zip_header_offset] == b {
101 self.zip_header_offset += 1;
102 } else {
103 self.zip_header_offset = 0;
104 }
105 }
106
107 let seen_zip_header = self.zip_header_offset == ZIP_HEADER.len();
108
109 if self.chunk_size + self.window_size <= self.options.min_chunk && !seen_zip_header {
111 continue;
112 }
113
114 let b = data[i];
115 let dropped_byte = self.window[self.window_offset] as usize;
116 let shifted_byte = self.hash >> (DEGREE - 8 - 1);
117 self.window[self.window_offset] = b;
118 self.window_offset = (self.window_offset + 1) % window_size;
119
120 self.hash <<= 8;
121 self.hash ^= b as u64;
122 self.hash ^= self.table.drop[dropped_byte];
123 self.hash ^= self.table.shift[shifted_byte as usize];
124
125 if !(seen_zip_header
126 || (self.chunk_size >= self.options.min_chunk
127 && (self.hash & self.hash_mask) == self.hash_mask)
128 || self.chunk_size >= self.options.max_chunk)
129 {
130 continue;
131 }
132
133 self.chunk_digest.update(&data[chunk_start..=i]);
134 self.chunks.push_back(Chunk {
135 size: self.chunk_size,
136 digest: self.chunk_digest.finalize_reset().to_vec(),
137 });
138 chunk_start = i + 1;
139 self.reset();
140 }
141
142 if chunk_start < data.len() {
143 self.chunk_digest.update(&data[chunk_start..]);
144 }
145 }
146
147 pub fn finalize_reset(&mut self) -> Stats {
148 let total_size = self.total_size;
149 let chunk_size = self.chunk_size;
150 let digest = self.chunk_digest.finalize_reset();
151
152 self.total_size = 0;
153 self.reset();
154
155 if chunk_size != 0 {
156 self.chunks.push_back(Chunk {
157 size: chunk_size,
158 digest: digest.to_vec(),
159 })
160 }
161
162 Stats {
163 size: total_size,
164 sha512: self.digest.finalize_reset().to_vec(),
165 }
166 }
167
168 fn reset(&mut self) {
169 self.hash = 0;
170 self.chunk_size = 0;
171 self.zip_header_offset = 0;
172 self.window.fill(0);
173 }
174}
175
176impl Iterator for Chunker {
177 type Item = Chunk;
178
179 fn next(&mut self) -> Option<Self::Item> {
180 self.chunks.pop_front()
181 }
182}
183
184#[cfg(test)]
185mod tests {
186 extern crate base64;
187
188 use super::*;
189
190 #[test]
191 #[cfg(feature = "window_size")]
192 fn it_computes_rolling_hash() {
193 let mut chunker = Chunker::new(ChunkerOptions {
194 window_size: 16,
195 min_chunk: 0,
196 max_chunk: 1024 * 1024,
197
198 avg_chunk: 1024 * 1024,
200 detect_zip_boundary: false,
201 });
202
203 for i in 0..1024u64 {
204 chunker.update(&[(i & 0xff) as u8]);
205 }
206 let rolling_hash = chunker.hash;
207
208 chunker.reset();
209 for i in (1024 - 16)..1024u64 {
210 chunker.update(&[(i & 0xff) as u8]);
211 }
212 let non_rolling_hash = chunker.hash;
213 assert_eq!(rolling_hash, non_rolling_hash);
214
215 assert_eq!(rolling_hash, 1976718474515856107);
216 }
217
218 #[test]
219 fn it_computes_chunks() {
220 let mut chunker = Chunker::new(ChunkerOptions::default());
221
222 let size: usize = 256 * 1024;
223
224 for _i in 0..size {
225 chunker.update(&[0x33, 0x31, 0x85]);
226 }
227
228 let stats = chunker.finalize_reset();
229 assert_eq!(stats.size, size * 3);
230
231 assert_eq!(chunker.count(), 24);
232 }
233
234 #[test]
235 fn it_doesnt_chunk_early_after_skipping() {
236 let mut chunker = Chunker::new(ChunkerOptions::default());
237
238 chunker.update(&[0xff; 8 * 1024 + 8]);
239
240 let stats = chunker.finalize_reset();
241 assert_eq!(stats.size, 8 * 1024 + 8);
242
243 assert_eq!(chunker.count(), 1);
244 }
245}