better_blockmap/
lib.rs

1use blake2::{Blake2b, Digest};
2use sha2::Sha512;
3use std::collections::LinkedList;
4use std::default::Default;
5
6mod table;
7#[cfg(not(feature = "window_size"))]
8mod table_const;
9#[cfg(feature = "window_size")]
10mod table_gen;
11
12use crate::table::*;
13
14const DEGREE: usize = 64;
15const ZIP_MAGIC: [u8; 4] = [0x50, 0x4b, 0x03, 0x04];
16const ZIP_CRC_OFFSET: usize = 10; // Offset after magic bytes
17
18#[derive(Debug)]
19pub struct ChunkerOptions {
20    pub window_size: usize,
21    pub min_chunk: usize,
22    pub avg_chunk: usize,
23    pub max_chunk: usize,
24    pub detect_zip_boundary: bool,
25}
26
27impl Default for ChunkerOptions {
28    fn default() -> Self {
29        Self {
30            window_size: DEFAULT_WINDOW_SIZE,
31            min_chunk: 8 * 1024,
32            avg_chunk: 16 * 1024,
33            max_chunk: 32 * 1024,
34            detect_zip_boundary: false,
35        }
36    }
37}
38
39#[derive(Debug)]
40pub struct Chunk {
41    pub size: usize,
42    pub digest: Vec<u8>,
43}
44
45pub struct Stats {
46    pub size: usize,
47    pub sha512: Vec<u8>,
48}
49
50pub struct Chunker {
51    table: Table,
52    options: ChunkerOptions,
53    hash: u64,
54    hash_mask: u64,
55    window: Vec<u8>,
56    window_size: usize,
57    window_offset: usize,
58    chunk_size: usize,
59    chunk_digest: Blake2b<blake2::digest::consts::U18>,
60    digest: Sha512,
61    total_size: usize,
62    zip_magic_offset: usize,
63    zip_header_offset: usize,
64    chunks: LinkedList<Chunk>,
65}
66
67impl Chunker {
68    pub fn new(options: ChunkerOptions) -> Self {
69        let hash_mask = options.avg_chunk - 1;
70
71        Self {
72            table: Table::new(options.window_size),
73            hash: 0,
74            hash_mask: hash_mask as u64,
75            window: vec![0; options.window_size],
76            window_size: options.window_size,
77            window_offset: 0,
78            chunk_size: 0,
79            chunk_digest: Blake2b::new(),
80            digest: Sha512::new(),
81            total_size: 0,
82            zip_magic_offset: 0,
83            zip_header_offset: 0,
84            chunks: LinkedList::new(),
85
86            options,
87        }
88    }
89
90    pub fn update(&mut self, data: &[u8]) {
91        let window_size = self.options.window_size;
92
93        let mut chunk_start = 0;
94
95        self.digest.update(data);
96        self.total_size += data.len();
97
98        for i in 0..data.len() {
99            self.chunk_size += 1;
100
101            if self.options.detect_zip_boundary && self.zip_magic_offset < ZIP_MAGIC.len() {
102                let b = data[i];
103                if ZIP_MAGIC[self.zip_magic_offset] == b {
104                    self.zip_magic_offset += 1;
105                } else {
106                    self.zip_magic_offset = 0;
107                    self.zip_header_offset = 0;
108                }
109            }
110
111            let seen_zip_magic = self.zip_magic_offset == ZIP_MAGIC.len();
112            if seen_zip_magic
113                || self.zip_header_offset > 0 && self.zip_header_offset <= ZIP_CRC_OFFSET
114            {
115                self.zip_header_offset += 1;
116            }
117
118            let seen_zip_header = self.zip_header_offset == ZIP_CRC_OFFSET + 1;
119
120            // Skip until we are `window_size` bytes behind minimum chunk size
121            if self.chunk_size + self.window_size <= self.options.min_chunk && !seen_zip_header {
122                continue;
123            }
124
125            let b = data[i];
126            let dropped_byte = self.window[self.window_offset] as usize;
127            let shifted_byte = self.hash >> (DEGREE - 8 - 1);
128            self.window[self.window_offset] = b;
129            self.window_offset = (self.window_offset + 1) % window_size;
130
131            self.hash <<= 8;
132            self.hash ^= b as u64;
133            self.hash ^= self.table.drop[dropped_byte];
134            self.hash ^= self.table.shift[shifted_byte as usize];
135
136            if !(seen_zip_header
137                || (self.chunk_size >= self.options.min_chunk
138                    && (self.hash & self.hash_mask) == self.hash_mask)
139                || self.chunk_size >= self.options.max_chunk)
140            {
141                continue;
142            }
143
144            self.hash = 0;
145            self.chunk_digest.update(&data[chunk_start..=i]);
146            self.chunks.push_back(Chunk {
147                size: self.chunk_size,
148                digest: self.chunk_digest.finalize_reset().to_vec(),
149            });
150            chunk_start = i + 1;
151            self.reset();
152        }
153
154        if chunk_start < data.len() {
155            self.chunk_digest.update(&data[chunk_start..]);
156        }
157    }
158
159    pub fn finalize_reset(&mut self) -> Stats {
160        let total_size = self.total_size;
161        let chunk_size = self.chunk_size;
162        let digest = self.chunk_digest.finalize_reset();
163
164        self.total_size = 0;
165        self.reset();
166
167        if chunk_size != 0 {
168            self.chunks.push_back(Chunk {
169                size: chunk_size,
170                digest: digest.to_vec(),
171            })
172        }
173
174        Stats {
175            size: total_size,
176            sha512: self.digest.finalize_reset().to_vec(),
177        }
178    }
179
180    fn reset(&mut self) {
181        self.hash = 0;
182        self.chunk_size = 0;
183        self.zip_magic_offset = 0;
184        self.zip_header_offset = 0;
185        self.window.fill(0);
186    }
187}
188
189impl Iterator for Chunker {
190    type Item = Chunk;
191
192    fn next(&mut self) -> Option<Self::Item> {
193        self.chunks.pop_front()
194    }
195}
196
197#[cfg(test)]
198mod tests {
199    extern crate base64;
200
201    use super::*;
202
203    #[test]
204    #[cfg(feature = "window_size")]
205    fn it_computes_rolling_hash() {
206        let mut chunker = Chunker::new(ChunkerOptions {
207            window_size: 16,
208            min_chunk: 0,
209            max_chunk: 1024 * 1024,
210
211            // Make sure we never chunk for this test
212            avg_chunk: 1024 * 1024,
213            detect_zip_boundary: false,
214        });
215
216        for i in 0..1024u64 {
217            chunker.update(&[(i & 0xff) as u8]);
218        }
219        let rolling_hash = chunker.hash;
220
221        chunker.reset();
222        for i in (1024 - 16)..1024u64 {
223            chunker.update(&[(i & 0xff) as u8]);
224        }
225        let non_rolling_hash = chunker.hash;
226        assert_eq!(rolling_hash, non_rolling_hash);
227
228        assert_eq!(rolling_hash, 1976718474515856107);
229    }
230
231    #[test]
232    fn it_computes_chunks() {
233        let mut chunker = Chunker::new(ChunkerOptions::default());
234
235        let size: usize = 256 * 1024;
236
237        for _i in 0..size {
238            chunker.update(&[0x33, 0x31, 0x85]);
239        }
240
241        let stats = chunker.finalize_reset();
242        assert_eq!(stats.size, size * 3);
243
244        assert_eq!(chunker.count(), 24);
245    }
246
247    #[test]
248    fn it_doesnt_chunk_early_after_skipping() {
249        let mut chunker = Chunker::new(ChunkerOptions::default());
250
251        chunker.update(&[0xff; 8 * 1024 + 8]);
252
253        let stats = chunker.finalize_reset();
254        assert_eq!(stats.size, 8 * 1024 + 8);
255
256        assert_eq!(chunker.count(), 1);
257    }
258}