better_blockmap/
lib.rs

1use blake2::{Blake2b, Digest};
2use sha2::Sha512;
3use std::collections::LinkedList;
4use std::default::Default;
5
6mod table;
7#[cfg(not(feature = "window_size"))]
8mod table_const;
9#[cfg(feature = "window_size")]
10mod table_gen;
11
12use crate::table::*;
13
14const DEGREE: usize = 64;
15const ZIP_HEADER: [u8; 4] = [0x50, 0x4b, 0x03, 0x04];
16
17#[derive(Debug)]
18pub struct ChunkerOptions {
19    pub window_size: usize,
20    pub min_chunk: usize,
21    pub avg_chunk: usize,
22    pub max_chunk: usize,
23    pub detect_zip_boundary: bool,
24}
25
26impl Default for ChunkerOptions {
27    fn default() -> Self {
28        Self {
29            window_size: DEFAULT_WINDOW_SIZE,
30            min_chunk: 8 * 1024,
31            avg_chunk: 16 * 1024,
32            max_chunk: 32 * 1024,
33            detect_zip_boundary: false,
34        }
35    }
36}
37
38#[derive(Debug)]
39pub struct Chunk {
40    pub size: usize,
41    pub digest: Vec<u8>,
42}
43
44pub struct Stats {
45    pub size: usize,
46    pub sha512: Vec<u8>,
47}
48
49pub struct Chunker {
50    table: Table,
51    options: ChunkerOptions,
52    hash: u64,
53    hash_mask: u64,
54    window: Vec<u8>,
55    window_size: usize,
56    window_offset: usize,
57    chunk_size: usize,
58    chunk_digest: Blake2b<blake2::digest::consts::U18>,
59    digest: Sha512,
60    total_size: usize,
61    zip_header_offset: usize,
62    chunks: LinkedList<Chunk>,
63}
64
65impl Chunker {
66    pub fn new(options: ChunkerOptions) -> Self {
67        let hash_mask = options.avg_chunk - 1;
68
69        Self {
70            table: Table::new(options.window_size),
71            hash: 0,
72            hash_mask: hash_mask as u64,
73            window: vec![0; options.window_size],
74            window_size: options.window_size,
75            window_offset: 0,
76            chunk_size: 0,
77            chunk_digest: Blake2b::new(),
78            digest: Sha512::new(),
79            total_size: 0,
80            zip_header_offset: 0,
81            chunks: LinkedList::new(),
82
83            options,
84        }
85    }
86
87    pub fn update(&mut self, data: &[u8]) {
88        let window_size = self.options.window_size;
89
90        let mut chunk_start = 0;
91
92        self.digest.update(data);
93        self.total_size += data.len();
94
95        for i in 0..data.len() {
96            self.chunk_size += 1;
97
98            if self.options.detect_zip_boundary && self.zip_header_offset < ZIP_HEADER.len() {
99                let b = data[i];
100                if ZIP_HEADER[self.zip_header_offset] == b {
101                    self.zip_header_offset += 1;
102                } else {
103                    self.zip_header_offset = 0;
104                }
105            }
106
107            let seen_zip_header = self.zip_header_offset == ZIP_HEADER.len();
108
109            // Skip until we are `window_size`  bytes behind minimum chunk size
110            if self.chunk_size + self.window_size <= self.options.min_chunk && !seen_zip_header {
111                continue;
112            }
113
114            let b = data[i];
115            let dropped_byte = self.window[self.window_offset] as usize;
116            let shifted_byte = self.hash >> (DEGREE - 8 - 1);
117            self.window[self.window_offset] = b;
118            self.window_offset = (self.window_offset + 1) % window_size;
119
120            self.hash <<= 8;
121            self.hash ^= b as u64;
122            self.hash ^= self.table.drop[dropped_byte];
123            self.hash ^= self.table.shift[shifted_byte as usize];
124
125            if !(seen_zip_header
126                || (self.chunk_size >= self.options.min_chunk
127                    && (self.hash & self.hash_mask) == self.hash_mask)
128                || self.chunk_size >= self.options.max_chunk)
129            {
130                continue;
131            }
132
133            self.chunk_digest.update(&data[chunk_start..=i]);
134            self.chunks.push_back(Chunk {
135                size: self.chunk_size,
136                digest: self.chunk_digest.finalize_reset().to_vec(),
137            });
138            chunk_start = i + 1;
139            self.reset();
140        }
141
142        if chunk_start < data.len() {
143            self.chunk_digest.update(&data[chunk_start..]);
144        }
145    }
146
147    pub fn finalize_reset(&mut self) -> Stats {
148        let total_size = self.total_size;
149        let chunk_size = self.chunk_size;
150        let digest = self.chunk_digest.finalize_reset();
151
152        self.total_size = 0;
153        self.reset();
154
155        if chunk_size != 0 {
156            self.chunks.push_back(Chunk {
157                size: chunk_size,
158                digest: digest.to_vec(),
159            })
160        }
161
162        Stats {
163            size: total_size,
164            sha512: self.digest.finalize_reset().to_vec(),
165        }
166    }
167
168    fn reset(&mut self) {
169        self.hash = 0;
170        self.chunk_size = 0;
171        self.zip_header_offset = 0;
172        self.window.fill(0);
173    }
174}
175
176impl Iterator for Chunker {
177    type Item = Chunk;
178
179    fn next(&mut self) -> Option<Self::Item> {
180        self.chunks.pop_front()
181    }
182}
183
184#[cfg(test)]
185mod tests {
186    extern crate base64;
187
188    use super::*;
189
190    #[test]
191    #[cfg(feature = "window_size")]
192    fn it_computes_rolling_hash() {
193        let mut chunker = Chunker::new(ChunkerOptions {
194            window_size: 16,
195            min_chunk: 0,
196            max_chunk: 1024 * 1024,
197
198            // Make sure we never chunk for this test
199            avg_chunk: 1024 * 1024,
200            detect_zip_boundary: false,
201        });
202
203        for i in 0..1024u64 {
204            chunker.update(&[(i & 0xff) as u8]);
205        }
206        let rolling_hash = chunker.hash;
207
208        chunker.reset();
209        for i in (1024 - 16)..1024u64 {
210            chunker.update(&[(i & 0xff) as u8]);
211        }
212        let non_rolling_hash = chunker.hash;
213        assert_eq!(rolling_hash, non_rolling_hash);
214
215        assert_eq!(rolling_hash, 1976718474515856107);
216    }
217
218    #[test]
219    fn it_computes_chunks() {
220        let mut chunker = Chunker::new(ChunkerOptions::default());
221
222        let size: usize = 256 * 1024;
223
224        for _i in 0..size {
225            chunker.update(&[0x33, 0x31, 0x85]);
226        }
227
228        let stats = chunker.finalize_reset();
229        assert_eq!(stats.size, size * 3);
230
231        assert_eq!(chunker.count(), 24);
232    }
233
234    #[test]
235    fn it_doesnt_chunk_early_after_skipping() {
236        let mut chunker = Chunker::new(ChunkerOptions::default());
237
238        chunker.update(&[0xff; 8 * 1024 + 8]);
239
240        let stats = chunker.finalize_reset();
241        assert_eq!(stats.size, 8 * 1024 + 8);
242
243        assert_eq!(chunker.count(), 1);
244    }
245}