cdc_chunkers/
leap_based.rs

1use crate::{Chunk, SizeParams};
2
3const MIN_CHUNK_SIZE: usize = 1024 * 8;
4const MAX_CHUNK_SIZE: usize = 1024 * 16;
5
6const WINDOW_PRIMARY_COUNT: usize = 22;
7const WINDOW_SECONDARY_COUNT: usize = 2;
8const WINDOW_COUNT: usize = WINDOW_PRIMARY_COUNT + WINDOW_SECONDARY_COUNT;
9
10const WINDOW_SIZE: usize = 180;
11const WINDOW_MATRIX_SHIFT: usize = 42; // WINDOW_MATRIX_SHIFT * 4 < WINDOW_SIZE - 5
12
13enum PointStatus {
14    Ok,
15    Unsatisfied(usize),
16}
17
18pub struct Chunker<'a> {
19    buf: &'a [u8],
20    position: usize,
21    chunk_start: usize,
22    has_cut: bool,
23    sizes: SizeParams,
24}
25
26impl<'a> Chunker<'a> {
27    pub fn default_sizes() -> SizeParams {
28        SizeParams {
29            min: MIN_CHUNK_SIZE,
30            avg: (MAX_CHUNK_SIZE + MIN_CHUNK_SIZE) / 2,
31            max: MAX_CHUNK_SIZE,
32        }
33    }
34
35    pub fn new(buf: &'a [u8], sizes: SizeParams) -> Self {
36        Chunker {
37            buf,
38            position: sizes.min,
39            chunk_start: 0,
40            has_cut: false,
41            sizes,
42        }
43    }
44
45    fn is_point_satisfied(&self) -> PointStatus {
46        // primary check, T<=x<M where T is WINDOW_SECONDARY_COUNT and M is WINDOW_COUNT
47        for i in WINDOW_SECONDARY_COUNT..WINDOW_COUNT {
48            if !self
49                .is_window_qualified(&self.buf[self.position - i - WINDOW_SIZE..self.position - i])
50            {
51                // window is WINDOW_SIZE bytes long and moves to the left
52                let leap = WINDOW_COUNT - i;
53                return PointStatus::Unsatisfied(leap);
54            }
55        }
56
57        //secondary check, 0<=x<T bytes
58        for i in 0..WINDOW_SECONDARY_COUNT {
59            if !self
60                .is_window_qualified(&self.buf[self.position - i - WINDOW_SIZE..self.position - i])
61            {
62                let leap = WINDOW_COUNT - WINDOW_SECONDARY_COUNT - i;
63                return PointStatus::Unsatisfied(leap);
64            }
65        }
66
67        PointStatus::Ok
68    }
69
70    fn is_window_qualified(&self, window: &[u8]) -> bool {
71        (0..5)
72            .map(|index| window[WINDOW_SIZE - 1 - index * WINDOW_MATRIX_SHIFT]) // init array
73            .enumerate()
74            .map(|(index, byte)| EF_MATRIX[byte as usize][index]) // get elements from ef_matrix
75            .fold(0u8, |acc, value| acc ^ value)
76            != 0
77    }
78}
79
80impl Iterator for Chunker<'_> {
81    type Item = Chunk;
82
83    fn next(&mut self) -> Option<Self::Item> {
84        if self.position == self.buf.len() {
85            return if self.has_cut {
86                None
87            } else {
88                self.has_cut = true;
89                let chunk = Chunk::new(self.chunk_start, self.position - self.chunk_start);
90                Some(chunk)
91            };
92        }
93
94        while self.position < self.buf.len() {
95            if self.position - self.chunk_start > self.sizes.max {
96                let pos = self.chunk_start;
97                let len = self.position - self.chunk_start;
98
99                self.chunk_start = self.position;
100                self.position += self.sizes.min;
101
102                return Some(Chunk::new(pos, len));
103            } else {
104                match self.is_point_satisfied() {
105                    PointStatus::Ok => {
106                        let pos = self.chunk_start;
107                        let len = self.position - self.chunk_start;
108
109                        self.chunk_start = self.position;
110                        self.position += self.sizes.min;
111
112                        return Some(Chunk::new(pos, len));
113                    }
114                    PointStatus::Unsatisfied(leap) => {
115                        self.position += leap;
116                    }
117                }
118            }
119        }
120
121        self.position = self.buf.len();
122        self.has_cut = true;
123        Some(Chunk::new(
124            self.chunk_start,
125            self.position - self.chunk_start,
126        ))
127    }
128}
129
130const EF_MATRIX: [[u8; 5]; 256] = [
131    [0, 0, 2, 0, 2],
132    [1, 3, 3, 1, 3],
133    [2, 0, 0, 3, 2],
134    [3, 3, 3, 1, 2],
135    [3, 3, 0, 1, 2],
136    [1, 3, 3, 2, 1],
137    [1, 3, 3, 1, 1],
138    [0, 3, 3, 3, 0],
139    [1, 3, 0, 2, 0],
140    [2, 1, 3, 1, 1],
141    [1, 0, 0, 1, 0],
142    [0, 1, 1, 2, 3],
143    [2, 3, 3, 0, 2],
144    [1, 1, 3, 3, 1],
145    [0, 0, 3, 2, 0],
146    [3, 3, 1, 2, 2],
147    [2, 0, 3, 3, 0],
148    [2, 0, 1, 2, 1],
149    [3, 3, 3, 1, 2],
150    [0, 0, 2, 2, 2],
151    [1, 1, 3, 3, 1],
152    [1, 1, 3, 1, 3],
153    [0, 1, 1, 3, 0],
154    [3, 2, 3, 0, 3],
155    [1, 3, 3, 1, 3],
156    [3, 3, 2, 3, 1],
157    [0, 3, 0, 1, 0],
158    [1, 2, 3, 2, 0],
159    [3, 1, 0, 2, 3],
160    [2, 0, 1, 1, 0],
161    [2, 2, 1, 2, 3],
162    [0, 0, 1, 1, 1],
163    [1, 0, 1, 2, 2],
164    [1, 1, 0, 0, 3],
165    [1, 3, 0, 0, 2],
166    [2, 3, 0, 3, 1],
167    [3, 3, 0, 1, 3],
168    [2, 1, 3, 2, 1],
169    [0, 1, 3, 2, 1],
170    [0, 3, 2, 0, 2],
171    [0, 0, 0, 3, 1],
172    [3, 3, 0, 1, 3],
173    [3, 3, 0, 1, 2],
174    [3, 1, 0, 1, 0],
175    [2, 3, 1, 0, 0],
176    [1, 2, 2, 2, 1],
177    [2, 3, 1, 0, 2],
178    [3, 1, 0, 3, 0],
179    [3, 0, 3, 1, 2],
180    [3, 2, 3, 3, 3],
181    [2, 2, 0, 1, 2],
182    [2, 3, 0, 2, 2],
183    [3, 1, 3, 2, 1],
184    [1, 2, 3, 0, 0],
185    [0, 3, 3, 0, 0],
186    [1, 0, 3, 2, 1],
187    [0, 2, 0, 1, 1],
188    [2, 3, 2, 3, 2],
189    [0, 1, 2, 1, 3],
190    [0, 3, 0, 1, 3],
191    [1, 1, 1, 3, 2],
192    [1, 3, 2, 0, 2],
193    [0, 0, 1, 1, 1],
194    [2, 3, 2, 3, 1],
195    [2, 3, 2, 0, 2],
196    [2, 1, 3, 1, 2],
197    [3, 1, 0, 0, 0],
198    [1, 3, 1, 2, 1],
199    [2, 3, 0, 1, 1],
200    [3, 3, 3, 2, 3],
201    [3, 0, 1, 3, 0],
202    [0, 0, 3, 1, 1],
203    [2, 2, 3, 3, 2],
204    [3, 3, 1, 0, 0],
205    [2, 1, 1, 3, 1],
206    [3, 1, 0, 1, 0],
207    [0, 0, 1, 2, 3],
208    [0, 1, 3, 0, 1],
209    [2, 3, 1, 0, 0],
210    [2, 1, 0, 2, 1],
211    [2, 0, 3, 1, 1],
212    [0, 1, 3, 1, 2],
213    [3, 2, 2, 2, 3],
214    [1, 1, 2, 1, 3],
215    [0, 0, 2, 3, 3],
216    [1, 3, 3, 3, 0],
217    [3, 0, 0, 0, 2],
218    [2, 3, 3, 1, 1],
219    [3, 1, 2, 3, 1],
220    [0, 2, 0, 0, 3],
221    [0, 3, 2, 2, 0],
222    [3, 3, 2, 0, 1],
223    [0, 3, 1, 0, 1],
224    [1, 1, 0, 1, 0],
225    [3, 0, 3, 3, 3],
226    [1, 3, 1, 0, 0],
227    [2, 3, 0, 0, 2],
228    [1, 3, 3, 3, 3],
229    [0, 2, 0, 0, 2],
230    [0, 3, 1, 1, 1],
231    [2, 3, 3, 2, 2],
232    [1, 3, 2, 2, 1],
233    [2, 1, 1, 2, 3],
234    [0, 2, 1, 1, 0],
235    [2, 1, 2, 1, 2],
236    [1, 3, 2, 1, 0],
237    [1, 1, 1, 0, 2],
238    [2, 1, 0, 2, 3],
239    [0, 1, 3, 2, 3],
240    [0, 3, 1, 3, 2],
241    [1, 0, 1, 2, 3],
242    [2, 3, 0, 0, 0],
243    [2, 3, 3, 2, 0],
244    [0, 1, 1, 3, 2],
245    [2, 0, 1, 2, 0],
246    [1, 0, 3, 1, 0],
247    [1, 1, 1, 2, 1],
248    [3, 3, 1, 1, 2],
249    [2, 2, 1, 3, 0],
250    [3, 0, 2, 1, 1],
251    [1, 0, 3, 1, 0],
252    [0, 2, 2, 1, 0],
253    [0, 0, 3, 1, 3],
254    [3, 1, 3, 3, 2],
255    [1, 3, 3, 2, 2],
256    [0, 3, 0, 1, 0],
257    [2, 0, 2, 1, 2],
258    [0, 2, 2, 1, 1],
259    [3, 1, 1, 2, 2],
260    [1, 3, 1, 2, 1],
261    [3, 0, 3, 2, 3],
262    [2, 0, 0, 1, 1],
263    [0, 2, 0, 0, 1],
264    [3, 3, 0, 2, 0],
265    [3, 1, 1, 2, 3],
266    [2, 3, 0, 2, 3],
267    [0, 3, 1, 2, 2],
268    [1, 1, 2, 0, 3],
269    [0, 0, 2, 2, 1],
270    [2, 2, 2, 1, 2],
271    [2, 3, 0, 2, 3],
272    [1, 3, 2, 1, 3],
273    [3, 2, 2, 0, 1],
274    [1, 0, 0, 1, 3],
275    [1, 0, 3, 3, 3],
276    [2, 3, 2, 1, 0],
277    [3, 0, 2, 0, 1],
278    [3, 2, 0, 1, 0],
279    [1, 2, 3, 1, 0],
280    [2, 2, 2, 3, 1],
281    [2, 0, 1, 2, 3],
282    [1, 2, 1, 2, 1],
283    [3, 1, 2, 2, 3],
284    [1, 2, 2, 1, 0],
285    [2, 0, 1, 1, 2],
286    [1, 0, 0, 1, 1],
287    [3, 0, 2, 2, 2],
288    [3, 1, 3, 3, 1],
289    [2, 0, 0, 0, 0],
290    [1, 0, 3, 3, 1],
291    [2, 0, 2, 3, 3],
292    [0, 3, 0, 0, 0],
293    [2, 2, 3, 2, 3],
294    [3, 0, 2, 3, 2],
295    [0, 0, 1, 3, 2],
296    [3, 0, 1, 1, 3],
297    [3, 1, 3, 3, 0],
298    [0, 2, 1, 0, 2],
299    [1, 0, 0, 2, 2],
300    [0, 3, 3, 3, 1],
301    [2, 0, 0, 0, 3],
302    [3, 3, 1, 0, 0],
303    [2, 2, 1, 2, 0],
304    [0, 1, 1, 1, 0],
305    [3, 2, 0, 2, 1],
306    [1, 3, 0, 2, 2],
307    [1, 2, 3, 1, 2],
308    [1, 0, 2, 3, 3],
309    [3, 2, 0, 3, 2],
310    [3, 3, 2, 1, 0],
311    [0, 2, 3, 2, 3],
312    [1, 2, 2, 0, 2],
313    [0, 0, 2, 3, 3],
314    [1, 1, 0, 0, 1],
315    [3, 3, 0, 2, 2],
316    [0, 3, 2, 0, 3],
317    [0, 0, 0, 1, 0],
318    [1, 0, 3, 2, 2],
319    [2, 0, 2, 1, 2],
320    [0, 2, 3, 3, 3],
321    [1, 2, 0, 2, 1],
322    [1, 0, 1, 3, 1],
323    [1, 0, 1, 0, 2],
324    [3, 3, 2, 2, 2],
325    [2, 0, 1, 3, 1],
326    [2, 2, 2, 0, 1],
327    [3, 0, 3, 2, 0],
328    [3, 2, 1, 2, 0],
329    [1, 0, 1, 0, 1],
330    [3, 1, 3, 2, 2],
331    [2, 3, 0, 1, 2],
332    [3, 0, 0, 3, 3],
333    [2, 1, 0, 3, 3],
334    [0, 2, 0, 1, 2],
335    [1, 0, 3, 1, 1],
336    [1, 1, 3, 2, 1],
337    [0, 1, 0, 0, 0],
338    [0, 3, 0, 2, 1],
339    [0, 2, 3, 0, 3],
340    [1, 0, 2, 3, 1],
341    [2, 1, 1, 1, 2],
342    [1, 0, 2, 3, 3],
343    [0, 2, 3, 2, 3],
344    [0, 0, 3, 2, 1],
345    [0, 0, 3, 2, 0],
346    [3, 3, 3, 0, 2],
347    [3, 0, 1, 3, 1],
348    [3, 2, 0, 1, 2],
349    [1, 2, 0, 1, 2],
350    [0, 0, 3, 2, 0],
351    [1, 0, 3, 0, 2],
352    [2, 0, 3, 3, 1],
353    [2, 2, 3, 3, 0],
354    [2, 3, 2, 1, 1],
355    [3, 3, 2, 2, 2],
356    [1, 1, 2, 1, 0],
357    [1, 3, 2, 2, 3],
358    [0, 2, 3, 1, 0],
359    [2, 1, 0, 1, 3],
360    [3, 0, 3, 2, 3],
361    [0, 0, 1, 0, 2],
362    [2, 0, 0, 2, 0],
363    [0, 1, 0, 3, 0],
364    [3, 2, 2, 0, 3],
365    [2, 2, 0, 2, 0],
366    [2, 2, 0, 0, 2],
367    [3, 3, 1, 1, 1],
368    [0, 0, 0, 2, 1],
369    [1, 3, 2, 1, 2],
370    [1, 3, 0, 0, 3],
371    [0, 0, 2, 1, 1],
372    [3, 3, 0, 1, 3],
373    [2, 2, 0, 0, 2],
374    [1, 0, 0, 3, 1],
375    [3, 2, 2, 1, 0],
376    [2, 3, 3, 2, 3],
377    [1, 2, 0, 2, 2],
378    [2, 0, 3, 1, 3],
379    [3, 0, 0, 0, 3],
380    [2, 0, 0, 2, 2],
381    [2, 0, 0, 1, 2],
382    [0, 0, 3, 2, 1],
383    [0, 0, 0, 2, 1],
384    [1, 3, 3, 0, 1],
385    [2, 0, 0, 2, 0],
386    [3, 3, 1, 3, 1],
387];