cdc_chunkers/
rabin.rs

1use crate::{Chunk, SizeParams};
2
3// implementation taken from zbox
4// https://github.com/zboxfs/zbox
5
6// taken from pcompress implementation
7// https://github.com/moinakg/pcompress
8const PRIME: u64 = 153_191u64;
9const MASK: u64 = 0x00ff_ffff_ffffu64;
10const MIN_SIZE: usize = 16 * 1024; // minimal chunk size, 16k
11const AVG_SIZE: usize = 32 * 1024; // average chunk size, 32k
12const MAX_SIZE: usize = 64 * 1024; // maximum chunk size, 64k
13
14// Irreducible polynomial for Rabin modulus, from pcompress
15const FP_POLY: u64 = 0xbfe6_b8a5_bf37_8d83u64;
16
17// since we will skip MIN_SIZE when sliding window, it only
18// needs to target (AVG_SIZE - MIN_SIZE) cut length,
19// note the (AVG_SIZE - MIN_SIZE) must be 2^n
20const CUT_MASK: u64 = (AVG_SIZE - MIN_SIZE - 1) as u64;
21
22// rolling hash window constants
23const WIN_SIZE: usize = 16; // must be 2^n
24const WIN_MASK: usize = WIN_SIZE - 1;
25const WIN_SLIDE_OFFSET: usize = 64;
26const WIN_SLIDE_POS: usize = MIN_SIZE - WIN_SLIDE_OFFSET;
27
28pub struct Chunker<'a> {
29    buf: &'a [u8],
30    params: ChunkerParams, // chunker parameters
31    pos: usize,
32    len: usize,
33    sizes: SizeParams,
34    win_slide_pos: usize,
35}
36
37/// Pre-calculated chunker parameters
38#[derive(Clone)]
39pub struct ChunkerParams {
40    poly_pow: u64,     // poly power
41    out_map: Vec<u64>, // pre-computed out byte map, length is 256
42    ir: Vec<u64>,      // irreducible polynomial, length is 256
43}
44
45impl<'a> Chunker<'a> {
46    pub fn default_sizes() -> SizeParams {
47        SizeParams {
48            min: MIN_SIZE,
49            avg: AVG_SIZE,
50            max: MAX_SIZE,
51        }
52    }
53
54    pub fn new(buf: &'a [u8]) -> Chunker {
55        Chunker {
56            buf,
57            pos: 0,
58            params: ChunkerParams::new(),
59            len: buf.len(),
60            sizes: Self::default_sizes(),
61            win_slide_pos: WIN_SLIDE_POS,
62        }
63    }
64
65    pub fn with_params(buf: &'a [u8], params: ChunkerParams, sizes: SizeParams) -> Self {
66        debug_assert!(sizes.min > WIN_SLIDE_OFFSET);
67
68        Self {
69            buf,
70            params,
71            pos: 0,
72            len: buf.len(),
73            sizes,
74            win_slide_pos: sizes.min - WIN_SLIDE_OFFSET,
75        }
76    }
77
78    fn find_border(&mut self) -> Option<usize> {
79        if self.len == self.pos {
80            return None;
81        }
82
83        if self.len - self.pos < self.sizes.min {
84            let pos = self.pos;
85            self.pos = self.len;
86            return Some(self.len - pos);
87        }
88
89        self.pos += self.win_slide_pos;
90        let mut chunk_len = self.win_slide_pos;
91
92        let mut win = [0u8; WIN_SIZE];
93        let mut win_idx = 0;
94        let mut roll_hash = 0;
95
96        while self.pos < self.len {
97            let ch = self.buf[self.pos];
98            let out = win[win_idx] as usize;
99            let pushed_out = self.params.out_map[out];
100
101            // calculate Rabin rolling hash
102            roll_hash = (roll_hash * PRIME) & MASK;
103            roll_hash += u64::from(ch);
104            roll_hash = roll_hash.wrapping_sub(pushed_out) & MASK;
105
106            // forward circle window
107            win[win_idx] = ch;
108            win_idx = (win_idx + 1) & WIN_MASK;
109
110            chunk_len += 1;
111            self.pos += 1;
112
113            if chunk_len >= self.sizes.min {
114                let checksum = roll_hash ^ self.params.ir[out];
115
116                if (checksum & CUT_MASK) == 0 || chunk_len >= self.sizes.max {
117                    return Some(chunk_len);
118                }
119            }
120        }
121
122        Some(chunk_len)
123    }
124
125    pub fn give_params(self) -> ChunkerParams {
126        self.params
127    }
128}
129
130impl<'a> Iterator for Chunker<'a> {
131    type Item = Chunk;
132
133    fn next(&mut self) -> Option<Self::Item> {
134        let start = self.pos;
135
136        self.find_border().map(|length| Chunk::new(start, length))
137    }
138}
139
140impl ChunkerParams {
141    pub fn new() -> Self {
142        let mut cp = ChunkerParams::default();
143
144        // calculate poly power, it is actually PRIME ^ WIN_SIZE
145        for _ in 0..WIN_SIZE {
146            cp.poly_pow = (cp.poly_pow * PRIME) & MASK;
147        }
148
149        // pre-calculate out map table and irreducible polynomial
150        // for each possible byte, copy from PCompress implementation
151        for i in 0..256 {
152            cp.out_map[i] = (i as u64 * cp.poly_pow) & MASK;
153
154            let (mut term, mut pow, mut val) = (1u64, 1u64, 1u64);
155            for _ in 0..WIN_SIZE {
156                if (term & FP_POLY) != 0 {
157                    val += (pow * i as u64) & MASK;
158                }
159                pow = (pow * PRIME) & MASK;
160                term *= 2;
161            }
162            cp.ir[i] = val;
163        }
164
165        cp
166    }
167}
168
169impl Default for ChunkerParams {
170    fn default() -> Self {
171        let mut ret = ChunkerParams {
172            poly_pow: 1,
173            out_map: vec![0u64; 256],
174            ir: vec![0u64; 256],
175        };
176        ret.out_map.shrink_to_fit();
177        ret.ir.shrink_to_fit();
178        ret
179    }
180}
181
182#[cfg(test)]
183mod tests {
184    use crate::rabin::{Chunker, ChunkerParams};
185    use crate::SizeParams;
186
187    #[test]
188    fn rabin_works_with_different_sizes() {
189        let sizes = SizeParams::new(3000, 50000, 100000);
190
191        let data = vec![3; 1024 * 1024 * 300];
192
193        let chunker = Chunker::with_params(&data, ChunkerParams::default(), sizes);
194        for chunk in chunker {
195            println!("{:?}", chunk);
196        }
197    }
198}