Skip to main content

hash_roll/
bup.rs

1use crate::{Chunk, ChunkIncr, ToChunkIncr};
2use std::fmt;
3use std::num::Wrapping;
4
5const BLOBBITS: u8 = 13;
6const BLOBSIZE: u32 = 1 << (BLOBBITS as u32);
7
8const WINDOW_BITS: u8 = 6;
9const WINDOW_SIZE: usize = 1 << (WINDOW_BITS as usize);
10
11const ROLLSUM_CHAR_OFFSET: usize = 31;
12
13/// Rolling sum used by [`Bup`] for splitting
14///
15/// - https://github.com/bup/bup/blob/0ab7c3a958729b4723e6fe254da771aff608c2bf/lib/bup/bupsplit.c
16/// - https://github.com/bup/bup/blob/0ab7c3a958729b4723e6fe254da771aff608c2bf/lib/bup/bupsplit.h
17///
18#[derive(Debug, Clone, PartialEq, Eq)]
19pub struct RollSum {
20    window_len: usize,
21}
22
23impl ToChunkIncr for RollSum {
24    type Incr = RollSumIncr;
25
26    fn to_chunk_incr(&self) -> Self::Incr {
27        self.into()
28    }
29}
30
31impl RollSum {
32    pub fn with_window(window_len: usize) -> Self {
33        Self { window_len }
34    }
35}
36
37impl Chunk for RollSum {
38    type SearchState = RollSumSearchState;
39
40    fn to_search_state(&self) -> Self::SearchState {
41        self.into()
42    }
43
44    fn find_chunk_edge(
45        &self,
46        state: &mut Self::SearchState,
47        data: &[u8],
48    ) -> (Option<usize>, usize) {
49        for i in state.offset..data.len() {
50            let a = data[i];
51            let d = if i >= self.window_len {
52                data[i - self.window_len]
53            } else {
54                0
55            };
56
57            state.state.add(self.window_len, d, a);
58
59            if state.state.at_split() {
60                state.reset(self);
61                return (Some(i + 1), i + 1);
62            }
63        }
64
65        // keep k elements = discard all but k
66        let discard_ct = data.len().saturating_sub(self.window_len);
67        state.offset = data.len() - discard_ct;
68        (None, discard_ct)
69    }
70}
71
72#[derive(Debug, Clone, PartialEq, Eq)]
73pub struct RollSumState {
74    // NOTE: in bup, these are `unsigned`, but masking indicates they'll end up being used as
75    // u16's. In `librsync`, these are `uint_fast16_t`, which end up being u32 on most platforms.
76    // Both only require `u16` values to be represented. We use `u32` here as it's likely to be
77    // somewhat more performant, but this should be examined
78    s1: Wrapping<u32>,
79    s2: Wrapping<u32>,
80}
81
82impl From<&RollSum> for RollSumState {
83    fn from(s: &RollSum) -> Self {
84        let ws = Wrapping(s.window_len as u32);
85        // NOTE: bup uses this initialization, but librsync uses zeros.
86        //
87        // I believe the idea is to allow a slightly different implimentation of the "setup"
88        // portion of the processing (ie: before the window is filled)
89        Self {
90            s1: ws * Wrapping(ROLLSUM_CHAR_OFFSET as u32),
91            s2: ws * (ws - Wrapping(1)) * Wrapping(ROLLSUM_CHAR_OFFSET as u32),
92        }
93    }
94}
95
96impl RollSumState {
97    fn reset(&mut self, params: &RollSum) {
98        *self = params.into()
99    }
100}
101
102#[derive(Debug, Clone, PartialEq, Eq)]
103pub struct RollSumSearchState {
104    state: RollSumState,
105    offset: usize,
106}
107
108impl From<&RollSum> for RollSumSearchState {
109    fn from(s: &RollSum) -> Self {
110        Self {
111            state: s.into(),
112            offset: 0,
113        }
114    }
115}
116
117impl RollSumSearchState {
118    fn reset(&mut self, params: &RollSum) {
119        self.offset = 0;
120        self.state.reset(params);
121    }
122}
123
124impl Default for RollSum {
125    fn default() -> Self {
126        Self::with_window(WINDOW_SIZE)
127    }
128}
129
130/// Incrimental instance of [`RollSum`]
131///
132/// Performance note: Bup's Roll sum algorithm requires tracking the entire window. As a result,
133/// this includes a circular buffer which all inputs are copied through. If your use case allows
134/// it, use the non-incrimental variant for improved performance.
135#[derive(Clone, PartialEq, Eq)]
136pub struct RollSumIncr {
137    state: RollSumState,
138
139    /// window offset
140    wofs: Wrapping<usize>,
141    window: Box<[u8]>,
142}
143
144impl fmt::Debug for RollSumIncr {
145    fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> Result<(), ::std::fmt::Error> {
146        f.debug_struct("RollSumIncr")
147            .field("state", &self.state)
148            .field("window", &::fmt_extra::Hs(&self.window[..]))
149            .field("wofs", &self.wofs)
150            .finish()
151    }
152}
153
154impl From<&RollSum> for RollSumIncr {
155    fn from(params: &RollSum) -> Self {
156        Self {
157            state: params.into(),
158            window: vec![0; params.window_len].into_boxed_slice(),
159            wofs: Wrapping(0),
160        }
161    }
162}
163
164impl Default for RollSumIncr {
165    fn default() -> Self {
166        (&RollSum::default()).into()
167    }
168}
169
170impl RollSumState {
171    fn add(&mut self, window_len: usize, drop: u8, add: u8) {
172        let d = Wrapping(drop as u32);
173        self.s1 += Wrapping(add as u32);
174        self.s1 -= d;
175        self.s2 += self.s1;
176        self.s2 -= Wrapping(window_len as u32) * (d + Wrapping(ROLLSUM_CHAR_OFFSET as u32));
177    }
178
179    fn digest(&self) -> u32 {
180        (self.s1.0 << 16) | (self.s2.0 & 0xffff)
181    }
182
183    fn at_split(&self) -> bool {
184        (self.digest() & (BLOBSIZE - 1)) == (BLOBSIZE - 1)
185    }
186}
187
188impl RollSumIncr {
189    pub fn digest(&self) -> u32 {
190        self.state.digest()
191    }
192
193    fn add(&mut self, drop: u8, add: u8) {
194        self.state.add(self.window.len(), drop, add);
195    }
196
197    pub fn roll_byte(&mut self, ch: u8) {
198        let w = self.window[self.wofs.0];
199        self.add(w, ch);
200        self.window[self.wofs.0] = ch;
201        self.wofs = Wrapping((self.wofs + Wrapping(1)).0 & (self.window.len() - 1));
202    }
203
204    #[cfg(test)]
205    pub(crate) fn roll(&mut self, data: &[u8]) {
206        for &i in data.iter() {
207            self.roll_byte(i);
208        }
209    }
210
211    /*
212    fn sum(data: &[u8]) -> u32 {
213        let mut x = Self::default();
214        x.roll(data);
215        x.digest()
216    }
217    */
218
219    pub fn at_split(&self) -> bool {
220        self.state.at_split()
221    }
222}
223
224impl ChunkIncr for RollSumIncr {
225    fn push(&mut self, data: &[u8]) -> Option<usize> {
226        for (i, &v) in data.iter().enumerate() {
227            self.roll_byte(v);
228            if self.at_split() {
229                return Some(i + 1);
230            }
231        }
232
233        None
234    }
235}
236
237#[cfg(test)]
238mod test {
239    use super::*;
240    use rand::RngCore;
241    use rollsum::Engine;
242
243    #[test]
244    fn rs() {
245        let mut m = RollSumIncr::default();
246        m.roll_byte(3);
247        assert_eq!(m.digest(), 130279491);
248    }
249
250    #[test]
251    fn compare_rollsum() {
252        let mut m1 = RollSumIncr::default();
253        let mut m2 = rollsum::Bup::default();
254
255        assert_eq!(m1.digest(), m2.digest());
256
257        m1.roll_byte(4);
258        m2.roll_byte(4);
259
260        assert_eq!(m1.digest(), m2.digest());
261
262        m1.roll_byte(18);
263        m2.roll_byte(18);
264
265        assert_eq!(m1.digest(), m2.digest());
266
267        let mut r = rand::thread_rng();
268        let mut b = [0u8; 2048];
269
270        r.fill_bytes(&mut b);
271
272        for (i, &v) in b.iter().enumerate() {
273            m1.roll_byte(v);
274            m2.roll_byte(v);
275            println!("i={}, v={}", i, v);
276            assert_eq!(m1.digest(), m2.digest());
277        }
278
279        m1.roll(&b);
280        m2.roll(&b);
281
282        assert_eq!(m1.digest(), m2.digest());
283    }
284
285    #[test]
286    fn compare_bup() {
287        use super::ChunkIncr;
288        let mut m1 = RollSumIncr::default();
289        let mut m2 = rollsum::Bup::default();
290
291        let mut r = rand::thread_rng();
292        let mut b = [0u8; 2048];
293
294        r.fill_bytes(&mut b);
295
296        let mut x = &b[..];
297        loop {
298            let v1 = m1.push(&x);
299            let v2 = m2.find_chunk_edge(&x);
300            assert_eq!(v1, v2);
301
302            match v1 {
303                None => break,
304                Some(v) => {
305                    x = &x[v..];
306                }
307            }
308        }
309    }
310}