rdedup_cdc/bup.rs

use {RollingHash, CDC};
use std::default::Default;

const WINDOW_BITS: usize = 6;
const WINDOW_SIZE: usize = 1 << WINDOW_BITS;

const CHAR_OFFSET: usize = 31;

/// Default chunk size used by `bup`
pub const CHUNK_SIZE: u32 = 1 << CHUNK_BITS;

/// Default chunk size used by `bup` (log2)
pub const CHUNK_BITS: u32 = 13;


/// Rolling checksum method used by `bup`
///
/// Strongly based on
/// https://github.com/bup/bup/blob/706e8d273/lib/bup/bupsplit.c
/// https://github.com/bup/bup/blob/706e8d273/lib/bup/bupsplit.h
/// (a bit like https://godoc.org/camlistore.org/pkg/rollsum)
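///
/// A minimal usage sketch (illustrative only; it assumes the crate root
/// re-exports `Bup` and `CDC` as `rdedup_cdc::{Bup, CDC}`):
///
/// ```
/// use rdedup_cdc::{Bup, CDC};
///
/// let mut cdc = Bup::new();
/// // Any byte buffer works; here a simple synthetic one.
/// let data: Vec<u8> = (0u32..128 * 1024)
///     .map(|i| (i.wrapping_mul(2_654_435_761) >> 24) as u8)
///     .collect();
/// let mut rest: &[u8] = &data;
/// // Repeatedly split off content-defined chunks until no edge is found.
/// while let Some((chunk, tail)) = cdc.find_chunk(rest) {
///     assert!(!chunk.is_empty());
///     rest = tail;
/// }
/// ```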
pub struct Bup {
    s1: usize,
    s2: usize,
    window: [u8; WINDOW_SIZE],
    wofs: usize,
    chunk_bits: u32,
}

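// The seed values for `s1`/`s2` are the steady state for a window full of
// zero bytes (rolling in another zero leaves them unchanged), matching
// bupsplit's `rollsum_init()`.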
impl Default for Bup {
    fn default() -> Self {
        Bup {
            s1: WINDOW_SIZE * CHAR_OFFSET,
            s2: WINDOW_SIZE * (WINDOW_SIZE - 1) * CHAR_OFFSET,
            window: [0; WINDOW_SIZE],
            wofs: 0,
            chunk_bits: CHUNK_BITS,
        }
    }
}


impl RollingHash for Bup {
    type Digest = u32;

    fn roll_byte(&mut self, newch: u8) {
        // Since this crate is performance critical, and `wofs` is kept
        // strictly within bounds, it is justified to skip bounds checking
        // here to improve performance.
        // https://github.com/rust-lang/rfcs/issues/811
        let prevch = unsafe { *self.window.get_unchecked(self.wofs) };
        self.add(prevch, newch);
        unsafe { *self.window.get_unchecked_mut(self.wofs) = newch };
        self.wofs = (self.wofs + 1) % WINDOW_SIZE;
    }

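    // The 32-bit digest packs `s1` into the high 16 bits and `s2` into
    // the low 16 bits, the same layout as bupsplit's `rollsum_digest()`.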
    fn digest(&self) -> u32 {
        ((self.s1 as u32) << 16) | ((self.s2 as u32) & 0xffff)
    }

    fn reset(&mut self) {
        *self = Bup {
            chunk_bits: self.chunk_bits,
            ..Default::default()
        }
    }
}

impl CDC for Bup {
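    // Scans `buf`, rolling each byte into the hash; when the lowest
    // `chunk_bits` bits of the digest are all set, the hash is reset and
    // the buffer is split just after that byte into `(chunk, rest)`.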
    fn find_chunk<'a>(&mut self, buf: &'a [u8]) -> Option<(&'a [u8], &'a [u8])> {
        let chunk_mask = (1 << self.chunk_bits) - 1;
        for (i, &b) in buf.iter().enumerate() {
            self.roll_byte(b);

            if self.digest() & chunk_mask == chunk_mask {
                self.reset();
                return Some((&buf[..i + 1], &buf[i + 1..]));
            }
        }
        None
    }
}


impl Bup {
    /// Create a new `Bup` engine with default chunking settings
    pub fn new() -> Self {
        Default::default()
    }

    /// Create a new `Bup` engine with custom chunking settings
    ///
    /// `chunk_bits` is the number of low digest bits that must all be set
    /// for the edge condition to trigger; the `CHUNK_BITS` constant is the default.
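    ///
    /// Chunks therefore average roughly `1 << chunk_bits` bytes. A minimal
    /// sketch (the concrete value is only illustrative):
    ///
    /// ```
    /// use rdedup_cdc::Bup;
    ///
    /// // Target ~4 KiB average chunks instead of the default ~8 KiB.
    /// let _cdc = Bup::new_with_chunk_bits(12);
    /// ```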
    pub fn new_with_chunk_bits(chunk_bits: u32) -> Self {
        assert!(chunk_bits < 32);
        Bup {
            chunk_bits: chunk_bits,
            ..Default::default()
        }
    }

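    // Adler/Fletcher-style rolling update: `s1` tracks the sum of the bytes
    // currently in the window (each offset by `CHAR_OFFSET`), and `s2`
    // tracks the running sum of `s1`, so the oldest byte's accumulated
    // contribution of `WINDOW_SIZE * (drop + CHAR_OFFSET)` is removed.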
    fn add(&mut self, drop: u8, add: u8) {
        self.s1 += add as usize;
        self.s1 -= drop as usize;
        self.s2 += self.s1;
        self.s2 -= WINDOW_SIZE * (drop as usize + CHAR_OFFSET);
    }

    /// Counts the number of low bits set in the rollsum, assuming
    /// the digest has the bottom `chunk_bits` bits set to `1`
    /// (i.e. assuming a digest at a chunk edge, as returned by
    /// `find_chunk`).
    /// Be aware that there's a deliberate 'bug' in this function
    /// in order to match expected return values from other bupsplit
    /// implementations.
    pub fn count_bits(&self) -> u32 {
        let mut bits = self.chunk_bits;
        let mut rsum = self.digest() >> self.chunk_bits;
        // Yes, the ordering of this loop does mean that the
        // `chunk_bits + 1`-th bit will be ignored. This isn't actually
        // a problem as the distribution of values will be the same,
        // but it is unexpected.
        loop {
            rsum >>= 1;
            if (rsum & 1) == 0 {
                break;
            }
            bits += 1;
        }
        bits
    }
}

#[cfg(feature = "bench")]
mod tests {
    use test::Bencher;
    use super::{Bup, CDC};
    use tests::test_data_1mb;

    #[bench]
    fn bup_perf_1mb(b: &mut Bencher) {
        let v = test_data_1mb();
        b.bytes = v.len() as u64;

        b.iter(|| {
            let mut cdc = Bup::new();
            let mut buf = v.as_slice();

            while let Some((_last, rest)) = cdc.find_chunk(buf) {
                buf = rest;
            }
        });
    }
}