// hash_roll/zpaq.rs
1#![cfg(feature = "zpaq")]
2
//! `zpaq` implements the chunking algorithm used in the zpaq archiving tool
4
5use std::fmt;
6use std::num::Wrapping;
7
8use crate::{Chunk, ChunkIncr, RangeExt, ToChunkIncr};
9use std::ops::Bound;
10use std::ops::RangeBounds;
11
12/**
13 * A splitter used in go 'dedup' and zpaq that does not require looking back in the source
14 * data to update
15 *
16 * PDF: ??
17 *
 * Note: go-dedup & zpaq calculate the relationship between their parameters slightly differently.
 * We support both of these (via the separate with_*() constructors), but it'd be nice to clarify
 * why they differ and what effect the differences have.
21 *
22 * References:
23 *
24 *  - http://encode.ru/threads/456-zpaq-updates?p=45192&viewfull=1#post45192
25 *  - https://github.com/klauspost/dedup/blob/master/writer.go#L668, 'zpaqWriter'
26 *  - https://github.com/zpaq/zpaq/blob/master/zpaq.cpp
27 *
28 * Parameters:
29 *
30 *  - fragment (aka average_size_pow_2): average size = 2**fragment KiB
31 *      in Zpaq (the compressor), this defaults to 6
32 *  - min_size, max_size: additional bounds on the blocks. Not technically needed for the algorithm
33 *      to function
34 *
35 *  In Zpaq-compressor, min & max size are calculated using the fragment value
36 *  In go's dedup, fragment is calculated using a min & max size
37 *
38 * In-block state:
39 *
40 *  - hash: u32, current hash
41 *  - last_byte: u8, previous byte read
42 *  - predicted_byte: array of 256 u8's.
43 *
44 * Between-block state:
45 *
46 *  - None
47 */
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Zpaq {
    // Allowed chunk sizes as (min, max) bounds: `split_here()` suppresses
    // edges below the minimum and forces one at the maximum.
    range: (Bound<u64>, Bound<u64>),
    // Hash threshold: a chunk edge is emitted when the rolling hash drops
    // below this value (derived from the average-size power of 2).
    max_hash: u32,
}
53
54impl Zpaq {
55    /* this is taken from go-dedup */
56    fn fragment_ave_from_max(max: u64) -> u8 {
57        /* TODO: convert this to pure integer math */
58        (max as f64 / (64f64 * 64f64)).log2() as u8
59    }
60
61    /* these are based on the zpaq (not go-dedup) calculations */
62    fn fragment_ave_from_range<T: RangeBounds<u64>>(range: T) -> u8 {
63        let v = match range.end_bound() {
64            Bound::Included(i) => *i,
65            Bound::Excluded(i) => *i - 1,
66            Bound::Unbounded => {
67                /* try to guess based on first */
68                64 * match range.start_bound() {
69                    Bound::Included(i) => *i,
70                    Bound::Excluded(i) => *i + 1,
71                    Bound::Unbounded => {
72                        /* welp, lets use the default */
73                        return 16;
74                    }
75                }
76            }
77        };
78
79        Self::fragment_ave_from_max(v)
80    }
81
82    /* these are based on the zpaq (not go-dedup) calculations */
83    fn range_from_fragment_ave(fragment_ave: u8) -> impl RangeBounds<u64> {
84        assert!(fragment_ave <= 32);
85        assert!(fragment_ave >= 10);
86        64 << (fragment_ave - 10)..8128 << fragment_ave
87    }
88
89    fn range_from_max(max: u64) -> impl RangeBounds<u64> {
90        max / 64..max
91    }
92
93    fn max_hash_from_fragment_ave(fragment_ave: u8) -> u32 {
94        assert!(fragment_ave <= 32);
95        1 << (32 - fragment_ave)
96        /*
97         * go-dedup does this:
98         * (22f64 - fragment_ave).exp2() as u32
99         *
100         * Which should be equivalent to the integer math above (which is used by zpaq).
101         */
102    }
103
104    /**
105     * Create a splitter using the range of output block sizes.
106     *
107     * The average block size will be the max block size (if any) divided by 4, using the same
108     * algorithm to calculate it as go-dedup.
109     */
110    pub fn with_range(range: impl RangeBounds<u64> + Clone) -> Self {
111        let f = Self::fragment_ave_from_range(range.clone());
112        Self::with_average_and_range(f, range)
113    }
114
115    /**
116     * Create a splitter using the defaults from Zpaq (the compressor) given a average size
117     * formated as a power of 2.
118     *
119     * Corresponds to zpaq's argument "-fragment".
120     */
121    pub fn with_average_size_pow_2(average_size_pow_2: u8) -> Self {
122        let r = Self::range_from_fragment_ave(average_size_pow_2);
123        Self::with_average_and_range(average_size_pow_2, r)
124    }
125
126    /**
127     * Use the defaults from go-dedup to generate a splitter given the max size of a split.
128     *
129     * The average block size will be the max block size (if any) divided by 4, using the same
130     * algorithm to calculate it as go-dedup.
131     */
132    pub fn with_max_size(max: u64) -> Self {
133        Self::with_average_and_range(Self::fragment_ave_from_max(max), Self::range_from_max(max))
134    }
135
136    /**
137     * Create a splitter with control of all parameters
138     *
139     * All the other constructors use this internally
140     */
141    pub fn with_average_and_range(average_size_pow_2: u8, range: impl RangeBounds<u64>) -> Self {
142        Zpaq {
143            range: range.into_tuple(),
144            max_hash: Self::max_hash_from_fragment_ave(average_size_pow_2),
145        }
146    }
147
148    /*
149    fn average_block_size(&self) -> u64 {
150        /* I don't know If i really trust this, do some more confirmation */
151        1024 << self.fragment
152    }
153    */
154
155    fn split_here(&self, hash: u32, index: u64) -> bool {
156        (hash < self.max_hash && !self.range.under_min(&index)) || self.range.exceeds_max(&index)
157    }
158}
159
160impl Default for Zpaq {
161    /**
162     * Create a splitter using the defaults from Zpaq (the compressor)
163     *
164     * Average size is 65536 bytes (64KiB), max is 520192 bytes (508KiB), min is 4096 bytes (4KiB)
165     */
166    fn default() -> Self {
167        Self::with_average_size_pow_2(16)
168    }
169}
170
/// Incremental instance of [`Zpaq`].
///
/// `Zpaq` doesn't require input look-back, so the incremental and non-incremental performance
/// should be similar.
#[derive(Debug)]
pub struct ZpaqIncr {
    // splitting parameters (size bounds + hash threshold)
    params: Zpaq,
    // rolling-hash state for the chunk currently being scanned
    state: ZpaqHash,
    // bytes consumed since the last emitted chunk edge
    idx: u64,
}
181
/// Intermediate state from [`Chunk::find_chunk_edge`] for [`Zpaq`].
#[derive(Default, Debug)]
pub struct ZpaqSearchState {
    // rolling-hash state for the chunk currently being scanned
    state: ZpaqHash,
    // bytes consumed since the last chunk edge
    idx: u64,
}
188
189impl ZpaqSearchState {
190    fn feed(&mut self, v: u8) -> u32 {
191        self.idx += 1;
192        self.state.feed(v)
193    }
194}
195
196impl Chunk for Zpaq {
197    type SearchState = ZpaqSearchState;
198
199    fn to_search_state(&self) -> Self::SearchState {
200        Default::default()
201    }
202
203    fn find_chunk_edge(
204        &self,
205        state: &mut Self::SearchState,
206        data: &[u8],
207    ) -> (Option<usize>, usize) {
208        for i in 0..data.len() {
209            let h = state.feed(data[i]);
210            if self.split_here(h, (state.idx + 1) as u64) {
211                *state = self.to_search_state();
212                return (Some(i + 1), i + 1);
213            }
214        }
215
216        (None, data.len())
217    }
218}
219
220impl From<&Zpaq> for ZpaqSearchState {
221    fn from(_: &Zpaq) -> Self {
222        Default::default()
223    }
224}
225
226impl From<&Zpaq> for ZpaqIncr {
227    fn from(s: &Zpaq) -> Self {
228        s.clone().into()
229    }
230}
231
232impl ToChunkIncr for Zpaq {
233    type Incr = ZpaqIncr;
234    fn to_chunk_incr(&self) -> Self::Incr {
235        self.into()
236    }
237}
238
239impl ZpaqIncr {
240    fn feed(&mut self, v: u8) -> u32 {
241        self.idx += 1;
242        self.state.feed(v)
243    }
244
245    fn reset(&mut self) {
246        self.idx = 0;
247        self.state = Default::default();
248    }
249}
250
251impl ChunkIncr for ZpaqIncr {
252    fn push(&mut self, data: &[u8]) -> Option<usize> {
253        for (i, &v) in data.iter().enumerate() {
254            let h = self.feed(v);
255            if self.params.split_here(h, self.idx) {
256                self.reset();
257                return Some(i + 1);
258            }
259        }
260
261        None
262    }
263}
264
265impl From<Zpaq> for ZpaqIncr {
266    fn from(params: Zpaq) -> Self {
267        Self {
268            params,
269            state: Default::default(),
270            idx: 0,
271        }
272    }
273}
274
/**
 * The rolling hash component of the zpaq splitter
 */
#[derive(Clone)]
pub struct ZpaqHash {
    // current 32-bit rolling hash value (wrapping arithmetic)
    hash: Wrapping<u32>,
    // the byte fed on the previous call to `feed()`
    last_byte: u8,
    // prediction table: predicted_byte[b] is the byte that last followed b
    predicted_byte: [u8; 256],
}
284
285impl PartialEq for ZpaqHash {
286    fn eq(&self, other: &Self) -> bool {
287        self.hash == other.hash
288            && self.last_byte == other.last_byte
289            && &self.predicted_byte[..] == &other.predicted_byte[..]
290    }
291}
292
293impl Eq for ZpaqHash {}
294
// Manual Debug impl: renders the 256-byte prediction table compactly via
// `fmt_extra::Hs` (hex string) instead of the default per-element list output.
impl fmt::Debug for ZpaqHash {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        f.debug_struct("ZpaqHash")
            .field("hash", &self.hash)
            .field("last_byte", &self.last_byte)
            .field("predicted_byte", &fmt_extra::Hs(&self.predicted_byte[..]))
            .finish()
    }
}
304
305impl Default for ZpaqHash {
306    fn default() -> Self {
307        ZpaqHash {
308            hash: Wrapping(0),
309            last_byte: 0,
310            predicted_byte: [0; 256],
311        }
312    }
313}
314
315impl ZpaqHash {
316    /*
317     * we can only get away with this because Zpaq doesn't need to look at old data to make it's
318     * splitting decision, it only examines it's state + current value (and the state is
319     * relatively large, but isn't a window into past data).
320     */
321    fn feed(&mut self, c: u8) -> u32 {
322        self.hash = if c == self.predicted_byte[self.last_byte as usize] {
323            (self.hash + Wrapping(c as u32) + Wrapping(1)) * Wrapping(314159265)
324        } else {
325            (self.hash + Wrapping(c as u32) + Wrapping(1)) * Wrapping(271828182)
326        };
327
328        self.predicted_byte[self.last_byte as usize] = c;
329        self.last_byte = c;
330        self.hash.0
331    }
332}